Sparsity detection and accuracy evaluation
## Checking how sparse the ASV is
sum(apply(seqtab.nochim, 2, function(x){x > 0}))/(nrow(seqtab.nochim) * ncol(seqtab.nochim))
## [1] 0.3521552
## Checking the distribution of reads counts
hist(log2(seqtab.nochim[seqtab.nochim > 0]), breaks = 100,
col = 'cornflowerblue', las = 1, main = 'Distribution of reads counts')
quantile(seqtab.nochim[seqtab.nochim > 0])
## 0% 25% 50% 75% 100%
## 2 10 20 54 3488
quantile(seqtab.nochim[seqtab.nochim > 0], 0.05)
## 5%
## 4
abline(v = log2(5),lty = 2)
## Barplot investigating the number of consensus sequence detected
barplot(apply(seqtab.nochim, 1, function(x){length(which( x > 4))}),
las = 2, ylab = 'Number of consensus sequence')
abline (h = 20, lty = 2)
## Accuracy evaluation by using mock sequence
## Loading the reference seqence for Mock sample
mock_seq <- read.table('mock.sequence.tab.txt', header = T)
seqtab.nochim_mock <- seqtab.nochim[20,]
seqtab.nochim_seq <- names(seqtab.nochim_mock)[which(seqtab.nochim_mock > 4)]
## Printing out the number of consensus sequence in Mock sample match to reference sequences
sum( sapply(seqtab.nochim_seq, function(s){any( grep(s, mock_seq$sequence) )} ) )
## [1] 20
Shannon and Simpson diversity
##Matrix generation for sample information
daysAfterWeaning <- sapply(strsplit(rownames(seqtab.nochim)[1:19], 'D'), function(x){
x[2]
})
samdf <- data.frame(id = rownames(seqtab.nochim)[1:19],
daysAfterWeaning = as.numeric(daysAfterWeaning))
samdf$col <- 'red'
samdf$col[samdf$daysAfterWeaning > 100] <- 'navy'
##Function for Shannon diversity
#input should be samples x taxonomy
shannonDiversity <- function(df)
{
p <- t( apply(df, 1, function(x){x/sum(x)}) )
H <- apply(p , 1, function(x){x <- x[x > 0]; -sum( log(x) * x )})
return(H)
}
##Function for Simpson diversity
SimpsonDiversity <- function(df)
{
p <- t( apply(df, 1, function(x){x/sum(x)}) )
H <- apply(p , 1, function(x){x <- x[x > 0];1-sum( (x ^ 2))})
return(H)
}
shD <- shannonDiversity(seqtab.nochim[1:19, ])
siD <- SimpsonDiversity(seqtab.nochim[1:19, ])
par(mfrow = c(1,2))
plot(samdf$daysAfterWeaning,shD, col = samdf$col, pch = 19 ,las = 1,
xlab = 'Days', ylab = 'Shannon Diversity')
plot(samdf$daysAfterWeaning,siD, col = samdf$col, pch = 19 ,las = 1,
xlab = 'Days', ylab = 'Simpson Diversity')
Projecting the samples on 2-D plot by using PCA
## Transform data to proportions as appropriate for Bray-Curtis distance
ps.prop <- t(apply(seqtab.nochim,1,function(x){x/sum(x)}))
BC_dis <- function(prop)
{
m <- matrix(, nrow(prop), nrow(prop))
for(i in 1:nrow(prop))
{
for(j in 1:nrow(prop))
{
m[i,j] <- 1 - sum( apply(prop[c(i,j),],2,min))
}
}
return(m)
}
## Bray-Curtis distance calculation
ps.BC_dis<- BC_dis (ps.prop[1:19, ])
ps.PCA <- prcomp(ps.BC_dis,scale=F )
plot(ps.PCA$rotation[,1],
ps.PCA$rotation[,2],
bg = samdf$col,pch = 21,cex = 2,
las = 1,xlab = 'PCA_direction1',
ylab = 'PCA_direction2')