ASV_initial_analysis

Sparsity detection and accuracy evaluation

## Checking how sparse the ASV is
sum(apply(seqtab.nochim, 2, function(x){x > 0}))/(nrow(seqtab.nochim) * ncol(seqtab.nochim))

## [1] 0.3521552

## Checking the distribution of reads counts
hist(log2(seqtab.nochim[seqtab.nochim > 0]), breaks = 100,
     col = 'cornflowerblue', las = 1, main = 'Distribution of reads counts')
quantile(seqtab.nochim[seqtab.nochim > 0])

##   0%  25%  50%  75% 100% 
##    2   10   20   54 3488

quantile(seqtab.nochim[seqtab.nochim > 0], 0.05)

## 5% 
##  4

abline(v = log2(5),lty = 2)

## Barplot investigating the number of consensus sequence detected
barplot(apply(seqtab.nochim, 1, function(x){length(which( x > 4))}),
        las = 2, ylab = 'Number of consensus sequence')
abline (h = 20, lty = 2)

## Accuracy evaluation by using mock sequence
## Loading the reference seqence for Mock sample
mock_seq <- read.table('mock.sequence.tab.txt', header = T)
seqtab.nochim_mock <- seqtab.nochim[20,]
seqtab.nochim_seq  <- names(seqtab.nochim_mock)[which(seqtab.nochim_mock > 4)]
## Printing out the number of consensus sequence in Mock sample match to reference sequences
sum( sapply(seqtab.nochim_seq, function(s){any( grep(s, mock_seq$sequence) )} ) )

## [1] 20

Shannon and Simpson diversity

##Matrix generation for sample information 
daysAfterWeaning <- sapply(strsplit(rownames(seqtab.nochim)[1:19], 'D'), function(x){
                           x[2]   
                    })
samdf <- data.frame(id = rownames(seqtab.nochim)[1:19],
                    daysAfterWeaning = as.numeric(daysAfterWeaning))
samdf$col <- 'red'
samdf$col[samdf$daysAfterWeaning > 100] <- 'navy'

##Function for Shannon diversity 
#input should be samples x taxonomy
shannonDiversity <- function(df) 
{
     p <- t( apply(df, 1, function(x){x/sum(x)}) )
     H <- apply(p , 1, function(x){x <- x[x > 0]; -sum( log(x) * x )})
     return(H)
}

##Function for Simpson diversity
SimpsonDiversity <- function(df) 
{
     p <- t( apply(df, 1, function(x){x/sum(x)}) )
     H <- apply(p , 1, function(x){x <- x[x > 0];1-sum( (x ^ 2))})
     return(H)
}
shD <- shannonDiversity(seqtab.nochim[1:19, ])
siD <- SimpsonDiversity(seqtab.nochim[1:19, ])
par(mfrow = c(1,2))
plot(samdf$daysAfterWeaning,shD, col = samdf$col, pch = 19 ,las = 1, 
     xlab = 'Days', ylab = 'Shannon Diversity')
plot(samdf$daysAfterWeaning,siD, col = samdf$col, pch = 19 ,las = 1, 
     xlab = 'Days', ylab = 'Simpson Diversity')

Projecting the samples on 2-D plot by using PCA

## Transform data to proportions as appropriate for Bray-Curtis distance
ps.prop <- t(apply(seqtab.nochim,1,function(x){x/sum(x)})) 
BC_dis  <- function(prop) 
{
       m <- matrix(, nrow(prop), nrow(prop))
       for(i in 1:nrow(prop)) 
       {
           for(j in 1:nrow(prop)) 
           {
                 m[i,j] <- 1 - sum( apply(prop[c(i,j),],2,min))
           }  
       } 
       return(m)
}

## Bray-Curtis distance calculation
ps.BC_dis<- BC_dis (ps.prop[1:19, ])
ps.PCA   <- prcomp(ps.BC_dis,scale=F )
plot(ps.PCA$rotation[,1],
     ps.PCA$rotation[,2], 
     bg = samdf$col,pch = 21,cex = 2,
     las = 1,xlab = 'PCA_direction1',
     ylab = 'PCA_direction2')

ASV_initial_analysis

Chengqi(Charley) Wang

11/29/2020

Data loading

Here, we are going to load the ASV table with the row indicating the sampleID. The column is the consensus sequence. The file name is asv.table.txt

Sparsity detection and accuracy evaluation

Shannon and Simpson diversity

Projecting the samples on 2-D plot by using PCA