Microbiome workshop: Data analysis and visualization

knitr::opts_chunk$set(echo=TRUE, root.dir = "../")
#knitr::opts_knit$set(root.dir = "../")

source("Rsource/Hub_microbiome_workshop_source.R")
library(microbiome)
library(ggplot2)
library(ggpubr)
library(knitr)

Create a phyloseq object from DADA2 output

infile_asv_counts<-"Rdata/demo_asv_counts.tsv"
infile_asv_tax<-"Rdata/demo_asvs_taxonomy.tsv"
infile_sample_data<-"Rdata/made_up_sample_data.tsv"

df_asv_counts<-read.delim(infile_asv_counts)
df_asv_tax<-read.delim(infile_asv_tax)
df_sample_data<-read.delim(infile_sample_data)
#Convert the data to matrix format
m_asv_counts<-as.matrix(df_asv_counts[2:length(df_asv_counts)])
rownames(m_asv_counts)<-df_asv_counts[,1]

m_asv_tax<-as.matrix(df_asv_tax[2:length(df_asv_tax)])
rownames(m_asv_tax)<-df_asv_tax[,1]

##There are 3 possible components to a phyloseq object: otu_table, sample_data, tax_table

pseq<-phyloseq(otu_table(m_asv_counts,taxa_are_rows = T),tax_table(m_asv_tax),sample_data(df_sample_data))


##Lets look at the data in our phyloseq object
##The functions used to create the components of a phyloseq object
##can be used to view the components of a phyloseq object
View(otu_table(pseq)@.Data)
View(microbiome::meta(pseq))
View(tax_table(pseq)@.Data)

Read in a dataset with interesting results and perform some initial processing

infile<-"Rdata/otu_table.biom"
infile_meta<-"Rdata/meta_data.csv"


pseq<-read_biom2phyloseq(infile,metadata.file = infile_meta)

####Let's filter out some of the samples to simplify the analysis####
pseq<-subset_samples(pseq,group_name=="Without.comp.food" & diet !="Experimental infant formula")
####Summarize data at the Genus level. Not required, but convienent####
pseq<-aggregate_taxa(pseq,level="Genus")

Inspect the phyloseq object

View(otu_table(pseq)@.Data)
View(microbiome::meta(pseq)) 
#Alternative method View(sample_data(pseq))
View(tax_table(pseq)@.Data)

Calculate alpha diversity from milk study

Remember alpha diversity is calculated within a sample so normalization is not neccessary

tab<-microbiome::diversity(pseq,index="all")
kable(tab)

	inverse_simpson	gini_simpson	shannon	fisher	coverage
12021.BF.31.2	1.461086	0.3155776	0.7334518	2.764950	1
12021.BF.35.2	3.239409	0.6913017	1.4592188	4.316774	2
12021.BF.36.2	1.209815	0.1734276	0.4126838	1.926832	1
12021.BF.40.2	3.314990	0.6983399	1.5670103	3.967838	2
12021.BP.14.2	1.316280	0.2402834	0.6508000	4.193167	1
12021.BP.39.2	1.347396	0.2578277	0.6238770	3.716607	1
12021.BP.39.3	1.394276	0.2827821	0.7055823	3.781214	1
12021.EP.44.2	3.365172	0.7028384	1.5589336	4.433606	2
12021.BF.24.2	2.526996	0.6042732	1.1008349	3.514795	1
12021.EF.72.2	2.496498	0.5994389	1.3141284	3.934783	1
12021.EP.73.2	1.268385	0.2115956	0.5902808	3.367423	1
12021.EF.43.2	2.890154	0.6539977	1.6377025	5.441104	1
12021.EF.18.2	4.096520	0.7558903	1.7336332	4.706827	2
12021.EF.18.3	4.506792	0.7781127	2.0434085	4.501478	2
12021.EF.3.2	3.236652	0.6910388	1.7216149	5.177305	1
12021.EP.21.2	3.109226	0.6783766	1.5413540	4.758156	2
12021.BF.2.3	3.907144	0.7440586	1.8177519	2.780483	2
12021.BF.32.2	2.844232	0.6484113	1.4098474	4.108981	1
12021.BP.14.3	2.506362	0.6010153	1.5259038	3.992907	1
12021.BF.13.3	3.113530	0.6788211	1.4755282	4.222052	2
12021.BF.14.2	1.815080	0.4490600	1.0369118	4.848389	1
12021.BF.27.3	2.983619	0.6648365	1.5698604	4.327041	1
12021.BP.13.2	3.842460	0.7397500	1.7151828	3.688179	2
12021.BP.30.2	1.130083	0.1151096	0.2888944	2.724956	1
12021.BP.36.2	1.636226	0.3888376	0.9279660	3.347275	1
12021.BP.36.3	2.829877	0.6466278	1.6325278	5.011669	1
12021.EF.34.2	3.199228	0.6874246	1.4182747	5.132423	2
12021.EF.43.3	2.772195	0.6392750	1.5234099	4.756638	1
12021.EF.56.2	5.024868	0.8009898	1.9333194	4.758426	2
12021.EP.11.2	5.512141	0.8185823	1.9482030	5.340070	3
12021.EP.11.3	4.333263	0.7692270	1.8614695	5.052094	2
12021.EP.21.3	4.868894	0.7946145	1.9279716	5.046062	2
12021.EP.39.3	3.389778	0.7049954	1.6301370	4.516335	2
12021.EP.57.2	2.777151	0.6399188	1.4065867	4.547713	1
12021.EF.75.3	2.591677	0.6141494	1.2782498	4.530516	1
12021.EP.2.2	2.673321	0.6259335	1.4773889	5.037734	1
12021.EP.29.2	3.633452	0.7247796	1.6710140	6.401069	2
12021.EP.70.3	2.891594	0.6541699	1.3999026	5.194287	1
12021.BP.31.2	2.657929	0.6237672	1.5945862	4.517235	1
12021.BF.23.2	3.931692	0.7456566	1.7785290	4.712511	2
12021.BP.22.2	1.160414	0.1382388	0.3943185	3.369552	1
12021.EP.39.2	3.954261	0.7471082	1.8022514	4.693744	2
12021.EF.54.2	3.140337	0.6815628	1.5221985	6.326166	2
12021.BP.31.3	2.533217	0.6052450	1.4851213	4.360590	1
12021.EF.63.2	2.697340	0.6292644	1.5830571	5.767317	1
12021.EF.63.3	1.693944	0.4096617	1.1229086	5.111896	1
12021.EP.2.3	1.601809	0.3757060	0.7573976	2.181124	1
12021.BP.23.3	1.319199	0.2419644	0.5438181	2.890437	1
12021.EF.75.2	2.096718	0.5230642	1.2058485	5.514221	1
12021.BF.13.2	3.902346	0.7437439	1.8001431	5.024132	2
12021.BF.7.2	2.349271	0.5743361	1.4671624	4.989032	1
12021.EF.48.2	3.196198	0.6871283	1.3842727	4.304816	2
12021.EF.48.3	5.915751	0.8309598	2.0115561	4.525398	3
12021.BP.30.3	1.574503	0.3648789	0.8245899	3.066261	1
12021.EF.13.2	1.877121	0.4672692	1.1608239	4.293382	1
12021.BP.10.2	2.101399	0.5241266	1.0757043	2.905318	1
12021.BP.23.2	1.467612	0.3186209	0.6800216	2.939048	1
12021.BF.9.2	1.444252	0.3076002	0.7877636	3.005226	1
12021.BP.4.2	2.776562	0.6398424	1.5383686	3.706074	1
12021.EP.70.2	3.135717	0.6810936	1.4509156	3.904831	2
12021.BF.27.2	1.304069	0.2331691	0.6358506	3.651137	1
12021.EF.1.2	2.001443	0.5003606	1.0494527	3.414295	1
12021.BP.8.3	1.375470	0.2729759	0.6691807	3.261206	1
12021.EP.23.2	2.759147	0.6375691	1.4449632	4.823316	1
12021.BF.2.2	2.052458	0.5127794	1.1499245	2.621351	1
12021.BP.8.2	1.882113	0.4686823	1.0043024	4.435313	1

##Let's create a dataframe for plotting the results
df_milk_meta<-microbiome::meta(pseq)
##Add Shannon divesity data to the dataframe
df_milk_meta$Shannon<-tab$shannon

##Let's create a plot comparing the diversity between breast milk and formula
plot_div<-ggviolin(df_milk_meta,x="diet",y="Shannon",
             add="boxplot",fill="diet")
##Add stats (Wilcoxon test)
plot_div<-plot_div+stat_compare_means(comparisons = list(c("Breast milk","Standard infant formula")),
                          method="wilcox.test")
print(plot_div)

##Diversity is higher in the standard infant formula group

Composition plot

##Compute the relative level for each taxa
pseq.rel<-microbiome::transform(pseq,"compositional")
##Remove rare OTUs
pseq.rel.core<-core(pseq.rel,detection = 0,prevalence = 0.5)
##aggregate taxa to the class level (so data will fit on graph)
pseq.rel.core.family<-aggregate_taxa(pseq.rel.core,level="Family")

##Recommend to save as 10X10
comp_plot<-plot_composition(pseq.rel.core.family,group_by="diet")+
  guides(fill=guide_legend(ncol=1))+
    labs(x = "Samples", y = "Relative abundance",
     title = "Relative abundance data")
print(comp_plot)

####Beta diversity and microbiome divergence

##Result is sensitive to sample size. Recommended to subsample or bootstrap to
##avoid bias
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
##Calculate the divergences within the diet groups
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)

##Calculate beta diversity with respect to the median taxa
##profile for each group
b.breast_milk<-as.numeric(divergence(x=subset_samples(pseq.rarefied,diet=="Breast milk"),y=apply(abundances(subset_samples(pseq.rarefied,diet=="Breast milk")),1,median),method="bray"))

b.formula<-as.numeric(divergence(x=subset_samples(pseq.rarefied,diet=="Standard infant formula"),y=apply(abundances(subset_samples(pseq.rarefied,diet=="Standard infant formula")),1,median),method="bray"))

#b.formula<-divergence(subset_samples(pseq.rarefied,diet="Standard infant formula"))
boxplot(list(Breast_Milk=b.breast_milk,Formula=b.formula))

wilcox.test(x=b.breast_milk,y=b.formula,alternative = "less")

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b.breast_milk and b.formula
## W = 181, p-value = 1.651e-06
## alternative hypothesis: true location shift is less than 0

##The breast milk feed infants have more similar microbiomes to each other
##than the forumla fed infants to do each other

####Sample ordination

library(vegan) #Perform statistics on ordination results
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
##Calculate the divergences within the diet groups
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)

##First lets plot the sample ordination
 ord_plot<-plot_landscape(pseq.rarefied,method="PCoA",distance = "bray",col="diet")+labs(title="PCoA/Bray-Curtis")
 print(ord_plot)

##We will use a PERMONVA to see if there are statistically distinguishable communties 
dist<-vegdist(t(otu_table(pseq.rarefied)),method="bray")

##Check if the variances are equal between the groups. If not assumtion for PERMOVNA not met
print(anova(betadisper(dist,microbiome::meta(pseq.rarefied)$diet)))

## Analysis of Variance Table
## 
## Response: Distances
##           Df  Sum Sq Mean Sq F value Pr(>F)   
## Groups     1 0.48369 0.48369  11.494 0.0012 **
## Residuals 64 2.69320 0.04208                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##For this data the assumtion is not met, but here's the code
permonva_results<-adonis(t(otu_table(pseq.rarefied))~diet,data=microbiome::meta(pseq.rarefied),
                 permutations = 100,method="bray")
print(permonva_results)

## 
## Call:
## adonis(formula = t(otu_table(pseq.rarefied)) ~ diet, data = microbiome::meta(pseq.rarefied),      permutations = 100, method = "bray") 
## 
## Permutation: free
## Number of permutations: 100
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)   
## diet       1    1.5153 1.51528  8.3636 0.11558 0.009901 **
## Residuals 64   11.5953 0.18118         0.88442            
## Total     65   13.1105                 1.00000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##Show coefficients for the top taxa separating the groups
permonva_result_coefs<-coefficients(permonva_results)
permonva_result_diet<-permonva_result_coefs["diet1",]

top.diet.coef<-permonva_result_diet[rev(order(abs(permonva_result_diet)))[1:5]]
par(mar=c(3,9,2,1)) #Setting margin sizes requires trial and error
barplot(sort(top.diet.coef),horiz=T,las=1,main="Top taxa")

####To use metagenomeSeq need to create a new data object called MRexperiment####

library(metagenomeSeq)
pseq_mrexp<-phyloseq_to_metagenomeSeq(pseq)


###Keep samples with at least 1 count. Keep features with at least 30 count###
pseq_mrexp<-filterData(pseq_mrexp,present=30,depth=1)

###Normalization###
p<-cumNormStat(pseq_mrexp)
pseq_mrexp<-cumNorm(pseq_mrexp,p=p) #Calculate the scaling factors

###Create a design matrix and run statistical model###
phenotype_data<-pData(pseq_mrexp) #Get the metadata associated with this experiment

###Make breast milk the reference group (Reference is the denominator in logFC)###
phenotype_data$diet<-relevel(factor(phenotype_data$diet),"Breast milk")
##A design matrix specicies what comparisons we want to make
##For more on design matrices check this youtube video: https://www.youtube.com/watch?v=CqLGvwi-5Pc
design_matrix<-model.matrix(~1+diet,data=phenotype_data) #Can drop 1, we're just explicitly showing an intercept

feature_fit_model<-fitFeatureModel(obj=pseq_mrexp,mod=design_matrix)

###Identify the differentially abundant microbes###
taxas<-fData(pseq_mrexp)$unique
sig_taxa<-MRcoefs(feature_fit_model,adjustMethod = "fdr",alpha = 0.1,taxa=taxas)
write.csv(sig_taxa,"Routput/sig_taxa_table_metagenomeSeq.csv")
####Graph the results###

##Identify the rows containing the abundance info for the significant taxa
sig_taxa_index<-which(rownames(MRcounts(pseq_mrexp)) %in% rownames(sig_taxa))
##Get the descriptions of the significant taxa
sig_taxa_desc<-rownames(MRcounts(pseq_mrexp))[sig_taxa_index]
##Find data locations for the 2 groups
classIndex<-list(Breast_milk=which(pData(pseq_mrexp)$diet=="Breast milk",arr.ind = T))
classIndex$Infant_formula<-which(pData(pseq_mrexp)=="Standard infant formula",arr.ind = T)


pdf("Rfigs/sig_taxa_metagenomeSeq.pdf")
for(i in seq(1,length(sig_taxa_index))){
  taxa_index<-sig_taxa_index[[i]]
  taxa_desc<-sig_taxa_desc[[i]]
  plotOTU(pseq_mrexp,otu=taxa_index,classIndex = classIndex,main=taxa_desc)
}

dev.off()

## quartz_off_screen 
##                 2

Use DESeq2 to perform differential abundance analysis

library(DESeq2)
sample_data(pseq)$diet<-relevel(factor(sample_data(pseq)$diet), "Breast milk")

##convert data to a format used by DESeq (group by diet)
pseq.deseq<-phyloseq_to_deseq2(pseq,~diet)

##Perform normalization and differential abundance test
pseq.deseq<-DESeq(pseq.deseq,test="Wald",fitType = "parametric")

##Get and filter the results
res<-results(pseq.deseq,cooksCutoff = F,pAdjustMethod = "BH",alpha=0.1)

res<-res[complete.cases(res),] ##Remove any NA results

##Write the results to file
res_table<-cbind(as(res,"data.frame"),as(tax_table(pseq)[rownames(res),],"matrix"))
write.csv(res_table,"Routput/sig_taxa_table_deseq2.csv")

##Graph the results

##Normalize the expersion data
pseq.clr<-microbiome::transform(pseq,"clr")
exp_data_clr<-abundances(pseq.clr)

##Keep only the differentially abundant microbes
exp_data_clr_sig<-exp_data_clr[which(rownames(exp_data_clr) %in% rownames(res)),]
exp_data_clr_sig<-melt(exp_data_clr_sig)
colnames(exp_data_clr_sig)<-c("Taxa","Sample","CLR")

##Extract the  metadata
df_meta<-microbiome::meta(pseq.clr)
df_meta$Sample<-rownames(df_meta)
df_meta<-df_meta %>% dplyr::select(Sample,diet)

df_dam<-left_join(exp_data_clr_sig,df_meta,by="Sample")

##Create the graphs
pdf("Rfigs/graph_deseq2_sig.pdf")
for(taxon in unique(df_dam$Taxa)){

  df_dam_taxon<-df_dam %>% filter(Taxa==taxon)
  plot<-ggplot(df_dam_taxon,aes(x=diet,y=CLR))+geom_dotplot(binaxis = "y",stackdir = "center")
  plot<-plot+ggtitle(taxon)+theme(plot.title = element_text(hjust=0.5))
  print(plot)
}
dev.off()

## quartz_off_screen 
##                 2

##Normalize the expersion data
pseq.norm<-microbiome::transform(pseq,"compositional")
exp_data_norm<-abundances(pseq.norm)

##Keep only the differentially abundant microbes
exp_data_norm_sig<-exp_data_norm[which(rownames(exp_data_norm) %in% rownames(res)),]
exp_data_norm_sig<-melt(exp_data_norm_sig)
colnames(exp_data_norm_sig)<-c("Taxa","Sample","Relative_Abundance")

##Extract the  metadata
df_meta<-microbiome::meta(pseq.norm)
df_meta$Sample<-rownames(df_meta)
df_meta<-df_meta %>% dplyr::select(Sample,diet)

df_dam<-left_join(exp_data_norm_sig,df_meta,by="Sample")

##Create the graphs
pdf("Rfigs/graph_deseq2_sig_ra.pdf")
for(taxon in unique(df_dam$Taxa)){
  
  df_dam_taxon<-df_dam %>% filter(Taxa==taxon)
  plot<-ggplot(df_dam_taxon,aes(x=diet,y=Relative_Abundance))+geom_dotplot(binaxis = "y",stackdir = "center")
  plot<-plot+ggtitle(taxon)+theme(plot.title = element_text(hjust=0.5))
  print(plot)
}
dev.off()

## quartz_off_screen 
##                 2