knitr::opts_chunk$set(echo=TRUE, root.dir = "../")
#knitr::opts_knit$set(root.dir = "../")
source("Rsource/Hub_microbiome_workshop_source.R")
library(microbiome)
library(ggplot2)
library(ggpubr)
library(knitr)

Create a phyloseq object from DADA2 output

infile_asv_counts<-"Rdata/demo_asv_counts.tsv"
infile_asv_tax<-"Rdata/demo_asvs_taxonomy.tsv"
infile_sample_data<-"Rdata/made_up_sample_data.tsv"

df_asv_counts<-read.delim(infile_asv_counts)
df_asv_tax<-read.delim(infile_asv_tax)
df_sample_data<-read.delim(infile_sample_data)
#Convert the data to matrix format
m_asv_counts<-as.matrix(df_asv_counts[2:length(df_asv_counts)])
rownames(m_asv_counts)<-df_asv_counts[,1]

m_asv_tax<-as.matrix(df_asv_tax[2:length(df_asv_tax)])
rownames(m_asv_tax)<-df_asv_tax[,1]

##There are 3 possible components to a phyloseq object: otu_table, sample_data, tax_table

pseq<-phyloseq(otu_table(m_asv_counts,taxa_are_rows = T),tax_table(m_asv_tax),sample_data(df_sample_data))


##Lets look at the data in our phyloseq object
##The functions used to create the components of a phyloseq object
##can be used to view the components of a phyloseq object
View(otu_table(pseq)@.Data)
View(microbiome::meta(pseq))
View(tax_table(pseq)@.Data)

Read in a dataset with interesting results and perform some initial processing

infile<-"Rdata/otu_table.biom"
infile_meta<-"Rdata/meta_data.csv"


pseq<-read_biom2phyloseq(infile,metadata.file = infile_meta)

####Let's filter out some of the samples to simplify the analysis####
pseq<-subset_samples(pseq,group_name=="Without.comp.food" & diet !="Experimental infant formula")
####Summarize data at the Genus level. Not required, but convienent####
pseq<-aggregate_taxa(pseq,level="Genus")

Inspect the phyloseq object

View(otu_table(pseq)@.Data)
View(microbiome::meta(pseq)) 
#Alternative method View(sample_data(pseq))
View(tax_table(pseq)@.Data)

Calculate alpha diversity from milk study

Remember alpha diversity is calculated within a sample so normalization is not neccessary

tab<-microbiome::diversity(pseq,index="all")
kable(tab)
inverse_simpson gini_simpson shannon fisher coverage
12021.BF.31.2 1.461086 0.3155776 0.7334518 2.764950 1
12021.BF.35.2 3.239409 0.6913017 1.4592188 4.316774 2
12021.BF.36.2 1.209815 0.1734276 0.4126838 1.926832 1
12021.BF.40.2 3.314990 0.6983399 1.5670103 3.967838 2
12021.BP.14.2 1.316280 0.2402834 0.6508000 4.193167 1
12021.BP.39.2 1.347396 0.2578277 0.6238770 3.716607 1
12021.BP.39.3 1.394276 0.2827821 0.7055823 3.781214 1
12021.EP.44.2 3.365172 0.7028384 1.5589336 4.433606 2
12021.BF.24.2 2.526996 0.6042732 1.1008349 3.514795 1
12021.EF.72.2 2.496498 0.5994389 1.3141284 3.934783 1
12021.EP.73.2 1.268385 0.2115956 0.5902808 3.367423 1
12021.EF.43.2 2.890154 0.6539977 1.6377025 5.441104 1
12021.EF.18.2 4.096520 0.7558903 1.7336332 4.706827 2
12021.EF.18.3 4.506792 0.7781127 2.0434085 4.501478 2
12021.EF.3.2 3.236652 0.6910388 1.7216149 5.177305 1
12021.EP.21.2 3.109226 0.6783766 1.5413540 4.758156 2
12021.BF.2.3 3.907144 0.7440586 1.8177519 2.780483 2
12021.BF.32.2 2.844232 0.6484113 1.4098474 4.108981 1
12021.BP.14.3 2.506362 0.6010153 1.5259038 3.992907 1
12021.BF.13.3 3.113530 0.6788211 1.4755282 4.222052 2
12021.BF.14.2 1.815080 0.4490600 1.0369118 4.848389 1
12021.BF.27.3 2.983619 0.6648365 1.5698604 4.327041 1
12021.BP.13.2 3.842460 0.7397500 1.7151828 3.688179 2
12021.BP.30.2 1.130083 0.1151096 0.2888944 2.724956 1
12021.BP.36.2 1.636226 0.3888376 0.9279660 3.347275 1
12021.BP.36.3 2.829877 0.6466278 1.6325278 5.011669 1
12021.EF.34.2 3.199228 0.6874246 1.4182747 5.132423 2
12021.EF.43.3 2.772195 0.6392750 1.5234099 4.756638 1
12021.EF.56.2 5.024868 0.8009898 1.9333194 4.758426 2
12021.EP.11.2 5.512141 0.8185823 1.9482030 5.340070 3
12021.EP.11.3 4.333263 0.7692270 1.8614695 5.052094 2
12021.EP.21.3 4.868894 0.7946145 1.9279716 5.046062 2
12021.EP.39.3 3.389778 0.7049954 1.6301370 4.516335 2
12021.EP.57.2 2.777151 0.6399188 1.4065867 4.547713 1
12021.EF.75.3 2.591677 0.6141494 1.2782498 4.530516 1
12021.EP.2.2 2.673321 0.6259335 1.4773889 5.037734 1
12021.EP.29.2 3.633452 0.7247796 1.6710140 6.401069 2
12021.EP.70.3 2.891594 0.6541699 1.3999026 5.194287 1
12021.BP.31.2 2.657929 0.6237672 1.5945862 4.517235 1
12021.BF.23.2 3.931692 0.7456566 1.7785290 4.712511 2
12021.BP.22.2 1.160414 0.1382388 0.3943185 3.369552 1
12021.EP.39.2 3.954261 0.7471082 1.8022514 4.693744 2
12021.EF.54.2 3.140337 0.6815628 1.5221985 6.326166 2
12021.BP.31.3 2.533217 0.6052450 1.4851213 4.360590 1
12021.EF.63.2 2.697340 0.6292644 1.5830571 5.767317 1
12021.EF.63.3 1.693944 0.4096617 1.1229086 5.111896 1
12021.EP.2.3 1.601809 0.3757060 0.7573976 2.181124 1
12021.BP.23.3 1.319199 0.2419644 0.5438181 2.890437 1
12021.EF.75.2 2.096718 0.5230642 1.2058485 5.514221 1
12021.BF.13.2 3.902346 0.7437439 1.8001431 5.024132 2
12021.BF.7.2 2.349271 0.5743361 1.4671624 4.989032 1
12021.EF.48.2 3.196198 0.6871283 1.3842727 4.304816 2
12021.EF.48.3 5.915751 0.8309598 2.0115561 4.525398 3
12021.BP.30.3 1.574503 0.3648789 0.8245899 3.066261 1
12021.EF.13.2 1.877121 0.4672692 1.1608239 4.293382 1
12021.BP.10.2 2.101399 0.5241266 1.0757043 2.905318 1
12021.BP.23.2 1.467612 0.3186209 0.6800216 2.939048 1
12021.BF.9.2 1.444252 0.3076002 0.7877636 3.005226 1
12021.BP.4.2 2.776562 0.6398424 1.5383686 3.706074 1
12021.EP.70.2 3.135717 0.6810936 1.4509156 3.904831 2
12021.BF.27.2 1.304069 0.2331691 0.6358506 3.651137 1
12021.EF.1.2 2.001443 0.5003606 1.0494527 3.414295 1
12021.BP.8.3 1.375470 0.2729759 0.6691807 3.261206 1
12021.EP.23.2 2.759147 0.6375691 1.4449632 4.823316 1
12021.BF.2.2 2.052458 0.5127794 1.1499245 2.621351 1
12021.BP.8.2 1.882113 0.4686823 1.0043024 4.435313 1
##Let's create a dataframe for plotting the results
df_milk_meta<-microbiome::meta(pseq)
##Add Shannon divesity data to the dataframe
df_milk_meta$Shannon<-tab$shannon

##Let's create a plot comparing the diversity between breast milk and formula
plot_div<-ggviolin(df_milk_meta,x="diet",y="Shannon",
             add="boxplot",fill="diet")
##Add stats (Wilcoxon test)
plot_div<-plot_div+stat_compare_means(comparisons = list(c("Breast milk","Standard infant formula")),
                          method="wilcox.test")
print(plot_div)

##Diversity is higher in the standard infant formula group

Composition plot

##Compute the relative level for each taxa
pseq.rel<-microbiome::transform(pseq,"compositional")
##Remove rare OTUs
pseq.rel.core<-core(pseq.rel,detection = 0,prevalence = 0.5)
##aggregate taxa to the class level (so data will fit on graph)
pseq.rel.core.family<-aggregate_taxa(pseq.rel.core,level="Family")

##Recommend to save as 10X10
comp_plot<-plot_composition(pseq.rel.core.family,group_by="diet")+
  guides(fill=guide_legend(ncol=1))+
    labs(x = "Samples", y = "Relative abundance",
     title = "Relative abundance data")
print(comp_plot)

####Beta diversity and microbiome divergence

##Result is sensitive to sample size. Recommended to subsample or bootstrap to
##avoid bias
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
##Calculate the divergences within the diet groups
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)

##Calculate beta diversity with respect to the median taxa
##profile for each group
b.breast_milk<-as.numeric(divergence(x=subset_samples(pseq.rarefied,diet=="Breast milk"),y=apply(abundances(subset_samples(pseq.rarefied,diet=="Breast milk")),1,median),method="bray"))

b.formula<-as.numeric(divergence(x=subset_samples(pseq.rarefied,diet=="Standard infant formula"),y=apply(abundances(subset_samples(pseq.rarefied,diet=="Standard infant formula")),1,median),method="bray"))

#b.formula<-divergence(subset_samples(pseq.rarefied,diet="Standard infant formula"))
boxplot(list(Breast_Milk=b.breast_milk,Formula=b.formula))

wilcox.test(x=b.breast_milk,y=b.formula,alternative = "less")
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b.breast_milk and b.formula
## W = 181, p-value = 1.651e-06
## alternative hypothesis: true location shift is less than 0
##The breast milk feed infants have more similar microbiomes to each other
##than the forumla fed infants to do each other

####Sample ordination

library(vegan) #Perform statistics on ordination results
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
##Calculate the divergences within the diet groups
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)

##First lets plot the sample ordination
 ord_plot<-plot_landscape(pseq.rarefied,method="PCoA",distance = "bray",col="diet")+labs(title="PCoA/Bray-Curtis")
 print(ord_plot)

##We will use a PERMONVA to see if there are statistically distinguishable communties 
dist<-vegdist(t(otu_table(pseq.rarefied)),method="bray")

##Check if the variances are equal between the groups. If not assumtion for PERMOVNA not met
print(anova(betadisper(dist,microbiome::meta(pseq.rarefied)$diet)))
## Analysis of Variance Table
## 
## Response: Distances
##           Df  Sum Sq Mean Sq F value Pr(>F)   
## Groups     1 0.48369 0.48369  11.494 0.0012 **
## Residuals 64 2.69320 0.04208                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##For this data the assumtion is not met, but here's the code
permonva_results<-adonis(t(otu_table(pseq.rarefied))~diet,data=microbiome::meta(pseq.rarefied),
                 permutations = 100,method="bray")
print(permonva_results)
## 
## Call:
## adonis(formula = t(otu_table(pseq.rarefied)) ~ diet, data = microbiome::meta(pseq.rarefied),      permutations = 100, method = "bray") 
## 
## Permutation: free
## Number of permutations: 100
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)   
## diet       1    1.5153 1.51528  8.3636 0.11558 0.009901 **
## Residuals 64   11.5953 0.18118         0.88442            
## Total     65   13.1105                 1.00000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Show coefficients for the top taxa separating the groups
permonva_result_coefs<-coefficients(permonva_results)
permonva_result_diet<-permonva_result_coefs["diet1",]

top.diet.coef<-permonva_result_diet[rev(order(abs(permonva_result_diet)))[1:5]]
par(mar=c(3,9,2,1)) #Setting margin sizes requires trial and error
barplot(sort(top.diet.coef),horiz=T,las=1,main="Top taxa")

####To use metagenomeSeq need to create a new data object called MRexperiment####

library(metagenomeSeq)
pseq_mrexp<-phyloseq_to_metagenomeSeq(pseq)


###Keep samples with at least 1 count. Keep features with at least 30 count###
pseq_mrexp<-filterData(pseq_mrexp,present=30,depth=1)

###Normalization###
p<-cumNormStat(pseq_mrexp)
pseq_mrexp<-cumNorm(pseq_mrexp,p=p) #Calculate the scaling factors

###Create a design matrix and run statistical model###
phenotype_data<-pData(pseq_mrexp) #Get the metadata associated with this experiment

###Make breast milk the reference group (Reference is the denominator in logFC)###
phenotype_data$diet<-relevel(factor(phenotype_data$diet),"Breast milk")
##A design matrix specicies what comparisons we want to make
##For more on design matrices check this youtube video: https://www.youtube.com/watch?v=CqLGvwi-5Pc
design_matrix<-model.matrix(~1+diet,data=phenotype_data) #Can drop 1, we're just explicitly showing an intercept

feature_fit_model<-fitFeatureModel(obj=pseq_mrexp,mod=design_matrix)

###Identify the differentially abundant microbes###
taxas<-fData(pseq_mrexp)$unique
sig_taxa<-MRcoefs(feature_fit_model,adjustMethod = "fdr",alpha = 0.1,taxa=taxas)
write.csv(sig_taxa,"Routput/sig_taxa_table_metagenomeSeq.csv")
####Graph the results###

##Identify the rows containing the abundance info for the significant taxa
sig_taxa_index<-which(rownames(MRcounts(pseq_mrexp)) %in% rownames(sig_taxa))
##Get the descriptions of the significant taxa
sig_taxa_desc<-rownames(MRcounts(pseq_mrexp))[sig_taxa_index]
##Find data locations for the 2 groups
classIndex<-list(Breast_milk=which(pData(pseq_mrexp)$diet=="Breast milk",arr.ind = T))
classIndex$Infant_formula<-which(pData(pseq_mrexp)=="Standard infant formula",arr.ind = T)


pdf("Rfigs/sig_taxa_metagenomeSeq.pdf")
for(i in seq(1,length(sig_taxa_index))){
  taxa_index<-sig_taxa_index[[i]]
  taxa_desc<-sig_taxa_desc[[i]]
  plotOTU(pseq_mrexp,otu=taxa_index,classIndex = classIndex,main=taxa_desc)
}

dev.off()
## quartz_off_screen 
##                 2

Use DESeq2 to perform differential abundance analysis

library(DESeq2)
sample_data(pseq)$diet<-relevel(factor(sample_data(pseq)$diet), "Breast milk")

##convert data to a format used by DESeq (group by diet)
pseq.deseq<-phyloseq_to_deseq2(pseq,~diet)

##Perform normalization and differential abundance test
pseq.deseq<-DESeq(pseq.deseq,test="Wald",fitType = "parametric")

##Get and filter the results
res<-results(pseq.deseq,cooksCutoff = F,pAdjustMethod = "BH",alpha=0.1)

res<-res[complete.cases(res),] ##Remove any NA results

##Write the results to file
res_table<-cbind(as(res,"data.frame"),as(tax_table(pseq)[rownames(res),],"matrix"))
write.csv(res_table,"Routput/sig_taxa_table_deseq2.csv")

##Graph the results

##Normalize the expersion data
pseq.clr<-microbiome::transform(pseq,"clr")
exp_data_clr<-abundances(pseq.clr)

##Keep only the differentially abundant microbes
exp_data_clr_sig<-exp_data_clr[which(rownames(exp_data_clr) %in% rownames(res)),]
exp_data_clr_sig<-melt(exp_data_clr_sig)
colnames(exp_data_clr_sig)<-c("Taxa","Sample","CLR")

##Extract the  metadata
df_meta<-microbiome::meta(pseq.clr)
df_meta$Sample<-rownames(df_meta)
df_meta<-df_meta %>% dplyr::select(Sample,diet)

df_dam<-left_join(exp_data_clr_sig,df_meta,by="Sample")

##Create the graphs
pdf("Rfigs/graph_deseq2_sig.pdf")
for(taxon in unique(df_dam$Taxa)){

  df_dam_taxon<-df_dam %>% filter(Taxa==taxon)
  plot<-ggplot(df_dam_taxon,aes(x=diet,y=CLR))+geom_dotplot(binaxis = "y",stackdir = "center")
  plot<-plot+ggtitle(taxon)+theme(plot.title = element_text(hjust=0.5))
  print(plot)
}
dev.off()
## quartz_off_screen 
##                 2
##Normalize the expersion data
pseq.norm<-microbiome::transform(pseq,"compositional")
exp_data_norm<-abundances(pseq.norm)

##Keep only the differentially abundant microbes
exp_data_norm_sig<-exp_data_norm[which(rownames(exp_data_norm) %in% rownames(res)),]
exp_data_norm_sig<-melt(exp_data_norm_sig)
colnames(exp_data_norm_sig)<-c("Taxa","Sample","Relative_Abundance")

##Extract the  metadata
df_meta<-microbiome::meta(pseq.norm)
df_meta$Sample<-rownames(df_meta)
df_meta<-df_meta %>% dplyr::select(Sample,diet)

df_dam<-left_join(exp_data_norm_sig,df_meta,by="Sample")

##Create the graphs
pdf("Rfigs/graph_deseq2_sig_ra.pdf")
for(taxon in unique(df_dam$Taxa)){
  
  df_dam_taxon<-df_dam %>% filter(Taxa==taxon)
  plot<-ggplot(df_dam_taxon,aes(x=diet,y=Relative_Abundance))+geom_dotplot(binaxis = "y",stackdir = "center")
  plot<-plot+ggtitle(taxon)+theme(plot.title = element_text(hjust=0.5))
  print(plot)
}
dev.off()
## quartz_off_screen 
##                 2