knitr::opts_chunk$set(echo=TRUE, root.dir = "../")
#knitr::opts_knit$set(root.dir = "../")

Install packages from source script

source("Rsource/Hub_microbiome_workshop_source.R")
library(tidyverse)
library(microbiome)
library(ggpubr)

Load the input files

infile_counts<-"Rdata/kaiju_species_preterm.tsv"
infile_meta<-"Rdata/meta_data.tsv"

df_species<-read.delim(infile_counts)
df_meta<-read.delim(infile_meta)

Munge the data into a usable form

##The sample info is in the "file" column. Let's simplify it to just the sample name
file_path<-"/work/m/mcmindsr/outputs/Metagenomics_workshop_USF/full/04_kaiju_reads/"
file_ext<-"_kaiju_taxonomy.out"

df_species$file<-gsub(file_path,"",df_species$file,fixed=T)
df_species$file<-gsub(file_ext,"",df_species$file,fixed=T)

##Change file to "Sample" to match the meta data
colnames(df_species)[1]<-"Sample"

##Convert the count data to matrix format
df_counts<-df_species %>% pivot_wider(id_cols=Sample,names_from=taxon_name,values_from=reads)
df_counts<-df_counts %>% replace(is.na(.),0)
##Need to replace "missing" values with zero
m_counts<-as.matrix(df_counts[,2:ncol(df_counts)])
rownames(m_counts)<-df_counts$Sample

##Create a taxonomy table from the columns in m_counts
df_taxa<-as_tibble(colnames(m_counts))
##Remove duplicated taxa
df_taxa<-df_taxa %>% separate(value,sep="[;]",
                                  into=c("Domain","Kingdom","Phylum",
                                         "Class","Order",
                                         "Family","Genus",
                                         "Species"),
                                  remove=T)
## Warning: Expected 8 pieces. Additional pieces discarded in 2981 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 1873 rows [104,
## 117, 148, 212, 228, 243, 244, 259, 285, 298, 350, 369, 370, 396, 419, 533, 541,
## 581, 601, 614, ...].
##Convert taxa table to a matrix so we can use it to create a phyloseq object
m_taxa<-as.matrix(df_taxa)
rownames(m_taxa)<-colnames(m_counts)
##First make sure the data is in the order expected
m_counts<-m_counts[df_meta$Sample,]
rownames(df_meta)<-df_meta$Sample

##Since this is a complicated dataset lets focus on the first time group for now
df_meta<-df_meta %>% filter(Time_Group==3)
m_counts<-m_counts[which(rownames(m_counts) %in% df_meta$Sample),]
all(rownames(m_counts)==df_meta$Sample)
## [1] TRUE
##We shouldn't have to sort the taxa, but we will check it to be safe
all(colnames(m_counts)==rownames(m_taxa))
## [1] TRUE
pseq<-phyloseq(otu_table(m_counts,taxa_are_rows = F),
               tax_table(m_taxa),
               sample_data(df_meta))

View(otu_table(pseq))
View(microbiome::meta(pseq))
View(tax_table(pseq))

Create composition plot

##First normalize the data
pseq.rel<-microbiome::transform(pseq,"compositional")
pseq.rel.species<-aggregate_taxa(pseq.rel,level="Species")
##Remove rare species
pseq.species.rel.core<-core(pseq.rel.species,detection = 0.01,prevalence = 0.1)
pseq.species.rel.core<-subset_taxa(pseq.species.rel.core,Species !="Unknown")

comp_plot<-plot_composition(pseq.species.rel.core,group_by="Group")+
  guides(fill=guide_legend(ncol=1))+
  labs(x = "Samples", y = "Relative abundance",
       title = "Relative abundance data")
print(comp_plot)

Compare alpha diversity between groups

tab_shannon<-microbiome::diversity(pseq,index="shannon")
##Lets add the Shannon diversity to our meta data dataframe to plot the results
df_meta$Shannon<-unlist(tab_shannon)

##Create a plot comparing diverity based on antibiotic exposure
plot_div<-ggviolin(df_meta,x="Group",y="Shannon",add="boxplot",fill="Group")

##Add stats (Wilcoxon test)
comparisons<-list(c("Early and subsequent abx","Early only abx" ),
                  c("Early and subsequent abx","Term"),
                  c("Early only abx","Term"))

plot_div<-plot_div+stat_compare_means(comparisons=comparisons,method="wilcox.test")
print(plot_div)

Compare beta diversity

##Result is sensitive to sample size. Recommended to subsample or bootstrap to
##avoid bias
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
##Calculate the divergences within the diet groups
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)
## `set.seed(8675309)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(8675309); .Random.seed` for the full vector
## ...
## 3348OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
##Calculate beta diversity with respect to the median taxa
##profile for each group
b_early_and_sub<-as.numeric(divergence(x=subset_samples(pseq.rarefied,Group=="Early and subsequent abx"),
                               y=apply(abundances(subset_samples(pseq.rarefied,Group=="Early and subsequent abx")),1,median),method="bray"))

b_early_only<-as.numeric(divergence(x=subset_samples(pseq.rarefied,Group=="Early only abx"),
                                    y=apply(abundances(subset_samples(pseq.rarefied,Group=="Early only abx")),1,median),method="bray"))

b_term<-as.numeric(divergence(x=subset_samples(pseq.rarefied,Group=="Term"),
                              y=apply(abundances(subset_samples(pseq.rarefied,Group=="Term")),1,median),method="bray"))

##Lets create a data frame of the beta diversity calculations to facilitate analyzing the results
df_beta_early_and_sub<-data.frame(Bray_Dist=b_early_and_sub,Group="Early and subsequent abx")
df_beta_early_only<-data.frame(Bray_Dist=b_early_only, Group="Early only abx")
df_beta_term<-data.frame(Bray_Dist=b_term,Group="Term")

df_beta<-bind_rows(df_beta_early_and_sub,df_beta_early_only,df_beta_term)

plot_beta<-ggviolin(df_beta,x="Group",y="Bray_Dist",add="boxplot",fill="Group")

##Add stats (Wilcoxon test)
comparisons<-list(c("Early and subsequent abx","Early only abx" ),
                  c("Early and subsequent abx","Term"),
                  c("Early only abx","Term"))

plot_beta<-plot_beta+stat_compare_means(comparisons=comparisons,method="wilcox.test")
print(plot_beta)
## Warning in wilcox.test.default(c(0.597668091721726, 0.461795569374271,
## 0.223940924990284, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.597668091721726, 0.461795569374271,
## 0.223940924990284, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.370712742980562, 0.572872570194384,
## 0.371576673866091, : cannot compute exact p-value with ties

Sample ordination

library(vegan) #Perform statistics on ordination results
##Rarefying data is inappropriate for differential abundance testing,
##but rarefying gives the best results when comparing sample similarity
pseq.rarefied<-rarefy_even_depth(pseq,rngseed=8675309,replace=F)
## `set.seed(8675309)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(8675309); .Random.seed` for the full vector
## ...
## 3348OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
##First lets plot the sample ordination
ord_plot<-plot_landscape(pseq.rarefied,method="NMDS",distance = "bray",col="Group")+labs(title="NMDS/Bray-Curtis")
print(ord_plot)

##We will use a PERMONVA to see if there are statistically distinguishable communties
dist<-vegdist(otu_table(pseq.rarefied),method="bray")

##Check if the variances are equal between the groups. If not assumtion for PERMOVNA not met
print(anova(betadisper(dist,microbiome::meta(pseq.rarefied)$Group)))
## Analysis of Variance Table
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq F value Pr(>F)
## Groups     2 0.05707 0.028537  2.0624 0.1335
## Residuals 84 1.16231 0.013837
##For this data the assumtion is not met, but here's the code
permonva_results<-adonis(otu_table(pseq.rarefied)~Group,data=microbiome::meta(pseq.rarefied),
                         permutations = 100,method="bray")
print(permonva_results)
## 
## Call:
## adonis(formula = otu_table(pseq.rarefied) ~ Group, data = microbiome::meta(pseq.rarefied),      permutations = 100, method = "bray") 
## 
## Permutation: free
## Number of permutations: 100
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)   
## Group      2     1.164 0.58198  3.8337 0.08364 0.009901 **
## Residuals 84    12.752 0.15181         0.91636            
## Total     86    13.916                 1.00000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1