requires randomForest package
install.packages('randomForest', repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/nb/cvg7wbc956z89hzlnhvv5hfm0000gp/T//Rtmpu4c6ky/downloaded_packages
library(randomForest)
load the count data and rearrange it into a simple matrix
# without quote="", some lines break because the file has quotation characters within cells
counts_raw <- read.table('data/kaiju_species.tsv', header=T, sep='\t', quote="")
# clean up sample names but stripping all the extra info off of them
counts_raw[,1] <- gsub('_kaiju.*','',gsub('.*/','',counts_raw[,1]))
# transform the data from long format to wide
counts <- reshape(counts_raw, idvar='taxon_id', timevar='file', v.names='reads', direction='wide', drop=c('percent','taxon_name'))
# sample/taxon combinations that had no data in the original long format get NA's in the wide matrix. These are actually zeros, so let's change those
counts[is.na(counts)] <- 0
# make sure the taxon IDs look like character vectors by adding text, just to be safe, and make these rownames
rownames(counts) <- paste0('taxon_id_',counts[,1])
# get rid of the taxon ids from the actual data of the matrix
counts <- counts[,-1]
# clean up the sample names again
colnames(counts) <- sub('reads.','',colnames(counts))
# tell R that the data are all uniformly numeric rather than treating each column as an independent factor
counts <- as.matrix(counts)
# filter out taxa that are extremely rare
counts <- counts[apply(counts,1,function(x) sum(x>5)>5),]
first we need to make our data easier for our human brains to understand, by converting to relative abundance
relabund <- apply(counts,2,function(x) x/sum(x))
run randomForest
res <- randomForest(y=meta$Group, x=t(counts), importance=T, mtry = ceiling(nrow(counts) / 3))
imp <- res$importance
imp <- imp[order(imp[,'MeanDecreaseAccuracy']),]
topdiff <- rownames(imp)[[2]]
relabund_0na <- relabund
relabund_0na[relabund_0na==0] <- NA
merged_rel_0na <- cbind(meta,t(relabund_0na))
plot
beanplot::beanplot(as.formula(paste0(topdiff,' ~ Group')), merged_rel_0na, log='y', las=2, cex.axis=0.5)