load the txt count data

df      <- read.table('~/Documents/work/paper_write/genome_training/R_training/vehicle_drug_feature_counts.txt',
                 header = T, sep = '\t', row.names = 1)

df      <- df[,6:9]



check the distribution

par(mfrow = c(2,2), mar = c(4,4,1,1))
print( apply(df, 2, function(x) quantile(as.numeric(x))) )
##      vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam drug_rep2.bam
## 0%                  0                0             0             0
## 25%                14               16            15            18
## 50%                76               77            80            93
## 75%               200              211           228           262
## 100%            42634            34913         46189         45975
log1 <- apply(df, 2, function(x) {hist(log2(x), breaks = 100)})



Pick up the gene with expression bigger than 2^4
idExp <- apply(df, 1, function(x) any(x > 16))
dfExp <- df[idExp, ]
print(dim(dfExp))
## [1] 4542    4



build DESeq class

library(DESeq)
condition<- c('c','c', 't', 't')
cds      <- newCountDataSet( dfExp, condition )



Differential expression gene calling

##normalize the data, extract the size factor for each assay
cds      <- estimateSizeFactors(cds)
head( counts( cds, normalized=TRUE ) )
##               vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam
## PF3D7_1314600        69.870870         54.19117      13.70677
## PF3D7_0209400       313.391404        287.01249     219.30829
## PF3D7_1142200         8.220102         20.07080      11.59803
## PF3D7_0507200        44.183050         45.15931      53.77271
## PF3D7_0806000        83.228537         62.21949      60.09891
## PF3D7_1006800      1231.987848        927.27112    1083.88904
##               drug_rep2.bam
## PF3D7_1314600     22.852766
## PF3D7_0209400    189.220906
## PF3D7_1142200      2.742332
## PF3D7_0507200     39.306758
## PF3D7_0806000     60.331303
## PF3D7_1006800   1082.307015
##estimate the background gene expression 
cds      <- estimateDispersions( cds )

##estimate the background distribution
plotDispEsts(cds)

##p-value fit
res        <- nbinomTest( cds, 'c', 't' )
plotMA(res)



Differential expression gene extraction and visulation

## extract the genes with adjP < 0.0001
resSig = res[ which(res$padj < 0.0001), ]
dim(resSig)
## [1] 412   8
norData <- counts( cds, normalized=TRUE )
sigNorData <- norData[which(res$padj < 0.0001),]
save(sigNorData, resSig,
     file = 'deseq.sig.R')


##extract normalized data
norDF           <- counts( cds, normalized=TRUE )
heatMapDf       <- norDF[(rownames(norDF) %in% resSig$id), ]

##heatmap plot
##plot heatmap
library("RColorBrewer")
library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
hmcol =  colorRampPalette(brewer.pal(9, "GnBu"))(100)
## expression heatmap
heatMapDF_nor <- t( apply(heatMapDf, 1, function(x){(x-mean(x))/sd(x)}) )

colnames(heatMapDF_nor) <- c('control1','control2',
                             'treat1'  , 'treat2')
#pdf('aaaaa.pdf', height = 10, width = 10)
heatmap.2(heatMapDF_nor, col = hmcol, trace="none", margin=c(6, 6),labRow = F)

#dev.off()