Data_normalization_FPKM

load the data

## Point the session at the training directory so that the relative file
## names used for reading (here) and writing (later) resolve against it.
setwd('~/Documents/work/paper_write/genome_training/R_training/')

## Read the tab-separated feature-count table. The first line is the header
## and the first column supplies the row names.
## TRUE is spelled out because the shorthand `T` is an ordinary variable
## that can be reassigned.
df <- read.table('vehicle_drug_feature_counts.txt',
                 header = TRUE, sep = '\t', row.names = 1)

data normalization (FPKM)

## Columns 6 to 9 are used as the per-sample read counts
## (NOTE(review): taken from the column positions only -- confirm that
## these are the four count columns of the input table).
dfReads <- df[, 6:9]
readMat <- as.matrix(dfReads)

## FPKM = count / Length * 10^9 / (total count of the sample).
## colSums() always returns doubles, so a sample whose integer counts add
## up to more than .Machine$integer.max does not overflow to NA the way
## sum() on an integer column does.
libSize <- colSums(readMat)
## readMat / df$Length recycles Length down every column (one value per
## row); sweep() then divides each column by its own library size.
fpkm    <- sweep(readMat / df$Length * 10^9, 2, libSize, '/')

## change the column names
colnames(fpkm) <- c('c1', 'c2', 't1', 't2')

## save the data as a tab-separated table, keeping row and column names
write.table(fpkm, file = 'vehicle_drug_feature_counts.fpkm.txt',
            row.names = TRUE, col.names = TRUE, sep = '\t',
            quote = FALSE)

compare the expression by scatter plot (biological replicates)

## raw values: first fpkm column against the second
## (the default axis labels are taken from the argument expressions, so the
## expressions are written out in full for the unlabelled plots)
plot(x = fpkm[, 1], y = fpkm[, 2])

## same comparison on the log2 scale
plot(x = log2(fpkm[, 1]), y = log2(fpkm[, 2]))

## change the point colour: pch 19 is a solid circle drawn in `col`
plot(x = log2(fpkm[, 1]), y = log2(fpkm[, 2]),
     pch = 19, col = 'red')

## pch 21 is a circle with a separate border and fill; `bg` sets the fill
plot(x = log2(fpkm[, 1]), y = log2(fpkm[, 2]),
     pch = 21, bg = 'red')

## change the labels
## (axis labels are given explicitly from here on, so the log2 values can
## be computed once and reused)
log2Rep1 <- log2(fpkm[, 1])
log2Rep2 <- log2(fpkm[, 2])
plot(log2Rep1, log2Rep2,
     pch = 21, bg = 'red',
     xlab = 'log2(control1_FPKM)',
     ylab = 'log2(control2_FPKM)')

## change the direction of the numbers on the y-axis
## (las = 1 draws all axis tick labels horizontally)
plot(log2Rep1, log2Rep2,
     pch = 21, bg = 'red', las = 1,
     xlab = 'log2(control1_FPKM)',
     ylab = 'log2(control2_FPKM)')
## add the regression line to the current plot
## log2() of a zero value is -Inf, so keep only the rows where both log2
## values are finite before fitting
logX <- log2(fpkm[, 1])
logY <- log2(fpkm[, 2])
id   <- which(is.finite(logX) & is.finite(logY))

## number of rows before filtering
nrow(fpkm)

## [1] 5712

## number of rows with finite values in both columns
length(id)

## [1] 5023

## fit column 2 on column 1 once and reuse the fit for both lines
fit <- lm(logY[id] ~ logX[id])
abline(fit)
## change the width of the line
abline(fit, col = 'black', lwd = 3)

use ggplot2 to draw scatter plot

library(ggplot2)
## Two-column data frame holding the first two fpkm columns as plain
## numeric vectors, in the form ggplot2 expects.
fpkm_c <- data.frame(c1 = as.numeric(fpkm[, 1]),
                     c2 = as.numeric(fpkm[, 2]))

## Base plot shared by both variants below.
fpkm_scatter <- ggplot(fpkm_c, aes(x = c1, y = c2))

## Points plus a linear-model fit without the confidence band.
## The formula is given explicitly so ggplot2 does not have to guess it
## (and report the guess); FALSE is spelled out because `F` can be
## reassigned.
fpkm_scatter +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE)

## with other color
fpkm_scatter +
  geom_point(colour = 'red') +
  geom_smooth(method = 'lm', formula = y ~ x, se = FALSE, colour = 'navy')

use ggplot2 to draw scatter plot with 2d density estimation

## log2-transform both columns, then draw the points with 2D density
## contours on top. Non-finite values produced by the transform are
## dropped by the density layer with a warning.
log2_fpkm_c <- log2(fpkm_c)
ggplot(data = log2_fpkm_c, mapping = aes(x = c1, y = c2)) +
  geom_point() +
  geom_density_2d()

## Warning: Removed 689 rows containing non-finite values (stat_density2d).

Data_normalization_FPKM_Scatter Plot

Chengqi(Charley) Wang

2/3/2020

load the data

data normalization (FPKM)

compare the expression by scatter plot (biological replicates)

use ggplot2 to draw scatter plot

use ggplot2 to draw scatter plot with 2d density estimation