Load the RNA-Seq data
##set your working directory and load the data
setwd('~/Documents/work/paper_write/genome_training/R_training/')
df <- read.table('vehicle_drug_feature_counts.txt',
header = T, sep = '\t', row.names = 1)
summary(df)
## Chr Start End Strand
## Pf3D7_14_v3: 802 Min. : 1 Min. : 33 -:2816
## Pf3D7_13_v3: 730 1st Qu.: 406746 1st Qu.: 410100 +:2896
## Pf3D7_12_v3: 549 Median : 831132 Median : 832578
## Pf3D7_11_v3: 503 Mean : 971100 Mean : 973547
## Pf3D7_10_v3: 408 3rd Qu.:1336580 3rd Qu.:1338937
## Pf3D7_09_v3: 378 Max. :3290953 Max. :3291501
## (Other) :2342
## Length vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam
## Min. : 20 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 963 1st Qu.: 14.0 1st Qu.: 16 1st Qu.: 15.0
## Median : 1606 Median : 76.0 Median : 77 Median : 80.0
## Mean : 2447 Mean : 268.3 Mean : 274 Mean : 250.5
## 3rd Qu.: 2967 3rd Qu.: 200.0 3rd Qu.: 211 3rd Qu.: 228.0
## Max. :30864 Max. :42634.0 Max. :34913 Max. :46189.0
##
## drug_rep2.bam
## Min. : 0.0
## 1st Qu.: 18.0
## Median : 93.0
## Mean : 278.4
## 3rd Qu.: 262.0
## Max. :45975.0
##
df <- df[,6:9]
Save the data
write.table(df,
file = 'vehicle_drug_feature_counts.copy.txt.txt', #either a character string naming a file or a connection open for writing. "" indicates output to the console.
row.names = T, #indicating whether the row names of df are to be written
col.names = T, #indicating whether the column names of df are to be written
sep = '\t', #the field separator string
quote = F #a logical value (TRUE or FALSE) or a numeric vector. If TRUE, any character or factor columns will be surrounded by double quotes.
)
###load again
df2 <- read.table('vehicle_drug_feature_counts.copy.txt.txt',header = T, sep = '\t', row.names = 1)
head(df2)
## vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam
## PF3D7_0621350 0 7 3
## PF3D7_1314600 68 54 13
## PF3D7_0209400 305 286 208
## PF3D7_1142200 8 20 11
## PF3D7_0507200 43 45 51
## PF3D7_0806000 81 62 57
## drug_rep2.bam
## PF3D7_0621350 4
## PF3D7_1314600 25
## PF3D7_0209400 207
## PF3D7_1142200 3
## PF3D7_0507200 43
## PF3D7_0806000 66
‘For’ loop
df <- as.matrix(df)
##check the quantile for each assay
for (i in 1:ncol(df))
{
print( colnames(df)[i] )
qu <- quantile( df[, i] )
print(qu)
}
## [1] "vehicle_rep1.bam"
## 0% 25% 50% 75% 100%
## 0 14 76 200 42634
## [1] "vehicle_rep2.bam"
## 0% 25% 50% 75% 100%
## 0 16 77 211 34913
## [1] "drug_rep1.bam"
## 0% 25% 50% 75% 100%
## 0 15 80 228 46189
## [1] "drug_rep2.bam"
## 0% 25% 50% 75% 100%
## 0 18 93 262 45975
‘Apply’ replaces ‘for’ loop
qu <- apply(df, 2, function(z){quantile(z)})
print(qu)
## vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam drug_rep2.bam
## 0% 0 0 0 0
## 25% 14 16 15 18
## 50% 76 77 80 93
## 75% 200 211 228 262
## 100% 42634 34913 46189 45975
if…else statement
### pick the expressed genes in first assay
exp_cutoff <- quantile(df[,1], 0.35)
unEXP_cutoff <- quantile(df[,1], 0.3)
n_remaining <- 0
exp_gene <- c() #set a empty vector for expressed genes
unexp_gene <- c() #set a empty vector for un_expressed genes
for (i in (1:length(df[,1])))
{
if (df[i, 1] > exp_cutoff )
{
exp_gene <- c(exp_gene, rownames(df)[i])
}
else if (df[i, 1] < unEXP_cutoff)
{
unexp_gene <- c(unexp_gene, rownames(df)[i])
}
else
{
n_remaining <- n_remaining + 1
}
}
print(c(length(unexp_gene),
length(exp_gene),
n_remaining))
## [1] 1714 3710 288