Load the RNA-Seq data

##set your working directory and load the data
setwd('~/Documents/work/paper_write/genome_training/R_training/')
df <- read.table('vehicle_drug_feature_counts.txt',
                 header = T, sep = '\t', row.names = 1)
summary(df)
##           Chr           Start              End          Strand  
##  Pf3D7_14_v3: 802   Min.   :      1   Min.   :     33   -:2816  
##  Pf3D7_13_v3: 730   1st Qu.: 406746   1st Qu.: 410100   +:2896  
##  Pf3D7_12_v3: 549   Median : 831132   Median : 832578           
##  Pf3D7_11_v3: 503   Mean   : 971100   Mean   : 973547           
##  Pf3D7_10_v3: 408   3rd Qu.:1336580   3rd Qu.:1338937           
##  Pf3D7_09_v3: 378   Max.   :3290953   Max.   :3291501           
##  (Other)    :2342                                               
##      Length      vehicle_rep1.bam  vehicle_rep2.bam drug_rep1.bam    
##  Min.   :   20   Min.   :    0.0   Min.   :    0    Min.   :    0.0  
##  1st Qu.:  963   1st Qu.:   14.0   1st Qu.:   16    1st Qu.:   15.0  
##  Median : 1606   Median :   76.0   Median :   77    Median :   80.0  
##  Mean   : 2447   Mean   :  268.3   Mean   :  274    Mean   :  250.5  
##  3rd Qu.: 2967   3rd Qu.:  200.0   3rd Qu.:  211    3rd Qu.:  228.0  
##  Max.   :30864   Max.   :42634.0   Max.   :34913    Max.   :46189.0  
##                                                                      
##  drug_rep2.bam    
##  Min.   :    0.0  
##  1st Qu.:   18.0  
##  Median :   93.0  
##  Mean   :  278.4  
##  3rd Qu.:  262.0  
##  Max.   :45975.0  
## 
df <- df[,6:9]



Save the data

write.table(df, 
            file = 'vehicle_drug_feature_counts.copy.txt.txt', #either a character string naming a file or a connection open for writing. "" indicates output to the console.
            row.names = T, #indicating whether the row names of df are to be written
            col.names = T, #indicating whether the column names of df are to be written
            sep = '\t',    #the field separator string
            quote = F      #a logical value (TRUE or FALSE) or a numeric vector. If TRUE, any character or factor columns will be surrounded by double quotes.
            )

###load again
df2 <- read.table('vehicle_drug_feature_counts.copy.txt.txt',header = T, sep = '\t', row.names = 1)
head(df2)
##               vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam
## PF3D7_0621350                0                7             3
## PF3D7_1314600               68               54            13
## PF3D7_0209400              305              286           208
## PF3D7_1142200                8               20            11
## PF3D7_0507200               43               45            51
## PF3D7_0806000               81               62            57
##               drug_rep2.bam
## PF3D7_0621350             4
## PF3D7_1314600            25
## PF3D7_0209400           207
## PF3D7_1142200             3
## PF3D7_0507200            43
## PF3D7_0806000            66



‘For’ loop

df <- as.matrix(df)
##check the quantile for each assay
for (i in 1:ncol(df))
{
   print(  colnames(df)[i] )
   qu <- quantile( df[, i] )
   print(qu)
}
## [1] "vehicle_rep1.bam"
##    0%   25%   50%   75%  100% 
##     0    14    76   200 42634 
## [1] "vehicle_rep2.bam"
##    0%   25%   50%   75%  100% 
##     0    16    77   211 34913 
## [1] "drug_rep1.bam"
##    0%   25%   50%   75%  100% 
##     0    15    80   228 46189 
## [1] "drug_rep2.bam"
##    0%   25%   50%   75%  100% 
##     0    18    93   262 45975


‘Apply’ replaces ‘for’ loop

qu <- apply(df, 2, function(z){quantile(z)})
print(qu)
##      vehicle_rep1.bam vehicle_rep2.bam drug_rep1.bam drug_rep2.bam
## 0%                  0                0             0             0
## 25%                14               16            15            18
## 50%                76               77            80            93
## 75%               200              211           228           262
## 100%            42634            34913         46189         45975


if…else statement

### pick the expressed genes in first assay
exp_cutoff   <- quantile(df[,1], 0.35)
unEXP_cutoff <- quantile(df[,1], 0.3)
n_remaining  <- 0
exp_gene   <- c() #set a empty vector for expressed genes
unexp_gene <- c() #set a empty vector for un_expressed genes
for (i in (1:length(df[,1])))
{
    if (df[i, 1] > exp_cutoff  )
    {
        exp_gene <- c(exp_gene, rownames(df)[i])  
    }
    else if (df[i, 1] < unEXP_cutoff) 
    {
        unexp_gene <- c(unexp_gene, rownames(df)[i])
    }
    else
    {
        n_remaining <- n_remaining + 1    
    }
}

print(c(length(unexp_gene),
      length(exp_gene),
      n_remaining))
## [1] 1714 3710  288

use ‘which’ to replace former script

exp_gene2 <- rownames(df)[which(df[,1] > exp_cutoff)]
length(exp_gene2)
## [1] 3710
length(exp_gene)
## [1] 3710