Download this Rmd file

Please download and modify your input/output file location

Download RNA-seq data from NCBI

Let’s use published RNA-seq data from Arabidopsis thaliana mutants (npr1-1 and npr4-4D) Ding (2018).

library(tidyverse)
library(stringr)
library(edgeR)

Trimming unnecessary sequences from data (such as adaptors) by trimmomatic

Needs to trim? Do fastqc and see results

trim?

# working in directories containing fastq files
fastfiles<-list.files(pattern="fastq")
system(paste("fastqc ",fastfiles,sep=""))
# and then check reports to see if there are adaptor contamination
# It seems those files were not contaminated so that no need to trim read files.
fastfiles.SE<- fastfiles %>% as_tibble() %>% separate(value,into=c("SRA","type","compress"),sep="\\.") %>% separate(SRA,into=c("SRA","pair"),sep="_") %>% group_by(SRA) %>%summarize(num=n()) %>% filter(num==1) %>% select(SRA) %>% as_vector()
# trim by trimmomatic (not necessary this time)
# system("makedir trimmed")
#for (x in fastfiles) {
#system(paste("trimmomatic.sh SE -threads 4 ",x," trimmed/trimmed_",x," ILLUMINACLIP:Bradseq_adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36",sep="")) # bug fixed
#}
# Or using inline loop?
#system("for i in *fastq.gz; do trimmomatic.sh SE -threads 4 $i trimmed/trimmed$i ILLUMINACLIP:Bradseq_adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36; done")

Mapping to reference cDNA and creating count data file (copied from Julin’s scripts “01_20170617_Map and normalize”)

read Kallisto manual https://pachterlab.github.io/kallisto/starting.html

Build index

#system("kallisto index -i TAIR10_cdna_20110103_representative_gene_model_kallisto_index TAIR10_cdna_20110103_representative_gene_model.gz")
# This reference seq has been updated in 2012! Redo mapping (101018)
#system("kallisto index -i TAIR10_cdna_20110103_representative_gene_model_updated_kallisto_index TAIR10_cdna_20110103_representative_gene_model_updated")

mapping

# makin directory for mapped 
system("mkdir kallisto_sam_out2")
# reading fastq files
fastqfiles<-list.files(pattern="fastq.gz")
# fastq files with single end 
fastqfiles.SE<- fastqfiles %>% as_tibble() %>% separate(value,into=c("SRA","type","compress"),sep="\\.") %>% separate(SRA,into=c("SRA","pair"),sep="_") %>% group_by(SRA) %>%summarize(num=n()) %>% filter(num==1) %>% select(SRA) %>% as_vector()
for(x in fastqfiles.SE) {
system(paste("kallisto quant -i TAIR10_cdna_20110103_representative_gene_model_updated_kallisto_index  -o kallisto_sam_out2/",x," --single -l 200 -s 40 --pseudobam ",x,"_1.fastq.gz | samtools view -Sb - > kallisto_sam_out2/",x,"/",x,".bam",sep=""))
setwd(paste("kallisto_sam_out2/",x,sep=""))
system(paste("samtools sort *.bam -o ",x,".sorted.bam",sep=""))
system("samtools index *.sorted.bam")
setwd("../../")
}

#samtools view -u alignment.sam | samtools sort -@ 4 - output_prefix # https://www.biostars.org/p/263589/ # Move the counts to my local computer # Julin used lftp (https://www.networkworld.com/article/2833203/operating-systems/unix--flexibly-moving-files-with-lftp.html)

#In my computer, 
# lftp sftp://kazu@whitney.plb.ucdavis.edu
#  cd NGS/Ding_2018_Cell
#  mirror kallisto_sam_out2

remove unused files

system("cd  2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2")
system("rm */*.json")
system("rm */*.bam*")

compress tsv files

system("gzip */abundance.tsv")

Get counts into R

kallisto_files <- dir(path = file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","kallisto_sam_out2"),pattern="abundance.tsv",recursive = TRUE,full.names = TRUE)
kallisto_files

##  [1] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739807/abundance.tsv.gz"
##  [2] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739808/abundance.tsv.gz"
##  [3] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739809/abundance.tsv.gz"
##  [4] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739810/abundance.tsv.gz"
##  [5] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739811/abundance.tsv.gz"
##  [6] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739812/abundance.tsv.gz"
##  [7] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739813/abundance.tsv.gz"
##  [8] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739814/abundance.tsv.gz"
##  [9] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739815/abundance.tsv.gz"
## [10] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739816/abundance.tsv.gz"
## [11] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739817/abundance.tsv.gz"
## [12] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739818/abundance.tsv.gz"

getwd()

## [1] "/Volumes/data_personal/Kazu_blog14/content/post/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data"

kallisto_files2<-list.files(path=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","kallisto_sam_out2")) # this is not working
kallisto_files2

##  [1] "SRR6739807" "SRR6739808" "SRR6739809" "SRR6739810" "SRR6739811"
##  [6] "SRR6739812" "SRR6739813" "SRR6739814" "SRR6739815" "SRR6739816"
## [11] "SRR6739817" "SRR6739818"

kallisto_names <- str_split(kallisto_files,"/",simplify=TRUE)[,3]
kallisto_names <- kallisto_files2
kallisto_names

##  [1] "SRR6739807" "SRR6739808" "SRR6739809" "SRR6739810" "SRR6739811"
##  [6] "SRR6739812" "SRR6739813" "SRR6739814" "SRR6739815" "SRR6739816"
## [11] "SRR6739817" "SRR6739818"

combine count data in list

counts.list <- lapply(kallisto_files,read_tsv)
names(counts.list) <- kallisto_names
counts.list[[1]]

## # A tibble: 33,602 x 5
##    target_id   length eff_length est_counts   tpm
##    <chr>        <dbl>      <dbl>      <dbl> <dbl>
##  1 AT1G50920.1   2394       2195     3598   88.8 
##  2 AT1G36960.1    546        347        0    0   
##  3 AT1G44020.1   2314       2115       50.3  1.29
##  4 AT1G15970.1   1658       1459      427   15.9 
##  5 AT1G73440.1   1453       1254      133.   5.74
##  6 AT1G75120.1   1520       1321       52    2.13
##  7 AT1G17600.1   3332       3133      685   11.9 
##  8 AT1G50860.1   3846       3647        0    0   
##  9 AT1G51380.1   1452       1253       65    2.81
## 10 AT1G77370.1    620        421      728   93.7 
## # … with 33,592 more rows

combine counts data

counts <- sapply(counts.list, dplyr::select, est_counts) %>% 
  bind_cols(counts.list[[1]][,"target_id"],.) 
counts[is.na(counts)] <- 0
colnames(counts) <- sub(".est_counts","",colnames(counts),fixed = TRUE)
counts

## # A tibble: 33,602 x 13
##    target_id SRR6739807 SRR6739808 SRR6739809 SRR6739810 SRR6739811 SRR6739812
##    <chr>          <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 AT1G5092…     3598       3442      3653          3531       4027       3775
##  2 AT1G3696…        0          0         0             0          0          0
##  3 AT1G4402…       50.3       64.7       5.57         13          7         10
##  4 AT1G1597…      427        412       316           348        319        357
##  5 AT1G7344…      133.       129.      154           141        155        131
##  6 AT1G7512…       52         46        74           110        122         91
##  7 AT1G1760…      685        764       231           220        264        247
##  8 AT1G5086…        0          0         0             0          0          0
##  9 AT1G5138…       65         81        72            78         73         55
## 10 AT1G7737…      728        760       495           559        499        561
## # … with 33,592 more rows, and 6 more variables: SRR6739813 <dbl>,
## #   SRR6739814 <dbl>, SRR6739815 <dbl>, SRR6739816 <dbl>, SRR6739817 <dbl>,
## #   SRR6739818 <dbl>

counts.updated<-counts

# write counts data
write_csv(counts.updated,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","counts.updated.csv.gz"))

sample info

#sample.info<-read_csv("2018-10-01-differential-expression-analysis-with-public-available-sequencing-data/SraRunInfo.csv")
sample.info<-read_csv("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&rettype=runinfo&db=sra&term=PRJNA434313") # Directly from ncbi site. This works!

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   ReleaseDate = col_datetime(format = ""),
##   LoadDate = col_datetime(format = ""),
##   spots = col_double(),
##   bases = col_double(),
##   spots_with_mates = col_double(),
##   avgLength = col_double(),
##   size_MB = col_double(),
##   AssemblyName = col_logical(),
##   LibraryName = col_logical(),
##   InsertSize = col_double(),
##   InsertDev = col_double(),
##   Study_Pubmed_id = col_double(),
##   ProjectID = col_double(),
##   TaxID = col_double(),
##   g1k_pop_code = col_logical(),
##   source = col_logical(),
##   g1k_analysis_group = col_logical(),
##   Subject_ID = col_logical(),
##   Sex = col_logical(),
##   Disease = col_logical()
##   # ... with 5 more columns
## )
## ℹ Use `spec()` for the full column specifications.

sample.info2<-tibble(SampleName=c("GSM3014771","GSM3014772","GSM3014773","GSM3014774","GSM3014775","GSM3014776","GSM3014777","GSM3014778","GSM3014779","GSM3014780","GSM3014781","GSM3014782","GSM3014783","GSM3014784","GSM3014785","GSM3014786"),sample=c("WT_treated_1","WT_treated_2","WT_untreated_1","WT_untreated_2","npr1-1_treated_1","npr1-1_treated_2","npr1-1_untreated_1","npr1-1_untreated_2","npr4-4D_treated_1","npr4-4D_treated_2","npr4-4D_untreated_1","npr4-4D_untreated_2","npr1-1_npr4-4D_treated_1","npr1-1_npr4-4D_treated_2","npr1-1_npr4-4D_untreated_1","npr1-1_npr4-4D_untreated_2")) # # from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110702
# combine 
sample.info.final <- left_join(sample.info,sample.info2,by="SampleName") %>% dplyr::select(Run,sample,LibraryLayout) %>%  separate(sample,into=c("genotype1","genotype2","treatment","rep"),sep="_",fill="left",remove=FALSE) %>% unite(genotype,genotype1,genotype2,sep="_") %>% mutate(genotype=str_replace(genotype,"NA_","")) %>% unite(group,genotype,treatment,sep="_",remove=FALSE)

check libraries by barplot and MDS plots

counts <- read_csv(file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","counts.updated.csv.gz"))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   target_id = col_character(),
##   SRR6739807 = col_double(),
##   SRR6739808 = col_double(),
##   SRR6739809 = col_double(),
##   SRR6739810 = col_double(),
##   SRR6739811 = col_double(),
##   SRR6739812 = col_double(),
##   SRR6739813 = col_double(),
##   SRR6739814 = col_double(),
##   SRR6739815 = col_double(),
##   SRR6739816 = col_double(),
##   SRR6739817 = col_double(),
##   SRR6739818 = col_double()
## )

all(colnames(counts)[-1]==sample.info.final[sample.info.final$LibraryLayout=="SINGLE","Run"])

## [1] TRUE

dge <- DGEList(counts[,-1],
               group=as_vector(sample.info.final[sample.info.final$LibraryLayout=="SINGLE","group"]),
               samples=sample.info.final[sample.info.final$LibraryLayout=="SINGLE",],
               genes=counts$target_id)
# Filtering.
## From edgeR User guide (section 2.6. Filtering) "As a rule of thumb, genes are dropped if they can’t possibly be expressed in all the samples for any of the conditions. Users can set their own definition of genes being expressed. Usually a gene is required to have a count of 5-10 in a library to be considered expressed in that library. Users should also filter with count-per-million (CPM) rather than filtering on the counts directly, as the latter does not account for differences in library sizes between samples"
dge.nolow<-dge[rowSums(cpm(dge)> 5) >= 2,] # two replicates in this experiment. 
table(rowSums(cpm(dge)> 5) >= 2)

## 
## FALSE  TRUE 
## 16803 16799

# normalization
dge.nolow <- calcNormFactors(dge.nolow)

# lib.size check
ggplot(dge.nolow$samples,aes(x=sample,y=norm.factors,fill=genotype)) + geom_col() + 
  theme(axis.text.x  = element_text(angle=90, vjust=0.5,size = 7))

MDS plot

mds <- plotMDS(dge.nolow,method = "bcv",labels=dge.nolow$samples$group,gene.selection = "pairwise", ndim=3)

# fix this 
mds.pl <- as_tibble(mds$cmdscale.out) %>% 
  bind_cols(data.frame(sample=row.names(mds$cmdscale.out)),.) %>%
  left_join(dge.nolow$sample,by=c(sample="Run"))

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

mds.pl %>% ggplot(aes(x=V1,y=V2,color=as.factor(treatment),label=genotype)) + geom_text(angle=30) + ggtitle("DIM 1 vs 2")

# edge.nolowR (model with glm)

# set references in genotype and treatment
dge.nolow$samples$genotype<-factor(dge.nolow$samples$genotype,levels=c("WT","npr4-4D","npr1-1"))
dge.nolow$samples$treatment<-factor(dge.nolow$samples$treatment,levels=c("untreated","treated"))
save(dge.nolow,file=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","dge.nolow.Ding2018.Rdata"))
# making model matrix
design.int <- with(dge.nolow$samples, model.matrix(~ genotype*treatment + rep))
# estimateDisp
dge.nolow.int <- estimateDisp(dge.nolow,design = design.int)
## fit linear model
fit.int <- glmFit(dge.nolow.int,design.int)
# get DEGs, treatment effects in Col (rWT = reference WT, rU = reference is untreated)
trt.lrt.rWT.rU <- glmLRT(fit.int,coef = c("treatmenttreated")) 
trt.DEGs.int.rWT.rU <- topTags(trt.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(trt.DEGs.int.rWT.rU) #

## [1] 7862

# get DEGs, genotype effects in npr1-1 (no matter treated or untreated) (coef=c("genotypenpr1-1","genotypenpr1-1:treatmenttreated"))
genotypenpr1_genotypenpr1.treatmenttreated.lrt.rWT.rU <- glmLRT(fit.int,coef = c("genotypenpr1-1","genotypenpr1-1:treatmenttreated"))
genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU <- topTags(genotypenpr1_genotypenpr1.treatmenttreated.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU) #

## [1] 6460

# get DEGs, genotype effects in npr4-4D (no matter treated or untreated) (coef=c("genotypenpr4-4D","genotypenpr4-4D:treatmenttreated"))
genotypenpr4_genotypenpr4.treatmenttreated.lrt.rWT.rU <- glmLRT(fit.int,coef = c("genotypenpr4-4D","genotypenpr4-4D:treatmenttreated"))
genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU <- topTags(genotypenpr4_genotypenpr4.treatmenttreated.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU) # 846

## [1] 854

Download updated gene description

Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”. “gene_aliases_20201231.txt.gz”.

#At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure. (as of Mar 23, 2021) SSL certificate problem: certificate has expired. Download file and use downloaded file (March 23, 2021)
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20201231.txt.gz") # SSL certificate problem: certificate has expired. Download file and use downloaded file (March 23, 2021)

Using downloaded file (gene_aliases_20201231.txt.gz)

getwd()

## [1] "/Volumes/data_personal/Kazu_blog14/content/post/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data"

system("gunzip -c ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt.gz > ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt")
At.gene.name <- read_tsv(file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","gene_aliases_20201231.txt"))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   name = col_character(),
##   symbol = col_character(),
##   full_name = col_character()
## )

At.gene.name

## # A tibble: 27,644 x 3
##    name      symbol    full_name                
##    <chr>     <chr>     <chr>                    
##  1 18S RRNA  18S RRNA  <NA>                     
##  2 25S RRNA  25S RRNA  <NA>                     
##  3 5.8S RRNA 5.8S RRNA <NA>                     
##  4 AAN       AAN       ANOTHER ANDANTE          
##  5 AAR1      AAR1      ALLYL ALCOHOL RESISTANT 1
##  6 AAR2      AAR2      ALLYL ALCOHOL RESISTANT 2
##  7 ACL1      ACL1      ACAULIS 1                
##  8 ACL2      ACL2      ACAULIS 2                
##  9 ACL3      ACL3      ACAULIS 3                
## 10 ACL4      ACL4      ACAULIS 4                
## # … with 27,634 more rows

# remove uncompressed file
system("rm ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt")

cleanup At.gene.name

# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))

## `summarise()` ungrouping output (override with `.groups` argument)

At.gene.name %>% dplyr::slice(100:110) # OK

## # A tibble: 11 x 3
##    name     symbol2       full_name                                             
##    <chr>    <chr>         <chr>                                                 
##  1 AT1G017… ATKEA1;KEA1   K+ EFFLUX ANTIPORTER 1;K+ efflux antiporter 1         
##  2 AT1G018… PEX11C        peroxin 11c                                           
##  3 AT1G018… PFC1          PALEFACE 1                                            
##  4 AT1G018… AtGEN1;GEN1   NA;ortholog of HsGEN1                                 
##  5 AT1G019… ATSBT1.1;SBT… NA;NA                                                 
##  6 AT1G019… AtGET3a;GET3a Guided Entry of Tail-anchored protein 3a;Guided Entry…
##  7 AT1G019… ARK2;AtKINUb  armadillo repeat kinesin 2;Arabidopsis thaliana KINES…
##  8 AT1G019… BIG3;EDA10    BIG3;embryo sac development arrest 10                 
##  9 AT1G019… AtBBE1;OGOX4  NA;oligogalacturonide oxidase 4                       
## 10 AT1G020… GAE2          UDP-D-glucuronate 4-epimerase 2                       
## 11 AT1G020… SEC1A         secretory 1A

Make “output” directory for saving DEG results as csv files

# system("mkdir 2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output")

Add gene descripion to DEG files

# WT (Col) treatment DEG
trt.DEGs.int.rWT.rU.desc<- trt.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(trt.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","trt.DEGs.int.rWT.rU.csv"))

# npr1-1 (either treated or untreated)
genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.desc <- genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv"))
# npr4-4 (either treated or untreated)
genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.desc <- genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.csv"))

table for DEGs summary

# getting csv file names and path
DEG.objs2 <- list.files(path=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output"),pattern="\\.csv$",recursive =TRUE,full.names = TRUE)
DEG.objs2

## [1] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv"
## [2] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.csv"
## [3] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/trt.DEGs.int.rWT.rU.csv"

# reading count data
DEG.count.list2 <- DEG.objs2 %>% map(~read_csv(.))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr1.1 = col_double(),
##   logFC.genotypenpr1.1.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr4.4D = col_double(),
##   logFC.genotypenpr4.4D.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )

DEG.count.list2

## [[1]]
## # A tibble: 6,460 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT2G…           -4.19            -0.733   6.69 1327. 6.75e-289 1.13e-284
##  2 AT5G…            3.26             0.517   4.76 1072. 1.33e-233 1.11e-229
##  3 AT1G…           -2.01            -0.488   7.19  659. 6.38e-144 3.57e-140
##  4 AT1G…           -2.25            -0.525   7.51  624. 2.54e-136 1.07e-132
##  5 AT3G…           -3.15             0.874   5.33  622. 1.00e-135 3.36e-132
##  6 AT5G…           -0.206           -6.73    3.90  595. 7.88e-130 2.21e-126
##  7 AT1G…           -3.64            -1.53    3.37  581. 7.28e-127 1.75e-123
##  8 AT5G…            1.83             1.82    9.40  571. 1.04e-124 2.18e-121
##  9 AT2G…            2.23             1.94    9.22  559. 4.26e-122 7.94e-119
## 10 AT5G…           -2.01            -3.49    4.19  538. 1.19e-117 2.00e-114
## # … with 6,450 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## [[2]]
## # A tibble: 854 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT5G…             3.04            0.632   4.76  966. 1.54e-210 2.59e-206
##  2 AT5G…            -3.31            0.358   3.10  463. 3.40e-101 2.85e- 97
##  3 AT3G…            -5.46           -0.178   1.98  454. 2.07e- 99 1.16e- 95
##  4 AT1G…            -7.41           -3.70    2.00  424. 6.68e- 93 2.80e- 89
##  5 AT3G…            -2.97           -0.297   3.36  393. 4.39e- 86 1.47e- 82
##  6 AT2G…            -1.42           -0.699   6.69  284. 2.63e- 62 7.35e- 59
##  7 AT1G…            -2.50            0.894   6.30  280. 1.23e- 61 2.96e- 58
##  8 AT2G…            -1.10           -0.233   5.87  221. 9.62e- 49 2.02e- 45
##  9 AT5G…            -3.04            0.438   4.19  208. 6.15e- 46 1.15e- 42
## 10 AT5G…            -1.84           -0.485   2.78  190. 4.64e- 42 7.79e- 39
## # … with 844 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## [[3]]
## # A tibble: 7,862 x 9
##    genes  logFC logCPM    LR    PValue       FDR AGI    symbol2  full_name      
##    <chr>  <dbl>  <dbl> <dbl>     <dbl>     <dbl> <chr>  <chr>    <chr>          
##  1 AT5G1…  8.58   3.90  662. 5.49e-146 9.22e-142 AT5G1… AT-HSP1… heat shock pro…
##  2 AT1G0…  4.01   4.60  613. 2.14e-135 1.80e-131 AT1G0… <NA>     <NA>           
##  3 AT4G2…  3.86   5.39  586. 2.36e-129 1.32e-125 AT4G2… CRK14    cysteine-rich …
##  4 AT5G2…  6.17   4.19  582. 1.27e-128 5.32e-125 AT5G2… ATWRKY3… ARABIDOPSIS TH…
##  5 AT3G2…  4.93   3.36  578. 9.46e-128 3.18e-124 AT3G2… <NA>     <NA>           
##  6 AT5G1…  9.65   3.77  569. 1.18e-125 3.30e-122 AT5G1… HSP17.6… 17.6 kDa class…
##  7 AT4G2…  3.34   6.70  563. 1.80e-124 4.32e-121 AT4G2… SGT1A    <NA>           
##  8 AT1G2…  2.88   7.19  539. 2.54e-119 5.34e-116 AT1G2… AtWAK1;… NA;NA;cell wal…
##  9 AT5G2…  3.83   6.70  529. 4.88e-117 9.12e-114 AT5G2… AtDMR6;… NA;DOWNY MILDE…
## 10 AT2G1…  2.96   7.30  512. 2.22e-113 3.73e-110 AT2G1… ATSERK4… SOMATIC EMBRYO…
## # … with 7,852 more rows

# choose file names
names(DEG.count.list2) <- str_split(DEG.objs2,pattern="/",simplify=TRUE)[,7] %>% str_remove(".csv")
# count DEGs
DEGs.count <- sapply(DEG.count.list2,function(x) {
                value <- nrow(x)
                ifelse(is.null(value),0,value)
        }  )
DEGs.count

## genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU 
##                                                       6460 
## genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU 
##                                                        854 
##                                        trt.DEGs.int.rWT.rU 
##                                                       7862

Plot gene expression pattern

For some reasons, labeller = label_wrap_gen(width=30) does not work here. Needs to make a function for plotting graph later to reduce redundancy of scripts.

# calculate normalized gene expression level
cpm.wide <- bind_cols(dge.nolow$gene,as_tibble(cpm(dge.nolow))) %>% as_tibble() %>% rename(transcript_ID=genes)
cpm.wide

## # A tibble: 16,799 x 13
##    transcript_ID SRR6739807 SRR6739808 SRR6739809 SRR6739810 SRR6739811
##    <chr>              <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1 AT1G50920.1       169.       164.       175.       158.       184.  
##  2 AT1G15970.1        20.0       19.7       15.2       15.6       14.6 
##  3 AT1G73440.1         6.22       6.17       7.39       6.33       7.10
##  4 AT1G75120.1         2.44       2.20       3.55       4.94       5.59
##  5 AT1G17600.1        32.1       36.5       11.1        9.87      12.1 
##  6 AT1G77370.1        34.1       36.3       23.7       25.1       22.9 
##  7 AT1G10950.1       172.       171.       133.       148.       173.  
##  8 AT1G31870.1        40.8       38.5       26.8       26.9       33.3 
##  9 AT1G51360.1         7.40       7.64       9.07      10.4        6.82
## 10 AT1G50950.1         4.45       4.49       3.74       5.16       5.50
## # … with 16,789 more rows, and 7 more variables: SRR6739812 <dbl>,
## #   SRR6739813 <dbl>, SRR6739814 <dbl>, SRR6739815 <dbl>, SRR6739816 <dbl>,
## #   SRR6739817 <dbl>, SRR6739818 <dbl>

# plot
# format data for plot
plot.data <- DEG.count.list2[["trt.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes) 
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p1<-plot.data %>% filter(logFC>0 & FDR<1e-100) %>% dplyr::select(-logFC,-logCPM,-FDR,-full_name) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="WT SA up-regulated genes")
p1

# SA down-regulated genes
plot.data2 <- DEG.count.list2[["trt.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes) 
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p2<-plot.data2 %>% filter(logFC<0 & FDR<1e-50) %>% dplyr::select(-logFC,-logCPM,-FDR,-full_name) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="WT SA down-regulated genes")
p2

# npr1-1

# npr1-1 misregulated genes 
plot.data3 <- DEG.count.list2[["genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes) 
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p3 <- plot.data3 %>% filter(logFC.genotypenpr1.1>0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1>0")
p3

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p4 <- plot.data3 %>% filter(logFC.genotypenpr1.1<0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1<0")
p4

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p5 <- plot.data3 %>% filter(logFC.genotypenpr1.1.treatmenttreated>0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1.treatmenttreated>0")
p5

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p6 <- plot.data3 %>% filter(logFC.genotypenpr1.1.treatmenttreated<0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>%  gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1.treatmenttreated<0")
p6

Session info

sessionInfo()

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] edgeR_3.28.1    limma_3.42.2    forcats_0.5.0   stringr_1.4.0  
##  [5] dplyr_1.0.2     purrr_0.3.4     readr_1.4.0     tidyr_1.1.2    
##  [9] tibble_3.0.4    ggplot2_3.3.3   tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] locfit_1.5-9.4    tidyselect_1.1.0  xfun_0.22         splines_3.6.2    
##  [5] lattice_0.20-41   haven_2.3.1       colorspace_2.0-0  vctrs_0.3.6      
##  [9] generics_0.1.0    htmltools_0.5.1.1 yaml_2.2.1        utf8_1.1.4       
## [13] rlang_0.4.10      pillar_1.4.7      glue_1.4.2        withr_2.3.0      
## [17] DBI_1.1.0         dbplyr_2.0.0      modelr_0.1.8      readxl_1.3.1     
## [21] lifecycle_0.2.0   munsell_0.5.0     blogdown_1.2.2    gtable_0.3.0     
## [25] cellranger_1.1.0  rvest_0.3.6       evaluate_0.14     labeling_0.4.2   
## [29] knitr_1.31        curl_4.3          fansi_0.4.1       highr_0.8        
## [33] broom_0.7.3       Rcpp_1.0.6        scales_1.1.1      backports_1.2.1  
## [37] jsonlite_1.7.2    farver_2.0.3      fs_1.5.0          hms_0.5.3        
## [41] digest_0.6.27     stringi_1.5.3     bookdown_0.21     grid_3.6.2       
## [45] cli_2.2.0         tools_3.6.2       magrittr_2.0.1    crayon_1.3.4     
## [49] pkgconfig_2.0.3   ellipsis_0.3.1    xml2_1.3.2        reprex_0.3.0     
## [53] lubridate_1.7.9.2 assertthat_0.2.1  rmarkdown_2.7     httr_1.4.2       
## [57] rstudioapi_0.13   R6_2.5.0          compiler_3.6.2

Differential expression analysis with public available sequencing data

Kazu Nozue