Download this Rmd file
Please download and modify your input/output file location
Download RNA-seq data from NCBI
Let’s use published RNA-seq data from Arabidopsis thaliana mutants (npr1-1 and npr4-4D) Ding (2018).
library(tidyverse)
library(stringr)
library(edgeR)
Trimming unnecessary sequences from data (such as adaptors) by trimmomatic
Needs to trim? Do fastqc and see results
trim?
# working in directories containing fastq files
fastfiles<-list.files(pattern="fastq")
system(paste("fastqc ",fastfiles,sep=""))
# and then check reports to see if there are adaptor contamination
# It seems those files were not contaminated so that no need to trim read files.
fastfiles.SE<- fastfiles %>% as_tibble() %>% separate(value,into=c("SRA","type","compress"),sep="\\.") %>% separate(SRA,into=c("SRA","pair"),sep="_") %>% group_by(SRA) %>%summarize(num=n()) %>% filter(num==1) %>% select(SRA) %>% as_vector()
# trim by trimmomatic (not necessary this time)
# system("makedir trimmed")
#for (x in fastfiles) {
#system(paste("trimmomatic.sh SE -threads 4 ",x," trimmed/trimmed_",x," ILLUMINACLIP:Bradseq_adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36",sep="")) # bug fixed
#}
# Or using inline loop?
#system("for i in *fastq.gz; do trimmomatic.sh SE -threads 4 $i trimmed/trimmed$i ILLUMINACLIP:Bradseq_adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36; done")
Mapping to reference cDNA and creating count data file (copied from Julin’s scripts “01_20170617_Map and normalize”)
read Kallisto manual https://pachterlab.github.io/kallisto/starting.html
Build index
#system("kallisto index -i TAIR10_cdna_20110103_representative_gene_model_kallisto_index TAIR10_cdna_20110103_representative_gene_model.gz")
# This reference seq has been updated in 2012! Redo mapping (101018)
#system("kallisto index -i TAIR10_cdna_20110103_representative_gene_model_updated_kallisto_index TAIR10_cdna_20110103_representative_gene_model_updated")
mapping
# makin directory for mapped
system("mkdir kallisto_sam_out2")
# reading fastq files
fastqfiles<-list.files(pattern="fastq.gz")
# fastq files with single end
fastqfiles.SE<- fastqfiles %>% as_tibble() %>% separate(value,into=c("SRA","type","compress"),sep="\\.") %>% separate(SRA,into=c("SRA","pair"),sep="_") %>% group_by(SRA) %>%summarize(num=n()) %>% filter(num==1) %>% select(SRA) %>% as_vector()
for(x in fastqfiles.SE) {
system(paste("kallisto quant -i TAIR10_cdna_20110103_representative_gene_model_updated_kallisto_index -o kallisto_sam_out2/",x," --single -l 200 -s 40 --pseudobam ",x,"_1.fastq.gz | samtools view -Sb - > kallisto_sam_out2/",x,"/",x,".bam",sep=""))
setwd(paste("kallisto_sam_out2/",x,sep=""))
system(paste("samtools sort *.bam -o ",x,".sorted.bam",sep=""))
system("samtools index *.sorted.bam")
setwd("../../")
}
#samtools view -u alignment.sam | samtools sort -@ 4 - output_prefix # https://www.biostars.org/p/263589/ # Move the counts to my local computer # Julin used lftp (https://www.networkworld.com/article/2833203/operating-systems/unix--flexibly-moving-files-with-lftp.html)
#In my computer,
# lftp sftp://kazu@whitney.plb.ucdavis.edu
# cd NGS/Ding_2018_Cell
# mirror kallisto_sam_out2
remove unused files
system("cd 2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2")
system("rm */*.json")
system("rm */*.bam*")
compress tsv files
system("gzip */abundance.tsv")
Get counts into R
kallisto_files <- dir(path = file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","kallisto_sam_out2"),pattern="abundance.tsv",recursive = TRUE,full.names = TRUE)
kallisto_files
## [1] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739807/abundance.tsv.gz"
## [2] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739808/abundance.tsv.gz"
## [3] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739809/abundance.tsv.gz"
## [4] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739810/abundance.tsv.gz"
## [5] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739811/abundance.tsv.gz"
## [6] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739812/abundance.tsv.gz"
## [7] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739813/abundance.tsv.gz"
## [8] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739814/abundance.tsv.gz"
## [9] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739815/abundance.tsv.gz"
## [10] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739816/abundance.tsv.gz"
## [11] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739817/abundance.tsv.gz"
## [12] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/kallisto_sam_out2/SRR6739818/abundance.tsv.gz"
getwd()
## [1] "/Volumes/data_personal/Kazu_blog14/content/post/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data"
kallisto_files2<-list.files(path=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","kallisto_sam_out2")) # this is not working
kallisto_files2
## [1] "SRR6739807" "SRR6739808" "SRR6739809" "SRR6739810" "SRR6739811"
## [6] "SRR6739812" "SRR6739813" "SRR6739814" "SRR6739815" "SRR6739816"
## [11] "SRR6739817" "SRR6739818"
kallisto_names <- str_split(kallisto_files,"/",simplify=TRUE)[,3]
kallisto_names <- kallisto_files2
kallisto_names
## [1] "SRR6739807" "SRR6739808" "SRR6739809" "SRR6739810" "SRR6739811"
## [6] "SRR6739812" "SRR6739813" "SRR6739814" "SRR6739815" "SRR6739816"
## [11] "SRR6739817" "SRR6739818"
combine count data in list
counts.list <- lapply(kallisto_files,read_tsv)
names(counts.list) <- kallisto_names
counts.list[[1]]
## # A tibble: 33,602 x 5
## target_id length eff_length est_counts tpm
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AT1G50920.1 2394 2195 3598 88.8
## 2 AT1G36960.1 546 347 0 0
## 3 AT1G44020.1 2314 2115 50.3 1.29
## 4 AT1G15970.1 1658 1459 427 15.9
## 5 AT1G73440.1 1453 1254 133. 5.74
## 6 AT1G75120.1 1520 1321 52 2.13
## 7 AT1G17600.1 3332 3133 685 11.9
## 8 AT1G50860.1 3846 3647 0 0
## 9 AT1G51380.1 1452 1253 65 2.81
## 10 AT1G77370.1 620 421 728 93.7
## # … with 33,592 more rows
combine counts data
counts <- sapply(counts.list, dplyr::select, est_counts) %>%
bind_cols(counts.list[[1]][,"target_id"],.)
counts[is.na(counts)] <- 0
colnames(counts) <- sub(".est_counts","",colnames(counts),fixed = TRUE)
counts
## # A tibble: 33,602 x 13
## target_id SRR6739807 SRR6739808 SRR6739809 SRR6739810 SRR6739811 SRR6739812
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AT1G5092… 3598 3442 3653 3531 4027 3775
## 2 AT1G3696… 0 0 0 0 0 0
## 3 AT1G4402… 50.3 64.7 5.57 13 7 10
## 4 AT1G1597… 427 412 316 348 319 357
## 5 AT1G7344… 133. 129. 154 141 155 131
## 6 AT1G7512… 52 46 74 110 122 91
## 7 AT1G1760… 685 764 231 220 264 247
## 8 AT1G5086… 0 0 0 0 0 0
## 9 AT1G5138… 65 81 72 78 73 55
## 10 AT1G7737… 728 760 495 559 499 561
## # … with 33,592 more rows, and 6 more variables: SRR6739813 <dbl>,
## # SRR6739814 <dbl>, SRR6739815 <dbl>, SRR6739816 <dbl>, SRR6739817 <dbl>,
## # SRR6739818 <dbl>
counts.updated<-counts
# write counts data
write_csv(counts.updated,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","counts.updated.csv.gz"))
sample info
#sample.info<-read_csv("2018-10-01-differential-expression-analysis-with-public-available-sequencing-data/SraRunInfo.csv")
sample.info<-read_csv("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&rettype=runinfo&db=sra&term=PRJNA434313") # Directly from ncbi site. This works!
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## ReleaseDate = col_datetime(format = ""),
## LoadDate = col_datetime(format = ""),
## spots = col_double(),
## bases = col_double(),
## spots_with_mates = col_double(),
## avgLength = col_double(),
## size_MB = col_double(),
## AssemblyName = col_logical(),
## LibraryName = col_logical(),
## InsertSize = col_double(),
## InsertDev = col_double(),
## Study_Pubmed_id = col_double(),
## ProjectID = col_double(),
## TaxID = col_double(),
## g1k_pop_code = col_logical(),
## source = col_logical(),
## g1k_analysis_group = col_logical(),
## Subject_ID = col_logical(),
## Sex = col_logical(),
## Disease = col_logical()
## # ... with 5 more columns
## )
## ℹ Use `spec()` for the full column specifications.
sample.info2<-tibble(SampleName=c("GSM3014771","GSM3014772","GSM3014773","GSM3014774","GSM3014775","GSM3014776","GSM3014777","GSM3014778","GSM3014779","GSM3014780","GSM3014781","GSM3014782","GSM3014783","GSM3014784","GSM3014785","GSM3014786"),sample=c("WT_treated_1","WT_treated_2","WT_untreated_1","WT_untreated_2","npr1-1_treated_1","npr1-1_treated_2","npr1-1_untreated_1","npr1-1_untreated_2","npr4-4D_treated_1","npr4-4D_treated_2","npr4-4D_untreated_1","npr4-4D_untreated_2","npr1-1_npr4-4D_treated_1","npr1-1_npr4-4D_treated_2","npr1-1_npr4-4D_untreated_1","npr1-1_npr4-4D_untreated_2")) # # from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110702
# combine
sample.info.final <- left_join(sample.info,sample.info2,by="SampleName") %>% dplyr::select(Run,sample,LibraryLayout) %>% separate(sample,into=c("genotype1","genotype2","treatment","rep"),sep="_",fill="left",remove=FALSE) %>% unite(genotype,genotype1,genotype2,sep="_") %>% mutate(genotype=str_replace(genotype,"NA_","")) %>% unite(group,genotype,treatment,sep="_",remove=FALSE)
check libraries by barplot and MDS plots
counts <- read_csv(file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","counts.updated.csv.gz"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## target_id = col_character(),
## SRR6739807 = col_double(),
## SRR6739808 = col_double(),
## SRR6739809 = col_double(),
## SRR6739810 = col_double(),
## SRR6739811 = col_double(),
## SRR6739812 = col_double(),
## SRR6739813 = col_double(),
## SRR6739814 = col_double(),
## SRR6739815 = col_double(),
## SRR6739816 = col_double(),
## SRR6739817 = col_double(),
## SRR6739818 = col_double()
## )
all(colnames(counts)[-1]==sample.info.final[sample.info.final$LibraryLayout=="SINGLE","Run"])
## [1] TRUE
dge <- DGEList(counts[,-1],
group=as_vector(sample.info.final[sample.info.final$LibraryLayout=="SINGLE","group"]),
samples=sample.info.final[sample.info.final$LibraryLayout=="SINGLE",],
genes=counts$target_id)
# Filtering.
## From edgeR User guide (section 2.6. Filtering) "As a rule of thumb, genes are dropped if they can’t possibly be expressed in all the samples for any of the conditions. Users can set their own definition of genes being expressed. Usually a gene is required to have a count of 5-10 in a library to be considered expressed in that library. Users should also filter with count-per-million (CPM) rather than filtering on the counts directly, as the latter does not account for differences in library sizes between samples"
dge.nolow<-dge[rowSums(cpm(dge)> 5) >= 2,] # two replicates in this experiment.
table(rowSums(cpm(dge)> 5) >= 2)
##
## FALSE TRUE
## 16803 16799
# normalization
dge.nolow <- calcNormFactors(dge.nolow)
# lib.size check
ggplot(dge.nolow$samples,aes(x=sample,y=norm.factors,fill=genotype)) + geom_col() +
theme(axis.text.x = element_text(angle=90, vjust=0.5,size = 7))

MDS plot
mds <- plotMDS(dge.nolow,method = "bcv",labels=dge.nolow$samples$group,gene.selection = "pairwise", ndim=3)

# fix this
mds.pl <- as_tibble(mds$cmdscale.out) %>%
bind_cols(data.frame(sample=row.names(mds$cmdscale.out)),.) %>%
left_join(dge.nolow$sample,by=c(sample="Run"))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
mds.pl %>% ggplot(aes(x=V1,y=V2,color=as.factor(treatment),label=genotype)) + geom_text(angle=30) + ggtitle("DIM 1 vs 2")
# edge.nolowR (model with glm)
# set references in genotype and treatment
dge.nolow$samples$genotype<-factor(dge.nolow$samples$genotype,levels=c("WT","npr4-4D","npr1-1"))
dge.nolow$samples$treatment<-factor(dge.nolow$samples$treatment,levels=c("untreated","treated"))
save(dge.nolow,file=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","dge.nolow.Ding2018.Rdata"))
# making model matrix
design.int <- with(dge.nolow$samples, model.matrix(~ genotype*treatment + rep))
# estimateDisp
dge.nolow.int <- estimateDisp(dge.nolow,design = design.int)
## fit linear model
fit.int <- glmFit(dge.nolow.int,design.int)
# get DEGs, treatment effects in Col (rWT = reference WT, rU = reference is untreated)
trt.lrt.rWT.rU <- glmLRT(fit.int,coef = c("treatmenttreated"))
trt.DEGs.int.rWT.rU <- topTags(trt.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(trt.DEGs.int.rWT.rU) #
## [1] 7862
# get DEGs, genotype effects in npr1-1 (no matter treated or untreated) (coef=c("genotypenpr1-1","genotypenpr1-1:treatmenttreated"))
genotypenpr1_genotypenpr1.treatmenttreated.lrt.rWT.rU <- glmLRT(fit.int,coef = c("genotypenpr1-1","genotypenpr1-1:treatmenttreated"))
genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU <- topTags(genotypenpr1_genotypenpr1.treatmenttreated.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU) #
## [1] 6460
# get DEGs, genotype effects in npr4-4D (no matter treated or untreated) (coef=c("genotypenpr4-4D","genotypenpr4-4D:treatmenttreated"))
genotypenpr4_genotypenpr4.treatmenttreated.lrt.rWT.rU <- glmLRT(fit.int,coef = c("genotypenpr4-4D","genotypenpr4-4D:treatmenttreated"))
genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU <- topTags(genotypenpr4_genotypenpr4.treatmenttreated.lrt.rWT.rU,n = Inf,p.value = 0.05)$table;nrow(genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU) # 846
## [1] 854
Download updated gene description
Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”. “gene_aliases_20201231.txt.gz”.
#At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure. (as of Mar 23, 2021) SSL certificate problem: certificate has expired. Download file and use downloaded file (March 23, 2021)
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20201231.txt.gz") # SSL certificate problem: certificate has expired. Download file and use downloaded file (March 23, 2021)
Using downloaded file (gene_aliases_20201231.txt.gz)
getwd()
## [1] "/Volumes/data_personal/Kazu_blog14/content/post/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data"
system("gunzip -c ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt.gz > ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt")
At.gene.name <- read_tsv(file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","gene_aliases_20201231.txt"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## name = col_character(),
## symbol = col_character(),
## full_name = col_character()
## )
At.gene.name
## # A tibble: 27,644 x 3
## name symbol full_name
## <chr> <chr> <chr>
## 1 18S RRNA 18S RRNA <NA>
## 2 25S RRNA 25S RRNA <NA>
## 3 5.8S RRNA 5.8S RRNA <NA>
## 4 AAN AAN ANOTHER ANDANTE
## 5 AAR1 AAR1 ALLYL ALCOHOL RESISTANT 1
## 6 AAR2 AAR2 ALLYL ALCOHOL RESISTANT 2
## 7 ACL1 ACL1 ACAULIS 1
## 8 ACL2 ACL2 ACAULIS 2
## 9 ACL3 ACL3 ACAULIS 3
## 10 ACL4 ACL4 ACAULIS 4
## # … with 27,634 more rows
# remove uncompressed file
system("rm ../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/gene_aliases_20201231.txt")
cleanup At.gene.name
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
## `summarise()` ungrouping output (override with `.groups` argument)
At.gene.name %>% dplyr::slice(100:110) # OK
## # A tibble: 11 x 3
## name symbol2 full_name
## <chr> <chr> <chr>
## 1 AT1G017… ATKEA1;KEA1 K+ EFFLUX ANTIPORTER 1;K+ efflux antiporter 1
## 2 AT1G018… PEX11C peroxin 11c
## 3 AT1G018… PFC1 PALEFACE 1
## 4 AT1G018… AtGEN1;GEN1 NA;ortholog of HsGEN1
## 5 AT1G019… ATSBT1.1;SBT… NA;NA
## 6 AT1G019… AtGET3a;GET3a Guided Entry of Tail-anchored protein 3a;Guided Entry…
## 7 AT1G019… ARK2;AtKINUb armadillo repeat kinesin 2;Arabidopsis thaliana KINES…
## 8 AT1G019… BIG3;EDA10 BIG3;embryo sac development arrest 10
## 9 AT1G019… AtBBE1;OGOX4 NA;oligogalacturonide oxidase 4
## 10 AT1G020… GAE2 UDP-D-glucuronate 4-epimerase 2
## 11 AT1G020… SEC1A secretory 1A
Make “output” directory for saving DEG results as csv files
# system("mkdir 2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output")
Add gene descripion to DEG files
# WT (Col) treatment DEG
trt.DEGs.int.rWT.rU.desc<- trt.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(trt.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","trt.DEGs.int.rWT.rU.csv"))
# npr1-1 (either treated or untreated)
genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.desc <- genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv"))
# npr4-4 (either treated or untreated)
genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.desc <- genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU %>% mutate(AGI=str_remove(genes,pattern="\\.[[:digit:]]+$")) %>% left_join(At.gene.name,by=c("AGI"="name"))
write_csv(genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.desc,file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output","genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.csv"))
table for DEGs summary
# getting csv file names and path
DEG.objs2 <- list.files(path=file.path("..","..","..","resources","2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files","output"),pattern="\\.csv$",recursive =TRUE,full.names = TRUE)
DEG.objs2
## [1] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv"
## [2] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.csv"
## [3] "../../../resources/2018-10-01-differential-expression-analysis-with-public-available-sequencing-data_files/output/trt.DEGs.int.rWT.rU.csv"
# reading count data
DEG.count.list2 <- DEG.objs2 %>% map(~read_csv(.))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## genes = col_character(),
## logFC.genotypenpr1.1 = col_double(),
## logFC.genotypenpr1.1.treatmenttreated = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## symbol2 = col_character(),
## full_name = col_character()
## )
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## genes = col_character(),
## logFC.genotypenpr4.4D = col_double(),
## logFC.genotypenpr4.4D.treatmenttreated = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## symbol2 = col_character(),
## full_name = col_character()
## )
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## genes = col_character(),
## logFC = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## symbol2 = col_character(),
## full_name = col_character()
## )
DEG.count.list2
## [[1]]
## # A tibble: 6,460 x 10
## genes logFC.genotypen… logFC.genotypen… logCPM LR PValue FDR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AT2G… -4.19 -0.733 6.69 1327. 6.75e-289 1.13e-284
## 2 AT5G… 3.26 0.517 4.76 1072. 1.33e-233 1.11e-229
## 3 AT1G… -2.01 -0.488 7.19 659. 6.38e-144 3.57e-140
## 4 AT1G… -2.25 -0.525 7.51 624. 2.54e-136 1.07e-132
## 5 AT3G… -3.15 0.874 5.33 622. 1.00e-135 3.36e-132
## 6 AT5G… -0.206 -6.73 3.90 595. 7.88e-130 2.21e-126
## 7 AT1G… -3.64 -1.53 3.37 581. 7.28e-127 1.75e-123
## 8 AT5G… 1.83 1.82 9.40 571. 1.04e-124 2.18e-121
## 9 AT2G… 2.23 1.94 9.22 559. 4.26e-122 7.94e-119
## 10 AT5G… -2.01 -3.49 4.19 538. 1.19e-117 2.00e-114
## # … with 6,450 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## # full_name <chr>
##
## [[2]]
## # A tibble: 854 x 10
## genes logFC.genotypen… logFC.genotypen… logCPM LR PValue FDR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AT5G… 3.04 0.632 4.76 966. 1.54e-210 2.59e-206
## 2 AT5G… -3.31 0.358 3.10 463. 3.40e-101 2.85e- 97
## 3 AT3G… -5.46 -0.178 1.98 454. 2.07e- 99 1.16e- 95
## 4 AT1G… -7.41 -3.70 2.00 424. 6.68e- 93 2.80e- 89
## 5 AT3G… -2.97 -0.297 3.36 393. 4.39e- 86 1.47e- 82
## 6 AT2G… -1.42 -0.699 6.69 284. 2.63e- 62 7.35e- 59
## 7 AT1G… -2.50 0.894 6.30 280. 1.23e- 61 2.96e- 58
## 8 AT2G… -1.10 -0.233 5.87 221. 9.62e- 49 2.02e- 45
## 9 AT5G… -3.04 0.438 4.19 208. 6.15e- 46 1.15e- 42
## 10 AT5G… -1.84 -0.485 2.78 190. 4.64e- 42 7.79e- 39
## # … with 844 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## # full_name <chr>
##
## [[3]]
## # A tibble: 7,862 x 9
## genes logFC logCPM LR PValue FDR AGI symbol2 full_name
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 AT5G1… 8.58 3.90 662. 5.49e-146 9.22e-142 AT5G1… AT-HSP1… heat shock pro…
## 2 AT1G0… 4.01 4.60 613. 2.14e-135 1.80e-131 AT1G0… <NA> <NA>
## 3 AT4G2… 3.86 5.39 586. 2.36e-129 1.32e-125 AT4G2… CRK14 cysteine-rich …
## 4 AT5G2… 6.17 4.19 582. 1.27e-128 5.32e-125 AT5G2… ATWRKY3… ARABIDOPSIS TH…
## 5 AT3G2… 4.93 3.36 578. 9.46e-128 3.18e-124 AT3G2… <NA> <NA>
## 6 AT5G1… 9.65 3.77 569. 1.18e-125 3.30e-122 AT5G1… HSP17.6… 17.6 kDa class…
## 7 AT4G2… 3.34 6.70 563. 1.80e-124 4.32e-121 AT4G2… SGT1A <NA>
## 8 AT1G2… 2.88 7.19 539. 2.54e-119 5.34e-116 AT1G2… AtWAK1;… NA;NA;cell wal…
## 9 AT5G2… 3.83 6.70 529. 4.88e-117 9.12e-114 AT5G2… AtDMR6;… NA;DOWNY MILDE…
## 10 AT2G1… 2.96 7.30 512. 2.22e-113 3.73e-110 AT2G1… ATSERK4… SOMATIC EMBRYO…
## # … with 7,852 more rows
# choose file names
names(DEG.count.list2) <- str_split(DEG.objs2,pattern="/",simplify=TRUE)[,7] %>% str_remove(".csv")
# count DEGs
DEGs.count <- sapply(DEG.count.list2,function(x) {
value <- nrow(x)
ifelse(is.null(value),0,value)
} )
DEGs.count
## genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU
## 6460
## genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU
## 854
## trt.DEGs.int.rWT.rU
## 7862
Plot gene expression pattern
For some reasons, labeller = label_wrap_gen(width=30) does not work here. Needs to make a function for plotting graph later to reduce redundancy of scripts.
# calculate normalized gene expression level
cpm.wide <- bind_cols(dge.nolow$gene,as_tibble(cpm(dge.nolow))) %>% as_tibble() %>% rename(transcript_ID=genes)
cpm.wide
## # A tibble: 16,799 x 13
## transcript_ID SRR6739807 SRR6739808 SRR6739809 SRR6739810 SRR6739811
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AT1G50920.1 169. 164. 175. 158. 184.
## 2 AT1G15970.1 20.0 19.7 15.2 15.6 14.6
## 3 AT1G73440.1 6.22 6.17 7.39 6.33 7.10
## 4 AT1G75120.1 2.44 2.20 3.55 4.94 5.59
## 5 AT1G17600.1 32.1 36.5 11.1 9.87 12.1
## 6 AT1G77370.1 34.1 36.3 23.7 25.1 22.9
## 7 AT1G10950.1 172. 171. 133. 148. 173.
## 8 AT1G31870.1 40.8 38.5 26.8 26.9 33.3
## 9 AT1G51360.1 7.40 7.64 9.07 10.4 6.82
## 10 AT1G50950.1 4.45 4.49 3.74 5.16 5.50
## # … with 16,789 more rows, and 7 more variables: SRR6739812 <dbl>,
## # SRR6739813 <dbl>, SRR6739814 <dbl>, SRR6739815 <dbl>, SRR6739816 <dbl>,
## # SRR6739817 <dbl>, SRR6739818 <dbl>
# plot
# format data for plot
plot.data <- DEG.count.list2[["trt.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes)
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p1<-plot.data %>% filter(logFC>0 & FDR<1e-100) %>% dplyr::select(-logFC,-logCPM,-FDR,-full_name) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="WT SA up-regulated genes")
p1

# SA down-regulated genes
plot.data2 <- DEG.count.list2[["trt.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes)
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p2<-plot.data2 %>% filter(logFC<0 & FDR<1e-50) %>% dplyr::select(-logFC,-logCPM,-FDR,-full_name) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="WT SA down-regulated genes")
p2
# npr1-1
# npr1-1 misregulated genes
plot.data3 <- DEG.count.list2[["genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU"]] %>% left_join(cpm.wide,by=c("genes"="transcript_ID")) %>% unite(AGI_desc,c("AGI","symbol2")) %>% dplyr::select(-LR,-PValue,-genes)
# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p3 <- plot.data3 %>% filter(logFC.genotypenpr1.1>0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1>0")
p3

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p4 <- plot.data3 %>% filter(logFC.genotypenpr1.1<0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1<0")
p4

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p5 <- plot.data3 %>% filter(logFC.genotypenpr1.1.treatmenttreated>0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1.treatmenttreated>0")
p5

# plot: use labeller = label_wrap_gen(width=10) for multiple line in each gene name
p6 <- plot.data3 %>% filter(logFC.genotypenpr1.1.treatmenttreated<0 & FDR<1e-100) %>% dplyr::select(-logFC.genotypenpr1.1,-logFC.genotypenpr1.1.treatmenttreated,-FDR,-full_name,-logCPM) %>% gather(Run,value,-1) %>% left_join(sample.info.final,by="Run") %>% select(-Run,-sample,-group,-LibraryLayout) %>% ggplot(aes(x=fct_relevel(treatment,"untreated"),y=value,color=genotype,shape=rep)) + geom_jitter() + facet_wrap(AGI_desc~.,scale="free",ncol=5,labeller = label_wrap_gen(width=30))+theme(strip.text = element_text(size=6)) + labs(title="logFC.genotypenpr1.1.treatmenttreated<0")
p6

Session info
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] edgeR_3.28.1 limma_3.42.2 forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [9] tibble_3.0.4 ggplot2_3.3.3 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] locfit_1.5-9.4 tidyselect_1.1.0 xfun_0.22 splines_3.6.2
## [5] lattice_0.20-41 haven_2.3.1 colorspace_2.0-0 vctrs_0.3.6
## [9] generics_0.1.0 htmltools_0.5.1.1 yaml_2.2.1 utf8_1.1.4
## [13] rlang_0.4.10 pillar_1.4.7 glue_1.4.2 withr_2.3.0
## [17] DBI_1.1.0 dbplyr_2.0.0 modelr_0.1.8 readxl_1.3.1
## [21] lifecycle_0.2.0 munsell_0.5.0 blogdown_1.2.2 gtable_0.3.0
## [25] cellranger_1.1.0 rvest_0.3.6 evaluate_0.14 labeling_0.4.2
## [29] knitr_1.31 curl_4.3 fansi_0.4.1 highr_0.8
## [33] broom_0.7.3 Rcpp_1.0.6 scales_1.1.1 backports_1.2.1
## [37] jsonlite_1.7.2 farver_2.0.3 fs_1.5.0 hms_0.5.3
## [41] digest_0.6.27 stringi_1.5.3 bookdown_0.21 grid_3.6.2
## [45] cli_2.2.0 tools_3.6.2 magrittr_2.0.1 crayon_1.3.4
## [49] pkgconfig_2.0.3 ellipsis_0.3.1 xml2_1.3.2 reprex_0.3.0
## [53] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.7 httr_1.4.2
## [57] rstudioapi_0.13 R6_2.5.0 compiler_3.6.2