Download updated gene description (copied from “2018-10-01-…Rmd”)
Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”.
library(tidyverse);library(readxl);library(readr)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# turn on TAIR subsciption before this procedure
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure.
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
At.gene.name %>% dplyr::slice(100:110) # OK
## # A tibble: 11 x 3
## name symbol2 full_name
## <chr> <chr> <chr>
## 1 AT1G018… AtGEN1;GEN1 NA;ortholog of HsGEN1
## 2 AT1G019… ATSBT1.1;SBTI1.1 NA;NA
## 3 AT1G019… AtGET3a;GET3a Guided Entry of Tail-anchored protein 3a;Gui…
## 4 AT1G019… ARK2;AtKINUb armadillo repeat kinesin 2;Arabidopsis thali…
## 5 AT1G019… BIG3;EDA10 BIG3;embryo sac development arrest 10
## 6 AT1G019… AtBBE1;OGOX4 NA;oligogalacturonide oxidase 4
## 7 AT1G020… GAE2 UDP-D-glucuronate 4-epimerase 2
## 8 AT1G020… SEC1A secretory 1A
## 9 AT1G020… LAP6;PKSA LESS ADHESIVE POLLEN 6;polyketide synthase A
## 10 AT1G020… SPL8 squamosa promoter binding protein-like 8
## 11 AT1G020… ATCSN7;COP15;CS… ARABIDOPSIS THALIANA COP9 SIGNALOSOME SUBUNI…
# download cDNA info directly from TAIR
At_cdna<-Biostrings::readDNAStringSet("https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna # 33602 genes
## A DNAStringSet instance of length 33602
## width seq names
## [1] 2394 AGAAAACAGTCGACCGTCA...TTGGTAATTTTTTGAGTC AT1G50920.1 | Sym...
## [2] 546 ATGACTCGTTTGTTGCCTT...GTTGATTCTGGTACATAG AT1G36960.1 | Sym...
## [3] 2314 ATGGATTCAGAGTCAGAGT...GGTGCATTGTGTTTCTCC AT1G44020.1 | Sym...
## [4] 1658 TCGTTTCGTCGTCGATCAG...GATTACATGCTACATTTT AT1G15970.1 | Sym...
## [5] 1453 ATTGAAAAGAAAACACATC...CACCAAAATCTTCTCATA AT1G73440.1 | Sym...
## ... ... ...
## [33598] 87 GGATGGATGTCTGAGCGGT...CGAATCCCTCTCCATCCG ATMG00420.1 | Sym...
## [33599] 384 ATGCTCCCCGCCGGTTGTT...CGATACTTAACTATATAA ATMG01330.1 | Sym...
## [33600] 573 ATGGATAACCAATTCATTT...CAGCGTAGCGACGGATAA ATMG00070.1 | Sym...
## [33601] 366 ATGGCATCAAAAATCCGCA...CCTTCTGCATACGCATAA ATMG00130.1 | Sym...
## [33602] 74 GCGCTCTTAGTTCAGTTCG...CAAATCCTACAGAGCGTG ATMG00930.1 | Sym...
Updated hormone responsive gene list1
hormone.responsiveness6.DF.s.list2.DF2<-readxl::read_xlsx(file.path("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","Supplemental_Dataset3_source_of_custom_categories.xlsx"),skip=15,sheet=1) # Does not work. why?
# because uglyURLs and canonifyURLs = true on 100818. I commented out them.
# Ask Kazu for this xlsx file.
hormone.responsiveness6.DF.s.list2.DF2
## # A tibble: 519 x 20
## ABAdown ACCup ACCdown BLup BLdown IAAup IAAdown MJup MJdown CKup
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 AT1G21… AT1G… AT1G70… AT1G… AT1G2… AT1G… AT1G74… AT1G… AT1G7… AT1G…
## 2 AT1G21… AT1G… AT1G29… AT1G… AT1G1… AT1G… AT1G31… AT1G… AT1G3… AT1G…
## 3 AT1G49… AT1G… AT1G28… AT1G… AT1G6… AT1G… AT1G17… AT1G… AT1G6… AT1G…
## 4 AT1G73… AT1G… AT1G64… AT2G… AT1G0… AT1G… AT1G13… AT1G… AT1G7… AT1G…
## 5 AT1G70… AT1G… AT1G72… AT2G… AT1G7… AT1G… AT1G18… AT1G… AT1G4… AT1G…
## 6 AT1G31… AT2G… AT2G26… AT2G… AT1G1… AT1G… AT1G31… AT1G… AT1G7… AT1G…
## 7 AT1G77… AT2G… AT2G24… AT2G… AT1G7… AT1G… AT1G77… AT1G… AT1G1… AT1G…
## 8 AT1G16… AT2G… AT2G03… AT2G… AT2G2… AT1G… AT1G13… AT1G… AT1G6… AT1G…
## 9 AT1G70… AT2G… AT3G13… AT2G… AT2G3… AT1G… AT1G72… AT1G… AT1G1… AT1G…
## 10 AT1G77… AT2G… AT3G56… AT3G… AT2G1… AT1G… AT1G72… AT1G… AT1G6… AT1G…
## # ... with 509 more rows, and 10 more variables: CKdown <chr>, GAup <chr>,
## # GAdown <chr>, PIFtarget <chr>, MYC234up <chr>, MYC234down <chr>,
## # SAup <chr>, Sadown <chr>, WRKY33up <chr>, WRKY33down <chr>
# conver to list obejct compatible with goseq() below
format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)
# extract AGI from At_cdna
At.gene.name<-tibble(name=names(At_cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>% mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+"))
# make presence/absense hormone responsive gene table (dataframe)
for(i in 1:20) {
genes<-hormone.responsiveness6.DF.s.list2.DF2[!is.na(hormone.responsiveness6.DF.s.list2.DF2[,i]),i] %>% as_vector()
temp<-data.frame(AGI=genes,category=names(hormone.responsiveness6.DF.s.list2.DF2)[i])
#At.gene.name %>% left_join(temp,by=c("name"="AGI")) -> At.gene.name
At.gene.name %>% left_join(temp,by="AGI") -> At.gene.name
}
names(At.gene.name)[5:24] <- names(hormone.responsiveness6.DF.s.list2.DF2)
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",")->test2
test2[1:10,] %>% mutate(category=gsub("NA","",category)) %>% mutate(category=gsub(",","",category))
# only select genes with AGI name and concatenate categories
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",") %>% mutate(category=gsub("NA,","",category)) %>% mutate(category=gsub(",NA","",category)) %>% mutate(category=gsub("NA","",category)) ->test3
# convert into list object
temp.list<-list()
for(i in 1:dim(test3)[1]) {
temp.list[[i]]<-test3 %>% slice(i) %>% pull(category)
}
names(temp.list)<-test3 %>% pull(AGI)
# split concatanated categories in each gene
At.hormone.responsive.list<-lapply(temp.list,function(x) unlist(strsplit(x, split=",")))
# save
save(At.hormone.responsive.list,file="2018-09-27-over-representation-analysis-2-goseq-with-custom-categories_files/At.hormone.responsive.list.Rdata")
GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)
load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/At.hormone.responsive.list.Rdata")
head(At.hormone.responsive.list)
## $AT1G50920
## character(0)
##
## $AT1G36960
## character(0)
##
## $AT1G44020
## character(0)
##
## $AT1G15970
## character(0)
##
## $AT1G73440
## character(0)
##
## $AT1G75120
## character(0)
library(ShortRead);library(goseq);library(GO.db);library("annotate")
# bias.data vector must have the same length as DEgenes vector!
## if you want to test expressed genes as background, add background in this function
GOseq.At.customcat.ORA<-function(genelist,padjust=0.05,custom.category.list=At.hormone.responsive.list,cdna=At_cdna,background="") { # return GO enrichment table, padjus, padjust=0.05. New BLAST2GO results with only BP Brgo.v2.5.BP.list is used (042518). Brgo.v2.5.BP.list is based on Hajar's Blast2go, which should be replaced with Brgo.Atgoslim.BP.list and giving a new name to this function (080818). cdna is DNAstring object. background is a vector of genes.
#bias<-nchar(cdna)
bias<-Biostrings::width(cdna)
# add name to "bias". cdna
#names(bias)<-names(cdna)
names(bias) <- tibble(name=names(cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>% mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) %>% pull(AGI)
if(background=="") {print("Use all genes in genome as background")} else {
# select only expressed geens
bias <- bias[background]
print("Expessed genes are used as background.")
print(paste("length of bias is ",length(bias)))
}
TF<-(names(bias) %in% genelist)*1
names(TF)<-names(bias)
#print(TF)
pwf<-nullp(TF,bias.data=bias)
#print(pwf$DEgenes)
GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
# calculate p ajust by BH
GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
else {
enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,]
print("enriched.GO is")
print(enriched.GO)
return(enriched.GO)
}
}
test the function
load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/dge.nolow.Rdata")
expressed.genes<-gsub("(.)(\\.[[:digit:]]+)","\\1",as_vector(dge.nolow$genes))
## Loading required package: edgeR
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background="") # Total genes in genome are background
## [1] "Use all genes in genome as background"
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 11 IAAup 0.000000e+00 1.0000000 198
## 12 MJdown 4.395205e-21 1.0000000 23
## 5 BLup 7.235275e-18 1.0000000 11
## 19 WRKY33down 1.763824e-16 1.0000000 18
## 16 PIFtarget 2.001183e-14 1.0000000 11
## 3 ACCup 2.464518e-11 1.0000000 8
## 13 MJup 6.723578e-09 1.0000000 18
## 1 ABAdown 1.782286e-07 1.0000000 12
## 4 BLdown 2.043251e-06 0.9999999 5
## 18 SAup 8.089212e-06 0.9999992 8
## 8 GAdown 1.526185e-05 0.9999993 5
## 7 CKup 5.622343e-04 0.9999614 4
## 10 IAAdown 6.516900e-03 0.9994291 3
## numInCat over_represented_padjust
## 11 198 0.000000e+00
## 12 230 4.395205e-20
## 5 28 4.823517e-17
## 19 195 8.819122e-16
## 16 52 8.004731e-14
## 3 34 8.215059e-11
## 13 519 1.921022e-08
## 1 269 4.455716e-07
## 4 33 4.540558e-06
## 18 168 1.617842e-05
## 8 50 2.774883e-05
## 7 60 9.370572e-04
## 10 61 1.002600e-02
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 11 IAAup 0.000000e+00 1.0000000 198
## 12 MJdown 4.395205e-21 1.0000000 23
## 5 BLup 7.235275e-18 1.0000000 11
## 19 WRKY33down 1.763824e-16 1.0000000 18
## 16 PIFtarget 2.001183e-14 1.0000000 11
## 3 ACCup 2.464518e-11 1.0000000 8
## 13 MJup 6.723578e-09 1.0000000 18
## 1 ABAdown 1.782286e-07 1.0000000 12
## 4 BLdown 2.043251e-06 0.9999999 5
## 18 SAup 8.089212e-06 0.9999992 8
## 8 GAdown 1.526185e-05 0.9999993 5
## 7 CKup 5.622343e-04 0.9999614 4
## 10 IAAdown 6.516900e-03 0.9994291 3
## numInCat over_represented_padjust
## 11 198 0.000000e+00
## 12 230 4.395205e-20
## 5 28 4.823517e-17
## 19 195 8.819122e-16
## 16 52 8.004731e-14
## 3 34 8.215059e-11
## 13 519 1.921022e-08
## 1 269 4.455716e-07
## 4 33 4.540558e-06
## 18 168 1.617842e-05
## 8 50 2.774883e-05
## 7 60 9.370572e-04
## 10 61 1.002600e-02
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background=expressed.genes) # Expressed genes in samples used for DEGs are background.
## Warning in if (background == "") {: the condition has length > 1 and only
## the first element will be used
## [1] "Expessed genes are used as background."
## [1] "length of bias is 16799"
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 11 IAAup 0.000000e+00 1.0000000 143
## 12 MJdown 5.092312e-16 1.0000000 21
## 5 BLup 6.670465e-13 1.0000000 9
## 19 WRKY33down 1.372204e-12 1.0000000 16
## 16 PIFtarget 4.126366e-10 1.0000000 8
## 3 ACCup 3.002866e-06 0.9999999 5
## 4 BLdown 8.892919e-06 0.9999997 5
## 13 MJup 4.273610e-05 0.9999900 14
## 1 ABAdown 7.261961e-05 0.9999870 10
## 18 SAup 9.374241e-05 0.9999867 8
## 8 GAdown 5.425846e-04 0.9999639 4
## 7 CKup 7.477068e-03 0.9993214 3
## numInCat over_represented_padjust
## 11 143 0.000000e+00
## 12 221 5.092312e-15
## 5 25 4.446976e-12
## 19 159 6.861019e-12
## 16 34 1.650546e-09
## 3 26 1.000955e-05
## 4 33 2.540834e-05
## 13 454 1.068403e-04
## 1 257 1.613769e-04
## 18 154 1.874848e-04
## 8 43 9.865175e-04
## 7 46 1.246178e-02
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 11 IAAup 0.000000e+00 1.0000000 143
## 12 MJdown 5.092312e-16 1.0000000 21
## 5 BLup 6.670465e-13 1.0000000 9
## 19 WRKY33down 1.372204e-12 1.0000000 16
## 16 PIFtarget 4.126366e-10 1.0000000 8
## 3 ACCup 3.002866e-06 0.9999999 5
## 4 BLdown 8.892919e-06 0.9999997 5
## 13 MJup 4.273610e-05 0.9999900 14
## 1 ABAdown 7.261961e-05 0.9999870 10
## 18 SAup 9.374241e-05 0.9999867 8
## 8 GAdown 5.425846e-04 0.9999639 4
## 7 CKup 7.477068e-03 0.9993214 3
## numInCat over_represented_padjust
## 11 143 0.000000e+00
## 12 221 5.092312e-15
## 5 25 4.446976e-12
## 19 159 6.861019e-12
## 16 34 1.650546e-09
## 3 26 1.000955e-05
## 4 33 2.540834e-05
## 13 454 1.068403e-04
## 1 257 1.613769e-04
## 18 154 1.874848e-04
## 8 43 9.865175e-04
## 7 46 1.246178e-02
Session info
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] edgeR_3.22.5 limma_3.36.5
## [3] annotate_1.58.0 XML_3.98-1.16
## [5] GO.db_3.6.0 AnnotationDbi_1.42.1
## [7] goseq_1.32.0 geneLenDataBase_1.16.0
## [9] BiasedUrn_1.07 ShortRead_1.38.0
## [11] GenomicAlignments_1.16.0 SummarizedExperiment_1.10.1
## [13] DelayedArray_0.6.6 matrixStats_0.54.0
## [15] Biobase_2.40.0 Rsamtools_1.32.3
## [17] GenomicRanges_1.32.7 GenomeInfoDb_1.16.0
## [19] Biostrings_2.48.0 XVector_0.20.0
## [21] IRanges_2.14.12 S4Vectors_0.18.3
## [23] BiocParallel_1.14.2 BiocGenerics_0.26.0
## [25] bindrcpp_0.2.2 readxl_1.1.0
## [27] forcats_0.3.0 stringr_1.3.1
## [29] dplyr_0.7.7 purrr_0.2.5
## [31] readr_1.1.1 tidyr_0.8.2
## [33] tibble_1.4.2 ggplot2_3.1.0
## [35] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 bitops_1.0-6 bit64_0.9-7
## [4] lubridate_1.7.4 progress_1.2.0 RColorBrewer_1.1-2
## [7] httr_1.3.1 rprojroot_1.3-2 tools_3.5.1
## [10] backports_1.1.2 utf8_1.1.4 R6_2.3.0
## [13] mgcv_1.8-25 DBI_1.0.0 lazyeval_0.2.1
## [16] colorspace_1.3-2 withr_2.1.2 prettyunits_1.0.2
## [19] tidyselect_0.2.5 bit_1.1-14 curl_3.2
## [22] compiler_3.5.1 cli_1.0.1 rvest_0.3.2
## [25] xml2_1.2.0 rtracklayer_1.40.6 bookdown_0.7
## [28] scales_1.0.0 digest_0.6.18 rmarkdown_1.10
## [31] pkgconfig_2.0.2 htmltools_0.3.6 rlang_0.3.0.1
## [34] RSQLite_2.1.1 rstudioapi_0.8 bindr_0.1.1
## [37] hwriter_1.3.2 jsonlite_1.5 RCurl_1.95-4.11
## [40] magrittr_1.5 GenomeInfoDbData_1.1.0 Matrix_1.2-15
## [43] Rcpp_0.12.19 munsell_0.5.0 fansi_0.4.0
## [46] stringi_1.2.4 yaml_2.2.0 zlibbioc_1.26.0
## [49] plyr_1.8.4 blob_1.1.1 grid_3.5.1
## [52] crayon_1.3.4 lattice_0.20-35 haven_1.1.2
## [55] GenomicFeatures_1.32.3 hms_0.4.2 locfit_1.5-9.1
## [58] knitr_1.20 pillar_1.3.0 biomaRt_2.36.1
## [61] glue_1.3.0 evaluate_0.12 blogdown_0.9
## [64] latticeExtra_0.6-28 modelr_0.1.2 cellranger_1.1.0
## [67] gtable_0.2.0 assertthat_0.2.0 xfun_0.4
## [70] xtable_1.8-3 broom_0.5.0 memoise_1.1.0
References
Nozue K, Devisetty UK, Lekkala S, Mueller-Moule P, Bak A, Casteel CL, Maloof JN (2018) Network analysis reveals a role for salicylic acid pathway components in shade avoidance. Plant Physiol. doi: 10.1104/pp.18.00920↩