Download updated gene description (copied from “2018-10-01-…Rmd”)

Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”.

library(tidyverse);library(readxl);library(readr)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# turn on TAIR subsciption before this procedure
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure.
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
At.gene.name %>% dplyr::slice(100:110) # OK
# download cDNA info directly from TAIR
At_cdna<-Biostrings::readDNAStringSet("https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna # 33602 genes

using downloaded file

system("gunzip -c ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated.gz > ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna <- Biostrings::readDNAStringSet(file.path("..","..","..","resources","2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files","TAIR10_cdna_20110103_representative_gene_model_updated"))
At_cdna
##   A DNAStringSet instance of length 33602
##         width seq                                           names               
##     [1]  2394 AGAAAACAGTCGACCGTCATT...TATTTGGTAATTTTTTGAGTC AT1G50920.1 | Sym...
##     [2]   546 ATGACTCGTTTGTTGCCTTAC...CAAGTTGATTCTGGTACATAG AT1G36960.1 | Sym...
##     [3]  2314 ATGGATTCAGAGTCAGAGTCA...CCTGGTGCATTGTGTTTCTCC AT1G44020.1 | Sym...
##     [4]  1658 TCGTTTCGTCGTCGATCAGAA...ATCGATTACATGCTACATTTT AT1G15970.1 | Sym...
##     [5]  1453 ATTGAAAAGAAAACACATCCC...GACCACCAAAATCTTCTCATA AT1G73440.1 | Sym...
##     ...   ... ...
## [33598]    87 GGATGGATGTCTGAGCGGTTG...GTTCGAATCCCTCTCCATCCG ATMG00420.1 | Sym...
## [33599]   384 ATGCTCCCCGCCGGTTGTTGG...CTACGATACTTAACTATATAA ATMG01330.1 | Sym...
## [33600]   573 ATGGATAACCAATTCATTTTC...GAACAGCGTAGCGACGGATAA ATMG00070.1 | Sym...
## [33601]   366 ATGGCATCAAAAATCCGCAAA...GATCCTTCTGCATACGCATAA ATMG00130.1 | Sym...
## [33602]    74 GCGCTCTTAGTTCAGTTCGGT...GTTCAAATCCTACAGAGCGTG ATMG00930.1 | Sym...
system("rm ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated")

Updated hormone responsive gene list1

hormone.responsiveness6.DF.s.list2.DF2<-readxl::read_xlsx(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","Supplemental_Dataset3_source_of_custom_categories.xlsx"),skip=15,sheet=1) # Does not work. why? 
# because uglyURLs and canonifyURLs = true on 100818. I commented out them.
# Ask Kazu for this xlsx file.
hormone.responsiveness6.DF.s.list2.DF2
## # A tibble: 519 x 20
##    ABAdown ACCup ACCdown BLup  BLdown IAAup IAAdown MJup  MJdown CKup  CKdown
##    <chr>   <chr> <chr>   <chr> <chr>  <chr> <chr>   <chr> <chr>  <chr> <chr> 
##  1 AT1G21… AT1G… AT1G70… AT1G… AT1G2… AT1G… AT1G74… AT1G… AT1G7… AT1G… AT1G4…
##  2 AT1G21… AT1G… AT1G29… AT1G… AT1G1… AT1G… AT1G31… AT1G… AT1G3… AT1G… AT1G7…
##  3 AT1G49… AT1G… AT1G28… AT1G… AT1G6… AT1G… AT1G17… AT1G… AT1G6… AT1G… AT1G7…
##  4 AT1G73… AT1G… AT1G64… AT2G… AT1G0… AT1G… AT1G13… AT1G… AT1G7… AT1G… AT2G4…
##  5 AT1G70… AT1G… AT1G72… AT2G… AT1G7… AT1G… AT1G18… AT1G… AT1G4… AT1G… AT2G3…
##  6 AT1G31… AT2G… AT2G26… AT2G… AT1G1… AT1G… AT1G31… AT1G… AT1G7… AT1G… AT2G3…
##  7 AT1G77… AT2G… AT2G24… AT2G… AT1G7… AT1G… AT1G77… AT1G… AT1G1… AT1G… AT2G2…
##  8 AT1G16… AT2G… AT2G03… AT2G… AT2G2… AT1G… AT1G13… AT1G… AT1G6… AT1G… AT4G1…
##  9 AT1G70… AT2G… AT3G13… AT2G… AT2G3… AT1G… AT1G72… AT1G… AT1G1… AT1G… AT4G3…
## 10 AT1G77… AT2G… AT3G56… AT3G… AT2G1… AT1G… AT1G72… AT1G… AT1G6… AT1G… AT4G3…
## # … with 509 more rows, and 9 more variables: GAup <chr>, GAdown <chr>,
## #   PIFtarget <chr>, MYC234up <chr>, MYC234down <chr>, SAup <chr>,
## #   Sadown <chr>, WRKY33up <chr>, WRKY33down <chr>
# conver to list obejct compatible with goseq() below

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

# extract AGI from At_cdna
At.gene.name<-tibble(name=names(At_cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) 
# make presence/absense hormone responsive gene table (dataframe)
  for(i in 1:20) {
  genes<-hormone.responsiveness6.DF.s.list2.DF2[!is.na(hormone.responsiveness6.DF.s.list2.DF2[,i]),i] %>% as_vector()
  temp<-data.frame(AGI=genes,category=names(hormone.responsiveness6.DF.s.list2.DF2)[i])
  #At.gene.name %>% left_join(temp,by=c("name"="AGI")) -> At.gene.name
At.gene.name %>% left_join(temp,by="AGI") -> At.gene.name 
  }
names(At.gene.name)[5:24] <- names(hormone.responsiveness6.DF.s.list2.DF2)
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",")->test2
test2[1:10,] %>%  mutate(category=gsub("NA","",category)) %>% mutate(category=gsub(",","",category))
# only select genes with AGI name and concatenate categories
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",") %>%  mutate(category=gsub("NA,","",category)) %>% mutate(category=gsub(",NA","",category)) %>% mutate(category=gsub("NA","",category)) ->test3
# convert into list object
temp.list<-list()
for(i in 1:dim(test3)[1]) {
  temp.list[[i]]<-test3 %>% slice(i) %>% pull(category)
}
names(temp.list)<-test3 %>% pull(AGI)
# split concatanated categories in each gene
At.hormone.responsive.list<-lapply(temp.list,function(x) unlist(strsplit(x, split=",")))
# save
save(At.hormone.responsive.list,file=file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","At.hormone.responsive.list.Rdata"))

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

load(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","At.hormone.responsive.list.Rdata"))
head(At.hormone.responsive.list)
## $AT1G50920
## character(0)
## 
## $AT1G36960
## character(0)
## 
## $AT1G44020
## character(0)
## 
## $AT1G15970
## character(0)
## 
## $AT1G73440
## character(0)
## 
## $AT1G75120
## character(0)
library(ShortRead);library(goseq);library(GO.db);library("annotate")
## Warning: package 'S4Vectors' was built under R version 3.6.3
## Warning: package 'GenomeInfoDb' was built under R version 3.6.3
## Warning: package 'DelayedArray' was built under R version 3.6.3
#  bias.data vector must have the same length as DEgenes vector! 
## if you want to test expressed genes as background, add background in this function
GOseq.At.customcat.ORA<-function(genelist,padjust=0.05,custom.category.list=At.hormone.responsive.list,cdna=At_cdna,background="") { # return GO enrichment table, padjus, padjust=0.05. New BLAST2GO results with only BP Brgo.v2.5.BP.list is used (042518). Brgo.v2.5.BP.list is based on Hajar's Blast2go, which should be replaced with Brgo.Atgoslim.BP.list and giving a new name to this function (080818). cdna is DNAstring object. background is a vector of genes.
  #bias<-nchar(cdna)
  bias<-Biostrings::width(cdna)
  # add name to "bias". cdna
  #names(bias)<-names(cdna)
 names(bias) <- tibble(name=names(cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) %>% pull(AGI)
 if(background=="") {print("Use all genes in genome as background")} else {
 # select only expressed geens 
  bias <- bias[background]
  print("Expessed genes are used as background.")
  print(paste("length of bias is ",length(bias)))
 }
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  # calculate p ajust by BH
  GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
  if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    return(enriched.GO)
  }
}

test the function

load(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","dge.nolow.Rdata"))
expressed.genes<-gsub("(.)(\\.[[:digit:]]+)","\\1",as_vector(dge.nolow$genes))
## Loading required package: edgeR
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background="") # Total genes in genome are background
## [1] "Use all genes in genome as background"
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background=expressed.genes) # Expressed genes in samples used for DEGs are background.
## Warning in if (background == "") {: the condition has length > 1 and only the
## first element will be used
## [1] "Expessed genes are used as background."
## [1] "length of bias is  16799"
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

Session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] edgeR_3.28.1                limma_3.42.2               
##  [3] annotate_1.64.0             XML_3.99-0.3               
##  [5] GO.db_3.10.0                AnnotationDbi_1.48.0       
##  [7] goseq_1.38.0                geneLenDataBase_1.22.0     
##  [9] BiasedUrn_1.07              ShortRead_1.44.3           
## [11] GenomicAlignments_1.22.1    SummarizedExperiment_1.16.1
## [13] DelayedArray_0.12.3         matrixStats_0.57.0         
## [15] Biobase_2.46.0              Rsamtools_2.2.3            
## [17] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [19] Biostrings_2.54.0           XVector_0.26.0             
## [21] IRanges_2.20.2              S4Vectors_0.24.4           
## [23] BiocParallel_1.20.1         BiocGenerics_0.32.0        
## [25] readxl_1.3.1                forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_1.0.2                
## [29] purrr_0.3.4                 readr_1.4.0                
## [31] tidyr_1.1.2                 tibble_3.0.4               
## [33] ggplot2_3.3.3               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0       hwriter_1.3.2          ellipsis_0.3.1        
##  [4] fs_1.5.0               rstudioapi_0.13        bit64_4.0.5           
##  [7] fansi_0.4.1            lubridate_1.7.9.2      xml2_1.3.2            
## [10] splines_3.6.2          knitr_1.31             jsonlite_1.7.2        
## [13] broom_0.7.3            dbplyr_2.0.0           png_0.1-7             
## [16] compiler_3.6.2         httr_1.4.2             backports_1.2.1       
## [19] assertthat_0.2.1       Matrix_1.3-2           cli_2.2.0             
## [22] htmltools_0.5.1.1      prettyunits_1.1.1      tools_3.6.2           
## [25] gtable_0.3.0           glue_1.4.2             GenomeInfoDbData_1.2.2
## [28] rappdirs_0.3.1         Rcpp_1.0.6             cellranger_1.1.0      
## [31] vctrs_0.3.6            nlme_3.1-151           blogdown_1.2.2        
## [34] rtracklayer_1.46.0     xfun_0.22              rvest_0.3.6           
## [37] lifecycle_0.2.0        zlibbioc_1.32.0        scales_1.1.1          
## [40] hms_0.5.3              RColorBrewer_1.1-2     yaml_2.2.1            
## [43] curl_4.3               memoise_1.1.0          biomaRt_2.42.1        
## [46] latticeExtra_0.6-29    stringi_1.5.3          RSQLite_2.2.1         
## [49] highr_0.8              GenomicFeatures_1.38.2 rlang_0.4.10          
## [52] pkgconfig_2.0.3        bitops_1.0-6           evaluate_0.14         
## [55] lattice_0.20-41        bit_4.0.4              tidyselect_1.1.0      
## [58] magrittr_2.0.1         bookdown_0.21          R6_2.5.0              
## [61] generics_0.1.0         DBI_1.1.0              pillar_1.4.7          
## [64] haven_2.3.1            withr_2.3.0            mgcv_1.8-33           
## [67] RCurl_1.98-1.2         modelr_0.1.8           crayon_1.3.4          
## [70] utf8_1.1.4             BiocFileCache_1.10.2   rmarkdown_2.7         
## [73] jpeg_0.1-8.1           progress_1.2.2         locfit_1.5-9.4        
## [76] grid_3.6.2             blob_1.2.1             reprex_0.3.0          
## [79] digest_0.6.27          xtable_1.8-4           openssl_1.4.3         
## [82] munsell_0.5.0          askpass_1.1

References


  1. Nozue K, Devisetty UK, Lekkala S, Mueller-Moule P, Bak A, Casteel CL, Maloof JN (2018) Network analysis reveals a role for salicylic acid pathway components in shade avoidance. Plant Physiol. doi: 10.1104/pp.18.00920