Download updated gene description (copied from “2018-10-01-…Rmd”)

Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”.

library(tidyverse);library(readxl);library(readr)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.7
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# turn on TAIR subsciption before this procedure
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure.
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
At.gene.name %>% dplyr::slice(100:110) # OK
## # A tibble: 11 x 3
##    name     symbol2          full_name                                    
##    <chr>    <chr>            <chr>                                        
##  1 AT1G018… AtGEN1;GEN1      NA;ortholog of HsGEN1                        
##  2 AT1G019… ATSBT1.1;SBTI1.1 NA;NA                                        
##  3 AT1G019… AtGET3a;GET3a    Guided Entry of Tail-anchored protein 3a;Gui…
##  4 AT1G019… ARK2;AtKINUb     armadillo repeat kinesin 2;Arabidopsis thali…
##  5 AT1G019… BIG3;EDA10       BIG3;embryo sac development arrest 10        
##  6 AT1G019… AtBBE1;OGOX4     NA;oligogalacturonide oxidase 4              
##  7 AT1G020… GAE2             UDP-D-glucuronate 4-epimerase 2              
##  8 AT1G020… SEC1A            secretory 1A                                 
##  9 AT1G020… LAP6;PKSA        LESS ADHESIVE POLLEN 6;polyketide synthase A 
## 10 AT1G020… SPL8             squamosa promoter binding protein-like 8     
## 11 AT1G020… ATCSN7;COP15;CS… ARABIDOPSIS THALIANA COP9 SIGNALOSOME SUBUNI…
# download cDNA info directly from TAIR
At_cdna<-Biostrings::readDNAStringSet("https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna # 33602 genes
##   A DNAStringSet instance of length 33602
##         width seq                                      names               
##     [1]  2394 AGAAAACAGTCGACCGTCA...TTGGTAATTTTTTGAGTC AT1G50920.1 | Sym...
##     [2]   546 ATGACTCGTTTGTTGCCTT...GTTGATTCTGGTACATAG AT1G36960.1 | Sym...
##     [3]  2314 ATGGATTCAGAGTCAGAGT...GGTGCATTGTGTTTCTCC AT1G44020.1 | Sym...
##     [4]  1658 TCGTTTCGTCGTCGATCAG...GATTACATGCTACATTTT AT1G15970.1 | Sym...
##     [5]  1453 ATTGAAAAGAAAACACATC...CACCAAAATCTTCTCATA AT1G73440.1 | Sym...
##     ...   ... ...
## [33598]    87 GGATGGATGTCTGAGCGGT...CGAATCCCTCTCCATCCG ATMG00420.1 | Sym...
## [33599]   384 ATGCTCCCCGCCGGTTGTT...CGATACTTAACTATATAA ATMG01330.1 | Sym...
## [33600]   573 ATGGATAACCAATTCATTT...CAGCGTAGCGACGGATAA ATMG00070.1 | Sym...
## [33601]   366 ATGGCATCAAAAATCCGCA...CCTTCTGCATACGCATAA ATMG00130.1 | Sym...
## [33602]    74 GCGCTCTTAGTTCAGTTCG...CAAATCCTACAGAGCGTG ATMG00930.1 | Sym...

Updated hormone responsive gene list1

hormone.responsiveness6.DF.s.list2.DF2<-readxl::read_xlsx(file.path("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","Supplemental_Dataset3_source_of_custom_categories.xlsx"),skip=15,sheet=1) # Does not work. why? 
# because uglyURLs and canonifyURLs = true on 100818. I commented out them.
# Ask Kazu for this xlsx file.
hormone.responsiveness6.DF.s.list2.DF2
## # A tibble: 519 x 20
##    ABAdown ACCup ACCdown BLup  BLdown IAAup IAAdown MJup  MJdown CKup 
##    <chr>   <chr> <chr>   <chr> <chr>  <chr> <chr>   <chr> <chr>  <chr>
##  1 AT1G21… AT1G… AT1G70… AT1G… AT1G2… AT1G… AT1G74… AT1G… AT1G7… AT1G…
##  2 AT1G21… AT1G… AT1G29… AT1G… AT1G1… AT1G… AT1G31… AT1G… AT1G3… AT1G…
##  3 AT1G49… AT1G… AT1G28… AT1G… AT1G6… AT1G… AT1G17… AT1G… AT1G6… AT1G…
##  4 AT1G73… AT1G… AT1G64… AT2G… AT1G0… AT1G… AT1G13… AT1G… AT1G7… AT1G…
##  5 AT1G70… AT1G… AT1G72… AT2G… AT1G7… AT1G… AT1G18… AT1G… AT1G4… AT1G…
##  6 AT1G31… AT2G… AT2G26… AT2G… AT1G1… AT1G… AT1G31… AT1G… AT1G7… AT1G…
##  7 AT1G77… AT2G… AT2G24… AT2G… AT1G7… AT1G… AT1G77… AT1G… AT1G1… AT1G…
##  8 AT1G16… AT2G… AT2G03… AT2G… AT2G2… AT1G… AT1G13… AT1G… AT1G6… AT1G…
##  9 AT1G70… AT2G… AT3G13… AT2G… AT2G3… AT1G… AT1G72… AT1G… AT1G1… AT1G…
## 10 AT1G77… AT2G… AT3G56… AT3G… AT2G1… AT1G… AT1G72… AT1G… AT1G6… AT1G…
## # ... with 509 more rows, and 10 more variables: CKdown <chr>, GAup <chr>,
## #   GAdown <chr>, PIFtarget <chr>, MYC234up <chr>, MYC234down <chr>,
## #   SAup <chr>, Sadown <chr>, WRKY33up <chr>, WRKY33down <chr>
# conver to list obejct compatible with goseq() below

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

# extract AGI from At_cdna
At.gene.name<-tibble(name=names(At_cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) 
# make presence/absense hormone responsive gene table (dataframe)
  for(i in 1:20) {
  genes<-hormone.responsiveness6.DF.s.list2.DF2[!is.na(hormone.responsiveness6.DF.s.list2.DF2[,i]),i] %>% as_vector()
  temp<-data.frame(AGI=genes,category=names(hormone.responsiveness6.DF.s.list2.DF2)[i])
  #At.gene.name %>% left_join(temp,by=c("name"="AGI")) -> At.gene.name
At.gene.name %>% left_join(temp,by="AGI") -> At.gene.name 
  }
names(At.gene.name)[5:24] <- names(hormone.responsiveness6.DF.s.list2.DF2)
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",")->test2
test2[1:10,] %>%  mutate(category=gsub("NA","",category)) %>% mutate(category=gsub(",","",category))
# only select genes with AGI name and concatenate categories
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",") %>%  mutate(category=gsub("NA,","",category)) %>% mutate(category=gsub(",NA","",category)) %>% mutate(category=gsub("NA","",category)) ->test3
# convert into list object
temp.list<-list()
for(i in 1:dim(test3)[1]) {
  temp.list[[i]]<-test3 %>% slice(i) %>% pull(category)
}
names(temp.list)<-test3 %>% pull(AGI)
# split concatanated categories in each gene
At.hormone.responsive.list<-lapply(temp.list,function(x) unlist(strsplit(x, split=",")))
# save
save(At.hormone.responsive.list,file="2018-09-27-over-representation-analysis-2-goseq-with-custom-categories_files/At.hormone.responsive.list.Rdata")

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/At.hormone.responsive.list.Rdata")
head(At.hormone.responsive.list)
## $AT1G50920
## character(0)
## 
## $AT1G36960
## character(0)
## 
## $AT1G44020
## character(0)
## 
## $AT1G15970
## character(0)
## 
## $AT1G73440
## character(0)
## 
## $AT1G75120
## character(0)
library(ShortRead);library(goseq);library(GO.db);library("annotate")

#  bias.data vector must have the same length as DEgenes vector! 
## if you want to test expressed genes as background, add background in this function
GOseq.At.customcat.ORA<-function(genelist,padjust=0.05,custom.category.list=At.hormone.responsive.list,cdna=At_cdna,background="") { # return GO enrichment table, padjus, padjust=0.05. New BLAST2GO results with only BP Brgo.v2.5.BP.list is used (042518). Brgo.v2.5.BP.list is based on Hajar's Blast2go, which should be replaced with Brgo.Atgoslim.BP.list and giving a new name to this function (080818). cdna is DNAstring object. background is a vector of genes.
  #bias<-nchar(cdna)
  bias<-Biostrings::width(cdna)
  # add name to "bias". cdna
  #names(bias)<-names(cdna)
 names(bias) <- tibble(name=names(cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) %>% pull(AGI)
 if(background=="") {print("Use all genes in genome as background")} else {
 # select only expressed geens 
  bias <- bias[background]
  print("Expessed genes are used as background.")
  print(paste("length of bias is ",length(bias)))
 }
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  # calculate p ajust by BH
  GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
  if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    return(enriched.GO)
  }
}

test the function

load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/dge.nolow.Rdata")
expressed.genes<-gsub("(.)(\\.[[:digit:]]+)","\\1",as_vector(dge.nolow$genes))
## Loading required package: edgeR
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background="") # Total genes in genome are background
## [1] "Use all genes in genome as background"
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02
GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background=expressed.genes) # Expressed genes in samples used for DEGs are background.
## Warning in if (background == "") {: the condition has length > 1 and only
## the first element will be used
## [1] "Expessed genes are used as background."
## [1] "length of bias is  16799"
## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

Session info

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] edgeR_3.22.5                limma_3.36.5               
##  [3] annotate_1.58.0             XML_3.98-1.16              
##  [5] GO.db_3.6.0                 AnnotationDbi_1.42.1       
##  [7] goseq_1.32.0                geneLenDataBase_1.16.0     
##  [9] BiasedUrn_1.07              ShortRead_1.38.0           
## [11] GenomicAlignments_1.16.0    SummarizedExperiment_1.10.1
## [13] DelayedArray_0.6.6          matrixStats_0.54.0         
## [15] Biobase_2.40.0              Rsamtools_1.32.3           
## [17] GenomicRanges_1.32.7        GenomeInfoDb_1.16.0        
## [19] Biostrings_2.48.0           XVector_0.20.0             
## [21] IRanges_2.14.12             S4Vectors_0.18.3           
## [23] BiocParallel_1.14.2         BiocGenerics_0.26.0        
## [25] bindrcpp_0.2.2              readxl_1.1.0               
## [27] forcats_0.3.0               stringr_1.3.1              
## [29] dplyr_0.7.7                 purrr_0.2.5                
## [31] readr_1.1.1                 tidyr_0.8.2                
## [33] tibble_1.4.2                ggplot2_3.1.0              
## [35] tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137           bitops_1.0-6           bit64_0.9-7           
##  [4] lubridate_1.7.4        progress_1.2.0         RColorBrewer_1.1-2    
##  [7] httr_1.3.1             rprojroot_1.3-2        tools_3.5.1           
## [10] backports_1.1.2        utf8_1.1.4             R6_2.3.0              
## [13] mgcv_1.8-25            DBI_1.0.0              lazyeval_0.2.1        
## [16] colorspace_1.3-2       withr_2.1.2            prettyunits_1.0.2     
## [19] tidyselect_0.2.5       bit_1.1-14             curl_3.2              
## [22] compiler_3.5.1         cli_1.0.1              rvest_0.3.2           
## [25] xml2_1.2.0             rtracklayer_1.40.6     bookdown_0.7          
## [28] scales_1.0.0           digest_0.6.18          rmarkdown_1.10        
## [31] pkgconfig_2.0.2        htmltools_0.3.6        rlang_0.3.0.1         
## [34] RSQLite_2.1.1          rstudioapi_0.8         bindr_0.1.1           
## [37] hwriter_1.3.2          jsonlite_1.5           RCurl_1.95-4.11       
## [40] magrittr_1.5           GenomeInfoDbData_1.1.0 Matrix_1.2-15         
## [43] Rcpp_0.12.19           munsell_0.5.0          fansi_0.4.0           
## [46] stringi_1.2.4          yaml_2.2.0             zlibbioc_1.26.0       
## [49] plyr_1.8.4             blob_1.1.1             grid_3.5.1            
## [52] crayon_1.3.4           lattice_0.20-35        haven_1.1.2           
## [55] GenomicFeatures_1.32.3 hms_0.4.2              locfit_1.5-9.1        
## [58] knitr_1.20             pillar_1.3.0           biomaRt_2.36.1        
## [61] glue_1.3.0             evaluate_0.12          blogdown_0.9          
## [64] latticeExtra_0.6-28    modelr_0.1.2           cellranger_1.1.0      
## [67] gtable_0.2.0           assertthat_0.2.0       xfun_0.4              
## [70] xtable_1.8-3           broom_0.5.0            memoise_1.1.0

References


  1. Nozue K, Devisetty UK, Lekkala S, Mueller-Moule P, Bak A, Casteel CL, Maloof JN (2018) Network analysis reveals a role for salicylic acid pathway components in shade avoidance. Plant Physiol. doi: 10.1104/pp.18.00920