Download updated gene description (copied from “2018-10-01-…Rmd”)

Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”.

library(tidyverse);library(readxl);library(readr)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# turn on TAIR subsciption before this procedure
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure.
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
At.gene.name %>% dplyr::slice(100:110) # OK
# download cDNA info directly from TAIR
At_cdna<-Biostrings::readDNAStringSet("https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna # 33602 genes

using downloaded file

system("gunzip -c ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated.gz > ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna <- Biostrings::readDNAStringSet(file.path("..","..","..","resources","2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files","TAIR10_cdna_20110103_representative_gene_model_updated"))
At_cdna

##   A DNAStringSet instance of length 33602
##         width seq                                           names               
##     [1]  2394 AGAAAACAGTCGACCGTCATT...TATTTGGTAATTTTTTGAGTC AT1G50920.1 | Sym...
##     [2]   546 ATGACTCGTTTGTTGCCTTAC...CAAGTTGATTCTGGTACATAG AT1G36960.1 | Sym...
##     [3]  2314 ATGGATTCAGAGTCAGAGTCA...CCTGGTGCATTGTGTTTCTCC AT1G44020.1 | Sym...
##     [4]  1658 TCGTTTCGTCGTCGATCAGAA...ATCGATTACATGCTACATTTT AT1G15970.1 | Sym...
##     [5]  1453 ATTGAAAAGAAAACACATCCC...GACCACCAAAATCTTCTCATA AT1G73440.1 | Sym...
##     ...   ... ...
## [33598]    87 GGATGGATGTCTGAGCGGTTG...GTTCGAATCCCTCTCCATCCG ATMG00420.1 | Sym...
## [33599]   384 ATGCTCCCCGCCGGTTGTTGG...CTACGATACTTAACTATATAA ATMG01330.1 | Sym...
## [33600]   573 ATGGATAACCAATTCATTTTC...GAACAGCGTAGCGACGGATAA ATMG00070.1 | Sym...
## [33601]   366 ATGGCATCAAAAATCCGCAAA...GATCCTTCTGCATACGCATAA ATMG00130.1 | Sym...
## [33602]    74 GCGCTCTTAGTTCAGTTCGGT...GTTCAAATCCTACAGAGCGTG ATMG00930.1 | Sym...

system("rm ../../../resources/2018-09-26-over-representation-analysis-1-goseq-with-arabidopsis-go-term_files/TAIR10_cdna_20110103_representative_gene_model_updated")

Updated hormone responsive gene list¹

hormone.responsiveness6.DF.s.list2.DF2<-readxl::read_xlsx(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","Supplemental_Dataset3_source_of_custom_categories.xlsx"),skip=15,sheet=1) # Does not work. why? 
# because uglyURLs and canonifyURLs = true on 100818. I commented out them.
# Ask Kazu for this xlsx file.
hormone.responsiveness6.DF.s.list2.DF2

## # A tibble: 519 x 20
##    ABAdown ACCup ACCdown BLup  BLdown IAAup IAAdown MJup  MJdown CKup  CKdown
##    <chr>   <chr> <chr>   <chr> <chr>  <chr> <chr>   <chr> <chr>  <chr> <chr> 
##  1 AT1G21… AT1G… AT1G70… AT1G… AT1G2… AT1G… AT1G74… AT1G… AT1G7… AT1G… AT1G4…
##  2 AT1G21… AT1G… AT1G29… AT1G… AT1G1… AT1G… AT1G31… AT1G… AT1G3… AT1G… AT1G7…
##  3 AT1G49… AT1G… AT1G28… AT1G… AT1G6… AT1G… AT1G17… AT1G… AT1G6… AT1G… AT1G7…
##  4 AT1G73… AT1G… AT1G64… AT2G… AT1G0… AT1G… AT1G13… AT1G… AT1G7… AT1G… AT2G4…
##  5 AT1G70… AT1G… AT1G72… AT2G… AT1G7… AT1G… AT1G18… AT1G… AT1G4… AT1G… AT2G3…
##  6 AT1G31… AT2G… AT2G26… AT2G… AT1G1… AT1G… AT1G31… AT1G… AT1G7… AT1G… AT2G3…
##  7 AT1G77… AT2G… AT2G24… AT2G… AT1G7… AT1G… AT1G77… AT1G… AT1G1… AT1G… AT2G2…
##  8 AT1G16… AT2G… AT2G03… AT2G… AT2G2… AT1G… AT1G13… AT1G… AT1G6… AT1G… AT4G1…
##  9 AT1G70… AT2G… AT3G13… AT2G… AT2G3… AT1G… AT1G72… AT1G… AT1G1… AT1G… AT4G3…
## 10 AT1G77… AT2G… AT3G56… AT3G… AT2G1… AT1G… AT1G72… AT1G… AT1G6… AT1G… AT4G3…
## # … with 509 more rows, and 9 more variables: GAup <chr>, GAdown <chr>,
## #   PIFtarget <chr>, MYC234up <chr>, MYC234down <chr>, SAup <chr>,
## #   Sadown <chr>, WRKY33up <chr>, WRKY33down <chr>

# conver to list obejct compatible with goseq() below

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

# extract AGI from At_cdna
At.gene.name<-tibble(name=names(At_cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) 
# make presence/absense hormone responsive gene table (dataframe)
  for(i in 1:20) {
  genes<-hormone.responsiveness6.DF.s.list2.DF2[!is.na(hormone.responsiveness6.DF.s.list2.DF2[,i]),i] %>% as_vector()
  temp<-data.frame(AGI=genes,category=names(hormone.responsiveness6.DF.s.list2.DF2)[i])
  #At.gene.name %>% left_join(temp,by=c("name"="AGI")) -> At.gene.name
At.gene.name %>% left_join(temp,by="AGI") -> At.gene.name 
  }
names(At.gene.name)[5:24] <- names(hormone.responsiveness6.DF.s.list2.DF2)
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",")->test2
test2[1:10,] %>%  mutate(category=gsub("NA","",category)) %>% mutate(category=gsub(",","",category))
# only select genes with AGI name and concatenate categories
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",") %>%  mutate(category=gsub("NA,","",category)) %>% mutate(category=gsub(",NA","",category)) %>% mutate(category=gsub("NA","",category)) ->test3
# convert into list object
temp.list<-list()
for(i in 1:dim(test3)[1]) {
  temp.list[[i]]<-test3 %>% slice(i) %>% pull(category)
}
names(temp.list)<-test3 %>% pull(AGI)
# split concatanated categories in each gene
At.hormone.responsive.list<-lapply(temp.list,function(x) unlist(strsplit(x, split=",")))
# save
save(At.hormone.responsive.list,file=file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","At.hormone.responsive.list.Rdata"))

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

load(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","At.hormone.responsive.list.Rdata"))
head(At.hormone.responsive.list)

## $AT1G50920
## character(0)
## 
## $AT1G36960
## character(0)
## 
## $AT1G44020
## character(0)
## 
## $AT1G15970
## character(0)
## 
## $AT1G73440
## character(0)
## 
## $AT1G75120
## character(0)

library(ShortRead);library(goseq);library(GO.db);library("annotate")

## Warning: package 'S4Vectors' was built under R version 3.6.3

## Warning: package 'GenomeInfoDb' was built under R version 3.6.3

## Warning: package 'DelayedArray' was built under R version 3.6.3

#  bias.data vector must have the same length as DEgenes vector! 
## if you want to test expressed genes as background, add background in this function
GOseq.At.customcat.ORA<-function(genelist,padjust=0.05,custom.category.list=At.hormone.responsive.list,cdna=At_cdna,background="") { # return GO enrichment table, padjus, padjust=0.05. New BLAST2GO results with only BP Brgo.v2.5.BP.list is used (042518). Brgo.v2.5.BP.list is based on Hajar's Blast2go, which should be replaced with Brgo.Atgoslim.BP.list and giving a new name to this function (080818). cdna is DNAstring object. background is a vector of genes.
  #bias<-nchar(cdna)
  bias<-Biostrings::width(cdna)
  # add name to "bias". cdna
  #names(bias)<-names(cdna)
 names(bias) <- tibble(name=names(cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) %>% pull(AGI)
 if(background=="") {print("Use all genes in genome as background")} else {
 # select only expressed geens 
  bias <- bias[background]
  print("Expessed genes are used as background.")
  print(paste("length of bias is ",length(bias)))
 }
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  # calculate p ajust by BH
  GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
  if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    return(enriched.GO)
  }
}

test the function

load(file.path("..","..","..","resources","2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","dge.nolow.Rdata"))
expressed.genes<-gsub("(.)(\\.[[:digit:]]+)","\\1",as_vector(dge.nolow$genes))

## Loading required package: edgeR

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background="") # Total genes in genome are background

## [1] "Use all genes in genome as background"

## Warning in pcls(G): initial point very close to some inequality constraints

## Using manually entered categories.

## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02

##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02

GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background=expressed.genes) # Expressed genes in samples used for DEGs are background.

## Warning in if (background == "") {: the condition has length > 1 and only the
## first element will be used

## [1] "Expessed genes are used as background."
## [1] "length of bias is  16799"

## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

Session info

sessionInfo()

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] edgeR_3.28.1                limma_3.42.2               
##  [3] annotate_1.64.0             XML_3.99-0.3               
##  [5] GO.db_3.10.0                AnnotationDbi_1.48.0       
##  [7] goseq_1.38.0                geneLenDataBase_1.22.0     
##  [9] BiasedUrn_1.07              ShortRead_1.44.3           
## [11] GenomicAlignments_1.22.1    SummarizedExperiment_1.16.1
## [13] DelayedArray_0.12.3         matrixStats_0.57.0         
## [15] Biobase_2.46.0              Rsamtools_2.2.3            
## [17] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [19] Biostrings_2.54.0           XVector_0.26.0             
## [21] IRanges_2.20.2              S4Vectors_0.24.4           
## [23] BiocParallel_1.20.1         BiocGenerics_0.32.0        
## [25] readxl_1.3.1                forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_1.0.2                
## [29] purrr_0.3.4                 readr_1.4.0                
## [31] tidyr_1.1.2                 tibble_3.0.4               
## [33] ggplot2_3.3.3               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0       hwriter_1.3.2          ellipsis_0.3.1        
##  [4] fs_1.5.0               rstudioapi_0.13        bit64_4.0.5           
##  [7] fansi_0.4.1            lubridate_1.7.9.2      xml2_1.3.2            
## [10] splines_3.6.2          knitr_1.31             jsonlite_1.7.2        
## [13] broom_0.7.3            dbplyr_2.0.0           png_0.1-7             
## [16] compiler_3.6.2         httr_1.4.2             backports_1.2.1       
## [19] assertthat_0.2.1       Matrix_1.3-2           cli_2.2.0             
## [22] htmltools_0.5.1.1      prettyunits_1.1.1      tools_3.6.2           
## [25] gtable_0.3.0           glue_1.4.2             GenomeInfoDbData_1.2.2
## [28] rappdirs_0.3.1         Rcpp_1.0.6             cellranger_1.1.0      
## [31] vctrs_0.3.6            nlme_3.1-151           blogdown_1.2.2        
## [34] rtracklayer_1.46.0     xfun_0.22              rvest_0.3.6           
## [37] lifecycle_0.2.0        zlibbioc_1.32.0        scales_1.1.1          
## [40] hms_0.5.3              RColorBrewer_1.1-2     yaml_2.2.1            
## [43] curl_4.3               memoise_1.1.0          biomaRt_2.42.1        
## [46] latticeExtra_0.6-29    stringi_1.5.3          RSQLite_2.2.1         
## [49] highr_0.8              GenomicFeatures_1.38.2 rlang_0.4.10          
## [52] pkgconfig_2.0.3        bitops_1.0-6           evaluate_0.14         
## [55] lattice_0.20-41        bit_4.0.4              tidyselect_1.1.0      
## [58] magrittr_2.0.1         bookdown_0.21          R6_2.5.0              
## [61] generics_0.1.0         DBI_1.1.0              pillar_1.4.7          
## [64] haven_2.3.1            withr_2.3.0            mgcv_1.8-33           
## [67] RCurl_1.98-1.2         modelr_0.1.8           crayon_1.3.4          
## [70] utf8_1.1.4             BiocFileCache_1.10.2   rmarkdown_2.7         
## [73] jpeg_0.1-8.1           progress_1.2.2         locfit_1.5-9.4        
## [76] grid_3.6.2             blob_1.2.1             reprex_0.3.0          
## [79] digest_0.6.27          xtable_1.8-4           openssl_1.4.3         
## [82] munsell_0.5.0          askpass_1.1

References

Nozue K, Devisetty UK, Lekkala S, Mueller-Moule P, Bak A, Casteel CL, Maloof JN (2018) Network analysis reveals a role for salicylic acid pathway components in shade avoidance. Plant Physiol. doi: 10.1104/pp.18.00920↩

Over-representation analysis 2: GOseq using custom categories

Kazu Nozue

Download updated gene description (copied from “2018-10-01-…Rmd”)

using downloaded file

Updated hormone responsive gene list¹

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

test the function

Session info

References

Over-representation analysis 2: GOseq using custom categories

Kazu Nozue

Download updated gene description (copied from “2018-10-01-…Rmd”)

using downloaded file

Updated hormone responsive gene list1

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

test the function

Session info

References

Updated hormone responsive gene list¹