Download updated gene description (copied from “2018-10-01-…Rmd”)

Most updated annotation can be downloaded from TAIR website (for subscribers). I used “TAIR Data 20180630”.

library(tidyverse);library(readxl);library(readr)

## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.7
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# turn on TAIR subsciption before this procedure
At.gene.name <-read_tsv("https://www.arabidopsis.org/download_files/Subscriber_Data_Releases/TAIR_Data_20180630/gene_aliases_20180702.txt.gz") # Does work from home when I use Pulse Secure.
# How to concatenate multiple symbols in the same AGI
At.gene.name <- At.gene.name %>% group_by(name) %>% summarise(symbol2=paste(symbol,collapse=";"),full_name=paste(full_name,collapse=";"))
At.gene.name %>% dplyr::slice(100:110) # OK

## # A tibble: 11 x 3
##    name     symbol2          full_name                                    
##    <chr>    <chr>            <chr>                                        
##  1 AT1G018… AtGEN1;GEN1      NA;ortholog of HsGEN1                        
##  2 AT1G019… ATSBT1.1;SBTI1.1 NA;NA                                        
##  3 AT1G019… AtGET3a;GET3a    Guided Entry of Tail-anchored protein 3a;Gui…
##  4 AT1G019… ARK2;AtKINUb     armadillo repeat kinesin 2;Arabidopsis thali…
##  5 AT1G019… BIG3;EDA10       BIG3;embryo sac development arrest 10        
##  6 AT1G019… AtBBE1;OGOX4     NA;oligogalacturonide oxidase 4              
##  7 AT1G020… GAE2             UDP-D-glucuronate 4-epimerase 2              
##  8 AT1G020… SEC1A            secretory 1A                                 
##  9 AT1G020… LAP6;PKSA        LESS ADHESIVE POLLEN 6;polyketide synthase A 
## 10 AT1G020… SPL8             squamosa promoter binding protein-like 8     
## 11 AT1G020… ATCSN7;COP15;CS… ARABIDOPSIS THALIANA COP9 SIGNALOSOME SUBUNI…

# download cDNA info directly from TAIR
At_cdna<-Biostrings::readDNAStringSet("https://www.arabidopsis.org/download_files/Sequences/TAIR10_blastsets/TAIR10_cdna_20110103_representative_gene_model_updated")
At_cdna # 33602 genes

##   A DNAStringSet instance of length 33602
##         width seq                                      names               
##     [1]  2394 AGAAAACAGTCGACCGTCA...TTGGTAATTTTTTGAGTC AT1G50920.1 | Sym...
##     [2]   546 ATGACTCGTTTGTTGCCTT...GTTGATTCTGGTACATAG AT1G36960.1 | Sym...
##     [3]  2314 ATGGATTCAGAGTCAGAGT...GGTGCATTGTGTTTCTCC AT1G44020.1 | Sym...
##     [4]  1658 TCGTTTCGTCGTCGATCAG...GATTACATGCTACATTTT AT1G15970.1 | Sym...
##     [5]  1453 ATTGAAAAGAAAACACATC...CACCAAAATCTTCTCATA AT1G73440.1 | Sym...
##     ...   ... ...
## [33598]    87 GGATGGATGTCTGAGCGGT...CGAATCCCTCTCCATCCG ATMG00420.1 | Sym...
## [33599]   384 ATGCTCCCCGCCGGTTGTT...CGATACTTAACTATATAA ATMG01330.1 | Sym...
## [33600]   573 ATGGATAACCAATTCATTT...CAGCGTAGCGACGGATAA ATMG00070.1 | Sym...
## [33601]   366 ATGGCATCAAAAATCCGCA...CCTTCTGCATACGCATAA ATMG00130.1 | Sym...
## [33602]    74 GCGCTCTTAGTTCAGTTCG...CAAATCCTACAGAGCGTG ATMG00930.1 | Sym...

Updated hormone responsive gene list¹

hormone.responsiveness6.DF.s.list2.DF2<-readxl::read_xlsx(file.path("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files","Supplemental_Dataset3_source_of_custom_categories.xlsx"),skip=15,sheet=1) # Does not work. why? 
# because uglyURLs and canonifyURLs = true on 100818. I commented out them.
# Ask Kazu for this xlsx file.
hormone.responsiveness6.DF.s.list2.DF2

## # A tibble: 519 x 20
##    ABAdown ACCup ACCdown BLup  BLdown IAAup IAAdown MJup  MJdown CKup 
##    <chr>   <chr> <chr>   <chr> <chr>  <chr> <chr>   <chr> <chr>  <chr>
##  1 AT1G21… AT1G… AT1G70… AT1G… AT1G2… AT1G… AT1G74… AT1G… AT1G7… AT1G…
##  2 AT1G21… AT1G… AT1G29… AT1G… AT1G1… AT1G… AT1G31… AT1G… AT1G3… AT1G…
##  3 AT1G49… AT1G… AT1G28… AT1G… AT1G6… AT1G… AT1G17… AT1G… AT1G6… AT1G…
##  4 AT1G73… AT1G… AT1G64… AT2G… AT1G0… AT1G… AT1G13… AT1G… AT1G7… AT1G…
##  5 AT1G70… AT1G… AT1G72… AT2G… AT1G7… AT1G… AT1G18… AT1G… AT1G4… AT1G…
##  6 AT1G31… AT2G… AT2G26… AT2G… AT1G1… AT1G… AT1G31… AT1G… AT1G7… AT1G…
##  7 AT1G77… AT2G… AT2G24… AT2G… AT1G7… AT1G… AT1G77… AT1G… AT1G1… AT1G…
##  8 AT1G16… AT2G… AT2G03… AT2G… AT2G2… AT1G… AT1G13… AT1G… AT1G6… AT1G…
##  9 AT1G70… AT2G… AT3G13… AT2G… AT2G3… AT1G… AT1G72… AT1G… AT1G1… AT1G…
## 10 AT1G77… AT2G… AT3G56… AT3G… AT2G1… AT1G… AT1G72… AT1G… AT1G6… AT1G…
## # ... with 509 more rows, and 10 more variables: CKdown <chr>, GAup <chr>,
## #   GAdown <chr>, PIFtarget <chr>, MYC234up <chr>, MYC234down <chr>,
## #   SAup <chr>, Sadown <chr>, WRKY33up <chr>, WRKY33down <chr>

# conver to list obejct compatible with goseq() below

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

# extract AGI from At_cdna
At.gene.name<-tibble(name=names(At_cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) 
# make presence/absense hormone responsive gene table (dataframe)
  for(i in 1:20) {
  genes<-hormone.responsiveness6.DF.s.list2.DF2[!is.na(hormone.responsiveness6.DF.s.list2.DF2[,i]),i] %>% as_vector()
  temp<-data.frame(AGI=genes,category=names(hormone.responsiveness6.DF.s.list2.DF2)[i])
  #At.gene.name %>% left_join(temp,by=c("name"="AGI")) -> At.gene.name
At.gene.name %>% left_join(temp,by="AGI") -> At.gene.name 
  }
names(At.gene.name)[5:24] <- names(hormone.responsiveness6.DF.s.list2.DF2)
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",")->test2
test2[1:10,] %>%  mutate(category=gsub("NA","",category)) %>% mutate(category=gsub(",","",category))
# only select genes with AGI name and concatenate categories
At.gene.name %>% filter(str_detect(AGI,pattern="AT1G|AT2G|AT3G|AT4G|AT5G|ATC|ATM")) %>% unite(category,5:24,sep=",") %>%  mutate(category=gsub("NA,","",category)) %>% mutate(category=gsub(",NA","",category)) %>% mutate(category=gsub("NA","",category)) ->test3
# convert into list object
temp.list<-list()
for(i in 1:dim(test3)[1]) {
  temp.list[[i]]<-test3 %>% slice(i) %>% pull(category)
}
names(temp.list)<-test3 %>% pull(AGI)
# split concatanated categories in each gene
At.hormone.responsive.list<-lapply(temp.list,function(x) unlist(strsplit(x, split=",")))
# save
save(At.hormone.responsive.list,file="2018-09-27-over-representation-analysis-2-goseq-with-custom-categories_files/At.hormone.responsive.list.Rdata")

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/At.hormone.responsive.list.Rdata")
head(At.hormone.responsive.list)

## $AT1G50920
## character(0)
## 
## $AT1G36960
## character(0)
## 
## $AT1G44020
## character(0)
## 
## $AT1G15970
## character(0)
## 
## $AT1G73440
## character(0)
## 
## $AT1G75120
## character(0)

library(ShortRead);library(goseq);library(GO.db);library("annotate")

#  bias.data vector must have the same length as DEgenes vector! 
## if you want to test expressed genes as background, add background in this function
GOseq.At.customcat.ORA<-function(genelist,padjust=0.05,custom.category.list=At.hormone.responsive.list,cdna=At_cdna,background="") { # return GO enrichment table, padjus, padjust=0.05. New BLAST2GO results with only BP Brgo.v2.5.BP.list is used (042518). Brgo.v2.5.BP.list is based on Hajar's Blast2go, which should be replaced with Brgo.Atgoslim.BP.list and giving a new name to this function (080818). cdna is DNAstring object. background is a vector of genes.
  #bias<-nchar(cdna)
  bias<-Biostrings::width(cdna)
  # add name to "bias". cdna
  #names(bias)<-names(cdna)
 names(bias) <- tibble(name=names(cdna)) %>% separate(name,into=c("name2","Symbol","description"),sep=" \\|",extra="drop",fill="left") %>%  mutate(AGI=str_remove(name2,pattern="\\.[[:digit:]]+")) %>% pull(AGI)
 if(background=="") {print("Use all genes in genome as background")} else {
 # select only expressed geens 
  bias <- bias[background]
  print("Expessed genes are used as background.")
  print(paste("length of bias is ",length(bias)))
 }
  TF<-(names(bias) %in% genelist)*1
  names(TF)<-names(bias)
  #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  # calculate p ajust by BH
  GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
  if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    return(enriched.GO)
  }
}

test the function

load("2018-10-10-over-representation-analysis-2-goseq-using-custom-categories_files/dge.nolow.Rdata")
expressed.genes<-gsub("(.)(\\.[[:digit:]]+)","\\1",as_vector(dge.nolow$genes))

## Loading required package: edgeR

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background="") # Total genes in genome are background

## [1] "Use all genes in genome as background"

## Warning in pcls(G): initial point very close to some inequality constraints

## Using manually entered categories.

## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02

##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        198
## 12     MJdown            4.395205e-21                1.0000000         23
## 5        BLup            7.235275e-18                1.0000000         11
## 19 WRKY33down            1.763824e-16                1.0000000         18
## 16  PIFtarget            2.001183e-14                1.0000000         11
## 3       ACCup            2.464518e-11                1.0000000          8
## 13       MJup            6.723578e-09                1.0000000         18
## 1     ABAdown            1.782286e-07                1.0000000         12
## 4      BLdown            2.043251e-06                0.9999999          5
## 18       SAup            8.089212e-06                0.9999992          8
## 8      GAdown            1.526185e-05                0.9999993          5
## 7        CKup            5.622343e-04                0.9999614          4
## 10    IAAdown            6.516900e-03                0.9994291          3
##    numInCat over_represented_padjust
## 11      198             0.000000e+00
## 12      230             4.395205e-20
## 5        28             4.823517e-17
## 19      195             8.819122e-16
## 16       52             8.004731e-14
## 3        34             8.215059e-11
## 13      519             1.921022e-08
## 1       269             4.455716e-07
## 4        33             4.540558e-06
## 18      168             1.617842e-05
## 8        50             2.774883e-05
## 7        60             9.370572e-04
## 10       61             1.002600e-02

GOseq.At.customcat.ORA(genelist=hormone.responsiveness6.DF.s.list2.DF2$IAAup, background=expressed.genes) # Expressed genes in samples used for DEGs are background.

## Warning in if (background == "") {: the condition has length > 1 and only
## the first element will be used

## [1] "Expessed genes are used as background."
## [1] "length of bias is  16799"

## Using manually entered categories.
## Calculating the p-values...

## [1] "enriched.GO is"
##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

##      category over_represented_pvalue under_represented_pvalue numDEInCat
## 11      IAAup            0.000000e+00                1.0000000        143
## 12     MJdown            5.092312e-16                1.0000000         21
## 5        BLup            6.670465e-13                1.0000000          9
## 19 WRKY33down            1.372204e-12                1.0000000         16
## 16  PIFtarget            4.126366e-10                1.0000000          8
## 3       ACCup            3.002866e-06                0.9999999          5
## 4      BLdown            8.892919e-06                0.9999997          5
## 13       MJup            4.273610e-05                0.9999900         14
## 1     ABAdown            7.261961e-05                0.9999870         10
## 18       SAup            9.374241e-05                0.9999867          8
## 8      GAdown            5.425846e-04                0.9999639          4
## 7        CKup            7.477068e-03                0.9993214          3
##    numInCat over_represented_padjust
## 11      143             0.000000e+00
## 12      221             5.092312e-15
## 5        25             4.446976e-12
## 19      159             6.861019e-12
## 16       34             1.650546e-09
## 3        26             1.000955e-05
## 4        33             2.540834e-05
## 13      454             1.068403e-04
## 1       257             1.613769e-04
## 18      154             1.874848e-04
## 8        43             9.865175e-04
## 7        46             1.246178e-02

Session info

sessionInfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] edgeR_3.22.5                limma_3.36.5               
##  [3] annotate_1.58.0             XML_3.98-1.16              
##  [5] GO.db_3.6.0                 AnnotationDbi_1.42.1       
##  [7] goseq_1.32.0                geneLenDataBase_1.16.0     
##  [9] BiasedUrn_1.07              ShortRead_1.38.0           
## [11] GenomicAlignments_1.16.0    SummarizedExperiment_1.10.1
## [13] DelayedArray_0.6.6          matrixStats_0.54.0         
## [15] Biobase_2.40.0              Rsamtools_1.32.3           
## [17] GenomicRanges_1.32.7        GenomeInfoDb_1.16.0        
## [19] Biostrings_2.48.0           XVector_0.20.0             
## [21] IRanges_2.14.12             S4Vectors_0.18.3           
## [23] BiocParallel_1.14.2         BiocGenerics_0.26.0        
## [25] bindrcpp_0.2.2              readxl_1.1.0               
## [27] forcats_0.3.0               stringr_1.3.1              
## [29] dplyr_0.7.7                 purrr_0.2.5                
## [31] readr_1.1.1                 tidyr_0.8.2                
## [33] tibble_1.4.2                ggplot2_3.1.0              
## [35] tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137           bitops_1.0-6           bit64_0.9-7           
##  [4] lubridate_1.7.4        progress_1.2.0         RColorBrewer_1.1-2    
##  [7] httr_1.3.1             rprojroot_1.3-2        tools_3.5.1           
## [10] backports_1.1.2        utf8_1.1.4             R6_2.3.0              
## [13] mgcv_1.8-25            DBI_1.0.0              lazyeval_0.2.1        
## [16] colorspace_1.3-2       withr_2.1.2            prettyunits_1.0.2     
## [19] tidyselect_0.2.5       bit_1.1-14             curl_3.2              
## [22] compiler_3.5.1         cli_1.0.1              rvest_0.3.2           
## [25] xml2_1.2.0             rtracklayer_1.40.6     bookdown_0.7          
## [28] scales_1.0.0           digest_0.6.18          rmarkdown_1.10        
## [31] pkgconfig_2.0.2        htmltools_0.3.6        rlang_0.3.0.1         
## [34] RSQLite_2.1.1          rstudioapi_0.8         bindr_0.1.1           
## [37] hwriter_1.3.2          jsonlite_1.5           RCurl_1.95-4.11       
## [40] magrittr_1.5           GenomeInfoDbData_1.1.0 Matrix_1.2-15         
## [43] Rcpp_0.12.19           munsell_0.5.0          fansi_0.4.0           
## [46] stringi_1.2.4          yaml_2.2.0             zlibbioc_1.26.0       
## [49] plyr_1.8.4             blob_1.1.1             grid_3.5.1            
## [52] crayon_1.3.4           lattice_0.20-35        haven_1.1.2           
## [55] GenomicFeatures_1.32.3 hms_0.4.2              locfit_1.5-9.1        
## [58] knitr_1.20             pillar_1.3.0           biomaRt_2.36.1        
## [61] glue_1.3.0             evaluate_0.12          blogdown_0.9          
## [64] latticeExtra_0.6-28    modelr_0.1.2           cellranger_1.1.0      
## [67] gtable_0.2.0           assertthat_0.2.0       xfun_0.4              
## [70] xtable_1.8-3           broom_0.5.0            memoise_1.1.0

References

Nozue K, Devisetty UK, Lekkala S, Mueller-Moule P, Bak A, Casteel CL, Maloof JN (2018) Network analysis reveals a role for salicylic acid pathway components in shade avoidance. Plant Physiol. doi: 10.1104/pp.18.00920↩

Over-representation analysis 2: GOseq using custom categories

Kazu Nozue

Download updated gene description (copied from “2018-10-01-…Rmd”)

Updated hormone responsive gene list¹

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

test the function

Session info

References

Over-representation analysis 2: GOseq using custom categories

Kazu Nozue

Download updated gene description (copied from “2018-10-01-…Rmd”)

Updated hormone responsive gene list1

format hormone.responsiveness6.DF.s.list2.DF2 into goseq() compatible list (run once)

GOseq ORA function with Arabidopss thaliana hormone responsive genes (under construction)

test the function

Session info

References

Updated hormone responsive gene list¹