What genes were found in given categories (eg. GO term) in given gene list?

library(tidyverse);library(readxl);library(readr)
# prepare GO term - gene table (eg. GOSLIM result), gene list of interest, GO term of interest
# output data format is a dataframe with columns (each GO term) and rows (gene names listed in gene list of interest provided by a user) and value (0 (absent) or 1 (present))

read GOseq result table

eGOseqs<-list.files(pattern="enrich.Rdata",path=file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files","output_copy_ORA4"))
eGOseqs.list2<-sapply(file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files","output_copy_ORA4",eGOseqs),function(x) mget(load(x))) # mget will return the value of the object(or objects) in a list. see https://stackoverflow.com/questions/29398630/load-data-frames-into-list
#names(eGOseqs.list2)
# Make list into data.frame 
eGOseqs.list2.summary<-do.call("rbind",eGOseqs.list2) 
#head(eGOseqs.list2.summary) # make sure those are file names
rownames(eGOseqs.list2.summary)<-1:nrow(eGOseqs.list2.summary)
#View(eGOseqs.list2.summary)
# eGOseqs.list2.summary (more than three genes in each KO)
eGOseqs.list2.summary<-eGOseqs.list2.summary[eGOseqs.list2.summary$numDEInCat>3,]
# focused on only very significant GO terms
eGOseqs.list2.summary<-eGOseqs.list2.summary[eGOseqs.list2.summary$over_represented_padjust<1e-4,]
head(eGOseqs.list2.summary)
##     category over_represented_pvalue under_represented_pvalue numDEInCat
## 1 GO:0010200            4.858293e-26                        1         91
## 2 GO:0006952            5.724864e-26                        1        277
## 3 GO:0042742            1.352377e-23                        1        153
## 4 GO:0080167            1.496577e-18                        1         80
## 5 GO:0009751            6.725063e-18                        1         91
## 6 GO:0006468            3.953862e-15                        1        312
##   numInCat                          term ontology over_represented_padjust
## 1      136            response to chitin       BP             1.083144e-22
## 2      620              defense response       BP             1.083144e-22
## 3      288 defense response to bacterium       BP             1.705798e-20
## 4      134          response to karrikin       BP             1.415762e-15
## 5      162    response to salicylic acid       BP             5.089528e-15
## 6      774       protein phosphorylation       BP             2.493569e-12
##                            Term
## 1            response to chitin
## 2              defense response
## 3 defense response to bacterium
## 4          response to karrikin
## 5    response to salicylic acid
## 6       protein phosphorylation
##                                                                                                                                                                                                                                                                                                                                     Definition
## 1                                                                                                                                               A process that results in a change in state or activity of a cell or an organism (in terms of movement, secretion, enzyme production, gene expression, etc.) as a result of a chitin stimulus.
## 2                                                                                                            Reactions, triggered in response to the presence of a foreign body or the occurrence of an injury, which result in restriction of damage to the organism attacked or prevention/recovery from the infection caused by the attack.
## 3                                                                                                                                                                                                                                     Reactions triggered in response to the presence of a bacterium that act to protect the cell or organism.
## 4 Any process that results in a change in state or activity of a cell or an organism (in terms of movement, secretion, enzyme production, gene expression, etc.) as a result of a karrikin stimulus. Karrikins are signaling molecules in smoke from burning vegetation that trigger seed germination for many angiosperms (flowering plants).
## 5                                                                                                                                     Any process that results in a change in state or activity of a cell or an organism (in terms of movement, secretion, enzyme production, gene expression, etc.) as a result of a salicylic acid stimulus.
## 6                                                                                                                                                                                                                                                                                The process of introducing a phosphate group on to a protein.
##    FC                                                         sample
## 1 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## 2 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## 3 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## 4 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## 5 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## 6 all genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv

GOterm data (Arabidopsis thaliana)

load(file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files/Atgoslim.TAIR.BP.list.Rdata"))
head(Atgoslim.TAIR.BP.list)
## $`18S RRNA`
## [1] "GO:0006412" "GO:0006412" "GO:0006412"
## 
## $`25S RRNA`
## [1] "GO:0043043" "GO:0043043"
## 
## $`5.8S RRNA`
## [1] "GO:0043043" "GO:0043043"
## 
## $AAN
## [1] "GO:0007623"
## 
## $AAR1
## [1] "GO:0006970" "GO:0006970" "GO:0040029"
## 
## $AAR2
## [1] "GO:0009845" "GO:0040029"

DGE list of interest

# Ding (2018) Cell DEGs
getwd()
## [1] "/Volumes/data_personal/Kazu_blog14/content/post/2018-10-07-over-representation-analysis-5-useful-tools"
DEG.objs<-list.files(path=file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files","output_copy_ORA4"),pattern="\\.csv$") # under construction
DEG.objs
## [1] "genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv"
## [2] "genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU.csv"
## [3] "trt.DEGs.int.rWT.rU.csv"
# read csv file (using lapply())
DEG.list<-lapply(DEG.objs, function(x) read_csv(file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files","output_copy_ORA4",x)))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr1.1 = col_double(),
##   logFC.genotypenpr1.1.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr4.4D = col_double(),
##   logFC.genotypenpr4.4D.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
names(DEG.list)<-gsub(".csv","",DEG.objs)
head(DEG.list)
## $genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU
## # A tibble: 6,694 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT2G…        -4.20               -0.720   6.69 1300. 5.30e-283 1.78e-278
##  2 AT5G…         3.26                0.530   4.76 1000. 5.96e-218 1.00e-213
##  3 AT1G…        -2.02               -0.475   7.19  629. 2.69e-137 3.01e-133
##  4 AT1G…        -2.26               -0.511   7.51  620. 2.83e-135 2.38e-131
##  5 AT3G…        -3.16                0.888   5.33  618. 5.42e-135 3.64e-131
##  6 AT5G…        -0.212              -6.72    3.89  607. 1.67e-132 9.35e-129
##  7 AT2G…         2.22                1.95    9.21  572. 7.80e-125 3.74e-121
##  8 AT5G…         1.82                1.83    9.40  570. 1.54e-124 6.46e-121
##  9 AT1G…        -3.65               -1.52    3.37  565. 1.92e-123 7.17e-120
## 10 AT5G…         0.000149           -7.96    3.77  546. 2.42e-119 8.12e-116
## # … with 6,684 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## $genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU
## # A tibble: 846 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT5G…             3.04            0.628   4.76  904. 5.00e-197 1.68e-192
##  2 AT3G…            -5.45           -0.183   1.98  459. 1.80e-100 3.03e- 96
##  3 AT5G…            -3.30            0.354   3.10  444. 4.51e- 97 5.05e- 93
##  4 AT1G…            -7.40           -3.70    2.00  432. 1.32e- 94 1.11e- 90
##  5 AT3G…            -2.97           -0.301   3.36  372. 1.83e- 81 1.23e- 77
##  6 AT1G…            -2.50            0.891   6.30  288. 2.54e- 63 1.42e- 59
##  7 AT2G…            -1.42           -0.703   6.69  277. 7.63e- 61 3.66e- 57
##  8 AT5G…            -3.04            0.434   4.19  211. 1.75e- 46 7.36e- 43
##  9 AT2G…            -1.10           -0.237   5.87  209. 5.09e- 46 1.90e- 42
## 10 AT5G…            -1.84           -0.489   2.78  187. 2.68e- 41 9.01e- 38
## # … with 836 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## $trt.DEGs.int.rWT.rU
## # A tibble: 7,811 x 9
##    genes  logFC logCPM    LR    PValue       FDR AGI    symbol2  full_name      
##    <chr>  <dbl>  <dbl> <dbl>     <dbl>     <dbl> <chr>  <chr>    <chr>          
##  1 AT5G1…  8.57   3.89  675. 7.55e-149 2.54e-144 AT5G1… AT-HSP1… heat shock pro…
##  2 AT1G0…  4.00   4.60  592. 7.94e-131 1.33e-126 AT1G0… <NA>     <NA>           
##  3 AT5G2…  6.16   4.19  588. 5.77e-130 6.46e-126 AT5G2… ATWRKY3… ARABIDOPSIS TH…
##  4 AT4G2…  3.85   5.39  586. 1.92e-129 1.59e-125 AT4G2… CRK14    cysteine-rich …
##  5 AT5G1…  9.64   3.77  586. 2.37e-129 1.59e-125 AT5G1… HSP17.6… 17.6 kDa class…
##  6 AT3G2…  4.92   3.36  546. 9.68e-121 5.42e-117 AT3G2… <NA>     <NA>           
##  7 AT4G2…  3.34   6.70  539. 2.78e-119 1.33e-115 AT4G2… SGT1A    <NA>           
##  8 AT5G2…  3.82   6.70  535. 2.42e-118 1.02e-114 AT5G2… AtDMR6;… NA;DOWNY MILDE…
##  9 AT5G0…  4.14   7.14  523. 1.07e-115 3.98e-112 AT5G0… AtHsp70… NA;NA          
## 10 AT1G2…  2.87   7.19  511. 3.43e-113 1.15e-109 AT1G2… AtWAK1;… NA;NA;cell wal…
## # … with 7,801 more rows
# read csv file (using map())
DEG.objs2<-list.files(path=file.path("..","..","..","resources","2018-10-07-over-representation-analysis-5-useful-tools_files","output_copy_ORA4"),pattern="\\.csv$", full.names=TRUE,recursive=TRUE) # under construction
DEG.map<-DEG.objs2 %>% map(~ read_csv(.))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr1.1 = col_double(),
##   logFC.genotypenpr1.1.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC.genotypenpr4.4D = col_double(),
##   logFC.genotypenpr4.4D.treatmenttreated = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   genes = col_character(),
##   logFC = col_double(),
##   logCPM = col_double(),
##   LR = col_double(),
##   PValue = col_double(),
##   FDR = col_double(),
##   AGI = col_character(),
##   symbol2 = col_character(),
##   full_name = col_character()
## )
names(DEG.map) <-  str_split(DEG.objs2,pattern="/",simplify=TRUE)[,7] %>% str_remove(".csv")
head(DEG.map)
## $genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU
## # A tibble: 6,694 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT2G…        -4.20               -0.720   6.69 1300. 5.30e-283 1.78e-278
##  2 AT5G…         3.26                0.530   4.76 1000. 5.96e-218 1.00e-213
##  3 AT1G…        -2.02               -0.475   7.19  629. 2.69e-137 3.01e-133
##  4 AT1G…        -2.26               -0.511   7.51  620. 2.83e-135 2.38e-131
##  5 AT3G…        -3.16                0.888   5.33  618. 5.42e-135 3.64e-131
##  6 AT5G…        -0.212              -6.72    3.89  607. 1.67e-132 9.35e-129
##  7 AT2G…         2.22                1.95    9.21  572. 7.80e-125 3.74e-121
##  8 AT5G…         1.82                1.83    9.40  570. 1.54e-124 6.46e-121
##  9 AT1G…        -3.65               -1.52    3.37  565. 1.92e-123 7.17e-120
## 10 AT5G…         0.000149           -7.96    3.77  546. 2.42e-119 8.12e-116
## # … with 6,684 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## $genotypenpr4_genotypenpr4.treatmenttreated.DEGs.int.rWT.rU
## # A tibble: 846 x 10
##    genes logFC.genotypen… logFC.genotypen… logCPM    LR    PValue       FDR
##    <chr>            <dbl>            <dbl>  <dbl> <dbl>     <dbl>     <dbl>
##  1 AT5G…             3.04            0.628   4.76  904. 5.00e-197 1.68e-192
##  2 AT3G…            -5.45           -0.183   1.98  459. 1.80e-100 3.03e- 96
##  3 AT5G…            -3.30            0.354   3.10  444. 4.51e- 97 5.05e- 93
##  4 AT1G…            -7.40           -3.70    2.00  432. 1.32e- 94 1.11e- 90
##  5 AT3G…            -2.97           -0.301   3.36  372. 1.83e- 81 1.23e- 77
##  6 AT1G…            -2.50            0.891   6.30  288. 2.54e- 63 1.42e- 59
##  7 AT2G…            -1.42           -0.703   6.69  277. 7.63e- 61 3.66e- 57
##  8 AT5G…            -3.04            0.434   4.19  211. 1.75e- 46 7.36e- 43
##  9 AT2G…            -1.10           -0.237   5.87  209. 5.09e- 46 1.90e- 42
## 10 AT5G…            -1.84           -0.489   2.78  187. 2.68e- 41 9.01e- 38
## # … with 836 more rows, and 3 more variables: AGI <chr>, symbol2 <chr>,
## #   full_name <chr>
## 
## $trt.DEGs.int.rWT.rU
## # A tibble: 7,811 x 9
##    genes  logFC logCPM    LR    PValue       FDR AGI    symbol2  full_name      
##    <chr>  <dbl>  <dbl> <dbl>     <dbl>     <dbl> <chr>  <chr>    <chr>          
##  1 AT5G1…  8.57   3.89  675. 7.55e-149 2.54e-144 AT5G1… AT-HSP1… heat shock pro…
##  2 AT1G0…  4.00   4.60  592. 7.94e-131 1.33e-126 AT1G0… <NA>     <NA>           
##  3 AT5G2…  6.16   4.19  588. 5.77e-130 6.46e-126 AT5G2… ATWRKY3… ARABIDOPSIS TH…
##  4 AT4G2…  3.85   5.39  586. 1.92e-129 1.59e-125 AT4G2… CRK14    cysteine-rich …
##  5 AT5G1…  9.64   3.77  586. 2.37e-129 1.59e-125 AT5G1… HSP17.6… 17.6 kDa class…
##  6 AT3G2…  4.92   3.36  546. 9.68e-121 5.42e-117 AT3G2… <NA>     <NA>           
##  7 AT4G2…  3.34   6.70  539. 2.78e-119 1.33e-115 AT4G2… SGT1A    <NA>           
##  8 AT5G2…  3.82   6.70  535. 2.42e-118 1.02e-114 AT5G2… AtDMR6;… NA;DOWNY MILDE…
##  9 AT5G0…  4.14   7.14  523. 1.07e-115 3.98e-112 AT5G0… AtHsp70… NA;NA          
## 10 AT1G2…  2.87   7.19  511. 3.43e-113 1.15e-109 AT1G2… AtWAK1;… NA;NA;cell wal…
## # … with 7,801 more rows

Making a function

genes.in.enriched.category<-function(enrich.result,gene.list,category.table=Atgoslim.TAIR.BP.list) { # return data frame with all input data with ... which format is good? VennDiagram input format?  
  enrich.category<-enrich.result[enrich.result$over_represented_pvalue<0.05,"category"]
  temp<-category.table[gene.list] # short category.table for enriched.result # gene.list shas to be character (not factor)  
  #names(temp[!is.na(names(temp))])
  temp<-temp[!is.na(names(temp))] # remove non matched AGI
  test<-data.frame(AGI=names(temp))
  for(i in enrich.category) {
    x<-rep(0,length(names(temp)))
    x[grep(i,temp)]<-1
    test<-cbind(test,x)
    names(test)[dim(test)[2]]<-i
  }
  return(test) 
}

A case study: all misregulated genes in npr1-1 (Ding (2018) Cell)

names(eGOseqs.list2)[1] # all misregulated genes in npr1-1  genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.csv
## [1] "../../../resources/2018-10-07-over-representation-analysis-5-useful-tools_files/output_copy_ORA4/genotypenpr1_genotypenpr1.treatmenttreated.DEGs.int.rWT.rU.all.enrich.Rdata.GO.ORA.result"
results.table<-genes.in.enriched.category(enrich.result=eGOseqs.list2[[1]],gene.list=gsub("(.)(\\.[[:digit:]]+)","\\1",DEG.list[[1]]$genes))
results.table[1:10,1:10] # the first 10 rows and columns
##          AGI GO:0010200 GO:0006952 GO:0042742 GO:0080167 GO:0009751 GO:0006468
## 1  AT2G14560          0          0          0          0          1          0
## 2  AT1G21250          0          0          0          0          1          1
## 3  AT1G35710          0          0          0          0          0          1
## 4  AT3G02550          0          0          0          0          0          0
## 5  AT5G12030          0          0          0          0          0          0
## 6  AT2G34430          0          0          0          0          0          0
## 7  AT5G54270          0          0          0          0          0          0
## 8  AT1G13470          0          0          0          0          0          0
## 9  AT5G12020          0          0          0          0          0          0
## 10 AT5G22570          0          0          1          0          0          0
##    GO:0009651 GO:0009416 GO:0009408
## 1           0          0          0
## 2           0          0          0
## 3           0          0          0
## 4           0          0          0
## 5           1          0          1
## 6           0          1          0
## 7           0          1          0
## 8           0          0          0
## 9           1          0          1
## 10          0          0          0

Session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] readxl_1.3.1    forcats_0.5.0   stringr_1.4.0   dplyr_1.0.2    
##  [5] purrr_0.3.4     readr_1.4.0     tidyr_1.1.2     tibble_3.0.4   
##  [9] ggplot2_3.3.3   tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        cellranger_1.1.0  pillar_1.4.7      compiler_3.6.2   
##  [5] dbplyr_2.0.0      tools_3.6.2       digest_0.6.27     lubridate_1.7.9.2
##  [9] jsonlite_1.7.2    evaluate_0.14     lifecycle_0.2.0   gtable_0.3.0     
## [13] pkgconfig_2.0.3   rlang_0.4.10      reprex_0.3.0      cli_2.2.0        
## [17] rstudioapi_0.13   DBI_1.1.0         yaml_2.2.1        blogdown_1.2.2   
## [21] haven_2.3.1       xfun_0.22         withr_2.3.0       xml2_1.3.2       
## [25] httr_1.4.2        knitr_1.31        fs_1.5.0          hms_0.5.3        
## [29] generics_0.1.0    vctrs_0.3.6       grid_3.6.2        tidyselect_1.1.0 
## [33] glue_1.4.2        R6_2.5.0          fansi_0.4.1       rmarkdown_2.7    
## [37] bookdown_0.21     modelr_0.1.8      magrittr_2.0.1    backports_1.2.1  
## [41] scales_1.1.1      ellipsis_0.3.1    htmltools_0.5.1.1 rvest_0.3.6      
## [45] assertthat_0.2.1  colorspace_2.0-0  utf8_1.1.4        stringi_1.5.3    
## [49] munsell_0.5.0     broom_0.7.3       crayon_1.3.4