If you want to analyze non-model organisms, what you can do?

Please prepare

fasta file for cDNA of your species (eg. Brassica rapa),
fasta file for cDNA of a close model organisms to your species (eg. Arabidopsis thaliana for Brassica rapa),
blastn program installed in your computer (UNIX (or Mac OSX) or LINUX) (detailed instruciton from NCBI)

required libraries

library(tidyverse);library(readr);library(readxl)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

BLASTN

Use the Chifu V1.0 file BLAST against TAIR10 CDS

Because these organisms are relatively closely related I will use blastn instead of blastp to focus on best match

# look Julin's scripts and work by myself
list.files()
default.path <- getwd()
setwd(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms"))
system("makeblastdb -in TAIR10_cds_20110103_representative_gene_model_updated.fa -dbtype nucl")
#
system("gunzip Brassica_rapa.20100830.cds.gz")
# BLASTN
system("blastn -query Brassica_rapa.20100830.cds -db TAIR10_cds_20110103_representative_gene_model_updated.fa -strand both -task dc-megablast -outfmt 10 -culling_limit 1 -max_hsps 1 -evalue 10e-4 -num_threads 3 -template_type coding -template_length 16 -out Brapa1.0_vs_At_dc-megablast_out.csv")
# system("rm Brassica_rapa.20100830.cds")
setwd(default.path)

import results

brapa.blast <- read_csv(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa1.0_vs_At_dc-megablast_out.csv"), col_names = FALSE)
colnames(brapa.blast) <- c("query","subject","perc_ID","aln_length","mismatch","gap_open","qstart","qend","sstart","send","eval","score")
head(brapa.blast)
summary(brapa.blast)
brapa.blast %>% group_by(query) %>% summarise(n())
# select highest score within one query
brapa.blast %>% group_by(query) %>% filter(score==max(score)) %>% summarise(n())
Br.v1.0anno.At.BLAST <- brapa.blast %>% group_by(query) %>% filter(score==max(score))
write_csv(Br.v1.0anno.At.BLAST,path=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa_V1.0_annotated.csv"))

Reading Br-At gene name conversion table

### prerequisit: results from reciprocal BLAST against Arabidopsis thaliana
Br.v1.0anno.At.BLAST<-readr::read_csv(file=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa_V1.0_annotated.csv")) # needs to use updated one from Julin (111718)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   query = col_character(),
##   subject = col_character(),
##   perc_ID = col_double(),
##   aln_length = col_double(),
##   mismatch = col_double(),
##   gap_open = col_double(),
##   qstart = col_double(),
##   qend = col_double(),
##   sstart = col_double(),
##   send = col_double(),
##   eval = col_double(),
##   score = col_double()
## )

Br.v1.0anno.At.BLAST

## # A tibble: 37,670 x 12
##    query subject perc_ID aln_length mismatch gap_open qstart  qend sstart  send
##    <chr> <chr>     <dbl>      <dbl>    <dbl>    <dbl>  <dbl> <dbl>  <dbl> <dbl>
##  1 Bra0… AT2G37…    85.0       1509      149        7      1  1506      1  1434
##  2 Bra0… AT2G37…    86.8       1080      141        2      1  1079      1  1079
##  3 Bra0… AT2G37…    83.1        590       75        7      1   570      1   585
##  4 Bra0… AT2G37…    81.9        365       63        2      1   365      1   362
##  5 Bra0… AT2G37…    82.8       1218      161        9      1  1179      1  1209
##  6 Bra0… AT3G53…    85.1        308       46        0      1   308      1   308
##  7 Bra0… AT2G37…    78.6        729      117       10      1   711      1   708
##  8 Bra0… AT2G37…    81.7        750       77        7      1   702      1   738
##  9 Bra0… AT2G37…    88.1        337       37        1    319   652      1   337
## 10 Bra0… AT2G37…    91.5       1132       96        0      1  1132      1  1132
## # … with 37,660 more rows, and 2 more variables: eval <dbl>, score <dbl>

Making Brgo.v1.5anno.Atgoslim.BP.list

# Reading Arabidopsis thaliana GOslim list object (see ["Over-representation analysis 1: GOseq with GO terms in unsupported model organisms"](https://knozue.github.io/post/2018/09/26/over-representation-analysis-1-goseq-with-arabidopsis-go-term.html) for details)
load(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Atgoslim.TAIR.BP.list.Rdata"))
head(Atgoslim.TAIR.BP.list)
# remove splicing variant
Br.v1.0anno.At.BLAST <- Br.v1.0anno.At.BLAST %>% mutate(AGI=str_remove(subject,pattern="\\.[[:digit:]]+"))
# asign At GO into corresponding Br genes
Brgo.v1.0anno.Atgoslim.BP.list<-list()
for(i in 1:length(Br.v1.0anno.At.BLAST$query)) {
    if(is.null(Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]])) next else {
      Brgo.v1.0anno.Atgoslim.BP.list[[i]]<-Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]]
      names(Brgo.v1.0anno.Atgoslim.BP.list)[[i]]<-as_vector(Br.v1.0anno.At.BLAST[i,"query"])
    }
  }
  table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)) # FALSE 33474 TRUE 4196
  # remove gene with no GO term
  Brgo.v1.0anno.Atgoslim.BP.list<-Brgo.v1.0anno.Atgoslim.BP.list[!sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)]
  table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null))
  save(Brgo.v1.0anno.Atgoslim.BP.list,file=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
#} else {print("change directory");stop}

prep for Brassica rapa GOseq analysis (v1.0 annotation) (for Greenham.2016.clock.drought data below)

load(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
library(ShortRead);library(goseq);library(GO.db);library("annotate")

## Warning: package 'S4Vectors' was built under R version 3.6.3

## Warning: package 'GenomeInfoDb' was built under R version 3.6.3

## Warning: package 'DelayedArray' was built under R version 3.6.3

# read cDNA fasta file with v1.0 annotation
#Br_cdna<-readDNAStringSet("http://brassicadb.org/brad/datasets/pub/Genomes/Brassica_rapa/V1.0/Scaffold1.0/Brassica_rapa.20100830.cds.gz") 
Br_cdna<-readDNAStringSet(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brassica_rapa.20100830.cds.gz")) # when above line is slow use this line.
head(Br_cdna)

##   A DNAStringSet instance of length 6
##     width seq                                               names               
## [1]  1515 ATGGGGAAGATCTTGAAAACTAA...TTATATCGTCTGCTATTCATTAG Bra000001  [mRNA]...
## [2]  1089 ATGGAGGAAGTAAGGAAGATGGG...AGGATGGTGTAGAAACACTCTAA Bra000002  [mRNA]...
## [3]   570 ATGAGCTCTGTTTGTGGTAAGCT...CTATAGTTGCAGCATCTTCTTGA Bra000003  [mRNA]...
## [4]   375 ATGATTCGCCGTCTATTCTCGTC...AGGAAGAAGAAGCAGCCCTTTAA Bra000004  [mRNA]...
## [5]  1434 ATGGCGGCAGCTAGACGATTGCG...GGACCGGTGGAGGTTTTCTCTAG Bra000005  [mRNA]...
## [6]   312 ATGTCTGGGCGAGGAAAAGGAGG...CTTTATATGGATTCGGCGGTTGA Bra000006  [mRNA]...

GOseq ORA function for Brassica rapa

GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v1.0anno.Atgoslim.BP.list,cdna=Br_cdna) { # return GO enrichment table, padjus, padjust=0.05. 
  bias<-nchar(cdna)
  names(bias)<-tibble(Bra=names(cdna)) %>% separate(Bra,into="Bra2",sep=" ",extra="drop") %>% dplyr::select(Bra2) %>% as_vector()
  TF<-as.integer(names(bias) %in% genelist)
    names(TF)<-names(bias)
    #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  #GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
  
  #head(GO.pval) 
  if(ontology=="BP") {
    GO.pval2<-subset(GO.pval,ontology=="BP")
  } else if(ontology=="CC") {
    GO.pval2<-subset(GO.pval,ontology=="CC")
  } else {
    GO.pval2<-subset(GO.pval,ontology=="MF")
  }
    
  GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
  if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    
    ## write Term and Definition 
    for(i in 1:dim(enriched.GO)[1]) {
      if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
      enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
      enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
      }
    }
    return(enriched.GO)
  }
}

GOseq analysis of differentially expressed gene list

Using a Brassica rapa gene list in Greenham (2016). Compare our GOseq results with one in Figure 8B.

# Note: v1.0
download.file("https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjk2NTUvZWxpZmUtMjk2NTUtc3VwcDEtdjIueGxzeA==/elife-29655-supp1-v2.xlsx?_hash=IzWtqJP3ae%2BeIAz0DN%2Fyp44pKc2grNAxM7z%2Bd%2FzBibM%3D",destfile=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","elife-29655-supp1-v2.xlsx"))
Greenham.2016.clock.drought<-read_excel(path=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","elife-29655-supp1-v2.xlsx"),sheet=2,skip=4)
system("rm ../../../resources/2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought.dM5<-Greenham.2016.clock.drought %>% filter(Module=="M5") %>% dplyr::select(Gene) %>% as_vector() # v1.0 annotation
head(Greenham.2016.clock.drought.dM5)

##       Gene1       Gene2       Gene3       Gene4       Gene5       Gene6 
## "Bra011755" "Bra011741" "Bra011671" "Bra011655" "Bra011606" "Bra011581"

eGO.dM5<-GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA(genelist=Greenham.2016.clock.drought.dM5)

## Warning in pcls(G): initial point very close to some inequality constraints

## Using manually entered categories.

## Calculating the p-values...

## 'select()' returned 1:1 mapping between keys and columns

## [1] "enriched.GO is"
##        category over_represented_pvalue under_represented_pvalue numDEInCat
## 1517 GO:0015979            2.192366e-19                1.0000000         27
## 977  GO:0009768            2.651766e-17                1.0000000         14
## 978  GO:0009769            1.204903e-10                1.0000000          7
## 914  GO:0009645            9.683773e-10                1.0000000          7
## 913  GO:0009644            3.394233e-08                1.0000000         12
## 860  GO:0009416            8.568277e-08                1.0000000         22
## 1168 GO:0010114            2.127734e-07                1.0000000         11
## 854  GO:0009409            5.068985e-06                0.9999983         26
## 1720 GO:0019684            8.198031e-06                0.9999997          5
## 640  GO:0006970            2.153608e-05                0.9999952         14
##      numInCat                                               term ontology
## 1517      192                                     photosynthesis       BP
## 977        37  photosynthesis, light harvesting in photosystem I       BP
## 978        13 photosynthesis, light harvesting in photosystem II       BP
## 914        16           response to low light intensity stimulus       BP
## 913       106                   response to high light intensity       BP
## 860       386                         response to light stimulus       BP
## 1168      100                              response to red light       BP
## 854       667                                   response to cold       BP
## 1720       21                     photosynthesis, light reaction       BP
## 640       251                         response to osmotic stress       BP
##      over_represented_padjust
## 1517             8.162180e-16
## 977              4.936262e-14
## 978              1.495284e-07
## 914              9.013172e-07
## 913              2.527346e-05
## 860              5.316616e-05
## 1168             1.131651e-04
## 854              2.358979e-03
## 1720             3.391252e-03
## 640              8.017883e-03

Over-representation analysis 3: GOseq with GO term in non-model organisms

Kazu Nozue