If you want to analyze non-model organisms, what you can do?

Please prepare

fasta file for cDNA of your species (eg. Brassica rapa),
fasta file for cDNA of a close model organisms to your species (eg. Arabidopsis thaliana for Brassica rapa),
blastn program installed in your computer (UNIX (or Mac OSX) or LINUX) (detailed instruciton from NCBI)

required libraries

library(tidyverse);library(readr);library(readxl)

## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.7
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

BLASTN

Use the Chifu V1.0 file BLAST against TAIR10 CDS

Because these organisms are relatively closely related I will use blastn instead of blastp to focus on best match

# look Julin's scripts and work by myself
list.files()
setwd("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files")
system("makeblastdb -in TAIR10_cds_20110103_representative_gene_model_updated.fa -dbtype nucl")
#
system("gunzip Brassica_rapa.20100830.cds.gz")
# BLASTN
system("blastn -query Brassica_rapa.20100830.cds -db TAIR10_cds_20110103_representative_gene_model_updated.fa -strand both -task dc-megablast -outfmt 10 -culling_limit 1 -max_hsps 1 -evalue 10e-4 -num_threads 3 -template_type coding -template_length 16 -out Brapa1.0_vs_At_dc-megablast_out.csv")
# system("rm Brassica_rapa.20100830.cds")
setwd("..")

import results

brapa.blast <- read_csv("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa1.0_vs_At_dc-megablast_out.csv", col_names = FALSE)
colnames(brapa.blast) <- c("query","subject","perc_ID","aln_length","mismatch","gap_open","qstart","qend","sstart","send","eval","score")
head(brapa.blast)
summary(brapa.blast)
brapa.blast %>% group_by(query) %>% summarise(n())
# select highest score within one query
brapa.blast %>% group_by(query) %>% filter(score==max(score)) %>% summarise(n())
Br.v1.0anno.At.BLAST <- brapa.blast %>% group_by(query) %>% filter(score==max(score))
write_csv(Br.v1.0anno.At.BLAST,path="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa_V1.0_annotated.csv")

Reading Br-At gene name conversion table

### prerequisit: results from reciprocal BLAST against Arabidopsis thaliana
Br.v1.0anno.At.BLAST<-readr::read_csv(file="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa_V1.0_annotated.csv") # needs to use updated one from Julin (111718)

## Parsed with column specification:
## cols(
##   query = col_character(),
##   subject = col_character(),
##   perc_ID = col_double(),
##   aln_length = col_integer(),
##   mismatch = col_integer(),
##   gap_open = col_integer(),
##   qstart = col_integer(),
##   qend = col_integer(),
##   sstart = col_integer(),
##   send = col_integer(),
##   eval = col_double(),
##   score = col_double()
## )

Br.v1.0anno.At.BLAST

## # A tibble: 37,670 x 12
##    query subject perc_ID aln_length mismatch gap_open qstart  qend sstart
##    <chr> <chr>     <dbl>      <int>    <int>    <int>  <int> <int>  <int>
##  1 Bra0… AT2G37…    85.0       1509      149        7      1  1506      1
##  2 Bra0… AT2G37…    86.8       1080      141        2      1  1079      1
##  3 Bra0… AT2G37…    83.1        590       75        7      1   570      1
##  4 Bra0… AT2G37…    81.9        365       63        2      1   365      1
##  5 Bra0… AT2G37…    82.8       1218      161        9      1  1179      1
##  6 Bra0… AT3G53…    85.1        308       46        0      1   308      1
##  7 Bra0… AT2G37…    78.6        729      117       10      1   711      1
##  8 Bra0… AT2G37…    81.7        750       77        7      1   702      1
##  9 Bra0… AT2G37…    88.1        337       37        1    319   652      1
## 10 Bra0… AT2G37…    91.5       1132       96        0      1  1132      1
## # ... with 37,660 more rows, and 3 more variables: send <int>, eval <dbl>,
## #   score <dbl>

Making Brgo.v1.5anno.Atgoslim.BP.list

# Reading Arabidopsis thaliana GOslim list object (see ["Over-representation analysis 1: GOseq with GO terms in unsupported model organisms"](https://knozue.github.io/post/2018/09/26/over-representation-analysis-1-goseq-with-arabidopsis-go-term.html) for details)
load("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Atgoslim.TAIR.BP.list.Rdata")
head(Atgoslim.TAIR.BP.list)
# remove splicing variant
Br.v1.0anno.At.BLAST <- Br.v1.0anno.At.BLAST %>% mutate(AGI=str_remove(subject,pattern="\\.[[:digit:]]+"))
# asign At GO into corresponding Br genes
Brgo.v1.0anno.Atgoslim.BP.list<-list()
for(i in 1:length(Br.v1.0anno.At.BLAST$query)) {
    if(is.null(Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]])) next else {
      Brgo.v1.0anno.Atgoslim.BP.list[[i]]<-Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]]
      names(Brgo.v1.0anno.Atgoslim.BP.list)[[i]]<-as_vector(Br.v1.0anno.At.BLAST[i,"query"])
    }
  }
  table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)) # FALSE 33474 TRUE 4196
  # remove gene with no GO term
  Brgo.v1.0anno.Atgoslim.BP.list<-Brgo.v1.0anno.Atgoslim.BP.list[!sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)]
  table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null))
  save(Brgo.v1.0anno.Atgoslim.BP.list,file=file.path("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
#} else {print("change directory");stop}

prep for Brassica rapa GOseq analysis (v1.0 annotation) (for Greenham.2016.clock.drought data below)

load("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brgo.v1.0anno.Atgoslim.BP.list.Rdata") 
library(ShortRead);library(goseq);library(GO.db);library("annotate")
# read cDNA fasta file with v1.0 annotation
#Br_cdna<-readDNAStringSet("http://brassicadb.org/brad/datasets/pub/Genomes/Brassica_rapa/V1.0/Scaffold1.0/Brassica_rapa.20100830.cds.gz") 
Br_cdna<-readDNAStringSet("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brassica_rapa.20100830.cds.gz") # when above line is slow use this line.
head(Br_cdna)

##   A DNAStringSet instance of length 6
##     width seq                                          names               
## [1]  1515 ATGGGGAAGATCTTGAAAACT...TATCGTCTGCTATTCATTAG Bra000001  [mRNA]...
## [2]  1089 ATGGAGGAAGTAAGGAAGATG...ATGGTGTAGAAACACTCTAA Bra000002  [mRNA]...
## [3]   570 ATGAGCTCTGTTTGTGGTAAG...TAGTTGCAGCATCTTCTTGA Bra000003  [mRNA]...
## [4]   375 ATGATTCGCCGTCTATTCTCG...AAGAAGAAGCAGCCCTTTAA Bra000004  [mRNA]...
## [5]  1434 ATGGCGGCAGCTAGACGATTG...CCGGTGGAGGTTTTCTCTAG Bra000005  [mRNA]...
## [6]   312 ATGTCTGGGCGAGGAAAAGGA...TATATGGATTCGGCGGTTGA Bra000006  [mRNA]...

GOseq ORA function for Brassica rapa

GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v1.0anno.Atgoslim.BP.list,cdna=Br_cdna) { # return GO enrichment table, padjus, padjust=0.05. 
  bias<-nchar(cdna)
  names(bias)<-tibble(Bra=names(cdna)) %>% separate(Bra,into="Bra2",sep=" ",extra="drop") %>% dplyr::select(Bra2) %>% as_vector()
  TF<-as.integer(names(bias) %in% genelist)
    names(TF)<-names(bias)
    #print(TF)
  pwf<-nullp(TF,bias.data=bias)
  #print(pwf$DEgenes)
  GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
  #GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
  
  #head(GO.pval) 
  if(ontology=="BP") {
    GO.pval2<-subset(GO.pval,ontology=="BP")
  } else if(ontology=="CC") {
    GO.pval2<-subset(GO.pval,ontology=="CC")
  } else {
    GO.pval2<-subset(GO.pval,ontology=="MF")
  }
    
  GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
  if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
  else {
    enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,] 
    print("enriched.GO is")
    print(enriched.GO)
    
    ## write Term and Definition 
    for(i in 1:dim(enriched.GO)[1]) {
      if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
      enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
      enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
      }
    }
    return(enriched.GO)
  }
}

GOseq analysis of differentially expressed gene list

Using a Brassica rapa gene list in Greenham (2016). Compare our GOseq results with one in Figure 8B.

# Note: v1.0
download.file("https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjk2NTUvZWxpZmUtMjk2NTUtc3VwcDEtdjIueGxzeA==/elife-29655-supp1-v2.xlsx?_hash=IzWtqJP3ae%2BeIAz0DN%2Fyp44pKc2grNAxM7z%2Bd%2FzBibM%3D",destfile="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought<-read_excel(path="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx",sheet=2,skip=4)
system("rm 2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought.dM5<-Greenham.2016.clock.drought %>% filter(Module=="M5") %>% dplyr::select(Gene) %>% as_vector() # v1.0 annotation
head(Greenham.2016.clock.drought.dM5)

##       Gene1       Gene2       Gene3       Gene4       Gene5       Gene6 
## "Bra011755" "Bra011741" "Bra011671" "Bra011655" "Bra011606" "Bra011581"

eGO.dM5<-GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA(genelist=Greenham.2016.clock.drought.dM5)

## Warning in pcls(G): initial point very close to some inequality constraints

## Using manually entered categories.

## Calculating the p-values...

## 'select()' returned 1:1 mapping between keys and columns

## [1] "enriched.GO is"
##        category over_represented_pvalue under_represented_pvalue
## 1517 GO:0015979            2.192366e-19                1.0000000
## 977  GO:0009768            2.651766e-17                1.0000000
## 978  GO:0009769            1.204903e-10                1.0000000
## 914  GO:0009645            9.683773e-10                1.0000000
## 913  GO:0009644            3.394233e-08                1.0000000
## 860  GO:0009416            8.568277e-08                1.0000000
## 1168 GO:0010114            2.127734e-07                1.0000000
## 854  GO:0009409            5.068985e-06                0.9999983
## 1720 GO:0019684            8.198031e-06                0.9999997
## 640  GO:0006970            2.153608e-05                0.9999952
##      numDEInCat numInCat
## 1517         27      192
## 977          14       37
## 978           7       13
## 914           7       16
## 913          12      106
## 860          22      386
## 1168         11      100
## 854          26      667
## 1720          5       21
## 640          14      251
##                                                    term ontology
## 1517                                     photosynthesis       BP
## 977   photosynthesis, light harvesting in photosystem I       BP
## 978  photosynthesis, light harvesting in photosystem II       BP
## 914            response to low light intensity stimulus       BP
## 913                    response to high light intensity       BP
## 860                          response to light stimulus       BP
## 1168                              response to red light       BP
## 854                                    response to cold       BP
## 1720                     photosynthesis, light reaction       BP
## 640                          response to osmotic stress       BP
##      over_represented_padjust
## 1517             8.162180e-16
## 977              4.936262e-14
## 978              1.495284e-07
## 914              9.013172e-07
## 913              2.527346e-05
## 860              5.316616e-05
## 1168             1.131651e-04
## 854              2.358979e-03
## 1720             3.391252e-03
## 640              8.017883e-03

Over-representation analysis 3: GOseq with GO term in non-model organisms

Kazu Nozue