If you want to analyze non-model organisms, what you can do?
Please prepare
- fasta file for cDNA of your species (eg. Brassica rapa),
- fasta file for cDNA of a close model organisms to your species (eg. Arabidopsis thaliana for Brassica rapa),
- blastn program installed in your computer (UNIX (or Mac OSX) or LINUX) (detailed instruciton from NCBI)
required libraries
library(tidyverse);library(readr);library(readxl)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
BLASTN
Use the Chifu V1.0 file BLAST against TAIR10 CDS
Because these organisms are relatively closely related I will use blastn instead of blastp to focus on best match
# look Julin's scripts and work by myself
list.files()
setwd("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files")
system("makeblastdb -in TAIR10_cds_20110103_representative_gene_model_updated.fa -dbtype nucl")
#
system("gunzip Brassica_rapa.20100830.cds.gz")
# BLASTN
system("blastn -query Brassica_rapa.20100830.cds -db TAIR10_cds_20110103_representative_gene_model_updated.fa -strand both -task dc-megablast -outfmt 10 -culling_limit 1 -max_hsps 1 -evalue 10e-4 -num_threads 3 -template_type coding -template_length 16 -out Brapa1.0_vs_At_dc-megablast_out.csv")
# system("rm Brassica_rapa.20100830.cds")
setwd("..")
import results
brapa.blast <- read_csv("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa1.0_vs_At_dc-megablast_out.csv", col_names = FALSE)
colnames(brapa.blast) <- c("query","subject","perc_ID","aln_length","mismatch","gap_open","qstart","qend","sstart","send","eval","score")
head(brapa.blast)
summary(brapa.blast)
brapa.blast %>% group_by(query) %>% summarise(n())
# select highest score within one query
brapa.blast %>% group_by(query) %>% filter(score==max(score)) %>% summarise(n())
Br.v1.0anno.At.BLAST <- brapa.blast %>% group_by(query) %>% filter(score==max(score))
write_csv(Br.v1.0anno.At.BLAST,path="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa_V1.0_annotated.csv")
Reading Br-At gene name conversion table
### prerequisit: results from reciprocal BLAST against Arabidopsis thaliana
Br.v1.0anno.At.BLAST<-readr::read_csv(file="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brapa_V1.0_annotated.csv") # needs to use updated one from Julin (111718)
## Parsed with column specification:
## cols(
## query = col_character(),
## subject = col_character(),
## perc_ID = col_double(),
## aln_length = col_integer(),
## mismatch = col_integer(),
## gap_open = col_integer(),
## qstart = col_integer(),
## qend = col_integer(),
## sstart = col_integer(),
## send = col_integer(),
## eval = col_double(),
## score = col_double()
## )
Br.v1.0anno.At.BLAST
## # A tibble: 37,670 x 12
## query subject perc_ID aln_length mismatch gap_open qstart qend sstart
## <chr> <chr> <dbl> <int> <int> <int> <int> <int> <int>
## 1 Bra0… AT2G37… 85.0 1509 149 7 1 1506 1
## 2 Bra0… AT2G37… 86.8 1080 141 2 1 1079 1
## 3 Bra0… AT2G37… 83.1 590 75 7 1 570 1
## 4 Bra0… AT2G37… 81.9 365 63 2 1 365 1
## 5 Bra0… AT2G37… 82.8 1218 161 9 1 1179 1
## 6 Bra0… AT3G53… 85.1 308 46 0 1 308 1
## 7 Bra0… AT2G37… 78.6 729 117 10 1 711 1
## 8 Bra0… AT2G37… 81.7 750 77 7 1 702 1
## 9 Bra0… AT2G37… 88.1 337 37 1 319 652 1
## 10 Bra0… AT2G37… 91.5 1132 96 0 1 1132 1
## # ... with 37,660 more rows, and 3 more variables: send <int>, eval <dbl>,
## # score <dbl>
Making Brgo.v1.5anno.Atgoslim.BP.list
# Reading Arabidopsis thaliana GOslim list object (see ["Over-representation analysis 1: GOseq with GO terms in unsupported model organisms"](https://knozue.github.io/post/2018/09/26/over-representation-analysis-1-goseq-with-arabidopsis-go-term.html) for details)
load("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Atgoslim.TAIR.BP.list.Rdata")
head(Atgoslim.TAIR.BP.list)
# remove splicing variant
Br.v1.0anno.At.BLAST <- Br.v1.0anno.At.BLAST %>% mutate(AGI=str_remove(subject,pattern="\\.[[:digit:]]+"))
# asign At GO into corresponding Br genes
Brgo.v1.0anno.Atgoslim.BP.list<-list()
for(i in 1:length(Br.v1.0anno.At.BLAST$query)) {
if(is.null(Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]])) next else {
Brgo.v1.0anno.Atgoslim.BP.list[[i]]<-Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]]
names(Brgo.v1.0anno.Atgoslim.BP.list)[[i]]<-as_vector(Br.v1.0anno.At.BLAST[i,"query"])
}
}
table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)) # FALSE 33474 TRUE 4196
# remove gene with no GO term
Brgo.v1.0anno.Atgoslim.BP.list<-Brgo.v1.0anno.Atgoslim.BP.list[!sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)]
table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null))
save(Brgo.v1.0anno.Atgoslim.BP.list,file=file.path("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
#} else {print("change directory");stop}
prep for Brassica rapa GOseq analysis (v1.0 annotation) (for Greenham.2016.clock.drought data below)
load("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brgo.v1.0anno.Atgoslim.BP.list.Rdata")
library(ShortRead);library(goseq);library(GO.db);library("annotate")
# read cDNA fasta file with v1.0 annotation
#Br_cdna<-readDNAStringSet("http://brassicadb.org/brad/datasets/pub/Genomes/Brassica_rapa/V1.0/Scaffold1.0/Brassica_rapa.20100830.cds.gz")
Br_cdna<-readDNAStringSet("2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/Brassica_rapa.20100830.cds.gz") # when above line is slow use this line.
head(Br_cdna)
## A DNAStringSet instance of length 6
## width seq names
## [1] 1515 ATGGGGAAGATCTTGAAAACT...TATCGTCTGCTATTCATTAG Bra000001 [mRNA]...
## [2] 1089 ATGGAGGAAGTAAGGAAGATG...ATGGTGTAGAAACACTCTAA Bra000002 [mRNA]...
## [3] 570 ATGAGCTCTGTTTGTGGTAAG...TAGTTGCAGCATCTTCTTGA Bra000003 [mRNA]...
## [4] 375 ATGATTCGCCGTCTATTCTCG...AAGAAGAAGCAGCCCTTTAA Bra000004 [mRNA]...
## [5] 1434 ATGGCGGCAGCTAGACGATTG...CCGGTGGAGGTTTTCTCTAG Bra000005 [mRNA]...
## [6] 312 ATGTCTGGGCGAGGAAAAGGA...TATATGGATTCGGCGGTTGA Bra000006 [mRNA]...
GOseq ORA function for Brassica rapa
GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v1.0anno.Atgoslim.BP.list,cdna=Br_cdna) { # return GO enrichment table, padjus, padjust=0.05.
bias<-nchar(cdna)
names(bias)<-tibble(Bra=names(cdna)) %>% separate(Bra,into="Bra2",sep=" ",extra="drop") %>% dplyr::select(Bra2) %>% as_vector()
TF<-as.integer(names(bias) %in% genelist)
names(TF)<-names(bias)
#print(TF)
pwf<-nullp(TF,bias.data=bias)
#print(pwf$DEgenes)
GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
#GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
#head(GO.pval)
if(ontology=="BP") {
GO.pval2<-subset(GO.pval,ontology=="BP")
} else if(ontology=="CC") {
GO.pval2<-subset(GO.pval,ontology=="CC")
} else {
GO.pval2<-subset(GO.pval,ontology=="MF")
}
GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
else {
enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,]
print("enriched.GO is")
print(enriched.GO)
## write Term and Definition
for(i in 1:dim(enriched.GO)[1]) {
if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
}
}
return(enriched.GO)
}
}
GOseq analysis of differentially expressed gene list
Using a Brassica rapa gene list in Greenham (2016). Compare our GOseq results with one in Figure 8B.
# Note: v1.0
download.file("https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjk2NTUvZWxpZmUtMjk2NTUtc3VwcDEtdjIueGxzeA==/elife-29655-supp1-v2.xlsx?_hash=IzWtqJP3ae%2BeIAz0DN%2Fyp44pKc2grNAxM7z%2Bd%2FzBibM%3D",destfile="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought<-read_excel(path="2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx",sheet=2,skip=4)
system("rm 2018-09-27-over-representation-analysis-3-goseq-with-non-model-go-term_files/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought.dM5<-Greenham.2016.clock.drought %>% filter(Module=="M5") %>% dplyr::select(Gene) %>% as_vector() # v1.0 annotation
head(Greenham.2016.clock.drought.dM5)
## Gene1 Gene2 Gene3 Gene4 Gene5 Gene6
## "Bra011755" "Bra011741" "Bra011671" "Bra011655" "Bra011606" "Bra011581"
eGO.dM5<-GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA(genelist=Greenham.2016.clock.drought.dM5)
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
## [1] "enriched.GO is"
## category over_represented_pvalue under_represented_pvalue
## 1517 GO:0015979 2.192366e-19 1.0000000
## 977 GO:0009768 2.651766e-17 1.0000000
## 978 GO:0009769 1.204903e-10 1.0000000
## 914 GO:0009645 9.683773e-10 1.0000000
## 913 GO:0009644 3.394233e-08 1.0000000
## 860 GO:0009416 8.568277e-08 1.0000000
## 1168 GO:0010114 2.127734e-07 1.0000000
## 854 GO:0009409 5.068985e-06 0.9999983
## 1720 GO:0019684 8.198031e-06 0.9999997
## 640 GO:0006970 2.153608e-05 0.9999952
## numDEInCat numInCat
## 1517 27 192
## 977 14 37
## 978 7 13
## 914 7 16
## 913 12 106
## 860 22 386
## 1168 11 100
## 854 26 667
## 1720 5 21
## 640 14 251
## term ontology
## 1517 photosynthesis BP
## 977 photosynthesis, light harvesting in photosystem I BP
## 978 photosynthesis, light harvesting in photosystem II BP
## 914 response to low light intensity stimulus BP
## 913 response to high light intensity BP
## 860 response to light stimulus BP
## 1168 response to red light BP
## 854 response to cold BP
## 1720 photosynthesis, light reaction BP
## 640 response to osmotic stress BP
## over_represented_padjust
## 1517 8.162180e-16
## 977 4.936262e-14
## 978 1.495284e-07
## 914 9.013172e-07
## 913 2.527346e-05
## 860 5.316616e-05
## 1168 1.131651e-04
## 854 2.358979e-03
## 1720 3.391252e-03
## 640 8.017883e-03