If you want to analyze non-model organisms, what you can do?
Please prepare
- fasta file for cDNA of your species (eg. Brassica rapa),
- fasta file for cDNA of a close model organisms to your species (eg. Arabidopsis thaliana for Brassica rapa),
- blastn program installed in your computer (UNIX (or Mac OSX) or LINUX) (detailed instruciton from NCBI)
required libraries
library(tidyverse);library(readr);library(readxl)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
BLASTN
Use the Chifu V1.0 file BLAST against TAIR10 CDS
Because these organisms are relatively closely related I will use blastn instead of blastp to focus on best match
# look Julin's scripts and work by myself
list.files()
default.path <- getwd()
setwd(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms"))
system("makeblastdb -in TAIR10_cds_20110103_representative_gene_model_updated.fa -dbtype nucl")
#
system("gunzip Brassica_rapa.20100830.cds.gz")
# BLASTN
system("blastn -query Brassica_rapa.20100830.cds -db TAIR10_cds_20110103_representative_gene_model_updated.fa -strand both -task dc-megablast -outfmt 10 -culling_limit 1 -max_hsps 1 -evalue 10e-4 -num_threads 3 -template_type coding -template_length 16 -out Brapa1.0_vs_At_dc-megablast_out.csv")
# system("rm Brassica_rapa.20100830.cds")
setwd(default.path)
import results
brapa.blast <- read_csv(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa1.0_vs_At_dc-megablast_out.csv"), col_names = FALSE)
colnames(brapa.blast) <- c("query","subject","perc_ID","aln_length","mismatch","gap_open","qstart","qend","sstart","send","eval","score")
head(brapa.blast)
summary(brapa.blast)
brapa.blast %>% group_by(query) %>% summarise(n())
# select highest score within one query
brapa.blast %>% group_by(query) %>% filter(score==max(score)) %>% summarise(n())
Br.v1.0anno.At.BLAST <- brapa.blast %>% group_by(query) %>% filter(score==max(score))
write_csv(Br.v1.0anno.At.BLAST,path=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa_V1.0_annotated.csv"))
Reading Br-At gene name conversion table
### prerequisit: results from reciprocal BLAST against Arabidopsis thaliana
Br.v1.0anno.At.BLAST<-readr::read_csv(file=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brapa_V1.0_annotated.csv")) # needs to use updated one from Julin (111718)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## query = col_character(),
## subject = col_character(),
## perc_ID = col_double(),
## aln_length = col_double(),
## mismatch = col_double(),
## gap_open = col_double(),
## qstart = col_double(),
## qend = col_double(),
## sstart = col_double(),
## send = col_double(),
## eval = col_double(),
## score = col_double()
## )
Br.v1.0anno.At.BLAST
## # A tibble: 37,670 x 12
## query subject perc_ID aln_length mismatch gap_open qstart qend sstart send
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bra0… AT2G37… 85.0 1509 149 7 1 1506 1 1434
## 2 Bra0… AT2G37… 86.8 1080 141 2 1 1079 1 1079
## 3 Bra0… AT2G37… 83.1 590 75 7 1 570 1 585
## 4 Bra0… AT2G37… 81.9 365 63 2 1 365 1 362
## 5 Bra0… AT2G37… 82.8 1218 161 9 1 1179 1 1209
## 6 Bra0… AT3G53… 85.1 308 46 0 1 308 1 308
## 7 Bra0… AT2G37… 78.6 729 117 10 1 711 1 708
## 8 Bra0… AT2G37… 81.7 750 77 7 1 702 1 738
## 9 Bra0… AT2G37… 88.1 337 37 1 319 652 1 337
## 10 Bra0… AT2G37… 91.5 1132 96 0 1 1132 1 1132
## # … with 37,660 more rows, and 2 more variables: eval <dbl>, score <dbl>
Making Brgo.v1.5anno.Atgoslim.BP.list
# Reading Arabidopsis thaliana GOslim list object (see ["Over-representation analysis 1: GOseq with GO terms in unsupported model organisms"](https://knozue.github.io/post/2018/09/26/over-representation-analysis-1-goseq-with-arabidopsis-go-term.html) for details)
load(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Atgoslim.TAIR.BP.list.Rdata"))
head(Atgoslim.TAIR.BP.list)
# remove splicing variant
Br.v1.0anno.At.BLAST <- Br.v1.0anno.At.BLAST %>% mutate(AGI=str_remove(subject,pattern="\\.[[:digit:]]+"))
# asign At GO into corresponding Br genes
Brgo.v1.0anno.Atgoslim.BP.list<-list()
for(i in 1:length(Br.v1.0anno.At.BLAST$query)) {
if(is.null(Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]])) next else {
Brgo.v1.0anno.Atgoslim.BP.list[[i]]<-Atgoslim.TAIR.BP.list[[as_vector(Br.v1.0anno.At.BLAST[i,"AGI"])]]
names(Brgo.v1.0anno.Atgoslim.BP.list)[[i]]<-as_vector(Br.v1.0anno.At.BLAST[i,"query"])
}
}
table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)) # FALSE 33474 TRUE 4196
# remove gene with no GO term
Brgo.v1.0anno.Atgoslim.BP.list<-Brgo.v1.0anno.Atgoslim.BP.list[!sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null)]
table(sapply(Brgo.v1.0anno.Atgoslim.BP.list,is.null))
save(Brgo.v1.0anno.Atgoslim.BP.list,file=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
#} else {print("change directory");stop}
prep for Brassica rapa GOseq analysis (v1.0 annotation) (for Greenham.2016.clock.drought data below)
load(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brgo.v1.0anno.Atgoslim.BP.list.Rdata"))
library(ShortRead);library(goseq);library(GO.db);library("annotate")
## Warning: package 'S4Vectors' was built under R version 3.6.3
## Warning: package 'GenomeInfoDb' was built under R version 3.6.3
## Warning: package 'DelayedArray' was built under R version 3.6.3
# read cDNA fasta file with v1.0 annotation
#Br_cdna<-readDNAStringSet("http://brassicadb.org/brad/datasets/pub/Genomes/Brassica_rapa/V1.0/Scaffold1.0/Brassica_rapa.20100830.cds.gz")
Br_cdna<-readDNAStringSet(file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","Brassica_rapa.20100830.cds.gz")) # when above line is slow use this line.
head(Br_cdna)
## A DNAStringSet instance of length 6
## width seq names
## [1] 1515 ATGGGGAAGATCTTGAAAACTAA...TTATATCGTCTGCTATTCATTAG Bra000001 [mRNA]...
## [2] 1089 ATGGAGGAAGTAAGGAAGATGGG...AGGATGGTGTAGAAACACTCTAA Bra000002 [mRNA]...
## [3] 570 ATGAGCTCTGTTTGTGGTAAGCT...CTATAGTTGCAGCATCTTCTTGA Bra000003 [mRNA]...
## [4] 375 ATGATTCGCCGTCTATTCTCGTC...AGGAAGAAGAAGCAGCCCTTTAA Bra000004 [mRNA]...
## [5] 1434 ATGGCGGCAGCTAGACGATTGCG...GGACCGGTGGAGGTTTTCTCTAG Bra000005 [mRNA]...
## [6] 312 ATGTCTGGGCGAGGAAAAGGAGG...CTTTATATGGATTCGGCGGTTGA Bra000006 [mRNA]...
GOseq ORA function for Brassica rapa
GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v1.0anno.Atgoslim.BP.list,cdna=Br_cdna) { # return GO enrichment table, padjus, padjust=0.05.
bias<-nchar(cdna)
names(bias)<-tibble(Bra=names(cdna)) %>% separate(Bra,into="Bra2",sep=" ",extra="drop") %>% dplyr::select(Bra2) %>% as_vector()
TF<-as.integer(names(bias) %in% genelist)
names(TF)<-names(bias)
#print(TF)
pwf<-nullp(TF,bias.data=bias)
#print(pwf$DEgenes)
GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
#GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
#head(GO.pval)
if(ontology=="BP") {
GO.pval2<-subset(GO.pval,ontology=="BP")
} else if(ontology=="CC") {
GO.pval2<-subset(GO.pval,ontology=="CC")
} else {
GO.pval2<-subset(GO.pval,ontology=="MF")
}
GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
else {
enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,]
print("enriched.GO is")
print(enriched.GO)
## write Term and Definition
for(i in 1:dim(enriched.GO)[1]) {
if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
}
}
return(enriched.GO)
}
}
GOseq analysis of differentially expressed gene list
Using a Brassica rapa gene list in Greenham (2016). Compare our GOseq results with one in Figure 8B.
# Note: v1.0
download.file("https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjk2NTUvZWxpZmUtMjk2NTUtc3VwcDEtdjIueGxzeA==/elife-29655-supp1-v2.xlsx?_hash=IzWtqJP3ae%2BeIAz0DN%2Fyp44pKc2grNAxM7z%2Bd%2FzBibM%3D",destfile=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","elife-29655-supp1-v2.xlsx"))
Greenham.2016.clock.drought<-read_excel(path=file.path("..","..","..","resources","2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms","elife-29655-supp1-v2.xlsx"),sheet=2,skip=4)
system("rm ../../../resources/2018-10-17-over-representation-analysis-3-goseq-with-go-term-in-non-model-organisms/elife-29655-supp1-v2.xlsx")
Greenham.2016.clock.drought.dM5<-Greenham.2016.clock.drought %>% filter(Module=="M5") %>% dplyr::select(Gene) %>% as_vector() # v1.0 annotation
head(Greenham.2016.clock.drought.dM5)
## Gene1 Gene2 Gene3 Gene4 Gene5 Gene6
## "Bra011755" "Bra011741" "Bra011671" "Bra011655" "Bra011606" "Bra011581"
eGO.dM5<-GOseq.Brgo.v1.0.Atgoslim.BP.list.ORA(genelist=Greenham.2016.clock.drought.dM5)
## Warning in pcls(G): initial point very close to some inequality constraints
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
## [1] "enriched.GO is"
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 1517 GO:0015979 2.192366e-19 1.0000000 27
## 977 GO:0009768 2.651766e-17 1.0000000 14
## 978 GO:0009769 1.204903e-10 1.0000000 7
## 914 GO:0009645 9.683773e-10 1.0000000 7
## 913 GO:0009644 3.394233e-08 1.0000000 12
## 860 GO:0009416 8.568277e-08 1.0000000 22
## 1168 GO:0010114 2.127734e-07 1.0000000 11
## 854 GO:0009409 5.068985e-06 0.9999983 26
## 1720 GO:0019684 8.198031e-06 0.9999997 5
## 640 GO:0006970 2.153608e-05 0.9999952 14
## numInCat term ontology
## 1517 192 photosynthesis BP
## 977 37 photosynthesis, light harvesting in photosystem I BP
## 978 13 photosynthesis, light harvesting in photosystem II BP
## 914 16 response to low light intensity stimulus BP
## 913 106 response to high light intensity BP
## 860 386 response to light stimulus BP
## 1168 100 response to red light BP
## 854 667 response to cold BP
## 1720 21 photosynthesis, light reaction BP
## 640 251 response to osmotic stress BP
## over_represented_padjust
## 1517 8.162180e-16
## 977 4.936262e-14
## 978 1.495284e-07
## 914 9.013172e-07
## 913 2.527346e-05
## 860 5.316616e-05
## 1168 1.131651e-04
## 854 2.358979e-03
## 1720 3.391252e-03
## 640 8.017883e-03