library(DESeq2)
library(magrittr)
load("RNAseqGierlinski.RData")
We need to ensure that the fold change will be calculated using the WT as the base line. DESeq
used the levels of the condition to determine the order of the comparison.
str(DESeq.ds$condition)
## Factor w/ 2 levels "SNF2","WT": 1 1 1 1 1 2 2 2 2 2
DESeq.ds$condition <- relevel(DESeq.ds$condition, ref="WT")
str(DESeq.ds$condition)
## Factor w/ 2 levels "WT","SNF2": 2 2 2 2 2 1 1 1 1 1
Analysis design – check that the contrast is set up correctly (we want to compare the samples based on their condition):
design(DESeq.ds)
## ~condition
DESeq.ds <- DESeq(DESeq.ds)
This one line of code is equivalent to these three lines of code:
DESeq.ds <- estimateSizeFactors(DESeq.ds) # sequencing depth normalization between the samples
DESeq.ds <- estimateDispersions(DESeq.ds) # gene-wise dispersion estimates across all samples
DESeq.ds <- nbinomWaldTest(DESeq.ds) # this fits a negative binomial GLM and applies Wald statistics to each gene
Extract the base means across samples, log2 fold changes, standard errors, test statistics, p-values and adjusted p-values for every gene using results()
.
resultsNames(DESeq.ds) # tells you which types of values can be extracted with results()
## [1] "Intercept" "condition_SNF2_vs_WT"
DGE.results <- results(DESeq.ds,
independentFiltering = TRUE,
alpha = 0.05)
head(DGE.results) # the first line will tell you which comparison was done to achieve the log2FC
## log2 fold change (MLE): condition SNF2 vs WT
## Wald test p-value: condition SNF2 vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## YAL012W 5542.3913626 0.3162503 0.1621299 1.9505979 5.110490e-02
## YAL068C 0.9678543 0.4024211 1.2266865 0.3280553 7.428698e-01
## YAL067C 40.8786754 1.0896297 0.2295630 4.7465385 2.069274e-06
## YAL066W 0.1402759 0.6884496 3.1599365 0.2178682 8.275318e-01
## YAL065C 5.1639417 -0.5548612 0.6492826 -0.8545758 3.927860e-01
## YAL064W.B 8.3560022 -0.1925099 0.4398400 -0.4376816 6.616171e-01
## padj
## <numeric>
## YAL012W 1.009289e-01
## YAL068C NA
## YAL067C 1.265448e-05
## YAL066W NA
## YAL065C 5.070495e-01
## YAL064W.B 7.478938e-01
summary(DGE.results)
##
## out of 6394 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1289, 20%
## LFC < 0 (down) : 1455, 23%
## outliers [1] : 0, 0%
## low counts [2] : 248, 3.9%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# the DESeqResult object can basically be handled like a data.frame
table(DGE.results$padj < 0.05)
##
## FALSE TRUE
## 3402 2744
NAs in the padj
column (but values in both log2FC
and pvalue
) are indicative of that gene being filtered out by the independent filtering [because it was very lowly expressed].
The MA-plot provides a global view of the differential genes, with the log2 fold change on the y-axis over the mean of normalized counts.
Genes that pass the significance threshold (adjusted p.value 0.05) are colored in red.
plotMA(DGE.results, alpha = 0.05,
main = "Test: p.adj.value < 0.05", ylim = c(-4,4))
A adj. p-value histogram:
hist(DGE.results$padj,
col="grey", border="white", xlab="", ylab="", main="frequencies of adj. p-values\n(all genes)")
A sorted results table:
DGE.results.sorted <- DGE.results[order(DGE.results$padj),]
head(DGE.results.sorted)
## log2 fold change (MLE): condition SNF2 vs WT
## Wald test p-value: condition SNF2 vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## YGR234W 2866.1031 -4.241757 0.10721525 -39.56300 0.000000e+00
## YDR033W 4101.9799 -3.699637 0.09924320 -37.27850 3.661333e-304
## YOR290C 778.5012 -6.922261 0.18643117 -37.13038 9.088530e-302
## YML123C 4891.2088 -4.684599 0.14896521 -31.44761 4.526327e-217
## YIL121W 880.4174 -2.763201 0.08823873 -31.31506 2.910754e-215
## YHR215W 291.3474 -4.525512 0.15355665 -29.47128 6.720177e-191
## padj
## <numeric>
## YGR234W 0.000000e+00
## YDR033W 1.125128e-300
## YOR290C 1.861937e-298
## YML123C 6.954702e-214
## YIL121W 3.577899e-212
## YHR215W 6.883701e-188
Plotting counts for single genes (seq. depth normalized, log2-transformed)
par(mfrow=c(1,2))
plotCounts(DESeq.ds, gene="YAL056W", normalized = TRUE)
plotCounts(DESeq.ds, gene=which.max(DGE.results$padj), main = "Max. p.adj.")
plotCounts
simply uses counts(dds, normalized = TRUE) + 0.5
.
You can also use pcaExplorer
for individual gene plots of rlog
values.
A heatmap of the genes that show differential expression with adjusted p-value 0.05 :
# identify genes with the desired adjusted p-value cut-off
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05))
# extract rlog-transformed values into a matrix
rlog.dge <- DESeq.rlog[DGEgenes,] %>% assay
library(pheatmap)
# heatmap of DEG sorted by p.adjust
pheatmap(rlog.dge, scale="none", show_rownames = FALSE, main = "DGE (no scaling)")
pheatmap(rlog.dge, scale="row", show_rownames = FALSE, main = "DGE (row-based z-score)")
To find this out, we need to retrieve the gene names and match them to the ORF IDs that we’ve used so far. http://www.bioconductor.org/packages/3.1/data/annotation/ lists annotation packages that are available within R through bioconductor.
We will go with org.Sc.sgd.db
.
#source("http://bioconductor.org/biocLite.R")
#biocLite("org.Sc.sgd.db")
library(org.Sc.sgd.db) # org.Hs.eg.db, org.Mm.eg.db
# list keytypes that are available to query the annotation data base
keytypes(org.Sc.sgd.db)
## [1] "ALIAS" "COMMON" "DESCRIPTION" "ENSEMBL"
## [5] "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID" "ENZYME"
## [9] "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO"
## [13] "GOALL" "INTERPRO" "ONTOLOGY" "ONTOLOGYALL"
## [17] "ORF" "PATH" "PFAM" "PMID"
## [21] "REFSEQ" "SGD" "SMART" "UNIPROT"
# list columns that can be retrieved from the annotation data base
columns(org.Sc.sgd.db)
## [1] "ALIAS" "COMMON" "DESCRIPTION" "ENSEMBL"
## [5] "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID" "ENZYME"
## [9] "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO"
## [13] "GOALL" "INTERPRO" "ONTOLOGY" "ONTOLOGYALL"
## [17] "ORF" "PATH" "PFAM" "PMID"
## [21] "REFSEQ" "SGD" "SMART" "UNIPROT"
# make a batch retrieval for all DE genes
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05))
anno.DGE <- select(org.Sc.sgd.db,
keys = DGEgenes, # rownames
keytype="ORF", # our rownames are ORF identifiers
columns=c("SGD","GENENAME")) # what to return
# check whether SNF2 pops up among the top downregulated genes
#head(anno.DGE[match(DGEgenes, anno.DGE$ORF),])
head(anno.DGE)
## ORF SGD GENENAME
## 1 YGR234W S000003466 YHB1
## 2 YDR033W S000002440 MRH1
## 3 YOR290C S000005816 SNF2
## 4 YML123C S000004592 PHO84
## 5 YIL121W S000001383 QDR2
## 6 YHR215W S000001258 PHO12
To get a feeling for how the difference between WT and snf2 ko looks like for a housekeeping gene, let’s repeat the exercise.
par(mfrow=c(1,2))
plotCounts(dds = DESeq.ds,
gene = "YOR290C",
normalized = TRUE, transform = FALSE,
main = expression(atop("Expression of "*italic("snf2"), "(YOR290C)")))
plotCounts(dds = DESeq.ds,
gene = "YGL012W", # the last gene in DGE
normalized = TRUE, transform = FALSE,
main = expression(atop("Expression of "*italic("erg4"), "(YGL012W)")))
Export the log2FC, p-values etc. into a text file:
out.df <- merge(as.data.frame(DGE.results), anno.DGE, by.x = "row.names", by.y = "ORF")
write.table(subset(out.df, padj < 0.05), file = "DESeq2results_WT-vs-SNF2.txt",
sep = "\t", quote = FALSE, row.names = FALSE)
Among our list of DE genes, which GO terms are enriched?
Transcripts that are longer or more highly expressed give more statistical power for detecting differential expression between samples
The same bias holds true for GO categories: categories with predominantly highly expressed or long genes are more likely to be found to be over-represented within the DEG.
GOseq
:
Manual work-around for mouse:
#biocLite("org.Mm.eg.db")
#biocLite("goseq")
#biocLite("biomaRt")
library(biomaRt)
library(org.Mm.eg.db)
library(goseq)
library(geneLenDataBase)
## gene list from an experiment involving immune cells
## you can download the list here:
## https://github.com/friedue/course_RNA-seq2018/tree/master/Days03-04_ReadCounts_to_DGE
gns <- read.table("mm_genes.txt", stringsAsFactors = FALSE, skip = 1)
## retrieve the ENSEMBL symbols for the gene names
anno.mm <- select(org.Mm.eg.db,
keys = gns$V1,
keytype="SYMBOL",
columns=c("ENSEMBL","SYMBOL","GENENAME")) # what to return
## in addition, retrieve all possible mouse gene names
## because goseq wants to know about the universe of *all* genes
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
mm_all <- getBM(attributes = c("ensembl_gene_id"), mart = mart)
## get length data using goseq's data base of gene lengths
## first, load the data from the geneLenDataBase package
data(list = "mm9.ensGene.LENGTH", package = "geneLenDataBase")
## split the length values per ENSEMBL ID into a list
len_data = split(mm9.ensGene.LENGTH$Length,
mm9.ensGene.LENGTH$Gene)
## get the gene names from the length data base
gn_names = mm9.ensGene.LENGTH$Gene
## calculate the median (and other) lengths summary values for the
## different genes
len = data.frame(Gene = names(len_data),
Median = sapply(len_data,median),
Min = sapply(len_data, min),
Max = sapply(len_data, max),
Count = sapply(len_data, length))
## goseq wants a named binary vector where
## 1 = DE, 0 = not DE
## and the names of the [0,1]-vector are gene IDs
gogns <- len$Gene %in% anno.mm$ENSEMBL %>% as.integer
names(gogns) <- len$Gene
## Quantifying the length bias (= weight for each gene)
## using a Probability Weighting Function (PWF): probability of a gene being DE ~ length
## proportion of DE genes is plotted as a function of the transcript length
pwf <- goseq::nullp(gogns, bias.data = len$Median)
## Warning in pcls(G): initial point very close to some inequality constraints
## do the actual test for enrichment of GO terms
GO.wall <- goseq(pwf, "mm9", "ensGene")
head(GO.wall)
## category over_represented_pvalue under_represented_pvalue
## 1071 GO:0002376 5.314460e-57 1
## 3584 GO:0006955 7.921425e-43 1
## 1243 GO:0002682 8.428431e-41 1
## 1245 GO:0002684 1.762375e-35 1
## 3580 GO:0006950 2.006670e-35 1
## 3581 GO:0006952 1.059548e-34 1
## numDEInCat numInCat term
## 1071 389 2101 immune system process
## 3584 237 1142 immune response
## 1243 237 1138 regulation of immune system process
## 1245 179 780 positive regulation of immune system process
## 3580 456 3161 response to stress
## 3581 233 1279 defense response
## ontology
## 1071 BP
## 3584 BP
## 1243 BP
## 1245 BP
## 3580 BP
## 3581 BP
## can be summarized in http://revigo.irb.hr/
subset(GO.wall, over_represented_pvalue < 0.01,
select = c("category","over_represented_pvalue")) %>%
write.table(., file = "GOterms.txt", # feed this into REVIGO
quote = FALSE,
row.names = FALSE, col.names = FALSE)
For more tools dealing with GO term enrichments and pathway analyses, see the HBC Training Site.