A typical RNASeq analysis
Hong Zheng / 2017-09-18
This is a typical RNASeq analysis pipeline, not atypical.
Purpose
Given RNASeq data from two group of samples (sequenced from mice), control and treatment, find out the differentially expressed genes.
Scripts can also be found in https://github.com/zhengh42/RNASeq_MM.
Download reference and prepare index
#######
# These steps are performed in linux/bash, in reference/ directory
# Script location: reference/run_download_index.sh
# Kallisto and Salmon are from pre-built Docker images. Scripts for building the images can be found in https://github.com/zhengh42/Dockerfiles
#######
#######
# Download from Gencode release M15
#######
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M15/GRCm38.primary_assembly.genome.fa.gz
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M15/gencode.vM15.annotation.gtf.gz
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M15/gencode.vM15.transcripts.fa.gz
gzip -d *.gz
#######
# Stats of Gencode release M15
#######
# Number of genes: 52550
less gencode.vM15.annotation.gtf | awk '$3=="gene"' | wc -l
# Number of transcript: 131100
less gencode.vM15.annotation.gtf | awk '$3=="transcript"' | wc -l
less gencode.vM15.transcripts.fa | grep '^>' | wc -l
# Get gene name and type
less gencode.vM15.annotation.gtf | awk '$3=="gene"' | cut -f9 | awk '{print $2,$4,$6}' | sed 's/[";]//g;s/\s\+/\t/g' | sed '1i geneid\tgenetype\tgenename' > gene.info
#######
# Generate transcript to gene mapping file
#######
less gencode.vM15.transcripts.fa | egrep '^>' | sed 's/^>//' | awk 'OFS="\t"{print $0,$0}' | sed 's/|/\t/;s/|/\t/' | awk 'OFS="\t"{print $4,$2}' > tx2gene.txt
transcripts_fasta=gencode.vM15.transcripts.fa
#######
# Kallisto index
#######
docker run -v $PWD:/mnt zhengh42/kallisto:0.43.1 kallisto index -i /mnt/M15.gencode.kallisto.idx /mnt/$transcripts_fasta
#######
# Salmon index
#######
docker run -v $PWD:/mnt zhengh42/salmon:0.8.2 salmon index -i /mnt/M15.gencode.salmon.idx -t /mnt/$transcripts_fasta
QC and transcript-level expression quantification
#######
# These steps are performed in linux/bash, in scripts/ directory
# Trim-galore is from pre-built Docker images. Scripts for building the images can be found in https://github.com/zhengh42/Dockerfiles
# Script location: scripts/run_indiv.sh
# How to run:
# ./run_indiv.sh sham1 Raw_sham1
# ./run_indiv.sh sham2 Raw_sham2
# ./run_indiv.sh sham3 Raw_sham3
# ./run_indiv.sh WT_Ang1 Raw_WT_Ang1
# ./run_indiv.sh WT_Ang2 Raw_WT_Ang2
# ./run_indiv.sh WT_Ang3 Raw_WT_Ang3
#######
#######
# Arguments setup
#######
id=$1
id2=$2
raw_dir=/srv/gevaertlab/data/Hong/RNASeq/hb/Reads
work_dir=/srv/gevaertlab/data/Hong/RNASeq/hb/RNASeq_MM_HB/
ref_dir=$work_dir/reference
#######
# QC with trim-galore and fastqc
#######
docker run -v $raw_dir:/home zhengh42/trim-galore:0.4.4 \
trim_galore -q 15 --stringency 3 --gzip --length 15 --paired
/home/$id/${id2}_1.fq.gz /home/$id/${id2}_2.fq.gz --fastqc --output_dir /home
1> ../logs/$id.trim_galore.log 2>&1
#######
# Get the stand-specific information of the reads
# In the log file of salmon output: Automatically detected most likely library type as ISR
#######
zcat $raw_dir/${id2}_1_val_1.fq.gz | head -n 400000 | gzip > $raw_dir/${id2}_test_1.fq.gz
zcat $raw_dir/${id2}_2_val_2.fq.gz | head -n 400000 | gzip > $raw_dir/${id2}_test_2.fq.gz
docker run -v $raw_dir:/home/seq -v $ref_dir:/home/ref -v $work_dir/results/salmon:/home/out \
zhengh42/salmon:0.8.2 \
salmon quant -i /home/ref/M15.gencode.salmon.idx -l A \
-o /home/out/$id.test -1 /home/seq/${id2}_test_1.fq.gz -2 /home/seq/${id2}_test_2.fq.gz \
1> ../logs/$id.salmon.log 2>&1
#######
# Get transcript level expression estimate using Kallisto
# Since the library type is ISR, --rf-stranded argument is specified
# More information on the library type and strand information, see http://fishycat.netlify.com/en/2017/08/strandness_in_rnaseq/
#######
docker run -v $raw_dir:/home/seq -v $ref_dir:/home/ref -v $work_dir/results/kallisto:/home/out \
zhengh42/kallisto:0.43.1 \
kallisto quant -i /home/ref/M15.gencode.kallisto.idx \
-o /home/out/$id /home/seq/${id2}_1_val_1.fq.gz /home/seq/${id2}_2_val_2.fq.gz \
-b 100 --rf-stranded --fusion 1> ../logs/$id.kallisto.log 2>&1
Differential expression analysis
#######
# The following steps are performed in R, in scripts/ directory
#######
#######
# Load required packages, set up directory, read in files
#######
library(readr)
library(tximport)
require("knitr")
require(DESeq2) # version 1.16.1
require(IHW)
require(dplyr)
require(ComplexHeatmap)
require(vsn)
require(RColorBrewer)
require(ggplot2)
work_dir<- "/srv/gevaertlab/data/Hong/RNASeq/hb/RNASeq_MM_HB/"
opts_knit$set(root.dir = work_dir)
sampleID <- scan(paste0(work_dir,"sampleID"),what="character",quiet=TRUE)
geneinfo<-read.table(paste0(work_dir,"reference/gene.info"),head=T,sep = "\t")
#######
# Get gene-level expression from transcript level results of Kallisto
#######
files.kallisto <- file.path(paste0(work_dir,"results/kallisto"),sampleID,"abundance.tsv")
all(file.exists(files.kallisto))
names(files.kallisto)<- sampleID
tx2gene<-read.table(paste0(work_dir,"reference/tx2gene.txt"))
Kallisto.txim <- tximport(files.kallisto,type="kallisto",tx2gene = tx2gene)
write.csv(Kallisto.txim,file=paste0(work_dir, "results/kallisto/Kallisto.txim.txt"),row.names = T)
1. DESeq2
#######
# Read in data and pre-filtering
#######
sampleTable=data.frame(condition = factor(rep(c("wt-sham","wt-ang"),each=3)))
rownames(sampleTable) = colnames(Kallisto.txim$counts)
dds<- DESeqDataSetFromTximport(Kallisto.txim,sampleTable,~condition)
dds$condition <- factor(dds$condition, levels = c("wt-sham","wt-ang"))
dim(dds) #[1] 52550 6
# Only genes with at least two counts across the samples were kept
dds <- dds[ rowSums(counts(dds)) > 1, ]
dim(dds) #[1] 31043 6
#######
# Generate DESeq object
#######
dds <- DESeq(dds)
#######
# Different approaches of tranformation. DESeq uses regularized log transformation as default.
#######
# Shifted logarithm transformation, log2(n + 1), has elevated standard deviation in the lower count range.
ntd <- normTransform(dds)
meanSdPlot(assay(ntd))
# Regularized log transformation, less elevated standard deviation in the lower count range.
rld <- rlog(dds, blind=FALSE)
meanSdPlot(assay(rld))
# Variance stabilizing transformation.It may over-correct standard deviation and mask true differences due to the experimental conditions.
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
meanSdPlot(assay(vsd))
#######
# Data quality assessment by sample clustering and visualization
#######
# Sample clustering based on first 400 highly-expressed genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:400]
p<- Heatmap(assay(rld)[select,],
clustering_distance_rows = "euclidean", clustering_distance_columns = "euclidean",
clustering_method_rows = "average",clustering_method_columns = "average",
show_row_names =F,
name="distance",
column_names_gp = gpar(fontsize = 9)
)
draw(p,newpage = T)
### Heatmap of the sample-to-sample distances
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Oranges")) )(255)
p<- Heatmap(sampleDistMatrix,
clustering_distance_rows = "euclidean", clustering_distance_columns = "euclidean",
clustering_method_rows = "average",clustering_method_columns = "average",
col=colors,
name="Distance",
column_names_gp = gpar(fontsize = 9)
)
draw(p,newpage = T)
# PCA plot
pcaData <- plotPCA(rld, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition,shape=name)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
QC summary
From the clustering and visualization plots above, we can see that sham1 and sham2 always cluster together, as well as Ang1 and Ang2. However, instead of clustering with sham group, sham3 always clusters with Ang group. Is there any problem with this sample?
To accurately measure fold change, two versions of differential expression analysis was performed, one with sham3 and one with out.
Differential expression analysis with all samples
#######
# Differential expression analysis
#######
res <- results(dds)
# Get moderated and shrunken log2 fold changes. No effect on p values
resLFC <- lfcShrink(dds, coef=2, res=res)
#######
# MA plot
#######
plotMA(res, ylim=c(-2,2))
plotMA(resLFC, ylim=c(-2,2))
#######
# Choose top differentially-expressed genes
#######
# By default, DESeq performs independent filtering based on the mean of normalized counts for each gene, optimizing the number of genes which will have an adjusted p value below a given FDR cutoff (default 0.1). Using alpha to change the threshold: results(dds, alpha=0.2)
summary(res)
##
## out of 31043 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 365, 1.2%
## LFC < 0 (down) : 444, 1.4%
## outliers [1] : 15, 0.048%
## low counts [2] : 12635, 41%
## (mean count < 14)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Filtering by Independent Hypothesis Weighting
resIHW <- results(dds, filterFun=ihw)
summary(resIHW)
##
## out of 31043 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 404, 1.3%
## LFC < 0 (down) : 439, 1.4%
## outliers [1] : 15, 0.048%
## [1] see 'cooksCutoff' argument of ?results
## [2] see metadata(res)$ihwResult on hypothesis weighting
There are 809 genes with adjusted p value less than 0.1 using default independent filtering.
There are 843 genes with adjusted p value less than 0.1 using independent hypothesis weighting filtering.
######
# Write results
#######
DESeq.gene.out <- as.data.frame(res)
DESeq.gene.out$log2FoldChange_lfc <- resLFC$log2FoldChange
DESeq.gene.out$padj_IHW <- resIHW$padj
DESeq.gene.out$geneid <- rownames(res)
DESeq.geneanno.out <- merge(x=DESeq.gene.out,y=geneinfo,by = "geneid", all.x = TRUE)
Kallisto.txim.abundance <- as.data.frame(Kallisto.txim$abundance)
colnames(Kallisto.txim.abundance) <- gsub("^","TPM.",colnames(Kallisto.txim.abundance))
Kallisto.txim.abundance$geneid<-rownames(Kallisto.txim.abundance)
Kallisto.txim.counts <- as.data.frame(Kallisto.txim$counts)
colnames(Kallisto.txim.counts) <- gsub("^","counts.",colnames(Kallisto.txim.counts))
Kallisto.txim.counts$geneid<-rownames(Kallisto.txim.counts)
DESeq.geneanno.out<-merge(x=DESeq.geneanno.out,y=Kallisto.txim.abundance,by = "geneid", all.x = TRUE)
DESeq.geneanno.out<-merge(x=DESeq.geneanno.out,y=Kallisto.txim.counts,by = "geneid", all.x = TRUE)
DESeq.geneanno.out.ordered<-DESeq.geneanno.out[order(DESeq.geneanno.out$padj_IHW),]
# What are the types of the genes?
table(DESeq.geneanno.out.ordered$genetype)
##
## 3prime_overlapping_ncRNA antisense_RNA
## 1 1684
## bidirectional_promoter_lncRNA IG_C_gene
## 107 11
## IG_C_pseudogene IG_D_gene
## 0 0
## IG_D_pseudogene IG_J_gene
## 0 9
## IG_LV_gene IG_pseudogene
## 0 0
## IG_V_gene IG_V_pseudogene
## 111 2
## lincRNA macro_lncRNA
## 2763 0
## miRNA misc_RNA
## 811 173
## Mt_rRNA Mt_tRNA
## 2 22
## polymorphic_pseudogene processed_pseudogene
## 23 2306
## processed_transcript protein_coding
## 626 18179
## pseudogene ribozyme
## 49 9
## rRNA scaRNA
## 31 29
## scRNA sense_intronic
## 1 284
## sense_overlapping snoRNA
## 20 323
## snRNA sRNA
## 299 1
## TEC TR_C_gene
## 2510 7
## TR_D_gene TR_J_gene
## 0 46
## TR_J_pseudogene TR_V_gene
## 3 29
## TR_V_pseudogene transcribed_processed_pseudogene
## 2 159
## transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene
## 10 128
## unitary_pseudogene unprocessed_pseudogene
## 4 269
# Get lncRNA genes
lncrna<-c("lincRNA","antisense_RNA","sense_intronic","sense_overlapping","TEC")
DESeq.geneanno.out.ordered.lncRNA <- DESeq.geneanno.out.ordered %>% filter(genetype %in% lncrna)
write.table(DESeq.geneanno.out.ordered.lncRNA,file = "DESeq.geneanno.out.ordered.lncRNA.txt",sep="\t",quote = F,row.names = F)
write.table(DESeq.geneanno.out.ordered,file = "DESeq.geneanno.out.ordered.txt",sep="\t",quote = F,row.names = F)
Differential expression analysis without sham3
# Start with Kallisto data import
files.kallisto_a <- files.kallisto[-which(names(files.kallisto) %in% c("sham3") )]
Kallisto.txim_a <- tximport(files.kallisto_a,type="kallisto",tx2gene = tx2gene)
# DESeq
sampleTable_a <- sampleTable[-which(rownames(sampleTable) %in% c("sham3")),,F]
dds_a<- DESeqDataSetFromTximport(Kallisto.txim_a,sampleTable_a,~condition)
dds_a$condition <- factor(dds_a$condition, levels = c("wt-sham","wt-ang"))
dim(dds_a) #[1] 52550 6
# Only genes with at least two counts across the samples were kept
dds_a <- dds_a[ rowSums(counts(dds_a)) > 1, ]
dim(dds_a) #[1] 30796 5
dds_a <- DESeq(dds_a)
res_a <- results(dds_a)
resLFC_a <- lfcShrink(dds_a, coef=2, res=res_a)
#######
# MA plot
#######
plotMA(res_a, ylim=c(-2,2))
plotMA(resLFC_a, ylim=c(-2,2))
# Choose top differentially-expressed genes
summary(res_a)
##
## out of 30796 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1251, 4.1%
## LFC < 0 (down) : 1102, 3.6%
## outliers [1] : 0, 0%
## low counts [2] : 14330, 47%
## (mean count < 28)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Filtering by Independent Hypothesis Weighting
resIHW_a <- results(dds_a, filterFun=ihw)
summary(resIHW_a)
##
## out of 30796 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1342, 4.4%
## LFC < 0 (down) : 1108, 3.6%
## outliers [1] : 0, 0%
## [1] see 'cooksCutoff' argument of ?results
## [2] see metadata(res)$ihwResult on hypothesis weighting
There are 2353 genes with adjusted p value less than 0.1 using default independent filtering.
There are 2450 genes with adjusted p value less than 0.1 using independent hypothesis weighting filtering.
# Write results
DESeq.gene.out_a <- as.data.frame(res_a)
DESeq.gene.out_a$log2FoldChange_lfc <- resLFC_a$log2FoldChange
DESeq.gene.out_a$padj_IHW <- resIHW_a$padj
DESeq.gene.out_a$geneid <- rownames(res_a)
DESeq.geneanno.out_a <- merge(x=DESeq.gene.out_a,y=geneinfo,by = "geneid", all.x = TRUE)
DESeq.geneanno.out_a<-merge(x=DESeq.geneanno.out_a,y=Kallisto.txim.abundance,by = "geneid", all.x = TRUE)
DESeq.geneanno.out_a<-merge(x=DESeq.geneanno.out_a,y=Kallisto.txim.counts,by = "geneid", all.x = TRUE)
DESeq.geneanno.out.ordered_a<-DESeq.geneanno.out_a[order(DESeq.geneanno.out_a$padj_IHW),]
# What are the types of the genes?
table(DESeq.geneanno.out.ordered_a$genetype)
##
## 3prime_overlapping_ncRNA antisense_RNA
## 1 1654
## bidirectional_promoter_lncRNA IG_C_gene
## 107 11
## IG_C_pseudogene IG_D_gene
## 0 0
## IG_D_pseudogene IG_J_gene
## 0 8
## IG_LV_gene IG_pseudogene
## 0 0
## IG_V_gene IG_V_pseudogene
## 105 2
## lincRNA macro_lncRNA
## 2736 0
## miRNA misc_RNA
## 799 171
## Mt_rRNA Mt_tRNA
## 2 21
## polymorphic_pseudogene processed_pseudogene
## 22 2242
## processed_transcript protein_coding
## 623 18117
## pseudogene ribozyme
## 48 9
## rRNA scaRNA
## 31 29
## scRNA sense_intronic
## 1 284
## sense_overlapping snoRNA
## 20 320
## snRNA sRNA
## 294 1
## TEC TR_C_gene
## 2498 7
## TR_D_gene TR_J_gene
## 0 45
## TR_J_pseudogene TR_V_gene
## 3 27
## TR_V_pseudogene transcribed_processed_pseudogene
## 2 158
## transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene
## 10 125
## unitary_pseudogene unprocessed_pseudogene
## 3 260
# Get lncRNA genes
lncrna<-c("lincRNA","antisense_RNA","sense_intronic","sense_overlapping","TEC")
DESeq.geneanno.out.ordered.lncRNA_a <- DESeq.geneanno.out.ordered_a %>% filter(genetype %in% lncrna)
write.table(DESeq.geneanno.out.ordered.lncRNA_a,file = "DESeq.geneanno.out.ordered.lncRNA.wosham3.txt",sep="\t",quote = F,row.names = F)
write.table(DESeq.geneanno.out.ordered_a,file = "DESeq.geneanno.out.ordered.wosham3.txt",sep="\t",quote = F,row.names = F)
Easy report
#######
# This report is based on DESeq analysis of all samples, default filtering. See ./reports for details.
#######
des2Report <- HTMLReport(shortName = 'RNAseq_analysis_with_DESeq2',
title = 'RNA-seq analysis of differential expression using DESeq2',
reportDirectory = "./reports")
publish(dds,des2Report, pvalueCutoff=0.1,
annotation.db="org.Mm.eg.db", factor = colData(dds)$condition,
reportDir="./reports")
finish(des2Report)
Other notes
What’s in the output file?
- geneid
- log2FoldChange, default fold change output by DESeq2.
- pvalue
- padj, adjusted p value
- log2FoldChange_lfc, moderated and shrunken log2 fold changes, more conservative (and accurate?)
- padj_IHW, adjusted p value of independent hypothesis weighting
(smaller, which results in more differentially-expressed genes)
- genetype
- genename
- TPM.sample, the transcript per million of the gene in the sample
- counts.sample, the read counts of the gene in the sample