## ----knitr, echo=FALSE, results="hide"-----------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,dev="png",fig.show="hide",
               fig.width=4,fig.height=4.5,
               message=FALSE)


## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------
BiocStyle::latex()


## ----loadDESeq2, echo=FALSE----------------------------------------------
library("DESeq2")


## ----options, results="hide", echo=FALSE---------------------------------
options(digits=3, width=80, prompt=" ", continue=" ")


## ----quick, eval=FALSE---------------------------------------------------
## dds <- DESeqDataSet(se, ~ batch + condition)
## dds <- DESeq(dds)
## res <- results(dds, contrast=c("condition","treat","untreat"))


## ----loadSumExp,cache=TRUE-----------------------------------------------
library("airway")
data("airway")
se <- airway


## ----sumExpInput, cache=TRUE---------------------------------------------
library("DESeq2")
ddsSE <- DESeqDataSet(se, design = ~ cell + dex)
ddsSE


## ----loadPasilla---------------------------------------------------------
library("pasilla")
library("Biobase")
data("pasillaGenes")
countData <- counts(pasillaGenes)
colData <- pData(pasillaGenes)[,c("condition","type")]


## ----showPasilla---------------------------------------------------------
head(countData)
head(colData)


## ----matrixInput---------------------------------------------------------
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData,
                              design = ~ condition)
dds


## ----addFeatureData------------------------------------------------------
featureData <- data.frame(gene=rownames(pasillaGenes))
(mcols(dds) <- DataFrame(mcols(dds), featureData))


## ----htseqDirI, eval=FALSE-----------------------------------------------
## directory <- "/path/to/your/files/"


## ----htseqDirII----------------------------------------------------------
directory <- system.file("extdata", package="pasilla", mustWork=TRUE)


## ----htseqInput, cache=TRUE----------------------------------------------
sampleFiles <- grep("treated",list.files(directory),value=TRUE)
sampleCondition <- sub("(.*treated).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleFiles,
                          fileName = sampleFiles,
                          condition = sampleCondition)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = directory,
                                       design= ~ condition)
ddsHTSeq


## ----relevel-------------------------------------------------------------
dds$condition <- relevel(dds$condition, "untreated")


## ----droplevels----------------------------------------------------------
dds$condition <- droplevels(dds$condition)


## ----deseq, cache=TRUE---------------------------------------------------
dds <- DESeq(dds)
res <- results(dds)
res


## ----parallel, eval=FALSE------------------------------------------------
## library("BiocParallel")
## register(MulticoreParam(4))


## ----resOrder------------------------------------------------------------
resOrdered <- res[order(res$padj),]
head(resOrdered)


## ----sumRes--------------------------------------------------------------
summary(res)


## ----MA, fig.width=4.5, fig.height=4.5-----------------------------------
plotMA(res, main="DESeq2", ylim=c(-2,2))


## ----resMLE--------------------------------------------------------------
resMLE <- results(dds, addMLE=TRUE)
head(resMLE, 4)


## ----MANoPrior, echo=FALSE, fig.width=4.5, fig.height=4.5----------------
df <- data.frame(resMLE$baseMean, resMLE$lfcMLE, ifelse(is.na(res$padj), FALSE, res$padj < .1))
plotMA(df, main=expression(unshrunken~log[2]~fold~changes), ylim=c(-2,2))


## ----MAidentify, eval=FALSE----------------------------------------------
## identify(res$baseMean, res$log2FoldChange)


## ----plotCounts, dev="pdf", fig.width=4.5, fig.height=5------------------
plotCounts(dds, gene=which.min(res$padj), intgroup="condition")


## ----plotCountsAdv, dev="pdf", fig.width=3.5, fig.height=3.5-------------
d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition", 
                returnData=TRUE)
library("ggplot2")
ggplot(d, aes(x=condition, y=count)) + 
  geom_point(position=position_jitter(w=0.1,h=0)) + 
  scale_y_log10(breaks=c(25,100,400))


## ----metadata------------------------------------------------------------
mcols(res)$description


## ----addMLE--------------------------------------------------------------
head(results(dds, addMLE=TRUE),4)


## ----export, eval=FALSE--------------------------------------------------
## write.csv(as.data.frame(resOrdered),
##           file="condition_treated_results.csv")


## ----subset--------------------------------------------------------------
resSig <- subset(resOrdered, padj < 0.1)
resSig


## ----multifactor---------------------------------------------------------
colData(dds)


## ----copyMultifactor-----------------------------------------------------
ddsMF <- dds


## ----replaceDesign,cache=TRUE--------------------------------------------
design(ddsMF) <- formula(~ type + condition)
ddsMF <- DESeq(ddsMF)


## ----multiResults--------------------------------------------------------
resMF <- results(ddsMF)
head(resMF)


## ----multiTypeResults----------------------------------------------------
resMFType <- results(ddsMF, contrast=c("type","single-read","paired-end"))
head(resMFType)


## ----rlogAndVST----------------------------------------------------------
rld <- rlog(dds)
vsd <- varianceStabilizingTransformation(dds)
rlogMat <- assay(rld)
vstMat <- assay(vsd)


## ----vsd1, echo=FALSE, fig.width=4.5, fig.height=4.5---------------------
px     <- counts(dds)[,1] / sizeFactors(dds)[1]
ord    <- order(px)
ord    <- ord[px[ord] < 150]
ord    <- ord[seq(1, length(ord), length=50)]
last   <- ord[length(ord)]
vstcol <- c("blue", "black")
matplot(px[ord],
        cbind(assay(vsd)[, 1], log2(px))[ord, ],
        type="l", lty=1, col=vstcol, xlab="n", ylab="f(n)")
legend("bottomright",
       legend = c(
        expression("variance stabilizing transformation"),
        expression(log[2](n/s[1]))),
       fill=vstcol)


## ----vsd2, fig.width=8, fig.height=3-------------------------------------
library("vsn")
par(mfrow=c(1,3))
notAllZero <- (rowSums(counts(dds))>0)
meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1))
meanSdPlot(assay(rld[notAllZero,]))
meanSdPlot(assay(vsd[notAllZero,]))


## ----heatmap-------------------------------------------------------------
library("RColorBrewer")
library("gplots")
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)


## ----figHeatmap2a, dev="pdf", fig.width=7, fig.height=10-----------------
heatmap.2(counts(dds,normalized=TRUE)[select,], col = hmcol,
          Rowv = FALSE, Colv = FALSE, scale="none",
          dendrogram="none", trace="none", margin=c(10,6))


## ----figHeatmap2b, dev="pdf", fig.width=7, fig.height=10-----------------
heatmap.2(assay(rld)[select,], col = hmcol,
          Rowv = FALSE, Colv = FALSE, scale="none",
          dendrogram="none", trace="none", margin=c(10, 6))


## ----figHeatmap2c, dev="pdf", fig.width=7, fig.height=10-----------------
heatmap.2(assay(vsd)[select,], col = hmcol,
          Rowv = FALSE, Colv = FALSE, scale="none",
          dendrogram="none", trace="none", margin=c(10, 6))


## ----sampleClust---------------------------------------------------------
distsRL <- dist(t(assay(rld)))


## ----figHeatmapSamples, dev="pdf", fig.width=7, fig.height=7-------------
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds),
                                       paste(condition, type, sep=" : "))
hc <- hclust(distsRL)
heatmap.2(mat, Rowv=as.dendrogram(hc),
          symm=TRUE, trace="none", 
          col = rev(hmcol), margin=c(13, 13))


## ----figPCA, dev="pdf", fig.width=5, fig.height=3------------------------
plotPCA(rld, intgroup=c("condition", "type"))


## ----figPCA2, dev="pdf", fig.width=5, fig.height=3-----------------------
data <- plotPCA(rld, intgroup=c("condition", "type"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=type)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance"))


## ----WaldTest, eval=FALSE------------------------------------------------
## dds <- estimateSizeFactors(dds)
## dds <- estimateDispersions(dds)
## dds <- nbinomWaldTest(dds)


## ----combineFactors, eval=FALSE------------------------------------------
## dds$group <- factor(paste0(dds$set, dds$condition))
## design(dds) <- ~ group
## dds <- DESeq(dds)
## resultsNames(dds)
## results(dds, contrast=c("group", "1B", "1A"))


## ----expInter, eval=FALSE------------------------------------------------
## results(dds, contrast=c("condition","B","A"))


## ----expInter2, eval=FALSE-----------------------------------------------
## results(dds, contrast=list("setY.conditionB","setY.conditionA"))


## ----expInter3, eval=FALSE-----------------------------------------------
## results(dds, contrast=list(c("conditionB","setY.conditionB"),
##                            c("conditionA","setY.conditionA")))


## ----expInter4, eval=FALSE-----------------------------------------------
## results(dds, name="setY.conditionB")


## ----expInter5, eval=FALSE-----------------------------------------------
## results(dds, contrast=list(c("conditionB","setY.conditionB")))


## ----expInter6, eval=FALSE-----------------------------------------------
## resultsNames(dds)
## results(dds, contrast=c(0,0,1,0.5))


## ----mmtype, eval=FALSE--------------------------------------------------
## attr(dds, "modelMatrixType")
## attr(dds, "modelMatrix")


## ----dispFit-------------------------------------------------------------
plotDispEsts(dds)


## ----dispFitCustom-------------------------------------------------------
ddsCustom <- dds
useForMedian <- mcols(ddsCustom)$dispGeneEst > 1e-7
medianDisp <- median(mcols(ddsCustom)$dispGeneEst[useForMedian],na.rm=TRUE)
dispersionFunction(ddsCustom) <- function(mu) medianDisp
ddsCustom <- estimateDispersionsMAP(ddsCustom)


## ----filtByMean, dev="pdf"-----------------------------------------------
attr(res,"filterThreshold")
plot(attr(res,"filterNumRej"),type="b",
     ylab="number of rejections")


## ----noFilt--------------------------------------------------------------
resNoFilt <- results(dds, independentFiltering=FALSE)
addmargins(table(filtering=(res$padj < .1), noFiltering=(resNoFilt$padj < .1)))


## ----ddsNoPrior----------------------------------------------------------
ddsNoPrior <- DESeq(dds, betaPrior=FALSE)


## ----lfcThresh-----------------------------------------------------------
par(mfrow=c(2,2),mar=c(2,2,1,1))
yl <- c(-2.5,2.5)

resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs")
resLA <- results(ddsNoPrior, lfcThreshold=.5, altHypothesis="lessAbs")
resG <- results(dds, lfcThreshold=.5, altHypothesis="greater")
resL <- results(dds, lfcThreshold=.5, altHypothesis="less")

plotMA(resGA, ylim=yl)
abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resLA, ylim=yl)
abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resG, ylim=yl)
abline(h=.5,col="dodgerblue",lwd=2)
plotMA(resL, ylim=yl)
abline(h=-.5,col="dodgerblue",lwd=2)


## ----mcols---------------------------------------------------------------
mcols(dds,use.names=TRUE)[1:4,1:4]
# here using substr() only for display purposes
substr(names(mcols(dds)),1,10) 
mcols(mcols(dds), use.names=TRUE)[1:4,]


## ----muAndCooks----------------------------------------------------------
head(assays(dds)[["mu"]])
head(assays(dds)[["cooks"]])


## ----dispersions---------------------------------------------------------
head(dispersions(dds))
# which is the same as 
head(mcols(dds)$dispersion)


## ----sizefactors---------------------------------------------------------
sizeFactors(dds)


## ----coef----------------------------------------------------------------
head(coef(dds))


## ----betaPriorVar--------------------------------------------------------
attr(dds, "betaPriorVar")


## ----dispPriorVar--------------------------------------------------------
dispersionFunction(dds)
attr(dispersionFunction(dds), "dispPriorVar")


## ----normFactors, eval=FALSE---------------------------------------------
## normFactors <- normFactors / exp(rowMeans(log(normFactors)))
## normalizationFactors(dds) <- normFactors


## ----offsetTransform, eval=FALSE-----------------------------------------
## cqnOffset <- cqnObject$glm.offset
## cqnNormFactors <- exp(cqnOffset)
## EDASeqNormFactors <- exp(-1 * EDASeqOffset)


## ----lineardep, echo=FALSE-----------------------------------------------
data.frame(batch=factor(c(1,1,2,2)), condition=factor(c("A","A","B","B")))


## ----lineardep2, echo=FALSE----------------------------------------------
data.frame(batch=factor(c(1,1,1,1,2,2)), condition=factor(c("A","A","B","B","C","C")))


## ----lineardep3, echo=FALSE----------------------------------------------
data.frame(batch=factor(c(1,1,1,2,2,2)), condition=factor(c("A","B","C","A","B","C")))


## ----groupeffect---------------------------------------------------------
(coldata <- data.frame(grp=factor(rep(c("X","Y"),each=4)),
                       ind=factor(rep(1:4,each=2)),
                       cnd=factor(rep(c("A","B"),4))))


## ----groupeffect2--------------------------------------------------------
coldata$ind.n <- factor(rep(rep(1:2,each=2),2))
coldata


## ----groupeffect3--------------------------------------------------------
model.matrix(~ grp + grp:ind.n + grp:cnd, coldata)


## ----missingcombo--------------------------------------------------------
group <- factor(rep(1:3,each=6))
condition <- factor(rep(rep(c("A","B","C"),each=2),3))
(d <- data.frame(group, condition)[-c(17,18),])


## ----missingcombo2-------------------------------------------------------
m1 <- model.matrix(~ condition*group, d)
colnames(m1)
unname(m1)


## ----missingcombo3-------------------------------------------------------
m1 <- m1[,-9]
unname(m1)


## ----cooksPlot-----------------------------------------------------------
W <- res$stat
maxCooks <- apply(assays(dds)[["cooks"]],1,max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", 
     ylab="maximum Cook's distance per gene",
     ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3))
m <- ncol(dds)
p <- 3
abline(h=qf(.99, p, m - p))


## ----indFilt-------------------------------------------------------------
plot(res$baseMean+1, -log10(res$pvalue),
     log="x", xlab="mean of normalized counts",
     ylab=expression(-log[10](pvalue)),
     ylim=c(0,30),
     cex=.4, col=rgb(0,0,0,.3))


## ----histindepfilt, dev="pdf", fig.width=7, fig.height=5-----------------
use <- res$baseMean > attr(res,"filterThreshold")
table(use)
h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")


## ----fighistindepfilt----------------------------------------------------
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
        col = colori, space = 0, main = "", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
     adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))


## ----sortP, cache=TRUE---------------------------------------------------
resFilt <- res[use & !is.na(res$pvalue),]
orderInPlot <- order(resFilt$pvalue)
showInPlot <- (resFilt$pvalue[orderInPlot] <= 0.08)
alpha <- 0.1


## ----sortedP, fig.width=4.5, fig.height=4.5------------------------------
plot(seq(along=which(showInPlot)), resFilt$pvalue[orderInPlot][showInPlot],
     pch=".", xlab = expression(rank(p[i])), ylab=expression(p[i]))
abline(a=0, b=alpha/length(resFilt$pvalue), col="red3", lwd=2)


## ----doBH, echo=FALSE, results="hide"------------------------------------
whichBH <- which(resFilt$pvalue[orderInPlot] <= alpha * seq(along=resFilt$pvalue)/length(resFilt$pvalue))
whichBH <- seq_len(max(whichBH))
## Test some assertions:
## - whichBH is a contiguous set of integers from 1 to length(whichBH)
## - the genes selected by this graphical method coincide with those
##   from p.adjust (i.e. padjFilt)
stopifnot(length(whichBH)>0,
          identical(whichBH, seq(along=whichBH)),
          resFilt$padj[orderInPlot][ whichBH] <= alpha,
          resFilt$padj[orderInPlot][-whichBH]  > alpha)


## ----SchwSpjot, echo=FALSE, results="hide"-------------------------------
j  <- round(length(resFilt$pvalue)*c(1, .66))
px <- (1-resFilt$pvalue[orderInPlot[j]])
py <- ((length(resFilt$pvalue)-1):0)[j]
slope <- diff(py)/diff(px)


## ----SchwederSpjotvoll, fig.width=4.5, fig.height=4.5--------------------
plot(1-resFilt$pvalue[orderInPlot],
     (length(resFilt$pvalue)-1):0, pch=".",
     xlab=expression(1-p[i]), ylab=expression(N(p[i])))
abline(a=0, slope, col="red3", lwd=2)


## ----vanillaDESeq, eval=FALSE--------------------------------------------
## dds <- DESeq(dds, minReplicatesForReplace=Inf)
## res <- results(dds, cooksCutoff=FALSE, independentFiltering=FALSE)


## ----sessInfo, results="asis", echo=FALSE--------------------------------
toLatex(sessionInfo())


## ----resetOptions, results="hide", echo=FALSE----------------------------
options(prompt="> ", continue="+ ")