% \VignetteEngine{knitr::knitr}
% \VignetteIndexEntry{04. Bioconductor for Sequence Analysis -- slides}

\documentclass[xcolor=dvipsnames]{beamer}

\usepackage{BioconductorSlides}
\hypersetup{colorlinks,linkcolor=,urlcolor=Blue}

\title{\Bioconductor{} for Sequence Analysis}
\author{Martin T.\ Morgan\footnote{\url{mtmorgan@fhcrc.org}}}
\date{27-28 February 2014}

\begin{document}

\maketitle

\begin{frame}{Introduction: What is \Bioconductor{} good for?}
  \begin{itemize}
  \item Sequencing: RNA-seq, ChIP-seq, called variants, \ldots 
    \begin{itemize}
    \item Especially \emph{after} assembly / alignment
    \end{itemize}
  \item Annotation: genes, pathways, gene models (exons, transcripts,
    etc.), \ldots
  \item Microarrays: expression, copy number, SNPs, methylation, \ldots
  \item Flow cytometry, proteomics, image analysis, high-throughput
    screens, \ldots
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Sequencing: Work flows}
  \begin{columns}
    \column{.5\textwidth}
    \begin{enumerate}
    \item Experimental design
    \item `Wet lab' sample prep
    \item Sequencing
        \begin{itemize}
        \item 100's of millions of reads
        \item 30-150 nucleotides
        \item Single and paired-end
        \item Bar codes, lanes \& flow cells
        \end{itemize}
    \item Alignment
    \item Analysis: DNA, RNA, epigenetics, integrative, microbiome,
      \ldots
    \end{enumerate}
    \column{.5\textwidth}
    \includegraphics[width=\textwidth,height=!]{figures/Solexa-bridge-pcr.jpg}
    \par Bentley et al., 2008, Nature 456:
    \href{http://www.ncbi.nlm.nih.gov/pubmed/18987734}{53-9}
  \end{columns}
\end{frame}

{\scriptsize\begin{verbatim}
@ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1
CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT
+
HHGHHGHHHHHHHHDGG<GDGGE@GDGGD<?B8??ADAD<BE@EE8EGDGA3CB85*,77@>>CE?=896=:
@ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1
GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC
+
DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?########################
@ERR127302.1705 HWI-EAS350_0441:1:1:1460:13054#0/1
AAAACACCCTGCAATCTTTCAGACAGGATGTTGACAATGCGTCTCTGGCACGTCTTGACCTTGAACGCAAAG
+
EEDEE>AD>BBGGB8E8EEEGBGGGGBGGGGG3G>E3*?BE??BBC8GB8??:??GGDGDDD>D>B<GDDC8
@ERR127302.1706 HWI-EAS350_0441:1:1:1460:14924#0/1
CACCCAGTGGGGTGGAGTCGGAGCCACTGGTCCTGCTGCTGGCTGCCTCTCTGCTCCACCTTGTGACCCAGG
+
HHHHHGEEGEEADDGDBG>GGD8EG,<6<?AGGADFEHHC@>D@<@G@>AB@B?8AA>CE@D8@B=?CC>AG
@ERR127302.1707 HWI-EAS350_0441:1:1:1461:6983#0/1
CGACGCTGACACCGGAACGGCAGCAGCAGCAGGACGATTAAGACAAGGAGGATGGCTCCACAGACGCTCATG
+
GEEGEGE@GGGGGGEGGGGGBB>G3?33?8*;;79?<9@?DD8@DDEE888;-BB?.A##############
@ERR127302.1708 HWI-EAS350_0441:1:1:1461:10827#0/1
AAAGAAGGTCCTTGCAATAGACTGCCTCTGCTTGAGAACTTATGATGTAATTATTGCATGCTGCTAATATAC
+
GGGGGDDEBFGGGGGBE,DAGDDGGGEEEG<EEFDECFFEEEDE@<>ACEBEFDEEFE<EDC@E<EECCBEB
@ERR127302.1709 HWI-EAS350_0441:1:1:1461:7837#0/1
CAGCCACAGAACCACGGCACGGAAGACATGAGGCAGCATGCTCACGAGAGAGGTGAGGGTCTCCCCTCCAGG
+
HHGHHHH>DH:@.7@49;88G8>G>DDG@D>D@G@GE>@DDBDDG<A82?######################
\end{verbatim}}

\begin{frame}[fragile]{Sequencing: The \Biocpkg{ShortRead} package}
<<ShortRead, eval=FALSE, tidy=FALSE>>=
## Use the 'ShortRead' package
library(ShortRead)
## Create an object to represent a sample from a file
sampler <- FastqSampler("ERR127302_1.fastq.gz")
## Apply a method to yield a random sample
fq <- yield(sampler)
## Access sequences of sampled reads using `sread()`
## Summarize nucleotide use by cycle
## 'abc' is a nucleotide x cycle matrix of counts
abc <- alphabetByCycle(sread(fq))
## Subset of interesting nucleotides
abc <- abc[c("A", "C", "G", "T", "N"),]
@     
\end{frame}

\begin{frame}[fragile]{Sequencing: The \Biocpkg{ShortRead} package}
  \begin{columns}
    \column{.5\textwidth}
<<ShortRead-vis, eval=FALSE, tidy=FALSE>>=
## Create a plot from a
## matrix
matplot(t(abc), type="l",
  lty=1, lwd=3, 
  xlab="Cycle",
  ylab="Count",
  cex.lab=2)
## Add a legend
legend("topright", 
  legend=rownames(abc),
  lty=1, lwd=3, col=1:5,
  cex=1.8)
@     
    \column{.5\textwidth}
    \includegraphics[width=\textwidth]{figures/abc}
  \end{columns}
\end{frame}

\begin{frame}{Sequencing: Essential packages and classes}
  \begin{itemize}
  \item \Biocpkg{Biostrings} and \Rclass{DNAStringSet}
  \item \Biocpkg{GenomicAlignments} and \Rclass{GAlignments}
  \item \Biocpkg{GenomicRanges} and \Rclass{GRanges}
  \item \Biocpkg{GenomicFeatures} and \Rclass{TranscriptDb}
  \item \Biocpkg{VariantAnnotation} and \Rclass{VCF}
  \item Input and output: \Biocpkg{rtracklayer} (WIG, BED, etc.),
    \Biocpkg{Rsamtools} (BAM), \Biocpkg{ShortRead} (FASTQ) file input
  \end{itemize}
\end{frame}

\section*{Sequencing: package tour}

\begin{frame}{Reads}
  \begin{description}
  \item[Data] Short reads and their qualities
  \item[Tasks] Input, quality assessment, summary, trimming, \ldots
  \item[Packages] \Biocpkg{ShortRead}, \Biocpkg{Biostrings}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{readFastq}, \Rfunction{FastqSampler},
      \Rfunction{FasqtStreamer}.
    \item \Rfunction{qa}, \Rfunction{report}.
    \item \Rfunction{alphabetFrequency}, \Rfunction{alphabetByCycle},
      \Rfunction{consensusMatrix}.
    \item \Rfunction{trimTails}, \Rfunction{trimLRPatterns},
      \Rfunction{matchPDict}, \ldots
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Alignments}
  \begin{description}
  \item[Data] BAM files of aligned reads
  \item[Tasks] Input, BAM file manipulation, pileups
  \item[Packages] \Biocpkg{GenomicAlignments}, \Biocpkg{Rsamtools}
    (also: \Biocpkg{GenomicRanges})
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{readGAlignments}
    \item \Rfunction{BamFile}, \Rfunction{BamFileList}
    \item \Rfunction{scanBam}, \Rfunction{ScanBamParam} (select a
      subset of the BAM file)
    \item \Rfunction{asBam}, \Rfunction{sortBam},
      \Rfunction{indexBam}, \Rfunction{mergeBam},
      \Rfunction{filterBam}
    \item \Rfunction{BamSampler}, \Rfunction{applyPileups}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Ranges}
  \begin{description}
  \item[Data] Genomic coordinates to represent data (e.g., aligned
    reads) or annotation (e.g., gene models).
  \item[Tasks] Input, counting, coverage, manipulation, \ldots
  \item[Packages] \Biocpkg{GenomicRanges}, \Biocpkg{IRanges}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{readGAlignments}, \Rfunction{readGAlignmentsList}
    \item Many intra-, inter-, and between-range manipulating, e.g.,
      \Rfunction{narrow}, \Rfunction{flank}, \Rfunction{shift},
      \Rfunction{intersect}, \Rfunction{findOverlaps},
      \Rfunction{countOverlaps}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Variants}
  \begin{description}
  \item[Data] VCF (Variant Call Format) file
  \item[Tasks] Calling, input, summary, coding consequences
  \item[Packages] \Biocpkg{VariantTools} (linux only),
    \Biocpkg{VariantAnnotation}, \Biocpkg{ensemblVEP}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{tallyVariants}
    \item \Rfunction{readVcf}, \Rfunction{locateVariants},
      \Rfunction{predictCoding}
    \item Also: SIFT, PolyPhen data bases
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Annotations}
  \begin{description}
  \item[Data] Gene symbols or other identifiers
  \item[Tasks] Discover annotations associated with genes or symbols
  \item[Packages] \Biocpkg{AnnotationDbi} (\Rpackage{org.*},
    \Biocannopkg{GO.db}, \ldots), \Biocpkg{biomaRt}
  \item[Functions]
    \begin{itemize}
    \item Discovery: \Rfunction{columns}, \Rfunction{keytype},
      \Rfunction{keys}
    \item \Rfunction{select}, \Rfunction{merge}
    \item \Biocpkg{biomaRt}: \Rfunction{listMarts},
      \Rfunction{listDatasets}, \Rfunction{listAttributes},
      \Rfunction{listFilters}, \Rfunction{getBM}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Features}
  \begin{description}
  \item[Data] Genomic coordinates
  \item[Tasks] Group exons by transcript or gene; discover transcript
    / gene identifier mappings
  \item[Packages] \Biocpkg{GenomicFeatures} and \Rpackage{TxDb.*}
    packages (also: \Biocpkg{rtracklayer})
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{exonsBy}, \Rfunction{cdsBy}, \Rfunction{transcriptsBy}
    \item \Rfunction{select} (see Annotations, below)
    \item \Rfunction{makeTranscriptDb*}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Genome annotations}
  \begin{description}
  \item[Data] FASTA, GTF, VCF, \ldots from internet resources
  \item[Tasks] Define regions of interests; incorporate known features
    (e.g., ENCODE marks, dbSNP variants) in work flows
  \item[Packages] \Biocpkg{AnnotationHub}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{AnnotationHub}, \Rfunction{filters}
    \item \Rfunction{metadata}, \Rcode{hub\$<tab>}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Sequences}
  \begin{description}
  \item[Data] Whole-genome sequences
  \item[Tasks] View sequences, match position weight matricies, match
    patterns
  \item[Packages] \Biocpkg{Biostrings}, \Biocpkg{BSgenome}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{available.genomes}
    \item \Rcode{Hsapiens[["chr3"]]}, \Rfunction{getSeq}, \Rfunction{mask}
    \item \Rfunction{matchPWM}, \Rfunction{vcountPattern}, \ldots
    \item \Rfunction{forgeBSgenomeDataPkg}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{Import / export}
  \begin{description}
  \item[Data] Common text-based formats, \texttt{gff}, \texttt{wig},
    \texttt{bed}; UCSC tracks
  \item[Tasks] Import and export
  \item[Packages] \Biocpkg{rtracklayer}
  \item[Functions]
    \begin{itemize}
    \item \Rfunction{import}, \Rfunction{export}
    \item \Rfunction{browserSession}, \Rfunction{genome}
    \end{itemize}
  \end{description}
\end{frame}

\begin{frame}{And\ldots}
  
    Data representation: \Biocpkg{IRanges}, \Biocpkg{GenomicRanges},
    \Biocpkg{GenomicFeatures}, \Biocpkg{Biostrings},
    \Biocpkg{BSgenome}, \Biocpkg{girafe}.
    Input / output: \Biocpkg{ShortRead} (fastq), \Biocpkg{Rsamtools}
    (bam), \Biocpkg{rtracklayer} (gff, wig, bed),
    \Biocpkg{VariantAnnotation} (vcf), \Biocpkg{R453Plus1Toolbox}
    (454).
    Annotation: \Biocpkg{GenomicFeatures}, \Biocpkg{ChIPpeakAnno},
    \Biocpkg{VariantAnnotation}.
    Alignment: \Biocpkg{Rsubread}, \Biocpkg{Biostrings}.
    Visualization: \Biocpkg{ggbio}, \Biocpkg{Gviz}.
    Quality assessment: \Biocpkg{qrqc}, \Biocpkg{seqbias},
    \Biocpkg{ReQON}, \Biocpkg{htSeqTools}, \Biocpkg{TEQC},
    \Biocpkg{Rolexa}, \Biocpkg{ShortRead}.
    RNA-seq: \Biocpkg{BitSeq}, \Biocpkg{cqn}, \Biocpkg{cummeRbund},
    \Biocpkg{DESeq}, \Biocpkg{DEXSeq}, \Biocpkg{EDASeq},
    \Biocpkg{edgeR}, \Biocpkg{gage}, \Biocpkg{goseq},
    \Biocpkg{iASeq}, \Biocpkg{tweeDEseq}.
    ChIP-seq, etc.: \Biocpkg{BayesPeak}, \Biocpkg{baySeq},
    \Biocpkg{ChIPpeakAnno}, \Biocpkg{chipseq}, \Biocpkg{ChIPseqR},
    \Biocpkg{ChIPsim}, \Biocpkg{CSAR}, \Biocpkg{DiffBind},
    \Biocpkg{MEDIPS}, \Biocpkg{mosaics}, \Biocpkg{NarrowPeaks},
    \Biocpkg{nucleR}, \Biocpkg{PICS}, \Biocpkg{PING},
    \Biocpkg{REDseq}, \Biocpkg{Repitools}, \Biocpkg{TSSi}.
    Motifs: \Biocpkg{BCRANK}, \Biocpkg{cosmo}, \Biocpkg{cosmoGUI},
    \Biocpkg{MotIV}, \Biocpkg{seqLogo}, \Biocpkg{rGADEM}.
    3C, etc.: \Biocpkg{HiTC}, \Biocpkg{r3Cseq}.
    Copy number: \Biocpkg{cn.mops}, \Biocpkg{CNAnorm},
    \Biocpkg{exomeCopy}, \Biocpkg{seqmentSeq}.
    Microbiome: \Biocpkg{phyloseq}, \Biocpkg{DirichletMultinomial},
    \Biocpkg{clstutils}, \Biocpkg{manta}, \Biocpkg{mcaGUI}.
    Work flows: \Biocpkg{ArrayExpressHTS}, \Biocpkg{Genominator},
    \Biocpkg{easyRNASeq}, \Biocpkg{oneChannelGUI},
    \Biocpkg{rnaSeqMap}.
    Database: \Biocpkg{SRAdb}. \ldots
\end{frame}

\section*{Exemplars}

\begin{frame}{Exemplars: Algorithms to action}
  \begin{enumerate}
  \item Batch effects
  \item Methylation
  \item RNA-seq Differential Representation
  \item Visualization
  \end{enumerate}
\end{frame}

\begin{frame}{Exemplar: Differential Representation}
  Haglund et al., 2012
  \href{http://www.ncbi.nlm.nih.gov/pubmed/23024189}{J Clin Endocrin
    Metab}
  \bigskip\par
  \begin{columns}
    \column{.5\textwidth}
    \begin{itemize}
    \item Scientific finding: identify genes whose expression is
      regulated by estrogen receptors in parathyroid adenoma cells
    \item Statistical challenges: between-sample normalization;
      appropriate statistical model; efficient estimation; \ldots
    \end{itemize}
    \column{.5\textwidth}
    \includegraphics[width=\textwidth]{figures/DESeq2_parathyroid-plotMApadjchange.png}
  \end{columns}
  \bigskip\par\Bioconductor{} support: \Biocpkg{DESeq2}, \Biocpkg{edgeR}, many
  statistical `lessons learned' from microarrays; extensive
  integration with down-stream tools
\end{frame}

\begin{frame}{Exemplar: Batch Effects}
  Leek et al., 2010, Nature Reviews Genetics 11,
  \href{http://www.nature.com/nrg/journal/v11/n10/abs/nrg2825.html}{733-739},
  Leek \& Story
  \href{http://dx.doi.org/10.1371/journal.pgen.0030161}{PLoS Genet
    3(9): e161}
  \begin{columns}
    \column{.5\textwidth} 
    \begin{itemize}
    \item Scientific finding: pervasive batch effects
    \item Statistical insights: surrogate variable analysis: identify
      and build surrogate variables; remove known batch effects
    \item Benefits: reduce dependence, stabilize error rate estimates,
      and improve reproducibility
    \end{itemize}
    \Bioconductor{} support: \Biocpkg{sva}
    \column{.5\textwidth}
    \only<1>{
      \includegraphics[width=\textwidth]{figures/nrg2825-f2.jpg}
      \par{\small HapMap samples from one facility, ordered by
        date of processing.  From }
    }
    \only<2>{
      \begin{enumerate}
      \item Remove signal due to variable(s) of interest
      \item Identify subset of genes driving orthogonal signatures
        of EH
      \item Build a surrogate variable based on full EH signature of
        that subset
      \item Include significant surrogate variables as covariates
      \end{enumerate}
      EH: expression heterogeneity
    }
  \end{columns}
\end{frame}

\begin{frame}{Exemplar: Methylation}
  Hansen et al., 2011, Nature Genetics 43, 
  \href{http://www.nature.com/ng/journal/v43/n8/full/ng.865.html}{768-775}
  \begin{itemize}
  \item Scientific finding: stochastic methylation variation of
    cancer-specific de-methylated regions (DMR), distinguishing cancer from
    normal tissue, in several cancers.
  \item Statistical challenges: smoothing, non-specific filtering, $t$
    statistics, find DMRs
  \end{itemize}
  \bigskip\par
  \includegraphics[width=\textwidth]{figures/bsseq_analysis-1.png}
  \medskip\par \Bioconductor{} support: whole-genome (\Biocpkg{bsseq})
  or reduced representation (\Biocpkg{MethylSeekR}) bisulfite
  sequencing; Illumina 450k arrays (\Biocpkg{minfi})
\end{frame}

\begin{frame}{Exemplar: Visualization}
  \begin{columns}
    \column{.5\textwidth}
    \Biocpkg{Gviz}\par
    \only<1-2>{
    \begin{itemize}
    \item Track-like visualizations
    \item Data panels
    \item Fully integrated with \Bioconductor{} sequence
      representations
    \end{itemize}
    }
    \Biocpkg{ggbio}\par
    \only<3>{
    \begin{itemize}
    \item Comprehensive visualizations
    \item \Rfunction{autoplot} file and data types
    \item Fully integrated with \Bioconductor{} sequence
      representations
    \end{itemize}
    }
    \Biocpkg{epivizr}\par
    \only<4>{
      \begin{itemize}
      \item Genome browser with socket communication to \R{}
      \item Fully integrated with \Bioconductor{} sequence
        representations
      \end{itemize}
    }
    \column{.5\textwidth}
    \only<1>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-1.png}}
    \only<2>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-2.png}}
    \only<3>{\includegraphics[width=\textwidth]{figures/ggbio-vignette-1.png}}
    \only<4>{\includegraphics[width=\textwidth]{figures/epivisr.png}}
  \end{columns}
\end{frame}


\begin{frame}{Principles: Some key points}
  \begin{itemize}
  \item \R{} is a high-level programming language, so lots can be
    accomplished with just a little code
  \item Packages such as \Biocpkg{ShortRead} provide a great way to
    benefit from the expertise of others (and to contribute your own
    expertise back to the community!)
    \begin{itemize}
    \item The path from `user' to `developer' is not that long, and
      has been taken by many!
    \end{itemize}
  \item Objects and methods such as \Rclass{data.frame},
    \Rclass{ShortReadQ} and \Rcode{alphabetByCycle()}) help to manage
    complicated data
    \begin{itemize}
    \item Reducing possibility for clerical and other mistakes
    \item Facilitating inter-operability between different parts of an
      analysis
    \end{itemize}
  \item Scripts make work flows reproducible
  \item Visualizing data is an important part of exploratory analysis
  \end{itemize}
\end{frame}

\begin{frame}{Principles: Successful computational biology software}
  \begin{enumerate}
  \item Extensive: software, annotation, integration
      \begin{itemize}
      \item 750 inter-operable \Bioconductor{} packages
      \end{itemize}
  \item Statistical: volume, technology, experimental design
      \begin{itemize}
      \item \R{} a `natural' for statistical analysis
      \end{itemize}
  \item Reproducible: long-term, multi-participant science
      \begin{itemize}
      \item Objects, scripts, vignettes, packages, \ldots encourage
        reproducible research
      \end{itemize}
  \item Leading edge: novel, technology-driven
      \begin{itemize}
      \item Packages and user community closely track leading edge
        science
      \end{itemize}
  \item Accessible: affordable, transparent, usable
      \begin{itemize}
      \item \Bioconductor{} is free and open, with extensive
        documentation and an active and supportive user community
      \end{itemize}
  \end{enumerate}
  Case study: differential expression of known genes; see also
  \href{http://bioconductor.org/help/course-materials/2013/EMBOBGI/reproducible-research.pdf}{reproducible
    research} lecture.
\end{frame}

\begin{frame}{Challenges \& Opportunities}
  \begin{itemize}
  \item Big data -- transparent management within \R, facile use of
    established resources
  \item Developer and user training
  \end{itemize}
  Resources
  \begin{itemize}
  \item \url{http://r-project.org}, \emph{An Introduction to \R}
    manual; Dalgaard, \emph{Introductory Statistics with \R};
    \href{http://rfordummies.com/}{\R{} for Dummies}
  \item \url{http://bioconductor.org/}
  \item \url{http://rstudio.org}
  \item 
    \href{http://stackoverflow.com/questions/tagged/r}{StackOverflow},
    \Bioconductor{}
    \href{http://bioconductor.org/help/mailing-list/mailform/}{mailing
      list}
  \end{itemize}
\end{frame}


\end{document}