%% \VignetteIndexEntry{Project Overview -- Slides}
%% \VignetteEngine{knitr::knitr}

\documentclass[xcolor=dvipsnames]{beamer}
\usepackage{BioconductorSlides}
\usepackage[round]{natbib}

\hypersetup{colorlinks,linkcolor=,urlcolor=Blue}
\AtBeginSection[]
{
  \begin{frame}<beamer>{Outline}
    \tableofcontents[currentsection]
  \end{frame}
}

<<cache-knitr, echo=FALSE>>=
knitr::opts_chunk$set(cache=TRUE)
@ 

\begin{document}

\title{\R{} / \Bioconductor{} for Epigenomic Analysis}
\author{Martin Morgan \url{mtmorgan@fhcrc.org} \\ 
  Fred Hutchinson Cancer Research Center \\
  Seattle, WA, USA}
\date{25 August 2014}
\maketitle

\section*{Introduction}

\begin{frame}{\R{} and \Bioconductor{}}
  \R, \url{http://r-project.org}
  \begin{itemize}
  \item Open-source, statistical programming language; widely used in
    academia, finance, pharma, \ldots
  \item Core language, `base' and $>5000$ contributed packages
  \item Interactive sessions, scripts, packages
  \end{itemize}
  \Bioconductor, \url{http://bioconductor.org}
  \begin{itemize}
  \item Analysis and comprehension of high-throughput genomic data
  \item Themes: rigorous statistical analysis; reproducible work
    flows; integrative analysis
  \item $>12$ years old, $825$ packages
  \item Courses, conferences (package developers: travel scholarships
    available!), mailing list, \ldots
  \end{itemize}
\end{frame}

\begin{frame}{Bioconductor}
  \begin{center}
    \includegraphics{figures/worldmap.png}
  \end{center}
  Trailing 12 month statistics
  \begin{itemize}
  \item 1702 PubMed Central full-text citations
  \item 9.5M package downloads to 242,000 distinct IP addresses
  \item 1M sessions from 400k visitors to web site
  \item Annual conferences; courses; active mailing list; \ldots
  \end{itemize}
\end{frame}

\section*{Epigenomics}

\begin{frame}{Epigenomics}
  \begin{enumerate}
  \item Methylation
    \begin{itemize}
    \item Illumina 450k arrays
    \item Whole genome and reduced representation bisulfite sequencing
    \end{itemize}
  \item ChIP-Seq
  \item Integration \& visualization
    \begin{itemize}
    \item Data types, e.g., expression
    \item Consortium resources \emph{via} \Biocpkg{AnnotationHub}
    \end{itemize}
  \item A key data structure: \Rclass{GRanges}
  \end{enumerate}
  Find relevant packages using
  \href{http://bioconductor.org/packages/release/BiocViews.html}{BiocViews},
  in addition to standard scholarly approaches.
\end{frame}
  
\section*{Methylation}

\begin{frame}{Methylation: selected packages}
  450k arrays
  \begin{itemize}
  \item \Biocpkg{minfi} -- pre-processing, differential methylation
  \item \Biocpkg{ChAMP} -- comprehensive work flow
  \end{itemize}
  Bisulfite sequencing
  \begin{itemize}
  \item \Biocpkg{bsseq} -- whole genome
  \item \Biocpkg{BiSeq} -- reduced representation
  \end{itemize}
\end{frame}

\begin{frame}{Methylation: Illumina 450k arrays}
  \begin{center}
    \includegraphics{figures/600px-Illuminamethylationworkflow.png}
  \end{center}
  \par\url{http://en.wikipedia.org/wiki/Illumina_Methylation_Assay}
\end{frame}

\begin{frame}{Methylation: \Biocpkg{minfi}}
  Work flow steps and representative functions
  \begin{enumerate}
  \item Data input: \Rfunction{read.450k.exp()}
  \item Quality assessment: \Rfunction{densityPlot()}
  \item Pre-processing, e.g., background correction, normalization: 
    \Rfunction{preprocessIllumina()}
  \item Differentially methylated probes: \Rfunction{dmpFinder()}
  \item Differentially methylated regions: \Rfunction{bumphunter()}
  \end{enumerate}
  See the
  \href{http://bioconductor.org/packages/release/bioc/html/minfi.html}{vignette}
  for additional detail.
  \bigskip\par
  \includegraphics[width=\textwidth]{figures/bsseq_analysis-1.png}
\end{frame}

%% \begin{frame}{Methylation: \Biocpkg{minfi} exemplar}
%%   Hansen et al., 2011, Nature Genetics 43, 
%%   \href{http://www.nature.com/ng/journal/v43/n8/full/ng.865.html}{768-775}
%%   \begin{itemize}
%%   \item Scientific finding: stochastic methylation variation of
%%     cancer-specific de-methylated regions (DMR), distinguishing cancer from
%%     normal tissue, in several cancers.
%%   \item Statistical challenges: non-specific filtering, $t$
%%     statistics, find DMRs, smoothing
%%   \end{itemize}
%% \end{frame}

\section*{ChIP-seq}

%% \begin{frame}{ChIP-seq}
%%   \begin{columns}
%%     \column{.5\textwidth}
%%     \includegraphics[width=\textwidth]{figures/protocol.png}
%%     \column{.5\textwidth}
%%     Chromatin immunoprecipitation, followed by sequencing
%%     \begin{itemize}
%%     \item Determine location of proteins bound to DNA
%%     \end{itemize}
%%     Useful for detecting
%%     \begin{itemize}
%%     \item Transcription factor binding sites
%%     \item Histone modifications
%%     \end{itemize}
%%     Common questions
%%     \begin{itemize}
%%     \item Which genes is this TF regulating?
%%     \item How do histone modifications affect expression?
%%     \end{itemize}
%%   \end{columns}
%% \end{frame}

\begin{frame}{ChIP-seq: peak calling}
  \includegraphics[width=\textwidth]{figures/nbt-1508-F1.jpg}
  \footnote{\citet{pmid19029915}}
  \begin{itemize}
  \item Chromatin immunoprecipitation, followed by sequencing to
    determine location of proteins bound to DNA
  \item Useful for locating transcription factor binding sites,
    histone modifications, \ldots
  \end{itemize}
\end{frame}

\begin{frame}{ChIP-seq work flow}
  \begin{columns}
    \column{.5\textwidth}
    \includegraphics[width=\textwidth]{figures/ChIPSeq-workflow}
    \par Analysis overview: \citet{10.1371/journal.pcbi.1003326}
    \column{.5\textwidth}
    \begin{itemize}
    \item Annotation: what genes are my peaks near?
    \item Differential representation: which peaks are over- or
      under-represented in treatment 1, compared to treatment 2?
    \item Motif identification (peaks over known motifs?)  and
      discovery
    \item Integrative analysis, e.g., association of regulatory
      elements and expression
    \end{itemize}
  \end{columns}
\end{frame}

\begin{frame}[fragile]{ChIP-seq quality assessment: \Biocpkg{ChIPQC}}
  Inputs: BAM files (raw data) and BED files (called peaks)
<<ChIPQC, eval=FALSE>>=
experiment <- ChIPQC(samples)
ChIPQCreport(experiment)
@ 
  Output: HTML report ---
  \url{http://starkhome.com/ChIPQC/Reports/tamoxifen/ChIPQC.html}
\end{frame}

\begin{frame}[fragile]{ChIP-seq annotation: \Biocpkg{ChIPpeakAnno},
    \Biocpkg{ChIPseeker}}
  Inputs
  \begin{itemize}
  \item Peaks: e.g., from \Rcode{rtracklayer::import()} BED files
  \item Annotation: gene boundaries or queries to \Biocpkg{biomaRt}
  \end{itemize}
<<ChIPpeakAnno, eval=FALSE>>=
library(ChIPpeakAnno)
## ...
annotated <- annotatePeakInBatch(peaks,
    AnnotationData=annotation)
@ 
\noindent Output: \Rclass{RangedData} of peaks annotated with near-by features.
\end{frame}

\begin{frame}[fragile]{ChIP-seq differential representation: \Biocpkg{DiffBind}}
  Inputs: called peaks and raw BED or BAM files
<<DiffBind, eval=FALSE>>=
library(DiffBind)
tamoxifen = dba(sampleSheet="tamoxifen.csv")
tamoxifen = dba.count(tamoxifen)
tamoxifen = dba.contrast(tamoxifen,
    categories=DBA_CONDITION)
tamoxifen = dba.analyze(tamoxifen)
tamoxifen.DB = dba.report(tamoxifen)
@ 
\noindent Outputs: diagnostics, visualizations, and `top table' of
differentially bound regions.
\end{frame}

%% \begin{frame}{ChIP-seq motifs}
%%   Identification
%%   \begin{itemize}
%%   \item \href{http://jaspar.genereg.net/}{JASPAR} and other motif
%%     catalogs
%%   \item Position Weight Matrix describing probability of nucleotide(s)
%%     at each position
%%   \item Scan genome / under peaks for known motifs
%%   \item \Biocpkg{MotifDb}, \Rfunction{matchPWM} (\Biocpkg{Biostrings});
%%   \item FIMO, etc
%%   \end{itemize}
%%   Discovery
%%   \begin{itemize}
%%   \item Collate sequences under peaks, search for recurrent sequences
%%   \item e.g.,
%%     \href{http://meme.nbcr.net/meme/cgi-bin/dreme.cgi}{DREME} /
%%     \href{http://meme.nbcr.net/meme/memechip-intro.html}{MEME-ChIP}
%%   \end{itemize}
%%   Also: enrichment, regulatory modules (2+ motifs co-occurring),
%%   function, \ldots
%% \end{frame}

\section*{Integration \& Visualization}

\begin{frame}{Integration \& visualization}
  \begin{itemize}
  \item Combining multiple data types
    \begin{itemize}
    \item \Biocpkg{Rcade}, \Biocpkg{Repitools}: ChIP / expression
    \end{itemize}
  \item Import / export from common formats (BED, WIG, \ldots)
    \begin{itemize}
    \item \Biocpkg{rtracklayer} \Rfunction{import()}, \Rfunction{export()}
    \end{itemize}
  \item \Biocpkg{AnnotationHub}: accessing large-scale resources,
    e.g., ENCODE tracks
  \item Visualization
  \end{itemize}
\end{frame}

%% AnnotationHub

\begin{frame}[fragile]{Integration \& visualization: \Biocpkg{AnnotationHub}}
\begin{verbatim}
> library(AnnotationHub)
> hub = AnnotationHub()
> hub
class: AnnotationHub 
length: 10780 
filters: none 
hubUrl: http://annotationhub.bioconductor.org/ah 
snapshotVersion: 2.14/1.4.0; snapshotDate: 2014-05-15
hubCache: /home/mtmorgan/.AnnotationHub 
> hub$<tab>
hub$dbSNP.organisms.human_9606.VCF. ... [302]
hub$haemcode.blood. ... [899]
hub$ensembl.release. ... [2611]
hub$inparanoid8.Orthologs.hom. ... [265]
hub$goldenpath. ... [6699]
hub$refnet. ... [4]
\end{verbatim}
\end{frame}

%% Visualization

\begin{frame}{Integration \& visualization}
  \begin{columns}
    \column{.5\textwidth}
    \Biocpkg{Gviz}\par
    \only<1>{
    \begin{itemize}
    \item Static track-like visualizations
    \item Data panels
    \end{itemize}
    }
    \Biocpkg{ggbio}\par
    \only<2>{
    \begin{itemize}
    \item Comprehensive visualizations
    \item \Rfunction{autoplot} file and data types
    \end{itemize}
    }
    \Biocpkg{epivizr}\par
    \column{.5\textwidth}
    \only<1>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-1.png}}
    %% \only<2>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-2.png}}
    \only<2>{\includegraphics[width=\textwidth]{figures/ggbio-vignette-1.png}}
  \end{columns}
\end{frame}

\begin{frame}{Integration \& visualization}
  \Biocpkg{Gviz}\par
  \Biocpkg{ggbio}\par
  \Biocpkg{epivizr}
  \begin{center}
    \includegraphics[height=.35\textheight]{figures/epivisr.png}
  \end{center}
  \begin{itemize}
  \item Genome browser with tight communication to \R{} / \Bioconductor{}
  \item Flexible \emph{interaction}, \emph{representation} and
    \emph{computation}, e.g., `brushing'
  \end{itemize}
\end{frame}

\section*{Genomic Ranges}

\begin{frame}{Genomic ranges for data integration}
  \begin{itemize}
  \item Chromosome, start, end, strand define a \emph{genomic range}
  \item Data (reads, CpG islands, peaks, \ldots) are genomic ranges
  \item Annotations (exons, genes, binding sites, \ldots) are genomic
    ranges
  \end{itemize}
  \includegraphics[width=\textwidth]{figures/GRanges.pdf}
  \par \Biocpkg{GenomicRanges}, \Biocpkg{GenomicAlignments} packages
\end{frame}

\begin{frame}{Operating on genomic ranges}
  \includegraphics[width=\textwidth]{figures/RangeOperations.pdf}
\end{frame}

\section*{Conclusions}

\begin{frame}
  \begin{center}
    \includegraphics[width=!,
    height=.2\textheight]{figures/bioconductor_logo_cmyk.pdf}
  \end{center}
  Funding
  \begin{itemize}
  \item US NIH / NHGRI 2U41HG004059; NSF 1247813
  \end{itemize}
  People
  \begin{itemize}
  \item Seattle Bioconductor team: Sonali Arora, Marc Carlson, Nate
    Hayden, Valerie Obenchain, Herv\'e Pag\`es, Dan Tenenbaum
  \item Vincent Carey, Robert Gentleman, Rafael Irizarry, Sean Davis,
    Kasper Hansen, Michael Lawrence, Levi Waldron
  \item International community of \Bioconductor{} developers and
    users
  \end{itemize}
\end{frame}


\begin{frame}[allowframebreaks]{References}
  \bibliographystyle{abbrvnat}
  \bibliography{ChIPSeq}
\end{frame}

\end{document}