%% \VignetteIndexEntry{Project Overview -- Slides} %% \VignetteEngine{knitr::knitr} \documentclass[xcolor=dvipsnames]{beamer} \usepackage{BioconductorSlides} \usepackage[round]{natbib} \hypersetup{colorlinks,linkcolor=,urlcolor=Blue} \AtBeginSection[] { \begin{frame}{Outline} \tableofcontents[currentsection] \end{frame} } <>= knitr::opts_chunk$set(cache=TRUE) @ \begin{document} \title{\R{} / \Bioconductor{} for Epigenomic Analysis} \author{Martin Morgan \url{mtmorgan@fhcrc.org} \\ Fred Hutchinson Cancer Research Center \\ Seattle, WA, USA} \date{25 August 2014} \maketitle \section*{Introduction} \begin{frame}{\R{} and \Bioconductor{}} \R, \url{http://r-project.org} \begin{itemize} \item Open-source, statistical programming language; widely used in academia, finance, pharma, \ldots \item Core language, `base' and $>5000$ contributed packages \item Interactive sessions, scripts, packages \end{itemize} \Bioconductor, \url{http://bioconductor.org} \begin{itemize} \item Analysis and comprehension of high-throughput genomic data \item Themes: rigorous statistical analysis; reproducible work flows; integrative analysis \item $>12$ years old, $825$ packages \item Courses, conferences (package developers: travel scholarships available!), mailing list, \ldots \end{itemize} \end{frame} \begin{frame}{Bioconductor} \begin{center} \includegraphics{figures/worldmap.png} \end{center} Trailing 12 month statistics \begin{itemize} \item 1702 PubMedCentral full-text citations \item 9.5M package downloads to 242,000 distinct IP addresses \item 1M sessions from 400k visitors to web site \item Annual conferences; courses; active mailing list; \ldots \end{itemize} \end{frame} \section*{Epigenomics} \begin{frame}{Epigenomics} \begin{enumerate} \item Methylation \begin{itemize} \item Illumina 450k arrays \item Whole genome and reduced representation bisulfite sequencing \end{itemize} \item ChIP-Seq \item Integration \& visualization \begin{itemize} \item Data types, e.g., expression 
\item Consortium resources \emph{via} \Biocpkg{AnnotationHub} \end{itemize} \item A key data structure: \Rclass{GRanges} \end{enumerate} Find relevant packages using \href{http://bioconductor.org/packages/release/BiocViews.html}{BiocViews}, in addition to standard scholarly approaches. \end{frame} \section*{Methylation} \begin{frame}{Methylation: selected packages} 450k arrays \begin{itemize} \item \Biocpkg{minfi} -- pre-processing, differential methylation \item \Biocpkg{ChAMP} -- comprehensive work flow \end{itemize} Bisulfite sequencing \begin{itemize} \item \Biocpkg{bsseq} -- whole genome \item \Biocpkg{BiSeq} -- reduced representation \end{itemize} \end{frame} \begin{frame}{Methylation: Illumina 450k arrays} \begin{center} \includegraphics{figures/600px-Illuminamethylationworkflow.png} \end{center} \par\url{http://en.wikipedia.org/wiki/Illumina_Methylation_Assay} \end{frame} \begin{frame}{Methylation: \Biocpkg{minfi}} Work flow steps and representative functions \begin{enumerate} \item Data input: \Rfunction{read.450k.exp()} \item Quality assessment: \Rfunction{densityPlot()} \item Pre-processing, e.g., background correction, normalization: \Rfunction{preprocessIllumina()} \item Differentially methylated probes: \Rfunction{dmpFinder()} \item Differentially methylated regions: \Rfunction{bumphunter()} \end{enumerate} See the \href{http://bioconductor.org/packages/release/bioc/html/minfi.html}{vignette} for additional detail. \bigskip\par \includegraphics[width=\textwidth]{figures/bsseq_analysis-1.png} \end{frame} %% \begin{frame}{Methylation: \Biocpkg{minfi} exemplar} %% Hansen et al., 2011, Nature Genetics 43, %% \href{http://www.nature.com/ng/journal/v43/n8/full/ng.865.html}{768-775} %% \begin{itemize} %% \item Scientific finding: stochastic methylation variation of %% cancer-specific differentially methylated regions (DMRs), distinguishing cancer from %% normal tissue, in several cancers. 
%% \item Statistical challenges: non-specific filtering, $t$ %% statistics, find DMRs, smoothing %% \end{itemize} %% \end{frame} \section*{ChIP-seq} %% \begin{frame}{ChIP-seq} %% \begin{columns} %% \column{.5\textwidth} %% \includegraphics[width=\textwidth]{figures/protocol.png} %% \column{.5\textwidth} %% Chromatin immunoprecipitation, followed by sequencing %% \begin{itemize} %% \item Determine location of proteins bound to DNA %% \end{itemize} %% Useful for detecting %% \begin{itemize} %% \item Transcription factor binding sites %% \item Histone modifications %% \end{itemize} %% Common questions %% \begin{itemize} %% \item Which genes is this TF regulating? %% \item How do histone modifications affect expression? %% \end{itemize} %% \end{columns} %% \end{frame} \begin{frame}{ChIP-seq: peak calling} \includegraphics[width=\textwidth]{figures/nbt-1508-F1.jpg} \footnote{\citet{pmid19029915}} \begin{itemize} \item Chromatin immunoprecipitation, followed by sequencing to determine location of proteins bound to DNA \item Useful for locating transcription factor binding sites, histone modifications, \ldots \end{itemize} \end{frame} \begin{frame}{ChIP-seq work flow} \begin{columns} \column{.5\textwidth} \includegraphics[width=\textwidth]{figures/ChIPSeq-workflow} \par Analysis overview: \citet{10.1371/journal.pcbi.1003326} \column{.5\textwidth} \begin{itemize} \item Annotation: what genes are my peaks near? \item Differential representation: which peaks are over- or under-represented in treatment 1, compared to treatment 2? \item Motif identification (peaks over known motifs?) 
and discovery \item Integrative analysis, e.g., association of regulatory elements and expression \end{itemize} \end{columns} \end{frame} \begin{frame}[fragile]{ChIP-seq quality assessment: \Biocpkg{ChIPQC}} Inputs: BAM files (raw data) and BED files (called peaks) <>= experiment <- ChIPQC(samples) ChIPQCreport(experiment) @ Output: HTML report --- \url{http://starkhome.com/ChIPQC/Reports/tamoxifen/ChIPQC.html} \end{frame} \begin{frame}[fragile]{ChIP-seq annotation: \Biocpkg{ChIPpeakAnno}, \Biocpkg{ChIPseeker}} Inputs \begin{itemize} \item Peaks: e.g., from \Rcode{rtracklayer::import()} BED files \item Annotation: gene boundaries or queries to \Biocpkg{biomaRt} \end{itemize} <>= library(ChIPpeakAnno) ## ... annotated <- annotatePeakInBatch(peaks, AnnotationData=annotation) @ \noindent Output: \Rclass{RangedData} with annotations about near-by peaks. \end{frame} \begin{frame}[fragile]{ChIP-seq differential representation: \Biocpkg{DiffBind}} Inputs: called peaks and raw BED or BAM files <>= library(DiffBind) tamoxifen = dba(sampleSheet="tamoxifen.csv") tamoxifen = dba.count(tamoxifen) tamoxifen = dba.contrast(tamoxifen, categories=DBA_CONDITION) tamoxifen = dba.analyze(tamoxifen) tamoxifen.DB = dba.report(tamoxifen) @ \noindent Outputs: diagnostics, visualizations, and `top table' of differentially bound regions. 
\end{frame} %% \begin{frame}{ChIP-seq motifs} %% Identification %% \begin{itemize} %% \item \href{http://jaspar.genereg.net/}{JASPAR} and other motif %% catalogs %% \item Position Weight Matrix describing probability of nucleotide(s) %% at each position %% \item Scan genome / under peaks for known motifs %% \item \Biocpkg{MotifDb}, \Rfunction{matchPWM} (\Biocpkg{Biostrings}); %% \item FIMO, etc %% \end{itemize} %% Discovery %% \begin{itemize} %% \item Collate sequences under peaks, search for recurrent sequences %% \item e.g., %% \href{http://meme.nbcr.net/meme/cgi-bin/dreme.cgi}{DREME} / %% \href{http://meme.nbcr.net/meme/memechip-intro.html}{MEME-ChIP} %% \end{itemize} %% Also: enrichment, regulatory modules (2+ motifs co-occurring), %% function, \ldots %% \end{frame} \section*{Integration \& Visualization} \begin{frame}{Integration \& visualization} \begin{itemize} \item Combining multiple data types \begin{itemize} \item \Biocpkg{Rcade}, \Biocpkg{Repitools}: ChIP / expression \end{itemize} \item Import / export from common formats (BED, WIG, ...) \begin{itemize} \item \Biocpkg{rtracklayer} \Rfunction{import()}, \Rfunction{export()} \end{itemize} \item \Biocpkg{AnnotationHub}: accessing large-scale resources, e.g., ENCODE tracks \item Visualization \end{itemize} \end{frame} %% AnnotationHub \begin{frame}[fragile]{Integration \& visualization: \Biocpkg{AnnotationHub}} \begin{verbatim} > library(AnnotationHub) > hub = AnnotationHub() > hub class: AnnotationHub length: 10780 filters: none hubUrl: http://annotationhub.bioconductor.org/ah snapshotVersion: 2.14/1.4.0; snapshotDate: 2014-05-15 hubCache: /home/mtmorgan/.AnnotationHub > hub$ hub$dbSNP.organisms.human_9606.VCF. ... [302] hub$haemcode.blood. ... [899] hub$ensembl.release. ... [2611] hub$inparanoid8.Orthologs.hom. ... [265] hub$goldenpath. ... [6699] hub$refnet. ... 
[4] \end{verbatim} \end{frame} %% Visualization \begin{frame}{Integration \& visualization} \begin{columns} \column{.5\textwidth} \Biocpkg{Gviz}\par \only<1>{ \begin{itemize} \item Static track-like visualizations \item Data panels \end{itemize} } \Biocpkg{ggbio}\par \only<2>{ \begin{itemize} \item Comprehensive visualizations \item \Rfunction{autoplot} file and data types \end{itemize} } \Biocpkg{epivizr}\par \column{.5\textwidth} \only<1>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-1.png}} %% \only<2>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-2.png}} \only<2>{\includegraphics[width=\textwidth]{figures/ggbio-vignette-1.png}} \end{columns} \end{frame} \begin{frame}{Integration \& visualization} \Biocpkg{Gviz}\par \Biocpkg{ggbio}\par \Biocpkg{epivizr} \begin{center} \includegraphics[height=.35\textheight]{figures/epivisr.png} \end{center} \begin{itemize} \item Genome browser with tight communication to \R{} / \Bioconductor{} \item Flexible \emph{interactive}, \emph{representation} and \emph{computation}, e.g., `brushing' \end{itemize} \end{frame} \section*{Genomic Ranges} \begin{frame}{Genomic ranges for data integration} \begin{itemize} \item Chromosome, start, end, strand define a \emph{genomic range} \item Data (reads, CpG islands, peaks, \ldots) are genomic ranges \item Annotations (exons, genes, binding sites, \ldots) are genomic ranges \end{itemize} \includegraphics[width=\textwidth]{figures/GRanges.pdf} \par \Biocpkg{GenomicRanges}, \Biocpkg{GenomicAlignments} packages \end{frame} \begin{frame}{Operating on genomic ranges} \includegraphics[width=\textwidth]{figures/RangeOperations.pdf} \end{frame} \section*{Conclusions} \begin{frame} \begin{center} \includegraphics[width=!, height=.2\textheight]{figures/bioconductor_logo_cmyk.pdf} \end{center} Funding \begin{itemize} \item US NIH / NHGRI 2U41HG004059; NSF 1247813 \end{itemize} People \begin{itemize} \item Seattle Bioconductor team: Sonali Arora, Marc Carlson, Nate Hayden, Valerie 
Obenchain, Herv\'e Pag\`es, Dan Tenenbaum \item Vincent Carey, Robert Gentleman, Rafael Irizarry, Sean Davis, Kasper Hansen, Michael Lawrence, Levi Waldron \item International community of \Bioconductor{} developers and users \end{itemize} \end{frame} \begin{frame}[allowframebreaks]{References} \bibliographystyle{abbrvnat} \bibliography{ChIPSeq.bib} \end{frame} \end{document}