% \VignetteEngine{knitr::knitr} % \VignetteIndexEntry{04. Bioconductor for Sequence Analysis -- slides} \documentclass[xcolor=dvipsnames]{beamer} \usepackage{BioconductorSlides} \hypersetup{colorlinks,linkcolor=,urlcolor=Blue} \title{\Bioconductor{} for Sequence Analysis} \author{Martin T.\ Morgan\footnote{\url{mtmorgan@fhcrc.org}}} \date{27-28 February 2014} \begin{document} \maketitle \begin{frame}{Introduction: What is \Bioconductor{} good for?} \begin{itemize} \item Sequencing: RNA-seq, ChIP-seq, called variants, \ldots \begin{itemize} \item Especially \emph{after} assembly / alignment \end{itemize} \item Annotation: genes, pathways, gene models (exons, transcripts, etc.), \ldots \item Microarrays: expression, copy number, SNPs, methylation, \ldots \item Flow cytometry, proteomics, image analysis, high-throughput screens, \ldots \end{itemize} \end{frame} \begin{frame}[fragile]{Sequencing: Work flows} \begin{columns} \column{.5\textwidth} \begin{enumerate} \item Experimental design \item `Wet lab' sample prep \item Sequencing \begin{itemize} \item 100's of millions of reads \item 30-150 nucleotides \item Single and paired-end \item Bar codes, lanes \& flow cells \end{itemize} \item Alignment \item Analysis: DNA, RNA, epigenetics, integrative, microbiome, \ldots \end{enumerate} \column{.5\textwidth} \includegraphics[width=\textwidth,height=!]{figures/Solexa-bridge-pcr.jpg} \par Bentley et al., 2008, Nature 456: \href{http://www.ncbi.nlm.nih.gov/pubmed/18987734}{53-9} \end{columns} \end{frame} {\scriptsize\begin{verbatim} @ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1 CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT + HHGHHGHHHHHHHHDGG>CE?=896=: @ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1 GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC + DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?######################## @ERR127302.1705 HWI-EAS350_0441:1:1:1460:13054#0/1 AAAACACCCTGCAATCTTTCAGACAGGATGTTGACAATGCGTCTCTGGCACGTCTTGACCTTGAACGCAAAG + EEDEE>AD>BBGGB8E8EEEGBGGGGBGGGGG3G>E3*?BE??BBC8GB8??:??GGDGDDD>D>BGGD8EG,<6D@<@G@>AB@B?8AA>CE@D8@B=?CC>AG @ERR127302.1707 HWI-EAS350_0441:1:1:1461:6983#0/1 CGACGCTGACACCGGAACGGCAGCAGCAGCAGGACGATTAAGACAAGGAGGATGGCTCCACAGACGCTCATG + GEEGEGE@GGGGGGEGGGGGBB>G3?33?8*;;79?<9@?DD8@DDEE888;-BB?.A############## @ERR127302.1708 HWI-EAS350_0441:1:1:1461:10827#0/1 AAAGAAGGTCCTTGCAATAGACTGCCTCTGCTTGAGAACTTATGATGTAATTATTGCATGCTGCTAATATAC + GGGGGDDEBFGGGGGBE,DAGDDGGGEEEGACEBEFDEEFEDH:@.7@49;88G8>G>DDG@D>D@G@GE>@DDBDDG>= ## Use the 'ShortRead' package library(ShortRead) ## Create an object to represent a sample from a file sampler <- FastqSampler("ERR127302_1.fastq.gz") ## Apply a method to yield a random sample fq <- yield(sampler) ## Access sequences of sampled reads using `sread()` ## Summarize nucleotide use by cycle ## 'abc' is a nucleotide x cycle matrix of counts abc <- alphabetByCycle(sread(fq)) ## Subset of interesting nucleotides abc <- abc[c("A", "C", "G", "T", "N"),] @ \end{frame} \begin{frame}[fragile]{Sequencing: The \Biocpkg{ShortRead} package} \begin{columns} \column{.5\textwidth} <>= ## Create a plot from a ## matrix matplot(t(abc), type="l", lty=1, lwd=3, xlab="Cycle", ylab="Count", cex.lab=2) ## Add a legend legend("topright", legend=rownames(abc), lty=1, lwd=3, col=1:5, cex=1.8) @ \column{.5\textwidth} \includegraphics[width=\textwidth]{figures/abc} \end{columns} \end{frame} \begin{frame}{Sequencing: Essential packages and classes} \begin{itemize} \item \Biocpkg{Biostrings} and \Rclass{DNAStringSet} \item \Biocpkg{GenomicAlignments} and \Rclass{GAlignments} \item \Biocpkg{GenomicRanges} and \Rclass{GRanges} \item \Biocpkg{GenomicFeatures} and \Rclass{TranscriptDb} \item \Biocpkg{VariantAnnotation} and \Rclass{VCF} \item Input and output: \Biocpkg{rtracklayer} (WIG, BED, etc.), \Biocpkg{Rsamtools} (BAM), \Biocpkg{ShortRead} (FASTQ) file input \end{itemize} \end{frame} \section*{Sequencing: package tour} \begin{frame}{Reads} \begin{description} \item[Data] Short reads and their qualities \item[Tasks] Input, quality assessment, summary, trimming, \ldots \item[Packages] \Biocpkg{ShortRead}, \Biocpkg{Biostrings} \item[Functions] \begin{itemize} \item \Rfunction{readFastq}, \Rfunction{FastqSampler}, \Rfunction{FasqtStreamer}. \item \Rfunction{qa}, \Rfunction{report}. \item \Rfunction{alphabetFrequency}, \Rfunction{alphabetByCycle}, \Rfunction{consensusMatrix}. \item \Rfunction{trimTails}, \Rfunction{trimLRPatterns}, \Rfunction{matchPDict}, \ldots \end{itemize} \end{description} \end{frame} \begin{frame}{Alignments} \begin{description} \item[Data] BAM files of aligned reads \item[Tasks] Input, BAM file manipulation, pileups \item[Packages] \Biocpkg{GenomicAlignments}, \Biocpkg{Rsamtools} (also: \Biocpkg{GenomicRanges}) \item[Functions] \begin{itemize} \item \Rfunction{readGAlignments} \item \Rfunction{BamFile}, \Rfunction{BamFileList} \item \Rfunction{scanBam}, \Rfunction{ScanBamParam} (select a subset of the BAM file) \item \Rfunction{asBam}, \Rfunction{sortBam}, \Rfunction{indexBam}, \Rfunction{mergeBam}, \Rfunction{filterBam} \item \Rfunction{BamSampler}, \Rfunction{applyPileups} \end{itemize} \end{description} \end{frame} \begin{frame}{Ranges} \begin{description} \item[Data] Genomic coordinates to represent data (e.g., aligned reads) or annotation (e.g., gene models). \item[Tasks] Input, counting, coverage, manipulation, \ldots \item[Packages] \Biocpkg{GenomicRanges}, \Biocpkg{IRanges} \item[Functions] \begin{itemize} \item \Rfunction{readGAlignments}, \Rfunction{readGAlignmentsList} \item Many intra-, inter-, and between-range manipulating, e.g., \Rfunction{narrow}, \Rfunction{flank}, \Rfunction{shift}, \Rfunction{intersect}, \Rfunction{findOverlaps}, \Rfunction{countOverlaps} \end{itemize} \end{description} \end{frame} \begin{frame}{Variants} \begin{description} \item[Data] VCF (Variant Call Format) file \item[Tasks] Calling, input, summary, coding consequences \item[Packages] \Biocpkg{VariantTools} (linux only), \Biocpkg{VariantAnnotation}, \Biocpkg{ensemblVEP} \item[Functions] \begin{itemize} \item \Rfunction{tallyVariants} \item \Rfunction{readVcf}, \Rfunction{locateVariants}, \Rfunction{predictCoding} \item Also: SIFT, PolyPhen data bases \end{itemize} \end{description} \end{frame} \begin{frame}{Annotations} \begin{description} \item[Data] Gene symbols or other identifiers \item[Tasks] Discover annotations associated with genes or symbols \item[Packages] \Biocpkg{AnnotationDbi} (\Rpackage{org.*}, \Biocannopkg{GO.db}, \ldots), \Biocpkg{biomaRt} \item[Functions] \begin{itemize} \item Discovery: \Rfunction{columns}, \Rfunction{keytype}, \Rfunction{keys} \item \Rfunction{select}, \Rfunction{merge} \item \Biocpkg{biomaRt}: \Rfunction{listMarts}, \Rfunction{listDatasets}, \Rfunction{listAttributes}, \Rfunction{listFilters}, \Rfunction{getBM} \end{itemize} \end{description} \end{frame} \begin{frame}{Features} \begin{description} \item[Data] Genomic coordinates \item[Tasks] Group exons by transcript or gene; discover transcript / gene identifier mappings \item[Packages] \Biocpkg{GenomicFeatures} and \Rpackage{TxDb.*} packages (also: \Biocpkg{rtracklayer}) \item[Functions] \begin{itemize} \item \Rfunction{exonsBy}, \Rfunction{cdsBy}, \Rfunction{transcriptsBy} \item \Rfunction{select} (see Annotations, below) \item \Rfunction{makeTranscriptDb*} \end{itemize} \end{description} \end{frame} \begin{frame}{Genome annotations} \begin{description} \item[Data] FASTA, GTF, VCF, \ldots from internet resources \item[Tasks] Define regions of interests; incorporate known features (e.g., ENCODE marks, dbSNP variants) in work flows \item[Packages] \Biocpkg{AnnotationHub} \item[Functions] \begin{itemize} \item \Rfunction{AnnotationHub}, \Rfunction{filters} \item \Rfunction{metadata}, \Rcode{hub\$} \end{itemize} \end{description} \end{frame} \begin{frame}{Sequences} \begin{description} \item[Data] Whole-genome sequences \item[Tasks] View sequences, match position weight matricies, match patterns \item[Packages] \Biocpkg{Biostrings}, \Biocpkg{BSgenome} \item[Functions] \begin{itemize} \item \Rfunction{available.genomes} \item \Rcode{Hsapiens[["chr3"]]}, \Rfunction{getSeq}, \Rfunction{mask} \item \Rfunction{matchPWM}, \Rfunction{vcountPattern}, \ldots \item \Rfunction{forgeBSgenomeDataPkg} \end{itemize} \end{description} \end{frame} \begin{frame}{Import / export} \begin{description} \item[Data] Common text-based formats, \texttt{gff}, \texttt{wig}, \texttt{bed}; UCSC tracks \item[Tasks] Import and export \item[Packages] \Biocpkg{rtracklayer} \item[Functions] \begin{itemize} \item \Rfunction{import}, \Rfunction{export} \item \Rfunction{browserSession}, \Rfunction{genome} \end{itemize} \end{description} \end{frame} \begin{frame}{And\ldots} Data representation: \Biocpkg{IRanges}, \Biocpkg{GenomicRanges}, \Biocpkg{GenomicFeatures}, \Biocpkg{Biostrings}, \Biocpkg{BSgenome}, \Biocpkg{girafe}. Input / output: \Biocpkg{ShortRead} (fastq), \Biocpkg{Rsamtools} (bam), \Biocpkg{rtracklayer} (gff, wig, bed), \Biocpkg{VariantAnnotation} (vcf), \Biocpkg{R453Plus1Toolbox} (454). Annotation: \Biocpkg{GenomicFeatures}, \Biocpkg{ChIPpeakAnno}, \Biocpkg{VariantAnnotation}. Alignment: \Biocpkg{Rsubread}, \Biocpkg{Biostrings}. Visualization: \Biocpkg{ggbio}, \Biocpkg{Gviz}. Quality assessment: \Biocpkg{qrqc}, \Biocpkg{seqbias}, \Biocpkg{ReQON}, \Biocpkg{htSeqTools}, \Biocpkg{TEQC}, \Biocpkg{Rolexa}, \Biocpkg{ShortRead}. RNA-seq: \Biocpkg{BitSeq}, \Biocpkg{cqn}, \Biocpkg{cummeRbund}, \Biocpkg{DESeq}, \Biocpkg{DEXSeq}, \Biocpkg{EDASeq}, \Biocpkg{edgeR}, \Biocpkg{gage}, \Biocpkg{goseq}, \Biocpkg{iASeq}, \Biocpkg{tweeDEseq}. ChIP-seq, etc.: \Biocpkg{BayesPeak}, \Biocpkg{baySeq}, \Biocpkg{ChIPpeakAnno}, \Biocpkg{chipseq}, \Biocpkg{ChIPseqR}, \Biocpkg{ChIPsim}, \Biocpkg{CSAR}, \Biocpkg{DiffBind}, \Biocpkg{MEDIPS}, \Biocpkg{mosaics}, \Biocpkg{NarrowPeaks}, \Biocpkg{nucleR}, \Biocpkg{PICS}, \Biocpkg{PING}, \Biocpkg{REDseq}, \Biocpkg{Repitools}, \Biocpkg{TSSi}. Motifs: \Biocpkg{BCRANK}, \Biocpkg{cosmo}, \Biocpkg{cosmoGUI}, \Biocpkg{MotIV}, \Biocpkg{seqLogo}, \Biocpkg{rGADEM}. 3C, etc.: \Biocpkg{HiTC}, \Biocpkg{r3Cseq}. Copy number: \Biocpkg{cn.mops}, \Biocpkg{CNAnorm}, \Biocpkg{exomeCopy}, \Biocpkg{seqmentSeq}. Microbiome: \Biocpkg{phyloseq}, \Biocpkg{DirichletMultinomial}, \Biocpkg{clstutils}, \Biocpkg{manta}, \Biocpkg{mcaGUI}. Work flows: \Biocpkg{ArrayExpressHTS}, \Biocpkg{Genominator}, \Biocpkg{easyRNASeq}, \Biocpkg{oneChannelGUI}, \Biocpkg{rnaSeqMap}. Database: \Biocpkg{SRAdb}. \ldots \end{frame} \section*{Exemplars} \begin{frame}{Exemplars: Algorithms to action} \begin{enumerate} \item Batch effects \item Methylation \item RNA-seq Differential Representation \item Visualization \end{enumerate} \end{frame} \begin{frame}{Exemplar: Differential Representation} Haglund et al., 2012 \href{http://www.ncbi.nlm.nih.gov/pubmed/23024189}{J Clin Endocrin Metab} \bigskip\par \begin{columns} \column{.5\textwidth} \begin{itemize} \item Scientific finding: identify genes whose expression is regulated by estrogen receptors in parathyroid adenoma cells \item Statistical challenges: between-sample normalization; appropriate statistical model; efficient estimation; \ldots \end{itemize} \column{.5\textwidth} \includegraphics[width=\textwidth]{figures/DESeq2_parathyroid-plotMApadjchange.png} \end{columns} \bigskip\par\Bioconductor{} support: \Biocpkg{DESeq2}, \Biocpkg{edgeR}, many statistical `lessons learned' from microarrays; extensive integration with down-stream tools \end{frame} \begin{frame}{Exemplar: Batch Effects} Leek et al., 2010, Nature Reviews Genetics 11, \href{http://www.nature.com/nrg/journal/v11/n10/abs/nrg2825.html}{733-739}, Leek \& Story \href{http://dx.doi.org/10.1371/journal.pgen.0030161}{PLoS Genet 3(9): e161} \begin{columns} \column{.5\textwidth} \begin{itemize} \item Scientific finding: pervasive batch effects \item Statistical insights: surrogate variable analysis: identify and build surrogate variables; remove known batch effects \item Benefits: reduce dependence, stabilize error rate estimates, and improve reproducibility \end{itemize} \Bioconductor{} support: \Biocpkg{sva} \column{.5\textwidth} \only<1>{ \includegraphics[width=\textwidth]{figures/nrg2825-f2.jpg} \par{\small HapMap samples from one facility, ordered by date of processing. From } } \only<2>{ \begin{enumerate} \item Remove signal due to variable(s) of interest \item Identify subset of genes driving orthogonal signatures of EH \item Build a surrogate variable based on full EH signature of that subset \item Include significant surrogate variables as covariates \end{enumerate} EH: expression heterogeneity } \end{columns} \end{frame} \begin{frame}{Exemplar: Methylation} Hansen et al., 2011, Nature Genetics 43, \href{http://www.nature.com/ng/journal/v43/n8/full/ng.865.html}{768-775} \begin{itemize} \item Scientific finding: stochastic methylation variation of cancer-specific de-methylated regions (DMR), distinguishing cancer from normal tissue, in several cancers. \item Statistical challenges: smoothing, non-specific filtering, $t$ statistics, find DMRs \end{itemize} \bigskip\par \includegraphics[width=\textwidth]{figures/bsseq_analysis-1.png} \medskip\par \Bioconductor{} support: whole-genome (\Biocpkg{bsseq}) or reduced representation (\Biocpkg{MethylSeekR}) bisulfite sequencing; Illumina 450k arrays (\Biocpkg{minfi}) \end{frame} \begin{frame}{Exemplar: Visualization} \begin{columns} \column{.5\textwidth} \Biocpkg{Gviz}\par \only<1-2>{ \begin{itemize} \item Track-like visualizations \item Data panels \item Fully integrated with \Bioconductor{} sequence representations \end{itemize} } \Biocpkg{ggbio}\par \only<3>{ \begin{itemize} \item Comprehensive visualizations \item \Rfunction{autoplot} file and data types \item Fully integrated with \Bioconductor{} sequence representations \end{itemize} } \Biocpkg{epivizr}\par \only<4>{ \begin{itemize} \item Genome browser with socket communication to \R{} \item Fully integrated with \Bioconductor{} sequence representations \end{itemize} } \column{.5\textwidth} \only<1>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-1.png}} \only<2>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-2.png}} \only<3>{\includegraphics[width=\textwidth]{figures/ggbio-vignette-1.png}} \only<4>{\includegraphics[width=\textwidth]{figures/epivisr.png}} \end{columns} \end{frame} \begin{frame}{Principles: Some key points} \begin{itemize} \item \R{} is a high-level programming language, so lots can be accomplished with just a little code \item Packages such as \Biocpkg{ShortRead} provide a great way to benefit from the expertise of others (and to contribute your own expertise back to the community!) \begin{itemize} \item The path from `user' to `developer' is not that long, and has been taken by many! \end{itemize} \item Objects and methods such as \Rclass{data.frame}, \Rclass{ShortReadQ} and \Rcode{alphabetByCycle()}) help to manage complicated data \begin{itemize} \item Reducing possibility for clerical and other mistakes \item Facilitating inter-operability between different parts of an analysis \end{itemize} \item Scripts make work flows reproducible \item Visualizing data is an important part of exploratory analysis \end{itemize} \end{frame} \begin{frame}{Principles: Successful computational biology software} \begin{enumerate} \item Extensive: software, annotation, integration \begin{itemize} \item 750 inter-operable \Bioconductor{} packages \end{itemize} \item Statistical: volume, technology, experimental design \begin{itemize} \item \R{} a `natural' for statistical analysis \end{itemize} \item Reproducible: long-term, multi-participant science \begin{itemize} \item Objects, scripts, vignettes, packages, \ldots encourage reproducible research \end{itemize} \item Leading edge: novel, technology-driven \begin{itemize} \item Packages and user community closely track leading edge science \end{itemize} \item Accessible: affordable, transparent, usable \begin{itemize} \item \Bioconductor{} is free and open, with extensive documentation and an active and supportive user community \end{itemize} \end{enumerate} Case study: differential expression of known genes; see also \href{http://bioconductor.org/help/course-materials/2013/EMBOBGI/reproducible-research.pdf}{reproducible research} lecture. \end{frame} \begin{frame}{Challenges \& Opportunities} \begin{itemize} \item Big data -- transparent management within \R, facile use of established resources \item Developer and user training \end{itemize} Resources \begin{itemize} \item \url{http://r-project.org}, \emph{An Introduction to \R} manual; Dalgaard, \emph{Introductory Statistics with \R}; \href{http://rfordummies.com/}{\R{} for Dummies} \item \url{http://bioconductor.org/} \item \url{http://rstudio.org} \item \href{http://stackoverflow.com/questions/tagged/r}{StackOverflow}, \Bioconductor{} \href{http://bioconductor.org/help/mailing-list/mailform/}{mailing list} \end{itemize} \end{frame} \end{document}