%\VignetteIndexEntry{04 Range-based Containers -- Slides} %\VignetteDepends{GenomicFeatures, TxDb.Dmelanogaster.UCSC.dm3.ensGene, pasillaBamSubset} \SweaveOpts{keep.source=TRUE, eps=FALSE, width=9, height=3} \documentclass[8pt]{beamer} \usepackage{BioconductorSlides} \renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}} % A Beamer Quickstart: % http://www.math.umbc.edu/~rouben/beamer/ \newcommand\DefaultBackground{\setbeamertemplate{background canvas}[vertical shading][bottom=black!10,top=black!10]\setbeamertemplate{background canvas}[vertical shading][bottom=blue!40,top=blue!15]} \DefaultBackground \setbeamercolor{block body example}{bg=white} \definecolor{YESgreen}{RGB}{0, 160, 80} \newcommand\YES{\textcolor{YESgreen}{\textbf{YES}}} \newcommand\PartiallySupported{\textcolor{YESgreen}{\textbf{partially supported}}} \definecolor{NOgray}{RGB}{144, 144, 144} \newcommand\NO{\textcolor{NOgray}{\textbf{NO}}} \AtBeginSection[] { \setbeamertemplate{background canvas}[vertical shading][bottom=black!47,top=black!47] \begin{frame}{} \tableofcontents[currentsection,currentsubsection] \end{frame} \DefaultBackground } \AtBeginSubsection[] { \setbeamertemplate{background canvas}[vertical shading][bottom=black!47,top=black!47] \begin{frame}{} \tableofcontents[currentsection,currentsubsection] \end{frame} \DefaultBackground } \title{Range-based containers in \Bioconductor{}} \author{Herv\'e Pag\`es\\ \href{mailto:hpages@fhcrc.org}{hpages@fhcrc.org}} \institute[FHCRC]{Fred Hutchinson Cancer Research Center\\ Seattle, WA, USA} \date{21 January 2014} \begin{document} <>= options(width=84) plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), col = "black", sep = 0.5, ...) { height <- 1 if (is(xlim, "Ranges")) xlim <- c(min(start(xlim)), max(end(xlim))) bins <- disjointBins(IRanges(start(x), end(x) + 1)) plot.new() par(mai=c(0.5, 0.2, 1.2, 0.2)) plot.window(xlim, c(0, max(bins)*(height + sep))) ybottom <- bins * (sep + height) - height rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...) title(main, cex.main=2.8, font.main=1) axis(1) } @ \maketitle \frame{\tableofcontents} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Introduction} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Range-based containers in \Bioconductor{}} \begin{block}{} Implemented and documented in the \Rpackage{IRanges} package: \begin{itemize} \item \Rclass{IRanges} \end{itemize} \medskip Implemented and documented in the \Rpackage{GenomicRanges} package: \begin{itemize} \item \Rclass{GRanges} \item \Rclass{GRangesList} \item \Rclass{GAlignments} \item \Rclass{GAlignmentPairs} \item \Rclass{GAlignmentsList} (not covered in this presentation) \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{About the implementation} \begin{block}{} S4 classes (a.k.a. {\em formal} classes) --> relies heavily on the \Rpackage{methods} package. \end{block} \begin{block}{} Current implementation tries to provide an API that is as consistent as possible. In particular: \begin{itemize} \item The end-user should never need to use \Rcode{new()}: a {\em constructor}, named as the container, is provided for each container. E.g. \Rcode{GRanges()}. \item The end-user should never need to use \Rcode{@} (a.k.a. {\em direct slot access}): slot {\em accessors} ({\em getters} and {\em setters}) are provided for each container. Not all getters have a corresponding setter! \item Standard functions/operators like \Rcode{length()}, \Rcode{names()}, \Rcode{[}, \Rcode{c()}, \Rcode{[[}, \Rcode{\$}, etc... work almost everywhere and behave ``as expected''. \item Additional functions that work almost everywhere: \Rcode{mcols()}, \Rcode{elementLengths()}, \Rcode{seqinfo()}, etc... \item Consistent display (\Rcode{show} methods). \end{itemize} \end{block} \end{frame} \setbeamertemplate{background canvas}[vertical shading][bottom=green!20,top=green!20] \begin{frame}[fragile] \frametitle{Basic operations} \begin{block}{} \begin{columns}[t] \begin{column}{0.093\textwidth} \end{column} \begin{column}{0.48\textwidth} {\bf Vector operations} \small \medskip Operate on {\em vector-like} objects \medskip (e.g. on \Rclass{Rle}, \Rclass{IRanges}, \Rclass{GRanges}, \Rclass{DNAStringSet}, etc... objects) \medskip \begin{itemize} \item Accessors: \Rcode{length()}, \Rcode{names()}, \Rcode{mcols()} \item Single-bracket subsetting: \Rcode{[} \item Combining: \Rcode{c()} \item Splitting/relisting: \Rcode{split()}, \Rcode{relist()} \item Comparing: \Rcode{==}, \Rcode{!=}, \Rcode{match()}, \Rcode{\%in\%}, \Rcode{duplicated()}, \Rcode{unique()} \item Ordering: \Rcode{<=}, \Rcode{>=}, \Rcode{<}, \Rcode{>}, \Rcode{order()}, \Rcode{sort()}, \Rcode{rank()} \end{itemize} \end{column} \begin{column}{0.04\textwidth} \end{column} \begin{column}{0.42\textwidth} {\bf List operations} \small \medskip Operate on {\em list-like} objects\footnote{{\em list-like} objects are also {\em vector-like} objects} \medskip (e.g. on \Rclass{IRangesList}, \Rclass{GRangesList}, \Rclass{DNAStringSetList}, etc... objects) \medskip \begin{itemize} \item Double-bracket subsetting: \Rcode{[[} \item \Rcode{elementLengths()}, \Rcode{unlist()} \item \Rcode{lapply()}, \Rcode{sapply()}, \Rcode{endoapply()} \item \Rcode{mendoapply()} (not covered in this presentation) \end{itemize} \end{column} \end{columns} \end{block} \begin{block}{} {\bf Coercion methods} \small \begin{itemize} \item \Rcode{as()} \item S3-style form: \Rcode{as.vector()}, \Rcode{as.character()}, \Rcode{as.factor()}, etc... \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations} Range-based operations operate on {\em range-based} objects \medskip (e.g. on \Rclass{IRanges}, \Rclass{IRangesList}, \Rclass{GRanges}, \Rclass{GRangesList}, etc... objects) \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{block}{} {\bf Intra range transformations} \Rcode{shift()}, \Rcode{narrow()}, \Rcode{flank()}, \Rcode{resize()} \end{block} \begin{block}{} {\bf Inter range transformations} \Rcode{disjoin()}, \Rcode{range()}, \Rcode{reduce()}, \Rcode{gaps()} \end{block} \begin{block}{} {\bf Range-based set operations} \Rcode{union()}, \Rcode{intersect()}, \Rcode{setdiff()}, \Rcode{punion()}, \Rcode{pintersect()}, \Rcode{psetdiff()}, \Rcode{pgap()} \end{block} \end{column} \begin{column}{0.44\textwidth} \begin{block}{} {\bf Coverage and slicing} \Rcode{coverage()}, \Rcode{slice()} \end{block} \begin{block}{} {\bf Finding/counting overlapping ranges} \Rcode{findOverlaps()}, \Rcode{countOverlaps()} \end{block} \begin{block}{} {\bf Finding the nearest range neighbor} \Rcode{nearest()}, \Rcode{precede()}, \Rcode{follow()} \end{block} \begin{block}{} and more... \end{block} \end{column} \end{columns} \end{frame} \DefaultBackground %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{\Rclass{IRanges} objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{The purpose of the \Rclass{IRanges} container is...} ... to store a set of {\em integer ranges} (a.k.a. {\em integer intervals}). \begin{block}{} \begin{itemize} \item Each range can be defined by a {\em start} and an {\em end} value: both are included in the interval (except when the range is empty). \item The {\em width} of the range is the number of integer values in it: {\em width} = {\em end} - {\em start} + 1. \item {\em end} is always >= {\em start}, except for empty ranges (a.k.a. zero-width ranges) where {\em end} = {\em start} - 1. \end{itemize} \end{block} \setbeamercolor{block title}{bg=green!40} \setbeamercolor{block body}{bg=green!20} \begin{block}{Supported operations} \begin{itemize} \item {\em Vector operations}: \YES{} (splitting/relisting produces an \Rclass{IRangesList} object) \item {\em List operations}: \YES{} (not covered in this presentation) \item {\em Coercion methods}: \YES{} (from logical or integer vector to \Rclass{IRanges}) \item {\em Range-based operations}: \YES{} \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Constructor and accessors} \begin{frame}[fragile] \frametitle{\Rclass{IRanges} constructor and accessors} \begin{exampleblock}{} {\small <>= library(IRanges) ir1 <- IRanges(start=c(12, -9, NA, 12), end=c(NA, 0, 15, NA), width=c(4, NA, 4, 3)) ir1 # "show" method not yet consistent with the other "show" methods (TODO) start(ir1) end(ir1) width(ir1) successiveIRanges(c(10, 5, 38), from=101) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{IRanges} accessors (continued)} \begin{exampleblock}{} {\scriptsize <>= names(ir1) <- LETTERS[1:4] names(ir1) mcols(ir1) <- DataFrame(score=11:14, GC=seq(1, 0, length=4)) mcols(ir1) ir1 @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Vector operations} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{IRanges} objects} \begin{columns}[t] \begin{column}{0.306\textwidth} \begin{exampleblock}{} {\scriptsize <>= ir1[-2] ir2 <- c(ir1, IRanges(-10, 0)) ir2 @ } \end{exampleblock} \end{column} \begin{column}{0.34\textwidth} \begin{exampleblock}{} {\scriptsize <>= duplicated(ir2) unique(ir2) @ } \end{exampleblock} \end{column} \begin{column}{0.22\textwidth} \begin{exampleblock}{} {\scriptsize <>= order(ir2) sort(ir2) @ } \end{exampleblock} \end{column} \end{columns} \begin{columns}[t] \begin{column}{0.60\textwidth} \begin{exampleblock}{} {\scriptsize <>= ok <- c(FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE) ir4 <- as(ok, "IRanges") # from logical vector to IRanges ir4 @ } \end{exampleblock} \end{column} \begin{column}{0.35\textwidth} \begin{exampleblock}{} {\scriptsize <>= as.data.frame(ir4) @ } \end{exampleblock} \end{column} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Range-based operations} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{IRanges} objects} \begin{exampleblock}{} <>= ir0 <- IRanges(start=c(7, 9, 12, 14, 22:24), end=c(15, 11, 12, 18, 26, 27, 28)) png("ranges-ir0-plot.png", width=800, height=170) plotRanges(ir0, xlim=c(5, 35), main="ir0", col="blue") dev.off() @ <>= png("ranges-shift-ir0-plot.png", width=800, height=170) plotRanges(shift(ir0, 5), xlim=c(5, 35), main="shift(ir0, 5)", col="blue") dev.off() @ <>= png("ranges-disjoin-ir0-plot.png", width=800, height=170) plotRanges(disjoin(ir0), xlim=c(5, 35), main="disjoin(ir0)", col="blue") dev.off() @ <>= png("ranges-reduce-ir0-plot.png", width=800, height=170) plotRanges(reduce(ir0), xlim=c(5, 35), main="reduce(ir0)", col="blue") dev.off() @ \begin{figure} \centering \includegraphics[width=0.8\textwidth,height=!]{ranges-ir0-plot}\\ \includegraphics[width=0.8\textwidth,height=!]{ranges-shift-ir0-plot}\\ \includegraphics[width=0.8\textwidth,height=!]{ranges-disjoin-ir0-plot}\\ \includegraphics[width=0.8\textwidth,height=!]{ranges-reduce-ir0-plot} %\caption{Range-based operations} \end{figure} \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{IRanges} objects (continued)} \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= ir1 @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= shift(ir1, -start(ir1)) flank(ir1, 10, start=FALSE) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{IRanges} objects (continued)} \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= ir1 @ \end{exampleblock} \begin{exampleblock}{} <>= range(ir1) reduce(ir1) @ \end{exampleblock} \end{column} \begin{column}{0.46\textwidth} \begin{exampleblock}{} <>= union(ir1, IRanges(-2, 6)) intersect(ir1, IRanges(-2, 13)) setdiff(ir1, IRanges(-2, 13)) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{IRanges} objects (continued)} \begin{columns}[b] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= ir3 <- IRanges(5:1, width=12) ir3 @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= ir2 @ \end{exampleblock} \end{column} \end{columns} \begin{exampleblock}{} <>= pintersect(ir3, ir2, resolve.empty="max.start") @ \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{\Rclass{GRanges} objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{The purpose of the \Rclass{GRanges} container is...} ... to store a set of {\em genomic ranges} (a.k.a. {\em genomic regions} or {\em genomic intervals}). \begin{block}{} \begin{itemize} \item Like for \Rclass{IRanges} objects, each range can be defined by a {\em start} and an {\em end} value. \item In addition, each range is also assigned a chromosome name and a strand. \item {\em start} and {\em end} are both {\bf 1-based} positions relative to the 5' end of the plus strand of the chromosome (a.k.a. {\em reference sequence}), even when the range is on the minus strand. \item So the {\em start} is always the leftmost position and the {\em end} the rightmost, even when the range is on the minus strand. \item As a consequence, {\bf if a genomic range represents a gene on the minus strand, the gene "starts" (biologically speaking) at the {\em end} of it}. \end{itemize} \end{block} \setbeamercolor{block title}{bg=green!40} \setbeamercolor{block body}{bg=green!20} \begin{block}{Supported operations} \begin{itemize} \item {\em Vector operations}: \YES{} (splitting/relisting produces a \Rclass{GRangesList} object) \item {\em List operations}: \NO{} \item {\em Coercion methods}: to \Rclass{IRangesList} (not covered in this presentation) \item {\em Range-based operations}: \YES{} \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Constructor and accessors} \begin{frame}[fragile] \frametitle{\Rclass{GRanges} constructor} \begin{exampleblock}{} {\small <>= library(GenomicRanges) gr1 <- GRanges(seqnames=Rle(c("ch1", "chMT"), lengths=c(2, 4)), ranges=IRanges(start=16:21, end=20), strand=rep(c("+", "-", "*"), 2)) gr1 @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRanges} accessors} \begin{exampleblock}{} {\small <>= length(gr1) seqnames(gr1) ranges(gr1) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRanges} accessors (continued)} \begin{exampleblock}{} {\small <>= start(gr1) end(gr1) width(gr1) strand(gr1) strand(gr1) <- c("-", "-", "+") strand(gr1) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRanges} accessors (continued)} \begin{exampleblock}{} {\scriptsize <>= names(gr1) <- LETTERS[1:6] names(gr1) mcols(gr1) <- DataFrame(score=11:16, GC=seq(1, 0, length=6)) mcols(gr1) gr1 @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRanges} accessors (continued)} \begin{exampleblock}{} <>= seqinfo(gr1) seqlevels(gr1) seqlengths(gr1) seqlengths(gr1) <- c(50000, 800) seqlengths(gr1) @ \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Vector operations} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRanges} objects} \begin{exampleblock}{} {\small <>= gr1[c("F", "A")] gr1[strand(gr1) == "+"] @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr1 <- gr1[-5] gr1 @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr2 <- GRanges(seqnames="ch2", ranges=IRanges(start=c(2:1,2), width=6), score=15:13, GC=seq(0, 0.4, length=3)) gr12 <- c(gr1, gr2) gr12 @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr12[length(gr12)] == gr12 duplicated(gr12) unique(gr12) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= sort(gr12) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Range-based operations} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects} \begin{exampleblock}{} {\scriptsize <>= gr2 shift(gr2, 50) narrow(gr2, start=2, end=-2) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr1 resize(gr1, 12) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr1 flank(gr1, 3) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr3 <- shift(gr1, c(35000, rep(0, 3), 100)) width(gr3)[c(3,5)] <- 117 gr3 range(gr3) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\scriptsize <>= gr3 disjoin(gr3) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\small <>= gr3 reduce(gr3) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRanges} objects (continued)} \begin{exampleblock}{} {\scriptsize <>= gr3 gaps(gr3) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Splitting a \Rclass{GRanges} object} \begin{frame}[fragile] \frametitle{Splitting a \Rclass{GRanges} object} \begin{exampleblock}{} {\small <>= split(gr3, seqnames(gr3)) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{\Rclass{GRangesList} objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{The purpose of the \Rclass{GRangesList} container is...} ... to store a list of {\em compatible} \Rclass{GRanges} objects. \begin{block}{} {\em compatible} means: \begin{itemize} \item they are relative to the same genome, \item AND they have the same metadata columns (accessible with the \Rcode{mcols()} accessor). \end{itemize} \end{block} \setbeamercolor{block title}{bg=green!40} \setbeamercolor{block body}{bg=green!20} \begin{block}{Supported operations} \begin{itemize} \item {\em Vector operations}: \PartiallySupported{} (no splitting/relisting, no comparing or ordering) \item {\em List operations}: \YES{} \item {\em Coercion methods}: to \Rclass{IRangesList} (not covered in this presentation) \item {\em Range-based operations}: \PartiallySupported{} (some operations like \Rcode{gaps()} are missing but they could/will be added) \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Constructor and accessors} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} constructor} \begin{exampleblock}{} {\small <>= grl <- GRangesList(gr3, gr2) grl @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} accessors} \begin{exampleblock}{} <>= length(grl) @ \end{exampleblock} \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} {\small <>= seqnames(grl) @ } \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} {\small <>= strand(grl) @ } \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} accessors (continued)} \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} {\small <>= ranges(grl) @ } \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} {\small <>= start(grl) end(grl) width(grl) @ } \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} accessors (continued)} \begin{exampleblock}{} {\small <>= names(grl) <- c("TX1", "TX2") grl @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} accessors (continued)} \begin{exampleblock}{} {\scriptsize <>= mcols(grl)$geneid <- c("GENE1", "GENE2") mcols(grl) grl @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GRangesList} accessors (continued)} \begin{exampleblock}{} <>= seqinfo(grl) @ \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Vector operations} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRangesList} objects} \begin{exampleblock}{} {\small <>= grl[c("TX2", "TX1")] @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Vector operations on \Rclass{GRangesList} objects (continued)} \begin{exampleblock}{} {\scriptsize <>= c(grl, GRangesList(gr3)) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{List operations} \begin{frame}[fragile] \frametitle{List operations on \Rclass{GRangesList} objects} \begin{exampleblock}{} {\scriptsize <>= grl[[2]] elementLengths(grl) unlisted <- unlist(grl, use.names=FALSE) # same as c(grl[[1]], grl[[2]]) unlisted @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{List operations on \Rclass{GRangesList} objects (continued)} \begin{exampleblock}{} {\small <>= grl100 <- relist(shift(unlisted, 100), grl) grl100 @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{List operations on \Rclass{GRangesList} objects (continued)} \begin{exampleblock}{} {\scriptsize <>= grl100b <- endoapply(grl, shift, 100) grl100b mcols(grl100) mcols(grl100b) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Range-based operations} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRangesList} objects} \begin{columns}[t] \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= grl @ } \end{exampleblock} \end{column} \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= shift(grl, 100) @ } \end{exampleblock} \end{column} \end{columns} \begin{block}{} \Rcode{shift(grl, 100)} is equivalent to \Rcode{endoapply(grl, shift, 100)} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRangesList} objects (continued)} \begin{columns}[t] \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= grl @ } \end{exampleblock} \end{column} \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= flank(grl, 10) @ } \end{exampleblock} \end{column} \end{columns} \begin{block}{} \Rcode{flank(grl, 10)} is equivalent to \Rcode{endoapply(grl, flank, 10)} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRangesList} objects (continued)} \begin{columns}[t] \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= grl @ } \end{exampleblock} \end{column} \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= range(grl) @ } \end{exampleblock} \end{column} \end{columns} \begin{block}{} \Rcode{range(grl)} is equivalent to \Rcode{endoapply(grl, range)} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRangesList} objects (continued)} \begin{columns}[t] \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= grl @ } \end{exampleblock} \end{column} \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= reduce(grl) @ } \end{exampleblock} \end{column} \end{columns} \begin{block}{} \Rcode{reduce(grl)} is equivalent to \Rcode{endoapply(grl, reduce)} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Range-based operations on \Rclass{GRangesList} objects (continued)} <>= grl2 <- grl grl2[[1]] <- grl2[[1]][3]; grl2[[2]] <- grl2[[2]][1] grl3 <- unname(grl2) grl3[[1]] <- narrow(unname(grl3[[1]]), start=5, end=-5) @ \begin{columns}[t] \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= grl2 grl3 @ } \end{exampleblock} \end{column} \begin{column}{0.49\textwidth} \begin{exampleblock}{} {\scriptsize <>= psetdiff(grl2, grl3) @ } \end{exampleblock} \end{column} \end{columns} \begin{block}{} \Rcode{psetdiff(grl2, grl)} is equivalent to \Rcode{mendoapply(setdiff, grl2, grl)} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{\Rclass{GAlignments} objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{The purpose of the \Rclass{GAlignments} container is...} ... to store a set of genomic alignments (aligned reads, typically). \begin{block}{} The alignments can be loaded from a BAM file with \Rcode{readGAlignments()}. By default, only the following information is loaded for each alignment: \begin{itemize} \item RNAME field: name of the reference sequence to which the query is aligned. \item strand bit (from FLAG field): strand in the reference sequence to which the query is aligned. \item CIGAR field: a string in the "Extended CIGAR format" describing the "gemoetry" of the alignment (i.e. locations of insertions, deletions and gaps). See the SAM Spec for the details. \item POS field: {\bf 1-based} position of the leftmost mapped base. \end{itemize} In particular, the query sequences (SEQ) and qualities (QUAL) are not loaded by default. \end{block} \setbeamercolor{block title}{bg=green!40} \setbeamercolor{block body}{bg=green!20} \begin{block}{Supported operations} \begin{itemize} \item {\em Vector} operations: \PartiallySupported{} (no splitting/relisting, no comparing or ordering) \item {\em List} operations: \NO{} \item {\em Ranges} operations: only \Rcode{narrow()} and \Rcode{qnarrow()} (\Rclass{GAlignments} specific) are supported \item {\em Coercion methods}: to \Rclass{GRanges} or \Rclass{GRangesList} \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Constructor and accessors} \begin{frame}[fragile] \frametitle{\Rclass{GAlignments} constructor} Typically not used directly! \begin{exampleblock}{} {\small <>= gal0 <- GAlignments(seqnames=Rle(c("ch1", "ch2"), c(3, 1)), pos=1L + 10L*0:3, cigar=c("36M", "20M3D16M", "20M703N16M", "14M2I20M"), strand=strand(c("+", "-", "-", "+"))) gal0 @ } \end{exampleblock} An N in the cigar indicates a gap (!= deletion). \end{frame} \begin{frame}[fragile] \frametitle{\Rcode{readGAlignments()}} \begin{exampleblock}{} {\small <>= library(pasillaBamSubset) U1gal <- readGAlignments(untreated1_chr4()) length(U1gal) head(U1gal) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GAlignments} accessors} \begin{exampleblock}{} {\scriptsize <>= seqnames(U1gal) table(as.factor(seqnames(U1gal))) strand(U1gal) table(as.factor(strand(U1gal))) head(cigar(U1gal)) head(qwidth(U1gal)) table(qwidth(U1gal)) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GAlignments} accessors (continued)} \begin{columns}[t] \begin{column}{0.40\textwidth} \begin{exampleblock}{} {\small <>= head(start(U1gal)) head(end(U1gal)) head(width(U1gal)) head(ngap(U1gal)) table(ngap(U1gal)) @ } \end{exampleblock} \end{column} \begin{column}{0.52\textwidth} \begin{exampleblock}{} {\small <>= mcols(U1gal) seqinfo(U1gal) @ } \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Loading additional information from the BAM file} \begin{exampleblock}{} {\small <>= param <- ScanBamParam(what=c("flag", "mapq"), tag=c("NH", "NM")) U1gal <- readGAlignments(untreated1_chr4(), use.names=TRUE, param=param) U1gal[1:5] any(duplicated(names(U1gal))) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Coercion to \Rclass{GRanges} or \Rclass{GRangesList}} \begin{frame}[fragile] \frametitle{From \Rclass{GAlignments} to \Rclass{GRanges}} {\bf Gaps are ignored}, that is, each alignment is converted into a {\em single} genomic range defined by the {\em start} and {\em end} of the alignment. \begin{exampleblock}{} {\scriptsize <>= as(U1gal, "GRanges") @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{From \Rclass{GAlignments} to \Rclass{GRangesList}} {\bf Gaps are NOT ignored}, that is, each alignment is converted into one or more genomic ranges (one more range than the number of gaps in the alignment). \begin{exampleblock}{} {\small <>= U1grl <- as(U1gal, "GRangesList") U1grl @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{From \Rclass{GAlignments} to \Rclass{GRangesList} (continued)} One more range than the number of gaps in the alignment: \begin{exampleblock}{} {\small <>= all(elementLengths(U1grl) == ngap(U1gal) + 1) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{\Rclass{GAlignmentPairs} objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{The purpose of the \Rclass{GAlignmentPairs} container is...} ... to store a set of aligned {\em paired-end} reads. \begin{block}{} \begin{itemize} \item Implemented on top of the \Rclass{GAlignments} class. \item The alignments can be loaded from a BAM file with \Rcode{readGAlignmentPairs()}. \item \Rcode{first(x)}, \Rcode{last(x)}: extract the {\it first} and {\it last} ends in 2 separate \Rclass{GAlignments} objects of the same length. \end{itemize} \end{block} \setbeamercolor{block title}{bg=green!40} \setbeamercolor{block body}{bg=green!20} \begin{block}{Supported operations} \begin{itemize} \item {\em Vector} operations: \PartiallySupported{} (no splitting/relisting, no comparing or ordering) \item {\em List} operations: \YES{} \item {\em Ranges} operations: \NO{} \item {\em Coercion methods}: to \Rclass{GRanges} or \Rclass{GRangesList} \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Constructor and accessors} \begin{frame}[fragile] \frametitle{\Rcode{readGAlignmentPairs()}} \begin{exampleblock}{} {\small <>= library(pasillaBamSubset) U3galp <- readGAlignmentPairs(untreated3_chr4()) length(U3galp) head(U3galp) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GAlignmentPairs} accessors} \begin{exampleblock}{} {\scriptsize <>= head(first(U3galp)) head(last(U3galp)) @ } \end{exampleblock} Currently, \Rcode{readGAlignmentPairs()} drops pairs where the {\it first} and {\it last} ends have incompatible sequence names and/or strands (a rare situation). \end{frame} \begin{frame}[fragile] \frametitle{\Rclass{GAlignmentPairs} accessors (continued)} \begin{exampleblock}{} {\small <>= seqnames(U3galp) strand(U3galp) head(ngap(U3galp)) table(ngap(U3galp)) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Coercion to \Rclass{GRangesList}} \begin{frame}[fragile] \frametitle{From \Rclass{GAlignmentPairs} to \Rclass{GRangesList}} \begin{exampleblock}{} {\small <>= U3grl <- as(U3galp, "GRangesList") U3grl @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{From \Rclass{GAlignmentPairs} to \Rclass{GRangesList} (continued)} \begin{exampleblock}{} {\scriptsize <>= U3grl[ngap(U3galp) != 0] @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Advanced operations} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Coverage and slicing} \begin{frame}[fragile] \frametitle{Coverage} \begin{exampleblock}{} {\scriptsize <>= U1cvg <- coverage(U1grl) U1cvg @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Coverage (continued)} \begin{exampleblock}{} {\small <>= mean(U1cvg) max(U1cvg) @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{Slicing the coverage} \begin{exampleblock}{} {\small <>= U1sl <- slice(U1cvg, lower=10) U1sl elementLengths(U1sl) head(U1sl$chr4) head(mean(U1sl$chr4)) head(max(U1sl$chr4)) @ } \end{exampleblock} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Finding/counting overlaps} \begin{frame}[fragile] \frametitle{Finding/counting overlaps} \begin{block}{} A typical use case: count the number of {\em hits} (a.k.a. {\em overlaps}) per transcript. \end{block} \begin{block}{} {\bf Typical input} \begin{itemize} \item A BAM file with the aligned reads ({\em single-} or {\em paired-end}). \item Transcript annotations {\em for the same reference genome} that was used to align the reads. \end{itemize} \end{block} \begin{block}{} {\bf Typical tools} \begin{itemize} \item \Rcode{readGAlignments()} or \Rcode{readGAlignmentPairs()} to load the reads in a \Rclass{GAlignments} or \Rclass{GAlignmentPairs} object. \item A \Rclass{TranscriptDb} object containing the transcript annotations. \item The \Rcode{exonsBy()} extractor (defined in the \Rpackage{GenomicFeatures} package) to extract the exons ranges grouped by transcript from the \Rclass{TranscriptDb} object. The exons ranges are returned in a \Rclass{GRangesList} object with 1 top-level element per transcript. \item The \Rcode{findOverlaps()} and/or \Rcode{countOverlaps()} functions. \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Load the transcripts} \begin{exampleblock}{} {\small <>= library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene exbytx <- exonsBy(txdb, by="tx", use.names=TRUE) exbytx @ } \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{{\em Single-end} overlaps} \begin{exampleblock}{} {\small <>= U1txhits <- countOverlaps(exbytx, U1grl) length(U1txhits) head(U1txhits) sum(U1txhits) # total nb of hits head(sort(U1txhits, decreasing=TRUE)) @ } \end{exampleblock} \begin{block}{} Rough counting! \begin{itemize} \item More than 1 alignment per read can be reported in the BAM file (sometimes the same read hits the same transcript many times). \item A hit is counted even if it's not {\em compatible} with the splicing of the transcript. \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{{\em Paired-end} overlaps} \begin{exampleblock}{} {\small <>= U3txhits <- countOverlaps(exbytx, U3grl) length(U3txhits) head(U3txhits) sum(U3txhits) # total nb of hits head(sort(U3txhits, decreasing=TRUE)) @ } \end{exampleblock} Note that exons that fall within the {\it inter-read} gap are NOT considered to overlap. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Resources} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Resources} \begin{block}{} {\bf Resources} \begin{itemize} \item Vignettes in the \Rpackage{GenomicRanges} package (\Rcode{browseVignettes("GenomicRanges")}). \item \Rclass{GRanges}, \Rclass{GRangesList}, \Rclass{GAlignments}, and \Rclass{GAlignmentPairs} man pages in the \Rpackage{GenomicRanges} package. \item SAMtools website: \url{http://samtools.sourceforge.net/} \item \Bioconductor{} mailing lists: \url{http://bioconductor.org/help/mailing-list/} \end{itemize} \end{block} \begin{block}{} {\bf Where to look next} \begin{itemize} \item \Rcode{summarizeOverlaps()} function in the \Rpackage{GenomicRanges} package for counting overlaps between reads and genomic features, and resolve reads that overlap multiple features. \end{itemize} \end{block} \begin{block}{} \centerline{THANKS!} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \end{document}