%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{02 Annotation -- Slides} \documentclass[xcolor=dvipsnames]{beamer} \usepackage{BioconductorSlides} \hypersetup{colorlinks,linkcolor=,urlcolor=Blue} \AtBeginSection[] { \begin{frame}{Outline} \tableofcontents[currentsection] \end{frame} } \begin{document} <>= library(knitr) opts_chunk$set(tidy=FALSE) @ \title{Annotation} \author{Martin Morgan (\href{mailto:mtmorgan@fhcrc.org}{mtmorgan@fhcrc.org}) \\ Fred Hutchinson Cancer Research Center \\ Seattle, WA} \date{3 February 2014} \maketitle \begin{frame}{What is `Annotation'?} \begin{itemize} \item Genes -- classification schemes (e.g., Entrez, Ensembl), pathway membership, \ldots \item Genomes -- reference genomes; exons, transcripts, coding sequence; coding consequences \item System / network biology -- pathways, biochemical reactions, \ldots \end{itemize} Other definitions (not covered here): assigning function to novel sequence assemblies, \ldots \end{frame} \begin{frame}[fragile]{\Bioconductor{} Annotation Resources -- Packages} Model organism annotation packages \begin{itemize} \item \Rpackage{org.*} -- gene names and pathways \item \Rpackage{TxDb.*} -- gene models \item \Rpackage{BSgenome.*} -- whole-genome sequences \end{itemize} \end{frame} \section{Gene and pathway annotations} \begin{frame}[fragile]{\Rpackage{org.*} packages} The `select' interface: \begin{itemize} \item Discovery: \Rfunction{keytypes}, \Rfunction{columns}, \Rfunction{keys} \item Retrieval: \Rfunction{select} \end{itemize} <>= library(org.Hs.eg.db) keytypes(org.Hs.eg.db) columns(org.Hs.eg.db) egid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL") @ \end{frame} \begin{frame}{\Rpackage{org.*} packages -- Useful \R{} commands} Within-\Rclass{vector} or \Rclass{data.frame} \begin{itemize} \item Finding and removing duplicates: \Rfunction{duplicated}, \Rfunction{unique} \item \Rfunction{any}, \Rfunction{all} \end{itemize} Between-\Rclass{vector} or \Rclass{data.frame} \begin{itemize} \item 
Matching: \Rfunction{\%in\%}, \Rfunction{match} \item Set operations: \Rfunction{setdiff}, \Rfunction{union}, \Rfunction{intersect} \item \Rfunction{merge}: join two \Rclass{data.frame}s based on a shared column. \end{itemize} \end{frame} \begin{frame}{\Rpackage{org.*} packages -- Under the hood\ldots} SQL (sqlite) data bases \begin{itemize} \item \Rcode{org.Hs.eg\_dbconn()} to query using the \Rpkg{RSQLite} package \item \Rcode{org.Hs.eg\_dbfile()} to discover location and query outside \R{}. \end{itemize} \end{frame} \section{Genomes and genome coordinates} \begin{frame}[fragile]{\Rpackage{TxDb.*} packages} \begin{itemize} \item Gene models for common model organisms / genome builds / known gene schemes \item Supports the `select' interface (\Rfunction{keytypes}, \Rfunction{columns}, \Rfunction{keys}, \Rfunction{select}) \item `Easy' to build custom packages when gene models exist \end{itemize} Retrieving genomic ranges \begin{itemize} \item \Rfunction{transcripts}, \Rfunction{exons}, \Rfunction{cds} \item \Rfunction{transcriptsBy}, \Rfunction{exonsBy}, \Rfunction{cdsBy} -- group by gene, transcript, etc. 
\end{itemize} <>= library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene cdsByTx <- cdsBy(txdb, "tx") @ \end{frame} \begin{frame}[fragile]{\Rpackage{BSgenome.*} packages} Whole-genome sequences \begin{itemize} \item `Masks' when available, e.g., repeat regions \item Load chromosomes, range-based queries: \Rfunction{getSeq}, \Rfunction{extractTranscriptsFromGenome} \end{itemize} <>= library(BSgenome.Hsapiens.UCSC.hg19) library(GenomicFeatures) dna <- extractTranscriptsFromGenome(Hsapiens, cdsByTx) @ \end{frame} \section{Web resources} \begin{frame}[fragile]{\Bioconductor{} Annotation Resources -- Web-based} Rich web resources \begin{itemize} \item \Biocpkg{biomaRt} (\url{http://biomart.org}), \Biocpkg{rtracklayer} (UCSC genome browser) \item \Biocpkg{ArrayExpress}, \Biocpkg{GEOquery}, \Biocpkg{SRAdb} \item \Biocpkg{PSICQUIC}, \Biocpkg{KEGGREST}, \Biocpkg{uniprot.ws}, \ldots \item \Biocpkg{AnnotationHub} \end{itemize} \end{frame} \begin{frame}[fragile]{\Biocpkg{biomaRt}} \begin{itemize} \item \url{http://biomart.org} \item Drill-down discovery: \Rfunction{listMarts}, \Rfunction{listDatasets}, \Rfunction{listFilters}, \Rfunction{listAttributes} \item Retrieval: \Rfunction{getBM} \end{itemize} <>= library(biomaRt) ensembl <- ## discover & use useMart("ensembl", dataset="hsapiens_gene_ensembl") head(listFilters(ensembl), 3) myFilter <- "chromosome_name" myValues <- c("21", "22") myAttributes <- c("ensembl_gene_id","chromosome_name") res <- getBM(attributes=myAttributes, filters=myFilter, values=myValues, mart=ensembl) @ \end{frame} \begin{frame}[fragile]{\Biocpkg{PSICQUIC}} \begin{itemize} \item \textbf{P}roteomics \textbf{S}tandard \textbf{I}nitiative \textbf{C}ommon \textbf{QU}ery \textbf{I}nterfa\textbf{C}e \item Programmatic access to molecular interaction data bases. 
\item \url{https://code.google.com/p/psicquic/} \end{itemize} <>= library(PSICQUIC) ## Query web service for available providers psicquic <- PSICQUIC() providers(psicquic) # 25 available providers ## interactions between TP53 and MYC tbl <- interactions(psicquic, c("TP53", "MYC"), "9606") nrow(tbl) # 7 interactions @ %% See the package \href{http://bioconductor.org/packages/release/bioc/vignettes/PSICQUIC/inst/doc/PSICQUIC.pdf}{vignette} for additional detail. \end{frame} \begin{frame}[fragile]{\Biocpkg{AnnotationHub}} \begin{itemize} \item Large-scale genome resources, lightly curated for easy access from \R. \item Supports tab-completion, \Rfunction{metadata} discovery, selection and filtering. \end{itemize} <>= library(AnnotationHub) hub <- AnnotationHub() hub ## 10511 resources @ \end{frame} \section{Conclusions} \begin{frame}{Conclusions} Rich annotation resources \begin{itemize} \item Model organism and custom \Rpackage{org.*}, \Rpackage{TxDb.*}, \Rpackage{BSgenome.*} packages \item Web-based access to public (e.g., \Biocpkg{biomaRt}) and \Bioconductor-specific (e.g., \Biocpkg{AnnotationHub}) resources \end{itemize} \end{frame} \end{document}