library(ShortRead)
library(VariantAnnotation)
library(parallel); options(mc.cores=detectCores())
library(ggplot2)
library(RNAseqData.HNRNPC.bam.chr14)
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(AnnotationHub)
library(rtracklayer)

\title{Practical: Annotation}
\author{Martin Morgan (mtmorgan@fhcrc.org)}
\date{27-28 February, 2014}

\newcommand{\Hsap}{\emph{H.~sapiens}}
\newcommand{\Dmel}{\emph{D.~melanogaster}}

\section{Gene annotation}

\subsection{Data packages}

Organism-level (`org') packages contain mappings between a central identifier (e.g., Entrez gene ids) and other identifiers (e.g., GenBank or Uniprot accession number, RefSeq id, etc.). The name of an org package is always of the form \texttt{org.<Sp>.<id>.db} (e.g., \Biocannopkg{org.Sc.sgd.db}) where \texttt{<Sp>} is a 2-letter abbreviation of the organism (e.g., \texttt{Sc} for \emph{Saccharomyces cerevisiae}) and \texttt{<id>} is an abbreviation (in lower-case) describing the type of central identifier (e.g., \texttt{sgd} for gene identifiers assigned by the \emph{Saccharomyces} Genome Database, or \texttt{eg} for Entrez gene ids).

The ``How to use the `.db' annotation packages'' vignette in the \Biocpkg{AnnotationDbi} package (org packages are only one type of ``.db'' annotation packages) is a key reference. The `.db' and most other \Bioconductor{} annotation packages are updated every 6 months.

Annotation packages usually contain an object named after the package itself. These objects are collectively called \Rclass{AnnotationDb} objects, with more specific classes named \Rclass{OrgDb}, \Rclass{ChipDb} or \Rclass{TranscriptDb} objects. Methods that can be applied to these objects include \Rfunction{columns}, \Rfunction{keys}, \Rfunction{keytypes} and \Rfunction{select}. Common operations for retrieving annotations are summarized in Table~\ref{tab:select-ops}.

\begin{table}
  \centering
  \caption{Common operations for retrieving and manipulating annotations.}
  \label{tab:select-ops}
  \begin{tabular}{lll}
    Category & Function & Description \\
    \hline\noalign{\smallskip}
    Discover
      & \Rfunction{columns} & List the kinds of columns that can be returned \\
      & \Rfunction{keytypes} & List columns that can be used as keys \\
      & \Rfunction{keys} & List values that can be expected for a given keytype \\
      & \Rfunction{select} & Retrieve annotations matching \Rcode{keys}, \Rcode{keytype} and \Rcode{columns} \\
    Manipulate
      & \Rfunction{setdiff}, \Rfunction{union}, \Rfunction{intersect} & Operations on sets \\
      & \Rfunction{duplicated}, \Rfunction{unique} & Mark or remove duplicates \\
      & \Rfunction{\%in\%}, \Rfunction{match} & Find matches \\
      & \Rfunction{any}, \Rfunction{all} & Are any \Rcode{TRUE}? Are all? \\
      & \Rfunction{merge} & Combine two different \Robject{data.frames} based on shared keys \\
    \Rclass{GRanges*}
      & \Rfunction{transcripts}, \Rfunction{exons}, \Rfunction{cds} & Features (transcripts, exons, coding sequence) as \Rclass{GRanges}. \\
      & \Rfunction{transcriptsBy}, \Rfunction{exonsBy} & Features grouped by gene, transcript, etc., as \Rclass{GRangesList}.\\
      & \Rfunction{cdsBy}\\
    \hline
  \end{tabular}
\end{table}

\begin{Exercise}
  This exercise illustrates basic use of the `select' interface to
  annotation packages.
  \begin{enumerate}
  \item What is the name of the org package for \emph{Homo sapiens}?
    Load it. Display the \Rclass{OrgDb} object for the
    \Biocpkg{org.Hs.eg.db} package. Use the \Rfunction{columns} method
    to discover which sorts of annotations can be extracted from it.
  \item Use the \Rfunction{keys} method to extract ENSEMBL identifiers
    and then pass those keys in to the \Rfunction{select} method in
    such a way that you extract the SYMBOL (gene symbol) and GENENAME
    information for each. Use the following ENSEMBL ids.
  \end{enumerate}
<<>>=
ensids <- c("ENSG00000130720", "ENSG00000103257", "ENSG00000156414",
            "ENSG00000144644", "ENSG00000159307", "ENSG00000144485")
@
\end{Exercise}

\begin{Solution}
  The \Rclass{OrgDb} object is named \Rcode{org.Hs.eg.db}.
<<