%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{08 Working with Called Variants -- Slides}

\documentclass[xcolor=dvipsnames]{beamer}
\usepackage{BioconductorSlides}
\hypersetup{colorlinks,linkcolor=,urlcolor=Blue}
\AtBeginSection[]
{
  \begin{frame}{Outline}
    \tableofcontents[currentsection]
  \end{frame}
}

\begin{document}

<<setup, echo=FALSE>>=
library(knitr)
opts_chunk$set(tidy=FALSE)
@

\title{Variants}
\author{Martin Morgan
  (\href{mailto:mtmorgan@fhcrc.org}{mtmorgan@fhcrc.org}) \\
  Fred Hutchinson Cancer Research Center \\
  Seattle, WA}
\date{4 February 2014}

\maketitle

\section*{Work flows}

\begin{frame}{Work flows}
  \begin{enumerate}
  \item Experimental design -- tumor / normal pairs; cell lines; \ldots
  \item Sequencing -- DNA or Exome
  \item Alignment \& other pre-processing steps
  \item Variant discovery \& preliminary analysis
  \item \textbf{Variant evaluation, annotation, biological and
      experimental context}
  \end{enumerate}
\end{frame}

\begin{frame}
  \includegraphics[width=\textwidth]{figures/Broad_Variants_Best_Practices_workflow.png}
\end{frame}

\section*{Variant Call Format (VCF) files}

\begin{frame}{Variant Call Format (VCF) files}
  \begin{itemize}
  \item \href{https://github.com/samtools/hts-specs}{Specification}
  \item Header documenting file content
  \item CHROMosome, POSition, IDentifier of each variant
  \item REFerence and ALTernate allele sequence.
  \item INFOrmation on variants
  \item FORMAT of sample genotype information, followed by each
    genotype
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{VCF content: location}
\begin{verbatim}
#CHROM POS     ID        REF ALT    QUAL FILTER ...
20     14370   rs6054257 G   A      29   PASS   ...
20     17330   .         T   A      3    q10    ...
20     1110696 rs6040355 A   G,T    67   PASS   ...
20     1230237 .         T   .      47   PASS   ...
20     1234567 microsat1 GTC G,GTCT 50   PASS   ...
\end{verbatim}
  Lines: good SNP, poor quality SNP, multiple variants, called
  monomorphic, indel
\end{frame}

\begin{frame}[fragile]{VCF content: variant INFO}
\begin{verbatim}
#CHROM POS     ... INFO                              ...
20     14370   ... NS=3;DP=14;AF=0.5;DB;H2           ...
20     17330   ... NS=3;DP=11;AF=0.017               ...
20     1110696 ... NS=2;DP=10;AF=0.333,0.667;AA=T;DB ...
20     1230237 ... NS=3;DP=13;AA=T                   ...
20     1234567 ... NS=3;DP=9;AA=G                    ...
\end{verbatim}
  Information supporting the SNP: NS, \# samples with data; DP, total
  depth; AF, allele frequency; AA, ancestral allele; DB, dbSNP
  membership; H2, HapMap 2 membership.
\end{frame}

\begin{frame}[fragile]{VCF content: Genotype FORMAT and samples}
\begin{verbatim}
... POS     ... FORMAT      NA00001        NA00002        NA00003
... 14370   ... GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
... 17330   ... GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3   0/0:41:3
... 1110696 ... GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4
... 1230237 ... GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
... 1234567 ... GT:GQ:DP    0/1:35:4       0/2:17:2       1/1:40:3
\end{verbatim}
  Genotype information in three samples. FORMAT specifies the order
  and type of information: GT, Genotype, `\textbf{|}' phased, vs.\
  `\textbf{/}' unphased; GQ, quality; DP, read depth; HQ, haplotype
  quality.
\end{frame}

\begin{frame}[fragile]{VCF Header}
\begin{small}
\begin{verbatim}
##fileformat=VCFv4.2
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,...>
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
...
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
...
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
\end{verbatim}
\end{small}
\end{frame}

\begin{frame}{VCF files}
  \begin{itemize}
  \item Very complicated data.
  \item Content of INFO, FORMAT fields very flexible, depends entirely
    on up-stream processing.
  \item Often interested in only part of the file -- specific genomic
    ranges, INFO or FORMAT fields, samples.
  \end{itemize}
\end{frame}

\section*{VariantAnnotation}

\end{document}