## ----style, echo = FALSE, results = 'asis'------------------------------- BiocStyle::markdown() options(width=100, max.print=1000) knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE"))) ## ----packages, eval=TRUE, echo=FALSE, warning=FALSE, message=FALSE------- suppressPackageStartupMessages({ library(BiocEMBO2015) }) ## ----echo=TRUE, eval=FALSE----------------------------------------------- ## fname <- file.choose() ## "ALLphenoData.tsv" ## stopifnot(file.exists(fname)) ## pdata <- read.delim(fname) ## ----echo=FALSE---------------------------------------------------------- fname <- "ALLphenoData.tsv" stopifnot(file.exists(fname)) pdata <- read.delim(fname) ## ----ALL-properties------------------------------------------------------ class(pdata) colnames(pdata) dim(pdata) head(pdata) summary(pdata$sex) summary(pdata$cyto.normal) ## ----ALL-subset---------------------------------------------------------- pdata[1:5, 3:4] pdata[1:5, ] head(pdata[, 3:5]) tail(pdata[, 3:5], 3) head(pdata$age) head(pdata$sex) head(pdata[pdata$age > 21,]) ## ----ALL-subset-NA------------------------------------------------------- idx <- pdata$sex == "F" & pdata$age > 40 table(idx) dim(pdata[idx,]) ## ----ALL-BCR/ABL-subset-------------------------------------------------- bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),] ## ----ALL-BCR/ABL-drop-unused--------------------------------------------- bcrabl$mol.biol <- factor(bcrabl$mol.biol) ## ----ALL-BT-------------------------------------------------------------- levels(bcrabl$BT) ## ----ALL-BT-recode------------------------------------------------------- table(bcrabl$BT) levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1) table(bcrabl$BT) ## ----ALL-BCR/ABL-BT------------------------------------------------------ xtabs(~ BT + mol.biol, bcrabl) ## ----ALL-aggregate------------------------------------------------------- aggregate(age ~ mol.biol + sex, bcrabl, mean) ## ----ALL-age------------------------------------------------------------- t.test(age ~ mol.biol, bcrabl) boxplot(age ~ mol.biol, bcrabl) ## ----ShortRead, messages=FALSE------------------------------------------- ## 1. attach ShortRead and BiocParallel library(ShortRead) library(BiocParallel) ## 2. create a vector of file paths ## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/' fls <- dir("bigdata", pattern="*fastq.gz", full=TRUE) stopifnot(all(file.exists(fls))) ## 3. collect statistics stats <- qa(fls) ## 4. generate and browse the report browseURL(report(stats)) ## ----ShortRead-qa-all---------------------------------------------------- ## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/' load("bigdata/qa_all.Rda") browseURL(report(qa_all)) ## ----org----------------------------------------------------------------- library(airway) data(airway) library(org.Hs.eg.db) ensid <- head(rownames(airway)) mapIds(org.Hs.eg.db, ensid, "SYMBOL", "ENSEMBL") keytypes(org.Hs.eg.db) ## ----txdb---------------------------------------------------------------- library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene exons(txdb) exonsBy(txdb, "tx") p <- promoters(txdb) ## ----BSgenome------------------------------------------------------------ library(BSgenome.Hsapiens.UCSC.hg19) bsgenome <- BSgenome.Hsapiens.UCSC.hg19 ps <- getSeq(bsgenome, p) ps hist(letterFrequency(ps, "GC", as.prob=TRUE)) ## ----annotationhub-gtf, eval=FALSE--------------------------------------- ## library(AnnotationHub) ## hub <- AnnotationHub() ## hub ## query(hub, c("Ensembl", "80", "gtf")) ## ## ensgtf = display(hub) # visual choice ## hub["AH47107"] ## gtf <- hub[["AH47107"]] ## gtf ## txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf) ## ----annotationhub-orgdb, eval=FALSE------------------------------------- ## library(AnnotationHub) ## hub <- AnnotationHub() ## query(hub, "OrgDb") ## ----annotationhub-roadmap, eval=FALSE----------------------------------- ## library(AnnotationHub) ## hub <- AnnotationHub() ## query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2")) ## E126 <- hub[["AH29817"]] ## ----annotationhub-liftover, eval=FALSE---------------------------------- ## query(hub , c("hg19", "hg38", "chainfile")) ## chain <- hub[["AH14150"]] ## ----liftover, eval=FALSE------------------------------------------------ ## library(rtracklayer) ## E126hg38 <- liftOver(E126, chain) ## E126hg38 ## ----setup-view, message=FALSE, warning=FALSE---------------------------- ## 1.a 'Annotation' packages library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ## 1.b -- map 'seqlevels' as recorded in the TxDb file to those in the ## BAM file fl <- "~/igv/genomes/hg19_alias.tab" map <- with(read.delim(fl, header=FALSE, stringsAsFactors=FALSE), setNames(V1, V2)) seqlevels(txdb, force=TRUE) <- map ## 2. Symbol -> Entrez ID -> Gene coordinates sym2eg <- mapIds(org.Hs.eg.db, "SPARCL1", "ENTREZID", "SYMBOL") exByGn <- exonsBy(txdb, "gene") sparcl1exons <- exByGn[[sym2eg]] ## 3. Aligned reads library(GenomicAlignments) ## replace 'bigdata' with '/mnt/nfs/practicals/day1/martin_morgan/' fl <- "bigdata/SRR1039508_sorted.bam" sparcl1gene <- range(sparcl1exons) param <- ScanBamParam(which=sparcl1gene) aln <- readGAlignmentPairs(fl, param=param) ## ----compatibleAlignments, warning=FALSE--------------------------------- ## 5.a. exons-by-transcript for our gene of interest txids <- select(txdb, sym2eg, "TXID", "GENEID")$TXID exByTx <- exonsBy(txdb, "tx")[txids] ## 5.b compatible alignments hits <- findCompatibleOverlaps(query=aln, subject=exByTx) good <- seq_along(aln) %in% queryHits(hits) table(good) ## ----coding-sequence, warning=FALSE-------------------------------------- ## reset seqlevels restoreSeqlevels(txdb) ## a. cds coordinates, grouped by transcript txids <- mapIds(txdb, sym2eg, "TXID", "GENEID") cdsByTx <- cdsBy(txdb, "tx")[txids] ## b. coding sequence from relevant reference genome library(BSgenome.Hsapiens.UCSC.hg19) dna <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, cdsByTx) protein <- translate(dna) ## ----biomaRt1, eval=FALSE, results="hide"-------------------------------- ## library(biomaRt) ## head(listMarts(), 3) ## list the marts ## head(listDatasets(useMart("ensembl")), 3) ## mart datasets ## ensembl <- ## fully specified mart ## useMart("ensembl", dataset = "hsapiens_gene_ensembl") ## ## head(listFilters(ensembl), 3) ## filters ## myFilter <- "chromosome_name" ## head(filterOptions(myFilter, ensembl), 3) ## return values ## myValues <- c("21", "22") ## head(listAttributes(ensembl), 3) ## attributes ## myAttributes <- c("ensembl_gene_id","chromosome_name") ## ## ## assemble and query the mart ## res <- getBM(attributes = myAttributes, filters = myFilter, ## values = myValues, mart = ensembl)