## ----loadFiles, eval = .Platform$OS.type == "unix"----------------------- suppressPackageStartupMessages(library(h5vc)) # h5vc is needed suppressPackageStartupMessages(library(rhdf5)) # rhdf5 is needed suppressPackageStartupMessages(library(deepSNV)) # we use deepSNV for tallying files <- c("NRAS.AML.bam","NRAS.Control.bam") bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files) ## ----chromDim, eval = .Platform$OS.type == "unix"------------------------ chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets ) colnames(chromdim) <- files head(chromdim) ## ----hdf5SetUp, eval = .Platform$OS.type == "unix"----------------------- chrom <- "1" chromlength <- chromdim[chrom,1] study <- "/NRAS" tallyFile <- file.path( tempdir(), "NRAS.tally.hfs5" ) if( file.exists(tallyFile) ){ file.remove(tallyFile) } h5createFile(tallyFile) group <- paste( study, chrom, sep = "/" ) h5createGroup(tallyFile, study) # create the toplevel group first h5createGroup(tallyFile, group) h5createDataset(tallyFile, paste(group, "Counts", sep = "/"), dims=c(12,2,2,chromlength), storage.mode="integer", level=9) #Creating the Counts group for chromosome 1 with 12 bases, 2 samples, 2 strands and 249250621 positions h5createDataset(tallyFile, paste(group, "Deletions", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Deletions group for chromosome 1 with 2 samples, 2 strands and 249250621 positions h5createDataset(tallyFile, paste(group, "Coverages", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Coverages group for chromosome 1 with 2 samples, 2 strands and 249250621 positions h5createDataset(tallyFile, paste(group, "Reference", sep = "/"), dims=c(chromlength), storage.mode="integer", level=9) #Creating the Reference group for chromosome 1 with 249250621 positions h5ls(tallyFile) ## ----sampleDataSetUp, eval = .Platform$OS.type == "unix"----------------- sampleData <- data.frame( Sample = c("AML","Control"), Column = c(1,2), Patient = c("Pt1","Pt1"), Type = c("Case","Control"), stringsAsFactors = FALSE ) setSampleData( tallyFile, group, sampleData ) getSampleData( tallyFile, group ) ## ----tallyingTheBamFiles, eval = .Platform$OS.type == "unix"------------- startpos <- 115247090 endpos <- 115259515 Counts <- lapply( bamFiles, function(bamf){ bam2R( file = bamf, chr = chrom, start = startpos, stop = endpos, head.clip = 10 )#we tell it to ignore the first and last 10 sequencing cycles }) ## ----preparingDatasetsOne, eval = .Platform$OS.type == "unix"------------ Counts[[1]][5000:5010,] Coverages <- lapply( Counts, function(count) matrix( c(rowSums(count[,c("A","C","G","T","DEL")]), rowSums(count[,c("a","c","g","t","del")])), ncol = 2, byrow = FALSE, dimnames = list(NULL, c("Fwd","Rev") ) ) ) Coverages[[1]][5000:5010,] Deletions <- lapply( Counts, function(count) count[,c("DEL","del")] ) ## ----preparingDatasetsTwo, eval = .Platform$OS.type == "unix"------------ Counts <- lapply(Counts, function(count) count[,c("A","C","G","T","a","c","g","t")] ) # kick out all unnecessary columns ref <- apply(Counts[[1]][,1:4] + Counts[[1]][5:8] + Counts[[2]][,1:4] + Counts[[2]][5:8], 1, which.max) for( i in seq(length(ref))){ Counts[[1]][i,ref[i]] <- 0 #Set the forward strand in the first sample to zero Counts[[1]][i,(ref[i] + 4)] <- 0 Counts[[2]][i,ref[i]] <- 0 Counts[[2]][i,(ref[i] + 4)] <- 0 #Set the reverse strand in the first sample to zero } Reference <- ref - 1 #the tally file encodes the reference as A=0,C=1,G=2,T=3 ## ----writingToTallyFile, eval = .Platform$OS.type == "unix"-------------- for( sample in 1:2 ){ h5write( t(Counts[[sample]][,1:4]), tallyFile, paste( group, "Counts", sep = "/" ), index = list( 5:8, sample, 1, startpos:endpos ) ) #Sample One Forward Strand h5write( t(Counts[[sample]][,5:8]), tallyFile, paste( group, "Counts", sep = "/" ), index = list( 5:8, sample, 2, startpos:endpos ) ) #Sample One Reverse Strand h5write( Coverages[[sample]][,"Fwd"], tallyFile, paste( group, "Coverages", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample One Forward Strand h5write( Coverages[[sample]][,"Rev"], tallyFile, paste( group, "Coverages", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample One Reverse Strand h5write( Deletions[[sample]][,"DEL"], tallyFile, paste( group, "Deletions", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample Two Forward Strand h5write( Deletions[[sample]][,"del"], tallyFile, paste( group, "Deletions", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample Two Reverse Strand } h5write( Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) #The Reference h5ls(tallyFile) ## ----extractingData, eval = .Platform$OS.type == "unix"------------------ data <- h5dapply( filename = tallyFile, group = group, blocksize = 1e8, range = c(startpos, endpos) )[[1]] # we use a blocksize larger than the range to get all data in one block (extracted by [[1]]) str(data) ## ----callingVariants, eval = .Platform$OS.type == "unix"----------------- vars <- h5dapply( filename = tallyFile, group = group, blocksize = 1000, FUN = callVariantsPaired, sampledata = getSampleData(tallyFile,group), cl = vcConfParams(returnDataPoints=TRUE), range = c(startpos, endpos) ) vars <- do.call(rbind, vars) vars ## ----plotting Variant, eval = .Platform$OS.type == "unix"---------------- position <- vars$End[1] windowsize <- 50 data <- h5dapply( filename = tallyFile, group = group, blocksize = 1e8, range = c(position - windowsize, position + windowsize) )[[1]] p <- mismatchPlot( data, getSampleData(tallyFile,group), samples = c("AML","Control"),windowsize=windowsize,position=position ) ## ----fig.width=18, fig.height=12, eval = .Platform$OS.type == "unix"----- print(p) ## ----, eval=FALSE-------------------------------------------------------- ## library(h5vc) ## library(rhdf5) ## tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" ) ## library(ShortRead) ## reference <- readDNAStringSet("GRCh37.72.fa") ## names(reference) <- sapply( strsplit( names(reference), " " ), function(x) x[1] ) ## ## regions <- list( ## "16" = c(29000000,30000000), ## "22" = c(38000000,40000000) ## ) ## ## for( chrom in names(regions) ){ ## start <- regions[[chrom]][1] ## end <- regions[[chrom]][2] ## ref <- Views( reference[[chrom]], start = start, end = end ) ## ds <- encodeDNAString(ref[[1]]) ## h5write( ds, tallyFile, paste( "/ExampleStudy", chrom, "Reference", sep = "/" ), index = list( start:end ) ) ## } ## ----insertionsCreate, eval = .Platform$OS.type == "unix"---------------- h5createDataset(tallyFile, paste(group, "Insertions", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Deletions group for chromosome 1 with 2 samples, 2 strands and 249250621 positions h5ls(tallyFile) ## ----insertionsPrepare, eval = .Platform$OS.type == "unix"--------------- Counts <- lapply( bamFiles, function(bamf){ bam2R( file = bamf, chr = chrom, start = startpos, stop = endpos, head.clip = 10 )#we tell it to ignore the first and last 10 sequencing cycles }) Insertions <- lapply(Counts, function(count) count[,c("INS","ins")] ) # kick out all unnecessary columns ## ----insertionsWrite, eval = .Platform$OS.type == "unix"----------------- for( sample in 1:2 ){ h5write( Insertions[[sample]][,"INS"], tallyFile, paste( group, "Insertions", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample Two Forward Strand h5write( Insertions[[sample]][,"ins"], tallyFile, paste( group, "Insertions", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample Two Reverse Strand } h5write( Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) #The Reference h5ls(tallyFile) ## ----secondPlot, fig.width=18, fig.height=12, eval = .Platform$OS.type == "unix"---- data <- h5dapply( filename = tallyFile, group = group, blocksize = 1e8, range = c(position - windowsize, position + windowsize) )[[1]] p <- mismatchPlot( data, getSampleData(tallyFile,group), samples = c("AML","Control"),windowsize=windowsize,position=position ) p <- p + geom_h5vc( data, getSampleData(tallyFile,group), samples = c("AML","Control"), windowsize = windowsize, position = position, dataset = "Insertions", fill = "orange" ) print(p)