## ----loadFiles, eval = .Platform$OS.type == "unix"-----------------------
suppressPackageStartupMessages(library(h5vc)) # h5vc is needed
suppressPackageStartupMessages(library(rhdf5)) # rhdf5 is needed
suppressPackageStartupMessages(library(deepSNV)) # we use deepSNV for tallying
files <- c("NRAS.AML.bam","NRAS.Control.bam")
bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files)


## ----chromDim, eval = .Platform$OS.type == "unix"------------------------
chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets )
colnames(chromdim) <- files
head(chromdim)


## ----hdf5SetUp, eval = .Platform$OS.type == "unix"-----------------------
chrom <- "1"
chromlength <- chromdim[chrom,1]
study <- "/NRAS"
tallyFile <- file.path( tempdir(), "NRAS.tally.hfs5" )
if( file.exists(tallyFile) ){
  file.remove(tallyFile)
}
h5createFile(tallyFile)
group <- paste( study, chrom, sep = "/" )
h5createGroup(tallyFile, study) # create the toplevel group first
h5createGroup(tallyFile, group)
h5createDataset(tallyFile, paste(group, "Counts", sep = "/"), dims=c(12,2,2,chromlength), storage.mode="integer", level=9) #Creating the Counts group for chromosome 1 with 12 bases, 2 samples, 2 strands and 249250621 positions
h5createDataset(tallyFile, paste(group, "Deletions", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Deletions group for chromosome 1 with 2 samples, 2 strands and 249250621 positions
h5createDataset(tallyFile, paste(group, "Coverages", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Coverages group for chromosome 1 with 2 samples, 2 strands and 249250621 positions
h5createDataset(tallyFile, paste(group, "Reference", sep = "/"), dims=c(chromlength), storage.mode="integer", level=9) #Creating the Reference group for chromosome 1 with 249250621 positions
h5ls(tallyFile)


## ----sampleDataSetUp, eval = .Platform$OS.type == "unix"-----------------
sampleData <- data.frame(
  Sample = c("AML","Control"),
  Column = c(1,2),
  Patient = c("Pt1","Pt1"),
  Type = c("Case","Control"),
  stringsAsFactors = FALSE
  )
setSampleData( tallyFile, group, sampleData )
getSampleData( tallyFile, group )


## ----tallyingTheBamFiles, eval = .Platform$OS.type == "unix"-------------
startpos <- 115247090
endpos <- 115259515
Counts <- lapply( bamFiles, function(bamf){
  bam2R( file = bamf, chr = chrom, start = startpos, stop = endpos, head.clip = 10 )#we tell it to ignore the first and last 10 sequencing cycles
})


## ----preparingDatasetsOne, eval = .Platform$OS.type == "unix"------------
Counts[[1]][5000:5010,]
Coverages <- lapply( Counts, function(count) matrix( c(rowSums(count[,c("A","C","G","T","DEL")]), rowSums(count[,c("a","c","g","t","del")])), ncol = 2, byrow = FALSE, dimnames = list(NULL, c("Fwd","Rev") ) ) )
Coverages[[1]][5000:5010,]
Deletions <- lapply( Counts, function(count) count[,c("DEL","del")] )


## ----preparingDatasetsTwo, eval = .Platform$OS.type == "unix"------------
Counts <- lapply(Counts, function(count) count[,c("A","C","G","T","a","c","g","t")] ) # kick out all unnecessary columns
ref <- apply(Counts[[1]][,1:4] + Counts[[1]][5:8] + Counts[[2]][,1:4] + Counts[[2]][5:8], 1, which.max)
for( i in seq(length(ref))){
  Counts[[1]][i,ref[i]] <- 0 #Set the forward strand in the first sample to zero
  Counts[[1]][i,(ref[i] + 4)] <- 0
  Counts[[2]][i,ref[i]] <- 0
  Counts[[2]][i,(ref[i] + 4)] <- 0 #Set the reverse strand in the first sample to zero
}
Reference <- ref - 1 #the tally file encodes the reference as A=0,C=1,G=2,T=3


## ----writingToTallyFile, eval = .Platform$OS.type == "unix"--------------
for( sample in 1:2 ){
  h5write( t(Counts[[sample]][,1:4]), tallyFile, paste( group, "Counts", sep = "/" ), index = list( 5:8, sample, 1, startpos:endpos ) ) #Sample One Forward Strand
  h5write( t(Counts[[sample]][,5:8]), tallyFile, paste( group, "Counts", sep = "/" ), index = list( 5:8, sample, 2, startpos:endpos ) ) #Sample One Reverse Strand

  h5write( Coverages[[sample]][,"Fwd"], tallyFile, paste( group, "Coverages", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample One Forward Strand
  h5write( Coverages[[sample]][,"Rev"], tallyFile, paste( group, "Coverages", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample One Reverse Strand

  h5write( Deletions[[sample]][,"DEL"], tallyFile, paste( group, "Deletions", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample Two Forward Strand
  h5write( Deletions[[sample]][,"del"], tallyFile, paste( group, "Deletions", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample Two Reverse Strand
}
h5write( Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) #The Reference
h5ls(tallyFile)


## ----extractingData, eval = .Platform$OS.type == "unix"------------------
data <- h5dapply(
    filename = tallyFile,
    group = group,
    blocksize = 1e8, 
    range = c(startpos, endpos)
  )[[1]] # we use a blocksize larger than the range to get all data in one block (extracted by [[1]])
str(data)


## ----callingVariants, eval = .Platform$OS.type == "unix"-----------------
vars <- h5dapply(
    filename = tallyFile,
    group = group,
    blocksize = 1000,
    FUN = callVariantsPaired,
    sampledata = getSampleData(tallyFile,group),
    cl = vcConfParams(returnDataPoints=TRUE),
    range = c(startpos, endpos)
  )
vars <- do.call(rbind, vars)
vars


## ----plotting Variant, eval = .Platform$OS.type == "unix"----------------
position <- vars$End[1]
windowsize <- 50
data <- h5dapply(
    filename = tallyFile,
    group = group,
    blocksize = 1e8,
    range = c(position - windowsize, position + windowsize)
  )[[1]]
p <- mismatchPlot( data, getSampleData(tallyFile,group), samples = c("AML","Control"),windowsize=windowsize,position=position )


## ----fig.width=18, fig.height=12, eval = .Platform$OS.type == "unix"-----
print(p)


## ----, eval=FALSE--------------------------------------------------------
## library(h5vc)
## library(rhdf5)
## tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
## library(ShortRead)
## reference <- readDNAStringSet("GRCh37.72.fa")
## names(reference) <- sapply( strsplit( names(reference), " " ), function(x) x[1] )
## 
## regions <- list(
##   "16" = c(29000000,30000000),
##   "22" = c(38000000,40000000)
## )
## 
## for( chrom in names(regions) ){
##   start <- regions[[chrom]][1]
##   end <- regions[[chrom]][2]
##   ref <- Views( reference[[chrom]], start = start, end = end )
##   ds <- encodeDNAString(ref[[1]])
##   h5write( ds, tallyFile, paste( "/ExampleStudy", chrom, "Reference", sep = "/" ), index = list( start:end ) )
## }


## ----insertionsCreate, eval = .Platform$OS.type == "unix"----------------
h5createDataset(tallyFile, paste(group, "Insertions", sep = "/"), dims=c(2,2,chromlength), storage.mode="integer", level=9) #Creating the Deletions group for chromosome 1 with 2 samples, 2 strands and 249250621 positions
h5ls(tallyFile)


## ----insertionsPrepare, eval = .Platform$OS.type == "unix"---------------
Counts <- lapply( bamFiles, function(bamf){
  bam2R( file = bamf, chr = chrom, start = startpos, stop = endpos, head.clip = 10 )#we tell it to ignore the first and last 10 sequencing cycles
})
Insertions <- lapply(Counts, function(count) count[,c("INS","ins")] ) # kick out all unnecessary columns


## ----insertionsWrite, eval = .Platform$OS.type == "unix"-----------------
for( sample in 1:2 ){
  h5write( Insertions[[sample]][,"INS"], tallyFile, paste( group, "Insertions", sep = "/" ), index = list( sample, 1, startpos:endpos ) ) #Sample Two Forward Strand
  h5write( Insertions[[sample]][,"ins"], tallyFile, paste( group, "Insertions", sep = "/" ), index = list( sample, 2, startpos:endpos ) ) #Sample Two Reverse Strand
}
h5write( Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) #The Reference
h5ls(tallyFile)


## ----secondPlot, fig.width=18, fig.height=12, eval = .Platform$OS.type == "unix"----
data <- h5dapply(
    filename = tallyFile,
    group = group,
    blocksize = 1e8,
    range = c(position - windowsize, position + windowsize)
  )[[1]]
p <- mismatchPlot( data, getSampleData(tallyFile,group), samples = c("AML","Control"),windowsize=windowsize,position=position )
p <- p + geom_h5vc(
            data,
            getSampleData(tallyFile,group),
            samples = c("AML","Control"),
            windowsize = windowsize,
            position = position,
            dataset = "Insertions",
            fill = "orange"
            )
print(p)