<!--
%\VignetteEngine{knitr}
%\VignetteIndexEntry{Advanced analyses}
-->
Advanced Analyses
=================

Prepare the environment
-----------------------
Firstly we need to make sure the tallyFile has been downloaded correctly and we load the sample data. See the vignette *h5vc* for more in-depth explanations of this step.

```{r}
library(h5vc)
library(rhdf5)
library(GenomicRanges)
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
```

Calling Variants
----------------

We use the `callVariantsPaired` function to perform variant calling in a cohort paired tumour/normal samples.
This function has the following signature:
```{r, eval = FALSE, tidy = FALSE}
callVariantsPaired <- function (data, sampledata, cl = vcConfParams())
```

Where `vcConfParams()` is a function used to configure the behaviour of the calling function, it returns a list of paramters.
```{r}
vcConfParams()
```

```{r, tidy=FALSE}
calls16 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(29950000,30000000)
  )

calls22 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/22",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(39350000,39400000)
  )

calls <- rbind(do.call(rbind,calls16), do.call(rbind,calls22))
```

Plotting Variants
-----------------

We can use the `mismatchPlot` function to create plots of the suspected variant regions.
First we extract a set of interesting positions:
```{r}
calls$Patient = sampleData$Patient[ match( calls$Sample, sampleData$Sample) ]
calls <- subset( calls, pValueFwd < 0.05 & pValueRev < 0.05 )
calls$varID = paste( calls$seqnames, calls$start, calls$altAllele, calls$Patient )
calls <- calls[!duplicated(calls$varID),]
calls
```
The remaining two calls have good significance (`binom.test` against the estimated background error rate, i.e. the mean of the mismatch rates of all control samples in the cohort) and we filter out double entries that come from the same patient. You will see in the plots below, that patient 5 has the variant in both the primary and the relapse tumour.
Now we can generate the mismatch plots.

```{r, fig.width=18, fig.height=10}
windowsize <- 35
for( i in seq(nrow(calls))){
  position <- calls$Start[i]
  data <- h5dapply(
      filename = tallyFile,
      group = paste( "/ExampleStudy", calls$Chrom[i], sep="/" ),
      blocksize = windowsize * 3,
      names = c("Coverages","Counts","Deletions"),
      range = c( position - windowsize, position + windowsize)
    )[[1]]
  p <- mismatchPlot(
    data = data,
    sampledata = sampleData,
    samples = sampleData$Sample[sampleData$Patient == calls$Patient[i]],
    windowsize = windowsize,
    position = position
    )
  print(p)
}
```

Plotting Arbitrary Datasets
---------------------------

Sometimes you want more flexibility that what `mismatchPlot` can provide, for this case the `geom_h5vc` function can be used to create an independed `ggplot2` plot layer from a dataset that can be added to any `ggplot` object, e.g. an existing `mismatchPlot` or an empty plot.
The example below demonstrates this.

```{r, fig.width=18, fig.height=10}
library(h5vc)
library(ggplot2)
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
position <- 29979629
windowsize <- 30
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
data <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  blocksize = windowsize * 3,
  names = c("Coverages", "Counts", "Deletions"),
  range = c(position - windowsize, position + windowsize)
)[[1]] #choose blocksize larger than range so that all needed data is collected as one block
# Summing up all mismatches irrespective of the alternative allele
data$CountsAggregate = colSums(data$Counts)
# Simple overview plot showing number of mismatches per position
p <- ggplot() + 
geom_h5vc( data=data, sampledata=sampleData, windowsize = 35, position = 500, dataset = "Coverages", fill = "gray" ) + 
geom_h5vc( data=data, sampledata=sampleData, windowsize = 35, position = 500, dataset = "CountsAggregate", fill = "#D50000" ) + 
facet_wrap( ~ Sample, ncol = 2 )
print(p)
```

Mutation Spectrum Analysis
--------------------------

We can also easily perform mutation spectrum analysis by using the function `mutationSpectrum` which works on a set of variant calls in a `data.frame` form as it would be produced by a call to e.g. `callVariantsPaired` and a tallyFile parameter specifying hte location of a tally file as well as a context parameter.
The context parameter specifies how much sequence context should be taken into account for the mutation spectrum. An example with context 1 (i.e. one base up- and one base downstream of the variant) is shown below.

We skip the calling of variants (since ittakes al ong time on larger regions) and load the corresponding example data instead, we would generate the calls like this:
```{r variantCallingForMutationSpectrum, eval=FALSE}
# loading library and example data
library(h5vc)
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
windowsize <- 100000
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
# Calling Variants
vars <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams(returnDataPoints=TRUE),
  names = c("Coverages", "Counts", "Reference"),
  range = c(29000000, 30000000),
  verbose=TRUE
)

variantCalls <- do.call(rbind, vars)

vars <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/22",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams(returnDataPoints=TRUE),
  names = c("Coverages", "Counts", "Reference"),
  range = c(38000000, 40000000),
  verbose=TRUE
)

variantCalls <- rbind( variantCalls, do.call(rbind, vars) )
rownames(variantCalls) <- NULL

save(variantCalls, file = "example.variants.RDa")
```

The mutation spectrum can now be extracted from the tallyFile and variant calls like so:
```{r}
data("example.variants", package="h5vcData")
ms <- mutationSpectrum( variantCalls, tallyFile, "/ExampleStudy" )
head(ms)
```

An overview is given in the following plot:
```{r mutationSpectrumPlot, fig.width=18, fig.height=10}
suppressPackageStartupMessages(library(ggplot2))
p <- ggplot( ms, aes( x = paste( Prefix, Suffix, sep = "." ), fill = Transition) ) + 
  geom_bar(width = 0.5) + 
  facet_grid( Sample ~ Transition) +
  scale_y_discrete(name="Number of Events") +
  scale_x_discrete(name="Mutation Context") +
  theme(
    axis.text = element_text( size = 14 ),
    axis.title = element_text( size = 16 ),
    axis.text.x = element_text( angle = 90, hjust = 0 ),
    strip.text = element_text(size = 14),
    legend.position="none"
    )
print(p)
```
The y axis shows the number of this kind of mutation and the x-axis shows the sequence context (of size `1` in this case). At this point one could go on to perform a NMF-based analysis of the mutation spectra as is described in *Alexandrov et. al.* - [Signatures of mutational processes in human cancer](http://www.nature.com/nature/journal/vaop/ncurrent/full/nature12477.html)