## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 65), tidy = TRUE, 
                      echo = TRUE, message = FALSE, warning = FALSE)

## ----diagramaCrop, fig.cap="Workflow diagram of the package ASURI showing the decision-making lines and the five main functions.", out.width = "100%", fig.align = "center", echo=FALSE, results="asis"----
knitr::include_graphics("figures/diagramaCrop.png")

## ----eval = FALSE-------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# 
# BiocManager::install("asuri")

## ----echo = TRUE, eval = TRUE-------------------------------------------------
library(asuri); library(SummarizedExperiment)
data(seBRCA)
# To extract the normalized expression
mExprs <- assay(seBRCA)
mPheno <- colData(seBRCA)

## ----echo = TRUE, eval = TRUE-------------------------------------------------
# To visualize the dimensions of the two extracted objects
dim(mExprs)
dim(mPheno)

## ----echo = TRUE, eval = TRUE, fig.show = "hide", results="asis"--------------
data(seBRCA)
time <- "time"
status <- "status"
geneName <- "ESR1"
# The TIME value must be transformed to YEARS
# The gene expression vector must be provided with the NAMES of each sample,
# that should match the time and status NAMES.
set.seed(5)
outputKM <- geneSurv(seBRCA, time, status, geneName, type = "exprs", verbose = FALSE)

## ----pclassesr1, fig.cap="Patient Class Probability for low exprs (89) and high exprs (111) groups).", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotProbClass(outputKM)

## ----boxesr1, fig.cap="Boxplot: high expression vs low expression for ESR1 gene.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis", results="asis"----
plotBoxplot(outputKM)

## ----kmesr1, fig.cap="Kaplan Meier curves obtained after the assignment of the patients by bootstrap to high or low survival using the selected feature ESR1.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotKM(outputKM)

## ----echo = TRUE, eval = TRUE, fig.show = "hide"------------------------------
# If we instead consider to run the function as *type* = risk
geneName <- "BRCA1"
set.seed(5)
outputKM.TP53 <- geneSurv(seBRCA, time, status, geneName, type = "risk", verbose = FALSE)

## ----kmrisk, fig.cap="Kaplan Meier curves obtained after the assignment of the patients by bootstrap to high or low survival using the estimated risk based on feature BRCA1.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotKM(outputKM.TP53)

## ----echo = TRUE, eval = TRUE-------------------------------------------------
library(SummarizedExperiment)
data(seBRCA)

# Bootstrapped differential expression based on SAM.
# Parameters: FDR = 0.05, iter = 100, percentage filter = 80 %
# CAUTION: if the data have a high number of genes this function will take several minutes to compute.

groupsVector <- colData(seBRCA)$ER.IHC
  
set.seed(5)
DE_list_genes <- prefilterSAM(seBRCA, groupsVector, verbose = FALSE)

## ----echo = TRUE, eval = TRUE-------------------------------------------------
data(ex_prefilterSAM)
DE_list_genes <- ex_prefilterSAM
vectorSampleID <- as.character(rownames(colData(seBRCA)))
vectorGroups <- as.numeric(colData(seBRCA)$ER.IHC)
Pred_ER.IHC <- genePheno(seBRCA, DE_list_genes, vectorGroups, vectorSampleID, 
                         verbose = FALSE)

# Pred_ER.IHC is an output object with the list of genes that show a significant correlation with the clinical variable. Since a bootstrap is performed, the results of how many times across iterations a gene is found significant are reported as *stability* (in relative numbers 0-1, 1=100%) and the *beta values* from the regression across iterations are also provided as *betaMedian* and *betaMean* :

Pred_ER.IHC$stability
Pred_ER.IHC$betasMedian
Pred_ER.IHC$betasMean

## ----echo = TRUE, eval = TRUE-------------------------------------------------
knitr::kable(head(Pred_ER.IHC$betasTable, 10), 
             caption = "List of genes by Stability.")

## ----echo = TRUE, eval = TRUE-------------------------------------------------
names(Pred_ER.IHC)
# [1] "genes" "listCoeff" "stability" "betasMedian" "betasMean" "betasTable"

## ----echo = TRUE, eval = TRUE, fig.show = "hide"------------------------------
# Survival times should be provided in YEARS
time <- 'time'
status <- 'status'
# Pred_ER.IHC$genes is the subset of genes to be tested. In our case study,
# it is the list of genes related to the ER clinical variable that was
# obtained using the function **genePheno()**.
geneList <- names(Pred_ER.IHC$genes)
# Training of the multivariate COX model. Provide the SummarizedExperiment 
# object with the normalized expression data and the phenotypic data in colData,
# the time and the status vectors, and the method to stratify the patients
# (select one of these methods: `min.pval`, `med.pval`, `class.probs`).
set.seed(5)
multivariate_risk_predictor <- patientRisk(seBRCA, geneList, time, status, method = "class.probs")

## ----plot1risk, fig.cap="Curve of log-rank p-values for each possible splitting of patients into 2 risk groups. The X axis represents the patient number ranked from low to high risk. The green line marks the selected cutpoint, i.e., the point that provides the threshold for stratifying patients into 2 risk groups: low risk, in blue; high risk, in red. The blue & red lines define thresholds for classifying patients into 3 risk groups: *low*, *intermediate* and *high*. These lines mark samples that fall in an *intermediate* region, where risk assignment is not very precise and samples may fluctuate in position.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotLogRank(multivariate_risk_predictor)

## ----plot2risk, fig.cap="Risk Score, between 0 and 100, calculated for each patient, i.e., for each of the 200 patient tumor samples included in this study, obtained with a multivariate UNICOX regression model. The model includes as features the subset of selected genes included in the *geneList*, which are tested to find out their correlation with risk. The green and blue & red lines provide the thresholds estimated by the **patientRisk()** function to split the patients respectively in 2 or 3 groups, as described in the legend of the previous figure. Note that the implemented algorithm is capable of detecting changes in the slope of the risk curve, giving rise to risk groups of differenciated prognosis.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotSigmoid(multivariate_risk_predictor)

## ----plot3risk, fig.cap="Log-rank p-values of the minimum optimal versus the number of genes selected by the model. As the regularization parameter in UNICOX increases, the regression model becomes more especific in the selection of features and, therefore the number of genes that have non-zero (non-null) coefficients is reduced.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotLambda(multivariate_risk_predictor)

## ----plot4risk, fig.cap="Plot of the genes selected by the multivariate COX model, ranked by the value of the regression coefficients in the model. Genes with *positive* values (red) increase the risk in the samples. While genes with *negative* values (blue) reduce the risk in the samples. Genes with significant p-values are marked with asterisks (** p-value<0.01, * p-value<0.05). Similarly, genes with slightly significance level are marked with a dot (\u00B7 between 0.1 and 0.05). All the analyses carried out in this part are multivariate and take into account additive interactions among the genes.", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotBetas(multivariate_risk_predictor)

## ----echo = TRUE, eval = TRUE-------------------------------------------------
knitr::kable(multivariate_risk_predictor$betasplot, 
             caption = "List of genes by relevance for risk assignement.")

## ----plot5risk, fig.cap="Kaplan Meier plot: Blue (low risk) and red (high risk) of the two groups of samples with different survival obtained from the stratification of the multivariate risk curve (Figure 6).", out.width = "80%", fig.align = "center", echo=FALSE, results="asis"----
plotKM(multivariate_risk_predictor)

## ----echo = TRUE, eval = TRUE, fig.align = "center", fig.cap="Risk score prediction for an independent set of patients using the optimal multivariate COX model trained by function patientRisk(). Green and blue/red lines correspond to the optimal thresholds that split the patients in two and three groups respectively. These thresholds were learned in training phase.", results="asis", fig.height=10, fig.width=10,  out.width = "80%"----
# Generate the validation set, mExprs_testData if necessary.
# Vector of genes (same ones used in Cox model training)

# Simulate expression data
num_samples <- 20
set.seed(5)
mExprs_testData <- matrix(rnorm(length(geneList) * num_samples, mean = 10, sd = 3),
                          nrow = length(geneList), ncol = num_samples)

# Assign row names (genes) and column names (samples)
rownames(mExprs_testData) <- geneList
colnames(mExprs_testData) <- paste0("Sample", 1:num_samples)

set.seed(5)
risk_prediction_validation_set <- predict_PatientRisk(multivariate_risk_predictor, mExprs_testData)

## ----echo = TRUE, eval = TRUE, fig.align = "center", results="asis", fig.height=10, fig.width=10, out.width = "80%"----
# Example for single patient prediction: Patient fourth is selected.
mExprs_testSingleData <- data.frame(mExprs_testData[, 4])
colnames(mExprs_testSingleData) <- colnames(mExprs_testData)[4]
# Risk prediction for the optimal subset of genes selected by patientRisk function
set.seed(5)
risk_prediction_one_patient <- predict_PatientRisk(multivariate_risk_predictor, mExprs_testSingleData)
risk_prediction_one_patient$risk_score

## ----echo = TRUE, eval = TRUE, fig.show = "hide"------------------------------
data(seBRCA)
# COX prediction for the training set
mPheno <- colData(seBRCA)
mExprs <- assay(seBRCA)

set.seed(5)
mExprSelectedGenes <- mExprs[rownames(mExprs) %in% geneList,]

cox_pred_training <- predict_PatientRisk(multivariate_risk_predictor, mExprSelectedGenes)
head(cox_pred_training$risk_score)
# COX prediction for the test patient
set.seed(5)
cox_pred_test <- predict_PatientRisk(multivariate_risk_predictor, mExprs_testSingleData)
cox_pred_test$risk_score

## ----predSurv, echo = TRUE, eval = TRUE, fig.align = "center", fig.cap="Survival curve estimation for a single patient based on the risk score predicted by a multivariate COX model. The blue line shows the median survival time at which the survival probability drops to 0.5.", results="asis", fig.height=10, fig.width=10, out.width = "80%"----
# Survival curve estimation
eval_surv_times <- seq(0, max(mPheno$time), by = 0.1)
set.seed(5)
surv_curv_cox <- predict_SurvCurve(cox_pred_training$risk_score, mPheno[, c(2, 3)], cox_pred_test$risk_score, eval_surv_times)

## ----echo = F, eval = TRUE----------------------------------------------------
devtools::session_info()