## ----style, echo = FALSE, results = 'asis'-------------------------------------------------------- options(width=100) knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE"))) ## ----echo=FALSE----------------------------------------------------------------------------------- path <- system.file(package="BiocIntroRPCI", "extdata", "ALL-phenoData.csv") ## ----ALL-choose, eval=FALSE----------------------------------------------------------------------- # path <- file.choose() # look for ALLphenoData.tsv ## ----ALL-input------------------------------------------------------------------------------------ stopifnot(file.exists(path)) pdata <- read.csv(path) ## ------------------------------------------------------------------------------------------------- median(pdata$age) ## ----pdata-anyNA---------------------------------------------------------------------------------- any(is.na(pdata$age)) anyNA(pdata$age) # same, but more efficient ## ------------------------------------------------------------------------------------------------- median(pdata$age, na.rm=TRUE) range(pdata$age, na.rm=TRUE) quantile(pdata$age, na.rm=TRUE) ## ---- eval=FALSE---------------------------------------------------------------------------------- # plot(pdata$age) # plot(sort(pdata$age)) # sortedAge = sort(pdata$age) # ?plot ## ----boxplot, eval=FALSE-------------------------------------------------------------------------- # plot(age ~ sex, pdata) ## ----eval=FALSE----------------------------------------------------------------------------------- # hist(pdata$age) # ?hist # hist(pdata$age, br=25) ## ------------------------------------------------------------------------------------------------- xtabs(~sex, data=pdata, exclude=NA) xtabs(~sex + remission, data=pdata, exclude=NA) ## ----plot-ages, eval=FALSE------------------------------------------------------------------------ # plot(age ~ sex, pdata) ## ----ttest0--------------------------------------------------------------------------------------- t.test(age ~ sex, pdata) ## ----t.test-help, eval=FALSE---------------------------------------------------------------------- # ?t.test ## ----ttest1--------------------------------------------------------------------------------------- t.test(age ~ sex, pdata, var.equal=TRUE) ## ----lm------------------------------------------------------------------------------------------- (fit <- lm(age ~ sex, pdata)) anova(fit) ## ----plot-fit, eval=FALSE------------------------------------------------------------------------- # plot(fit) ## ----class---------------------------------------------------------------------------------------- class(fit) ## ----methods-------------------------------------------------------------------------------------- methods(plot) ## ----eval=FALSE----------------------------------------------------------------------------------- # ?plot.lm ## ----predict-------------------------------------------------------------------------------------- df = data.frame(sex=c("F", "M")) predict(fit, df) ## ----coef----------------------------------------------------------------------------------------- coefficients(fit) ## ----bt------------------------------------------------------------------------------------------- summary(pdata[,c("BT", "cyto.normal")]) ## ----BT------------------------------------------------------------------------------------------- pdata$BorT <- factor(substr(pdata$BT, 1, 1)) ## ----map------------------------------------------------------------------------------------------ xtabs(~ BorT + cyto.normal, pdata) ## ----chisq---------------------------------------------------------------------------------------- chisq.test(pdata$BorT, pdata$cyto.normal) ## ----echo=FALSE----------------------------------------------------------------------------------- path <- system.file(package="BiocIntroRPCI", "extdata", "ALL-expression.csv") stopifnot(file.exists(path)) ## ----ALL-choose-again, eval=FALSE----------------------------------------------------------------- # path <- file.choose() # look for ALL-expression.csv # stopifnot(file.exists(path)) ## ----ALL-input-exprs------------------------------------------------------------------------------ exprs <- read.csv(path, row.names=1, check.names=FALSE) exprs <- as.matrix(exprs) ## ----ALL-phenoData.csv-clustering-lab, echo=FALSE------------------------------------------------- path <- system.file(package="BiocIntroRPCI", "extdata", "ALL-phenoData.csv") stopifnot(file.exists(path)) pdata <- read.csv(path, row.names=1) ## ----ALL-phenoData.csv-clustering-student, eval=FALSE--------------------------------------------- # path <- file.choose() # look for ALL-phenoData.csv # stopifnot(file.exists(path)) # pdata <- read.csv(path, row.names=1) ## ----ALL-BorT------------------------------------------------------------------------------------- pdata$BorT <- factor(substr(pdata$BT, 1, 1)) xtabs(~BorT + BT, pdata) ## ----colors--------------------------------------------------------------------------------------- library(RColorBrewer) divergent <- brewer.pal(11, "RdBu") highlight <- brewer.pal(3, "Set2")[1:2] ## ----exprs-explore-------------------------------------------------------------------------------- class(exprs) dim(exprs) exprs[1:5, 1:5] ## ----var------------------------------------------------------------------------------------------ var(exprs[1,]) ## ----ALL-row-vars--------------------------------------------------------------------------------- v <- apply(exprs, 1, var) hist(sqrt(v)) ## ----ALL-row-vars-ordered------------------------------------------------------------------------- o <- order(v, decreasing=TRUE) plot(sqrt(v[o])) ## ----ALL-exprs-1000------------------------------------------------------------------------------- exprs1000 <- exprs[head(o, 1000), ] ## ----ALL-dist------------------------------------------------------------------------------------- d <- dist(t(exprs1000)) ## ----ALL-hclust, fig.width=9---------------------------------------------------------------------- plot(hclust(d)) ## ----ALL-pdata-exprs-columns---------------------------------------------------------------------- identical(colnames(exprs1000), rownames(pdata)) ## ----ALL-hclust-BT, fig.width=9------------------------------------------------------------------- colnames(exprs1000) <- pdata$BT plot(hclust(dist(t(exprs1000)))) colnames(exprs1000) <- rownames(pdata) ## ----ALL-sample-cor------------------------------------------------------------------------------- d <- dist(t(exprs1000)) ## ----heatmap-------------------------------------------------------------------------------------- color <- highlight[pdata$BorT] heatmap(as.matrix(d), ColSideColor=color, col=divergent, symm=TRUE) ## ----ALL-cmdscale, fig.width=5, fig.height=5------------------------------------------------------ mds <- cmdscale(dist(t(exprs1000))) plot(mds, col=color) ## ----class-library-------------------------------------------------------------------------------- library(class) ## ----seq_along------------------------------------------------------------------------------------ seq_along(pdata$BorT) ## ----split---------------------------------------------------------------------------------------- idxByType <- split(seq_along(pdata$BorT), pdata$BorT) idxByType ## ----head-half-B---------------------------------------------------------------------------------- Bidx <- idxByType[["B"]] BTrainIdx <- head(Bidx, length(Bidx) / 2) ## ----head-half-T---------------------------------------------------------------------------------- Tidx <- idxByType[["T"]] TTrainIdx <- head(Tidx, length(Tidx) / 2) ## ----head-half-fun-------------------------------------------------------------------------------- firstHalf <- function(x) { head(x, length(x) / 2) } ## ----head-half-fun-T------------------------------------------------------------------------------ firstHalf(idxByType[["B"]]) firstHalf(idxByType[["T"]]) ## ----head-half-lapply----------------------------------------------------------------------------- trainIdx <- lapply(idxByType, firstHalf) trainIdx ## ----BT-train------------------------------------------------------------------------------------- Bhalf <- exprs1000[, trainIdx[["B"]] ] Thalf <- exprs1000[, trainIdx[["T"]] ] train <- cbind(Bhalf, Thalf) ## ----BT-test-------------------------------------------------------------------------------------- test <- exprs1000[, ! colnames(exprs1000) %in% colnames(train) ] ## ----BT-class------------------------------------------------------------------------------------- cl_train <- pdata[colnames(train), "BorT"] cl_test <- pdata[colnames(test), "BorT"] ## ----knn------------------------------------------------------------------------------------------ cl_test_est <- knn(t(train), t(test), cl_train) ## ----knn-xtabs------------------------------------------------------------------------------------ xtabs(~cl_test + cl_test_est) ## ----knn.cv--------------------------------------------------------------------------------------- knn_loo <- knn.cv(t(exprs1000), pdata$BorT) xtabs(~pdata$BorT + knn_loo) ## ----knn-T-subset--------------------------------------------------------------------------------- keep <- pdata$BorT == "T" exprT <- exprs1000[, keep] classT <- factor(pdata$BT[keep]) ## ----knn-T-confusion------------------------------------------------------------------------------ xtabs(~classT + knn.cv(t(exprT), classT))