## ----setup, echo=FALSE,error=FALSE,message=FALSE-------------------------

datadir= "/data/IntPathAnalysis/BRCA/"
library(knitcitations)
library(bibtex)
library(knitr)
library(AnnotationDbi)
library(org.Hs.eg.db)
allbib = read.bibtex(file.path(".", "bioc.bib"))
opts_chunk$set(cache=TRUE, fig.path='figure/',  cache.path = 'cache-local/')


## ----loadBRCA------------------------------------------------------------
load(file.path(datadir, "BRCA_TCGA_79_filtered_matchedNames_Basal+LuminalA.RData"))
summary(BRCA_TCGA_79)


## ----codeToCreateDataset, echo=FALSE, eval=FALSE, message=FALSE----------
## # Files downloaded using wget from https://tcga-data.nci.nih.gov/docs/publications/brca_2012/
## 
## # Read Each file
## miRNA<-read.table("BRCA.348.mimat.txt", header=TRUE, sep=",", row.names=1)
## miRNAprecursor<-read.table("BRCA.348.precursor.txt", header=TRUE, sep=",", row.n
## ames=1)
## RNAseq<-read.table("BRCA.exp.348.med.txt", sep="\t", header=TRUE, row.names=1, q
## uote="\"")
## Methyl<-read.table("BRCA.Methylation.574probes.802.txt", sep="\t", header=TRUE)
## GISTIC<-read.table("brca_scna_all_thresholded.by_genes.txt", sep="\t", header=TR
## UE)
## RPPA<-read.table("rppaData-403Samp-171Ab-Trimmed.txt", sep="\t", header=TRUE, ro
## w.names=1)
## 
## # Create a vector of the names of these data matrices
## rawData<-ls()
## 
## # In GISTIC data, the first 3 columns are gene annotation. Omit these.
## GISTIC_fData= GISTIC[,1:3]
## GISTIC= GISTIC[,-c(1:3)]
## 
## # Create a list of sampleNames for each Dataset
## colNames<-lapply(rawData, function(x) cbind(colnames(get(x)), substr(colnames(ge
## t(x)), 1, 12)))
## names(colNames) = rawData
## 
## # Get a list of intersecting sampleNames
## BRCA_348<-list(intersect(colNames[["RPPA"]][,2], colNames[["miRNAprecursor"]][,2]))
## 
## 
## # Filter each dataset to 348 intersecting samples
## for (i in rawData) {
##  colNames[[i]]<-colNames[[i]][colNames[[i]][,2]%in%BRCA_348[[1]],]
##  rownames(colNames[[i]]) <-colNames[[i]][,2]
##  colNames[[i]]<-colNames[[i]][BRCA_348[[1]],]
## }
## 
## rm(i)
## 
## # Filter Datasets
## BRCA_TCGA<- list(GISTIC=GISTIC[, colNames[["GISTIC"]][,1]],
##     miRNA=miRNA[, colNames[["miRNA"]][,1]],
##     miRNAprecursor=miRNAprecursor[, colNames[["miRNAprecursor"]][,1]],
##     RNAseq=RNAseq[, colNames[["RNAseq"]][,1]],
##     Methyl=Methyl[, colNames[["Methyl"]][,1]],
##     RPPA=RPPA[, colNames[["RPPA"]][,1]])
## 
## 
## BRCA_TCGA$LOH= BRCA_TCGA$GISTIC
## BRCA_TCGA$LOH= apply(BRCA_TCGA$LOH, 2, function(x) ifelse(x== -1, 1,0))
## BRCA_TCGA$CNA= BRCA_TCGA$GISTIC
## BRCA_TCGA$CNA = apply(BRCA_TCGA$CNA,2, function(x) ifelse(x==-1, 0,x))
## 
## lapply(BRCA_TCGA, dim)
## lapply(BRCA_TCGA, function(x) sum(is.na(x)))
## 
## 
## Thres=0.9
## xx<-apply(BRCA_TCGA$LOH, 1, function(x) sum(x==0)<round(ncol(BRCA_TCGA$LOH)*Thre
## s))
## BRCA_TCGA$LOH<-BRCA_TCGA$LOH[xx,]
## xx<- apply(BRCA_TCGA$LOH, 1, function(x) sum(is.na(x))) < round(ncol(BRCA_TCGA$L
## OH)*Thres)
## BRCA_TCGA$LOH<-BRCA_TCGA$LOH[xx,]
## 
## xx<-apply(BRCA_TCGA$CNA, 1, function(x) sum(x==0)<round(ncol(BRCA_TCGA$CNA)*.9))
## BRCA_TCGA$CNA<-BRCA_TCGA$CNA[xx,]
## xx<- apply(BRCA_TCGA$CNA, 1, function(x) sum(is.na(x)) ) <round(ncol(BRCA_TCGA$C
## NA)*Thres)
## BRCA_TCGA$CNA<-BRCA_TCGA$CNA[xx,]
## # For RNAseq exclude those with lowest IQR
## xx<- apply(BRCA_TCGA$RNAseq,1, IQR, na.rm=TRUE)
## quantile(xx)
## xx= xx>quantile(xx)[2]
## BRCA_TCGA$RNAseq<-BRCA_TCGA$RNAseq[xx,]
## 
## # Get rid of NA and 0
## tt<-apply(BRCA_TCGA$RNAseq, 2, function(x)ifelse(is.na(x), 10^-6,x))
## tt<-apply(tt, 2, function(x)ifelse(x==0, 10^-6,x))
## BRCA_TCGA$RNAseq<-tt
## 
## lapply(BRCA_TCGA, function(x) sum(is.na(x)))
## lapply(BRCA_TCGA, dim)
## 
## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered.RData")
## 
## 
## system("wget http://www.nature.com/nature/journal/v490/n7418/extref/nature11412-s2.zip")
## system("unzip nature11412-s2.zip")
## system("xls2csv nature11412-s2/Supplementary Tables 1-4.xls")
## clin= read.csv("Supplementary Tables 1.csv", header=TRUE, row.names=1, skip=1)
## gsub("\\.", "-", BRCA_348[[1]]) %in% rownames(clin)
## 
## BRCA_TCGA$clin<-clin[gsub("\\.", "-", BRCA_348[[1]]),]
## 
## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered.RData")
## 
## ## Make all of the sample names the same (for Omicade4) and save the sampleNameData
## lapply(BRCA_TCGA[1:7], function(x) all(substr(colnames(x),1,12) == BRCA_348[[1]]
## ))
## for (i in 1:7) colnames(BRCA_TCGA[[i]])<- gsub("\\.", "-", BRCA_348[[1]])
## colNames<-lapply(colNames, function(x) cbind(x, gsub("\\.", "-", BRCA_348[[1]]))
## )
## for (i in colNames) rownames(i) <- rownames(BRCA_TCGA$clin)
## BRCA_TCGA$sampleNameInfo<-colNames
## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered_matchedNames.RData")
## 
## # Small subset for faster computation
## lum<-BRCA_TCGA$clin$RPPA.Clusters=="LumA" & BRCA_TCGA$clin$PAM50.mRNA=="Luminal A" & BRCA_TCGA$clin$ER.Status=="Positive"
## basal<- BRCA_TCGA$clin$RPPA.Clusters=="Basal" & BRCA_TCGA$clin$PAM50.mRNA=="Basal-like" & BRCA_TCGA$clin$ER.Status=="Negative"
## table(basal)
## table(lum)
## table(basal | lum)
## BRCA_TCGA_79<-lapply(BRCA_TCGA[1:7], function(x) return(x[,rownames(BRCA_TCGA$cl
## in)[basal|lum]]))
## BRCA_TCGA_79$clin<- BRCA_TCGA$clin[colnames(BRCA_TCGA_79[[1]]),]
## save(BRCA_TCGA_79, file="BRCA_TCGA_79_filtered_matchedNames_Basal+LuminalA.RData")


## ----pca-----------------------------------------------------------------
library(ade4)
BRCApca<-dudi.pca(BRCA_TCGA_79$RNAseq, scannf=FALSE, nf=5)
print(BRCApca)


## ----pcasumm-------------------------------------------------------------
summary(BRCApca)


## ----plotcoa,message=FALSE, warning=FALSE--------------------------------
library(made4)
cl<-as.character(BRCA_TCGA_79$clin$PAM50.mRNA)
BRCAord= ord(BRCA_TCGA_79$RNAseq, classvec=factor(cl))
summary(BRCAord$ord)
plot(BRCAord, nlab=3,   arraylabels=rep("T", 79))


## ----plotarrays2,  message=FALSE-----------------------------------------
par(mfrow=c(2,1))
plotarrays(BRCAord$ord$co, classvec=BRCA_TCGA_79$clin$PAM50.mRNA)
plotgenes(BRCAord, n=5, col="red")


## ----topgenes------------------------------------------------------------
ax1<- topgenes(BRCAord, axis=1, n=5)


## ----BRCAciaplot, echo=FALSE, warning=FALSE------------------------------
BRCAcia<-cia(BRCA_TCGA_79$RNAseq, BRCA_TCGA_79$RPPA)
BRCAcia$coinertia
plot(BRCAcia, classvec=BRCA_TCGA_79$clin$PAM50.mRNA, nlab=3, clab=0, cpoint=3 )


## ----ciaMoreCocircle-----------------------------------------------------
par(mfrow=c(1,2))
s.corcircle(BRCAcia$coinertia$aX)
s.corcircle(BRCAcia$coinertia$aY)


## ----mcia, cache=TRUE, eval=FALSE----------------------------------------
## # Check that there are no zero or low variant low count rows.
## for (i in 1:7) BRCA_TCGA_79[[i]]<-BRCA_TCGA_79[[i]][!apply(BRCA_TCGA_79[[i]],1, sum)==min(BRCA_TCGA_79[[i]]),]
## 
## library(omicade4)
## mcia79<-mcia(BRCA_TCGA_79[1:7])
## save(mcia79, file="mciaRes.RData")


## ----loadmCia, echo=FALSE------------------------------------------------
require(omicade4)
load(file.path(datadir,"mciaRes.RData"))


## ----plotmcia, warning=FALSE, message=FALSE------------------------------
plot(mcia79, axes=1:2, sample.lab=FALSE, sample.legend=FALSE, phenovec=as.numeric(BRCA_TCGA_79$clin$PAM50.mRNA), gene.nlab=2, df.color=c("navy", "cyan", "magenta", "red4", "brown","yellow", "orange"),df.pch=2:8)


## ----rv, echo=FALSE------------------------------------------------------
# RV to reference.  Computes the RV coefficient between the representations of individual Cases ($Tl1) with the synthetic variables (reference, $SynVar). 


RV.mcoa <- function(m,...){ 
# see RV.rtest 
# Thanks Pierre Bady (Lyon)
# require(ade4) 
    if (!inherits(m, "mcoa")) 
        stop("non convenient data") 
    blo <- sort(unique(m$TL[, 1])) 
    nblo <- length(blo) 
    res <- NULL 
    for(i in 1:nblo){ 
        X <- scale(m$SynVar, scale = FALSE) 
        Y <- scale(m$Tl1[m$TL[,1]==i,], scale = FALSE) 
        X <- X/(sum(svd(X)$d^4)^0.25) 
        Y <- Y/(sum(svd(Y)$d^4)^0.25) 
        X <- as.matrix(X) 
        Y <- as.matrix(Y) 
        w <- sum(svd(t(X) %*% Y)$d^2) 
        res <- c(res,w) 
        } 
    names(res)<- row.names(m$cov2) 
    return(res) 
} 


## ----rvm-----------------------------------------------------------------
RV.mcoa(mcia79$mcoa)


## ----rvm2----------------------------------------------------------------
mcia79$mcoa$RV


## ----assesingMCIAcov-----------------------------------------------------

mcia79$mcoa$cov2
plot(mcia79$mcoa$cov2,  xlab = "pseudoeig 1", ylab = "pseudoeig 2", pch=19, col="red")
text(mcia79$mcoa$cov2, labels=rownames(mcia79$mcoa$cov2), cex=0.8, adj=0)


## ----assessingMCIA, echo=FALSE, eval=FALSE-------------------------------
## mcia79$mcoa$lambda
## plot(mcia79$mcoa$lambda)


## ----mciaAxes, message=FALSE, warning=FALSE,fig.keep='all'---------------
mcia79$mcoa$Tax
dev.off()
par(mfrow=c(4,2))
xx<-by(mcia79$mcoa$Tax, substr(rownames(mcia79$mcoa$Tax),1,3), s.corcircle)


## ----samplescores--------------------------------------------------------
#plotarrays(mcia79$mcoa$SynVar, classvec=BRCA_TCGA_79$clin$PAM50.mRNA)
kplot(mcia79$mcoa, mfrow = c(3,4), clab = .8, csub = 3, cpoi = 3)


## ----genescores----------------------------------------------------------
summary(mcia79$mcoa$axis)
par(mfrow=c(1,2))
plot(mcia79$mcoa$axis[,1]~factor(mcia79$mcoa$TC[,1]), col=1:7, names=names(mcia79$coa), ylab="Gene Scores PC1", xlab="", las=2)
plot(mcia79$mcoa$axis[,2]~factor(mcia79$mcoa$TC[,1]), col=1:7, names=names(mcia79$coa), ylab="Gene Scores PC2", xlab="", las=2)


## ----features------------------------------------------------------------
mcia79$mcoa$axis[order(mcia79$mcoa$axis[,1]),][1:10,1, drop=FALSE]

## Dataset suffix
cbind(1:7,rownames(mcia79$mcoa$cov2))


## ----catData-------------------------------------------------------------
## To "concatentate"" data, 
mm<-function(x) substr(x, 1, nchar(x)-2)
ids<-mm(rownames(mcia79$mcoa$axis))

# Whilst it would be great, to have alll of our data mapped to genome co-ordinates and really take the union of everything, to keep it simple, I will only look at the Gene Symbols (RNAseq, RPPA) 
library(HGNChelper)
library(Biobase)

idsFix<- checkGeneSymbols(ids)

## Get PC1
idsPC<-cbind(idsFix, PC=mcia79$mcoa$axis)

GOs<-select(org.Hs.eg.db, columns="GO",keytype="SYMBOL",keys=idsPC$Suggested.Symbol)


#Get Coordinates for GOs terms
res<-tapply(GOs$SYMBOL, GOs$GO, function(Syms) colMeans(idsPC[idsPC$Suggested.Symbol%in%Syms,c("PC.Axis1","PC.Axis2")]))
res<-do.call(rbind,res)

res[1:2,]

plot(mcia79$mcoa$axis, col =as.numeric(mcia79$mcoa$TC[,1]), pch=as.numeric(mcia79$mcoa$TC[,1]))
tt<-topgenes(res, n=5)
points(res[tt,], pch=19, col="gray")
text(res[tt,], labels=tt, cex=0.8, adj=0)


## ----gsva, eval=FALSE----------------------------------------------------
## #Exclude NA
## idsPC<-idsPC[!is.na(idsPC$Suggested.Symbol),]
## 
## #Reduce to GeneSymbol and MCIA score, taking max
## idsPC1<-tapply(idsPC$PC.Axis1, idsPC$Suggested.Symbol,max)
## idsPC2<-tapply(idsPC$PC.Axis2, idsPC$Suggested.Symbol,max)
## if( !all(names(idsPC1)==names(idsPC2))) stop()
## idsPCs<-cbind(idsPC1, idsPC2)
## 
## 
## # The "built-in" gsva library are mapped to entrezIDs so map symbols
## entrezPC1<-select(org.Hs.eg.db, columns="ENTREZID",keytype="SYMBOL",keys=rownames(idsPCs))
## entrezPC1<-entrezPC1[!duplicated(entrezPC1[,1]),]
## table(rownames(idsPCs)==entrezPC1[,1])
## rownames(idsPCs)= entrezPC1[,2]
## 
## # Run any enrichment test with gsva
## require(GSVA)
## require(GSVAdata)
## gsva(idsPCs, c2BroadSets)