## ----setup, echo=FALSE,error=FALSE,message=FALSE------------------------- datadir= "/data/IntPathAnalysis/BRCA/" library(knitcitations) library(bibtex) library(knitr) library(AnnotationDbi) library(org.Hs.eg.db) allbib = read.bibtex(file.path(".", "bioc.bib")) opts_chunk$set(cache=TRUE, fig.path='figure/', cache.path = 'cache-local/') ## ----loadBRCA------------------------------------------------------------ load(file.path(datadir, "BRCA_TCGA_79_filtered_matchedNames_Basal+LuminalA.RData")) summary(BRCA_TCGA_79) ## ----codeToCreateDataset, echo=FALSE, eval=FALSE, message=FALSE---------- ## # Files downloaded using wget from https://tcga-data.nci.nih.gov/docs/publications/brca_2012/ ## ## # Read Each file ## miRNA<-read.table("BRCA.348.mimat.txt", header=TRUE, sep=",", row.names=1) ## miRNAprecursor<-read.table("BRCA.348.precursor.txt", header=TRUE, sep=",", row.n ## ames=1) ## RNAseq<-read.table("BRCA.exp.348.med.txt", sep="\t", header=TRUE, row.names=1, q ## uote="\"") ## Methyl<-read.table("BRCA.Methylation.574probes.802.txt", sep="\t", header=TRUE) ## GISTIC<-read.table("brca_scna_all_thresholded.by_genes.txt", sep="\t", header=TR ## UE) ## RPPA<-read.table("rppaData-403Samp-171Ab-Trimmed.txt", sep="\t", header=TRUE, ro ## w.names=1) ## ## # Create a vector of the names of these data matrices ## rawData<-ls() ## ## # In GISTIC data, the first 3 columns are gene annotation. Omit these. ## GISTIC_fData= GISTIC[,1:3] ## GISTIC= GISTIC[,-c(1:3)] ## ## # Create a list of sampleNames for each Dataset ## colNames<-lapply(rawData, function(x) cbind(colnames(get(x)), substr(colnames(ge ## t(x)), 1, 12))) ## names(colNames) = rawData ## ## # Get a list of intersecting sampleNames ## BRCA_348<-list(intersect(colNames[["RPPA"]][,2], colNames[["miRNAprecursor"]][,2])) ## ## ## # Filter each dataset to 348 intersecting samples ## for (i in rawData) { ## colNames[[i]]<-colNames[[i]][colNames[[i]][,2]%in%BRCA_348[[1]],] ## rownames(colNames[[i]]) <-colNames[[i]][,2] ## colNames[[i]]<-colNames[[i]][BRCA_348[[1]],] ## } ## ## rm(i) ## ## # Filter Datasets ## BRCA_TCGA<- list(GISTIC=GISTIC[, colNames[["GISTIC"]][,1]], ## miRNA=miRNA[, colNames[["miRNA"]][,1]], ## miRNAprecursor=miRNAprecursor[, colNames[["miRNAprecursor"]][,1]], ## RNAseq=RNAseq[, colNames[["RNAseq"]][,1]], ## Methyl=Methyl[, colNames[["Methyl"]][,1]], ## RPPA=RPPA[, colNames[["RPPA"]][,1]]) ## ## ## BRCA_TCGA$LOH= BRCA_TCGA$GISTIC ## BRCA_TCGA$LOH= apply(BRCA_TCGA$LOH, 2, function(x) ifelse(x== -1, 1,0)) ## BRCA_TCGA$CNA= BRCA_TCGA$GISTIC ## BRCA_TCGA$CNA = apply(BRCA_TCGA$CNA,2, function(x) ifelse(x==-1, 0,x)) ## ## lapply(BRCA_TCGA, dim) ## lapply(BRCA_TCGA, function(x) sum(is.na(x))) ## ## ## Thres=0.9 ## xx<-apply(BRCA_TCGA$LOH, 1, function(x) sum(x==0)quantile(xx)[2] ## BRCA_TCGA$RNAseq<-BRCA_TCGA$RNAseq[xx,] ## ## # Get rid of NA and 0 ## tt<-apply(BRCA_TCGA$RNAseq, 2, function(x)ifelse(is.na(x), 10^-6,x)) ## tt<-apply(tt, 2, function(x)ifelse(x==0, 10^-6,x)) ## BRCA_TCGA$RNAseq<-tt ## ## lapply(BRCA_TCGA, function(x) sum(is.na(x))) ## lapply(BRCA_TCGA, dim) ## ## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered.RData") ## ## ## system("wget http://www.nature.com/nature/journal/v490/n7418/extref/nature11412-s2.zip") ## system("unzip nature11412-s2.zip") ## system("xls2csv nature11412-s2/Supplementary Tables 1-4.xls") ## clin= read.csv("Supplementary Tables 1.csv", header=TRUE, row.names=1, skip=1) ## gsub("\\.", "-", BRCA_348[[1]]) %in% rownames(clin) ## ## BRCA_TCGA$clin<-clin[gsub("\\.", "-", BRCA_348[[1]]),] ## ## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered.RData") ## ## ## Make all of the sample names the same (for Omicade4) and save the sampleNameData ## lapply(BRCA_TCGA[1:7], function(x) all(substr(colnames(x),1,12) == BRCA_348[[1]] ## )) ## for (i in 1:7) colnames(BRCA_TCGA[[i]])<- gsub("\\.", "-", BRCA_348[[1]]) ## colNames<-lapply(colNames, function(x) cbind(x, gsub("\\.", "-", BRCA_348[[1]])) ## ) ## for (i in colNames) rownames(i) <- rownames(BRCA_TCGA$clin) ## BRCA_TCGA$sampleNameInfo<-colNames ## save(BRCA_TCGA, file="BRCA_TCGA_348_filtered_matchedNames.RData") ## ## # Small subset for faster computation ## lum<-BRCA_TCGA$clin$RPPA.Clusters=="LumA" & BRCA_TCGA$clin$PAM50.mRNA=="Luminal A" & BRCA_TCGA$clin$ER.Status=="Positive" ## basal<- BRCA_TCGA$clin$RPPA.Clusters=="Basal" & BRCA_TCGA$clin$PAM50.mRNA=="Basal-like" & BRCA_TCGA$clin$ER.Status=="Negative" ## table(basal) ## table(lum) ## table(basal | lum) ## BRCA_TCGA_79<-lapply(BRCA_TCGA[1:7], function(x) return(x[,rownames(BRCA_TCGA$cl ## in)[basal|lum]])) ## BRCA_TCGA_79$clin<- BRCA_TCGA$clin[colnames(BRCA_TCGA_79[[1]]),] ## save(BRCA_TCGA_79, file="BRCA_TCGA_79_filtered_matchedNames_Basal+LuminalA.RData") ## ----pca----------------------------------------------------------------- library(ade4) BRCApca<-dudi.pca(BRCA_TCGA_79$RNAseq, scannf=FALSE, nf=5) print(BRCApca) ## ----pcasumm------------------------------------------------------------- summary(BRCApca) ## ----plotcoa,message=FALSE, warning=FALSE-------------------------------- library(made4) cl<-as.character(BRCA_TCGA_79$clin$PAM50.mRNA) BRCAord= ord(BRCA_TCGA_79$RNAseq, classvec=factor(cl)) summary(BRCAord$ord) plot(BRCAord, nlab=3, arraylabels=rep("T", 79)) ## ----plotarrays2, message=FALSE----------------------------------------- par(mfrow=c(2,1)) plotarrays(BRCAord$ord$co, classvec=BRCA_TCGA_79$clin$PAM50.mRNA) plotgenes(BRCAord, n=5, col="red") ## ----topgenes------------------------------------------------------------ ax1<- topgenes(BRCAord, axis=1, n=5) ## ----BRCAciaplot, echo=FALSE, warning=FALSE------------------------------ BRCAcia<-cia(BRCA_TCGA_79$RNAseq, BRCA_TCGA_79$RPPA) BRCAcia$coinertia plot(BRCAcia, classvec=BRCA_TCGA_79$clin$PAM50.mRNA, nlab=3, clab=0, cpoint=3 ) ## ----ciaMoreCocircle----------------------------------------------------- par(mfrow=c(1,2)) s.corcircle(BRCAcia$coinertia$aX) s.corcircle(BRCAcia$coinertia$aY) ## ----mcia, cache=TRUE, eval=FALSE---------------------------------------- ## # Check that there are no zero or low variant low count rows. ## for (i in 1:7) BRCA_TCGA_79[[i]]<-BRCA_TCGA_79[[i]][!apply(BRCA_TCGA_79[[i]],1, sum)==min(BRCA_TCGA_79[[i]]),] ## ## library(omicade4) ## mcia79<-mcia(BRCA_TCGA_79[1:7]) ## save(mcia79, file="mciaRes.RData") ## ----loadmCia, echo=FALSE------------------------------------------------ require(omicade4) load(file.path(datadir,"mciaRes.RData")) ## ----plotmcia, warning=FALSE, message=FALSE------------------------------ plot(mcia79, axes=1:2, sample.lab=FALSE, sample.legend=FALSE, phenovec=as.numeric(BRCA_TCGA_79$clin$PAM50.mRNA), gene.nlab=2, df.color=c("navy", "cyan", "magenta", "red4", "brown","yellow", "orange"),df.pch=2:8) ## ----rv, echo=FALSE------------------------------------------------------ # RV to reference. Computes the RV coefficient between the representations of individual Cases ($Tl1) with the synthetic variables (reference, $SynVar). RV.mcoa <- function(m,...){ # see RV.rtest # Thanks Pierre Bady (Lyon) # require(ade4) if (!inherits(m, "mcoa")) stop("non convenient data") blo <- sort(unique(m$TL[, 1])) nblo <- length(blo) res <- NULL for(i in 1:nblo){ X <- scale(m$SynVar, scale = FALSE) Y <- scale(m$Tl1[m$TL[,1]==i,], scale = FALSE) X <- X/(sum(svd(X)$d^4)^0.25) Y <- Y/(sum(svd(Y)$d^4)^0.25) X <- as.matrix(X) Y <- as.matrix(Y) w <- sum(svd(t(X) %*% Y)$d^2) res <- c(res,w) } names(res)<- row.names(m$cov2) return(res) } ## ----rvm----------------------------------------------------------------- RV.mcoa(mcia79$mcoa) ## ----rvm2---------------------------------------------------------------- mcia79$mcoa$RV ## ----assesingMCIAcov----------------------------------------------------- mcia79$mcoa$cov2 plot(mcia79$mcoa$cov2, xlab = "pseudoeig 1", ylab = "pseudoeig 2", pch=19, col="red") text(mcia79$mcoa$cov2, labels=rownames(mcia79$mcoa$cov2), cex=0.8, adj=0) ## ----assessingMCIA, echo=FALSE, eval=FALSE------------------------------- ## mcia79$mcoa$lambda ## plot(mcia79$mcoa$lambda) ## ----mciaAxes, message=FALSE, warning=FALSE,fig.keep='all'--------------- mcia79$mcoa$Tax dev.off() par(mfrow=c(4,2)) xx<-by(mcia79$mcoa$Tax, substr(rownames(mcia79$mcoa$Tax),1,3), s.corcircle) ## ----samplescores-------------------------------------------------------- #plotarrays(mcia79$mcoa$SynVar, classvec=BRCA_TCGA_79$clin$PAM50.mRNA) kplot(mcia79$mcoa, mfrow = c(3,4), clab = .8, csub = 3, cpoi = 3) ## ----genescores---------------------------------------------------------- summary(mcia79$mcoa$axis) par(mfrow=c(1,2)) plot(mcia79$mcoa$axis[,1]~factor(mcia79$mcoa$TC[,1]), col=1:7, names=names(mcia79$coa), ylab="Gene Scores PC1", xlab="", las=2) plot(mcia79$mcoa$axis[,2]~factor(mcia79$mcoa$TC[,1]), col=1:7, names=names(mcia79$coa), ylab="Gene Scores PC2", xlab="", las=2) ## ----features------------------------------------------------------------ mcia79$mcoa$axis[order(mcia79$mcoa$axis[,1]),][1:10,1, drop=FALSE] ## Dataset suffix cbind(1:7,rownames(mcia79$mcoa$cov2)) ## ----catData------------------------------------------------------------- ## To "concatentate"" data, mm<-function(x) substr(x, 1, nchar(x)-2) ids<-mm(rownames(mcia79$mcoa$axis)) # Whilst it would be great, to have alll of our data mapped to genome co-ordinates and really take the union of everything, to keep it simple, I will only look at the Gene Symbols (RNAseq, RPPA) library(HGNChelper) library(Biobase) idsFix<- checkGeneSymbols(ids) ## Get PC1 idsPC<-cbind(idsFix, PC=mcia79$mcoa$axis) GOs<-select(org.Hs.eg.db, columns="GO",keytype="SYMBOL",keys=idsPC$Suggested.Symbol) #Get Coordinates for GOs terms res<-tapply(GOs$SYMBOL, GOs$GO, function(Syms) colMeans(idsPC[idsPC$Suggested.Symbol%in%Syms,c("PC.Axis1","PC.Axis2")])) res<-do.call(rbind,res) res[1:2,] plot(mcia79$mcoa$axis, col =as.numeric(mcia79$mcoa$TC[,1]), pch=as.numeric(mcia79$mcoa$TC[,1])) tt<-topgenes(res, n=5) points(res[tt,], pch=19, col="gray") text(res[tt,], labels=tt, cex=0.8, adj=0) ## ----gsva, eval=FALSE---------------------------------------------------- ## #Exclude NA ## idsPC<-idsPC[!is.na(idsPC$Suggested.Symbol),] ## ## #Reduce to GeneSymbol and MCIA score, taking max ## idsPC1<-tapply(idsPC$PC.Axis1, idsPC$Suggested.Symbol,max) ## idsPC2<-tapply(idsPC$PC.Axis2, idsPC$Suggested.Symbol,max) ## if( !all(names(idsPC1)==names(idsPC2))) stop() ## idsPCs<-cbind(idsPC1, idsPC2) ## ## ## # The "built-in" gsva library are mapped to entrezIDs so map symbols ## entrezPC1<-select(org.Hs.eg.db, columns="ENTREZID",keytype="SYMBOL",keys=rownames(idsPCs)) ## entrezPC1<-entrezPC1[!duplicated(entrezPC1[,1]),] ## table(rownames(idsPCs)==entrezPC1[,1]) ## rownames(idsPCs)= entrezPC1[,2] ## ## # Run any enrichment test with gsva ## require(GSVA) ## require(GSVAdata) ## gsva(idsPCs, c2BroadSets)