## ----echo=FALSE, results="hide", warning=FALSE, message=FALSE------------ suppressPackageStartupMessages({ library(SRAdb) library(reshape2) library(Rsubread) library(Rsamtools) library(ChIPpeakAnno) library(GeneNetworkBuilder) library(rtracklayer) library(GenomicAlignments) library(idr) library(ChIPQC) library(csaw) library(ggplot2) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) library(reactome.db) library(BSgenome.Hsapiens.UCSC.hg19) library(motifStack) library(knitr) library(trackViewer) library(png) library(grid) library(limma) library(simpIntLists) library(plotly) library(Rgraphviz) }) opts_chunk$set(eval=FALSE, fig.width=8, fig.height=8, warning = FALSE, message = FALSE) extdata <- system.file("extdata", package = "ChIPseqStepByStep") load(file.path(extdata, "minimal.rds")) ## ----eval=FALSE---------------------------------------------------------- # ## load library SRAdb to retrieve SRA files for GSE66081 dataset # library(SRAdb) # ## SRAdb need to download a light sqlite file # ## to extract sra file information # ## and then download by getSRAfile function. # sqlfile <- "SRAmetadb.sqlite" # if(!file.exists(sqlfile)) # sqlfile <- getSRAdbFile() # sra_con <- dbConnect(SQLite(), sqlfile) # conversion <- sraConvert(c("SRP055170"), # sra_con=sra_con) # out <- getSRAfile(conversion$sample, # sra_con=sra_con, # fileType="sra") ## ----eval=FALSE---------------------------------------------------------- # ## extract file annotations # rs <- listSRAfile(conversion$sample, # sra_con=sra_con, # fileType="sra") # experiment <- # dbGetQuery(sra_con, # paste0("select * from experiment where", # " experiment_accession in ('", # paste(conversion$experiment, # collapse="','"), "')")) # info <- merge(experiment[, c("experiment_accession", # "title", "library_layout")], # rs[, c("experiment", "run")], by=1) ## ----eval=TRUE----------------------------------------------------------- info[1:5, 2:3] ## ----eval=FALSE---------------------------------------------------------- # runs <- info[, "run"] # ## extract fastq files from sra by sratoolkit # sapply(paste0(runs, ".sra"), # function(.ele) system(paste("fastq-dump", # .ele))) ## ----eval=FALSE---------------------------------------------------------- # grp <- do.call(rbind, # strsplit(info$title, "(;|:) ")) # info <- cbind(info, grp) # runs <- split(runs, grp[, 2]) # group <- gsub("_ChIPSeq", "", names(runs)) ## ----eval=TRUE----------------------------------------------------------- runs[1:2] ## ----eval=FALSE---------------------------------------------------------- # mapply(function(.ele, n) # system(paste("cat", paste0(.ele, ".fastq", collapse=" "), # ">>", paste0(n, ".fastq"), collapse=" ")), # runs, group) # ## remove unused files to save storage. # unlink(paste0(info$run, ".fastq")) # unlink(paste0(info$run, ".sra")) ## ----eval=FALSE---------------------------------------------------------- # library(Rsubread) # fastq.files <- paste0(group, ".fastq") # bam.files <- paste0(group, ".bam") # # ##getPhredOffset, if the offset is not correct, it will report error. # x <- qualityScores(filename=fastq.files[1], # input_format="FASTQ", offset=33, # nreads=1000) ## ----eval=TRUE----------------------------------------------------------- x[1:3, 1:10] ## ------------------------------------------------------------------------ # library(Rsubread) # reads <- system.file("extdata","reads.txt.gz",package="Rsubread") # head(qualityScores(filename = reads, # input_format = "gzFASTQ", # offset = 33, # nreads = 100)) ## please check the errors # head(qualityScores(filename = reads, # input_format = "gzFASTQ", # offset = 64, # nreads = 100)) ## ----eval=FALSE---------------------------------------------------------- # ## build index and this can be re-used. # hg19.fasta <- # "Genomes/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" # dir.create("index") # buildindex(basename="index/hg19", # reference=hg19.fasta) # # ## alignment # align(index="./index/hg19", readfile1=fastq.files, # type=1, # output_file=bam.files, maxMismatches=2, # nthreads=2, # phredOffset=33, unique=TRUE) ## ----eval=FALSE---------------------------------------------------------- # library(Rsamtools) # filter <- FilterRules(list(seqn = function(x) x$rname == "chr1")) # ## sort and index # null <- sapply(group, function(gp) { # sortBam(paste0(gp, ".bam"), paste0(gp, ".srt")) # file.copy(paste0(gp, ".srt.bam"), # paste0(gp, ".bam"), overwrite = TRUE) # indexBam(paste0(gp, ".bam")) # }) # ## subset (optional) # null <- sapply(group, function(gp) { # dest <- paste0(gp, ".chr1.bam") # filterBam(paste0(gp, ".bam"), paste0(gp, ".chr1.bam"), # filter=filter) # file.rename(paste0(gp, ".chr1.bam"), paste0(gp, ".bam")) # indexBam(paste0(gp, ".bam")) # }) ## ----eval=FALSE---------------------------------------------------------- # ## remove reads which are mapped to identical locations, # ## using mapping location of the first base of each read. # ## and resort by position # library(Rsamtools) # library(Rsubread) # null <- sapply(group[c(1, 11:13)], function(gp){ # out <- asSam(paste0(gp, ".bam"), gp) # removeDupReads(out, threshold=2, # outputFile=paste0(gp, ".rmdup.sam")) # asBam(paste0(gp, ".rmdup.sam"), paste0(gp, ".rmdup"), # indexDestination=TRUE) # unlink(out) # unlink(paste0(gp, ".rmdup.sam")) # }) ## ----eval=FALSE---------------------------------------------------------- # ## pool IgG depend on experiment # IgG.bg <- list(rep1.2=c("IgG_rep1.rmdup.bam", # "IgG_rep2.rmdup.bam", # "IgG_rep1.2.rmdup.bam"), # rep3.4=c("IgG_rep3.rmdup.bam", # "IgG_rep4.rmdup.bam", # "IgG_rep3.4.rmdup.bam"), # rep3.5=c("IgG_rep3.rmdup.bam", # "IgG_rep5.rmdup.bam", # "IgG_rep3.5.rmdup.bam")) # null <- sapply(IgG.bg[3], function(.ele){ # out <- mergeBam(files=.ele[-3], # destination = .ele[3], # indexDestination=TRUE) # }) ## ----eval=FALSE---------------------------------------------------------- # library(ChIPpeakAnno) # library(TxDb.Hsapiens.UCSC.hg19.knownGene) # pair <- data.frame(treat=c("YAP", "TAZ", "TEAD4", "JUN"), # control=c("1.2", "1.2", "3.4", "3.5"), # stringsAsFactors = FALSE) # ## use chromosome 1 to save the time. # chr1 <- as(seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene)["chr1"], "GRanges") # ## check the difference between TEAD4 ChIP and background signal # i <- 3 # cumulativePercentage( # bamfiles = c(paste0(pair[i, "treat"], # "_rep1.rmdup.bam"), # paste0("IgG_rep", pair[i, "control"], # ".rmdup.bam")), # gr=chr1) ## ----eval=FALSE---------------------------------------------------------- # for(i in seq.int(nrow(pair))){ # system(paste("macs2 callpeak --to-large -t", # paste0(pair[i, "treat"], "_rep1.rmdup.bam"), # "-c ", paste0("IgG_rep", pair[i, "control"], ".rmdup.bam"), # " -f BAM -g hs -q 0.1 -n", # paste0(pair[i, "treat"], "_rep1.q1"))) # system(paste("macs2 callpeak --to-large -t", # paste0(pair[i, "treat"], "_rep2.rmdup.bam"), # "-c ", paste0("IgG_rep", pair[i, "control"], ".rmdup.bam"), # " -f BAM -g hs -q 0.1 -n", # paste0(pair[i, "treat"], "_rep2.q1"))) # ## we will use idr to filter the results later # } ## ----eval=FALSE, warning=FALSE------------------------------------------- # library(ChIPpeakAnno) # macs2.files <- dir(".", pattern="*.q1_peaks.narrowPeak$") # peaks <- sapply(macs2.files, function(.ele) # toGRanges(.ele, format="narrowPeak")) # peaks.bk <- peaks # peaks <- lapply(peaks, function(.ele) # .ele[order(.ele$pValue, decreasing=TRUE)]) ## ------------------------------------------------------------------------ # library(ChIPpeakAnno) # packagePath <- system.file("extdata", package = "ChIPpeakAnno") # dir(packagePath, "gff|bed|narrowPeak|broadPeak|gz") # toGRanges(file.path(packagePath, "GFF_peaks.gff"), format = "GFF") # toGRanges(file.path(packagePath, "WS220.bed"), format = "BED") # toGRanges(file.path(packagePath, "peaks.narrowPeak"), format = "narrowPeak") # toGRanges(file.path(packagePath, "TAF.broadPeak"), format = "broadPeak") # toGRanges(file.path(packagePath, "wgEncodeUmassDekker5CGm12878PkV2.bed.gz"), format = "BED") # packagePath <- system.file("extdata", "macs", package = "ChIPseqStepByStep") # macs2.files <- dir(packagePath, pattern="*.q1_peaks.narrowPeak$") # peaks <- sapply(macs2.files, function(.ele) # toGRanges(file.path(packagePath, .ele), format="narrowPeak")) # peaks.bk <- peaks # peaks <- lapply(peaks, function(.ele) .ele[order(.ele$pValue, decreasing=TRUE)]) ## ----eval=TRUE----------------------------------------------------------- peaks <- lapply(peaks, function(.ele) .ele[1:min(1e5, length(.ele))]) lengths(peaks) ## ----eval=FALSE---------------------------------------------------------- # new.group <- gsub("^(.*?)_.*$", "\\1", names(peaks)) # peaks.grp <- split(peaks, new.group) # names(peaks.grp) # ## find overlapping peaks # GSE66081 <- sapply(peaks.grp, findOverlapsOfPeaks, simplify = FALSE) # GSE66081.bk <- GSE66081 # GSE66081 <- sapply(GSE66081, function(.ele) # .ele$peaklist[[names(.ele$peaklist)[grepl("\\/\\/\\/", # names(.ele$peaklist))]]], # simplify=FALSE) ## ----eval=TRUE----------------------------------------------------------- lengths(GSE66081) ## ----eval=FALSE---------------------------------------------------------- # library(ChIPpeakAnno) # new.group.names <- lapply(GSE66081.bk, function(.ele) # sub(".q1_peaks.narrowPeak", ".bam", # names(.ele$peaklist)[!grepl("\\/\\/\\/", names(.ele$peaklist))])) # GSE66081 <- mapply(function(.peaks, .bamfiles) # IDRfilter(peaksA=.peaks[[1]], peaksB=.peaks[[2]], # bamfileA=.bamfiles[1], bamfileB=.bamfiles[2], # IDRcutoff=0.01), # peaks.grp, new.group.names) # ## The original peaks are filtered to decrease the memory usage. # peaks.keep <- sapply(GSE66081, function(.ele) # as.character(unlist(.ele$peakNames))) # peaks.keep <- lapply(peaks.keep, function(.ele) gsub("^.*__", "", .ele)) # peaks.keep <- unlist(peaks.keep) # peaks.s <- lapply(peaks, function(.ele) .ele[names(.ele) %in% peaks.keep]) # peaks.unlist <- unlist(GRangesList(peaks.s), use.names = FALSE) ## ----eval=FALSE---------------------------------------------------------- # library(rtracklayer) # null <- mapply(function(.dat, .name) # export(.dat, # sub("^(.*?).q1.*$", "\\1.fil.bed", .name), # format="BED"), # peaks.s, names(peaks.s)) # ## filter the peaks by qValue < 10^-10 to get similar number peaks # ## as filtered by idr to confirm the effect of IDR filter. # peaks.bk <- lapply(peaks.bk, function(.ele) # .ele[.ele$qValue>10]) # null <- mapply(function(.dat, .name) # export(.dat, # sub("^(.*?).q1.*$", "\\1.fil.q01.bed", .name), # format="BED"), # peaks.bk, names(peaks.bk)) ## ----eval=FALSE---------------------------------------------------------- # ## strand cross-correlation # library(csaw) # scc <- lapply(sub("^(.*?).q1.*$", "\\1.rmdup.bam", names(peaks)), # correlateReads, cross=TRUE, # param=readParam(minq=30, # restrict="chr1", # dedup=TRUE)) # names(scc) <- gsub(".q1.*$", "", names(peaks)) # scc.param <- lapply(scc, function(.ele){ # readsLength <- 50 # cutline <- 2 * readsLength # read_length <- which.max(.ele[1:cutline]) # fragment_length <- which.max(.ele[cutline:length(.ele)]) + cutline - 1 # baseline <- which.min(.ele[cutline:length(.ele)]) + cutline - 1 # #normalized.strand.coefficient # NSC <- .ele[fragment_length] / .ele[baseline] # # relative strand correlation # RSC <- (.ele[fragment_length] - .ele[baseline])/(.ele[read_length] - .ele[baseline]) # c(read_length=read_length-1, # fragment_length=fragment_length-1, # baseline=baseline-1, # NSC=NSC, # RSC=RSC) # }) ## ----eval=TRUE----------------------------------------------------------- do.call(rbind, scc.param)[, c("NSC", "RSC")] ## ----eval=FALSE---------------------------------------------------------- # op <- par(mfrow=c(4, 2)) # for(i in 1:length(scc)){ # plot(1:length(scc[[i]])-1, scc[[i]], # xlab="Delay (bp)", ylab="cross-correlation", # main=names(scc)[i], type="l", # ylim=c(scc[[i]][scc.param[[i]]["baseline"]]*.8, max(scc[[i]])*1.1)) # abline(v=scc.param[[i]]["fragment_length"], col="red", lty=3) # abline(h=scc[[i]][scc.param[[i]][c("fragment_length", "read_length", "baseline")]+1], # col=c("blue", "green", "gray30"), lty=2) # } ## ----eval=TRUE, fig.width=9, fig.height=6, echo=FALSE-------------------- op <- par(mfrow=c(2, 4)) for(i in seq_along(scc)){ plot(seq_along(scc[[i]])-1, scc[[i]], xlab="Delay (bp)", ylab="cross-correlation", main=names(scc)[i], type="l", ylim=c(scc[[i]][scc.param[[i]]["baseline"]]*.8, max(scc[[i]])*1.1)) abline(v=scc.param[[i]]["fragment_length"], col="red", lty=3) abline(h=scc[[i]][scc.param[[i]][c("fragment_length", "read_length", "baseline")]+1], col=c("blue", "green", "gray30"), lty=2) } ## ----eval=FALSE---------------------------------------------------------- # library(GenomicAlignments) # library(rtracklayer) # bwPath <- "bw" # dir.create(bwPath) # cvgs <- mapply(function(.ele, .name){ # gal <- readGAlignments(paste0(.name, ".rmdup.bam")) # cvg <- coverage(gal, width = as.numeric(.ele["fragment_length"])) # seqinfo(cvg) <- seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene) # export.bw(cvg, # con=file.path(bwPath, # paste0(.name, ".rmdup.bigWig"))) # cvg # }, scc.param, names(scc.param)) ## ----eval=TRUE----------------------------------------------------------- library(ChIPQC) samples <- data.frame(SampleID=gsub("^(.*?).q1.*$", "\\1", names(peaks)), Factor=gsub("^(.*?)_.*$", "\\1", names(peaks)), Replicate=gsub("^.*?_rep(.*?).q1.*$", "\\1", names(peaks)), bamReads=gsub("^(.*?).q1.*$", "\\1.rmdup.bam", names(peaks)), Peaks=names(peaks), PeakCaller="narrow") samples[1:3, ] ## ----eval=FALSE---------------------------------------------------------- # ## before filter # exampleExp <- ChIPQC(experiment=samples, annotaiton="hg19") # ChIPQCreport(exampleExp, reportFolder="ChIPQC", facetBy="Factor") # ## after IDR filter # samples.fil <- samples # samples.fil$Peaks <- gsub("^(.*?).q1.*$", "\\1.fil.bed", names(peaks)) # samples.fil$PeakCaller="bed" # exampleExp.fil <- ChIPQC(experiment=samples.fil, annotaiton="hg19") # ChIPQCreport(exampleExp.fil, # reportFolder="ChIPQC.fil", # facetBy="Factor") # ## filtered by qValue from the MACS2 result # samples.fil.q01 <- samples.fil # samples.fil.q01$Peaks <- # gsub("^(.*?).q1.*$", "\\1.fil.q01.bed", names(peaks)) # exampleExp.fil.q01 <- ChIPQC(experiment=samples.fil.q01, # annotaiton="hg19") # ChIPQCreport(exampleExp.fil.q01, # reportFolder="ChIPQC.fil.q01", # facetBy="Factor") ## ----eval=FALSE---------------------------------------------------------- # ol <- findOverlapsOfPeaks(GSE66081, connectedPeaks="keepAll") ## ------------------------------------------------------------------------ # library(ChIPpeakAnno) # packagePath <- system.file("extdata", "filtered", package = "ChIPseqStepByStep") # filtedFiles <- dir(packagePath, ".bed") # GSE66081 <- sapply(filtedFiles, function(.ele){ # toGRanges(file.path(packagePath, .ele), format="BED") # }) # names(GSE66081) <- sub(".bed", "", names(GSE66081)) # lengths(GSE66081) # ol <- findOverlapsOfPeaks(GSE66081) # names(ol) # names(ol$peaklist) # makeVennDiagram(ol) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=4, fig.height=4---- makeVennDiagram(ol) ## ----eval=FALSE---------------------------------------------------------- # peaks.summit <- lapply(peaks, function(.ele) { # .ele <- shift(.ele, .ele$peak) # width(.ele) <- 1 # .ele # }) # # TEAD.summit <- unlist(GRangesList(peaks.summit[new.group=="TEAD4"])) # TAZ.summit <- unlist(GRangesList(peaks.summit[new.group=="TAZ"])) # bof <- binOverFeature(TEAD.summit, # annotationData=TAZ.summit, # radius=300, nbins=300, FUN=length) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=5, fig.height=5---- plot(as.numeric(rownames(bof)), bof, ylab="Peak density", xlab="Distance to the summit of TAZ peaks", main="Position of TEAD4 peak summit", type="l", xlim=c(-300, 300)) ## ----eval=FALSE---------------------------------------------------------- # library(TxDb.Hsapiens.UCSC.hg19.knownGene) # annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene") ## ----eval=TRUE----------------------------------------------------------- annoData[1] ## ------------------------------------------------------------------------ # library(EnsDb.Hsapiens.v75) # toGRanges(EnsDb.Hsapiens.v75) # toGRanges(EnsDb.Hsapiens.v75, feature = "transcript") # library(TxDb.Hsapiens.UCSC.hg19.knownGene) # annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene) # toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature = "geneModel") # toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature = "threeUTR") ## ----eval=FALSE---------------------------------------------------------- # overlaps <- ol$peaklist[["TAZ///TEAD4///YAP"]] # ## annotate the peaks by nearest annotations. # YAP.TAZ.TEAD4.nearest <- # annotatePeakInBatch(overlaps, # AnnotationData=annoData) # # ## annotate the peaks by both side in a given range. # YAP.TAZ.TEAD4.2side <- # annotatePeakInBatch(overlaps, # AnnotationData=annoData, # output="nearestBiDirectionalPromoters", # bindingRegion=c(-100000, 100000)) ## ------------------------------------------------------------------------ # overlaps <- ol$peaklist[["TAZ///TEAD4///YAP"]] # YAP.TAZ.TEAD4.nearest <- # annotatePeakInBatch(overlaps, # AnnotationData=annoData) # YAP.TAZ.TEAD4.promoter <- # annotatePeakInBatch(overlaps, # AnnotationData=annoData, # output="nearestBiDirectionalPromoters", # bindingRegion=c(-10000, 2000)) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=4.5, fig.height=4.5---- pie1(table(YAP.TAZ.TEAD4.2side$insideFeature)) ## ----eval=TRUE----------------------------------------------------------- library(org.Hs.eg.db) YAP.TAZ.TEAD4.nearest.anno <- addGeneIDs(annotatedPeak = YAP.TAZ.TEAD4.nearest, orgAnn = org.Hs.eg.db, feature_id_type = "entrez_id") YAP.TAZ.TEAD4.nearest.anno[1:3] ## ------------------------------------------------------------------------ # library(org.Hs.eg.db) # YAP.TAZ.TEAD4.promoter.anno <- # addGeneIDs(annotatedPeak = YAP.TAZ.TEAD4.promoter, # orgAnn = org.Hs.eg.db, # feature_id_type = "entrez_id") ## ----eval=FALSE---------------------------------------------------------- # ## save annotations # YAP.TAZ.TEAD4.2side.m <- as.data.frame(unname(YAP.TAZ.TEAD4.2side)) # YAP.TAZ.TEAD4.2side.m$peakNames <- NULL # library(WriteXLS) # WriteXLS("YAP.TAZ.TEAD4.2side.m", # "YAP.TAZ.TEAD4.overlapping.peaks.anno.xls") ## ----eval=FALSE---------------------------------------------------------- # ## enrichment analysis # library(org.Hs.eg.db) # over <- getEnrichedGO(YAP.TAZ.TEAD4.2side, # orgAnn="org.Hs.eg.db", # feature_id_type="entrez_id", # maxP=.01, condense=TRUE) ## ----eval=TRUE, message=FALSE, warning=FALSE----------------------------- over[["bp"]][1:3, -c(3, 10)] dim(over[["bp"]]) ## ----eval=FALSE---------------------------------------------------------- # library(reactome.db) # path <- getEnrichedPATH(YAP.TAZ.TEAD4.2side, # orgAnn="org.Hs.eg.db", # pathAnn="reactome.db", # feature_id_type="entrez_id", # maxP=.05) ## ----eval=TRUE, message=FALSE, warning=FALSE----------------------------- path[1:2, ] ## ----eval=FALSE---------------------------------------------------------- # bdp <- peaksNearBDP(overlaps, annoData, MaxDistance=100000) ## ----eval=TRUE, message=FALSE, warning=FALSE----------------------------- bdp ## ----eval=FALSE---------------------------------------------------------- # seqlevelsStyle(overlaps) ## == UCSC # aCR<-assignChromosomeRegion(overlaps, nucleotideLevel=FALSE, # TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=9, fig.height=4.5---- par(mar=c(10.1, 4.1, 4.1, 2.1)) barplot(aCR$percentage, las=3) ## ----eval=FALSE---------------------------------------------------------- # dist2TSS <- # lapply(list(YAP=GSE66081[["YAP"]], # TAZ=GSE66081[["TAZ"]], # TEAD4=GSE66081[["TEAD4"]], # YAP.TAZ.TEAD4=ol$peaklist[["TAZ///TEAD4///YAP"]]), # function(.ele) { # .ele <- annotatePeakInBatch(.ele, # AnnotationData=annoData, # output="nearestLocation") # .ele$shortestDistance}) # dist2TSS.cut <- lapply(dist2TSS, cut, breaks=c(0, 1e3, 1e4, 1e5, 1e10)) # dist2TSS.table <- sapply(dist2TSS.cut, table) # dist2TSS.percentage <- apply(dist2TSS.table, 2, # function(.ele) .ele/sum(.ele)) # library(ggplot2) # library(reshape2) # dist2TSS.percentage <- melt(dist2TSS.percentage) # dist2TSS.percentage$value <- dist2TSS.percentage$value * 100 ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=12, fig.height=8---- ggplot(dist2TSS.percentage, aes(x=Var2, y=value, fill=Var1)) + geom_bar(stat="identity") + xlab("") + ylab("%") + labs(title="Distance to TSS") + theme_bw() + scale_fill_manual(name="", values=c("#FF0000FF", "#FF0000BB", "#FF000077", "#FF000033"), labels=c("<1kb", "1-10kb", "10-100kb", ">100kb")) ## ----eval=FALSE---------------------------------------------------------- # library(GEOquery) # getGEOSuppFiles("GSE49651") # epipeaks.files <- dir("GSE49651", full.names=TRUE) # library(ChIPpeakAnno) # epipeaks <- lapply(epipeaks.files[3:5], # function(.ele) toGRanges(gzfile(.ele), format="BED")) # names(epipeaks) <- # gsub("GSE49651_MDAMB231.(.*?).hg19.*._peaks.txt.gz", "\\1", # basename(epipeaks.files)[3:5]) ## ----eval=FALSE---------------------------------------------------------- # promoter.UCSC <- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene, 5e3, 5e3) # promoter.UCSC <- unique(promoter.UCSC) # attach(epipeaks) # ol.promoter <- findOverlapsOfPeaks(H3K4me1, H3K4me3, H3K27Ac, # promoter.UCSC, # ignore.strand = FALSE) # promoter <- c(ol.promoter$peaklist[["H3K4me3///H3K27Ac///promoter.UCSC"]], # ol.promoter$peaklist[["H3K4me1///H3K4me3///H3K27Ac///promoter.UCSC"]]) # enhancer.active <- ol.promoter$peaklist[["H3K4me1///H3K27Ac"]] # enhancer.inactive <- ol.promoter$peaklist[["H3K4me1"]] # # mcols(enhancer.active) <- DataFrame(type="enhancer.active") # mcols(enhancer.inactive) <- DataFrame(type="enhancer.inactive") # mcols(promoter) <- DataFrame(type="promoter") # types <- c(promoter, enhancer.active, enhancer.inactive) # # YAP.TAZ.TEAD <- ol$peaklist[["TAZ///TEAD4///YAP"]] # seqlevelsStyle(YAP.TAZ.TEAD) <- "UCSC" # YAP.TAZ.TEAD <- YAP.TAZ.TEAD[seqnames(YAP.TAZ.TEAD) %in% # seqlevels(types)] # YAP.TAZ.TEAD4.type <- findOverlaps(YAP.TAZ.TEAD, types) # tbl <- table(types[subjectHits(YAP.TAZ.TEAD4.type)]$type) # tbl["not.classified"] <- length(YAP.TAZ.TEAD) - # length(unique(queryHits(YAP.TAZ.TEAD4.type))) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=4.5, fig.height=4.5---- pie1(tbl) ## ----eval=FALSE---------------------------------------------------------- # ##heatmap # YAP.TAZ.TEAD.assigned <- YAP.TAZ.TEAD[queryHits(YAP.TAZ.TEAD4.type)] # YAP.TAZ.TEAD.assigned$type <- types[subjectHits(YAP.TAZ.TEAD4.type)]$type # YAP.TAZ.TEAD.assigned <- c(YAP.TAZ.TEAD.assigned[YAP.TAZ.TEAD.assigned$type=="enhancer.active"], YAP.TAZ.TEAD.assigned[YAP.TAZ.TEAD.assigned$type=="promoter"]) # YAP.TAZ.TEAD.assigned <- unique(YAP.TAZ.TEAD.assigned) # # library(rtracklayer) # YAP.TAZ.TEAD.assigned.center <- # reCenterPeaks(YAP.TAZ.TEAD.assigned, width=1) # YAP.TAZ.TEAD.assigned.reCenter <- # reCenterPeaks(YAP.TAZ.TEAD.assigned, width=2000) # untar("GSE49651/GSE49651_RAW.tar") # sapply(c("GSM1204470_MDAMB231.H3K4me1_1.hg19.tags.bedGraph.gz", # "GSM1204472_MDAMB231.H3K4me3_1.hg19.tags.bedGraph.gz", # "GSM1204474_MDAMB231.H3K27Ac_1.hg19.tags.bedGraph.gz"), gunzip) # # bams <- c("YAP", "TAZ", "TEAD4", "JUN") # library(BSgenome.Hsapiens.UCSC.hg19) # len <- seqlengths(Hsapiens) # write.table(len, file="hg19.genome.txt", # quote=FALSE, col.names=FALSE, # sep="\t") # sapply(bams, function(.ele){ # system(paste0("genomeCoverageBed -bga -split -ibam ", # .ele, "_rep1.rmdup.bam -g hg19.genome.txt > ", # .ele, "_rep1.bedGraph")) # }) # # # files <- c(YAP="YAP_rep1.bedGraph", # TAZ="TAZ_rep1.bedGraph", # TEAD4="TEAD4_rep1.bedGraph", # JUN="JUN_rep1.bedGraph", # H3K4me1="GSM1204470_MDAMB231.H3K4me1_1.hg19.tags.bedGraph", # H3K4me3="GSM1204472_MDAMB231.H3K4me3_1.hg19.tags.bedGraph", # H3K27Ac="GSM1204474_MDAMB231.H3K27Ac_1.hg19.tags.bedGraph") # export(YAP.TAZ.TEAD.assigned.reCenter, "tmp.bed", format="BED") # sapply(files, function(.ele) # system(paste("intersectBed -a", .ele, "-b tmp.bed >", # gsub("bedGraph", "sub.bedGraph", .ele)))) # unlink("tmp.bed") # library(trackViewer) # cvglists <- sapply(gsub("bedGraph", "sub.bedGraph", files), # importData, format="bedGraph") # names(cvglists) <- names(files) # sig <- featureAlignedSignal(cvglists, YAP.TAZ.TEAD.assigned.center, # upstream=1000, downstream=1000) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=6, fig.height=6---- names(sig) heatmap <- featureAlignedHeatmap(sig, YAP.TAZ.TEAD.assigned.center, upstream=1000, downstream=1000, annoMcols=c("type"), margin=c(.1, .01, .15, .25)) ## ----eval=FALSE---------------------------------------------------------- # library(BSgenome.Hsapiens.UCSC.hg19) # YAP.TAZ.TEAD4.uniq <- unique(YAP.TAZ.TEAD4.nearest) # YAP.TAZ.TEAD4.uniq <- YAP.TAZ.TEAD4.uniq[!is.na(YAP.TAZ.TEAD4.uniq$feature)] # strand(YAP.TAZ.TEAD4.uniq) <- as.character(YAP.TAZ.TEAD4.uniq$feature_strand) # seq <- getAllPeakSequence(YAP.TAZ.TEAD4.uniq, upstream=0, downstream=0, # genome=Hsapiens) # freqs <- oligoFrequency(Hsapiens$chr1, MarkovOrder=3) # os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, # quickMotif=TRUE, freqs=freqs, revcomp=TRUE) # zscore <- sort(os$zscore) ## ----eval=TRUE, message=FALSE, warning=FALSE----------------------------- h <- hist(zscore, breaks=100) text(zscore[length(zscore)], max(h$counts)/10, labels=names(zscore[length(zscore)]), adj=1) ## ----eval=FALSE---------------------------------------------------------- # library(motifStack) # pfms <- mapply(function(.ele, id) # new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), # os$motifs, 1:length(os$motifs)) ## ----eval=TRUE, message=FALSE, warning=FALSE, fig.width=6, fig.height=4---- motifStack(pfms) ## ------------------------------------------------------------------------ # library(BSgenome.Hsapiens.UCSC.hg19) # YAP.TAZ.TEAD4.uniq <- unique(YAP.TAZ.TEAD4.nearest) # YAP.TAZ.TEAD4.uniq <- YAP.TAZ.TEAD4.uniq[!is.na(YAP.TAZ.TEAD4.uniq$feature)] # strand(YAP.TAZ.TEAD4.uniq) <- as.character(YAP.TAZ.TEAD4.uniq$feature_strand) # seq <- getAllPeakSequence(YAP.TAZ.TEAD4.uniq, upstream=0, downstream=0, genome=Hsapiens) # freqs <- oligoFrequency(Hsapiens$chr1, MarkovOrder=3) # os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, # quickMotif=TRUE, freqs=freqs, revcomp=TRUE) # zscore <- sort(os$zscore) # h <- hist(zscore, breaks=100) # text(zscore[length(zscore)], max(h$counts)/10, # labels=names(zscore[length(zscore)]), adj=1) # library(motifStack) # pfms <- mapply(function(.ele, id) # new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), # os$motifs, 1:length(os$motifs)) # motifStack(pfms) ## ----eval=FALSE---------------------------------------------------------- # ap1 <- GSE66081[["JUN"]] # seq.ap1 <- getAllPeakSequence(ap1, upstream=0, downstream=0, genome=Hsapiens) # os.ap1 <- oligoSummary(seq.ap1, oligoLength=7, MarkovOrder=3, # quickMotif=FALSE, freqs=freqs, revcomp=TRUE) # zscore.ap1 <- sort(os.ap1$zscore) ## ----eval=TRUE, fig.width=6, fig.height=4-------------------------------- h.ap1 <- hist(zscore.ap1, breaks=100) text(zscore.ap1[length(zscore.ap1)], max(h.ap1$counts)/10, labels=names(zscore.ap1[length(zscore.ap1)]), adj=1) ## ----eval=FALSE---------------------------------------------------------- # library(GEOquery) # library(limma) # gset <- getGEO("GSE59232", GSEMatrix =TRUE, AnnotGPL=TRUE)[[1]] # pD <- pData(gset)[, c("title", "geo_accession")] # gset <- gset[, grepl("MDA-MB-231 cells", pD$title, fixed = TRUE)] # pD <- pData(gset)[, c("title", "geo_accession")] # ex <- exprs(gset) ## log2 transformed expression profile ## ----eval=FALSE---------------------------------------------------------- # fl <- do.call(rbind, # strsplit(sub("MDA-MB-231 cells ", # "", pD$title), ", ")) # sml <- sub("#.$", "", fl[, 1]) # design.table <- data.frame(group=sml, # batch=fl[, 1], # replicate=fl[, 2]) # design <- model.matrix(~ -1 + group + batch + replicate, # data = design.table) # colnames(design) <- sub("group", "", # make.names(colnames(design))) # fit <- lmFit(gset, design) # cont.matrix <- makeContrasts(contrasts = "siYAP.TAZ-siControl", # levels=design) # fit2 <- contrasts.fit(fit, cont.matrix) # fit2 <- eBayes(fit2) # tT <- topTable(fit2, adjust="fdr", number = nrow(fit2)) ## ----eval=FALSE---------------------------------------------------------- # library(simpIntLists) # interactionmap <- findInteractionList("human", "Official") # interactionmap <- lapply(interactionmap, function(.ele) # cbind(from=as.character(.ele$name), # to=as.character(.ele$interactors))) # interactionmap <- do.call(rbind, interactionmap) # library(GeneNetworkBuilder) # rootgene <- "TAZ" # TFbindingTable <- # cbind(from=rootgene, # to=unique(YAP.TAZ.TEAD4.nearest.anno$symbol)) # sifNetwork <- # buildNetwork(TFbindingTable=TFbindingTable, # interactionmap=interactionmap, level=2) # tT$symbols <- tT$Gene.symbol # tT <- tT[, c("ID", "logFC", "P.Value", "symbols")] # expressionData <- tT[!duplicated(tT[, "symbols"]), ] # cifNetwork <- # filterNetwork(rootgene=rootgene, sifNetwork=sifNetwork, # exprsData=expressionData, mergeBy="symbols", # miRNAlist=character(0), # tolerance=1, cutoffPVal=0.001, # cutoffLFC=1.5) # ## polish network # gR<-polishNetwork(cifNetwork) ## ----eval=TRUE, fig.width=5, fig.height=3.5------------------------------ ## plot by Rgraphviz library(Rgraphviz) plotNetwork<-function(gR, layouttype="dot", ...){ if(!is(gR,"graphNEL")) stop("gR must be a graphNEL object") if(!(GeneNetworkBuilder:::inList(layouttype, c("dot", "neato", "twopi", "circo", "fdp")))){ stop("layouttype must be dot, neato, twopi, circo or fdp") } g1<-Rgraphviz::layoutGraph(gR, layoutType=layouttype, ...) nodeRenderInfo(g1)$col <- nodeRenderInfo(gR)$col nodeRenderInfo(g1)$fill <- nodeRenderInfo(gR)$fill renderGraph(g1) } plotNetwork(gR) ## ----eval=TRUE----------------------------------------------------------- browseNetwork(gR) ## ----eval=FALSE---------------------------------------------------------- # library(trackViewer) # ## prepare ranges to view # gene <- "SYDE2" # eID <- mget(gene, org.Hs.egSYMBOL2EG) # gr <- as(annoData[eID[[1]]], "GRanges") # gr.promoter <- promoters(gr, upstream=5000, downstream=2000) # seqlevels(gr.promoter) <- "chr1" # seqinfo(gr.promoter) <- seqinfo(gr.promoter)["chr1"] # ## import data # bams.rep1 <- sub(".bam", ".rmdup.bam", # bam.files[grepl("rep1", bam.files)]) # syde <- lapply(bams.rep1, importBam, ranges=gr.promoter) # names(syde) <- gsub("_rep1.rmdup.bam", "", bams.rep1) # ## get gene model # trs <- geneModelFromTxdb(TxDb.Hsapiens.UCSC.hg19.knownGene, # orgDb = org.Hs.eg.db, gr = gr.promoter) # trs.names <- sapply(trs, function(.ele) .ele@name) # trs <- trs[match(gene, trs.names)] # names(trs) <- gene # ## modify the plot styles # optSty <- optimizeStyle(trackList(syde, trs, heightDist=c(.9, .1)), # theme="col") # trackList <- optSty$tracks # viewerStyle <- optSty$style # for(i in 1:length(trackList)) # setTrackStyleParam(trackList[[i]], "ylabgp", list(cex=.8)) ## ----eval=TRUE, fig.width=12, fig.height=8------------------------------- ## plot vp <- viewTracks(trackList, gr=gr.promoter, viewerStyle=viewerStyle) addGuideLine(guideLine = c(85666950, 85667700), vp = vp) ## ------------------------------------------------------------------------ # library(trackViewer) # library(org.Hs.eg.db) # library(TxDb.Hsapiens.UCSC.hg19.knownGene) # library(ChIPseqStepByStep) # (ex.gr <- parse2GRanges("chr1:85664729-85671728:-")) # ## get gene model # trs <- geneModelFromTxdb(TxDb.Hsapiens.UCSC.hg19.knownGene, # orgDb = org.Hs.eg.db, gr = ex.gr) # ## import signals # packagePath <- system.file("extdata", "bedGraph", package = "ChIPseqStepByStep") # (ex.bg <- dir(packagePath, "bedGraph")) # ex.scores <- sapply(ex.bg, function(.ele){ # importScore(file.path(packagePath, .ele), # format = "bedGraph", # ranges = ex.gr) # }) # ## browse the tracks # browseTracks(trackList(ex.scores, trs, heightDist = c(3/4, 1/4)), gr=ex.gr) ## ----eval=TRUE----------------------------------------------------------- sessionInfo()