## ----style, echo = FALSE, results = 'asis'---------------- BiocStyle::markdown() options(width = 60, max.print = 1000) knitr::opts_chunk$set( eval = as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache = as.logical(Sys.getenv("KNITR_CACHE", "TRUE")), tidy.opts = list(width.cutoff = 60), tidy = TRUE) ## ----setup, echo=FALSE, messages=FALSE, warnings=FALSE---- suppressPackageStartupMessages({ library(systemPipeR) }) ## ----genNew_wf, eval=FALSE-------------------------------- # systemPipeRdata::genWorkenvir(workflow = "chipseq", mydirname = "chipseq") # setwd("chipseq") ## ----load_targets_file, eval=TRUE------------------------- targetspath <- system.file("extdata", "targetsPE_chip.txt", package = "systemPipeR") targets <- read.delim(targetspath, comment.char = "#") targets[1:4,-c(5,6)] ## ----create_workflow, message=FALSE, eval=FALSE----------- # library(systemPipeR) # sal <- SPRproject() # sal ## ----load_SPR, message=FALSE, eval=FALSE, spr=TRUE-------- # appendStep(sal) <- LineWise(code = { # library(systemPipeR) # }, step_name = "load_SPR") ## ----fastq_report, eval=FALSE, message=FALSE, spr=TRUE---- # appendStep(sal) <- LineWise( # code = { # targets <- read.delim("targetsPE_chip.txt", comment.char = "#") # updateColumn(sal, step = "load_SPR", position = "targetsWF") <- targets # fq_files <- getColumn(sal, "load_SPR", "targetsWF", column = 1) # fqlist <- seeFastq(fastq = fq_files, batchsize = 10000, klength = 8) # png("./results/fastqReport.png", height = 162, width = 288 * length(fqlist)) # seeFastqPlot(fqlist) # dev.off() # }, # step_name = "fastq_report", # dependency = "load_SPR" # ) ## ----preprocessing, message=FALSE, eval=FALSE, spr=TRUE---- # appendStep(sal) <- SYSargsList( # step_name = "preprocessing", # targets = "targetsPE_chip.txt", dir = TRUE, # wf_file = "preprocessReads/preprocessReads-pe.cwl", # input_file = "preprocessReads/preprocessReads-pe.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("fastq_report") # ) ## ----custom_preprocessing_function, eval=FALSE------------ # appendStep(sal) <- LineWise( # code = { # filterFct <- function(fq, cutoff = 20, Nexceptions = 0) { # qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE) # # Retains reads where Phred scores are >= cutoff with N exceptions # fq[qcount <= Nexceptions] # } # save(list = ls(), file = "param/customFCT.RData") # }, # step_name = "custom_preprocessing_function", # dependency = "preprocessing" # ) ## ----editing_preprocessing, message=FALSE, eval=FALSE----- # yamlinput(sal, "preprocessing")$Fct # yamlinput(sal, "preprocessing", "Fct") <- "'filterFct(fq, cutoff=20, Nexceptions=0)'" # yamlinput(sal, "preprocessing")$Fct ## check the new function # cmdlist(sal, "preprocessing", targets = 1) ## check if the command line was updated with success ## ----bowtie2_index, eval=FALSE, spr=TRUE------------------ # appendStep(sal) <- SYSargsList( # step_name = "bowtie2_index", # dir = FALSE, targets = NULL, # wf_file = "bowtie2/bowtie2-index.cwl", # input_file = "bowtie2/bowtie2-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = c("preprocessing") # ) ## ----bowtie2_alignment, eval=FALSE, spr=TRUE-------------- # appendStep(sal) <- SYSargsList( # step_name = "bowtie2_alignment", # dir = TRUE, # targets = "targetsPE_chip.txt", # wf_file = "workflow-bowtie2/workflow_bowtie2-pe.cwl", # input_file = "workflow-bowtie2/workflow_bowtie2-pe.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("bowtie2_index") # ) ## ----bowtie2_alignment_check, eval=FALSE------------------ # cmdlist(sal, step="bowtie2_alignment", targets=1) ## ----align_stats, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # fqpaths <- getColumn(sal, step = "bowtie2_alignment", "targetsWF", column = "FileName1") # bampaths <- getColumn(sal, step = "bowtie2_alignment", "outfiles", column = "samtools_sort_bam") # read_statsDF <- alignStats(args = bampaths, fqpaths = fqpaths, pairEnd = TRUE) # write.table(read_statsDF, "results/alignStats.xls", row.names=FALSE, quote=FALSE, sep="\t") # }, # step_name = "align_stats", # dependency = "bowtie2_alignment") ## ----bam_IGV, eval=FALSE, spr=TRUE------------------------ # appendStep(sal) <- LineWise( # code = { # bampaths <- getColumn(sal, step = "bowtie2_alignment", "outfiles", # column = "samtools_sort_bam") # symLink2bam( # sysargs = bampaths, htmldir = c("~/.html/", "somedir/"), # urlbase = "http://cluster.hpcc.ucr.edu/~tgirke/", # urlfile = "./results/IGVurl.txt") # }, # step_name = "bam_IGV", # dependency = "bowtie2_alignment", # run_step = "optional" # ) ## ----rle_object, eval=FALSE------------------------------- # bampaths <- getColumn(sal, step = "bowtie2_alignment", "outfiles", column = "samtools_sort_bam") # aligns <- readGAlignments(bampaths[1]) # cov <- coverage(aligns) # cov ## ----resize_align, eval=FALSE----------------------------- # trim(resize(as(aligns, "GRanges"), width = 200)) ## ----rle_slice, eval=FALSE-------------------------------- # islands <- slice(cov, lower = 15) # islands[[1]] ## ----plot_coverage, eval=FALSE---------------------------- # library(ggbio) # myloc <- c("Chr1", 1, 100000) # ga <- readGAlignments(bampaths[1], use.names=TRUE, # param=ScanBamParam(which=GRanges(myloc[1], # IRanges(as.numeric(myloc[2]), as.numeric(myloc[3]))))) # autoplot(ga, aes(color = strand, fill = strand), facets = strand ~ seqnames, stat = "coverage") ## ----merge_bams, eval=FALSE, spr=TRUE--------------------- # appendStep(sal) <- LineWise( # code = { # bampaths <- getColumn(sal, step = "bowtie2_alignment", "outfiles", column = "samtools_sort_bam") # merge_bams <- mergeBamByFactor(args=bampaths, targetsDF = targetsWF(sal)[["bowtie2_alignment"]], out_dir = file.path("results", "merge_bam") ,overwrite=TRUE) # updateColumn(sal, step = "merge_bams", position = "targetsWF") <- merge_bams # }, # step_name = "merge_bams", # dependency = "bowtie2_alignment" # ) ## ----call_peaks_macs_noref, eval=FALSE, spr=TRUE---------- # cat("Running preprocessing for call_peaks_macs_noref\n") # # Previous Linewise step is not run at workflow building time, # # but we need the output as input for this sysArgs step. So # # we use some preprocess code to predict the output paths # # to update the output targets of merge_bams, and then # # them into this next step during workflow building phase. # mergebam_out_dir = file.path("results", "merge_bam") # make sure this is the same output directory used in merge_bams # targets_merge_bam <- targetsWF(sal)$bowtie2_alignment # targets_merge_bam <- targets_merge_bam[, -which(colnames(targets_merge_bam) %in% c("FileName1", "FileName2", "FileName"))] # targets_merge_bam <- targets_merge_bam[!duplicated(targets_merge_bam$Factor), ] # targets_merge_bam <- cbind(FileName = file.path(mergebam_out_dir, paste0(targets_merge_bam$Factor, "_merged.bam")), targets_merge_bam) # updateColumn(sal, step = "merge_bams", position = "targetsWF") <- targets_merge_bam # # write it out as backup, so you do not need to use preprocess code above again # writeTargets(sal, step = "merge_bams", file = "targets_merge_bams.txt", overwrite = TRUE) # # ###pre-end # appendStep(sal) <- SYSargsList( # step_name = "call_peaks_macs_noref", # targets = "targets_merge_bams.txt", # or use "merge_bams" to directly grab from the previous step # wf_file = "MACS2/macs2-noinput.cwl", # input_file = "MACS2/macs2-noinput.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c( # FileName = "_FASTQ_PATH1_", # SampleName = "_SampleName_" # ), # dependency = c("merge_bams") # ) ## ----call_peaks_macs_withref, eval=FALSE, spr=TRUE-------- # cat("Running preprocessing for call_peaks_macs_withref\n") # # To generate the reference targets file for the next step, use `writeTargetsRef`, # # this file needs to be present at workflow building time # # Use following preprocess code to do so: # writeTargetsRef(infile = "targets_merge_bams.txt", outfile = "targets_bam_ref.txt", silent = FALSE, overwrite = TRUE) # # ###pre-end # appendStep(sal) <- SYSargsList( # step_name = "call_peaks_macs_withref", # targets = "targets_bam_ref.txt", # wf_file = "MACS2/macs2-input.cwl", # input_file = "MACS2/macs2-input.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("merge_bams") # ) ## ----consensus_peaks, eval=FALSE, spr=TRUE---------------- # appendStep(sal) <- LineWise( # code = { # peaks_files <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles", column = "peaks_xls") # peak_M1A <- peaks_files["M1A"] # peak_M1A <- as(read.delim(peak_M1A, comment="#")[,1:3], "GRanges") # peak_A1A <- peaks_files["A1A"] # peak_A1A <- as(read.delim(peak_A1A, comment="#")[,1:3], "GRanges") # (myol1 <- subsetByOverlaps(peak_M1A, peak_A1A, minoverlap=1)) # # Returns any overlap # myol2 <- olRanges(query=peak_M1A, subject=peak_A1A, output="gr") # # Returns any overlap with OL length information # myol2[values(myol2)["OLpercQ"][,1]>=50] # # Returns only query peaks with a minimum overlap of 50% # }, # step_name = "consensus_peaks", # dependency = "call_peaks_macs_noref" # ) ## ----annotation_ChIPseeker, eval=FALSE, spr=TRUE---------- # appendStep(sal) <- LineWise( # code = { # library(ChIPseeker); library(GenomicFeatures) # peaks_files <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles", column = "peaks_xls") # txdb <- suppressWarnings(makeTxDbFromGFF(file="data/tair10.gff", format="gff", dataSource="TAIR", # organism="Arabidopsis thaliana")) # for(i in seq(along=peaks_files)) { # peakAnno <- annotatePeak(peaks_files[i], TxDb=txdb, verbose=FALSE) # df <- as.data.frame(peakAnno) # outpaths <- paste0("./results/", names(peaks_files), "_ChIPseeker_annotated.xls") # names(outpaths) <- names(peaks_files) # write.table(df, outpaths[i], quote=FALSE, row.names=FALSE, sep="\t") # } # updateColumn(sal, step = "annotation_ChIPseeker", position = "outfiles") <- data.frame(outpaths) # }, # step_name = "annotation_ChIPseeker", # dependency = "call_peaks_macs_noref" # ) ## ----ChIPseeker_plots, eval=FALSE, spr=TRUE--------------- # appendStep(sal) <- LineWise( # code = { # peaks_files <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles", column = "peaks_xls") # peak <- readPeakFile(peaks_files[1]) # png("results/peakscoverage.png") # covplot(peak, weightCol="X.log10.pvalue.") # dev.off() # png("results/peaksHeatmap.png") # peakHeatmap(peaks_files[1], TxDb=txdb, upstream=1000, downstream=1000, # color="red") # dev.off() # png("results/peaksProfile.png") # plotAvgProf2(peaks_files[1], TxDb=txdb, upstream=1000, downstream=1000, # xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency", # conf=0.05) # dev.off() # }, # step_name = "ChIPseeker_plots", # dependency = "annotation_ChIPseeker" # ) ## ----annotation_ChIPpeakAnno, eval=FALSE, spr=TRUE-------- # appendStep(sal) <- LineWise( # code = { # library(ChIPpeakAnno); library(GenomicFeatures) # peaks_files <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles", column = "peaks_xls") # txdb <- suppressWarnings(makeTxDbFromGFF(file="data/tair10.gff", format="gff", dataSource="TAIR", # organism="Arabidopsis thaliana")) # ge <- genes(txdb, columns=c("tx_name", "gene_id", "tx_type")) # for(i in seq(along=peaks_files)) { # peaksGR <- as(read.delim(peaks_files[i], comment="#"), "GRanges") # annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData=genes(txdb)) # df <- data.frame(as.data.frame(annotatedPeak), # as.data.frame(values(ge[values(annotatedPeak)$feature,]))) # df$tx_name <- as.character(lapply(df$tx_name, function(x) paste(unlist(x), sep='', collapse=', '))) # df$tx_type <- as.character(lapply(df$tx_type, function(x) paste(unlist(x), sep='', collapse=', '))) # outpaths <- paste0("./results/", names(peaks_files), "_ChIPpeakAnno_annotated.xls") # names(outpaths) <- names(peaks_files) # write.table(df, outpaths[i], quote=FALSE, row.names=FALSE, sep="\t") # } # }, # step_name = "annotation_ChIPpeakAnno", # dependency = "call_peaks_macs_noref", # run_step = "optional" # ) ## ----count_peak_ranges, eval=FALSE, spr=TRUE-------------- # appendStep(sal) <- LineWise( # code = { # library(GenomicRanges) # bam_files <- getColumn(sal, step = "bowtie2_alignment", "outfiles", column = "samtools_sort_bam") # args <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles", column = "peaks_xls") # outfiles <- paste0("./results/", names(args), "_countDF.xls") # bfl <- BamFileList(bam_files, yieldSize=50000, index=character()) # countDFnames <- countRangeset(bfl, args, outfiles, mode="Union", ignore.strand=TRUE) # updateColumn(sal, step = "count_peak_ranges", position = "outfiles") <- data.frame(countDFnames) # }, # step_name = "count_peak_ranges", # dependency = "call_peaks_macs_noref", # ) ## ----diff_bind_analysis, eval=FALSE, spr=TRUE------------- # appendStep(sal) <- LineWise( # code = { # countDF_files <- getColumn(sal, step = "count_peak_ranges", "outfiles") # outfiles <- paste0("./results/", names(countDF_files), "_peaks_edgeR.xls") # names(outfiles) <- names(countDF_files) # cmp <- readComp(file =stepsWF(sal)[["bowtie2_alignment"]], # format="matrix") # dbrlist <- runDiff(args=countDF_files, outfiles = outfiles, diffFct=run_edgeR, # targets=targetsWF(sal)[["bowtie2_alignment"]], cmp=cmp[[1]], # independent=TRUE, dbrfilter=c(Fold=2, FDR=1)) # }, # step_name = "diff_bind_analysis", # dependency = "count_peak_ranges", # ) ## ----go_enrich, eval=FALSE, spr=TRUE---------------------- # appendStep(sal) <- LineWise( # code = { # annofiles <- getColumn(sal, step = "annotation_ChIPseeker", "outfiles") # gene_ids <- sapply(annofiles, # function(x) unique(as.character # (read.delim(x)[,"geneId"])), simplify=FALSE) # load("data/GO/catdb.RData") # BatchResult <- GOCluster_Report(catdb=catdb, setlist=gene_ids, method="all", # id_type="gene", CLSZ=2, cutoff=0.9, # gocats=c("MF", "BP", "CC"), recordSpecGO=NULL) # write.table(BatchResult, "results/GOBatchAll.xls", quote=FALSE, row.names=FALSE, sep="\t") # }, # step_name = "go_enrich", # dependency = "annotation_ChIPseeker", # ) ## ----parse_peak_sequences, eval=FALSE, spr=TRUE----------- # appendStep(sal) <- LineWise( # code = { # library(Biostrings); library(seqLogo); library(BCRANK) # rangefiles <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles") # for(i in seq(along=rangefiles)) { # df <- read.delim(rangefiles[i], comment="#") # peaks <- as(df, "GRanges") # names(peaks) <- paste0(as.character(seqnames(peaks)), "_", start(peaks), # "-", end(peaks)) # peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing=TRUE)] # pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks) # names(pseq) <- names(peaks) # writeXStringSet(pseq, paste0(rangefiles[i], ".fasta")) # } # }, # step_name = "parse_peak_sequences", # dependency = "call_peaks_macs_noref", # ) ## ----bcrank_enrich, eval=FALSE, spr=TRUE------------------ # appendStep(sal) <- LineWise( # code = { # library(Biostrings); library(seqLogo); library(BCRANK) # rangefiles <- getColumn(sal, step = "call_peaks_macs_noref", "outfiles") # set.seed(0) # BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts=25, # use.P1=TRUE, use.P2=TRUE) # toptable(BCRANKout) # topMotif <- toptable(BCRANKout, 1) # weightMatrix <- pwm(topMotif, normalize = FALSE) # weightMatrixNormalized <- pwm(topMotif, normalize = TRUE) # png("results/seqlogo.png") # seqLogo(weightMatrixNormalized) # dev.off() # }, # step_name = "bcrank_enrich", # dependency = "call_peaks_macs_noref", # ) ## ----sessionInfo, eval=FALSE, spr=TRUE-------------------- # appendStep(sal) <- LineWise( # code = { # sessionInfo() # }, # step_name = "sessionInfo", # dependency = "bcrank_enrich" # ) ## ----runWF, eval=FALSE------------------------------------ # sal <- runWF(sal) ## ----runWF_cluster, eval=FALSE---------------------------- # resources <- list(conffile=".batchtools.conf.R", # template="batchtools.slurm.tmpl", # Njobs=18, # walltime=120, ## minutes # ntasks=1, # ncpus=4, # memory=1024, ## Mb # partition = "short" # ) # sal <- addResources(sal, c("bowtie2_alignment"), resources = resources) # sal <- runWF(sal) ## ----plotWF, eval=FALSE----------------------------------- # plotWF(sal, rstudio = TRUE) ## ----statusWF, eval=FALSE--------------------------------- # sal # statusWF(sal) ## ----logsWF, eval=FALSE----------------------------------- # sal <- renderLogs(sal) ## ----eval=TRUE-------------------------------------------- sessionInfo()