## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"-------------------- BiocStyle::latex(use.unsrturl=FALSE) ## ----setup_knitr, include=FALSE, cache=FALSE------------------------------- library(knitr) opts_chunk$set(cache = TRUE, tidy = FALSE, tidy.opts = list(blank = FALSE, width.cutoff=70), highlight = FALSE, out.width = "7cm", out.height = "7cm", fig.align = "center") ## ----sri_file, eval = TRUE------------------------------------------------- library(PasillaTranscriptExpr) data_dir <- system.file("extdata", package = "PasillaTranscriptExpr") sri <- read.table(paste0(data_dir, "/SraRunInfo.csv"), stringsAsFactors = FALSE, sep = ",", header = TRUE) keep <- grep("CG8144|Untreated-", sri$LibraryName) sri <- sri[keep, ] ## ----download, eval = FALSE------------------------------------------------ # sra_files <- basename(sri$download_path) # # for(i in 1:nrow(sri)) # download.file(sri$download_path[i], sra_files[i]) # ## ----fastqdump, eval = FALSE----------------------------------------------- # cmd <- paste0("fastq-dump -O ./ --split-3 ", sra_files) # # for(i in 1:length(cmd)) # system(cmd[i]) # # system("gzip *.fastq") ## ----download_fasta, eval = FALSE------------------------------------------ # system("wget ftp://ftp.ensembl.org/pub/release-70/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP5.70.cdna.all.fa.gz") # system("gunzip Drosophila_melanogaster.BDGP5.70.cdna.all.fa.gz") ## ----download_gtf, eval = FALSE-------------------------------------------- # system("wget ftp://ftp.ensembl.org/pub/release-70/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP5.70.gtf.gz") # system("gunzip Drosophila_melanogaster.BDGP5.70.gtf.gz") ## ----kallisto_metadata, eval = TRUE---------------------------------------- sri$LibraryName <- gsub("S2_DRSC_","",sri$LibraryName) metadata <- unique(sri[,c("LibraryName", "LibraryLayout", "SampleName", "avgLength")]) for(i in seq_len(nrow(metadata))){ indx <- sri$LibraryName == metadata$LibraryName[i] if(metadata$LibraryLayout[i] == "PAIRED"){ metadata$fastq[i] <- paste0(sri$Run[indx], "_1.fastq.gz ", sri$Run[indx], "_2.fastq.gz", collapse = " ") }else{ metadata$fastq[i] <- paste0(sri$Run[indx], ".fastq.gz", collapse = " ") } } metadata$condition <- ifelse(grepl("CG8144_RNAi", metadata$LibraryName), "KD", "CTL") metadata$UniqueName <- paste0(1:nrow(metadata), "_", metadata$SampleName) ## ----kallisto_index, eval = TRUE------------------------------------------- cDNA_fasta <- "Drosophila_melanogaster.BDGP5.70.cdna.all.fa" index <- "Drosophila_melanogaster.BDGP5.70.cdna.all.idx" cmd <- paste("kallisto index -i", index, cDNA_fasta, sep = " ") cmd ## ----kallisto_index_cmd, eval = FALSE-------------------------------------- # system(cmd) ## ----kallisto_quantification, eval = TRUE---------------------------------- out_dir <- metadata$UniqueName cmd <- paste("kallisto quant -i", index, "-o", out_dir, "-b 0 -t 5", ifelse(metadata$LibraryLayout == "SINGLE", paste("--single -l", metadata$avgLength), ""), metadata$fastq) cmd ## ----kallisto_quantification_cmd, eval = FALSE----------------------------- # for(i in 1:length(cmd)) # system(cmd[i]) ## ----library_rtracklayer, eval = TRUE, message = FALSE--------------------- library(rtracklayer) ## ----gtf, eval = FALSE----------------------------------------------------- # gtf_dir <- "Drosophila_melanogaster.BDGP5.70.gtf" # # gtf <- import(gtf_dir) # # gt <- unique(mcols(gtf)[, c("gene_id", "transcript_id")]) # rownames(gt) <- gt$transcript_id # ## ----merge_counts, eval = FALSE-------------------------------------------- # samples <- unique(metadata$SampleName) # # counts_list <- lapply(1:length(samples), function(i){ # indx <- which(metadata$SampleName == samples[i]) # # if(length(indx) == 1){ # abundance <- read.table(file.path(metadata$UniqueName[indx], # "abundance.txt"), header = TRUE, sep = "\t", as.is = TRUE) # }else{ # abundance <- lapply(indx, function(j){ # abundance_tmp <- read.table(file.path(metadata$UniqueName[j], # "abundance.txt"), header = TRUE, sep = "\t", as.is = TRUE) # abundance_tmp <- abundance_tmp[, c("target_id", "est_counts")] # abundance_tmp # }) # abundance <- Reduce(function(...) merge(..., by = "target_id", all = TRUE, # sort = FALSE), abundance) # est_counts <- rowSums(abundance[, -1]) # abundance <- data.frame(target_id = abundance$target_id, # est_counts = est_counts, stringsAsFactors = FALSE) # } # # counts <- data.frame(abundance$target_id, abundance$est_counts, # stringsAsFactors = FALSE) # colnames(counts) <- c("feature_id", samples[i]) # return(counts) # }) # # counts <- Reduce(function(...) merge(..., by = "feature_id", all = TRUE, # sort = FALSE), counts_list) # # ### Add gene IDs # counts$gene_id <- gt[counts$feature_id, "gene_id"] # ## ----metadata, eval = TRUE------------------------------------------------- metadata <- unique(metadata[, c("LibraryName", "LibraryLayout", "SampleName", "condition")]) metadata ## ----metadata_save, eval = FALSE------------------------------------------- # write.table(metadata, "metadata.txt", quote = FALSE, sep = "\t", # row.names = FALSE) ## ----counts_save, eval = FALSE--------------------------------------------- # ### Final counts with columns sorted as in metadata # counts <- counts[, c("feature_id", "gene_id", metadata$SampleName)] # write.table(counts, "counts.txt", quote = FALSE, sep = "\t", row.names = FALSE) ## ----sessionInfo----------------------------------------------------------- sessionInfo()