## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "../man/figures/README-", out.width = "100%", warning = FALSE, dpi=300 ) library(BiocStyle) ## ----'install1', eval = FALSE, message=FALSE, warning=FALSE------------------- # # if (!require("BiocManager", quietly = TRUE)) # # install.packages("BiocManager") # # BiocManager::install(version = "3.16") ## ----'install2', eval = FALSE, message=FALSE, warning=FALSE------------------- # # BiocManager::install("AnnotationHub", update = FALSE) # # BiocManager::install("GenomicRanges", update = FALSE) # # BiocManager::install("plyranges", update = FALSE) ## ----AnnotationHub------------------------------------------------------------ suppressMessages(library(GenomicRanges)) suppressMessages(library(AnnotationHub)) ah <- AnnotationHub() query_data <- subset(ah, preparerclass == "excluderanges") # You can search for multiple terms # query_data <- query(ah, c("excluderanges", "Kundaje", "hg38")) query_data ## ----hg38excluderanges-------------------------------------------------------- excludeGR.hg38.Kundaje.1 <- query_data[["AH107305"]] # Always a good idea to sort GRanges and keep standard chromosomes excludeGR.hg38.Kundaje.1 <- excludeGR.hg38.Kundaje.1 %>% sort() %>% keepStandardChromosomes(pruning.mode = "tidy") excludeGR.hg38.Kundaje.1 ## ----eval=FALSE--------------------------------------------------------------- # # Note that rtracklayer::import and rtracklayer::export perform unexplained # # start coordinate conversion, likely related to 0- and 1-based coordinate # # system. We recommend converting GRanges to a data frame and save tab-separated # write.table(as.data.frame(excludeGR.hg38.Kundaje.1), # file = "hg38.Kundaje.GRCh38_unified_Excludable.bed", # sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ## ----allhg38excluderanges----------------------------------------------------- query_data <- query(ah, c("excluderanges", "hg38")) query_data excludeGR.hg38.Bernstein <- query_data[["AH107306"]] excludeGR.hg38.Boyle <- query_data[["AH107307"]] excludeGR.hg38.Kundaje.2 <- query_data[["AH107308"]] excludeGR.hg38.Lareau <- query_data[["AH107309"]] excludeGR.hg38.Reddy <- query_data[["AH107310"]] excludeGR.hg38.Wimberley <- query_data[["AH107311"]] excludeGR.hg38.Wold <- query_data[["AH107312"]] excludeGR.hg38.Yeo <- query_data[["AH107313"]] ## ----excluderanges_hg38_count, fig.width=6.5, fig.height=2-------------------- library(ggplot2) mtx_to_plot <- data.frame(Count = c(length(excludeGR.hg38.Bernstein), length(excludeGR.hg38.Boyle), length(excludeGR.hg38.Kundaje.1), length(excludeGR.hg38.Kundaje.2), length(excludeGR.hg38.Lareau), length(excludeGR.hg38.Reddy), length(excludeGR.hg38.Wimberley), length(excludeGR.hg38.Wold), length(excludeGR.hg38.Yeo)), Source = c("Bernstein.Mint_Excludable_GRCh38", "Boyle.hg38-Excludable.v2", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Lareau.hg38.full.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wimberley.peakPass60Perc_sorted", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover.bed")) # Order Source by the number of regions mtx_to_plot$Source <- factor(mtx_to_plot$Source, levels = mtx_to_plot$Source[order(mtx_to_plot$Count)]) ggplot(mtx_to_plot, aes(x = Source, y = Count, fill = Source)) + geom_bar(stat = "identity") + coord_flip() + theme_bw() + theme(legend.position = "none") # ggsave("../man/figures/excluderanges_hg38_count.png", width = 5.5, height = 2) ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # knitr::include_graphics('../man/figures/excluderanges_hg38_count.png') ## ----excluderanges_hg38_width, fig.width=6.5, fig.height=2, message=FALSE----- library(ggridges) library(dplyr) mtx_to_plot <- data.frame(Width = c(width(excludeGR.hg38.Bernstein), width(excludeGR.hg38.Boyle), width(excludeGR.hg38.Kundaje.1), width(excludeGR.hg38.Kundaje.2), width(excludeGR.hg38.Lareau), width(excludeGR.hg38.Reddy), width(excludeGR.hg38.Wimberley), width(excludeGR.hg38.Wold), width(excludeGR.hg38.Yeo)), Source = c(rep("Bernstein.Mint_Excludable_GRCh38", length(excludeGR.hg38.Bernstein)), rep("Boyle.hg38-Excludable.v2", length(excludeGR.hg38.Boyle)), rep("Kundaje.GRCh38_unified_Excludable", length(excludeGR.hg38.Kundaje.1)), rep("Kundaje.GRCh38.Excludable", length(excludeGR.hg38.Kundaje.2)), rep("Lareau.hg38.full.Excludable", length(excludeGR.hg38.Lareau)), rep("Reddy.wgEncodeDacMapabilityConsensusExcludable", length(excludeGR.hg38.Reddy)), rep("Wimberley.peakPass60Perc_sorted", length(excludeGR.hg38.Wimberley)), rep("Wold.hg38mitoExcludable", length(excludeGR.hg38.Wold)), rep("Yeo.eCLIP_Excludableregions.hg38liftover.bed", length(excludeGR.hg38.Yeo)))) # Order objects by decreasing width mtx_to_plot$Source <- factor(mtx_to_plot$Source, levels = mtx_to_plot %>% group_by(Source) %>% summarise(Mean = mean(Width)) %>% arrange(Mean) %>% pull(Source)) ggplot(mtx_to_plot, aes(x = log2(Width), y = Source, fill = Source)) + geom_density_ridges() + theme_bw() + theme(legend.position = "none") # ggsave("../man/figures/excluderanges_hg38_width.png", width = 5.5, height = 2) ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # knitr::include_graphics('../man/figures/excluderanges_hg38_width.png') ## ----excluderanges_hg38_sumwidth, fig.width=6.5, fig.height=3----------------- mtx_to_plot <- data.frame(TotalWidth = c(sum(width(excludeGR.hg38.Bernstein)), sum(width(excludeGR.hg38.Boyle)), sum(width(excludeGR.hg38.Kundaje.1)), sum(width(excludeGR.hg38.Kundaje.2)), sum(width(excludeGR.hg38.Lareau)), sum(width(excludeGR.hg38.Reddy)), sum(width(excludeGR.hg38.Wimberley)), sum(width(excludeGR.hg38.Wold)), sum(width(excludeGR.hg38.Yeo))), Source = c("Bernstein.Mint_Excludable_GRCh38", "Boyle.hg38-Excludable.v2", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Lareau.hg38.full.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wimberley.peakPass60Perc_sorted", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover.bed")) # Order objects by decreasing width mtx_to_plot$Source <- factor(mtx_to_plot$Source, levels = mtx_to_plot %>% group_by(Source) %>% arrange(TotalWidth) %>% pull(Source)) ggplot(mtx_to_plot, aes(x = TotalWidth, y = Source, fill = Source)) + geom_bar(stat="identity") + scale_x_log10() + scale_y_discrete(label=abbreviate, limits=rev) + xlab("log10 total width") # ggsave("../man/figures/excluderanges_hg38_sumwidth.png", width = 6.5, height = 2) ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # knitr::include_graphics('../man/figures/excluderanges_hg38_sumwidth.png') ## ----excluderanges_hg38_overlap_coefficient, warning=FALSE, fig.width=6.5, fig.height=6---- library(pheatmap) library(stringr) # Overlap coefficient calculations overlap_coefficient <- function(gr_a, gr_b) { intersects <- GenomicRanges::intersect(gr_a, gr_b, ignore.strand = TRUE) intersection_width <- sum(width(intersects)) min_width <- min(sum(width(gr_a)), sum(width(gr_b))) DataFrame(intersection_width, min_width, overlap_coefficient = intersection_width/min_width, n_intersections = length(intersects)) } # List and names of all excludable regions all_excludeGR_list <- list(excludeGR.hg38.Bernstein, excludeGR.hg38.Boyle, excludeGR.hg38.Kundaje.1, excludeGR.hg38.Kundaje.2, excludeGR.hg38.Lareau, excludeGR.hg38.Reddy, excludeGR.hg38.Wimberley, excludeGR.hg38.Wold, excludeGR.hg38.Yeo) all_excludeGR_name <- c("Bernstein.Mint_Excludable_GRCh38", "Boyle.hg38-Excludable.v2", "Kundaje.GRCh38_unified_Excludable", "Kundaje.GRCh38.Excludable", "Lareau.hg38.full.Excludable", "Reddy.wgEncodeDacMapabilityConsensusExcludable", "Wimberley.peakPass60Perc_sorted", "Wold.hg38mitoExcludable", "Yeo.eCLIP_Excludableregions.hg38liftover.bed") # Correlation matrix, empty mtx_to_plot <- matrix(data = 0, nrow = length(all_excludeGR_list), ncol = length(all_excludeGR_list)) # Fill it in for (i in 1:length(all_excludeGR_list)) { for (j in 1:length(all_excludeGR_list)) { # If diagonal, set to zero if (i == j) mtx_to_plot[i, j] <- 0 # Process only one half, the other is symmetric if (i > j) { mtx_to_plot[i, j] <- mtx_to_plot[j, i] <- overlap_coefficient(all_excludeGR_list[[i]], all_excludeGR_list[[j]])[["overlap_coefficient"]] } } } # Trim row/colnames rownames(mtx_to_plot) <- colnames(mtx_to_plot) <- str_trunc(all_excludeGR_name, width = 25) # Save the plot # png("../man/figures/excluderanges_hg38_jaccard.png", width = 1000, height = 900, res = 200) pheatmap(data.matrix(mtx_to_plot), clustering_method = "ward.D") # dev.off() ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # knitr::include_graphics('../man/figures/excluderanges_hg38_jaccard.png') ## ----excluderanges_hg38_Reddy_metadata, fig.width=6.5, fig.height=3----------- mcols(excludeGR.hg38.Reddy) mtx_to_plot <- table(mcols(excludeGR.hg38.Reddy)[["name"]]) %>% as.data.frame() colnames(mtx_to_plot) <- c("Type", "Number") mtx_to_plot <- mtx_to_plot[order(mtx_to_plot$Number), ] mtx_to_plot$Type <- factor(mtx_to_plot$Type, levels = mtx_to_plot$Type) ggplot(mtx_to_plot, aes(x = Number, y = Type, fill = Type)) + geom_bar(stat="identity") + theme_bw() + theme(legend.position = "none") # ggsave("../man/figures/excluderanges_hg38_Reddy_metadata.png", width = 5, height = 2.5) ## ----echo=FALSE, eval=FALSE--------------------------------------------------- # knitr::include_graphics('../man/figures/excluderanges_hg38_Reddy_metadata.png') ## ----combinedexcluderanges, warning=FALSE------------------------------------- excludeGR.hg38.all <- reduce(c(excludeGR.hg38.Bernstein, excludeGR.hg38.Boyle, excludeGR.hg38.Kundaje.1, excludeGR.hg38.Kundaje.2, excludeGR.hg38.Lareau, excludeGR.hg38.Reddy, excludeGR.hg38.Wimberley, excludeGR.hg38.Wold, excludeGR.hg38.Yeo)) # Sort and Keep only standard chromosomes excludeGR.hg38.all <- excludeGR.hg38.all %>% sort %>% keepStandardChromosomes(pruning.mode = "tidy") print(length(excludeGR.hg38.all)) summary(width(excludeGR.hg38.all)) ## ----------------------------------------------------------------------------- suppressMessages(library(httr)) # Get hg38.Lareau.hg38_peaks BEDbase ID bedbase_id <- "9fa55701a3bd3e7a598d1d2815e3390f" # Construct output file name fileNameOut <- "hg38.Lareau.hg38_peak.bed.gz" # API token for BED data token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed") # Download file GET(url = token2, write_disk(fileNameOut, overwrite = TRUE)) # , verbose() # Read the data in hg38.Lareau.hg38_peaks <- read.table(fileNameOut, sep = "\t", header = FALSE) # Assign column names depending on the number of columns all_columns <- c("chr", "start", "stop", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak") colnames(hg38.Lareau.hg38_peaks) <- all_columns[1:ncol(hg38.Lareau.hg38_peaks)] # Convert to GRanges object hg38.Lareau.hg38_peaks <- makeGRangesFromDataFrame(hg38.Lareau.hg38_peaks, keep.extra.columns = TRUE) hg38.Lareau.hg38_peaks ## ----eval=FALSE--------------------------------------------------------------- # # Search for the gap track # # ahData <- query(ah, c("gap", "Homo sapiens", "hg19")) # # ahData[ahData$title == "Gap"] # gaps <- ahData[["AH6444"]] ## ----gapcounts, echo=FALSE, out.height="70%", out.width="70%"----------------- suppressMessages(library(rtracklayer)) # knitr::include_graphics('../man/figures/excluderanges_hg19_gaps_number.png') # Get genome-specific gaps table mySession <- browserSession() genome(mySession) <- "hg19" query <- ucscTableQuery(mySession, table = "gap") gaps <- getTable(query) # Number of regions per gap type mtx_to_plot <- as.data.frame(table(gaps$type)) colnames(mtx_to_plot) <- c("Type", "Number") mtx_to_plot <- mtx_to_plot[order(mtx_to_plot$Number), ] mtx_to_plot$Type <- factor(mtx_to_plot$Type, levels = mtx_to_plot$Type) ggplot(mtx_to_plot, aes(x = Number, y = Type, fill = Type)) + geom_bar(stat="identity") + theme_bw() + theme(legend.position = "none") # ggsave("../man/figures/excluderanges_hg19_gaps_number.png", width = 5, height = 2.5) ## ----gapshg38----------------------------------------------------------------- query_data <- query(ah, c("excluderanges", "UCSC", "Homo Sapiens", "hg38")) query_data gapsGR_hg38_centromere <- query_data[["AH107354"]] gapsGR_hg38_centromere ## ----eval=FALSE--------------------------------------------------------------- # # hg38 CUT&RUN exclusion set, BED # download.file("https://drive.google.com/uc?export=download&id=1rKIu7kdiEySTi-cq3nYxXJP4VQX1IPcS", # destfile = "hg38.Nordin.CandRblacklist_hg38.bed") # # hg38 CUT&RUN exclusion set, RDS # download.file("https://drive.google.com/uc?export=download&id=1JuB1h-QQUw1mddBavI7CIuH7R-lwwczU", # destfile = "hg38.Nordin.CandRblacklist_hg38.rds") # # And then load the GRanges object # mtx <- readRDS("hg38.Nordin.CandRblacklist_hg38.rds") ## ----eval=FALSE--------------------------------------------------------------- # # mm10 CUT&RUN exclusion set, BED # download.file("https://drive.google.com/uc?export=download&id=1CRAojdphMbAzd3MnW_UmO1WtsDrHsrU1", # destfile = "mm10.Nordin.CandRblacklist_mm10.bed") # # mm10 CUT&RUN exclusion set, RDS # download.file("https://drive.google.com/uc?export=download&id=1orPXLWUZ4-C4n_Jt2gH-WERLpY9Kn0t_", # destfile = "mm10.Nordin.CandRblacklist_mm10.rds") ## ----echo=FALSE--------------------------------------------------------------- mtx <- read.csv("../man/figures/Table_S1.csv") mtx$BEDbase.URL <- ifelse(!is.na(mtx$BEDbase.ID), paste0("[link](http://bedbase.org/#/bedsplash/", mtx$BEDbase.ID, ")"), "NA") knitr::kable(mtx[, c("Name", "Ahub.IDs.BioC.3.16.and.above", "BEDbase.URL", "Description", "Filtered.Region.count")]) ## ----'citation', eval = requireNamespace('excluderanges')--------------------- print(citation("excluderanges"), bibtex = TRUE) ## ----------------------------------------------------------------------------- sessionInfo()