## ----installation, eval=FALSE------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("SETA") ## ----setup, message=FALSE, warning=FALSE, echo=TRUE--------------------------- library(SingleCellExperiment) library(SETA) library(ggplot2) library(dplyr) library(TabulaMurisSenisData) library(reshape2) # library(tidytext) ## ----load data, message=FALSE, warning=FALSE, echo=TRUE----------------------- sce <- TabulaMurisSenisDroplet(tissues = "Lung")$Lung # Two samples in this dataset have had certain celltype lineages depleted. # For compositional comparisons, we remove these for simplicity. sce <- sce[, colData(sce)$subtissue != "immune-endo-depleted"] ## ----tabular data exploration------------------------------------------------- table(sce$free_annotation, sce$age) table(sce$free_annotation, sce$sex) ## ----palettes----------------------------------------------------------------- # Set up a color palette for plots # 3-group distinct categoricals age_palette <- c( "1m" = "#90EE90", "3m" = "#4CBB17", "18m" = "#228B22", "21m" = "#355E3B", "30m" = "#023020") ## ----data wrangling, message=FALSE, warning=FALSE, echo=TRUE------------------ # Extract the taxonomic counts matrix using custom metadata column names # (Users can specify these column names according to their dataset.) df <- data.frame(colData(sce)) # Fix special characters in mouse.id before working with the data df$mouse.id <- gsub("/", "", df$mouse.id) taxa_counts <- setaCounts( df, cell_type_col = "free_annotation", sample_col = "mouse.id", bc_col = "cell") head(taxa_counts) ## ----transformation, echo=TRUE------------------------------------------------ # CLR Transformation with a pseudocount of 1 (default) clr_out <- setaCLR(taxa_counts, pseudocount = 1) print(clr_out$counts[1:5, 1:5]) ## ----ALR---------------------------------------------------------------------- # ALR Transformation: using "NK cell" as the reference cell type if ("Natural Killer" %in% colnames(taxa_counts)) { alr_out <- setaALR(taxa_counts, ref = "Natural Killer", pseudocount = 1) print(alr_out$counts[1:5, 1:5]) } else { message("Reference 'NK cell' not found in taxa_counts. Please choose an available cell type.") } ## ----ILR---------------------------------------------------------------------- ilr_out <- setaILR(taxa_counts, boxcox_p = 0, pseudocount = 1) print(ilr_out$counts[1:5, 1:5]) ## ----percent------------------------------------------------------------------ pct_out <- setaPercent(taxa_counts) print(pct_out$counts[1:5, 1:5]) ## ----logCPM------------------------------------------------------------------- lcpm_out <- setaLogCPM(taxa_counts, pseudocount = 1) print(lcpm_out$counts[1:5, 1:5]) ## ----latent spaces, echo=TRUE------------------------------------------------- latent_pca <- setaLatent(clr_out, method = "PCA", dims = 5) # PCA Latent Space Coordinates: head(latent_pca$latentSpace) # Variance Explained: latent_pca$varExplained ## ----variance explained, message=FALSE, warning=FALSE, echo=TRUE-------------- ve_df <- data.frame( PC = factor(seq_along(latent_pca$varExplained), levels = seq_along(latent_pca$varExplained)), VarianceExplained = cumsum(latent_pca$varExplained) ) ## ----variance plot, fig.width = 6, fig.height = 4----------------------------- ggplot(ve_df, aes(x = PC, y = VarianceExplained)) + geom_line(color = "#2ca1db", size = 1.5, group = 1) + geom_point(color = "#f44323", size = 3) + labs(title = "Cumulative Variance Explained by Principal Components", x = "Principal Component", y = "Cumulative Variance Explained (%)") + theme_minimal() + ylim(c(0, 1)) + theme(text = element_text(size = 12)) ## ----PCA scatter-------------------------------------------------------------- # Extract latent space coordinates from PCA result. pca_coords <- latent_pca$latentSpace # Extract metadata meta_df <- setaMetadata( df, sample_col = "mouse.id", meta_cols = c("age", "sex")) # Merge latent coordinates with metadata pca_plot_df <- cbind(pca_coords, meta_df) ## ----PCA scatter plot, fig.width = 6, fig.height = 4-------------------------- # Create the scatter plot ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = age)) + geom_text(aes(label = sample_id), size = 5) + scale_color_manual( values = age_palette ) + labs(title = "PCA Scatter Plot", x = "PC1", y = "PC2", color = "Age") + theme_linedraw() + xlab(sprintf("PC1 (%s%%)", signif(latent_pca$varExplained[1], 4) * 100)) + ylab(sprintf("PC2 (%s%%)", signif(latent_pca$varExplained[2], 4) * 100)) ## ----loadings, message=FALSE, warning=FALSE, echo=TRUE------------------------ loadings_df <- as.data.frame(latent_pca$loadings) loadings_df$Celltype <- rownames(loadings_df) # Melt the loadings into long format for `ggplot2` loadings_long <- melt(loadings_df, id.vars = "Celltype", variable.name = "PC", value.name = "Loading") ## ----loadings viz, fig.width = 12, fig.height = 9----------------------------- # Subset rows where PC is "PC1" or "PC2" loading_df <- loadings_long[loadings_long$PC %in% c("PC1", "PC2"), ] # Append PC to Celltype to mimic tidytext::reorder_within() loading_df$Celltype_PC <- paste(loading_df$Celltype, loading_df$PC, sep = "___") # Reorder by Loading *within each PC* loading_df$Celltype_PC <- with(loading_df, reorder(Celltype_PC, Loading)) ggplot(loading_df, aes(x = Loading, y = Celltype, label = Celltype, fill = Loading)) + geom_col() + facet_wrap(~PC, scales = "free") + # scale_y_reordered() + labs(title = "PC Loadings", x = "Loading", y = "Cell Type") + theme_linedraw() + scale_fill_viridis_c() + expand_limits(x = c(-0.2, 0.4)) ## ----packages used------------------------------------------------------------ sessionInfo()