ClusterGVis

To enhance clustering and visualization of time-series gene expression data from RNA-Seq experiments, we present the ClusterGVis package. This tool enables concise and elegant analysis of time-series gene expression data in a simple, one-step operation. Additionally, you can perform enrichment analysis for each cluster using the enrichCluster function, which integrates seamlessly with clusterProfiler. ClusterGVis empowers you to create publication-quality visualizations with ease.

Comprehensive documentation is available at: https://junjunlab.github.io/ClusterGvis-manual

Usage

Basic examples:

Here we load the built-in RNA-seq expression matrix, in which each row is a gene and each column holds the expression values for one differentiation stage: zygote, two-cell, four-cell, eight-cell, morula, and blastocyst:

suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(S4Vectors))
library(ClusterGVis)

# a data.frame or SummarizedExperiment object
data("exps")

head(exps)

##            Zygote    2-cell   4-cell   8-cell    Morula Blastocyst
## Oog4    1.3132282 1.2370781 1.325978 1.262073 0.6549312  0.2067114
## Psmd9   1.0917337 1.3159888 1.174417 1.064756 0.8685598  0.4845448
## Sephs2  0.9859232 1.2010257 1.123076 1.084673 0.8878931  0.7174088
## Nhlrc2  0.9856354 1.0387869 1.061926 1.076825 0.9716945  0.8651322
## Trappc4 1.0775310 0.9757542 1.065544 1.080973 0.9732145  0.8269832
## Ywhah   1.0485306 1.0212216 1.117839 1.199569 1.0384096  0.5744298

The getClusters function employs the elbow method to help users pre-determine the appropriate number of clusters for their analysis:

# check for a suitable number of clusters
getClusters(obj = exps)

To investigate gene expression modules that exhibit distinct expression patterns across different differentiation stages, we employ k-means clustering to group genes, with the number of clusters set to 8:

# use k-means for clustering
ck <- clusterData(obj = exps,
                  clusterMethod = "kmeans",
                  clusterNum = 8)

## [1] "0 genes excluded.\n"

Besides standard gene expression matrices (in data.frame or matrix format), users can also directly pass SummarizedExperiment objects as input data:

# construct a SummarizedExperiment object
sce <- SummarizedExperiment(assays = list(counts = exps),
                            colData = S4Vectors::DataFrame(
                              sample = colnames(exps),
                              row.names = colnames(exps))
                            )

sce

## class: SummarizedExperiment 
## dim: 3767 6 
## metadata(0):
## assays(1): counts
## rownames(3767): Oog4 Psmd9 ... Eprs Cenpe
## rowData names(0):
## colnames(6): Zygote 2-cell ... Morula Blastocyst
## colData names(1): sample

# use k-means for clustering (SummarizedExperiment input)
ck2 <- clusterData(obj = sce,
                  clusterMethod = "kmeans",
                  clusterNum = 8)

## [1] "0 genes excluded.\n"

We can then visualize the clustering results. The visCluster function supports various visualization methods, including line plots, heatmaps, and complex composite graphics, to demonstrate the expression trend patterns of genes across different modules:

Line plot:

# plot line only
visCluster(object = ck,
           plotType = "line")

Heatmap plot:

# plot heatmap only
visCluster(object = ck,
           plotType = "heatmap")

Complex heatmap with line plot annotation:

# plot heatmap together with line plot annotation
visCluster(object = ck,
           plotType = "both")

Integration with Seurat object:

ClusterGVis is compatible with outputs from single-cell analysis pipelines, such as Seurat objects. Here we demonstrate the visualization of marker genes discovered for distinct cell subpopulations:

suppressPackageStartupMessages(library(Seurat))

data("pbmc_subset")

# find markers for every cluster compared to all remaining cells
# report only the positive ones
pbmc.markers.all <- Seurat::FindAllMarkers(pbmc_subset,
                                           only.pos = TRUE,
                                           min.pct = 0.25,
                                           logfc.threshold = 0.25)

# get the top 20 genes per cluster, ranked by avg_log2FC
pbmc.markers <- pbmc.markers.all |>
  dplyr::group_by(cluster) |>
  dplyr::top_n(n = 20, wt = avg_log2FC)

# check
head(pbmc.markers)

## # A tibble: 6 × 7
## # Groups:   cluster [1]
##      p_val avg_log2FC pct.1 pct.2 p_val_adj cluster     gene 
##      <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>       <chr>
## 1 1.51e-18      0.585 1     0.973  7.57e-16 Naive CD4 T RPS23
## 2 8.37e-15      0.638 0.977 0.954  4.18e-12 Naive CD4 T RPSA 
## 3 1.55e-14      0.560 0.992 0.973  7.74e-12 Naive CD4 T RPS16
## 4 1.45e- 8      0.436 0.955 0.965  7.24e- 6 Naive CD4 T RPL17
## 5 7.47e- 6      0.442 0.902 0.861  3.73e- 3 Naive CD4 T RPL23
## 6 1.24e- 3      0.918 0.406 0.278  6.19e- 1 Naive CD4 T FUS

# prepare data from seurat object
st.data <- prepareDataFromscRNA(object = pbmc_subset,
                                diffData = pbmc.markers,
                                showAverage = TRUE)

# check
str(st.data)

## List of 5
##  $ wide.res:'data.frame':    77 obs. of  11 variables:
##   ..$ Naive CD4 T : num [1:77] 1.31 1.28 1.59 1.27 1.08 ...
##   ..$ Memory CD4 T: num [1:77] 0.646 1.005 1.006 0.976 -0.256 ...
##   ..$ CD14+ Mono  : num [1:77] -0.586 -0.825 0.134 -0.334 0.291 ...
##   ..$ B           : num [1:77] 1.37 1.304 0.798 0.717 0.641 ...
##   ..$ CD8 T       : num [1:77] 0.2046 0.2121 -0.0593 0.7409 -0.0982 ...
##   ..$ FCGR3A+ Mono: num [1:77] -0.424 -0.818 -0.812 -0.565 -0.37 ...
##   ..$ NK          : num [1:77] -0.82 -0.402 -0.71 -0.428 -1.442 ...
##   ..$ DC          : num [1:77] -0.0409 -0.3588 -0.3731 -0.5095 1.4949 ...
##   ..$ Platelet    : num [1:77] -1.66 -1.4 -1.58 -1.87 -1.34 ...
##   ..$ gene        : chr [1:77] "RPS23" "RPSA" "RPS16" "RPL17" ...
##   ..$ cluster     : chr [1:77] "1" "1" "1" "1" ...
##  $ long.res:'data.frame':    693 obs. of  5 variables:
##   ..$ cluster     : chr [1:693] "1" "1" "1" "1" ...
##   ..$ gene        : chr [1:693] "RPS23" "RPSA" "RPS16" "RPL17" ...
##   ..$ cell_type   : Factor w/ 9 levels "Naive CD4 T",..: 1 1 1 1 1 1 2 2 2 2 ...
##   ..$ norm_value  : num [1:693] 1.31 1.28 1.59 1.27 1.08 ...
##   ..$ cluster_name: Factor w/ 9 levels "cluster 1 (6)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ type    : chr "scRNAdata"
##  $ geneMode: chr "average"
##  $ geneType: chr "unique|_"

Heatmap plot:

# randomly pick 40 of the unique marker genes to pass as markGenes
markGenes <- unique(pbmc.markers$gene)[
  sample(1:length(unique(pbmc.markers$gene)),40,replace = FALSE)]

# heatmap plot
# pdf('sc1.pdf',height = 10,width = 6,onefile = FALSE)
p <- visCluster(object = st.data,
           plotType = "heatmap",
           column_names_rot = 45,
           markGenes = markGenes,
           clusterOrder = c(1:9))

# dev.off()

Integration with SingleCellExperiment object:

If you are working with a SingleCellExperiment object, you can use ClusterGVis to easily extract data and generate plots:

library(Seurat)
data("pbmc_subset")

# convert the Seurat object into a SingleCellExperiment object
sce <- as.SingleCellExperiment(pbmc_subset)

pbmc.markers.all <- Seurat::FindAllMarkers(pbmc_subset,
                                           only.pos = TRUE,
                                           min.pct = 0.25,
                                           logfc.threshold = 0.25)

# get the top 20 genes per cluster, ranked by avg_log2FC
pbmc.markers <- pbmc.markers.all |>
  dplyr::group_by(cluster) |>
  dplyr::top_n(n = 20, wt = avg_log2FC)

st.data <- prepareDataFromscRNA(object = sce,
                                diffData = pbmc.markers[,c("cluster","gene")],
                                showAverage = TRUE)

visCluster(object = st.data,
           plotType = "heatmap",
           column_names_rot = 45,
           markGenes = markGenes,
           clusterOrder = c(1:9))

Session Info

sessionInfo()