Flexibility of the dimensionality reduction assessment • ClustAssess

is_seuratdata <- require("SeuratData", quietly = TRUE)
if (!is_seuratdata) {
    devtools::install_github('satijalab/seurat-data', upgrade = "never")
}

## Downloading GitHub repo satijalab/seurat-data@HEAD

## ── R CMD build ─────────────────────────────────────────────────────────────────
## * checking for file ‘/tmp/RtmppyRYcT/remotesc6e337361af/satijalab-seurat-data-3e51f44/DESCRIPTION’ ... OK
## * preparing ‘SeuratData’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Omitted ‘LazyData’ from DESCRIPTION
## * building ‘SeuratData_0.2.2.9002.tar.gz’

## Installing package into '/tmp/RtmpvumD9H/temp_libpath7ac96630a8e'
## (as 'lib' is unspecified)

is_harmony <- require("harmony", quietly = TRUE)
if (!is_harmony) {
    install.packages("harmony", repos = "https://cloud.r-project.org")
}

is_rsamtool <- require("Rsamtools", quietly = TRUE)

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

if (!is_rsamtool) {
    is_biocmanager <- require("BiocManager", quietly = TRUE)
    if (!is_biocmanager) {
        install.packages("BiocManager", repos = "https://cloud.r-project.org")
        library(BiocManager)
    }

    install("Rsamtools")
}

library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'sp'

## The following object is masked from 'package:IRanges':
## 
##     %over%

## 
## Attaching package: 'SeuratObject'

## The following object is masked from 'package:Biostrings':
## 
##     intersect

## The following object is masked from 'package:GenomicRanges':
## 
##     intersect

## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect

## The following object is masked from 'package:IRanges':
## 
##     intersect

## The following object is masked from 'package:S4Vectors':
## 
##     intersect

## The following object is masked from 'package:BiocGenerics':
## 
##     intersect

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(SeuratData)
library(ClustAssess)
library(ggplot2)
library(harmony)
library(data.table)

## 
## Attaching package: 'data.table'

## The following object is masked from 'package:GenomicRanges':
## 
##     shift

## The following object is masked from 'package:IRanges':
## 
##     shift

## The following objects are masked from 'package:S4Vectors':
## 
##     first, second

library(Rsamtools)

n_repetitions <- 30

The matrix processing parameter of the assess_feature_stability function is a function that enables the user to specify any method to perform the dimensionality reduction prior to applying the UMAP algorithm and the clustering pipeline. By default, the dimensionality reduction used in ClustAssess is a precise PCA using the prcomp package. However, this function can be easily changed, as it will be shown in the following examples.

ClustAssess using PCA

For the PCA example, we will use the PBMC 3k dataset from the SeuratData package. The preprocessing of the dataset is identical with the one performed in the stability pipeline vignette.

options(timeout=3600)
InstallData("pbmc3k")

## Installing package into '/tmp/RtmpvumD9H/temp_libpath7ac96630a8e'
## (as 'lib' is unspecified)

data("pbmc3k")
pbmc3k <- UpdateSeuratObject(pbmc3k)

## Validating object structure

## Updating object slots

## Ensuring keys are in the proper structure

## Warning: Assay RNA changing from Assay to Assay

## Ensuring keys are in the proper structure

## Ensuring feature names don't have underscores or pipes

## Updating slots in RNA

## Validating object structure for Assay 'RNA'

## Object representation is consistent with the most current Seurat version

pbmc3k <- PercentageFeatureSet(pbmc3k, pattern = "^MT-", col.name = "percent.mito")
pbmc3k <- PercentageFeatureSet(pbmc3k, pattern = "^RP[SL][[:digit:]]", col.name = "percent.rp")
# remove MT and RP genes
all.index <- seq_len(nrow(pbmc3k))
MT.index <- grep(pattern = "^MT-", x = rownames(pbmc3k), value = FALSE)
RP.index <- grep(pattern = "^RP[SL][[:digit:]]", x = rownames(pbmc3k), value = FALSE)
pbmc3k <- pbmc3k[!((all.index %in% MT.index) | (all.index %in% RP.index)), ]
pbmc3k <- subset(pbmc3k, nFeature_RNA < 2000 & nCount_RNA < 2500 & percent.mito < 7 & percent.rp > 7)
pbmc3k <- NormalizeData(pbmc3k, verbose = FALSE)
pbmc3k <- FindVariableFeatures(pbmc3k, selection.method = "vst", nfeatures = 3000, verbose = FALSE)

features <- dimnames(pbmc3k@assays$RNA)[[1]]
var_features <- pbmc3k@assays[["RNA"]]@var.features
n_abundant <- 3000
most_abundant_genes <- rownames(pbmc3k@assays$RNA)[order(Matrix::rowSums(pbmc3k@assays$RNA),
    decreasing = TRUE
)]

pbmc3k <- ScaleData(pbmc3k, features = features, verbose = FALSE)

We notice that the seurat_annotations column has some missing values. For simplicity, we will replace them with “NA”.

mask <- is.na(pbmc3k$seurat_annotations)
pbmc3k$seurat_annotations <- as.character(pbmc3k$seurat_annotations)
pbmc3k$seurat_annotations[mask] <- "NA"

Select the features used for the stability assessment.

features <- dimnames(pbmc3k@assays$RNA)[[1]]
var_features <- pbmc3k@assays[["RNA"]]@var.features
n_abundant <- 3000
most_abundant_genes <- rownames(pbmc3k@assays$RNA)[order(Matrix::rowSums(pbmc3k@assays$RNA),
    decreasing = TRUE
)]

steps <- seq(from = 500, to = 3000, by = 500)
ma_hv_genes_intersection_sets <- sapply(steps, function(x) intersect(most_abundant_genes[1:x], var_features[1:x]))
ma_hv_genes_intersection <- Reduce(union, ma_hv_genes_intersection_sets)
ma_hv_steps <- sapply(ma_hv_genes_intersection_sets, length)

Assess the stability of the dimensionality reduction when PCA is used as dimensionality reduction.

matrix_processing_function <- function(dt_mtx, actual_npcs = 30) {
    actual_npcs <- min(actual_npcs, ncol(dt_mtx) %/% 2)

    RhpcBLASctl::blas_set_num_threads(foreach::getDoParWorkers())
    embedding <- stats::prcomp(x = dt_mtx, rank. = actual_npcs)$x

    RhpcBLASctl::blas_set_num_threads(1)
    rownames(embedding) <- rownames(dt_mtx)
    colnames(embedding) <- paste0("PC_", seq_len(actual_npcs))

    return(embedding)
}

pca_feature_stability <- assess_feature_stability(
    data_matrix = pbmc3k@assays[["RNA"]]@scale.data,
    feature_set = most_abundant_genes,
    resolution = seq(from = 0.1, to = 1, by = 0.1),
    steps = steps,
    n_repetitions = n_repetitions,
    feature_type = "MA",
    graph_reduction_type = "PCA",
    matrix_processing = matrix_processing_function,
    umap_arguments = list(
        min_dist = 0.3,
        n_neighbors = 30,
        metric = "cosine"
    ),
    ecs_thresh = 1,
    clustering_algorithm = 1
)

## Warning: executing %dopar% sequentially: no parallel backend registered

Plot the distribution of the celltypes on the UMAP embedding obtained on the top 1000 Most Abundant genes.

umap_df <- data.frame(pca_feature_stability$embedding_list$MA$"1000")
umap_df$celltypes <- pbmc3k$seurat_annotations
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = celltypes)) +
    geom_point() +
    theme_classic()

ClustAssess using Harmony

We can also modify the function by adding an addition post-processing step to the PCA. In this example, we will use the Harmony correction to remove the “batch effect” created by the celltypes. Note: This example is meant to exemplify how to use the Harmony correction in the ClusAssess pipeline. The batch correction is actually not needed in the PBMC 3k dataset.

matrix_processing_function <- function(dt_mtx, actual_npcs = 30) {
    actual_npcs <- min(actual_npcs, ncol(dt_mtx) %/% 2)

    RhpcBLASctl::blas_set_num_threads(foreach::getDoParWorkers())
    embedding <- stats::prcomp(x = dt_mtx, rank. = actual_npcs)$x

    RhpcBLASctl::blas_set_num_threads(1)
    rownames(embedding) <- rownames(dt_mtx)
    colnames(embedding) <- paste0("PC_", seq_len(actual_npcs))

    embedding <- RunHarmony(embedding, pbmc3k$seurat_annotations, verbose = FALSE)

    return(embedding)
}

pca_harmony_feature_stability <- assess_feature_stability(
    data_matrix = pbmc3k@assays[["RNA"]]@scale.data,
    feature_set = most_abundant_genes,
    resolution = seq(from = 0.1, to = 1, by = 0.1),
    steps = steps,
    n_repetitions = n_repetitions,
    feature_type = "MA",
    graph_reduction_type = "PCA",
    matrix_processing = matrix_processing_function,
    umap_arguments = list(
        min_dist = 0.3,
        n_neighbors = 30,
        metric = "cosine"
    ),
    ecs_thresh = 1,
    clustering_algorithm = 1,
    verbose = TRUE
)

Plot the distribution of the celltypes on the UMAP embedding obtained on the top 1000 Most Abundant genes.

umap_df <- data.frame(pca_harmony_feature_stability$embedding_list$MA$"1000")
umap_df$celltypes <- pbmc3k$seurat_annotations
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = celltypes)) +
    geom_point() +
    theme_classic()

ClustAssess in the scATAC-seq data

In this example we will showcase the flexibility of the assess_feature_stability function by using the ATAC-seq data. For this example, we will use the multiome PBMC dataset from the SeuratData package.

library(Signac)
InstallData("pbmcMultiome")

## Installing package into '/tmp/RtmpvumD9H/temp_libpath7ac96630a8e'
## (as 'lib' is unspecified)

data("pbmc.atac")

As presented in the (Signac)(https://stuartlab.org/signac/articles/pbmc_vignette) package, the ATAC-seq data is usually processed using the TF-IDF normalization followed by the the calculation of the singular values. These two steps are also known as LSI (Latent Semantic Indexing).

pbmc.atac <- RunTFIDF(pbmc.atac)

## Performing TF-IDF normalization

## Warning in RunTFIDF.default(object = GetAssayData(object = object, layer =
## "counts"), : Some features contain 0 total counts

Identify the highly variable peaks.

pbmc.atac <- FindTopFeatures(pbmc.atac, min.cutoff = "q5")
var_peaks <- pbmc.atac@assays$ATAC@var.features[seq_len(3000)]

To speedup the assessment, set a parallel backend with 6 cores.

RhpcBLASctl::blas_set_num_threads(1)
ncores <- 1
if (ncores > 1) {
    my_cluster <- parallel::makeCluster(
        ncores,
        type = "PSOCK"
    )

    doParallel::registerDoParallel(cl = my_cluster)
}

Assess the stability of the dimensionality reduction by varying the number of highly variable peaks.

matrix_processing_function <- function(dt_mtx, actual_n_singular_values = 50) {
    actual_n_singular_values <- min(actual_n_singular_values, ncol(dt_mtx) %/% 2)

    RhpcBLASctl::blas_set_num_threads(foreach::getDoParWorkers())
    embedding <- RunSVD(Matrix::t(dt_mtx), n = actual_n_singular_values, verbose = FALSE)@cell.embeddings
    # remove the first component, as it does contain noise - see the Signac vignette
    embedding <- embedding[, 2:actual_n_singular_values]

    RhpcBLASctl::blas_set_num_threads(1)
    rownames(embedding) <- rownames(dt_mtx)
    colnames(embedding) <- paste0("LSI_", seq_len(actual_n_singular_values - 1))

    return(embedding)
}

lsi_atac_feature_stability <- assess_feature_stability(
    data_matrix = pbmc.atac@assays[["ATAC"]]@data,
    feature_set = var_peaks,
    resolution = seq(from = 0.1, to = 1, by = 0.1),
    steps = steps,
    n_repetitions = n_repetitions,
    feature_type = "HV_peaks",
    graph_reduction_type = "PCA",
    matrix_processing = matrix_processing_function,
    umap_arguments = list(
        min_dist = 0.3,
        n_neighbors = 30,
        metric = "cosine"
    ),
    ecs_thresh = 1,
    clustering_algorithm = 1,
    verbose = TRUE
)

## Warning: No assay specified, setting assay as RNA by default.
## No assay specified, setting assay as RNA by default.
## No assay specified, setting assay as RNA by default.
## No assay specified, setting assay as RNA by default.
## No assay specified, setting assay as RNA by default.
## No assay specified, setting assay as RNA by default.

foreach::registerDoSEQ()

Plot the distribution of the celltypes on the UMAP embedding obtained on the top 1000 Highly Variable peaks.

umap_df <- data.frame(lsi_atac_feature_stability$embedding_list$HV_peaks$"1000")
umap_df$celltypes <- pbmc.atac$seurat_annotations
ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = celltypes)) +
    geom_point() +
    theme_classic()

Session info

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Bucharest
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] pbmcMultiome.SeuratData_0.1.4 Signac_1.14.0                
##  [3] pbmc3k.SeuratData_3.1.4       data.table_1.15.4            
##  [5] ggplot2_3.5.1                 ClustAssess_1.1.0            
##  [7] SeuratData_0.2.2.9002         Seurat_5.1.0                 
##  [9] SeuratObject_5.0.2            sp_2.1-4                     
## [11] Rsamtools_2.20.0              Biostrings_2.72.1            
## [13] XVector_0.44.0                GenomicRanges_1.56.1         
## [15] GenomeInfoDb_1.40.1           IRanges_2.38.1               
## [17] S4Vectors_0.42.1              BiocGenerics_0.50.0          
## [19] harmony_1.2.3                 Rcpp_1.0.12                  
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.1           later_1.3.2            
##   [4] bitops_1.0-7            tibble_3.2.1            polyclip_1.10-6        
##   [7] fastDummies_1.7.3       lifecycle_1.0.4         globals_0.16.3         
##  [10] processx_3.8.4          lattice_0.22-6          MASS_7.3-55            
##  [13] magrittr_2.0.3          plotly_4.10.4           sass_0.4.9             
##  [16] rmarkdown_2.27          jquerylib_0.1.4         yaml_2.3.9             
##  [19] remotes_2.5.0           httpuv_1.6.15           sctransform_0.4.1      
##  [22] spam_2.10-0             sessioninfo_1.2.2       pkgbuild_1.4.4         
##  [25] spatstat.sparse_3.1-0   reticulate_1.38.0       cowplot_1.1.3          
##  [28] pbapply_1.7-2           RColorBrewer_1.1-3      abind_1.4-5            
##  [31] pkgload_1.4.0           zlibbioc_1.50.0         Rtsne_0.17             
##  [34] purrr_1.0.2             rappdirs_0.3.3          GenomeInfoDbData_1.2.12
##  [37] ggrepel_0.9.5           irlba_2.3.5.1           listenv_0.9.1          
##  [40] spatstat.utils_3.0-5    goftest_1.2-3           RSpectra_0.16-1        
##  [43] spatstat.random_3.3-1   fitdistrplus_1.2-1      parallelly_1.37.1      
##  [46] pkgdown_2.1.3           RcppRoll_0.3.1          leiden_0.4.3.1         
##  [49] codetools_0.2-18        tidyselect_1.2.1        farver_2.1.2           
##  [52] UCSC.utils_1.0.0        matrixStats_1.3.0       spatstat.explore_3.3-1 
##  [55] jsonlite_1.8.8          ellipsis_0.3.2          progressr_0.14.0       
##  [58] ggridges_0.5.6          survival_3.2-13         iterators_1.0.14       
##  [61] systemfonts_1.1.0       foreach_1.5.2           progress_1.2.3         
##  [64] tools_4.4.1             ragg_1.4.0              ica_1.0-3              
##  [67] glue_1.7.0              gridExtra_2.3           xfun_0.50              
##  [70] usethis_2.2.3           dplyr_1.1.4             withr_3.0.0            
##  [73] fastmap_1.2.0           callr_3.7.6             digest_0.6.36          
##  [76] R6_2.5.1                mime_0.12               textshaping_0.4.0      
##  [79] colorspace_2.1-0        scattermore_1.2         tensor_1.5             
##  [82] spatstat.data_3.1-2     RhpcBLASctl_0.23-42     tidyr_1.3.1            
##  [85] generics_0.1.3          prettyunits_1.2.0       httr_1.4.7             
##  [88] htmlwidgets_1.6.4       uwot_0.2.2              pkgconfig_2.0.3        
##  [91] gtable_0.3.5            lmtest_0.9-40           htmltools_0.5.8.1      
##  [94] profvis_0.3.8           dotCall64_1.1-1         scales_1.3.0           
##  [97] png_0.1-8               spatstat.univar_3.0-0   knitr_1.49             
## [100] reshape2_1.4.4          nlme_3.1-155            curl_5.2.1             
## [103] cachem_1.1.0            zoo_1.8-12              stringr_1.5.1          
## [106] KernSmooth_2.23-20      parallel_4.4.1          miniUI_0.1.1.1         
## [109] desc_1.4.3              pillar_1.10.1           grid_4.4.1             
## [112] vctrs_0.6.5             RANN_2.6.1              urlchecker_1.0.1       
## [115] promises_1.3.0          xtable_1.8-4            cluster_2.1.2          
## [118] evaluate_0.24.0         cli_3.6.3               compiler_4.4.1         
## [121] rlang_1.1.4             crayon_1.5.3            future.apply_1.11.2    
## [124] labeling_0.4.3          ps_1.7.7                plyr_1.8.9             
## [127] fs_1.6.4                stringi_1.8.4           viridisLite_0.4.2      
## [130] deldir_2.0-4            BiocParallel_1.38.0     munsell_0.5.1          
## [133] lazyeval_0.2.2          devtools_2.4.5          spatstat.geom_3.3-2    
## [136] SharedObject_1.19.1     Matrix_1.7-0            RcppHNSW_0.6.0         
## [139] hms_1.1.3               patchwork_1.2.0         future_1.33.2          
## [142] shiny_1.8.1.1           ROCR_1.0-11             igraph_2.0.3           
## [145] memoise_2.0.1           bslib_0.7.0             fastmatch_1.1-6