Skip to contents
is_e1071 <- require("e1071", quietly = TRUE)
if (!is_e1071) {
    install.packages("e1071", repos = "https://cloud.r-project.org")
    library(e1071)
}

is_dbscan <- require("dbscan", quietly = TRUE)
#> 
#> Attaching package: 'dbscan'
#> The following object is masked from 'package:stats':
#> 
#>     as.dendrogram
if (!is_dbscan) {
    install.packages("dbscan", repos = "https://cloud.r-project.org")
    library(dbscan)
}

is_dendextend <- require("dendextend", quietly = TRUE)
#> 
#> ---------------------
#> Welcome to dendextend version 1.18.0
#> Type citation('dendextend') for how to cite the package.
#> 
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#> 
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags: 
#>   https://stackoverflow.com/questions/tagged/dendextend
#> 
#>  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#> 
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:stats':
#> 
#>     cutree
if (!is_dendextend) {
    install.packages("dendextend", repos = "https://cloud.r-project.org")
    library(dendextend)
}

install.packages("harmony", repos = "https://cloud.r-project.org")
#> Installing package into '/tmp/Rtmp6XVZo7/temp_libpath5f6452a966ffe'
#> (as 'lib' is unspecified)

In this vignette we will illustrate how element-centric similarity can be used to compare different kinds of clustering results: flat disjoint clusterings, flat overlapping clusterings, and hierarchical clusterings.

library(ClustAssess)
library(ggplot2)
#> 
#> Attaching package: 'ggplot2'
#> The following object is masked from 'package:base':
#> 
#>     is.element
suppressPackageStartupMessages(library(dendextend))
theme_set(theme_classic())

# we will use the Iris dataset for this vignette
data <- as.matrix(iris[, 1:4])
df.iris <- as.data.frame(prcomp(data)$x)
df.iris$species <- iris$Species
ggplot(df.iris, aes(x = PC1, y = PC2, color = species)) +
    geom_point() +
    labs(title = "Iris PCA")

Next, we cluster the data using three different approaches.

# a flat, disjoint clustering with DBscan
db.res <- dbscan(data, eps = 1)$cluster
df.iris$db.cluster <- as.factor(db.res)
ggplot(df.iris, aes(x = PC1, y = PC2, color = db.cluster)) +
    geom_point() +
    labs(title = "DBScan clustering")


# an overlapping clustering with soft k-means
cmeans.res <- cmeans(data, centers = 6)$membership
# get the strongest cluster assignment for each observation to plot
# but we will still use the soft cluster assignments to calculate ECS
df.iris$cmeans.cluster <- as.factor(apply(cmeans.res, 1, which.max))
ggplot(df.iris, aes(x = PC1, y = PC2, color = cmeans.cluster)) +
    geom_point() +
    labs(title = "c-means clustering")


# a hierarchical clustering
distances <- dist(data, method = "euclidean")
hc.res <- hclust(distances, method = "complete")
# plot the resulting dendrogram
node.colors <- c("blue", "red", "green")
hc.res %>%
    as.dendrogram() %>%
    set("leaves_pch", 19) %>%
    set("leaves_cex", 0.5) %>%
    set("leaves_col", node.colors[df.iris$species]) %>%
    plot(main = "Complete linkage hierarchical clustering", leaflab = "none")

Now, we will compare the clustering results using element-centric similarity. ECS allows us to compare different kinds of clustering results, including the overlapping clustering and the hierarchical clustering we just calculated. The results tell us how similarly each observation was clustered by the two methods; the higher the ECS, the more similar the clustering of that observation.

# which observations are clustered more similarly?
# first compare flat disjoint and flat soft clusterings
df.iris$dbscan.cmeans.ecs <- element_sim_elscore(
    db.res,
    cmeans.res
)
ggplot(df.iris, aes(x = PC1, y = PC2, color = dbscan.cmeans.ecs)) +
    geom_point() +
    labs(title = "DBScan vs c-means similarity") +
    scale_colour_viridis_c()

mean(df.iris$dbscan.cmeans.ecs)
#> [1] 0.4675256

# next compare flat disjoint and hierarchical disjoint clusterings
df.iris$dbscan.hc.ecs <- element_sim_elscore(
    db.res,
    hc.res
)
ggplot(df.iris, aes(x = PC1, y = PC2, color = dbscan.hc.ecs)) +
    geom_point() +
    labs(title = "DBScan vs hclust similarity") +
    scale_colour_viridis_c()

mean(df.iris$dbscan.hc.ecs)
#> [1] 0.5875193

# finally compare flat soft and hierarchical disjoint clusterings
df.iris$cmeans.hc.ecs <- element_sim_elscore(
    cmeans.res,
    hc.res
)
ggplot(df.iris, aes(x = PC1, y = PC2, color = cmeans.hc.ecs)) +
    geom_point() +
    labs(title = "c-means vs hclust similarity") +
    scale_colour_viridis_c()

mean(df.iris$cmeans.hc.ecs)
#> [1] 0.614915

Session info

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: Europe/Bucharest
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] ggplot2_3.5.1.9000 ClustAssess_1.0.0  dendextend_1.18.0  dbscan_1.2-0      
#> [5] e1071_1.7-14      
#> 
#> loaded via a namespace (and not attached):
#>  [1] viridis_0.6.5     sass_0.4.9        utf8_1.2.4        generics_0.1.3   
#>  [5] class_7.3-22      lattice_0.22-5    digest_0.6.35     magrittr_2.0.3   
#>  [9] evaluate_1.0.0    grid_4.4.0        iterators_1.0.14  fastmap_1.2.0    
#> [13] foreach_1.5.2     jsonlite_1.8.8    Matrix_1.7-0      gridExtra_2.3    
#> [17] fansi_1.0.6       viridisLite_0.4.2 scales_1.3.0      codetools_0.2-19 
#> [21] textshaping_0.4.0 jquerylib_0.1.4   cli_3.6.3         rlang_1.1.4      
#> [25] munsell_0.5.1     withr_3.0.1       cachem_1.1.0      yaml_2.3.8       
#> [29] tools_4.4.0       dplyr_1.1.4       colorspace_2.1-1  vctrs_0.6.5      
#> [33] R6_2.5.1          proxy_0.4-27      lifecycle_1.0.4   fs_1.6.4         
#> [37] htmlwidgets_1.6.4 ragg_1.3.3        pkgconfig_2.0.3   desc_1.4.3       
#> [41] pkgdown_2.1.1     pillar_1.9.0      bslib_0.7.0       gtable_0.3.5     
#> [45] glue_1.7.0        Rcpp_1.0.13       systemfonts_1.1.0 highr_0.11       
#> [49] xfun_0.44         tibble_3.2.1      tidyselect_1.2.1  knitr_1.47       
#> [53] farver_2.1.2      igraph_2.0.3      htmltools_0.5.8.1 labeling_0.4.3   
#> [57] rmarkdown_2.27    compiler_4.4.0