# ClustAssessPy
`ClustAssessPy/__init__.py`:

```python
from ClustAssessPy.basic_functions import element_sim_elscore, element_sim, element_sim_matrix, element_agreement, element_consistency, weighted_element_consistency
from ClustAssessPy.dim_reduction_assessment import assess_feature_stability
from ClustAssessPy.graph_clustering_assessment import assess_clustering_stability
from ClustAssessPy.graph_construction_assessment import assess_nn_stability, get_adjacency_matrix_wrapper
from ClustAssessPy.plotting_functions import plot_clustering_overall_stability, plot_feature_overall_stability_boxplot, plot_feature_overall_stability_incremental, plot_k_n_partitions, plot_k_resolution_corresp, plot_n_neigh_ecs

__all__ = [
    'element_sim_elscore',
    'element_sim',
    'element_sim_matrix',
    'element_agreement',
    'element_consistency',
    'weighted_element_consistency',
    'assess_feature_stability',
    'assess_clustering_stability',
    'assess_nn_stability',
    'plot_clustering_overall_stability',
    'plot_feature_overall_stability_boxplot',
    'plot_feature_overall_stability_incremental',
    'plot_k_n_partitions',
    'plot_k_resolution_corresp',
    'plot_n_neigh_ecs'
]
```
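Everything in `__all__` is re-exported at package level, so a typical session needs only the top-level import; a minimal sketch:

```python
import numpy as np
import ClustAssessPy as ca

# Any name listed in __all__ above is available directly on the package.
ecs = ca.element_sim(np.array([1, 1, 2]), np.array([1, 2, 2]))  # 0.5 for this pair
```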
`ClustAssessPy/basic_functions.py`:

```python
import numpy as np
from scipy.sparse import coo_matrix


def element_sim_elscore(clustering1, clustering2):
    """
    Calculates the element-wise element-centric similarity (ECS) between two clustering results.

    ### Args:
        clustering1 (array-like): The first clustering partition, where each element represents a cluster assignment.
        clustering2 (array-like): The second clustering partition to compare against 'clustering1'.

    ### Returns:
        ndarray: A NumPy array of ECS values for each element in the clustering partitions. The length of this array is equal to the number of elements in the clustering partitions.

    ### Raises:
        ValueError: If 'clustering1' and 'clustering2' are not of the same length.

    ### Example:
        >>> import numpy as np
        >>> clustering1 = np.array([1, 1, 2, 2, 3])
        >>> clustering2 = np.array([1, 2, 2, 3, 3])
        >>> element_sim_elscore(clustering1, clustering2)
    """
    clustering1 = np.asarray(clustering1)
    clustering2 = np.asarray(clustering2)

    if clustering1.shape != clustering2.shape:
        raise ValueError("clustering1 and clustering2 must have the same length")

    # Map cluster labels to indices starting from 0
    labels_a, inverse_a = np.unique(clustering1, return_inverse=True)
    labels_b, inverse_b = np.unique(clustering2, return_inverse=True)
    n_labels_a = labels_a.size
    n_labels_b = labels_b.size

    # Compute the contingency table C
    data = np.ones(clustering1.size)
    C = coo_matrix((data, (inverse_a, inverse_b)), shape=(n_labels_a, n_labels_b)).toarray()

    len_A_eq_ca = C.sum(axis=1)  # Number of elements in each cluster of clustering1
    len_B_eq_cb = C.sum(axis=0)  # Number of elements in each cluster of clustering2

    # Compute the inverse cluster sizes, handling zero divisions
    len_A_eq_ca_inv = np.divide(1.0, len_A_eq_ca, where=len_A_eq_ca != 0)
    len_B_eq_cb_inv = np.divide(1.0, len_B_eq_cb, where=len_B_eq_cb != 0)

    # Compute the absolute difference of inverse cluster sizes
    delta_inv = np.abs(len_A_eq_ca_inv[:, None] - len_B_eq_cb_inv[None, :])

    # Compute the terms used in the ECS calculation
    term1 = C * delta_inv
    term2 = (len_A_eq_ca[:, None] - C) * len_A_eq_ca_inv[:, None]
    term3 = (len_B_eq_cb[None, :] - C) * len_B_eq_cb_inv[None, :]

    # Compute the ECS score matrix
    score_matrix = 1 - 0.5 * (term1 + term2 + term3)
    score_matrix = np.clip(score_matrix, 0, 1)

    # Assign ECS values to each element based on their cluster assignments
    ecs_element_values = score_matrix[inverse_a, inverse_b]

    return ecs_element_values
```
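Written out, the per-element score computed above has a closed form. For an element $v$ whose cluster in `clustering1` is $A$ and whose cluster in `clustering2` is $B$, the entry of `score_matrix` used for $v$ is exactly what `term1`, `term2`, and `term3` assemble:

$$
\mathrm{ECS}(v) = 1 - \frac{1}{2}\left[\,|A \cap B|\,\left|\frac{1}{|A|} - \frac{1}{|B|}\right| + \frac{|A| - |A \cap B|}{|A|} + \frac{|B| - |A \cap B|}{|B|}\right]
$$

The score is 1 when $A$ and $B$ contain exactly the same elements and decreases towards 0 as the two clusters diverge.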
```python
def element_sim(clustering1, clustering2):
    """
    Calculate the average element-centric similarity between two clustering results.

    ### Args:
        clustering1 (array-like): The first clustering partition, where each element represents a cluster assignment.
        clustering2 (array-like): The second clustering partition to compare against 'clustering1'.

    ### Returns:
        float: The average ECS value for all elements across the two clustering partitions.

    ### Raises:
        ValueError: If 'clustering1' and 'clustering2' are not of the same length or if they are not appropriate vector types (i.e., NumPy arrays, lists, or tuples).

    ### Example:
        >>> import numpy as np
        >>> clustering1 = np.array([1, 1, 2, 2, 3])
        >>> clustering2 = np.array([1, 2, 2, 3, 3])
        >>> element_sim(clustering1, clustering2)
    """
    return np.mean(element_sim_elscore(clustering1, clustering2))
```
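As a quick sanity check, the docstring example can be worked out by hand from the closed form above: every element lands in a pair of clusters that overlap in exactly one member, so each per-element score comes out to 0.5.

```python
import numpy as np
import ClustAssessPy as ca

clustering1 = np.array([1, 1, 2, 2, 3])
clustering2 = np.array([1, 2, 2, 3, 3])

# Each element's pair of clusters shares exactly one member, so every
# per-element ECS works out to 0.5, and so does their mean.
print(ca.element_sim_elscore(clustering1, clustering2))  # [0.5 0.5 0.5 0.5 0.5]
print(ca.element_sim(clustering1, clustering2))          # 0.5
```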
```python
def element_sim_matrix(clustering_list):
    """
    Compare a set of clusterings by calculating their pairwise average element-centric clustering similarities.

    Optimized to avoid redundant computations and utilize vectorized operations.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays). Each clustering partition should have the same length, with each element representing a cluster assignment.

    ### Returns:
        ndarray: A NumPy matrix containing the pairwise ECS values.

    ### Raises:
        ValueError: If elements of the 'clustering_list' are not of the same length or if they are not appropriate array-like structures.

    ### Example:
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> element_sim_matrix(clustering_list)
    """
    # Precompute data for all clusterings
    clustering_data = _precompute_clustering_data(clustering_list)
    num_clusterings = len(clustering_data)
    sim_matrix = np.ones((num_clusterings, num_clusterings))  # Diagonal elements are 1

    # Generate upper triangle indices (excluding diagonal)
    indices_i, indices_j = np.triu_indices(num_clusterings, k=1)

    # Compute pairwise similarities
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        sim_score = _element_sim_with_precomputed(clustering_data[i], clustering_data[j])
        sim_matrix[i, j] = sim_score
        sim_matrix[j, i] = sim_score  # Symmetric matrix

    return sim_matrix
```
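`_precompute_clustering_data` and `_element_sim_with_precomputed` are module-private helpers (not shown in this listing) that cache the label/inverse arrays so each clustering is decomposed only once. A worked example, with the expected matrix derived by hand from the closed form above; note that the first and third clusterings group the elements identically (only the labels differ), so ECS treats them as a perfect match:

```python
import numpy as np
import ClustAssessPy as ca

clusterings = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
print(ca.element_sim_matrix(clusterings))
# Expected (up to print formatting):
# [[1.  0.5 1. ]
#  [0.5 1.  0.5]
#  [1.  0.5 1. ]]
```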
```python
def element_agreement(reference_clustering, clustering_list):
    """
    Inspect how consistently a set of clusterings agrees with a reference clustering by calculating their element-wise average agreement.

    Optimized using precomputed data.

    ### Args:
        reference_clustering (array-like): The reference clustering that each clustering in clustering_list is compared to.
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays).

    ### Returns:
        ndarray: A vector containing the element-wise average agreement.

    ### Raises:
        ValueError: If any of the clusterings in the clustering_list are not of the same length as reference_clustering or if they are not appropriate array-like structures.

    ### Example:
        >>> reference_clustering = np.array([1, 1, 2])
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> element_agreement(reference_clustering, clustering_list)
    """
    reference_clustering = np.asarray(reference_clustering)
    num_elements = len(reference_clustering)
    num_clusterings = len(clustering_list)

    # Precompute data for the reference clustering
    labels_ref, inverse_ref = np.unique(reference_clustering, return_inverse=True)
    data_ref = {
        'clustering': reference_clustering,
        'labels': labels_ref,
        'inverse': inverse_ref
    }

    # Initialize the ECS accumulator
    ecs_element_values = np.zeros(num_elements)

    # Compute ECS values for each clustering
    for clustering in clustering_list:
        clustering = np.asarray(clustering)
        labels, inverse = np.unique(clustering, return_inverse=True)
        data = {
            'clustering': clustering,
            'labels': labels,
            'inverse': inverse
        }
        ecs_values = _element_sim_elscore_with_precomputed(data_ref, data)
        ecs_element_values += ecs_values

    # Average the ECS values over the clusterings
    ecs_element_values /= num_clusterings

    return ecs_element_values
```
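Continuing the same toy data: the first and third clusterings match the reference exactly (up to label names) and the second scores 0.5 on every element, so each element's average agreement is $(1 + 0.5 + 1)/3 \approx 0.833$.

```python
import numpy as np
import ClustAssessPy as ca

reference = np.array([1, 1, 2])
clusterings = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]

print(ca.element_agreement(reference, clusterings))  # approx [0.833 0.833 0.833]
```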
```python
def element_consistency(clustering_list):
    """
    Inspects the consistency of a set of clusterings by calculating their element-wise clustering consistency.

    Optimized version using precomputed data and efficient looping.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays). Each clustering partition should have the same length, with each element representing a cluster assignment.

    ### Returns:
        ndarray: A NumPy array of average consistency scores for each element in the clustering partitions.
        The length of this array is equal to the number of elements in the individual clustering partitions.

    ### Raises:
        ValueError: If elements of the 'clustering_list' are not of the same length or if they are not appropriate array-like structures.

    ### Example:
        >>> import numpy as np
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> ecs_values = element_consistency(clustering_list)
        >>> print(ecs_values)
    """
    # Precompute data for each clustering
    clustering_data = _precompute_clustering_data(clustering_list)
    num_clusterings = len(clustering_data)
    num_elements = len(clustering_data[0]['clustering'])

    # Check that all clusterings have the same length
    if not all(len(data['clustering']) == num_elements for data in clustering_data):
        raise ValueError("All clusterings must have the same number of elements")

    ecs_element_values = np.zeros(num_elements)
    num_pairs = num_clusterings * (num_clusterings - 1) // 2

    # Generate all unique pairs of indices
    indices_i, indices_j = np.triu_indices(num_clusterings, k=1)

    # Loop over pairs using precomputed data
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        ecs_element_values += _element_sim_elscore_with_precomputed(
            clustering_data[i], clustering_data[j]
        )

    # Compute the average ECS values
    ecs_element_values /= num_pairs

    return ecs_element_values
```
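Concretely, for partitions $P_1, \dots, P_N$ the loop over `np.triu_indices` averages the ECS over all unordered pairs, so the consistency (ECC) of element $v$ is

$$
\mathrm{ECC}(v) = \binom{N}{2}^{-1} \sum_{i < j} \mathrm{ECS}_{P_i, P_j}(v)
$$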
```python
def weighted_element_consistency(clustering_list, weights=None, calculate_sim_matrix=False):
    """
    Calculate the weighted element-wise consistency of a set of clusterings.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition.
        weights (list of float, optional): A list of weights corresponding to each clustering in 'clustering_list'.
        calculate_sim_matrix (bool, optional): Whether to return the similarity matrix.

    ### Returns:
        ndarray: An array of per-element weighted consistency scores.
        (optional) ndarray: Similarity matrix if calculate_sim_matrix is True.

    ### Example:
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> weighted_ecc = weighted_element_consistency(clustering_list, weights=[1, 0.5, 0.5])
        >>> print(weighted_ecc)
    """
    n_clusterings = len(clustering_list)
    num_elements = len(clustering_list[0])

    # Precompute data for all clusterings
    clustering_data = _precompute_clustering_data(clustering_list)

    if weights is None:
        weights = np.ones(n_clusterings)
    else:
        weights = np.asarray(weights)
        if weights.shape[0] != n_clusterings:
            raise ValueError("Length of 'weights' must match the number of clusterings.")

    total_weights = np.sum(weights)
    num_pairs = total_weights * (total_weights - 1) / 2

    # Initialize consistency accumulator as an array
    consistency = np.zeros(num_elements)

    if calculate_sim_matrix:
        sim_matrix = np.zeros((n_clusterings, n_clusterings))  # Initialize with zeros

    # Generate all unique pairs of indices
    indices_i, indices_j = np.triu_indices(n_clusterings, k=1)

    # Compute pairwise ECS values and accumulate weighted consistency
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        weight_pair = weights[i] * weights[j]

        ecs_values = _element_sim_elscore_with_precomputed(clustering_data[i], clustering_data[j])

        consistency += ecs_values * weight_pair

        if calculate_sim_matrix:
            mean_ecs = np.mean(ecs_values)
            sim_matrix[i, j] = mean_ecs  # Only assign upper triangle

    # Add self-consistency contributions (element-wise)
    for i in range(n_clusterings):
        weight = weights[i]
        # The ECS of a clustering with itself is 1 for every element, and
        # weights[i] * (weights[i] - 1) / 2 is the weight of self-consistency,
        # so we multiply it by 1 (the ECS value) for each element.
        consistency += (weight * (weight - 1) / 2) * np.ones(num_elements)

    # Normalize consistency
    normalized_consistency = consistency / num_pairs

    if calculate_sim_matrix:
        return normalized_consistency, sim_matrix  # sim_matrix remains upper triangular with zeros elsewhere

    return normalized_consistency
```
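With $W = \sum_i w_i$, the accumulation above implements

$$
\mathrm{WECC}(v) = \frac{\sum_{i<j} w_i w_j \,\mathrm{ECS}_{P_i,P_j}(v) + \sum_i \frac{w_i (w_i - 1)}{2}}{\frac{W(W-1)}{2}}
$$

where the second sum in the numerator is the self-consistency contribution (a clustering always has ECS 1 with itself). With unit weights the self term vanishes and the expression reduces to the unweighted `element_consistency`.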
`ClustAssessPy/dim_reduction_assessment.py`:

```python
import concurrent.futures
import warnings

# Private helpers (_validate_parameters_feature_stability,
# _perform_feature_stability_step, assign_to_nested_dict) are defined
# elsewhere in the module and omitted from this listing.


def assess_feature_stability(data_matrix, feature_set, steps, feature_type,
                             resolution, ncores, n_repetitions=100, seed_sequence=None,
                             graph_reduction_type="PCA", npcs=30, ecs_thresh=1,
                             algorithm='leiden', umap_arguments=None, show_warnings=False,
                             parallel_steps=True, pca_postprocessing=None, flavor="default"):
    """
    Assess the stability of configurations of feature types and sizes, by evaluating the stability of clusterings obtained on incremental subsets of a given feature set.

    The algorithm assumes that the feature_set is already sorted when performing the subsetting. For example, to analyse a highly variable feature set, provide the features sorted by their variability.

    ### Args:
        data_matrix (pandas.DataFrame): A data matrix with rows as cells and columns as features.
        feature_set (list): A list of feature names that can be found among the column names of the data matrix.
        steps (list): A list of integers containing the sizes of the subsets.
        feature_type (str): A name associated with the feature_set (e.g. HV - for highly variable genes).
        resolution (list): A list of resolutions to be used for clustering.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        graph_reduction_type (str): The graph reduction type, denoting if the graph should be built on either the PCA or the UMAP embedding.
        npcs (int): The number of principal components.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        algorithm (str): The community detection algorithm to be used: 'louvain' or 'leiden'. Default and recommended is 'leiden'.
        umap_arguments (dict): A dictionary containing the arguments to be passed to the UMAP function.
        show_warnings (bool): Whether to print warnings or not.
        parallel_steps (bool): Whether to process the steps in parallel.
        pca_postprocessing (callable): An optional function applied to the PCA embedding before graph construction.
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.

    ### Raises:
        ValueError: If data_matrix is not a pandas DataFrame.
        ValueError: If feature_set is not a list or tuple of strings, or contains elements not in data_matrix.
        ValueError: If any feature in feature_set is not found in data_matrix columns.
        ValueError: If steps is not an integer or a list/tuple of integers.
        ValueError: If feature_type is not a string.
        ValueError: If resolution is not a list or tuple of floats.
        ValueError: If n_repetitions is not an integer.
        ValueError: If graph_reduction_type is not 'PCA' or 'UMAP'.
        ValueError: If npcs is not an integer.
        ValueError: If ecs_thresh is not a numeric value.
        ValueError: If algorithm is not 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - pca_list (dict): PCA embeddings for each step.
            - embedding_list (dict): UMAP embeddings for each step.
            - steps_ecc_list (dict): Contains the element consistency scores for each step and resolution, with each entry as follows:
                - ecc: The element consistency scores for the corresponding step.
                - most_frequent_partition (dict): Information about the partition with the highest frequency:
                    - mb (list): The partition with the highest frequency.
                    - freq (int): The frequency of the most frequent partition.
                    - seed (int): The seed used for the most frequent partition.
                - n_different_partitions (int): The number of different partitions obtained for the corresponding step.
            - incremental_ecs_list (dict): Incremental element consistency scores for each step and resolution.

    ### Example:
        >>> import pandas as pd
        >>> import ClustAssessPy as ca
        >>> import numpy as np
        >>> # Generate a random data matrix
        >>> data_matrix = pd.DataFrame(np.random.rand(1000, 1000))
        >>> data_matrix.columns = [f'gene_{i}' for i in range(1000)]
        >>> # Define the feature set
        >>> feature_set = data_matrix.columns.tolist()
        >>> # Define the steps
        >>> steps = [100, 500, 1000]
        >>> # Define the resolutions
        >>> resolution = np.arange(0.1, 1.1, 0.3)
        >>> # Assess the feature stability
        >>> feature_stability_results = ca.assess_feature_stability(data_matrix, feature_set, steps, 'test', resolution, ncores=1, n_repetitions=10, graph_reduction_type="PCA", npcs=30, algorithm='leiden')
    """
    warnings.warn("Please ensure data matrix has rows as cells and columns as features.")

    if not show_warnings:
        if algorithm == "leiden":
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Parameter checks and initial processing
    _validate_parameters_feature_stability(data_matrix, feature_set, steps, feature_type, resolution, n_repetitions, graph_reduction_type, npcs, ecs_thresh, algorithm, flavor, ncores)

    ncells = data_matrix.shape[0]

    pca_list, embedding_list, partitions_list, steps_ecc_list, incremental_ecs_list = {}, {}, {}, {}, {}

    if seed_sequence is None:
        seed_sequence = range(1, n_repetitions * 100, 100)

    if parallel_steps:
        # Run steps in parallel
        with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
            futures = {}
            for step in steps:
                # Map each step to its corresponding future.
                future = executor.submit(_perform_feature_stability_step, data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)
                futures[future] = step

            # Process the results as they complete
            for future in concurrent.futures.as_completed(futures):
                step = futures[future]
                result = future.result()

                if graph_reduction_type == "UMAP":
                    pca_embeddings, umap_embeddings, temp_list = result
                    assign_to_nested_dict(embedding_list, feature_type, step, umap_embeddings)
                else:
                    pca_embeddings, temp_list = result

                # Store pca_embeddings and steps_ecc (temp_list)
                assign_to_nested_dict(pca_list, feature_type, str(step), pca_embeddings)
                assign_to_nested_dict(steps_ecc_list, feature_type, str(step), temp_list)
    else:
        for step in steps:
            if graph_reduction_type == "UMAP":
                pca_embeddings, umap_embeddings, temp_list = _perform_feature_stability_step(data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)
                assign_to_nested_dict(embedding_list, feature_type, step, umap_embeddings)
            else:
                pca_embeddings, temp_list = _perform_feature_stability_step(data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)

            # Store pca_embeddings and steps_ecc (temp_list)
            assign_to_nested_dict(pca_list, feature_type, str(step), pca_embeddings)
            assign_to_nested_dict(steps_ecc_list, feature_type, str(step), temp_list)

    # Compute the incremental_ecs_list
    incremental_ecs_list = {feature_type: {}}
    if len(steps) > 1:
        for i in range(1, len(steps)):
            temp_list = {}
            for res in map(str, resolution):
                temp_list[res] = element_sim_elscore(
                    steps_ecc_list[feature_type][str(steps[i - 1])][res]['most_frequent_partition']['mb'],
                    steps_ecc_list[feature_type][str(steps[i])][res]['most_frequent_partition']['mb']
                )

            incremental_ecs_list[feature_type][f'{steps[i - 1]}-{steps[i]}'] = temp_list

    return {
        'pca_list': pca_list,
        'embedding_list': embedding_list,
        'steps_ecc_list': steps_ecc_list,
        'incremental_ecs_list': incremental_ecs_list
    }
```
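The nested result dictionaries are keyed by feature type, then by step (stringified), then by resolution (stringified). A minimal sketch of pulling values out, assuming `feature_stability_results` from the docstring example above; the specific keys (`'test'`, `'500'`, `'0.1'`, `'100-500'`) are illustrative:

```python
# Per-step results for feature type 'test', subset size 500, resolution 0.1.
step_entry = feature_stability_results['steps_ecc_list']['test']['500']['0.1']
print(step_entry['ecc'])                              # element consistency scores
print(step_entry['most_frequent_partition']['freq'])  # frequency of the dominant partition
print(step_entry['n_different_partitions'])

# Incremental ECS between consecutive steps is keyed as '<previous>-<next>'.
print(feature_stability_results['incremental_ecs_list']['test']['100-500']['0.1'])
```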
`ClustAssessPy/graph_clustering_assessment.py`:

```python
import concurrent.futures

import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix

# Private helpers (_validate_parameters_clustering_stability,
# _evaluate_clustering_resolution, merge_resolutions) are defined elsewhere
# in the module and omitted from this listing.


def assess_clustering_stability(graph_adjacency_matrix, resolution, ncores,
                                n_repetitions=100,
                                seed_sequence=None, ecs_thresh=1,
                                algorithms=['louvain', 'leiden'],
                                flavor="default", show_warnings=False):
    '''
    Evaluates the stability of different graph clustering methods in the clustering pipeline.
    The method will iterate through different values of the resolution parameter and compare, using the EC consistency score, the partitions obtained at different seeds.

    ### Args:
        graph_adjacency_matrix (scipy.sparse.csr_matrix or array-like): The adjacency matrix of the cell graph (e.g. an SNN or connectivities matrix); converted to csr_matrix internally if needed.
        resolution (list): A list of resolutions to be used for clustering.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        algorithms (str array): The community detection algorithms to be used: ['louvain'], ['leiden'], or both ['louvain', 'leiden'].
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.
        show_warnings (bool): Whether to print warnings or not.

    ### Raises:
        ValueError: If resolution is not a list or tuple of floats.
        ValueError: If n_repetitions is not an integer.
        ValueError: If ecs_thresh is not a numeric value.
        ValueError: If algorithm is not 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - split_by_resolution (dict): Stability results when evaluating the clustering stability by resolution.
            - split_by_k (dict): Stability results when evaluating the clustering stability by the number of clusters (k).

    ### Example:
        >>> import numpy as np
        >>> import pandas as pd
        >>> import ClustAssessPy as ca
        >>> # Create a PCA embedding.
        >>> pca_emb = np.random.uniform(low=0.0, high=1.0, size=(100, 30))
        >>> pca_emb_df = pd.DataFrame(pca_emb, index=map(str, range(1, 101)), columns=["PCA_" + str(i) for i in range(1, 31)])
        >>> # Get SNN matrix from pca using 20 neighbours
        >>> adj_mat = ca.get_adjacency_matrix_wrapper(pca_emb_df, n_neighs=20)['snn']
        >>> clust_stability = ca.assess_clustering_stability(adj_mat, resolution=[0.1, 0.3, 0.5, 0.7, 0.9], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
        >>> # ------------------------------------------------
        >>> # ALTERNATIVELY WITH SCANPY - assuming you have an adata object
        >>> # ------------------------------------------------
        >>> import scanpy as sc
        >>> sc.pp.neighbors(adata, n_neighbors=20)
        >>> adj_mat = adata.obsp['connectivities']
        >>> clust_stability = ca.assess_clustering_stability(adj_mat, resolution=[0.1, 0.3, 0.5, 0.7, 0.9], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
    '''
    if not show_warnings:
        if 'leiden' in algorithms:
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Convert adjacency matrix to csr_matrix if it's not already
    if not isinstance(graph_adjacency_matrix, csr_matrix):
        graph_adjacency_matrix = csr_matrix(graph_adjacency_matrix)

    _validate_parameters_clustering_stability(resolution, n_repetitions, ecs_thresh, algorithms, flavor, ncores)

    if seed_sequence is None:
        seed_sequence = range(1, n_repetitions * 100, 100)
    else:
        if not isinstance(seed_sequence, (list, np.ndarray)):
            raise ValueError("seed_sequence parameter should be numeric")
        seed_sequence = np.array(seed_sequence).astype(int)

    result_object = {}

    adata = AnnData(graph_adjacency_matrix)

    for alg in algorithms:
        result_object[alg] = {}

        with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
            futures = {executor.submit(_evaluate_clustering_resolution, graph_adjacency_matrix, adata, alg, res, seed_sequence, ncores, ecs_thresh, flavor, show_warnings): res for res in resolution}

            for future in concurrent.futures.as_completed(futures.keys()):
                res = futures[future]
                result_object[alg][str(res)] = future.result()

    algorithm_names = list(result_object.keys())
    split_by_k = {alg_name: merge_resolutions(result_object[alg_name]) for alg_name in algorithm_names if alg_name in result_object}

    return {
        'split_by_resolution': result_object,
        'split_by_k': split_by_k
    }
```
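Both halves of the returned object feed directly into the plotting helpers shown later in this listing; a minimal usage sketch, assuming `clust_stability` from the docstring example above:

```python
import ClustAssessPy as ca

# Each helper accepts the full dictionary returned by
# assess_clustering_stability and picks out the split it needs.
ca.plot_clustering_overall_stability(clust_stability, value_type="k")
ca.plot_k_n_partitions(clust_stability)        # uses 'split_by_k'
ca.plot_k_resolution_corresp(clust_stability)  # uses 'split_by_resolution'
```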
`ClustAssessPy/graph_construction_assessment.py`:

```python
import warnings

import numpy as np

# Private helpers (_validate_parameters_stability, _assess_nn_stability_pca,
# _assess_nn_stability_umap) are defined elsewhere in the module and omitted
# from this listing.


def assess_nn_stability(embedding, n_neigh_sequence, ncores, n_repetitions=100, seed_sequence=None,
                        graph_reduction_type="PCA", ecs_thresh=1, graph_type=1, algorithm='leiden',
                        umap_arguments=None, flavor="default", show_warnings=False):
    '''
    Evaluates clustering stability when changing the values of different parameters involved in the graph building step, namely the base embedding, the graph type and the number of neighbours.

    ### Args:
        embedding (np.ndarray or pd.DataFrame): A matrix associated with a PCA embedding. Embeddings from other dimensionality reduction techniques can be used.
        n_neigh_sequence (int array): An integer array of the number of nearest neighbours.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        graph_reduction_type (str): The graph reduction type, denoting if the graph should be built on either the PCA or the UMAP embedding.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        graph_type (int): Argument indicating whether the graph should be NN (0), SNN (1) or both (2).
        algorithm (str): The community detection algorithm to be used: 'louvain' or 'leiden'. Default and recommended is 'leiden'.
        umap_arguments (dict): Arguments to be used in UMAP construction. Used when graph_reduction_type = "UMAP".
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.
        show_warnings (bool): Whether to print warnings or not.

    ### Raises:
        ValueError: If embedding is not a numpy array or pandas DataFrame.
        ValueError: If n_neigh_sequence is not a list or np.ndarray.
        ValueError: If n_repetitions is not an integer.
        ValueError: If graph_reduction_type is not 'PCA' or 'UMAP'.
        ValueError: If ecs_thresh is not an integer or a float.
        ValueError: If graph_type is not an integer or if its value is not between 0 and 2.
        ValueError: If algorithm is not 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - n_neigh_k_corresp (dict): The number of clusters obtained by running the pipeline multiple times with different seeds, numbers of neighbours and graph types (weighted vs unweighted).
            - n_neigh_ec_consistency (dict): The EC consistency of the partitions obtained at multiple runs when changing the number of neighbours or the graph type.
            - n_different_partitions (dict): The number of different partitions obtained for each number of neighbours.

    ### Example:
        >>> import numpy as np
        >>> import pandas as pd
        >>> import ClustAssessPy as ca
        >>> # Create a PCA embedding.
        >>> pca_emb = np.random.uniform(low=0.0, high=1.0, size=(100, 30))
        >>> pca_emb_df = pd.DataFrame(pca_emb, index=map(str, range(1, 101)), columns=["PCA_" + str(i) for i in range(1, 31)])
        >>> # Assess nn stability using Leiden algorithm
        >>> nn_stability_obj_leiden = ca.assess_nn_stability(pca_emb_df, n_neigh_sequence=[10, 15, 20], ncores=1, n_repetitions=10, graph_reduction_type="PCA", algorithm='leiden', graph_type=2)
    '''
    if not show_warnings:
        if algorithm == "leiden":
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Validate stability parameters.
    _validate_parameters_stability(n_neigh_sequence, n_repetitions, ecs_thresh, graph_type, embedding, algorithm, graph_reduction_type, flavor, ncores)

    # Convert the numbers of neighbours to integers and sort them.
    n_neigh_sequence = np.sort(n_neigh_sequence).astype(int)

    # Convert n_repetitions to an integer.
    n_repetitions = int(n_repetitions)

    # Drop neighbourhood sizes that are not smaller than the number of cells.
    n_cells = embedding.shape[0]
    n_neigh_sequence = n_neigh_sequence[np.where(n_neigh_sequence < n_cells)]

    if len(n_neigh_sequence) == 0:
        warnings.warn(f"The provided values for the `n_neigh_sequence` are greater than the number of cells ({n_cells}). For the downstream analysis, we will set `n_neigh` to {n_cells - 1}.")
        n_neigh_sequence = np.array([n_cells - 1])

    # Create a seed sequence if it's not provided.
    if seed_sequence is None:
        seed_sequence = np.arange(1, n_repetitions * 100, 100)
    else:
        if not isinstance(seed_sequence, (list, np.ndarray)):
            raise ValueError("seed_sequence parameter should be numeric")
        seed_sequence = np.array(seed_sequence).astype(int)

    if graph_reduction_type == "PCA":
        return _assess_nn_stability_pca(embedding, n_neigh_sequence, seed_sequence, ecs_thresh, graph_type, algorithm, flavor, ncores, show_warnings)
    else:
        return _assess_nn_stability_umap(embedding, n_neigh_sequence, seed_sequence, ecs_thresh, graph_type, algorithm, umap_arguments, flavor, ncores, show_warnings)
```
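`plot_n_neigh_ecs` is exported for visualising this result. Its source is not part of this listing, so the call below assumes it mirrors the other plotting helpers and takes the returned object directly:

```python
import ClustAssessPy as ca

# Assumed usage (signature not shown in this listing): visualise the EC
# consistency obtained for each number of neighbours and graph type.
ca.plot_n_neigh_ecs(nn_stability_obj_leiden)
```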
`ClustAssessPy/plotting_functions.py`:

```python
import numpy as np
import pandas as pd
from plotnine import (ggplot, aes, element_text, geom_boxplot, geom_hline,
                      geom_point, ggtitle, guide_legend, guides, labs,
                      position_dodge, scale_fill_cmap, scale_size_continuous,
                      scale_y_continuous, theme, theme_classic)


def plot_clustering_overall_stability(clust_object, value_type="k", summary_function=np.median):
    '''
    Display EC consistency across clustering methods.

    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        value_type (str): The type of value to be used for grouping the results. Can be either `k` or `resolution`.
        summary_function (function): The function to be used for summarizing the ECC values.

    ### Example:
        >>> pca_emb = np.random.rand(100, 30)
        >>> # Create an AnnData object from pca_emb
        >>> adata = sc.AnnData(X=pca_emb)
        >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
        >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
        >>> adjacency_matrix = adata.obsp['connectivities']
        >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
        >>> plot_clustering_overall_stability(clust_obj)
    '''
    if value_type not in ["k", "resolution"]:
        raise ValueError("`value_type` should be either `k` or `resolution`")

    # Extract the ECC values and apply the summary function
    ecc_vals = {
        method: {
            val_type: summary_function(data['ecc'])
            for val_type, data in alg.items()
        }
        for method, alg in clust_object[f"split_by_{value_type}"].items()
    }

    melted_df = pd.DataFrame([
        {'Method': method, value_type: val_type, 'ECC': ecc}
        for method, val_types in ecc_vals.items()
        for val_type, ecc in val_types.items()
    ])

    plot = (
        ggplot(melted_df, aes(x='Method', y='ECC', fill='Method')) +
        geom_boxplot() +
        theme_classic() +
        ggtitle(f"Overall clustering stability grouped by {value_type}")
    )

    return plot.draw()
```
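Each plotting helper returns the drawn figure (in plotnine, `plot.draw()` yields a matplotlib `Figure`), so the result can be saved like any other matplotlib figure. A minimal sketch, assuming `clust_obj` from the example above:

```python
fig = plot_clustering_overall_stability(clust_obj, value_type="k")
fig.savefig("overall_stability.png", dpi=300, bbox_inches="tight")
```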
```python
def plot_feature_overall_stability_boxplot(feature_objects_assessment_list):
    '''
    Display EC consistency for each feature set and for each step.

    ### Args:
        feature_objects_assessment_list (dict): The object returned by the `assess_feature_stability` function.

    ### Example:
        >>> np.random.seed(42)
        >>> expr_matrix_values = np.vstack((np.random.rand(25, 10), np.random.uniform(5, 7, (75, 10))))
        >>> expr_matrix = pd.DataFrame(expr_matrix_values)
        >>> # Set the row names and column names
        >>> expr_matrix.index = map(str, range(1, 101))
        >>> expr_matrix.columns = ["feature " + str(i) for i in range(1, 11)]
        >>> feature_stability_result = assess_feature_stability(data_matrix=expr_matrix, feature_set=expr_matrix.columns.tolist(), steps=[5, 10], feature_type='type_name', resolution=[0.3, 0.5], ncores=1, n_repetitions=10, algorithm='leiden')
        >>> plot_feature_overall_stability_boxplot(feature_stability_result)
    '''
    # If it is not a list, make it a list
    if not isinstance(feature_objects_assessment_list, list):
        feature_objects_assessment_list = [feature_objects_assessment_list]

    # Get overall stability results in a dataframe.
    data = []
    for fs_object in feature_objects_assessment_list:
        key = list(fs_object['steps_ecc_list'].keys())[0]
        feature_object = fs_object['steps_ecc_list'][key]
        for type_key, steps_dict in feature_object.items():
            for resolution, results_ecc in steps_dict.items():
                data.append({'#Features': type_key, 'ECC': np.median(results_ecc['ecc']), 'type': key})

    df = pd.DataFrame(data)

    # Sort by number of features
    df['#Features'] = df['#Features'].astype(int)
    df = df.sort_values(by='#Features')
    df['#Features'] = pd.Categorical(df['#Features'], categories=sorted(df['#Features'].unique()), ordered=True)

    plot = (
        ggplot(df, aes(x='#Features', y='ECC', fill='type'))
        + geom_boxplot(outlier_shape=None, outlier_alpha=0)
        + theme_classic()
    )

    return plot.draw()
```
```python
def plot_feature_overall_stability_incremental(feature_objects_assessment_list):
    '''
    Display the incremental ECS between consecutive feature steps.

    ### Args:
        feature_objects_assessment_list (dict): The object returned by the `assess_feature_stability` function.

    ### Example:
        >>> np.random.seed(42)
        >>> expr_matrix_values = np.vstack((np.random.rand(25, 10), np.random.uniform(5, 7, (75, 10))))
        >>> expr_matrix = pd.DataFrame(expr_matrix_values)
        >>> # Set the row names and column names
        >>> expr_matrix.index = map(str, range(1, 101))
        >>> expr_matrix.columns = ["feature " + str(i) for i in range(1, 11)]
        >>> feature_stability_result = assess_feature_stability(data_matrix=expr_matrix, feature_set=expr_matrix.columns.tolist(), steps=[5, 10], feature_type='type_name', resolution=[0.3, 0.5], ncores=1, n_repetitions=10, algorithm='leiden')
        >>> plot_feature_overall_stability_incremental(feature_stability_result)
    '''
    # If it is not a list, make it a list
    if not isinstance(feature_objects_assessment_list, list):
        feature_objects_assessment_list = [feature_objects_assessment_list]

    df = pd.DataFrame(columns=['#Features', 'ECS', 'type'])

    # Get incremental stability results in a dataframe.
    for fs_object in feature_objects_assessment_list:
        key = list(fs_object['incremental_ecs_list'].keys())[0]
        feature_object = fs_object['incremental_ecs_list'][key]
        for type_key, steps_dict in feature_object.items():
            for resolution, results_ecs in steps_dict.items():
                # Temporary df to hold the results for this resolution
                temp_df = pd.DataFrame({
                    '#Features': [type_key],
                    'ECS': [np.median(results_ecs)],
                    'type': [key]
                })
                # Append the temporary DataFrame to the main DataFrame
                df = pd.concat([df, temp_df], ignore_index=True)

    # Sort by features order (order in which they appear in the incremental_ecs list).
    features_order = df['#Features'].drop_duplicates().tolist()
    df['#Features'] = pd.Categorical(df['#Features'], categories=features_order, ordered=True)

    plot = (
        ggplot(df, aes(x='#Features', y='ECS', fill='type'))
        + geom_boxplot(outlier_shape=None, outlier_alpha=0)
        + theme_classic()
    )

    return plot.draw()
```
```python
def plot_k_n_partitions(clust_object, color_information="ecc", y_step=5, pt_size_range=(1.5, 4), dodge_width=0.3, summary_function=np.median):
    '''
    For each configuration provided in clust_object, display how many different partitions with the same number of clusters can be obtained by changing the seed.

    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        color_information (str): The information to be used for coloring the points. Can be either `ecc` or `freq_part`.
        y_step (int): The step for the y-axis.
        pt_size_range (tuple): The range of point sizes.
        dodge_width (float): The width of the dodging.
        summary_function (function): The function to be used for summarizing the ECC values.

    ### Example:
        >>> pca_emb = np.random.rand(100, 30)
        >>> # Create an AnnData object from pca_emb
        >>> adata = sc.AnnData(X=pca_emb)
        >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
        >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
        >>> adjacency_matrix = adata.obsp['connectivities']
        >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
        >>> plot_k_n_partitions(clust_obj)
    '''
    if color_information not in ["ecc", "freq_part"]:
        raise ValueError("`color_information` can be either `ecc` or `freq_part`")

    if 'split_by_k' in clust_object:
        clust_object = clust_object['split_by_k']

    data_list = []
    max_n_part = 0

    for configuration, partition_object in clust_object.items():
        for k, data in partition_object.items():
            n_partitions = len(data['partitions'])
            first_occ = max(partition['freq'] for partition in data['partitions'])
            total_occ = sum(partition['freq'] for partition in data['partitions'])
            freq_part = first_occ / total_occ
            ecc = summary_function(data['ecc'])

            data_list.append({
                'n.clusters': pd.to_numeric(k),  # convert k to numeric
                'n.partitions': n_partitions,
                'configuration': configuration,
                'first.occ': first_occ,
                'total.occ': total_occ,
                'freq_part': freq_part,
                'ecc': ecc
            })

            max_n_part = max(max_n_part, n_partitions)

    overall_total_occ = sum(item['total.occ'] for item in data_list)
    for item in data_list:
        item['frequency_k'] = item['total.occ'] / overall_total_occ

    unique_parts = pd.DataFrame(data_list)
    y_breaks = list(range(0, max_n_part + y_step, y_step))
    unique_parts['n.clusters'] = pd.Categorical(unique_parts['n.clusters'], ordered=True)

    plot = (
        ggplot(unique_parts, aes(
            x='n.clusters',
            y='n.partitions',
            shape='configuration',
            size='frequency_k',
            fill=color_information,
            group='configuration'
        )) +
        scale_y_continuous(breaks=y_breaks) +
        scale_size_continuous(range=pt_size_range, guide='legend') +
        geom_hline(yintercept=list(range(0, max_n_part + 1, y_step)), linetype="dashed", color="#e3e3e3") +
        geom_point(position=position_dodge(width=dodge_width)) +
        theme_classic() +
        labs(x="k", y="# partitions") +
        guides(shape=guide_legend(override_aes={'size': max(pt_size_range)}))
    )

    return plot.draw()
```
def plot_k_resolution_corresp(clust_object, color_information="ecc", dodge_width=0.3, pt_size_range=(1.5, 4), summary_function=np.median):
    '''
    For each configuration provided in clust_object, display which numbers of clusters appear for the different values of the resolution parameter.

    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        color_information (str): The information to be used for coloring the points. Can be either `ecc` or `freq_k`.
        dodge_width (float): The width used to dodge overlapping points.
        pt_size_range (tuple): The range of point sizes.
        summary_function (function): The function used to summarize the ECC values.

    ### Example:
        >>> pca_emb = np.random.rand(100, 30)
        >>> # Create an AnnData object from pca_emb
        >>> adata = sc.AnnData(X=pca_emb)
        >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
        >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
        >>> adjacency_matrix = adata.obsp['connectivities']
        >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], n_repetitions=10, algorithm=[1, 4])
        >>> plot_k_resolution_corresp(clust_obj)
    '''
    if color_information not in ["ecc", "freq_k"]:
        raise ValueError("color_information can be either `ecc` or `freq_k`")

    appearances_df = pd.DataFrame()

    # Iterate through the clustering results
    for method, resolutions in clust_object['split_by_resolution'].items():
        for resolution, res_object in resolutions.items():
            # Total number of runs at this resolution, used for normalization
            n_runs = sum(partition['freq'] for cluster in res_object['clusters'].values() for partition in cluster['partitions'])

            temp_data = {
                'method': [],
                'resolution_value': [],
                'number_clusters': [],
                'freq_partition': [],
                'ecc': [],
                'freq_k': []
            }

            for k, cluster_info in res_object['clusters'].items():
                # Sum of partition frequencies for this number of clusters (k)
                freq_k = sum(partition['freq'] for partition in cluster_info['partitions'])

                temp_data['method'].append(method)
                temp_data['resolution_value'].append(resolution)
                temp_data['number_clusters'].append(k)
                # Fraction of this k's runs accounted for by the first listed partition
                # (the most frequent one, assuming partitions are sorted by frequency)
                temp_data['freq_partition'].append(cluster_info['partitions'][0]['freq'] / freq_k)
                temp_data['ecc'].append(summary_function(cluster_info['ecc']))
                temp_data['freq_k'].append(freq_k)

            # Convert to a DataFrame and normalize by the number of runs
            temp_df = pd.DataFrame(temp_data)
            temp_df['freq_k'] /= n_runs
            appearances_df = pd.concat([appearances_df, temp_df], ignore_index=True)

    appearances_df['number_clusters'] = pd.to_numeric(appearances_df['number_clusters'])
    appearances_df['resolution_value'] = pd.Categorical(appearances_df['resolution_value'])

    plot = (
        ggplot(appearances_df, aes(
            y='number_clusters',
            x='factor(resolution_value)',
            size='freq_partition',
            fill=color_information,
            shape='method',
            group='method'
        )) +
        geom_hline(yintercept=appearances_df['number_clusters'].unique(), linetype="dashed", color="#e3e3e3") +
        geom_point(position=position_dodge(width=dodge_width)) +
        theme_classic() +
        scale_fill_cmap(guide="colorbar") +
        labs(x="resolution", y="k") +
        scale_size_continuous(range=pt_size_range, guide="legend") +
        theme(axis_text_x=element_text(angle=90, hjust=1)) +
        guides(shape=guide_legend(override_aes={'size': max(pt_size_range)}))
    )

    return plot.draw()
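Similarly, a sketch of the `split_by_resolution` layout consumed above (method -> resolution -> clusters), with illustrative values only; the `partitions` list is assumed to be sorted by decreasing `freq`, which is what the `[0]` access relies on:

# Hypothetical `split_by_resolution` object for a single method and resolution.
mock_clust_object = {
    'split_by_resolution': {
        'Leiden': {
            '0.3': {
                'clusters': {
                    '4': {
                        'partitions': [{'freq': 8}, {'freq': 2}],  # most frequent partition first
                        'ecc': [0.92, 0.87]
                    },
                    '5': {
                        'partitions': [{'freq': 5}],
                        'ecc': [0.81, 0.79]
                    }
                }
            }
        }
    }
}
plot_k_resolution_corresp(mock_clust_object)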
def plot_n_neigh_ecs(nn_ecs_object):
    '''
    Display, for all configurations consisting of different numbers of neighbors, graph types and base embeddings,
    the EC Consistency of the partitions obtained over multiple runs.

    ### Args:
        nn_ecs_object (dict): The object returned by the `assess_nn_stability` function.

    ### Example:
        >>> pca_emb = np.random.rand(100, 30)
        >>> nn_stability_obj = assess_nn_stability(pca_emb, n_neigh_sequence=[10, 15, 20], n_repetitions=10, algorithm=4)
        >>> plot_n_neigh_ecs(nn_stability_obj)
    '''
    data = []
    for config_name, config in nn_ecs_object['n_neigh_ec_consistency'].items():
        for n_neigh, values in config.items():
            for ecc in values:
                data.append({'ECC': ecc, 'n_neigh': n_neigh, 'config_name': config_name})
    df = pd.DataFrame(data)

    # Convert 'n_neigh' to an ordered categorical, sorted numerically rather than lexically
    df['n_neigh'] = pd.Categorical(df['n_neigh'], ordered=True)
    df['n_neigh'] = df['n_neigh'].cat.reorder_categories(sorted(df['n_neigh'].unique(), key=lambda x: float(x)))

    plot = (
        ggplot(df, aes(x='n_neigh', y='ECC', fill='config_name'))
        + geom_boxplot()
        + theme_classic()
    )

    return plot.draw()
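A sketch of the `n_neigh_ec_consistency` layout this function iterates over, with hypothetical configuration names and ECC values (the real object comes from `assess_nn_stability`):

# Hypothetical object: configuration -> number of neighbors -> per-run ECC values.
mock_nn_ecs_object = {
    'n_neigh_ec_consistency': {
        'pca_snn': {                       # hypothetical base embedding + graph type label
            '10': [0.90, 0.93, 0.91],      # one ECC value per repetition
            '15': [0.95, 0.94, 0.96],
            '20': [0.97, 0.96, 0.98]
        }
    }
}
plot_n_neigh_ecs(mock_nn_ecs_object)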