ClustAssessPy

from ClustAssessPy.basic_functions import element_sim_elscore, element_sim, element_sim_matrix, element_agreement, element_consistency, weighted_element_consistency
from ClustAssessPy.dim_reduction_assessment import assess_feature_stability
from ClustAssessPy.graph_clustering_assessment import assess_clustering_stability
from ClustAssessPy.graph_construction_assessment import assess_nn_stability, get_adjacency_matrix_wrapper
from ClustAssessPy.plotting_functions import plot_clustering_overall_stability, plot_feature_overall_stability_boxplot, plot_feature_overall_stability_incremental, plot_k_n_partitions, plot_k_resolution_corresp, plot_n_neigh_ecs

__all__ = [
    'element_sim_elscore',
    'element_sim',
    'element_sim_matrix',
    'element_agreement',
    'element_consistency',
    'weighted_element_consistency',
    'assess_feature_stability',
    'assess_clustering_stability',
    'assess_nn_stability',
    'get_adjacency_matrix_wrapper',
    'plot_clustering_overall_stability',
    'plot_feature_overall_stability_boxplot',
    'plot_feature_overall_stability_incremental',
    'plot_k_n_partitions',
    'plot_k_resolution_corresp',
    'plot_n_neigh_ecs'
]

import numpy as np
from scipy.sparse import coo_matrix

def element_sim_elscore(clustering1, clustering2):
    """
    Calculates the element-wise element-centric similarity (ECS) between two clustering results.

    ### Args:
        clustering1 (array-like): The first clustering partition, where each element represents a cluster assignment.
        clustering2 (array-like): The second clustering partition to compare against 'clustering1'.

    ### Returns:
        ndarray: A NumPy array of ECS values for each element in the clustering partitions. The length of this array is equal to the number of elements in the clustering partitions.

    ### Raises:
        ValueError: If 'clustering1' and 'clustering2' are not of the same length.

    ### Example:
    >>> clustering1 = np.array([1, 1, 2, 2, 3])
    >>> clustering2 = np.array([1, 2, 2, 3, 3])
    >>> element_sim_elscore(clustering1, clustering2)
    """
    clustering1 = np.asarray(clustering1)
    clustering2 = np.asarray(clustering2)

    if clustering1.shape != clustering2.shape:
        raise ValueError("clustering1 and clustering2 must have the same length")

    # Map cluster labels to indices starting from 0
    labels_a, inverse_a = np.unique(clustering1, return_inverse=True)
    labels_b, inverse_b = np.unique(clustering2, return_inverse=True)
    n_labels_a = labels_a.size
    n_labels_b = labels_b.size

    # Compute the contingency table C
    data = np.ones(clustering1.size)
    C = coo_matrix((data, (inverse_a, inverse_b)), shape=(n_labels_a, n_labels_b)).toarray()

    len_A_eq_ca = C.sum(axis=1)  # Number of elements in each cluster of clustering1
    len_B_eq_cb = C.sum(axis=0)  # Number of elements in each cluster of clustering2

    # Compute the inverse cluster sizes, handling zero divisions
    len_A_eq_ca_inv = np.divide(1.0, len_A_eq_ca, where=len_A_eq_ca != 0)
    len_B_eq_cb_inv = np.divide(1.0, len_B_eq_cb, where=len_B_eq_cb != 0)

    # Compute the absolute difference of inverse cluster sizes
    delta_inv = np.abs(len_A_eq_ca_inv[:, None] - len_B_eq_cb_inv[None, :])

    # Compute the terms used in the ECS calculation
    term1 = C * delta_inv
    term2 = (len_A_eq_ca[:, None] - C) * len_A_eq_ca_inv[:, None]
    term3 = (len_B_eq_cb[None, :] - C) * len_B_eq_cb_inv[None, :]

    # Compute the ECS score matrix
    score_matrix = 1 - 0.5 * (term1 + term2 + term3)
    score_matrix = np.clip(score_matrix, 0, 1)

    # Assign ECS values to each element based on their cluster assignments
    ecs_element_values = score_matrix[inverse_a, inverse_b]

    return ecs_element_values
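
A quick usage sketch based on the docstring example (the output shown is what the implementation above produces; for these two partitions every element happens to score 0.5):

>>> import numpy as np
>>> from ClustAssessPy import element_sim_elscore
>>> element_sim_elscore(np.array([1, 1, 2, 2, 3]), np.array([1, 2, 2, 3, 3]))
array([0.5, 0.5, 0.5, 0.5, 0.5])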

def element_sim(clustering1, clustering2):
    """
    Calculates the average element-centric similarity between two clustering results.

    ### Args:
        clustering1 (array-like): The first clustering partition, where each element represents a cluster assignment.
        clustering2 (array-like): The second clustering partition to compare against 'clustering1'.

    ### Returns:
        float: The average ECS value for all elements across the two clustering partitions.

    ### Raises:
        ValueError: If 'clustering1' and 'clustering2' are not of the same length or if they are not appropriate vector types (i.e., NumPy arrays, lists, or tuples).

    ### Example:
    >>> clustering1 = np.array([1, 1, 2, 2, 3])
    >>> clustering2 = np.array([1, 2, 2, 3, 3])
    >>> element_sim(clustering1, clustering2)
    """
    return np.mean(element_sim_elscore(clustering1, clustering2))
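
Since element_sim is simply the mean of the per-element scores, the running example collapses to a single number:

>>> from ClustAssessPy import element_sim
>>> print(element_sim([1, 1, 2, 2, 3], [1, 2, 2, 3, 3]))
0.5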

def element_sim_matrix(clustering_list):
    """
    Compare a set of clusterings by calculating their pairwise average element-centric clustering similarities.

    Optimized to avoid redundant computations and utilize vectorized operations.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays). Each clustering partition should have the same length, with each element representing a cluster assignment.

    ### Returns:
        ndarray: A NumPy matrix containing the pairwise ECS values.

    ### Raises:
        ValueError: If elements of the 'clustering_list' are not of the same length or if they are not appropriate array-like structures.

    ### Example:
    >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
    >>> element_sim_matrix(clustering_list)
    """
    # Precompute data for all clusterings
    clustering_data = _precompute_clustering_data(clustering_list)
    num_clusterings = len(clustering_data)
    sim_matrix = np.ones((num_clusterings, num_clusterings))  # Diagonal elements are 1

    # Generate upper triangle indices (excluding diagonal)
    indices_i, indices_j = np.triu_indices(num_clusterings, k=1)

    # Compute pairwise similarities
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        sim_score = _element_sim_with_precomputed(clustering_data[i], clustering_data[j])
        sim_matrix[i, j] = sim_score
        sim_matrix[j, i] = sim_score  # Symmetric matrix

    return sim_matrix
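
A worked example (values computed with this implementation): the first and third partitions below are identical up to a relabelling of the clusters, so their pairwise similarity is 1.

>>> import numpy as np
>>> from ClustAssessPy import element_sim_matrix
>>> element_sim_matrix([np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])])
array([[1. , 0.5, 1. ],
       [0.5, 1. , 0.5],
       [1. , 0.5, 1. ]])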

def element_agreement(reference_clustering, clustering_list):
    """
    Inspect how consistently a set of clusterings agree with a reference clustering by calculating their element-wise average agreement.

    Optimized using precomputed data.

    ### Args:
        reference_clustering (array-like): The reference clustering that each clustering in 'clustering_list' is compared against.
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays).

    ### Returns:
        ndarray: A vector containing the element-wise average agreement.

    ### Raises:
        ValueError: If any of the clusterings in 'clustering_list' are not of the same length as 'reference_clustering' or if they are not appropriate array-like structures.

    ### Example:
    >>> reference_clustering = np.array([1, 1, 2])
    >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
    >>> element_agreement(reference_clustering, clustering_list)
    """
    reference_clustering = np.asarray(reference_clustering)
    num_elements = len(reference_clustering)
    num_clusterings = len(clustering_list)

    # Precompute data for the reference clustering
    labels_ref, inverse_ref = np.unique(reference_clustering, return_inverse=True)
    data_ref = {
        'clustering': reference_clustering,
        'labels': labels_ref,
        'inverse': inverse_ref
    }

    # Initialize ECS accumulator
    ecs_element_values = np.zeros(num_elements)

    # Compute ECS values for each clustering
    for clustering in clustering_list:
        clustering = np.asarray(clustering)
        labels, inverse = np.unique(clustering, return_inverse=True)
        data = {
            'clustering': clustering,
            'labels': labels,
            'inverse': inverse
        }
        ecs_values = _element_sim_elscore_with_precomputed(data_ref, data)
        ecs_element_values += ecs_values

    # Compute average ECS values
    ecs_element_values /= num_clusterings

    return ecs_element_values
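
Continuing the docstring example: the first and third clusterings match the reference exactly (up to relabelling) and the second scores 0.5 per element, so each element averages to about 0.83 (computed with this implementation):

>>> import numpy as np
>>> from ClustAssessPy import element_agreement
>>> print(element_agreement(np.array([1, 1, 2]), [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]))
[0.83333333 0.83333333 0.83333333]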

def element_consistency(clustering_list):
    """
    Inspects the consistency of a set of clusterings by calculating their element-wise clustering consistency.

    Optimized version using precomputed data and efficient looping.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition represented as an array-like structure (e.g., lists, NumPy arrays). Each clustering partition should have the same length, with each element representing a cluster assignment.

    ### Returns:
        ndarray: A NumPy array of average consistency scores for each element in the clustering partitions.
        The length of this array is equal to the number of elements in the individual clustering partitions.

    ### Raises:
        ValueError: If elements of the 'clustering_list' are not of the same length or if they are not appropriate array-like structures.

    ### Example:
        >>> import numpy as np
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> ecs_values = element_consistency(clustering_list)
        >>> print(ecs_values)
    """
    # Precompute data for each clustering
    clustering_data = _precompute_clustering_data(clustering_list)
    num_clusterings = len(clustering_data)
    num_elements = len(clustering_data[0]['clustering'])

    # Check that all clusterings have the same length
    if not all(len(data['clustering']) == num_elements for data in clustering_data):
        raise ValueError("All clusterings must have the same number of elements")

    ecs_element_values = np.zeros(num_elements)
    num_pairs = num_clusterings * (num_clusterings - 1) // 2

    # Generate all unique pairs of indices
    indices_i, indices_j = np.triu_indices(num_clusterings, k=1)

    # Loop over pairs using precomputed data
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        ecs_element_values += _element_sim_elscore_with_precomputed(
            clustering_data[i], clustering_data[j]
        )

    # Compute the average ECS values
    ecs_element_values /= num_pairs

    return ecs_element_values
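
For the same three partitions, the three pairwise comparisons contribute per-element scores of 0.5, 1.0 and 0.5, so the average consistency is about 0.67 per element (computed with this implementation):

>>> import numpy as np
>>> from ClustAssessPy import element_consistency
>>> print(element_consistency([np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]))
[0.66666667 0.66666667 0.66666667]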

def weighted_element_consistency(clustering_list, weights=None, calculate_sim_matrix=False):
    """
    Calculate the weighted element-wise consistency of a set of clusterings.

    This function computes per-element weighted consistency scores, aligning with the previous implementation.

    ### Args:
        clustering_list (list of array-like): A list where each element is a clustering partition.
        weights (list of float, optional): A list of weights corresponding to each clustering in 'clustering_list'.
        calculate_sim_matrix (bool, optional): Whether to return the similarity matrix.

    ### Returns:
        ndarray: An array of per-element weighted consistency scores.
        (optional) ndarray: Similarity matrix if calculate_sim_matrix is True.

    ### Raises:
        ValueError: If the length of 'weights' does not match the number of clusterings.

    ### Example:
        >>> clustering_list = [np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])]
        >>> weighted_ecc = weighted_element_consistency(clustering_list, weights=[1, 0.5, 0.5])
        >>> print(weighted_ecc)
    """
    n_clusterings = len(clustering_list)
    num_elements = len(clustering_list[0])

    # Precompute data for all clusterings
    clustering_data = _precompute_clustering_data(clustering_list)

    if weights is None:
        weights = np.ones(n_clusterings)
    else:
        weights = np.asarray(weights)
        if weights.shape[0] != n_clusterings:
            raise ValueError("Length of 'weights' must match the number of clusterings.")

    total_weights = np.sum(weights)
    num_pairs = total_weights * (total_weights - 1) / 2

    # Initialize consistency accumulator as an array
    consistency = np.zeros(num_elements)

    if calculate_sim_matrix:
        sim_matrix = np.zeros((n_clusterings, n_clusterings))  # Initialize with zeros

    # Generate all unique pairs of indices
    indices_i, indices_j = np.triu_indices(n_clusterings, k=1)

    # Compute pairwise ECS values and accumulate weighted consistency
    for idx in range(len(indices_i)):
        i = indices_i[idx]
        j = indices_j[idx]
        weight_pair = weights[i] * weights[j]

        ecs_values = _element_sim_elscore_with_precomputed(clustering_data[i], clustering_data[j])

        consistency += ecs_values * weight_pair

        if calculate_sim_matrix:
            mean_ecs = np.mean(ecs_values)
            sim_matrix[i, j] = mean_ecs  # Only assign upper triangle

    # Add self-consistency contributions (element-wise)
    for i in range(n_clusterings):
        weight = weights[i]
        # Since ECS of a clustering with itself is 1 for all elements,
        # and weights[i] * (weights[i] - 1) / 2 is the weight of self-consistency,
        # we multiply it by 1 (the ECS value) for each element.
        consistency += (weight * (weight - 1) / 2) * np.ones(num_elements)

    # Normalize consistency
    normalized_consistency = consistency / num_pairs

    if calculate_sim_matrix:
        return normalized_consistency, sim_matrix  # sim_matrix remains upper triangular with zeros elsewhere

    return normalized_consistency
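
Down-weighting the last two partitions shifts the per-element score away from the unweighted 0.67 above; the value of 0.625 below follows from the pairwise and self-consistency terms in the code:

>>> import numpy as np
>>> from ClustAssessPy import weighted_element_consistency
>>> print(weighted_element_consistency([np.array([1, 1, 2]), np.array([1, 2, 2]), np.array([2, 2, 1])], weights=[1, 0.5, 0.5]))
[0.625 0.625 0.625]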

import concurrent.futures
import warnings

def assess_feature_stability(data_matrix, feature_set, steps, feature_type,
                             resolution, ncores, n_repetitions=100, seed_sequence=None,
                             graph_reduction_type="PCA", npcs=30, ecs_thresh=1,
                             algorithm='leiden', umap_arguments=None, show_warnings=False,
                             parallel_steps=True, pca_postprocessing=None, flavor="default"):
    """
    Assess the stability of configurations of feature type and size, by evaluating the stability of clusterings obtained on incremental subsets of a given feature set.

    The algorithm assumes that feature_set is already sorted when performing the subsetting. For example, to analyse a highly variable feature set, provide the features sorted by their variability.

    ### Args:
        data_matrix (pandas.DataFrame): A data matrix with rows as cells and columns as features.
        feature_set (list): A list of feature names that can be found among the column names of the data matrix.
        steps (list): A list of integers containing the sizes of the subsets.
        feature_type (str): A name associated with the feature_set (e.g. HV for highly variable genes).
        resolution (list): A list of resolutions to be used for clustering.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        graph_reduction_type (str): The graph reduction type, denoting if the graph should be built on either the PCA or the UMAP embedding.
        npcs (int): The number of principal components.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        algorithm (str): The community detection algorithm to be used: 'louvain' or 'leiden'. Default and recommended is 'leiden'.
        umap_arguments (dict): A dictionary containing the arguments to be passed to the UMAP function.
        show_warnings (bool): Whether to print warnings or not.
        parallel_steps (bool): Whether to process the steps in parallel.
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.

    ### Raises:
        ValueError: If data_matrix is not a pandas DataFrame.
        ValueError: If feature_set is not a list or tuple of strings, or contains elements not in data_matrix.
        ValueError: If any feature in feature_set is not found in data_matrix columns.
        ValueError: If steps is not an integer or a list/tuple of integers.
        ValueError: If feature_type is not a string.
        ValueError: If resolution is not a list or tuple of floats.
        ValueError: If n_repetitions is not an integer.
        ValueError: If graph_reduction_type is not 'PCA' or 'UMAP'.
        ValueError: If npcs is not an integer.
        ValueError: If ecs_thresh is not a numeric value.
        ValueError: If algorithm is not 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - pca_list (dict): PCA embeddings for each step.
            - embedding_list (dict): UMAP embeddings for each step.
            - steps_ecc_list (dict): Contains the element consistency scores for each step and resolution, with each entry as follows:
                - ecc (float): The element consistency score for the corresponding step.
                - most_frequent_partition (dict): Information about the partition with the highest frequency:
                    - mb (list): The partition with the highest frequency.
                    - freq (int): The frequency of the most frequent partition.
                    - seed (int): The seed used for the most frequent partition.
                - n_different_partitions (int): The number of different partitions obtained for the corresponding step.
            - incremental_ecs_list (dict): Incremental element consistency scores for each step and resolution.

    ### Example:
    >>> import pandas as pd
    >>> import ClustAssessPy as ca
    >>> import numpy as np
    >>> # Generate a random data matrix
    >>> data_matrix = pd.DataFrame(np.random.rand(1000, 1000))
    >>> data_matrix.columns = [f'gene_{i}' for i in range(1000)]
    >>> # Define the feature set
    >>> feature_set = data_matrix.columns.tolist()
    >>> # Define the steps
    >>> steps = [500, 1000, 1500]
    >>> # Define the resolutions
    >>> resolution = np.arange(0.1, 1.1, 0.3)
    >>> # Assess the feature stability
    >>> feature_stability_results = ca.assess_feature_stability(data_matrix, feature_set, steps, 'test', resolution, ncores=1, n_repetitions=10, graph_reduction_type="PCA", npcs=30, algorithm='leiden')
    """
    warnings.warn("Please ensure data matrix has rows as cells and columns as features.")

    if not show_warnings:
        if algorithm == "leiden":
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Parameter checks and initial processing
    _validate_parameters_feature_stability(data_matrix, feature_set, steps, feature_type, resolution, n_repetitions, graph_reduction_type, npcs, ecs_thresh, algorithm, flavor, ncores)

    ncells = data_matrix.shape[0]

    pca_list, embedding_list, partitions_list, steps_ecc_list, incremental_ecs_list = {}, {}, {}, {}, {}

    if seed_sequence is None:
        seed_sequence = range(1, n_repetitions * 100, 100)

    if parallel_steps:
        # Run steps in parallel
        with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
            futures = {}
            for step in steps:
                # Map each step to its corresponding future.
                future = executor.submit(_perform_feature_stability_step, data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)
                futures[future] = step

            # Process the results as they complete
            for future in concurrent.futures.as_completed(futures):
                step = futures[future]
                result = future.result()

                if graph_reduction_type == "UMAP":
                    pca_embeddings, umap_embeddings, temp_list = result
                    assign_to_nested_dict(embedding_list, feature_type, step, umap_embeddings)
                else:
                    pca_embeddings, temp_list = result

                # Store pca_embeddings and steps_ecc (temp_list)
                assign_to_nested_dict(pca_list, feature_type, str(step), pca_embeddings)
                assign_to_nested_dict(steps_ecc_list, feature_type, str(step), temp_list)
    else:
        for step in steps:
            if graph_reduction_type == "UMAP":
                pca_embeddings, umap_embeddings, temp_list = _perform_feature_stability_step(data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)
                assign_to_nested_dict(embedding_list, feature_type, step, umap_embeddings)
            else:
                pca_embeddings, temp_list = _perform_feature_stability_step(data_matrix, feature_set, step, npcs, ncells, resolution, seed_sequence, algorithm, ecs_thresh, partitions_list, graph_reduction_type, pca_postprocessing, flavor, ncores, show_warnings, umap_arguments)

            # Store pca_embeddings and steps_ecc (temp_list)
            assign_to_nested_dict(pca_list, feature_type, str(step), pca_embeddings)
            assign_to_nested_dict(steps_ecc_list, feature_type, str(step), temp_list)

    # Compute the incremental_ecs_list
    incremental_ecs_list = {}
    incremental_ecs_list[feature_type] = {}
    if len(steps) > 1:
        for i in range(1, len(steps)):
            temp_list = {}
            for res in map(str, resolution):
                temp_list[res] = element_sim_elscore(
                    steps_ecc_list[feature_type][str(steps[i - 1])][res]['most_frequent_partition']['mb'],
                    steps_ecc_list[feature_type][str(steps[i])][res]['most_frequent_partition']['mb']
                )

            incremental_ecs_list[feature_type][f'{steps[i - 1]}-{steps[i]}'] = temp_list

    return {
        'pca_list': pca_list,
        'embedding_list': embedding_list,
        'steps_ecc_list': steps_ecc_list,
        'incremental_ecs_list': incremental_ecs_list
    }
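
Continuing the docstring example, a sketch of walking the returned structure (key layout as in the Returns section above; the actual scores depend on the random data):

>>> for step, by_res in feature_stability_results['steps_ecc_list']['test'].items():
...     for res, info in by_res.items():
...         print(step, res, np.median(info['ecc']), info['n_different_partitions'])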

import concurrent.futures
import numpy as np
from anndata import AnnData
from scipy.sparse import csr_matrix

def assess_clustering_stability(graph_adjacency_matrix, resolution, ncores,
                                n_repetitions=100,
                                seed_sequence=None, ecs_thresh=1,
                                algorithms=['louvain', 'leiden'],
                                flavor="default", show_warnings=False):
    '''
    Evaluates the stability of different graph clustering methods in the clustering pipeline.
    The method will iterate through different values of the resolution parameter and compare, using the EC consistency score, the partitions obtained at different seeds.

    ### Args:
        graph_adjacency_matrix (scipy.sparse matrix or array-like): The adjacency matrix of the cell graph (e.g. an SNN graph); converted internally to csr_matrix if needed.
        resolution (list): A list of resolutions to be used for clustering.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        algorithms (str array): The community detection algorithms to be used: ['louvain'], or ['leiden'], or both ['louvain', 'leiden'].
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.
        show_warnings (bool): Whether to print warnings or not.

    ### Raises:
        ValueError: If resolution is not a list or tuple of floats.
        ValueError: If n_repetitions is not an integer.
        ValueError: If ecs_thresh is not a numeric value.
        ValueError: If algorithms contains values other than 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - split_by_resolution (dict): Stability results when evaluating the clustering stability by resolution.
            - split_by_k (dict): Stability results when evaluating the clustering stability by the number of clusters (k).

    ### Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> import ClustAssessPy as ca
    >>> # Create a PCA embedding.
    >>> pca_emb = np.random.uniform(low=0.0, high=1.0, size=(100, 30))
    >>> pca_emb_df = pd.DataFrame(pca_emb, index=map(str, range(1, 101)), columns=["PCA_" + str(i) for i in range(1, 31)])
    >>> # Get SNN matrix from pca using 20 neighbours
    >>> adj_mat = ca.get_adjacency_matrix_wrapper(pca_emb_df, n_neighs=20)['snn']
    >>> clust_stability = ca.assess_clustering_stability(adj_mat, resolution=[0.1, 0.3, 0.5, 0.7, 0.9], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
    >>> # ------------------------------------------------
    >>> # ALTERNATIVELY WITH SCANPY - assuming you have an adata object
    >>> # ------------------------------------------------
    >>> import scanpy as sc
    >>> sc.pp.neighbors(adata, n_neighbors=20)
    >>> adj_mat = adata.obsp['connectivities']
    >>> clust_stability = ca.assess_clustering_stability(adj_mat, resolution=[0.1, 0.3, 0.5, 0.7, 0.9], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
    '''
    if not show_warnings:
        if 'leiden' in algorithms:
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Convert adjacency matrix to csr_matrix if it's not already
    if not isinstance(graph_adjacency_matrix, csr_matrix):
        graph_adjacency_matrix = csr_matrix(graph_adjacency_matrix)

    _validate_parameters_clustering_stability(resolution, n_repetitions, ecs_thresh, algorithms, flavor, ncores)

    if seed_sequence is None:
        seed_sequence = range(1, n_repetitions * 100, 100)
    else:
        if not isinstance(seed_sequence, (list, np.ndarray)):
            raise ValueError("seed_sequence parameter should be numeric")
        seed_sequence = np.array(seed_sequence).astype(int)

    result_object = {}

    adata = AnnData(graph_adjacency_matrix)

    for alg in algorithms:
        result_object[alg] = {}

        with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
            futures = {executor.submit(_evaluate_clustering_resolution, graph_adjacency_matrix, adata, alg, res, seed_sequence, ncores, ecs_thresh, flavor, show_warnings): res for res in resolution}

        # Leaving the `with` block waits for all submitted jobs, so the results can be collected here.
        for future in concurrent.futures.as_completed(futures.keys()):
            res = futures[future]
            result_object[alg][str(res)] = future.result()

    algorithm_names = list(result_object.keys())
    split_by_k = {alg_name: merge_resolutions(result_object[alg_name]) for alg_name in algorithm_names if alg_name in result_object}

    return {
        'split_by_resolution': result_object,
        'split_by_k': split_by_k
    }
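
A sketch of summarising the result by resolution; each entry carries an 'ecc' field, which is the same field the plotting helpers below consume:

>>> for method, by_res in clust_stability['split_by_resolution'].items():
...     for res, info in by_res.items():
...         print(method, res, np.median(info['ecc']))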

import warnings
import numpy as np

def assess_nn_stability(embedding, n_neigh_sequence, ncores, n_repetitions=100, seed_sequence=None,
                        graph_reduction_type="PCA", ecs_thresh=1, graph_type=1, algorithm='leiden',
                        umap_arguments=None, flavor="default", show_warnings=False):
    '''
    Evaluates clustering stability when changing the values of different parameters involved in the graph building step, namely the base embedding, the graph type and the number of neighbours.

    ### Args:
        embedding (np.ndarray or pd.DataFrame): A matrix associated with a PCA embedding. Embeddings from other dimensionality reduction techniques can be used.
        n_neigh_sequence (int array): An integer array of the number of nearest neighbours.
        ncores (int): The number of cores to be used for parallel processing.
        n_repetitions (int): The number of repetitions of applying the pipeline with different seeds; ignored if seed_sequence is provided by the user.
        seed_sequence (int array): A custom seed sequence; if the value is None, the sequence will be built starting from 1 with a step of 100.
        graph_reduction_type (str): The graph reduction type, denoting if the graph should be built on either the PCA or the UMAP embedding.
        ecs_thresh (float): The ECS threshold used for merging similar clusterings.
        graph_type (int): Argument indicating whether the graph should be NN (0), SNN (1) or both (2).
        algorithm (str): The community detection algorithm to be used: 'louvain' or 'leiden'. Default and recommended is 'leiden'.
        umap_arguments (dict): Arguments to be used in UMAP construction. Used when graph_reduction_type = "UMAP".
        flavor (str): The implementation of the graph clustering algorithm to be used. Default is 'default'. You can also use 'igraph'.
        show_warnings (bool): Whether to print warnings or not.

    ### Raises:
        ValueError: If embedding is not a numpy array or pandas DataFrame.
        ValueError: If n_neigh_sequence is not a list or np.ndarray.
        ValueError: If n_repetitions is not an integer.
        ValueError: If graph_reduction_type is not 'PCA' or 'UMAP'.
        ValueError: If ecs_thresh is not an integer or a float.
        ValueError: If graph_type is not an array of integers or if it contains values not between 0 and 2.
        ValueError: If algorithm is not 'louvain' or 'leiden'.
        ValueError: If flavor is not 'default' or 'igraph'.

    ### Returns:
        dict: A dictionary with the following structure:
            - n_neigh_k_corresp (dict): The number of clusters obtained by running the pipeline multiple times with different seed, number of neighbors and graph type (weighted vs unweighted).
            - n_neigh_ec_consistency (dict): The EC consistency of the partitions obtained at multiple runs when changing the number of neighbors or the graph type.
            - n_different_partitions (dict): The number of different partitions obtained by each number of neighbors.

    ### Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> import ClustAssessPy as ca
    >>> # Create a PCA embedding.
    >>> pca_emb = np.random.uniform(low=0.0, high=1.0, size=(100, 30))
    >>> pca_emb_df = pd.DataFrame(pca_emb, index=map(str, range(1, 101)), columns=["PCA_" + str(i) for i in range(1, 31)])
    >>> # Assess nn stability using Leiden algorithm
    >>> nn_stability_obj_leiden = ca.assess_nn_stability(pca_emb_df, n_neigh_sequence=[10, 15, 20], ncores=1, n_repetitions=10, graph_reduction_type="PCA", algorithm='leiden', graph_type=2)
    '''
    if not show_warnings:
        if algorithm == "leiden":
            print("FutureWarnings (caused by ScanPy for Leiden) will be suppressed. Set show_warnings=True to see them.")

    # Validate parameters.
    _validate_parameters_stability(n_neigh_sequence, n_repetitions, ecs_thresh, graph_type, embedding, algorithm, graph_reduction_type, flavor, ncores)

    # Sort the number of neighbors and convert to integers.
    n_neigh_sequence = np.sort(n_neigh_sequence).astype(int)

    # Convert n_repetitions to an integer.
    n_repetitions = int(n_repetitions)

    # Drop neighbourhood sizes that are not smaller than the number of cells.
    n_cells = embedding.shape[0]
    n_neigh_sequence = n_neigh_sequence[np.where(n_neigh_sequence < n_cells)]

    if len(n_neigh_sequence) == 0:
        warnings.warn(f"The provided values for the `n_neigh_sequence` are greater than the number of cells ({n_cells}). For the downstream analysis, we will set `n_neigh` to {n_cells - 1}.")
        n_neigh_sequence = np.array([n_cells - 1])

    # Create a seed sequence if it's not provided.
    if seed_sequence is None:
        seed_sequence = np.arange(1, n_repetitions * 100, 100)
    else:
        if not isinstance(seed_sequence, (list, np.ndarray)):
            raise ValueError("seed_sequence parameter should be numeric")
        seed_sequence = np.array(seed_sequence).astype(int)

    if graph_reduction_type == "PCA":
        return _assess_nn_stability_pca(embedding, n_neigh_sequence, seed_sequence, ecs_thresh, graph_type, algorithm, flavor, ncores, show_warnings)
    else:
        return _assess_nn_stability_umap(embedding, n_neigh_sequence, seed_sequence, ecs_thresh, graph_type, algorithm, umap_arguments, flavor, ncores, show_warnings)
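
The returned object feeds directly into the corresponding plotting helper (a usage sketch, assuming plot_n_neigh_ecs returns a drawn figure like the other plot_* helpers shown below):

>>> from ClustAssessPy import plot_n_neigh_ecs
>>> fig = plot_n_neigh_ecs(nn_stability_obj_leiden)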

import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_boxplot, theme_classic, ggtitle

def plot_clustering_overall_stability(clust_object, value_type="k", summary_function=np.median):
    '''
    Display EC consistency across clustering methods.

    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        value_type (str): The type of value to be used for grouping the results. Can be either `k` or `resolution`.
        summary_function (function): The function to be used for summarizing the ECC values.

    ### Example:
    >>> import numpy as np
    >>> import scanpy as sc
    >>> pca_emb = np.random.rand(100, 30)
    >>> # Create an AnnData object from pca_emb
    >>> adata = sc.AnnData(X=pca_emb)
    >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
    >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
    >>> adjacency_matrix = adata.obsp['connectivities']
    >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], ncores=1, n_repetitions=10, algorithms=['louvain', 'leiden'])
    >>> plot_clustering_overall_stability(clust_obj)
    '''
    if value_type not in ["k", "resolution"]:
        raise ValueError("`value_type` should contain either `k` or `resolution`")

    # Extracting ECC values and applying the summary function
    ecc_vals = {
        method: {
            val_type: summary_function(data['ecc'])
            for val_type, data in alg.items()
        }
        for method, alg in clust_object[f"split_by_{value_type}"].items()
    }

    melted_df = pd.DataFrame([
        {'Method': method, value_type: val_type, 'ECC': ecc}
        for method, val_types in ecc_vals.items()
        for val_type, ecc in val_types.items()
    ])

    plot = (
        ggplot(melted_df, aes(x='Method', y='ECC', fill='Method')) +
        geom_boxplot() +
        theme_classic() +
        ggtitle(f"Overall clustering stability grouped by {value_type}")
    )

    return plot.draw()
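
Because the function returns the drawn Matplotlib figure (`plot.draw()` above), the result can be saved directly:

>>> fig = plot_clustering_overall_stability(clust_obj)
>>> fig.savefig("overall_stability.png", dpi=300)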

def plot_feature_overall_stability_boxplot(feature_objects_assessment_list):
    '''
    Display EC consistency for each feature set and for each step.

    ### Args:
        feature_objects_assessment_list (dict or list of dict): The object(s) returned by the `assess_feature_stability` function.

    ### Example:
    >>> np.random.seed(42)
    >>> expr_matrix_values = np.vstack((np.random.rand(25, 10), np.random.uniform(5, 7, (75, 10))))
    >>> expr_matrix = pd.DataFrame(expr_matrix_values)
    >>> # Set the row names and column names
    >>> expr_matrix.index = map(str, range(1, 101))
    >>> expr_matrix.columns = ["feature " + str(i) for i in range(1, 11)]
    >>> feature_stability_result = assess_feature_stability(data_matrix=expr_matrix, feature_set=expr_matrix.columns.tolist(), steps=[5, 10], feature_type='type_name', resolution=[0.3, 0.5], ncores=1, n_repetitions=10, algorithm='leiden')
    >>> plot_feature_overall_stability_boxplot(feature_stability_result)
    '''
    # If it is not a list, make it a list
    if not isinstance(feature_objects_assessment_list, list):
        feature_objects_assessment_list = [feature_objects_assessment_list]

    # Get overall stability results in a dataframe.
    data = []
    for fs_object in feature_objects_assessment_list:
        key = list(fs_object['steps_ecc_list'].keys())[0]
        feature_object = fs_object['steps_ecc_list'][key]
        for type_key, steps_dict in feature_object.items():
            for resolution, results_ecc in steps_dict.items():
                data.append({'#Features': type_key, 'ECC': np.median(results_ecc['ecc']), 'type': key})

    df = pd.DataFrame(data)

    # Sort by number of features
    df['#Features'] = df['#Features'].astype(int)
    df = df.sort_values(by='#Features')
    df['#Features'] = pd.Categorical(df['#Features'], categories=sorted(df['#Features'].unique()), ordered=True)

    plot = (
        ggplot(df, aes(x='#Features', y='ECC', fill='type'))
        + geom_boxplot(outlier_shape=None, outlier_alpha=0)
        + theme_classic()
    )

    return plot.draw()
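
Since the function wraps a single result into a list, assessments for several feature types can be overlaid in one figure. A sketch, where hv_result and random_result stand for two hypothetical outputs of separate assess_feature_stability runs (e.g. highly variable vs random genes):

>>> plot_feature_overall_stability_boxplot([hv_result, random_result])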

def plot_feature_overall_stability_incremental(feature_objects_assessment_list):
    '''
    Display the incremental ECS between two consecutive feature steps.

    ### Args:
        feature_objects_assessment_list (dict or list of dict): The object(s) returned by the `assess_feature_stability` function.

    ### Example:
    >>> np.random.seed(42)
    >>> expr_matrix_values = np.vstack((np.random.rand(25, 10), np.random.uniform(5, 7, (75, 10))))
    >>> expr_matrix = pd.DataFrame(expr_matrix_values)
    >>> # Set the row names and column names
    >>> expr_matrix.index = map(str, range(1, 101))
    >>> expr_matrix.columns = ["feature " + str(i) for i in range(1, 11)]
    >>> feature_stability_result = assess_feature_stability(data_matrix=expr_matrix, feature_set=expr_matrix.columns.tolist(), steps=[5, 10], feature_type='type_name', resolution=[0.3, 0.5], ncores=1, n_repetitions=10, algorithm='leiden')
    >>> plot_feature_overall_stability_incremental(feature_stability_result)
    '''
    # If it is not a list, make it a list
    if not isinstance(feature_objects_assessment_list, list):
        feature_objects_assessment_list = [feature_objects_assessment_list]

    df = pd.DataFrame(columns=['#Features', 'ECS', 'type'])

    # Get incremental stability results in a dataframe.
    for fs_object in feature_objects_assessment_list:
        key = list(fs_object['incremental_ecs_list'].keys())[0]
        feature_object = fs_object['incremental_ecs_list'][key]
        for type_key, steps_dict in feature_object.items():
            for resolution, results_ecs in steps_dict.items():
                # Temporary df to hold the results for this resolution
                temp_df = pd.DataFrame({
                    '#Features': [type_key],
                    'ECS': [np.median(results_ecs)],
                    'type': [key]
                })
                # Append the temporary DataFrame to the main DataFrame
                df = pd.concat([df, temp_df], ignore_index=True)

    # Sort by features order (order in which they appear in the incremental_ecs list).
    features_order = df['#Features'].drop_duplicates().tolist()
    df['#Features'] = pd.Categorical(df['#Features'], categories=features_order, ordered=True)

    plot = (
        ggplot(df, aes(x='#Features', y='ECS', fill='type'))
        + geom_boxplot(outlier_shape=None, outlier_alpha=0)
        + theme_classic()
    )

    return plot.draw()

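Because the function wraps a single object into a list before iterating, stability results for several feature sets can be overlaid in one figure. A sketch under the assumptions of the docstring example (`expr_matrix` as above; `hv_features` and `top_features` are hypothetical feature lists):

# Hypothetical: assess two feature sets separately, then overlay them;
# each feature type receives its own fill colour via the 'type' column.
res_hv = assess_feature_stability(data_matrix=expr_matrix, feature_set=hv_features,
                                  steps=[5, 10], feature_type='highly_variable',
                                  resolution=[0.3, 0.5], n_repetitions=10, algorithm=4)
res_top = assess_feature_stability(data_matrix=expr_matrix, feature_set=top_features,
                                   steps=[5, 10], feature_type='top_expressed',
                                   resolution=[0.3, 0.5], n_repetitions=10, algorithm=4)
fig = plot_feature_overall_stability_incremental([res_hv, res_top])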
def plot_k_n_partitions(clust_object, color_information='ecc', y_step=5, pt_size_range=(1.5, 4), dodge_width=0.3, summary_function=np.median):
def plot_k_n_partitions(clust_object, color_information="ecc", y_step=5, pt_size_range=(1.5, 4), dodge_width=0.3, summary_function=np.median):
    '''
    For each configuration provided in clust_object, display how many different partitions with the same number of clusters can be obtained by changing the seed.
    
    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        color_information (str): The information to be used for coloring the points. Can be either `ecc` or `freq_part`.
        y_step (int): The step for the y-axis.
        pt_size_range (tuple): The range of point sizes.
        dodge_width (float): The width of the dodging.
        summary_function (function): The function used to summarize the ECC values.
        
    ### Example:
    >>> pca_emb = np.random.rand(100, 30)
    >>> # Create an AnnData object from pca_emb
    >>> adata = sc.AnnData(X=pca_emb)
    >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
    >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
    >>> adjacency_matrix = adata.obsp['connectivities']
    >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], n_repetitions=10, algorithm=[1,4])
    >>> plot_k_n_partitions(clust_obj)
    '''
    if color_information not in ["ecc", "freq_part"]:
        raise ValueError("color_information can be either `ecc` or `freq_part`")

    if 'split_by_k' in clust_object:
        clust_object = clust_object['split_by_k']

    data_list = []
    max_n_part = 0

    for configuration, partition_object in clust_object.items():
        for k, data in partition_object.items():
            n_partitions = len(data['partitions'])
            # Frequency of the most common partition and total number of runs for this k.
            first_occ = max(partition['freq'] for partition in data['partitions'])
            total_occ = sum(partition['freq'] for partition in data['partitions'])
            freq_part = first_occ / total_occ
            ecc = summary_function(data['ecc'])

            data_list.append({
                'n.clusters': pd.to_numeric(k),  # convert k to numeric
                'n.partitions': n_partitions,
                'configuration': configuration,
                'first.occ': first_occ,
                'total.occ': total_occ,
                'freq_part': freq_part,
                'ecc': ecc
            })

            max_n_part = max(max_n_part, n_partitions)

    overall_total_occ = sum(item['total.occ'] for item in data_list)
    for item in data_list:
        item['frequency_k'] = item['total.occ'] / overall_total_occ

    unique_parts = pd.DataFrame(data_list)
    y_breaks = list(range(0, max_n_part + y_step, y_step))
    unique_parts['n.clusters'] = pd.Categorical(unique_parts['n.clusters'], ordered=True)

    plot = (
        ggplot(unique_parts, aes(
            x='n.clusters',
            y='n.partitions',
            shape='configuration',
            size='frequency_k',
            fill=color_information,
            group='configuration'
        )) +
        scale_y_continuous(breaks=y_breaks) +
        scale_size_continuous(range=pt_size_range, guide='legend') +
        geom_hline(yintercept=list(range(0, max_n_part + 1, y_step)), linetype="dashed", color="#e3e3e3") +
        geom_point(position=position_dodge(width=dodge_width)) +
        theme_classic() +
        labs(x="k", y="# partitions") +
        guides(shape=guide_legend(override_aes={'size': max(pt_size_range)}))
    )

    return plot.draw()

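`summary_function` only needs to reduce an array of ECC values to a scalar, so any NumPy reduction (or a lambda) can replace the default `np.median`. A brief sketch, reusing `clust_obj` from the docstring example:

import numpy as np

# Summarize each configuration by its mean ECC instead of the median.
fig_mean = plot_k_n_partitions(clust_obj, summary_function=np.mean)

# Or by a pessimistic lower quartile (an illustrative choice, not a package default).
fig_q25 = plot_k_n_partitions(clust_obj, summary_function=lambda ecc: np.quantile(ecc, 0.25))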
def plot_k_resolution_corresp(clust_object, color_information='ecc', dodge_width=0.3, pt_size_range=[1.5, 4], summary_function=np.median):
def plot_k_resolution_corresp(clust_object, color_information="ecc", dodge_width=0.3, pt_size_range=[1.5, 4], summary_function=np.median):
    '''
    For each configuration provided in the clust_object, display the number of clusters obtained for different values of the resolution parameter.
    
    ### Args:
        clust_object (dict): The object returned by the `assess_clustering_stability` function.
        color_information (str): The information to be used for coloring the points. Can be either `ecc` or `freq_k`.
        dodge_width (float): The width of the dodging.
        pt_size_range (float array): The range of point sizes.
        summary_function (function): The function used to summarize the ECC values.
        
    ### Example:
    >>> pca_emb = np.random.rand(100, 30)
    >>> # Create an AnnData object from pca_emb
    >>> adata = sc.AnnData(X=pca_emb)
    >>> # Run scanpy.pp.neighbors to get the connectivities as an adjacency matrix
    >>> sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')
    >>> adjacency_matrix = adata.obsp['connectivities']
    >>> clust_obj = assess_clustering_stability(adjacency_matrix, resolution=[0.1, 0.3, 0.5], n_repetitions=10, algorithm=[1,4])
    >>> plot_k_resolution_corresp(clust_obj)
    '''
    if color_information not in ["ecc", "freq_k"]:
        raise ValueError("color_information can be either `ecc` or `freq_k`")

    appearances_df = pd.DataFrame()

    # Iterate through the clustering results
    for method, resolutions in clust_object['split_by_resolution'].items():
        for resolution, res_object in resolutions.items():
            # Total number of runs at this resolution, used for normalization.
            n_runs = sum(partition['freq'] for cluster in res_object['clusters'].values() for partition in cluster['partitions'])

            temp_data = {
                'method': [],
                'resolution_value': [],
                'number_clusters': [],
                'freq_partition': [],
                'ecc': [],
                'freq_k': []
            }

            for k, cluster_info in res_object['clusters'].items():
                # Sum of frequencies for this cluster size (k).
                freq_k = sum(partition['freq'] for partition in cluster_info['partitions'])

                temp_data['method'].append(method)
                temp_data['resolution_value'].append(resolution)
                temp_data['number_clusters'].append(k)
                # Relative frequency of the first stored partition among all runs with this k.
                temp_data['freq_partition'].append(cluster_info['partitions'][0]['freq'] / freq_k)
                temp_data['ecc'].append(summary_function(cluster_info['ecc']))
                temp_data['freq_k'].append(freq_k)

            # Convert to DataFrame and normalize
            temp_df = pd.DataFrame(temp_data)
            temp_df['freq_k'] /= n_runs
            appearances_df = pd.concat([appearances_df, temp_df], ignore_index=True)

    appearances_df['number_clusters'] = pd.to_numeric(appearances_df['number_clusters'])
    appearances_df['resolution_value'] = pd.Categorical(appearances_df['resolution_value'])

    plot = (
        ggplot(appearances_df, aes(
            y='number_clusters',
            x='factor(resolution_value)',
            size='freq_partition',
            fill=color_information,
            shape='method',
            group='method'
        )) +
        geom_hline(yintercept=appearances_df['number_clusters'].unique(), linetype="dashed", color="#e3e3e3") +
        geom_point(position=position_dodge(width=dodge_width)) +
        theme_classic() +
        scale_fill_cmap(guide="colorbar") +
        labs(x="resolution", y="k") +
        scale_size_continuous(range=pt_size_range, guide="legend") +
        theme(axis_text_x=element_text(angle=90, hjust=1)) +
        guides(shape=guide_legend(override_aes={'size': max(pt_size_range)}))
    )

    return plot.draw()

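As the validation at the top of the function indicates, the points can alternatively be coloured by `freq_k`, the normalized frequency with which each number of clusters occurred across runs. A minimal sketch, reusing `clust_obj` from the docstring example:

# Colour by how often each k was observed, rather than by summarized ECC;
# both columns are computed in the dataframe built above.
fig = plot_k_resolution_corresp(clust_obj, color_information='freq_k')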
def plot_n_neigh_ecs(nn_ecs_object):
def plot_n_neigh_ecs(nn_ecs_object):
    '''
    Display, for all configurations consisting of different numbers of neighbors, graph types and base embeddings,
    the EC Consistency of the partitions obtained over multiple runs.
    
    ### Args:
        nn_ecs_object (dict): The object returned by the `assess_nn_stability` function.
    
    ### Example:
    >>> pca_emb = np.random.rand(100, 30)
    >>> nn_stability_obj = assess_nn_stability(pca_emb, n_neigh_sequence=[10, 15, 20], n_repetitions=10, algorithm=4)
    >>> plot_n_neigh_ecs(nn_stability_obj)
    '''
    data = []
    for config_name, config in nn_ecs_object['n_neigh_ec_consistency'].items():
        for n_neigh, values in config.items():
            for ecc in values:
                data.append({'ECC': ecc, 'n_neigh': n_neigh, 'config_name': config_name})
    df = pd.DataFrame(data)

    # Convert 'n_neigh' to a categorical variable and sort its levels numerically.
    df['n_neigh'] = pd.Categorical(df['n_neigh'], ordered=True)
    df['n_neigh'] = df['n_neigh'].cat.reorder_categories(sorted(df['n_neigh'].unique(), key=lambda x: float(x)))

    plot = (
        ggplot(df, aes(x='n_neigh', y='ECC', fill='config_name'))
        + geom_boxplot()
        + theme_classic()
    )

    return plot.draw()

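All of these plotting helpers return `plot.draw()`, i.e. a matplotlib `Figure`, so the result can be resized or saved with the standard matplotlib API. A minimal sketch, assuming `nn_stability_obj` from the example above (the filename is illustrative):

# plot.draw() returns a matplotlib Figure, so the usual Figure methods apply.
fig = plot_n_neigh_ecs(nn_stability_obj)
fig.set_size_inches(8, 4)
fig.savefig("n_neigh_ecs.png", dpi=300, bbox_inches="tight")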