Source code for graphico.graphico

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan  8 13:37:56 2020
@author: karliskanders
Last updated on 01/04/2020
"""

import leidenalg as la
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami_score
from sklearn.metrics import confusion_matrix

from matplotlib import pyplot as plt
import seaborn as sns
import igraph as ig
from time import time
import os

class ConsensusClustering:
    """
    Class for determining stable clustering of data by using a 3-step process.

    First, an ensemble of clustering results is generated by repeatedly
    applying a clustering algorithm many times (step 1).

    Then, the ensemble is used to define new edge weights between the graph
    nodes based on the data point co-clustering occurrences. These weights are
    then used to generate another "consensus ensemble", which in practice is
    very stable and exhibits only minor variations between different
    clustering runs (step 2).

    To decide which one of the partitions among the "consensus ensemble"
    should be designated as the final consensus partition, we use adjusted
    mutual information to compare all partitions within the consensus
    ensemble, and choose the one which agrees the best with all of the other
    partitions (step 3).

    Presently, we use the Leiden community detection algorithm for clustering
    the graph into communities. However, this class can be easily adapted to
    use other graph-based clustering algorithms.

    The consensus clustering approach used here is an adapted version of the
    intuitively simple but well-performing "Ensemble Clustering for Graphs"
    method by Poulin & Theberge (see https://arxiv.org/abs/1809.05578).
    """

    def __init__(
        self, graph, N=20, N_consensus=10, verbose=True, seed=None, edge_bootstrap=False
    ):
        """
        Parameters
        ----------
        graph (igraph.Graph):
            Graph object that will be used for finding graph communities.
        N (int):
            Ensemble size for the first clustering step (normally use 500-1000).
        N_consensus (int):
            Ensemble size for the consensus clustering step.
        verbose (boolean):
            Determines whether user is informed about the progress regarding
            the intermediate steps of the clustering procedure.
        seed (int):
            Seed for the random number generator; useful for reproducing the
            exact same clustering result. This seed is then used to generate
            all other random seeds for each repeated clustering run. Note that
            it is applied to numpy's global random state.
        edge_bootstrap (boolean):
            Determines whether edge bootstrapping is used for generating the
            clustering ensemble.
        """
        self.graph = graph
        self.directed = graph.is_directed()
        self.N = N
        self.N_consensus = N_consensus
        self.v = verbose
        self.edge_bootstrap = edge_bootstrap
        # Hard-coded parameter for consensus clustering (step 2) from the
        # Poulin & Theberge publication; it is stored here but not referenced
        # by any method of this class.
        self.w_min = 0.05
        # List of lists containing the ensemble of step 1 clustering results
        self._ensemble = None
        # List of lists containing the ensemble of step 2 clustering results
        self._consensus_ensemble = None
        # Clustering co-occurrence matrix
        self._COOC = None
        # Final consensus clustering partition
        self._consensus_partition = None

        # Manage random seeds
        if seed is not None:
            if self.v:
                print("Setting random seeds...")
            # The global numpy state is seeded on purpose: the edge
            # bootstrapping in create_ensemble() draws from it as well.
            np.random.seed(seed)
            self.ensemble_seeds = np.random.randint(100000000, size=N)
            self.consensus_ensemble_seed = np.random.randint(100000000)
        else:
            self.ensemble_seeds = None
            self.consensus_ensemble_seed = None

    @property
    def ensemble(self):
        """
        List of clustering results (pertaining to step 1 of the clustering
        procedure), where each clustering result is a list of integers. These
        integers correspond to cluster labels. Generated lazily on first
        access unless an ensemble was provided via load_ensemble().
        """
        if self._ensemble is None:
            # Generate ensemble of self.N partitions
            self._ensemble = self.create_ensemble(self.N, weights="weight")
        return self._ensemble

    @property
    def COOC(self):
        """
        Co-clustering occurrence matrix: element (i,j) of this matrix
        indicates how many times nodes i and j were clustered together.
        """
        if self._COOC is None:
            # Calculate the co-occurrence matrix from the ensemble
            self._COOC = self.cooccurrence_matrix(self.ensemble)
        return self._COOC

    @property
    def consensus_ensemble(self):
        """
        List of consensus clustering results (pertaining to step 2 of the
        clustering procedure) where each clustering result is a list of
        integers. These integers correspond to cluster labels.
        """
        if self._consensus_ensemble is None:
            cooc = self.COOC
            # Nodes are connected if they were co-clustered at least once
            adjacency = (cooc != 0).astype(int)
            if self.v:
                print("Using co-occurrence matrix to do consensus clustering...")
            # Normalise by the number of partitions that actually produced the
            # counts; this differs from self.N when an ensemble of another
            # size was supplied through load_ensemble().
            n_partitions = len(self.ensemble)
            # Create a new graph and find communities in this new graph
            g_cooc = build_graph(cooc / n_partitions, kNN=adjacency)
            clust_cooc = ConsensusClustering(
                g_cooc,
                N=self.N_consensus,
                verbose=self.v,
                seed=self.consensus_ensemble_seed,
            )
            self._consensus_ensemble = clust_cooc.create_ensemble()
        return self._consensus_ensemble

    def load_ensemble(self, ensemble, consensus=False):
        """
        This method can be used to load an external ensemble. For example, you
        might have stored an ensemble of clustering results from a previous
        analysis and would now like to recalculate the consensus partition.

        Parameters
        ----------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.
        consensus (boolean):
            Determines whether the ensemble should be treated as the initial
            ensemble (from step 1) or the consensus ensemble (from step 2).
        """
        if consensus:
            self._consensus_ensemble = ensemble
        else:
            self._ensemble = ensemble

    def create_ensemble(self, N=None, weights="weight"):
        """
        Generates ensemble of clustering partitions by repeatedly applying a
        clustering algorithm many times.

        Parameters
        ----------
        N (int OR None):
            Ensemble size for the first clustering step. If N==None, use the
            class property self.N
        weights (string OR None):
            Edge property to use for the community detection

        Returns
        -------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.
        """
        if N is None:
            N = self.N

        ensemble = []
        if self.v:
            print(f"Generating an ensemble with {N} partitions...")

        for i in range(N):
            # Choose random seed for the clustering
            if self.ensemble_seeds is not None:
                ensemble_seed = self.ensemble_seeds[i]
            else:
                ensemble_seed = None

            # Bootstrapping by removing edges
            if self.edge_bootstrap:
                graph_ = self.graph.copy()
                rand_numbers = np.random.rand(len(graph_.es))
                edge_weights = graph_.es[weights]
                # Normalise the edge weights so that the maximum is 1
                edge_weights = np.array(edge_weights) / np.max(edge_weights)
                # An edge is removed when its random draw exceeds its
                # normalised weight, i.e. heavier edges are more likely to be
                # kept (one might want to parameterise this further to tweak
                # the edge removal)
                id_to_delete = np.where(rand_numbers > edge_weights)[0]
                graph_.delete_edges(list(id_to_delete))
            else:
                graph_ = self.graph

            # Community detection
            p = la.find_partition(
                graph_,
                weights=weights,
                partition_type=la.ModularityVertexPartition,
                seed=ensemble_seed,
            )
            ensemble.append(p.membership)

            if self.v:
                print("x", end="")

        if self.v:
            print("")
        return ensemble

    @staticmethod
    def cooccurrence_matrix(ensemble):
        """
        Create the co-clustering occurrence matrix (also called 'cooccurrence
        matrix'); This can be quite slow for large graphs with ~10K nodes and
        probably could be optimized, e.g., with numba.

        Parameters
        ----------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.

        Returns
        -------
        COOC (numpy.ndarray):
            Symmetric (n, n) matrix of co-clustering counts with a zero
            diagonal, where n is the length of the first clustering result.
        """
        n = len(ensemble[0])
        COOC = np.zeros((n, n))

        # For each clustering result in the ensemble
        for membership in ensemble:
            # Use pandas to find node pairs with the same cluster labels
            membership_df = pd.DataFrame(
                data={"id": list(range(len(membership))), "cluster": membership}
            )
            cooc = membership_df.merge(right=membership_df, on="cluster")
            # Keep each unordered pair once (upper triangle, no self pairs)
            cooc = cooc[cooc.id_x < cooc.id_y]
            # For each node pair with the same cluster labels, add 1 to the
            # co-clustering occurrence matrix
            COOC[cooc.id_x.values, cooc.id_y.values] += 1

        # Mirror the upper triangle to obtain a symmetric matrix
        COOC = COOC + np.triu(COOC).T
        return COOC

    @property
    def consensus_partition(self):
        """
        Final consensus partition of the clustering procedure
        """
        if self._consensus_partition is None:
            self.consensus_communities()
        return self._consensus_partition

    def consensus_communities(self):
        """
        Method for finding the consensus clustering partition, i.e., for the
        steps 2-3 of the clustering procedure.

        Sets self.consensus_AMI, self.n and self.sizes in addition to the
        consensus partition itself.
        """
        # Measure the stability of the consensus ensemble. If the consensus
        # ensemble has not been generated yet, it will be by calling the
        # self.consensus_ensemble
        self.consensus_AMI, AMI_matrix = self.ensemble_AMI(
            self.consensus_ensemble, v=self.v
        )

        # Take "the most agreeable" partition as the final consensus
        # clustering partition (i.e., step 3)
        mean_ami = np.mean(AMI_matrix, axis=1)
        most_agreeable = np.argsort(mean_ami)[-1]
        self._consensus_partition = self.consensus_ensemble[most_agreeable]

        # Describe the final consensus clustering partition
        char = self.describe_partition(self._consensus_partition, self.v)
        self.n = char["n"]
        self.sizes = char["sizes"]

    @staticmethod
    def describe_partition(partition, verbose=True):
        """
        Describes the number of clusters and the number of nodes in each
        cluster.

        Parameters
        ----------
        partition (list of int):
            Clustering result; integers correspond to cluster labels.
        verbose (boolean):
            Determines whether a one-line summary is printed.

        Returns
        -------
        (dict):
            'n' is the number of distinct cluster labels and 'sizes' is a list
            with the number of nodes carrying each label, ordered by ascending
            label value.
        """
        partition = np.array(partition)
        # Counting per distinct label keeps the sizes correct even when the
        # labels are not the contiguous range 0..n-1
        clusters, counts = np.unique(partition, return_counts=True)
        n = len(clusters)
        sizes = counts.tolist()
        if verbose:
            print(f"Clustering with {len(partition)} nodes and {n} clusters.")
        return {"n": n, "sizes": sizes}

    @staticmethod
    def ensemble_AMI(P, v=True):
        """
        Calculates pairwise adjusted mutual information (AMI) scores across
        the clustering ensemble.

        Parameters
        ----------
        P (list of lists of int):
            Clustering ensemble, i.e., a list of clustering results, where
            each clustering result is a list of integers. These integers
            correspond to cluster labels.
        v (boolean):
            Determines whether information about the results is printed.

        Returns
        -------
        ami_avg (float):
            Average adjusted mutual information across the ensemble
        ami_matrix (numpy.ndarray):
            The complete matrix with adjusted mutual information scores
            between all pairs of clustering results
        """
        # If P is not a list of lists but a partition module instead, extract
        # the lists of memberships
        if isinstance(P[0], la.VertexPartition.ModularityVertexPartition):
            P = [e.membership for e in P]

        n_partitions = len(P)
        ami_matrix = np.zeros((n_partitions, n_partitions))
        # Only the strict upper triangle needs to be computed: the matrix is
        # symmetric and the diagonal is set to 1 below
        for i in range(n_partitions):
            for j in range(i + 1, n_partitions):
                ami_matrix[i][j] = ami_score(P[i], P[j], average_method="arithmetic")
        ami_matrix += np.triu(ami_matrix).T
        np.fill_diagonal(ami_matrix, 1)

        ami_avg = np.mean(ami_matrix[np.triu_indices_from(ami_matrix, k=1)])
        if v:
            print(
                f"Average pairwise AMI across {len(P)} partitions is {np.round(ami_avg,4)}"
            )
        return ami_avg, ami_matrix
def build_kNN_matrix(similarity_matrix, kNN, self_connections=False):
    """
    Method for building k-nearest neighbour adjacency matrix

    Parameters
    ----------
    similarity_matrix (numpy.ndarray):
        Matrix with similarity values for each pair of nodes/data points
    kNN (int):
        Number of nearest neighbours for each node/data point
    self_connections (boolean):
        Determines whether self connections are included as part of the
        k-nearest neighbours.

    Returns
    -------
    kNN_matrix (numpy.ndarray):
        Binary, symmetric matrix with 1s for each node's k-nearest neighbours
        and 0s otherwise. An entry (i, j) is 1 if j is among the neighbours of
        i or i is among the neighbours of j.
    """
    n_nodes = similarity_matrix.shape[0]
    kNN_matrix = np.zeros(similarity_matrix.shape)

    for i in range(n_nodes):
        # Candidate neighbours ordered from the most to the least similar
        ranked = np.flip(np.argsort(similarity_matrix[i, :]))
        if not self_connections:
            # Drop the node itself explicitly; merely zeroing the diagonal
            # still lets a node pick itself when it ties with other
            # zero-similarity candidates
            ranked = ranked[ranked != i]
        closest = ranked[0:kNN]
        # Symmetrise: the relation holds if either node selected the other
        kNN_matrix[i, closest] = 1
        kNN_matrix[closest, i] = 1

    return kNN_matrix
def build_graph(similarity_matrix, kNN=None, self_connections=False):
    """
    Builds an igraph.Graph from the provided similarity matrix and adjacency
    matrix.

    Parameters
    ----------
    similarity_matrix (numpy.ndarray):
        Matrix with similarity values for each pair of nodes/data points
        (diagonal is assumed to be 1).
    kNN (int OR numpy.ndarray OR None):
        If kNN is an int, this method builds a k-nearest neighbour graph with
        k=kNN; if kNN is a matrix, it assumed to be the adjacency matrix of
        the graph; if kNN is None, we allow all possible connections when
        creating the graph.
    self_connections (boolean):
        Determines whether self connections are included as part of the
        k-nearest neighbours. If False, diagonal entries of the adjacency are
        ignored (the caller's matrix is not modified).

    Returns
    -------
    g (igraph.Graph):
        Undirected graph with one vertex per row of 'similarity_matrix', where
        edges have property 'weight' corresponding to the values of the
        'similarity_matrix'.

    Raises
    ------
    TypeError:
        If kNN is not an int, a numpy.ndarray or None.
    """
    n_nodes = similarity_matrix.shape[0]

    if kNN is None:
        # Uses all connections
        kNN_matrix = np.ones(similarity_matrix.shape)
    elif isinstance(kNN, np.ndarray):
        # Assumes that an adjacency matrix has been provided
        kNN_matrix = kNN
    elif isinstance(kNN, (int, np.integer)) and not isinstance(kNN, bool):
        # Builds a symmetric, undirected k-nearest neighbour graph with k=kNN
        kNN_matrix = build_kNN_matrix(similarity_matrix, int(kNN), self_connections)
    else:
        raise TypeError(
            "kNN must be an int, a numpy.ndarray or None, "
            f"got {type(kNN).__name__}"
        )

    if not self_connections:
        # Work on a copy so that a caller-provided adjacency is left intact
        kNN_matrix = np.array(kNN_matrix, copy=True)
        np.fill_diagonal(kNN_matrix, 0)

    print("Building the graph... ", end="")
    # The graph is undirected, so only the upper triangle defines the edges
    sources, targets = np.triu(kNN_matrix).nonzero()
    weights = similarity_matrix[sources, targets]
    edgelist = list(zip(sources.tolist(), targets.tolist()))
    # Pass the vertex count explicitly: inferring it from the edge list alone
    # would drop isolated nodes with the highest indices, and vertex indices
    # would no longer line up with the rows of the similarity matrix
    g = ig.Graph(n=n_nodes, edges=edgelist, directed=False)
    g.es["weight"] = weights
    print("done!")
    return g
def node_affinity(cooc_matrix, cluster_labels, normalise=True):
    """
    Estimate each node's affinity to the different clusters based on the
    ensemble clustering results (if normalise==True, then this can be
    interpreted as the probability of node belonging to the particular
    cluster).

    Parameters
    ----------
    cooc_matrix (numpy.ndarray):
        Co-clustering occurrence matrix (see also ConsensusClustering.COOC).
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    normalise (boolean):
        Determines whether node affinities to clusters are normalised by the
        sum of rows.

    Returns
    -------
    M (numpy.ndarray):
        Node affinity matrix with rows corresponding to nodes and columns to
        clusters (in ascending order of cluster label). Matrix elements (i,c)
        indicate the average co-clustering occurrence value between node i and
        all other nodes in the cluster c (in terms of either absolute or
        normalised values). Rows whose affinities sum to zero are left as
        zeros when normalising.
    """
    # Convert to arrays so that the label comparison below is element-wise
    # even when a plain list is passed in
    labels = np.asarray(cluster_labels)
    cooc = np.asarray(cooc_matrix)
    clust = np.unique(labels)

    M = np.zeros((len(labels), len(clust)))
    # Calculate node affinity score for each node with respect to each
    # cluster; columns follow the sorted unique labels, so the labels do not
    # have to be the contiguous range 0..k-1
    for col, c in enumerate(clust):
        M[:, col] = cooc[:, labels == c].mean(axis=1)

    # Normalise rows
    if normalise:
        row_totals = M.sum(axis=1, keepdims=True)
        np.divide(M, row_totals, out=M, where=row_totals != 0)
    return M
def node_affinity_plot(M, cluster_labels, aspect_ratio=0.002, return_matrix=False):
    """
    Plot the node affinity matrix created using node_affinity() method.

    Parameters
    ----------
    M (numpy.ndarray):
        Node affinity matrix.
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    aspect_ratio (float):
        Needs to be adjusted for properly displaying the node affinity matrix.
    return_matrix (boolean):
        Determines whether the function returns the sorted affinity matrix.

    Returns
    -------
    M_sorted (numpy.ndarray):
        Only returned if return_matrix is True. Rows are grouped by cluster
        label; within each cluster, the values of that cluster's own column
        are arranged in descending order.
    """
    labels = np.asarray(cluster_labels)
    sort_order = np.argsort(labels)
    membership_sorted = labels[sort_order]
    # Fancy indexing returns a copy, so M itself is not modified below
    M_sorted = M[sort_order, :]

    # Columns are addressed by the position of the label among the sorted
    # unique labels, which coincides with the label itself for 0..k-1 labels
    for col, c in enumerate(np.unique(labels)):
        rows = membership_sorted == c
        # NOTE(review): only the cluster's own column is reordered, the other
        # columns of these rows keep their order -- confirm this is intended
        M_sorted[rows, col] = np.flip(np.sort(M_sorted[rows, col]))

    # The size has to be passed at creation; assigning fig.figsize afterwards
    # has no effect on the figure
    fig, ax = plt.subplots(figsize=(12, 12))
    plt.imshow(M_sorted, cmap="Blues")
    plt.colorbar()
    ax.set_aspect(aspect_ratio)
    plt.xlabel("cluster")
    plt.ylabel("node")
    plt.title("Node affinity to cluster")
    plt.show()

    if return_matrix:
        return M_sorted
def cluster_affinity_matrix(M, cluster_labels, symmetric=True, plot=True, cmap="Blues"):
    """
    Calculate each cluster's affinity to other clusters based on their
    constituent nodes' affinities to the different clusters.

    Parameters
    ----------
    M (numpy.ndarray):
        Node affinity matrix.
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    symmetric (boolean):
        If True, ensures that the cluster affinity matrix is symmetric.
    plot (boolean):
        Determines whether the cluster affinity matrix is displayed.
    cmap (string):
        Matplotlib colormap name used when displaying the matrix.

    Returns
    -------
    C (numpy.ndarray):
        Cluster affinity matrix, where elements (k,l) indicates the average
        co-clustering occurrence of cluster k nodes with the nodes of cluster
        l. Rows and columns follow the ascending order of cluster labels.
    """
    # An array is required for the element-wise label comparison below; a
    # plain list compared to an int would yield a single False
    labels = np.asarray(cluster_labels)
    clusters = np.unique(labels)
    n_clust = len(clusters)

    C = np.zeros((n_clust, n_clust))
    for row, c in enumerate(clusters):
        # Average the affinities of the nodes of cluster c to every cluster
        C[row, :] = np.mean(M[labels == c, :][:, :n_clust], axis=0)

    if symmetric:
        C = 0.5 * C + 0.5 * C.T

    if plot:
        plt.imshow(C, cmap=cmap)
        plt.xlabel("cluster")
        plt.ylabel("cluster")
        plt.colorbar()
        plt.title("Cluster affinity to other clusters")
        plt.show()

    return C
def list_cluster_stability(C, cluster_labels=None):
    """
    Prints out the diagonal values of cluster affinity matrix, which can be
    used as a measurement of cluster stability.

    Parameters
    ----------
    C (numpy.ndarray):
        Cluster affinity matrix.
    cluster_labels (list of string OR None):
        Optional text label for each cluster, printed next to its stability
        value; if None, empty labels are used.
    """
    clust_stability = C.diagonal()
    if cluster_labels is None:
        cluster_labels = [""] * len(clust_stability)
    for j, stability in enumerate(clust_stability):
        print(f"{np.round(stability,2)}, (cluster {j}), {cluster_labels[j]}")
####### TO DO TO DO TO DO TO DO TO DO ########
def plot_confusion_matrix(
    y_true,
    y_pred,
    true_labels=None,
    pred_labels=None,
    normalize_to=None,
    plot=True,
    return_handle=False,
):
    """
    Compares two different partitionings of data, i.e., two seperate
    clusterings, using a matrix that indicates the correspondence between the
    clusters of the first partition and the clusters of the second partition.
    For example, if normalize_to==None, an entry will show how many points
    were assigned both to a given cluster of y_true and a given cluster of
    y_pred.

    Another popular use case is a classification task, where we would compare
    the predicted labels and true labels.

    Parameters
    ----------
    y_true (list of int):
        First partition; a list of integers denoting cluster labels
    y_pred (list of int):
        Second partition; a list of integers denoting cluster labels
    true_labels:
        Text labels describing the clusters in y_true partition
    pred_labels:
        Text labels describing the clusters in y_pred partition
    normalize_to (int OR None):
        Can take the values 0, 1 or None; determines whether the values of the
        confusion matrix are normalised with respect to the total number of
        points in clusters of the y_true partition (normalize_to=1) or y_pred
        partition (normalize_to=0), or if the values are not normalised at all
        (normalize_to=None).
    plot (boolean):
        Determines whether the matrix is drawn as a heatmap.
    return_handle (boolean):
        If True, the heatmap axes are returned together with the matrix and
        the figure is not shown.

    Returns
    -------
    (numpy.ndarray):
        Confusion matrix with one row per cluster of y_pred and one column per
        cluster of y_true (both in ascending label order). If return_handle is
        True, a tuple (matrix, ax) is returned instead, where ax is None when
        plot is False.
    """

    def prepare_labels(labels, y):
        if labels is None:
            # Fall back to the cluster labels themselves
            return np.unique(y).tolist()
        # Mark only the labels that were actually shortened
        return [k if len(k) <= 20 else k[0:20] + ".." for k in labels]

    true_labels = prepare_labels(true_labels, y_true)
    pred_labels = prepare_labels(pred_labels, y_pred)

    # Rows correspond to the distinct y_true labels, columns to the distinct
    # y_pred labels; a cross-tabulation stays correct when the labels of the
    # two partitions are not the same contiguous range 0..k-1
    c = pd.crosstab(np.asarray(y_true), np.asarray(y_pred)).to_numpy()

    if normalize_to == 1:
        # Normalise by the number of points in each y_true cluster
        c = c / c.sum(axis=1, keepdims=True)
    elif normalize_to == 0:
        # Normalise by the number of points in each y_pred cluster
        c = c / c.sum(axis=0, keepdims=True)

    ax = None
    if plot:
        # y_true = columns, y_pred = rows
        cdf = pd.DataFrame(
            data=np.round(c.T, 2), columns=true_labels, index=pred_labels
        )
        ax = sns.heatmap(cdf, annot=True, cmap="Blues", square=True)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        # Fix for matplotlib bug that cuts off top/bottom of seaborn viz
        b, t = plt.ylim()  # Discover the values for bottom and top
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)  # update the ylim(bottom, top) values
        if not return_handle:
            plt.show()

    if return_handle:
        return c.T, ax
    return c.T
def plot_sorted_matrix(C, cluster_labels):
    """
    Reorders both the rows and the columns of a matrix according to the
    ascending order of the provided cluster labels, displays the result and
    returns it.

    Parameters
    ----------
    C (numpy.ndarray):
        Square matrix to be reordered.
    cluster_labels (list of int):
        Cluster label for each row/column of C.

    Returns
    -------
    (numpy.ndarray):
        Reordered copy of C; the input matrix is left untouched.
    """
    order = np.argsort(cluster_labels)
    rows_sorted = C.copy()[order, :]
    sorted_matrix = rows_sorted[:, order]

    plt.imshow(sorted_matrix, cmap="inferno_r")
    plt.colorbar()
    plt.show()
    return sorted_matrix
########## HIERARCHICAL CLUSTERING PIPELINE ##########
def subcluster_nodes(
    W,
    l,
    clusters,
    nearest_neighbours,
    fpath,
    session_name,
    N_nn,
    N,
    N_consensus,
    random_state=None,
    edge_bootstrap=False,
):
    """
    Method that selects a particular partition (clustering), takes this
    partition's clusters and sub-clusters them further (i.e., splits them
    apart into small clusters) using the ConsensusClustering class defined
    above.

    Note that the first level of the hierarchy that will be sub-clustered
    needs to be set up manually. We normally refer to it as the 0-th level and
    assign all data points to the same cluster with a label 0. However, the
    0-th level can also be used to exclude some points from the clustering
    analysis (e.g., the most central skills nodes).

    Consult the provided workflow examples on how to set up the whole pipeline
    from a similarity matrix to a hierarchical set of clusters.

    Parameters
    ----------
    W (numpy.ndarray):
        Similarity matrix which is used to construct the graph.
    l (int):
        Level of clustering hierarchy that is sub-clustered further; this is
        used to find the file with the particular partitioning.
    clusters (list of int OR 'all'):
        If it's a list of integers then it sub-clusters only the clusters with
        the integer label that is contained in this list. If cluster=='all',
        then all clusters of Level l are sub-clustered further.
    nearest_neighbours (list of int OR ['all']):
        If this is a list of integers, then the method constructs k-nearest
        neighbour graphs using each of the integers in this list as k, and
        detects communities in these graphs. Then, the results with different
        k values are pooled together for the consensus clustering step. If
        nearest_neighbours=['all'] then all non-zero values of W are used to
        construct the graph.
    fpath (string):
        File path where to save the clustering results. It is used as a plain
        string prefix, so it should end with a path separator.
    session_name (string):
        Name of this clustering session to be used when saving output files.
    N_nn (int):
        Number of clustering runs for each nearest neighbour value in
        'nearest_neighbours'; normally we always set
        N_nn = N//len(nearest_neighbours)
    N (int):
        See the description of 'N' in ConsensusClustering.__init__()
    N_consensus (int):
        See the description of 'N_consensus' in ConsensusClustering.__init__()
    random_state (int):
        See the description of 'seed' in ConsensusClustering.__init__()
    edge_bootstrap (boolean):
        See the description of 'edge_bootstrap' in
        ConsensusClustering.__init__()

    Returns
    -------
    The method does not return anything but instead saves output files with
    the results in the designated location. The following outputs are saved:
        1. CSV table with the complete ensemble of clusterings (from step 1 of
        the clustering procedure; see ConsensusClustering class description).
        2. CSV table with the complete ensemble of consensus clusterings (from
        step 2 of the clustering procedure; see ConsensusClustering class
        description).
        3. CSV table with the sub-clusters for each processed cluster of
        Level l. The table has two columns 'id' and 'cluster', where 'id' is
        the original node id and 'cluster' are the new cluster labels.
        4. NPY file that stores the co-clustering occurrence matrix derived
        from the clustering ensemble from step 1 of the clustering procedure
        (can be useful for sub-sequent stability assessments; note, however,
        that it can be on the order of GBs for large graphs).

    Raises
    ------
    ValueError:
        If 'nearest_neighbours' is empty or contains a value that is neither
        an integer nor the string 'all'.
    """
    if len(nearest_neighbours) == 0:
        raise ValueError("nearest_neighbours must contain at least one value")

    level_tag = "_Level" + str(l)
    source_path = fpath + session_name + "_clusters" + level_tag + ".csv"
    partition = pd.read_csv(source_path)

    if isinstance(clusters, str) and clusters == "all":
        clusters = list(np.sort(partition.cluster.unique()))

    ######## Manage random seeds
    # One seed per cluster is derived from random_state through numpy's
    # global random state
    if random_state is not None:
        np.random.seed(random_state)
        cluster_seeds = np.random.randint(100000000, size=len(clusters))
    else:
        cluster_seeds = None
    ############################

    for cc, c in enumerate(clusters):
        print("==============")
        print(f"Partitioning cluster {c}...")

        ######## Manage random seeds
        # One seed per nearest neighbour value, derived from the cluster seed
        if cluster_seeds is not None:
            np.random.seed(cluster_seeds[cc])
            nn_seeds = np.random.randint(100000000, size=len(nearest_neighbours))
        else:
            nn_seeds = None
        ############################

        vertices = partition[partition.cluster == c].id.to_list()
        # Similarities among the nodes of this cluster only; indexing both
        # axes at once avoids copying the complete matrix for every cluster
        W_ = W[np.ix_(vertices, vertices)]

        ensemble_nn = []
        # Get ensembles for each nearest neighbour value
        for ii, nn in enumerate(nearest_neighbours):
            nn_seed = nn_seeds[ii] if nn_seeds is not None else None

            if isinstance(nn, (int, np.integer)) and not isinstance(nn, bool):
                # Create an undirected graph based on the nodes to be
                # clustered and the kNN value
                g = build_graph(W_, kNN=int(nn), self_connections=False)
                print(f"Clustering graph with {nn} nearest-neighbours...")
            elif isinstance(nn, str) and nn == "all":
                g = build_graph(W_, kNN=(W_ != 0).astype(int), self_connections=False)
                print(f"Clustering graph...")
            else:
                # Without this check a graph from a previous iteration would
                # be silently re-used
                raise ValueError(
                    "nearest_neighbours values must be integers or 'all', "
                    f"got {nn!r}"
                )

            g.vs["id"] = vertices

            t = time()
            clust = ConsensusClustering(
                g, N_nn, N_consensus, seed=nn_seed, edge_bootstrap=edge_bootstrap
            )
            ensemble = clust.create_ensemble()
            ensemble_nn += ensemble
            t_elapsed = time() - t
            print(f"Elapsed time: {round(t_elapsed): .2f} seconds")

        # Perform the consensus clustering on the combined ensembles
        print("Clustering the consensus partition...")
        t = time()
        clust = ConsensusClustering(g, N, N_consensus, seed=random_state)
        clust.load_ensemble(ensemble_nn)
        clust.consensus_communities()
        t_elapsed = time() - t
        print(f"Elapsed time: {round(t_elapsed)} seconds")

        # Dump the results
        node_ids = g.vs["id"]
        out_prefix = fpath + session_name
        out_suffix = level_tag + "_Cluster" + str(c)

        # Build each table in one go (one column per partition). Iterating
        # over the stored ensemble rather than range(N) keeps this correct
        # when N differs from N_nn * len(nearest_neighbours)
        ensemble_columns = {"id": node_ids}
        for i, membership in enumerate(clust.ensemble):
            ensemble_columns[str(i)] = membership
        pd.DataFrame(ensemble_columns).to_csv(
            out_prefix + "_cluster_ensemble" + out_suffix + ".csv", index=False
        )

        consensus_columns = {"id": node_ids}
        for i, membership in enumerate(clust.consensus_ensemble):
            consensus_columns[str(i)] = membership
        pd.DataFrame(consensus_columns).to_csv(
            out_prefix + "_consensus_ensemble" + out_suffix + ".csv", index=False
        )

        partition_df = pd.DataFrame(
            {"id": node_ids, "cluster": clust.consensus_partition}
        )
        partition_df.to_csv(
            out_prefix + "_subclusters" + out_suffix + ".csv", index=False
        )

        # Save the co-occurrence matrix
        np.save(out_prefix + "_COOC" + out_suffix + ".npy", clust.COOC)
def collect_subclusters(l, fpath, session_name, n_total=None):
    """
    Method that collects the sub-clusters of clustering hierarchy level l into
    one table thus yielding the clusters of the clustering hierarchy level
    l+1. Note that the tables describing the sub-clusters need to follow a
    specific naming convention, i.e., their file names have to start with
    '<session_name>_subclusters_Level<l>_'.

    Parameters
    ----------
    l (int):
        Level of clustering hierachy whose sub-clusters we wish to collect.
    fpath (string):
        File path where the clustering results are saved at. It is used as a
        plain string prefix, so it should end with a path separator.
    session_name (string):
        Name of the clustering session that is being analysed.
    n_total (int OR None):
        Total number of data points/nodes in the dataset. If provided and
        different from the number of collected nodes, the remaining nodes
        (ids in range(n_total) that are in no sub-cluster table) are assigned
        to cluster 0 and all other cluster labels are shifted by one. If None,
        only the collected nodes are included.

    Returns
    -------
    partition (pandas.DataFrame):
        Final partition of the whole dataset of the level l+1 of the
        clustering hierarchy. Contains two columns 'id' and 'cluster' where
        'id' are the original node IDs and 'cluster' is the cluster label.
        This table is also stored as a CSV table in the same folder as the
        sub-cluster tables.

    Raises
    ------
    FileNotFoundError:
        If no sub-cluster table of level l is found in 'fpath'.
    """
    # Find all the sub-cluster tables (note that they are following a specific
    # naming convention). The trailing underscore keeps, e.g., Level1 from
    # also matching the tables of Level10.
    prefix = session_name + "_subclusters_Level" + str(l) + "_"
    # NOTE: the order is lexicographic (e.g. 'Cluster10' precedes 'Cluster2');
    # it determines how the new cluster labels are numbered
    file_list = sorted(f for f in os.listdir(fpath) if f.startswith(prefix))
    if not file_list:
        raise FileNotFoundError(
            f"No sub-cluster tables starting with '{prefix}' found in {fpath}"
        )

    # Concatenate all subcluster tables and make sure that the cluster labels
    # are correct, i.e., offset the labels of each table by the number of
    # clusters collected so far
    k = 0
    tables = []
    for f in file_list:
        data = pd.read_csv(fpath + f)
        n_c = len(data.cluster.unique())
        data.cluster = data.cluster + k
        tables.append(data)
        k += n_c
    all_data = pd.concat(tables)

    if n_total is None:
        # Nothing to pad: the partition consists of the collected nodes only
        partition = all_data
    else:
        needs_padding = len(all_data) != n_total
        if needs_padding:
            # Reserve the label 0 for the nodes that have not been included
            # in the subclusters (e.g., these will be the central nodes in
            # the case of Level-1 TSC clustering)
            all_data.cluster = all_data.cluster + 1
        partition = pd.DataFrame(data={"id": list(range(n_total))})
        partition = partition.merge(all_data, on="id", how="left")
        if needs_padding:
            # The left merge introduces NaNs (and hence floats) for the
            # missing nodes; restore integer labels after filling them in
            partition["cluster"] = partition["cluster"].fillna(0).astype(int)

    # Store the final partition as a CSV table
    partition = partition[["id", "cluster"]].sort_values("id").reset_index(drop=True)
    filename = fpath + session_name + "_clusters_Level" + str(l + 1) + ".csv"
    partition.to_csv(filename, index=False)
    print(f"Final partition saved in {filename}")
    return partition