Source code for graphico.graphico

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan  8 13:37:56 2020
@author: karliskanders
Last updated on 01/04/2020
"""

import leidenalg as la
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami_score
from sklearn.metrics import confusion_matrix

from matplotlib import pyplot as plt
import seaborn as sns
import igraph as ig
from time import time
import os

class ConsensusClustering:
    """
    Class for determining stable clustering of data by using a 3-step process.

    First, an ensemble of clustering results is generated by repeatedly applying
    a clustering algorithm many times (step 1).
    Then, the ensemble is used to define new edge weights between the graph nodes
    based on the data point co-clustering occurrences. These weights are then used
    to generate another "consensus ensemble", which in practice is very stable and
    exhibits only minor variations between different clustering runs (step 2).
    To decide which one of the partitions among the "consensus ensemble" should
    be designated as the final consensus partition, we use adjusted mutual information
    to compare all partitions within the consensus ensemble, and choose the one
    which agrees the best with all of the other partitions (step 3).

    Presently, we use the Leiden community detection algorithm for clustering the
    graph into communities. However, this class can be easily adapted to use other
    graph-based clustering algorithms.

    The consensus clustering approach used here is an adapted version of the
    intuitively simple but well-performing "Ensemble Clustering for Graphs" method
    by Poulin & Theberge (see https://arxiv.org/abs/1809.05578).
    """

    def __init__(
        self, graph, N=20, N_consensus=10, verbose=True, seed=None, edge_bootstrap=False
    ):
        """
        Parameters
        ----------
        graph (igraph.Graph):
            Graph object that will be used for finding graph communities.
        N (int):
            Ensemble size for the first clustering step (normally use 500-1000).
        N_consensus (int):
            Ensemble size for the consensus clustering step.
        verbose (boolean):
            Determines whether user is informed about the progress regarding the
            intermediate steps of the clustering procedure.
        seed (int):
            Seed for the random number generator; useful for reproducing the exact
            same clustering result. This seed is then used to generate all other
            random seeds for each repeated clustering run. Note that it re-seeds
            the global NumPy random state.
        edge_bootstrap (boolean):
            Determines whether edge bootstrapping is used for generating the
            clustering ensemble.
        """
        self.graph = graph
        self.directed = graph.is_directed()
        self.N = N
        self.N_consensus = N_consensus
        self.v = verbose
        self.edge_bootstrap = edge_bootstrap

        # Hard-coded parameter for consensus clustering (step 2) from the Poulin &
        # Theberge publication; it is stored here but not applied by this class.
        self.w_min = 0.05

        # List of lists containing the ensemble of step 1 clustering results
        self._ensemble = None

        # List of lists containing the ensemble of step 2 clustering results
        self._consensus_ensemble = None

        # Clustering co-occurrence matrix
        self._COOC = None

        # Final consensus clustering partition
        self._consensus_partition = None

        # Manage random seeds: one seed per step-1 run plus one seed that is
        # handed to the step-2 (consensus) clustering.
        if seed is not None:
            if self.v:
                print("Setting random seeds...")
            np.random.seed(seed)
            self.ensemble_seeds = np.random.randint(100000000, size=N)
            self.consensus_ensemble_seed = np.random.randint(100000000)
        else:
            self.ensemble_seeds = None
            self.consensus_ensemble_seed = None

    @property
    def ensemble(self):
        """
        List of clustering results (pertaining to step 1 of the clustering
        procedure), where each clustering result is a list of integers. These
        integers correspond to cluster labels. Generated lazily on first access.
        """
        if self._ensemble is None:
            # Generate ensemble of self.N partitions
            self._ensemble = self.create_ensemble(self.N, weights="weight")
        return self._ensemble

    @property
    def COOC(self):
        """
        Co-clustering occurrence matrix: element (i,j) of this matrix indicates
        how many times nodes i and j were clustered together.
        """
        if self._COOC is None:
            # Calculate the co-occurrence matrix from the ensemble
            self._COOC = self.cooccurrence_matrix(self.ensemble)
        return self._COOC

    @property
    def consensus_ensemble(self):
        """
        List of consensus clustering results (pertaining to step 2 of the clustering
        procedure) where each clustering result is a list of integers. These
        integers correspond to cluster labels.
        """
        if self._consensus_ensemble is None:
            # Nodes are connected whenever they were co-clustered at least once
            adjacency = (self.COOC != 0).astype(int)
            if self.v:
                print("Using co-occurrence matrix to do consensus clustering...")
            # Normalise by the actual ensemble size, which may differ from
            # self.N when the ensemble was supplied through load_ensemble().
            n_partitions = len(self.ensemble)
            # Create a new graph and find communities in this new graph
            g_cooc = build_graph(self.COOC / n_partitions, kNN=adjacency)
            clust_cooc = ConsensusClustering(
                g_cooc,
                N=self.N_consensus,
                verbose=self.v,
                seed=self.consensus_ensemble_seed,
            )
            self._consensus_ensemble = clust_cooc.create_ensemble()
        return self._consensus_ensemble

    def load_ensemble(self, ensemble, consensus=False):
        """
        This method can be used to load an external ensemble. For example,
        you might have stored an ensemble of clustering results from a previous
        analysis and would now like to recalculate the consensus partition.

        Parameters
        ----------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.
        consensus (boolean):
            Determines whether the ensemble should be treated as the initial
            ensemble (from step 1) or the consensus ensemble (from step 2).
        """
        if consensus:
            self._consensus_ensemble = ensemble
        else:
            self._ensemble = ensemble

    def create_ensemble(self, N=None, weights="weight"):
        """
        Generates ensemble of clustering partitions by repeatedly applying
        a clustering algorithm many times.

        Parameters
        ----------
        N (int OR None):
            Ensemble size for the first clustering step. If N==None, use the
            class property self.N
        weights (string OR None):
            Edge property to use for the community detection

        Returns
        -------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.

        Raises
        ------
        ValueError:
            If the object was seeded and N exceeds the number of seeds that
            were drawn in __init__().
        """
        if N is None:
            N = self.N
        if self.ensemble_seeds is not None and N > len(self.ensemble_seeds):
            raise ValueError(
                f"Requested {N} partitions but only {len(self.ensemble_seeds)} "
                "random seeds were generated when the object was created."
            )

        # The normalised edge weights do not change between runs, so they are
        # computed once; they act as the probability of keeping each edge.
        keep_probability = None
        if self.edge_bootstrap and N > 0:
            edge_weights = np.array(self.graph.es[weights])
            keep_probability = edge_weights / np.max(edge_weights)

        ensemble = []
        if self.v:
            print(f"Generating an ensemble with {N} partitions...")
        for i in range(N):
            # Choose random seed for the clustering
            if self.ensemble_seeds is not None:
                ensemble_seed = int(self.ensemble_seeds[i])
            else:
                ensemble_seed = None

            # Bootstrapping by removing edges: an edge is dropped when a uniform
            # random draw exceeds its normalised weight, i.e. heavier edges are
            # more likely to survive.
            if keep_probability is not None:
                graph_ = self.graph.copy()
                rand_numbers = np.random.rand(len(keep_probability))
                id_to_delete = np.where(rand_numbers > keep_probability)[0]
                graph_.delete_edges(id_to_delete.tolist())
            else:
                graph_ = self.graph

            # Community detection
            p = la.find_partition(
                graph_,
                weights=weights,
                partition_type=la.ModularityVertexPartition,
                seed=ensemble_seed,
            )
            ensemble.append(p.membership)
            if self.v:
                print("x", end="")
        if self.v:
            print("")
        return ensemble

    @staticmethod
    def cooccurrence_matrix(ensemble):
        """
        Create the co-clustering occurrence matrix (also called 'cooccurrence matrix');
        This can be quite slow for large graphs with ~10K nodes and probably could
        be optimized, e.g., with numba.

        Parameters
        ----------
        ensemble (list of lists of int):
            List of clustering results, where each clustering result is a list
            of integers. These integers correspond to cluster labels.

        Returns
        -------
        COOC (numpy.ndarray):
            Symmetric matrix whose element (i,j) counts the partitions in which
            nodes i and j share a cluster label; the diagonal is zero.
        """
        if len(ensemble) == 0:
            raise ValueError("Cannot build a co-occurrence matrix from an empty ensemble.")

        n = len(ensemble[0])
        COOC = np.zeros((n, n))
        node_ids = list(range(n))
        # For each clustering result in the ensemble
        for membership in ensemble:
            # Use pandas to find node pairs with the same cluster labels
            membership_df = pd.DataFrame(data={"id": node_ids, "cluster": membership})
            pairs = membership_df.merge(right=membership_df, on="cluster")
            # Keep each unordered pair once (upper triangle, no self pairs)
            pairs = pairs[pairs["id_x"] < pairs["id_y"]]
            # For each node pair with the same cluster labels, add 1 to the
            # co-clustering occurrence matrix
            COOC[pairs["id_x"].to_numpy(), pairs["id_y"].to_numpy()] += 1
        # Mirror the upper triangle to obtain a symmetric matrix
        COOC = COOC + np.triu(COOC).T
        return COOC

    @property
    def consensus_partition(self):
        """
        Final consensus partition of the clustering procedure
        """
        if self._consensus_partition is None:
            self.consensus_communities()
        return self._consensus_partition

    def consensus_communities(self):
        """
        Method for finding the consensus clustering partition, i.e.,
        for the steps 2-3 of the clustering procedure.

        Sets self.consensus_AMI, self._consensus_partition, self.n and self.sizes.
        """

        # Measure the stability of the consensus ensemble. If the consensus ensemble
        # has not been generated yet, it will be by calling the self.consensus_ensemble
        self.consensus_AMI, AMI_matrix = self.ensemble_AMI(
            self.consensus_ensemble, v=self.v
        )

        # Take "the most agreeable" partition as the final consensus clustering
        # partition (i.e., step 3)
        mean_ami = np.mean(AMI_matrix, axis=1)
        most_agreeable = np.argsort(mean_ami)[-1]
        self._consensus_partition = self.consensus_ensemble[most_agreeable]

        # Describe the final consensus clustering partition
        char = self.describe_partition(self._consensus_partition, self.v)
        self.n = char["n"]
        self.sizes = char["sizes"]

    @staticmethod
    def describe_partition(partition, verbose=True):
        """
        Describes the number of clusters and the number of nodes in each cluster

        Parameters
        ----------
        partition (list of int):
            Clustering partition with integers denoting cluster labels.
        verbose (boolean):
            Determines whether a one-line summary is printed.

        Returns
        -------
        (dict):
            'n' is the number of distinct cluster labels and 'sizes' lists the
            number of nodes per label, ordered by ascending label value.
        """
        partition = np.array(partition)
        # Count the members of each label that is actually present, so that
        # non-contiguous label sets are described correctly as well.
        clusters, counts = np.unique(partition, return_counts=True)
        n = len(clusters)
        sizes = list(counts)

        if verbose:
            print(f"Clustering with {len(partition)} nodes and {n} clusters.")

        return {"n": n, "sizes": sizes}

    @staticmethod
    def ensemble_AMI(P, v=True):
        """
        Calculates pairwise adjusted mutual information (AMI) scores across
        the clustering ensemble.

        Parameters
        ----------
        P (list of lists of int):
            Clustering ensemble, i.e., a list of clustering results, where each
            clustering result is a list of integers. These integers correspond
            to cluster labels.
        v (boolean):
            Determines whether information about the results is printed.

        Returns
        -------
        ami_avg (float):
            Average adjusted mutual information across the ensemble (NaN when
            the ensemble holds fewer than two partitions).
        ami_matrix (numpy.ndarray):
            The complete matrix with adjusted mutual information scores between
            all pairs of clustering results
        """

        # If P is not a list of lists but a partition module instead, extract the lists of memberships
        if isinstance(P[0], la.VertexPartition.ModularityVertexPartition):
            P = [e.membership for e in P]

        n_partitions = len(P)
        ami_matrix = np.zeros((n_partitions, n_partitions))
        # Only the strict upper triangle needs to be computed: the score is
        # symmetric and the diagonal is set to 1 below.
        for i in range(n_partitions):
            for j in range(i + 1, n_partitions):
                ami_matrix[i, j] = ami_score(P[i], P[j], average_method="arithmetic")

        ami_matrix = ami_matrix + ami_matrix.T
        np.fill_diagonal(ami_matrix, 1)
        pairwise = ami_matrix[np.triu_indices_from(ami_matrix, k=1)]
        ami_avg = np.mean(pairwise) if pairwise.size else np.nan

        if v:
            print(
                f"Average pairwise AMI across {len(P)} partitions is {np.round(ami_avg,4)}"
            )

        return ami_avg, ami_matrix


def build_kNN_matrix(similarity_matrix, kNN, self_connections=False):
    """
    Method for building k-nearest neighbour adjacency matrix

    Parameters
    ----------
    similarity_matrix (numpy.ndarray):
        Matrix with similarity values for each pair of nodes/data points
    kNN (int):
        Number of nearest neighbours for each node/data point
    self_connections (boolean):
        Determines whether self connections are included as part of the k-nearest
        neighbours.

    Returns
    -------
    kNN_matrix (numpy.ndarray):
        Symmetric binary matrix with 1s for each node's k-nearest neighbours
        and 0s otherwise. A connection is present if either node lists the
        other among its nearest neighbours.
    """
    kNN_matrix = np.zeros(similarity_matrix.shape)
    if not self_connections:
        # Work on a copy so that the caller's matrix is left untouched
        similarity_matrix = similarity_matrix.copy()
        np.fill_diagonal(similarity_matrix, 0)

    for i in range(similarity_matrix.shape[0]):
        # Node indices ordered from the most to the least similar
        ranked = np.flip(np.argsort(similarity_matrix[i, :]))
        if not self_connections:
            # Zeroing the diagonal is not enough: when fewer than kNN entries
            # rank above zero the node itself could still be selected, so it is
            # removed from the candidates explicitly.
            ranked = ranked[ranked != i]
        closest = ranked[0:kNN]
        kNN_matrix[i, closest] = 1
        kNN_matrix[closest, i] = 1
    return kNN_matrix


def build_graph(similarity_matrix, kNN=None, self_connections=False):
    """
    Builds an igraph.Graph from the provided similarity matrix and adjacency matrix.

    Parameters
    ----------
    similarity_matrix (numpy.ndarray):
        Matrix with similarity values for each pair of nodes/data points (diagonal
        is assumed to be 1).
    kNN (int OR numpy.ndarray OR None):
        If kNN is an int, this method builds a k-nearest neighbour graph with k=kNN;
        if kNN is a matrix, it assumed to be the adjacency matrix of the graph;
        if kNN is None, we allow all possible connections when creating the graph.
    self_connections (boolean):
        Determines whether self connections are included as part of the k-nearest
        neighbours. It is not applied when kNN is an adjacency matrix, which is
        used exactly as given.

    Returns
    -------
    g (igraph.Graph):
        Undirected graph with one vertex per row of 'similarity_matrix', where
        edges have property 'weight' corresponding to the values of the
        'similarity_matrix'.

    Raises
    ------
    TypeError:
        If kNN is neither an integer, a numpy.ndarray nor None.
    """

    if kNN is None:
        # Uses all connections
        kNN_matrix = np.ones((similarity_matrix.shape))
        if not self_connections:
            np.fill_diagonal(kNN_matrix, 0)
    elif isinstance(kNN, np.ndarray):
        # Assumes that an adjacency matrix has been provided
        kNN_matrix = kNN
    elif isinstance(kNN, (int, np.integer)) and not isinstance(kNN, bool):
        # Builds a symmetric, undirected k-nearest neighbour graph with k=kNN
        kNN_matrix = build_kNN_matrix(similarity_matrix, kNN, self_connections)
    else:
        raise TypeError(
            "kNN must be an int, a numpy.ndarray adjacency matrix or None, "
            f"got {type(kNN).__name__}"
        )

    print("Building the graph... ", end="")
    # Only the upper triangle (including the diagonal) is used so that every
    # undirected edge is created exactly once.
    sources, targets = np.triu(kNN_matrix).nonzero()
    weights = np.asarray(similarity_matrix)[sources, targets]
    edgelist = list(zip(sources.tolist(), targets.tolist()))
    # The vertex count is given explicitly; otherwise it would be inferred from
    # the largest edge index and trailing isolated nodes would be lost.
    g = ig.Graph(n=similarity_matrix.shape[0], edges=edgelist, directed=False)
    g.es["weight"] = weights
    print("done!")
    return g


def node_affinity(cooc_matrix, cluster_labels, normalise=True):
    """
    Estimate each node's affinity to the different clusters based on the
    ensemble clustering results (if normalise==True, then this can be interpreted
    as the probability of node belonging to the particular cluster).

    Parameters
    ----------
    cooc_matrix (numpy.ndarray):
        Co-clustering occurrence matrix (see also ConsensusClustering.COOC).
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    normalise (boolean):
        Determines whether node affinities to clusters are normalised by the sum of rows.

    Returns
    -------
    M (numpy.ndarray):
        Node affinity matrix with rows corresponding to nodes and columns to
        clusters (ordered by ascending cluster label). Matrix elements (i,c)
        indicate the average co-clustering occurrence value between node i and
        all nodes in the cluster c (in terms of either absolute or normalised
        values). Rows that sum to zero are left as zeros when normalising.
    """

    # Convert to arrays so that plain Python lists can be compared elementwise
    labels = np.asarray(cluster_labels)
    cooc = np.asarray(cooc_matrix)
    clust = np.unique(labels)
    M = np.zeros((len(labels), len(clust)))

    # Calculate node affinity score for each node with respect to each cluster;
    # columns follow the sorted unique labels rather than assuming 0..n-1.
    for col, label in enumerate(clust):
        M[:, col] = cooc[:, labels == label].mean(axis=1)

    # Normalise rows, skipping all-zero rows to avoid a division by zero
    if normalise:
        row_sums = M.sum(axis=1, keepdims=True)
        np.divide(M, row_sums, out=M, where=row_sums != 0)

    return M


def node_affinity_plot(M, cluster_labels, aspect_ratio=0.002, return_matrix=False):
    """
    Plot the node affinity matrix created using node_affinity() method.

    Nodes are grouped by cluster and, within each cluster, ordered by
    descending affinity to their own cluster. Whole rows are reordered, so
    every row of the displayed matrix still describes a single node.

    Parameters
    ----------
    M (numpy.ndarray):
        Node affinity matrix.
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    aspect_ratio (float):
        Needs to be adjusted for properly displaying the node affinity matrix.
    return_matrix (boolean):
        Determines whether the function returns the sorted affinity matrix.

    Returns
    -------
    M_sorted (numpy.ndarray):
        The row-sorted affinity matrix; only returned if return_matrix is True.
    """

    labels = np.asarray(cluster_labels)

    # Build the row order cluster by cluster; column `col` of M is taken to
    # belong to the col-th smallest label, as produced by node_affinity().
    row_order = []
    for col, label in enumerate(np.unique(labels)):
        members = np.flatnonzero(labels == label)
        own_affinity = M[members, col]
        row_order.append(members[np.argsort(-own_affinity, kind="stable")])
    row_order = np.concatenate(row_order) if row_order else np.array([], dtype=int)
    M_sorted = M[row_order, :]

    # The figure size has to be passed at creation time to take effect
    fig, ax = plt.subplots(figsize=(12, 12))
    plt.imshow(M_sorted, cmap="Blues")
    plt.colorbar()
    ax.set_aspect(aspect_ratio)
    plt.xlabel("cluster")
    plt.ylabel("node")
    plt.title("Node affinity to cluster")
    plt.show()

    if return_matrix:
        return M_sorted


def cluster_affinity_matrix(M, cluster_labels, symmetric=True, plot=True, cmap="Blues"):
    """
    Calculate each cluster's affinity to other clusters based on their constituent
    nodes' affinities to the different clusters.

    Parameters
    ----------
    M (numpy.ndarray):
        Node affinity matrix.
    cluster_labels (list of int):
        Clustering partition with integers denoting cluster labels.
    symmetric (boolean):
        If True, ensures that the cluster affinity matrix is symmetric.
    plot (boolean):
        Determines whether the cluster affinity matrix is displayed.
    cmap (string):
        Name of the matplotlib colour map used when plotting.

    Returns
    -------
    C (numpy.ndarray):
        Cluster affinity matrix, where elements (k,l) indicates the average
        co-clustering occurrence of cluster k nodes with the nodes of cluster l.
        Rows and columns are ordered by ascending cluster label.
    """

    # Convert to an array so that plain Python lists can be compared elementwise
    labels = np.asarray(cluster_labels)
    unique_labels = np.unique(labels)
    n_clust = len(unique_labels)
    C = np.zeros((n_clust, n_clust))
    # Row k averages the affinity rows of the nodes carrying the k-th label
    for row, label in enumerate(unique_labels):
        C[row, :] = M[labels == label][:, :n_clust].mean(axis=0)

    if symmetric:
        C = 0.5 * C + 0.5 * C.T

    if plot:
        plt.imshow(C, cmap=cmap)
        plt.xlabel("cluster")
        plt.ylabel("cluster")
        plt.colorbar()
        plt.title("Cluster affinity to other clusters")
        plt.show()
    return C


def list_cluster_stability(C, cluster_labels=None):
    """
    Print the diagonal of a cluster affinity matrix, one line per cluster.

    The diagonal values can be read as a measurement of cluster stability.

    Parameters
    ----------
    C (numpy.ndarray):
        Cluster affinity matrix.
    cluster_labels (list of str OR None):
        Optional text shown at the end of each line; empty strings are
        used if None.
    """
    stability = C.diagonal()
    if cluster_labels is None:
        names = [""] * len(stability)
    else:
        names = cluster_labels
    for idx, value in enumerate(stability):
        print(f"{np.round(value,2)}, (cluster {idx}), {names[idx]}")


####### TO DO TO DO TO DO TO DO TO DO ########
def plot_confusion_matrix(
    y_true,
    y_pred,
    true_labels=None,
    pred_labels=None,
    normalize_to=None,
    plot=True,
    return_handle=False,
):
    """
    Compares two different partitionings of data, i.e., two separate clusterings,
    using a matrix where the entry (k,l) indicates the correspondence between
    cluster k of the first partition and cluster l of the second partition.
    For example, if normalize_to==None, the (k,l) entry will show how many points
    were assigned both to cluster k and cluster l.
    Another popular use case is a classification task, where we would compare
    the predicted labels (first partition) and true labels (second partition).

    Parameters
    ----------
    y_true (list of int):
        First partition; a list of integers denoting cluster labels
    y_pred (list of int):
        Second partition; a list of integers denoting cluster labels
    true_labels:
        Text labels describing the clusters in y_true partition; labels longer
        than 20 characters are truncated and marked with '..'
    pred_labels:
        Text labels describing the clusters in y_pred partition; labels longer
        than 20 characters are truncated and marked with '..'
    normalize_to (int OR None):
        Can take the values 0, 1 or None; determines whether the values of the
        confusion matrix are normalised with respect to the total number of points
        in clusters of the y_true partition (normalize_to=1) or y_pred partition
        (normalize_to=0), or if the values are not normalised at all (normalize_to=None).
    plot (boolean):
        Determines whether the matrix is drawn as an annotated heatmap.
    return_handle (boolean):
        If True, the heatmap axes are returned together with the matrix and
        the figure is not shown by this function.

    Returns
    -------
    (numpy.ndarray):
        Confusion matrix with rows corresponding to the y_pred clusters and
        columns to the y_true clusters. If return_handle is True, a tuple of
        the matrix and the heatmap axes (None when plot is False) is returned.
    """

    def prepare_labels(labels, y):
        # Fall back to integer positions when no text labels are supplied
        if labels is None:
            return list(range(len(np.unique(y))))
        # Only mark labels that were actually shortened
        return [k if len(k) <= 20 else k[0:20] + ".." for k in labels]

    true_labels = prepare_labels(true_labels, y_true)
    pred_labels = prepare_labels(pred_labels, y_pred)

    # Rows of c correspond to y_true and columns to y_pred
    c = confusion_matrix(y_true=y_true, y_pred=y_pred)
    # Trim to the number of distinct labels in each partition
    # NOTE(review): this presumably assumes labels are 0..n-1 in both
    # partitions - confirm against the callers.
    c = c[0 : len(np.unique(y_true))]
    c = c[:, 0 : len(np.unique(y_pred))]

    if normalize_to == 1:
        # Normalise by the size of each y_true cluster (row sums)
        counts = np.sum(c, axis=normalize_to)
        c = np.divide(c.T, counts.T).T
    elif normalize_to == 0:
        # Normalise by the size of each y_pred cluster (column sums)
        counts = np.sum(c, axis=normalize_to)
        c = np.divide(c, counts)

    ax = None
    if plot:
        cdf = pd.DataFrame(
            data=np.round(c.T, 2), columns=true_labels, index=pred_labels
        )
        ax = sns.heatmap(cdf, annot=True, cmap="Blues", square=True)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

        # Fix for matplotlib bug that cuts off top/bottom of seaborn viz
        b, t = plt.ylim()  # Discover the values for bottom and top
        b += 0.5  # Add 0.5 to the bottom
        t -= 0.5  # Subtract 0.5 from the top
        plt.ylim(b, t)  # update the ylim(bottom, top) values

        # When the handle is requested the caller decides when to show the figure
        if not return_handle:
            plt.show()

    if return_handle:
        return c.T, ax
    return c.T


def plot_sorted_matrix(C, cluster_labels):
    """
    Reorder both the rows and the columns of a matrix by cluster label,
    display the result as a heatmap and return the reordered matrix.

    Parameters
    ----------
    C (numpy.ndarray):
        Square matrix whose rows/columns correspond to the labelled items.
    cluster_labels (list of int):
        Cluster label of each row/column.

    Returns
    -------
    (numpy.ndarray):
        Copy of C with rows and columns permuted into label order.
    """
    order = np.argsort(cluster_labels)
    # Apply the same permutation to both axes in a single indexing step
    reordered = C[np.ix_(order, order)]
    plt.imshow(reordered, cmap="inferno_r")
    plt.colorbar()
    plt.show()

    return reordered


########## HIERARCHICAL CLUSTERING PIPELINE ##########


def _subcluster_output_path(fpath, session_name, kind, level, cluster, extension):
    """Compose the output file name used for one sub-clustered cluster."""
    return f"{fpath}{session_name}_{kind}_Level{level}_Cluster{cluster}{extension}"


def subcluster_nodes(
    W,
    l,
    clusters,
    nearest_neighbours,
    fpath,
    session_name,
    N_nn,
    N,
    N_consensus,
    random_state=None,
    edge_bootstrap=False,
):

    """
    Method that selects a particular partition (clustering), takes this
    partition's clusters and sub-clusters them further (i.e., splits them apart
    into small clusters) using the ConsensusClustering class defined above.

    Note that the first level of the hierarchy that will be sub-clustered needs
    to be set up manually. We normally refer to it as the 0-th level and assign
    all data points to the same cluster with a label 0. However, the 0-th level
    can also be used to exclude some points from the clustering analysis (e.g.,
    the most central skills nodes).
    Consult the provided workflow examples on how to set up the whole pipeline from
    a similarity matrix to a hierarchical set of clusters.

    Parameters
    ----------
    W (numpy.ndarray):
        Similarity matrix which is used to construct the graph.
    l (int):
        Level of clustering hierarchy that is sub-clustered further; this is used
        to find the file with the particular partitioning.
    clusters (list of int OR 'all'):
        If it's a list of integers then it sub-clusters only the clusters with
        the integer label that is contained in this list. If cluster=='all', then
        all clusters of Level l are sub-clustered further.
    nearest_neighbours (list of int OR ['all']):
        If this is a list of integers, then the method constructs k-nearest
        neighbour graphs using each of the integers in this list as k, and detects
        communities in these graphs. Then, the results with different k values
        are pooled together for the consensus clustering step.
        If nearest_neighbours=['all'] then all non-zero values of W are used
        to construct the graph.
    fpath (string):
        File path where to save the clustering results; it is used as a plain
        prefix, so a directory needs to end with a path separator.
    session_name (string):
        Name of this clustering session to be used when saving output files.
    N_nn (int):
        Number of clustering runs for each nearest neighbour value in
        'nearest_neighbours'; normally we always set N_nn = N//len(nearest_neighbours)
    N (int):
        See the description of 'N' in ConsensusClustering.__init__()
    N_consensus (int):
        See the description of 'N_consensus' in ConsensusClustering.__init__()
    random_state (int):
        See the description of 'seed' in ConsensusClustering.__init__()
    edge_bootstrap (boolean):
        See the description of 'edge_bootstrap' in ConsensusClustering.__init__()

    Returns
    -------
    The method does not return anything but instead saves output files with the
    results in the designated location. The following outputs are saved:
        1.  CSV table with the complete ensemble of clusterings (from step 1 of
            the clustering procedure; see ConsensusClustering class description).
        2.  CSV table with the complete ensemble of consensus clusterings (from
            step 2 of the clustering procedure; see ConsensusClustering class description).
        3.  CSV table with the sub-clusters for each processed cluster of Level l.
            The table has two columns 'id' and 'cluster', where 'id' is the original
            node id and 'cluster' are the new cluster labels.
        4.  NPY file that stores the co-clustering occurrence matrix derived from
            the clustering ensemble from step 1 of the clustering procedure (can
            be useful for subsequent stability assessments; note, however, that
            it can be on the order of GBs for large graphs).

    Raises
    ------
    ValueError:
        If 'nearest_neighbours' is empty or contains a value that is neither
        an integer nor the string 'all'.
    """

    if len(nearest_neighbours) == 0:
        raise ValueError("'nearest_neighbours' must contain at least one value.")

    source_path = fpath + session_name + "_clusters_Level" + str(l) + ".csv"
    partition = pd.read_csv(source_path)

    if isinstance(clusters, str) and clusters == "all":
        clusters = list(np.sort(partition.cluster.unique()))

    ######## Manage random seeds
    if random_state is not None:
        np.random.seed(random_state)
        cluster_seeds = np.random.randint(100000000, size=len(clusters))
    else:
        cluster_seeds = None
    ############################

    for cc, c in enumerate(clusters):
        print("==============")
        print(f"Partitioning cluster {c}...")

        ######## Manage random seeds
        if cluster_seeds is not None:
            np.random.seed(cluster_seeds[cc])
            nn_seeds = np.random.randint(100000000, size=len(nearest_neighbours))
        else:
            nn_seeds = None
        ############################

        vertices = partition[partition.cluster == c].id.to_list()

        # Similarity sub-matrix of this cluster's nodes; selecting rows and
        # columns in one step avoids copying the full matrix for every cluster.
        W_ = W[np.ix_(vertices, vertices)]

        ensemble_nn = []

        # Get ensembles for each nearest neighbour value
        for ii, nn in enumerate(nearest_neighbours):

            ######## Manage random seeds
            nn_seed = nn_seeds[ii] if nn_seeds is not None else None
            ############################

            if isinstance(nn, (int, np.integer)) and not isinstance(nn, bool):
                # Create an undirected graph based on the nodes to be clustered and the kNN value
                g = build_graph(W_, kNN=nn, self_connections=False)
                print(f"Clustering graph with {nn} nearest-neighbours...")
            elif isinstance(nn, str) and nn == "all":
                g = build_graph(W_, kNN=(W_ != 0).astype(int), self_connections=False)
                print(f"Clustering graph...")
            else:
                raise ValueError(
                    f"Unsupported nearest-neighbour value {nn!r}; "
                    "expected an integer or 'all'."
                )
            g.vs["id"] = vertices

            t = time()
            clust = ConsensusClustering(
                g, N_nn, N_consensus, seed=nn_seed, edge_bootstrap=edge_bootstrap
            )
            ensemble = clust.create_ensemble()
            ensemble_nn += ensemble
            t_elapsed = time() - t
            print(f"Elapsed time: {t_elapsed:.2f} seconds")

        # Perform the consensus clustering on the combined ensembles (the graph
        # of the last nearest-neighbour value is passed to the object)
        print("Clustering the consensus partition...")
        t = time()
        clust = ConsensusClustering(g, N, N_consensus, seed=random_state)
        clust.load_ensemble(ensemble_nn)
        clust.consensus_communities()
        t_elapsed = time() - t
        print(f"Elapsed time: {round(t_elapsed)} seconds")

        # Dump the results
        node_ids = g.vs["id"]

        # Every member of the pooled ensemble is written, whatever its size;
        # the pooled size equals N only if N_nn * len(nearest_neighbours) == N.
        ensemble_columns = {"id": node_ids}
        for i, membership in enumerate(clust.ensemble):
            ensemble_columns[str(i)] = membership
        pd.DataFrame(ensemble_columns).to_csv(
            _subcluster_output_path(
                fpath, session_name, "cluster_ensemble", l, c, ".csv"
            ),
            index=False,
        )

        consensus_columns = {"id": node_ids}
        for i, membership in enumerate(clust.consensus_ensemble):
            consensus_columns[str(i)] = membership
        pd.DataFrame(consensus_columns).to_csv(
            _subcluster_output_path(
                fpath, session_name, "consensus_ensemble", l, c, ".csv"
            ),
            index=False,
        )

        partition_df = pd.DataFrame(
            {"id": node_ids, "cluster": clust.consensus_partition}
        )
        partition_df.to_csv(
            _subcluster_output_path(fpath, session_name, "subclusters", l, c, ".csv"),
            index=False,
        )

        # Save the co-occurrence matrix
        np.save(
            _subcluster_output_path(fpath, session_name, "COOC", l, c, ".npy"),
            clust.COOC,
        )


def collect_subclusters(l, fpath, session_name, n_total=None):
    """
    Method that collects the sub-clusters of clustering hierarchy level l into
    one table thus yielding the clusters of the clustering hierarchy level l+1.
    Note that the tables describing the sub-clusters need to follow a specific
    naming convention.

    Parameters
    ----------
    l (int):
        Level of clustering hierarchy whose sub-clusters we wish to collect.
    fpath (string):
        File path where the clustering results are saved at; it is used as a
        plain prefix, so a directory needs to end with a path separator.
    session_name (string):
        Name of the clustering session that is being analysed.
    n_total (int OR None):
        Total number of data points/nodes in the dataset. If given, the
        returned table has one row for each id in range(n_total); nodes that
        are missing from the sub-cluster tables are assigned to cluster 0 and
        all other labels are shifted up by one. If None, only the nodes found
        in the sub-cluster tables are returned.

    Returns
    -------
    partition (pandas.DataFrame):
        Final partition of the whole dataset of the level l+1 of the clustering
        hierarchy. Contains two columns 'id' and 'cluster' where 'id' are the
        original node IDs and 'cluster' is the cluster label. This table is also
        stored as a CSV table in the same folder as the sub-cluster tables.

    Raises
    ------
    FileNotFoundError:
        If no sub-cluster table of level l is found for this session.
    """

    # Find all the sub-cluster tables (note that they are following a specific
    # naming convention). The "_Cluster" suffix is part of the prefix so that,
    # e.g., level 1 does not also pick up the tables of level 10.
    prefix = session_name + "_subclusters_Level" + str(l) + "_Cluster"
    # Files are processed in lexicographic order of their names
    file_list = sorted(f for f in os.listdir(fpath) if f.startswith(prefix))
    if not file_list:
        raise FileNotFoundError(
            f"No sub-cluster tables starting with '{prefix}' found in '{fpath}'"
        )

    # Concatenate all subcluster tables and make sure that the cluster labels
    # are correct: labels of each table are offset by the number of clusters
    # collected so far.
    k = 0
    tables = []
    for f in file_list:
        data = pd.read_csv(fpath + f)
        n_c = len(data.cluster.unique())
        data["cluster"] = data["cluster"] + k
        tables.append(data)
        k += n_c
    all_data = pd.concat(tables)

    if n_total is None:
        partition = all_data
    else:
        # Take care of nodes that have not been included in the subclusters (assign all to one, 0-th cluster)
        # (e.g., these will be the central nodes in the case of Level-1 TSC clustering)
        incomplete = len(all_data) != n_total
        if incomplete:
            # Free up label 0 for the nodes without a sub-cluster
            all_data["cluster"] = all_data["cluster"] + 1
        partition = pd.DataFrame(data={"id": list(range(n_total))})
        partition = partition.merge(all_data, on="id", how="left")
        if incomplete:
            # The left merge introduces NaNs (and hence floats); restore
            # integer labels once the missing nodes are assigned to cluster 0.
            partition["cluster"] = partition["cluster"].fillna(0).astype(int)

    # Store the final partition as a CSV table
    partition = partition[["id", "cluster"]].sort_values("id").reset_index(drop=True)
    filename = fpath + session_name + "_clusters_Level" + str(l + 1) + ".csv"
    partition.to_csv(filename, index=False)
    print(f"Final partition saved in {filename}")

    return partition