Source code for pyDNMFk.dist_clustering

# @Author: Manish Bhattarai, Erik Skau
from .utils import *


[docs]class custom_clustering():
    r"""
    Greedy algorithm to approximate a quadratic assignment problem to cluster vectors. Given p groups of k vectors, construct k clusters, each cluster containing a single vector from each of the p groups. This clustering approximation uses cos distances and mean centroids.

    Parameters
    ----------

        W_all : ndarray
            Order three tensor of shape m by k by p, where m is the ambient dimension of the vectors, k is the number of vectors in each group, and p is the number of groups of vectors.
        H_all : ndarray
            Order three tensor of shape n by k by p, where n is the ambient dimension of the vectors, k is the number of vectors in each group, and p is the number of groups of vectors.
        params : class
            Class object with communication parameters which comprises of grid information (p_r,p_c) , commincator (comm) and epsilon (eps).

    """

    @comm_timing()
    def __init__(self, Wall, Hall, params):
        self.W_all = Wall
        self.H_all = Hall
        self.p_r, self.p_c = params.p_r, params.p_c
        self.comm1 = params.comm1
        self.eps = params.eps
        self.p = self.p_r * self.p_c

[docs]    @comm_timing()
    def normalize_by_W(self):
        r'''Normalize the factors W and H'''
        Wall_norm = (self.W_all * self.W_all).sum(axis=0)
        if self.p_r != 1:
            Wall_norm = self.comm1.allreduce(Wall_norm)
        Wall_norm += self.eps
        temp = np.sqrt(Wall_norm)
        self.W_all /= temp.reshape(1, temp.shape[0], temp.shape[1])
        self.H_all *= temp.reshape(temp.shape[0], 1, temp.shape[1])

[docs]    @comm_timing()
    def mad(self, data, flag=1, axis=-1):
        r'''Compute the median/mean absolute deviation'''
        if flag == 1:  # the median absolute deviation
            return np.nanmedian(np.absolute(data - np.nanmedian(data, axis=axis, keepdims=True)), axis=axis)
        else:  # flag = 0 the mean absolute deviation
            # return np.nanmean((np.absolute(data.T - np.nanmean(data, axis = dimf))).T,axis = dimf)
            return np.nanmean(np.absolute(data - np.nanmean(data, axis=axis)), axis=axis)

[docs]    @comm_timing()
    def change_order(self, tens):
        r'''change the order of features'''
        ans = list(range(len(tens)))
        for p in tens:
            ans[p[0]] = p[1]
        return ans

[docs]    @comm_timing()
    def greedy_lsa(self, A):
        r"""Return the permutation order"""
        X = A.copy()
        pairs = []
        for i in range(X.shape[0]):
            minindex = np.argmax(X)
            ind = np.unravel_index(minindex, X.shape)
            pairs.append(ind)
            X[:, ind[1]] = -np.inf
            X[ind[0], :] = -np.inf
        return pairs

[docs]    @comm_timing()
    def dist_feature_ordering(self, centroids, W_sub):
        r'''return the features in proper order'''
        k = W_sub.shape[1]
        dist = centroids.T @ W_sub
        if self.p_r != 1:
            dist = self.comm1.allreduce(dist)
        tmp = self.greedy_lsa(dist)
        j = self.change_order(tmp)
        W_sub = W_sub[:, j]
        return W_sub, j

[docs]    @comm_timing()
    def dist_custom_clustering(self, centroids=None, vb=0):
        """
        Performs the distributed custom clustering

        Parameters
        ----------
        centroids : ndarray, optional
           The m by k initialization of the centroids of the clusters. None corresponds to using the first slice, W_all[:,:,0], as the initial centroids. Defaults to None.
        vb : bool, optional
           Verbose to display intermediate results

        Returns
        -------
        centroids : ndarray
           The m by k centroids of the clusters
        W_all :ndarray
           Clustered organization of the vectors W_all
        H_all : ndarray
           Clustered organization of the vectors H_all
        permute_order : list
           Indices of the permuted features
        """

        permute_order = []
        self.normalize_by_W()
        if centroids == None:
            centroids = self.W_all[:, :, 0].copy()
        dist = centroids.T @ self.W_all[:, :, 0]
        if self.p_r != 1:
            dist = self.comm1.allreduce(dist)
        for i in range(100):
            for p in range(self.W_all.shape[-1]):
                W_ord, j = self.dist_feature_ordering(centroids, self.W_all[:, :, p])
                permute_order.append(j)
                self.W_all[:, :, p] = W_ord
                self.H_all[:, :, p] = [self.H_all[:, :, p][k] for k in j]
            centroids = np.median(self.W_all, axis=-1)
            centroids_norm = (centroids ** 2).sum(axis=0)
            if self.p_r != 1:
                centroids_norm = self.comm1.allreduce(centroids_norm)
            centroids_norm += self.eps
            temp = np.sqrt(centroids_norm)
            centroids /= temp
        return centroids, self.W_all, self.H_all, permute_order

[docs]    @comm_timing()
    def dist_silhouettes(self):
        """
        Computes the cosine distances silhouettes of a distributed clustering of vectors.

        Returns
        -------
        sils : ndarray
            The k by p array of silhouettes where sils[i,j] is the silhouette measure for the vector W_all[:,i,j]
        """

        self.dist_custom_clustering()
        N, k, n_pert = self.W_all.shape
        W_flat = self.W_all.reshape(N, k * n_pert)
        W_all2 = (W_flat.T @ W_flat).reshape(k, n_pert, k, n_pert)
        if self.p_r != 1:
            W_all2 = self.comm1.allreduce(W_all2)
        distances = np.arccos(np.clip(W_all2, -1.0, 1.0))
        (N, K, n_perts) = self.W_all.shape
        if K == 1:
            sils = np.ones((K, n_perts))
        else:
            a = np.zeros((K, n_perts))
            b = np.zeros((K, n_perts))
            for k in range(K):
                for n in range(n_perts):
                    a[k, n] = 1 / (n_perts - 1) * np.sum(distances[k, n, k, :])
                    tmp = np.sum(distances[k, n, :, :], axis=1)
                    tmp[k] = np.inf
                    b[k, n] = 1 / n_perts * np.min(tmp)
            sils = (b - a) / np.maximum(a, b)
        return sils

[docs]    @comm_timing()
    def fit(self):
        r"""
        Calls the sub routines to perform distributed custom clustering and  compute silhouettes

        Returns
        -------
        centroids : ndarray
            The m by k centroids of the clusters
        CentStd : ndarray
            Absolute deviation of the features from the centroid
        W_all : ndarray
            Clustered organization of the vectors W_all
        H_all : ndarray
            Clustered organization of the vectors H_all
        S_avg : ndarray
            mean Silhouette score
        permute_order : list
            Indices of the permuted features
        """

        centroids, _, _, IDX_F2 = self.dist_custom_clustering()
        CentStd = self.mad(self.W_all, axis=-1)
        cluster_coefficients = self.dist_silhouettes()
        S_avg = cluster_coefficients.flatten().mean()
        result = [centroids, CentStd, self.H_all, cluster_coefficients.mean(axis=1), S_avg, IDX_F2]
        return result