Source code for TELF.factorization.decompositions.utilities.silhouettes

from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_samples
from .generic_utils import get_np

def silhouettes(W_all, use_gpu=False):
    """
    Computes the angular-distance silhouettes of a clustering of vectors.

    Every vector is scaled to unit Euclidean norm, and the distance between
    two vectors is the arccosine of their inner product, i.e. the angle
    between them. Cluster ``i`` consists of the p vectors ``W_all[:, i, :]``.

    Args:
        W_all (ndarray): Order three tensor of clustered vectors of shape
            m by k by p, where m is the ambient dimension of the vectors, k is
            the number of vectors in each group (one per cluster), and p is
            the number of groups of vectors.
        use_gpu (bool): Forwarded to ``get_np`` to select the array module.

    Returns:
        sils (ndarray): The k by p float64 array of silhouettes where
        sils[i,j] is the silhouette measure for the vector W_all[:,i,j].
        If k <= 1 every entry is 1. If p == 1 (and k > 1) every cluster is a
        singleton and every entry is 0, the usual silhouette convention for
        singleton clusters. An all-zero vector cannot be normalised and
        propagates NaN into the result.

    Raises:
        TypeError: If the dtype of ``W_all`` is neither integer nor floating.
    """
    # Array module picked by the project helper (presumably NumPy or a
    # GPU drop-in with the same API -- confirm against generic_utils).
    np = get_np(W_all, use_gpu=use_gpu)

    # eps is the floor applied to the mean distances below so that the final
    # ratio never divides by zero.
    dtype = W_all[0].dtype
    if np.issubdtype(dtype, np.integer):
        eps = np.finfo(float).eps
    elif np.issubdtype(dtype, np.floating):
        eps = np.finfo(dtype).eps
    else:
        raise TypeError("Unknown data type!")

    n_features, n_clusters, n_perts = W_all.shape

    # With at most one cluster there is no neighbouring cluster to compare to.
    if n_clusters <= 1:
        return np.ones((n_clusters, n_perts))
    # With one member per cluster the intra-cluster mean would be 0 / 0
    # (this used to raise ZeroDivisionError); singleton clusters score 0.
    if n_perts == 1:
        return np.zeros((n_clusters, n_perts))

    # Pairwise angles: distances[i, j, i2, j2] is the angle between
    # W_all[:, i, j] and W_all[:, i2, j2].
    unit = W_all / np.sqrt(np.sum(W_all ** 2, axis=0))
    flat = unit.reshape(n_features, n_clusters * n_perts)
    cosines = (flat.T @ flat).reshape(n_clusters, n_perts, n_clusters, n_perts)
    # Clip guards arccos against round-off pushing |cosine| slightly above 1.
    distances = np.arccos(np.clip(cosines, -1.0, 1.0))

    # totals[i, j, c] = summed distance from vector (i, j) to all of cluster c.
    totals = np.sum(distances, axis=3)
    own = np.arange(n_clusters)
    # Advanced indexing returns a copy, so masking below does not affect it.
    intra = totals[own, :, own].astype(float)
    # Exclude each vector's own cluster from the nearest-cluster search.
    totals[own, :, own] = np.inf
    nearest = np.min(totals, axis=2).astype(float)

    # a: mean distance to the other members of the own cluster (the zero
    #    self-distance is in the sum, hence the division by p - 1).
    # b: mean distance to the members of the closest other cluster.
    a = np.maximum((1 / (n_perts - 1)) * intra, eps)
    b = np.maximum((1 / n_perts) * nearest, eps)
    return (b - a) / np.maximum(a, b)
def silhouettes_with_distance(W_all, distance="hamming", use_gpu=False):
    """
    Computes silhouettes of clustered vectors from an explicit distance matrix.

    The pairwise distance matrix between all k * p vectors is built here and
    handed to scikit-learn's ``silhouette_samples`` as a precomputed metric.
    Cluster ``i`` consists of the p vectors ``W_all[:, i, :]``.

    Args:
        W_all (ndarray): Order three tensor of clustered vectors of shape
            m by k by p (ambient dimension, clusters, vectors per cluster).
        distance (str): Which distance to use between vectors u and v:

            * ``"hamming"``: SciPy's Hamming distance (fraction of
              coordinates in which u and v differ).
            * ``"FN"``: fraction of coordinates where u == 1 and v == 0.
            * ``"FP"``: fraction of coordinates where u == 0 and v == 1
              (the transpose of the ``"FN"`` matrix).
        use_gpu (bool): Forwarded to ``get_np`` to select the array module.

    Returns:
        ndarray: The k by p array of silhouettes where entry [i, j] is the
        silhouette of ``W_all[:, i, j]``. If k == 1 every entry is 1
        (returned before ``distance`` is inspected). If p == 1 every cluster
        is a singleton and every entry is 0.

    Raises:
        ValueError: If ``distance`` is not one of the supported names.
    """
    # Array module picked by the project helper (presumably NumPy or a
    # GPU drop-in with the same API -- confirm against generic_utils).
    # NOTE(review): cdist and silhouette_samples are SciPy / scikit-learn
    # routines; whether they accept GPU arrays should be confirmed.
    np = get_np(W_all, use_gpu=use_gpu)
    n_features, k, n_pert = W_all.shape

    if k == 1:
        return np.ones((k, n_pert))
    # Reject bad metric names before doing any work.
    if distance not in ("FN", "FP", "hamming"):
        raise ValueError("Unknown distance metric!")
    # One member per cluster: scikit-learn rejects as many labels as samples,
    # so return the singleton-cluster silhouette value directly.
    if n_pert == 1:
        return np.zeros((k, n_pert))

    # Column-major flattening: column c holds W_all[:, c % k, c // k], so the
    # cluster label of column c is c % k.
    W_flat = W_all.reshape(n_features, k * n_pert, order="F")
    label = list(range(k)) * n_pert

    if distance == "hamming":
        dist = cdist(W_flat.T, W_flat.T, metric=distance)
    else:
        # (is_one.T @ is_zero)[i, j] counts coordinates where column i is 1
        # and column j is 0; dividing by m gives the mean of that indicator.
        is_one = (W_flat == 1).astype(float)
        is_zero = (W_flat == 0).astype(float)
        dist = (is_one.T @ is_zero) / n_features
        if distance == "FP":
            dist = dist.T

    # Now pass the distance matrix to the silhouette computation.
    S = silhouette_samples(dist, labels=label, metric="precomputed")
    return np.reshape(S, [k, n_pert], order="F")