# Source code for TELF.factorization.utilities.clustering
from scipy.spatial.distance import cosine
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import scipy
# [docs]
def H_clustering(H, verbose=False) -> (dict, dict):
    """
    Performs H-clustering, and gathers cluster information.

    Every column of ``H`` is treated as one document. A document is assigned
    to the cluster whose index is the row holding the largest value in that
    column. For every non-empty cluster the centroid (mean column) is
    computed, and for every document the cosine similarity between its
    column and the centroid of its own cluster is reported.

    Parameters
    ----------
    H: np.ndarray or scipy.sparse.csr_matrix
        H matrix from NMF; columns are documents (``H.shape[1]`` is used as
        the number of documents). Sparse input is converted to a dense
        array before processing.
    verbose: bool, default is False
        If True, shows the progress.

    Returns
    -------
    (clusters_information, documents_information): tuple of dict and dict
        ``clusters_information`` maps each cluster label that has at least
        one document to a dictionary with the keys ``"count"`` (number of
        documents in the cluster), ``"percent"`` (fraction of all documents
        that fall in the cluster) and ``"centroid"`` (1-D array, the mean of
        the cluster's columns).
        ``documents_information`` maps each document (column) index to a
        dictionary with the keys ``"similarity_to_cluster_centroid"`` and
        ``"cluster"``.

    Raises
    ------
    AssertionError
        If ``H`` is neither a NumPy array nor a SciPy sparse matrix, or if
        ``verbose`` is not a bool.
    """
    # hyper-parameter check
    assert scipy.sparse.issparse(H) or isinstance(H, np.ndarray), \
        "H type is not supported. H type: " + str(type(H)) + "\n" \
        "H should be type scipy.sparse.csr_matrix or np.ndarray."
    assert type(verbose) is bool, "verbose should be type bool!"

    # Work on a plain dense ndarray from here on. np.argmax on a sparse
    # matrix yields a 2-D np.matrix (whose rows are unhashable) and
    # scipy's cosine() does not accept sparse rows, so sparse input cannot
    # be processed as-is. np.asarray also normalises ndarray subclasses.
    if scipy.sparse.issparse(H):
        H = H.toarray()
    else:
        H = np.asarray(H)

    # begin
    cluster_assignments = np.argmax(H, axis=0)
    total_documents = H.shape[1]
    clusters_information = {}
    documents_information = {}

    # clusters information (only clusters that received a document)
    cluster_labels = np.unique(cluster_assignments)
    if verbose:
        cluster_labels = tqdm(cluster_labels)
    for cluster in cluster_labels:
        # get the documents in the current cluster
        docs_in_cluster = np.flatnonzero(cluster_assignments == cluster)

        # calculate the cluster information
        num_docs_in_cluster = len(docs_in_cluster)
        percent_docs_in_cluster = num_docs_in_cluster / total_documents

        # calculate the centroid information
        coordinates_in_cluster = H[:, docs_in_cluster]
        cluster_centroid = (
            np.sum(coordinates_in_cluster, axis=1) / num_docs_in_cluster
        )

        # save the information
        clusters_information[cluster] = {
            "count": num_docs_in_cluster,
            "percent": percent_docs_in_cluster,
            "centroid": cluster_centroid,
        }

    # documents information, go through document's coordinates and its index
    documents = enumerate(H.T)
    if verbose:
        # enumerate() has no length, so the total is given explicitly
        documents = tqdm(documents, total=total_documents)
    for doc_idx, doc_coord in documents:
        # cluster that the document belongs to
        doc_cluster = cluster_assignments[doc_idx]

        # centroid of the cluster that the current document belongs to
        cluster_centroid = clusters_information[doc_cluster]["centroid"]

        # similarity of the coordinate of the document to the cluster
        # centroid (scipy's cosine() is a distance, hence the 1 - ...)
        doc_sim_centroid = 1 - cosine(doc_coord, cluster_centroid)

        documents_information[doc_idx] = {
            "similarity_to_cluster_centroid": doc_sim_centroid,
            "cluster": doc_cluster,
        }

    return (clusters_information, documents_information)
# [docs]
def plot_H_clustering(H, name="filename"):
    """
    Plots the centroids of the H-clusters

    Every column of ``H`` is assigned to the cluster given by the row index
    of its largest value. For each cluster that received at least one
    column, a bar chart of the per-row mean of the cluster's columns is
    drawn in its own subplot. Each subplot is saved as ``{name}_{i}.png``
    and the whole figure as ``{name}.png``, where ``i`` is the position of
    the cluster in the sorted list of occupied cluster labels.

    Parameters
    ----------
    H: np.ndarray or scipy.sparse.csr_matrix
        H matrix from NMF. Sparse input is converted to a dense array
        before plotting.
    name: File name to save
        Used as the prefix of the written PNG files and of the subplot
        titles.

    Returns
    -------
    Matplotlib plots
        The figure that holds one subplot per cluster.
    """
    # Boolean-mask column selection and np.mean are only well behaved on a
    # dense ndarray, so densify sparse input once up front.
    if scipy.sparse.issparse(H):
        H = H.toarray()
    else:
        H = np.asarray(H)

    labels = np.argmax(H, axis=0)
    uniques = np.unique(labels)
    num_clusters = uniques.shape[0]
    bar_positions = np.arange(H.shape[0])

    fig = plt.figure(figsize=(6, 3 * num_clusters))
    for i, l in enumerate(uniques):
        cluster = H[:, labels == l]
        cluster_means = np.mean(cluster, axis=1)

        ax = fig.add_subplot(num_clusters, 1, i + 1)
        ax.bar(bar_positions, cluster_means, color=(0.2, 0.4, 0.6, 0.6))
        ax.set_title(f'{name} cluster {i}')
        fig.tight_layout()

        # Save only the region of the current subplot: take the axes
        # bounding box in inches and pad it slightly so that the title and
        # tick labels are included.
        extent = ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted()
        )
        fig.savefig(
            f'{name}_{i}.png',
            bbox_inches=extent.expanded(1.2, 1.3),
            dpi=200,
        )

    # the complete figure with all clusters
    fig.savefig(f'{name}.png', bbox_inches='tight', dpi=200)
    return fig