Source code for TELF.pre_processing.Beaver.sppmi
import numpy as np
import scipy
[docs]
def sppmi(cooc, shift=4):
    """Compute the shifted positive pointwise mutual information (SPPMI) matrix.

    For every stored (non-zero) entry ``(i, j)`` of ``cooc`` the result holds::

        max(log(cooc[i, j] * total / (rowsum[i] * colsum[j])) - shift, 0)

    where ``total`` is the sum of all entries and ``log`` is the natural
    logarithm. Entries that end up at zero are removed from the sparse
    structure.

    input:
        cooc: cooccurrence matrix. Any SciPy sparse matrix / sparse array, or
            a dense 2-D array-like; it is converted to CSR internally and is
            not modified. Entries are expected to be non-negative (a negative
            entry would yield NaN from the logarithm).
        shift: the shift, subtracted from the natural-log PMI value before
            clipping at zero.
    output:
        sppmi_matrix: sparse (CSR) shifted positive pointwise mutual
            information matrix with the same shape as ``cooc``.
    author: Erik Skau
    """
    # Local import: a bare ``import scipy`` does not load the ``sparse``
    # subpackage on every SciPy release.
    import scipy.sparse

    # Normalising to csr_matrix keeps matrix semantics no matter whether the
    # caller passed a sparse matrix, a sparse array or a dense array.
    counts = scipy.sparse.csr_matrix(cooc)

    total = counts.sum()
    # ravel() copes with both the (1, n) / (n, 1) np.matrix results and plain
    # 1-D results, so no fixed-rank indexing is needed.
    col_totals = np.asarray(counts.sum(axis=0), dtype=np.float64).ravel()
    row_totals = np.asarray(counts.sum(axis=1), dtype=np.float64).ravel()

    # D_row^-1 @ cooc @ D_col^-1 gives cooc[i, j] / (rowsum[i] * colsum[j]).
    # ``@`` is always the matrix product, unlike ``*`` on sparse arrays.
    sppmi_matrix = (
        scipy.sparse.diags(_reciprocal_or_zero(row_totals))
        @ counts
        @ scipy.sparse.diags(_reciprocal_or_zero(col_totals))
    ).tocsr()

    # Drop explicitly stored zeros first so log() never sees 0 (-inf plus a
    # RuntimeWarning); they would be clipped to 0 and removed anyway.
    sppmi_matrix.eliminate_zeros()
    sppmi_matrix.data = np.maximum(np.log(sppmi_matrix.data * total) - shift, 0)
    sppmi_matrix.eliminate_zeros()
    return sppmi_matrix


def _reciprocal_or_zero(values):
    """Return the element-wise reciprocal of *values*, using 0 where a value is 0."""
    return np.divide(1.0, values, out=np.zeros_like(values), where=values != 0)