Source code for TELF.pre_processing.Beaver.sppmi

import numpy as np
import scipy


def sppmi(cooc, shift=4):
    """Compute the shifted positive pointwise mutual information (SPPMI).

    Every stored entry ``cooc[i, j]`` is mapped to::

        max(log(cooc[i, j] * total / (rowsum[i] * colsum[j])) - shift, 0)

    where ``total`` is the sum of all entries of ``cooc`` and ``rowsum`` /
    ``colsum`` are its row and column sums.  Entries whose value ends up at
    zero are removed from the sparse result.

    Parameters
    ----------
    cooc : sparse matrix or 2-D array-like
        Co-occurrence matrix.  Anything accepted by
        ``scipy.sparse.csr_matrix`` (a SciPy sparse matrix or a dense 2-D
        array).  The input is not modified.
    shift : float, optional
        Value subtracted from the log ratio before clipping at zero.  Note
        that it is subtracted as-is (it is not passed through ``log``).
        Defaults to 4.

    Returns
    -------
    sppmi_matrix : scipy.sparse.csr_matrix
        Sparse shifted positive PMI matrix with the same shape as ``cooc``.

    Notes
    -----
    Author: Erik Skau
    """
    # Imported locally so the ``scipy.sparse`` submodule is guaranteed to be
    # loaded; a bare ``import scipy`` does not load it on older SciPy releases.
    import scipy.sparse

    # Normalise the input to CSR so dense arrays and every sparse format take
    # the same code path.  The sparse product below always produced CSR, so
    # the return type is unchanged.
    cooc = scipy.sparse.csr_matrix(cooc)

    total = cooc.sum()
    # ``sum(axis=...)`` returns a 2-D ``np.matrix``; flatten to 1-D vectors.
    colsum = np.asarray(cooc.sum(axis=0)).ravel().astype(np.float64)
    rowsum = np.asarray(cooc.sum(axis=1)).ravel().astype(np.float64)

    # Reciprocals of the marginals; rows/columns that sum to zero get a
    # reciprocal of 0 instead of triggering a division by zero.
    colsuminv = np.divide(
        np.ones_like(colsum), colsum, out=np.zeros_like(colsum), where=colsum != 0
    )
    rowsuminv = np.divide(
        np.ones_like(rowsum), rowsum, out=np.zeros_like(rowsum), where=rowsum != 0
    )

    # Scale entry (i, j) by 1 / (rowsum[i] * colsum[j]) via diagonal products.
    sppmi_matrix = (
        scipy.sparse.diags(rowsuminv) @ cooc @ scipy.sparse.diags(colsuminv)
    )

    # Drop explicitly stored zeros before the log: they would evaluate to
    # -inf (with a RuntimeWarning) only to be clipped to 0 and removed anyway.
    sppmi_matrix.eliminate_zeros()

    sppmi_matrix.data = np.maximum(np.log(sppmi_matrix.data * total) - shift, 0)
    # Clipping can create new zeros; remove them to keep the result sparse.
    sppmi_matrix.eliminate_zeros()
    return sppmi_matrix