Source code for TELF.pre_processing.Vulture.tokens_analysis.top_words

# -*- coding: utf-8 -*-
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import numpy as np
import random

def get_top_words(documents, top_n=10, n_gram=1, verbose=True, filename=None) -> pd.DataFrame:
    """
    Collects statistics for the top words or n-grams.
    Returns a table with columns word, tf, df, df_fraction, and tf_fraction.

    - word lists the words or n-grams in the top_n.
    - tf is the term frequency, i.e. how many times the given word occurred across all documents.
    - df is the document frequency, i.e. in how many documents the given word occurred.
    - df_fraction is df / len(documents).
    - tf_fraction is tf / (total number of unique tokens or n-grams).

    Parameters
    ----------
    documents : list or dict
        List or dictionary of documents. If a dictionary, keys are the document IDs
        and values are the text.
    top_n : int, optional
        Top n words or n-grams to report. The default is 10.
    n_gram : int, optional
        1 for single words, or n-grams when > 1. The default is 1.
    verbose : bool, optional
        Verbosity flag. The default is True.
    filename : str, optional
        If not None, saves the table to the given location.

    Returns
    -------
    pd.DataFrame
        Table of the statistics.
    """
    if isinstance(documents, dict):
        documents = list(documents.values())

    # count term frequency (tf) and document frequency (df) for each n-gram
    word_stats = defaultdict(lambda: {"tf": 0, "df": 0})
    for doc in tqdm(documents, disable=not verbose):
        tokens = doc.split()
        ngrams = zip(*[tokens[i:] for i in range(n_gram)])
        ngrams = [" ".join(ngram) for ngram in ngrams]

        for gram in ngrams:
            word_stats[gram]["tf"] += 1
        for gram in set(ngrams):
            word_stats[gram]["df"] += 1

    word_stats = dict(word_stats)

    # keep the top_n n-grams ranked by term frequency
    top_words = dict(sorted(word_stats.items(), key=lambda x: x[1]["tf"], reverse=True)[:top_n])

    target_data = {"word": [], "tf": [], "df": [], "df_fraction": [], "tf_fraction": []}
    for word in top_words:
        target_data["word"].append(word)
        target_data["tf"].append(word_stats[word]["tf"])
        target_data["df"].append(word_stats[word]["df"])
        target_data["df_fraction"].append(word_stats[word]["df"] / len(documents))
        target_data["tf_fraction"].append(word_stats[word]["tf"] / len(word_stats))

    # put together the results
    table = pd.DataFrame.from_dict(target_data)

    if filename:
        table.to_csv(filename + ".csv", index=False)

    return table
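
# A minimal usage sketch, not part of the original module: the small document list
# and the "top_bigrams" output name below are illustrative assumptions showing how
# get_top_words can be called for single words and for bigrams.
if __name__ == "__main__":
    # assumed toy corpus for demonstration only
    docs = [
        "graph neural networks for materials",
        "neural networks learn graph structure",
        "materials discovery with graph neural networks",
    ]

    # top 5 single words by term frequency
    print(get_top_words(docs, top_n=5, n_gram=1, verbose=False))

    # top 5 bigrams; also written to top_bigrams.csv (assumed output file name)
    print(get_top_words(docs, top_n=5, n_gram=2, verbose=False, filename="top_bigrams"))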