# Source code for TELF.pre_processing.Beaver.vectorize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def count(documents, options):
    """
    Build a bag-of-words matrix with scikit-learn's ``CountVectorizer``.

    Parameters
    ----------
    documents: list
        List of documents. Each entry in the list contains text.
    options: dict
        Keyword arguments forwarded unchanged to the ``CountVectorizer``
        constructor.

    Returns
    -------
    X: sparse matrix
        Document-term count matrix produced by ``fit_transform``.
    vocabulary: ndarray
        Unique terms found across all documents, as reported by
        ``get_feature_names_out``.
    """
    bow_model = CountVectorizer(**options)
    # The model must be fitted before the feature names can be queried.
    term_counts = bow_model.fit_transform(documents)
    return term_counts, bow_model.get_feature_names_out()
def tfidf(documents, options):
    """
    Build a TF-IDF matrix with scikit-learn's ``TfidfVectorizer``.

    Parameters
    ----------
    documents: list
        List of documents. Each entry in the list contains text.
    options: dict
        Keyword arguments forwarded unchanged to the ``TfidfVectorizer``
        constructor.

    Returns
    -------
    X: sparse matrix
        Document-term TF-IDF matrix produced by ``fit_transform``.
    vocabulary: ndarray
        Unique terms found across all documents, as reported by
        ``get_feature_names_out``.
    """
    tfidf_model = TfidfVectorizer(**options)
    # The model must be fitted before the feature names can be queried.
    weights = tfidf_model.fit_transform(documents)
    return weights, tfidf_model.get_feature_names_out()