# Source code for TELF.pre_processing.Beaver.vectorize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
[docs]
def count(documents, options=None):
    """
    Build a bag-of-words count matrix from text documents.

    Thin wrapper around sklearn's ``CountVectorizer``: the vectorizer is
    constructed from ``options``, fitted on ``documents``, and the resulting
    matrix is returned together with the learned vocabulary.

    Parameters
    ----------
    documents: list
        List of documents. Each entry in the list contains text.
    options: dict, optional
        Keyword arguments forwarded unchanged to the ``CountVectorizer``
        constructor. ``None`` (the default) is treated as an empty dict,
        so the vectorizer is built with sklearn's own defaults.

    Returns
    -------
    X: scipy sparse matrix
        Sparse bag of words, as returned by
        ``CountVectorizer.fit_transform``.
    vocabulary: numpy.ndarray
        Unique words present in all the documents, as returned by
        ``CountVectorizer.get_feature_names_out``.
    """
    # ``**None`` raises TypeError, so normalise a missing options dict first.
    if options is None:
        options = {}

    vectorizer = CountVectorizer(**options)
    X = vectorizer.fit_transform(documents)
    vocabulary = vectorizer.get_feature_names_out()
    return X, vocabulary
[docs]
def tfidf(documents, options=None):
    """
    Build a TF-IDF matrix from text documents.

    Thin wrapper around sklearn's ``TfidfVectorizer``: the vectorizer is
    constructed from ``options``, fitted on ``documents``, and the resulting
    matrix is returned together with the learned vocabulary.

    Parameters
    ----------
    documents: list
        List of documents. Each entry in the list contains text.
    options: dict, optional
        Keyword arguments forwarded unchanged to the ``TfidfVectorizer``
        constructor. ``None`` (the default) is treated as an empty dict,
        so the vectorizer is built with sklearn's own defaults.

    Returns
    -------
    X: scipy sparse matrix
        Sparse tf-idf matrix, as returned by
        ``TfidfVectorizer.fit_transform``.
    vocabulary: numpy.ndarray
        Unique words present in all the documents, as returned by
        ``TfidfVectorizer.get_feature_names_out``.
    """
    # ``**None`` raises TypeError, so normalise a missing options dict first.
    if options is None:
        options = {}

    vectorizer = TfidfVectorizer(**options)
    X = vectorizer.fit_transform(documents)
    vocabulary = vectorizer.get_feature_names_out()
    return X, vocabulary