TELF.pre_processing.Vulture: Advanced text pre-processing and cleaning tool for NLP and text-mining

Vulture is a tool for text pre-processing and cleaning. It has multi-processing and distributed computing capabilities and is designed to run fast.


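First, let’s load the example dataset (the pickled file documents.p, expected in the data directory two levels above the working directory). Note that this snippet uses os and pathlib, which are imported in the next code block; import them first if you run it on its own: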

import pickle

DATA_DIR = os.path.join('..', '..', 'data')
DATA_DIR = pathlib.Path(DATA_DIR).resolve()
DATA_FILE = 'documents.p'
documents = pickle.load(open(os.path.join(DATA_DIR, DATA_FILE), 'rb'))

Now we can perform pre-processing:

# import libraries
import os
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture

from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner

from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

# output directory
RESULTS_DIR = 'results'
RESULTS_DIR = pathlib.Path(RESULTS_DIR).resolve()
RESULTS_FILE = 'clean_documents.p'
# create a cleaning pipeline
vulture = Vulture(n_jobs  = 1,
              verbose = 10,  # Disable == 0, Verbose >= 1

steps = [
RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [

# clean
cleaned_documents = vulture.clean(documents, steps=steps)

class TELF.pre_processing.Vulture.vulture.Vulture(*, n_jobs=-1, n_nodes=1, parallel_backend='multiprocessing', cache='/tmp', verbose=False)[source]#

Bases: object

Vulture is a parallel, multi-node, and distributed document pre-processing tool. It is designed to be simple and fast.

Vultures are nature’s cleaners!

DEFAULT_OPERATOR_PIPELINE = [NEDetector(module_type='OPERATOR', backend=None)]#
DEFAULT_PIPELINE = [SimpleCleaner(module_type='CLEANER', effective_stop_words=['characteristics', 'characteristic', 'acknowledgment', 'corresponding', 'unfortunately', 'predominantly', 'investigation', 'significantly', 'substantially', 'automatically', 'approximately', 'introduction', 'respectively', 'specifically', 'particularly', 'applications', 'consequently', 'nevertheless', 'successfully', 'demonstrates', 'representing', 'sufficiently', 'demonstrated', 'researchers', 'discovering', ... (+1359 more)], patterns={'standardize_hyphens': (re.compile('[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2212\\u2E3A\\u2E3B]'), '-'), 'remove_copyright_statement': None, 'remove_stop_phrases': None, 'make_lower_case': None, 'normalize': None, 'remove_trailing_dash': ('(?<!\\w)-|-(?!\\w)', ''), 'make_hyphens_words': ('([a-z])\\-([a-z])', ''), 'remove_next_line': ('\\n+', ' '), 'remove_email': ('\\S*@\\S*\\s?', ''), 'remove_formulas': ('\\b\\w*[\\=\\≈\\/\\\\\\±]\\w*\\b', ''), 'remove_dash': ('-', ''), 'remove_between_[]': ('\\[.*?\\]', ' '), 'remove_between_()': ('\\(.*?\\)', ' '), 'remove_[]': ('[\\[\\]]', ' '), 'remove_()': ('[()]', ' '), 'remove_\\': ('\\\\', ' '), 'remove_numbers': ('\\d+', ''), 'remove_standalone_numbers': ('\\b\\d+\\b', ''), 'remove_nonASCII_boundary': ('\\b[^\\x00-\\x7F]+\\b', ''), 'remove_nonASCII': ('[^\\x00-\\x7F]+', ''), 'remove_tags': ('&lt;/?.*?&gt;', ''), 'remove_special_characters': ('[!|"|#|$|%|&|\\|\\\'|(|)|*|+|,|.|/|:|;|<|=|>|?|@|[|\\|]|^|_|`|{|\\||}|~]', ''), 'isolate_frozen': None, 'remove_extra_whitespace': ('\\s+', ' '), 'remove_stop_words': None, 'min_characters': None}, exclude_hyphenated_stopwords=False, sw_pattern=re.compile('\\b[\\w-]+\\b'))]#
PARALLEL_BACKEND_OPTIONS = {'loky', 'multiprocessing', 'threading'}#
property cache#
clean(documents, steps=None, substitutions=None, save_path=None)[source]#
clean_dataframe(df, columns, steps=None, substitutions=None, append_to_original_df=False, concat_cleaned_cols=False)[source]#
property n_jobs#
property n_nodes#
operate(documents, steps=None, save_path=None, file_name='')[source]#
property parallel_backend#
property save_path#
property verbose#
TELF.pre_processing.Vulture.vulture.chunk_tuple_list(l, n_chunks)[source]#

Splits the given list of (key, value) tuples into sub-lists.

Parameters:

  • l (list of tuple) – List of (key, value) tuples to split.

  • n_chunks (int) – How many sets of sub-lists to create.


Returns: list – Sub-list containing (key, value) tuples.