TELF.pre_processing.Vulture: Advanced text pre-processing and cleaning tool for NLP and text-mining#

Vulture is a tool for text pre-processing and cleaning. It has multi-processing and distributed computing capabilities and is designed to run fast.

Example#

First, let’s load the example dataset (the example dataset can be found here):

import os
import pathlib
import pickle

DATA_DIR = os.path.join('..', '..', 'data')
DATA_DIR = pathlib.Path(DATA_DIR).resolve()
DATA_FILE = 'documents.p'

# load the raw example documents
with open(os.path.join(DATA_DIR, DATA_FILE), 'rb') as fh:
    documents = pickle.load(fh)
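
The loaded object is the collection that Vulture will clean. Assuming it is a dictionary mapping document IDs to raw text (an assumption for this sketch, not a documented requirement), a quick sanity check might look like:

# assumption: documents is a dict of {document_id: raw_text}
print(f'{len(documents)} documents loaded')
sample_id = next(iter(documents))
print(sample_id, str(documents[sample_id])[:100])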

Now we can perform pre-processing:

# import libraries
import os
import pathlib
import pandas as pd

from TELF.pre_processing import Vulture

from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import SubstitutionCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner

from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES


# output directory
RESULTS_DIR = 'results'
RESULTS_DIR = pathlib.Path(RESULTS_DIR).resolve()
RESULTS_FILE = 'clean_documents.p'
try:
    os.mkdir(RESULTS_DIR)
except FileExistsError:
    pass

# create a cleaning pipeline
vulture = Vulture(n_jobs=1,
                  verbose=10,  # Disable == 0, Verbose >= 1
                  )

steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
    SimpleCleaner(stop_words=STOP_WORDS,
                  stop_phrases=STOP_PHRASES,
                  order=[
                      'standardize_hyphens',
                      'isolate_frozen',
                      'remove_copyright_statement',
                      'remove_stop_phrases',
                      'make_lower_case',
                      'remove_formulas',
                      'normalize',
                      'remove_next_line',
                      'remove_email',
                      'remove_()',
                      'remove_[]',
                      'remove_special_characters',
                      'remove_nonASCII_boundary',
                      'remove_nonASCII',
                      'remove_tags',
                      'remove_stop_words',
                      'remove_standalone_numbers',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]),
    LemmatizeCleaner('spacy'),
]

# clean
cleaned_documents = vulture.clean(documents, steps=steps)
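
The example defines RESULTS_DIR and RESULTS_FILE above; one minimal way to persist the output is to pickle it ourselves (clean() also accepts a save_path argument, documented in the API reference below):

# save the cleaned documents for later use
with open(os.path.join(RESULTS_DIR, RESULTS_FILE), 'wb') as fh:
    pickle.dump(cleaned_documents, fh)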

Available Functions#

Vulture.__init__(*[, n_jobs, n_nodes, ...])

Vulture.clean(documents[, steps, ...])

Vulture.clean_dataframe(df, columns[, ...])
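
For tabular data, clean_dataframe applies the same pipeline to selected DataFrame columns. A minimal sketch, reusing the vulture and steps objects from the example above (the column names and contents here are illustrative assumptions):

import pandas as pd

df = pd.DataFrame({'title':    ['Tensor factorization at scale'],
                   'abstract': ['We present a fast method for ...']})

# keyword names follow the documented clean_dataframe signature
clean_df = vulture.clean_dataframe(df, columns=['title', 'abstract'],
                                   steps=steps,
                                   append_to_original_df=True)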

Module Contents#

© 2022. Triad National Security, LLC. All rights reserved. This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. Department of Energy/National Nuclear Security Administration. All rights in the program are reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear Security Administration. The Government is granted for itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare derivative works, distribute copies to the public, perform publicly and display publicly, and to permit others to do so.

class TELF.pre_processing.Vulture.vulture.Vulture(*, n_jobs=-1, n_nodes=1, parallel_backend='multiprocessing', cache='/tmp', verbose=False)[source]#

Bases: object

Vulture is a parallel, multi-node, and distributed document pre-processing tool. It is designed to be simple and fast.

Vultures are nature’s cleaners!
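
A brief constructor sketch (the values are illustrative; parallel_backend must be one of the PARALLEL_BACKEND_OPTIONS listed below):

from TELF.pre_processing import Vulture

vulture = Vulture(n_jobs=-1,                           # use all available cores
                  n_nodes=1,                           # >1 is intended for multi-node, distributed runs
                  parallel_backend='multiprocessing',  # or 'loky' / 'threading'
                  cache='/tmp',
                  verbose=True)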

DEFAULT_OPERATOR_PIPELINE = [NEDetector(module_type='OPERATOR', backend=None)]#
DEFAULT_PIPELINE = [SimpleCleaner(module_type='CLEANER', effective_stop_words=['characteristics', 'acknowledgment', 'characteristic', 'approximately', 'investigation', 'unfortunately', 'corresponding', 'substantially', 'significantly', 'automatically', 'predominantly', 'successfully', 'demonstrates', 'nevertheless', 'particularly', 'applications', 'specifically', 'consequently', 'respectively', 'representing', 'demonstrated', 'introduction', 'sufficiently', 'application', 'conclusions', ... (+1359 more)], patterns={'standardize_hyphens': (re.compile('[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2212\\u2E3A\\u2E3B]'), '-'), 'remove_copyright_statement': None, 'remove_stop_phrases': None, 'make_lower_case': None, 'normalize': None, 'remove_trailing_dash': ('(?<!\\w)-|-(?!\\w)', ''), 'make_hyphens_words': ('([a-z])\\-([a-z])', ''), 'remove_next_line': ('\\n+', ' '), 'remove_email': ('\\S*@\\S*\\s?', ''), 'remove_formulas': ('\\b\\w*[\\=\\≈\\/\\\\\\±]\\w*\\b', ''), 'remove_dash': ('-', ''), 'remove_between_[]': ('\\[.*?\\]', ' '), 'remove_between_()': ('\\(.*?\\)', ' '), 'remove_[]': ('[\\[\\]]', ' '), 'remove_()': ('[()]', ' '), 'remove_\\': ('\\\\', ' '), 'remove_numbers': ('\\d+', ''), 'remove_standalone_numbers': ('\\b\\d+\\b', ''), 'remove_nonASCII_boundary': ('\\b[^\\x00-\\x7F]+\\b', ''), 'remove_nonASCII': ('[^\\x00-\\x7F]+', ''), 'remove_tags': ('&lt;/?.*?&gt;', ''), 'remove_special_characters': ('[!|"|#|$|%|&|\\|\\\'|(|)|*|+|,|.|/|:|;|<|=|>|?|@|[|\\|]|^|_|`|{|\\||}|~]', ''), 'isolate_frozen': None, 'remove_extra_whitespace': ('\\s+', ' '), 'remove_stop_words': None, 'min_characters': None}, exclude_hyphenated_stopwords=False, sw_pattern=re.compile('\\b[\\w-]+\\b'))]#
PARALLEL_BACKEND_OPTIONS = {'loky', 'multiprocessing', 'threading'}#
property cache#
clean(documents, steps=None, substitutions=None, save_path=None)[source]#
clean_dataframe(df, columns, steps=None, substitutions=None, append_to_original_df=False, concat_cleaned_cols=False)[source]#
property n_jobs#
property n_nodes#
operate(documents, steps=None, save_path=None, file_name='')[source]#
property parallel_backend#
property save_path#
use_mpi()[source]#
property verbose#
TELF.pre_processing.Vulture.vulture.chunk_tuple_list(l, n_chunks)[source]#

Splits the given list of (key, value) tuples into sub-lists.

Parameters:
  • l (list of tuple) – List of (key, value) tuples to split.

  • n_chunks (int) – How many sets of sub-lists to create.

Yields:
  • list – Sub-list containing (key, value) tuples.
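
A minimal usage sketch (the data is illustrative, and the exact chunk sizes are implementation-defined):

from TELF.pre_processing.Vulture.vulture import chunk_tuple_list

# illustrative (document_id, text) pairs, e.g. from dict(...).items()
pairs = [('d1', 'alpha'), ('d2', 'beta'), ('d3', 'gamma'), ('d4', 'delta')]

# split the pairs into 2 sub-lists, e.g. one per parallel worker
for chunk in chunk_tuple_list(pairs, n_chunks=2):
    print(chunk)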