Source code for TELF.applications.Cheetah.cheetah

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 15 17:05:54 2022

@author: maksimekineren
"""

import os
import re
import sys
import ast
import time
import pickle
import warnings
import pandas as pd
from tqdm import tqdm
from collections import Counter, defaultdict
from typing import Union


def add_with_union_of_others(d, s, key):
    """
    Compute the addition of a set associated with a given key and the union of all
    other sets in a dictionary.

    Parameters
    ----------
    d: dict
        A dictionary where keys are strings and values are sets.
    s: set
        The set (associated with key) to be added to the key's set.
    key: str
        The key in the dictionary whose set is to be modified with the union of all
        other sets.

    Returns
    -------
    set:
        The specified key's set, extended by the intersection of s with the union of
        all other sets in the dictionary.

    Raises
    ------
    ValueError
        If the specified key is not found in the dictionary.
    """
    if key not in d:
        raise ValueError(f'Key {key!r} not found in the dictionary!')

    # don't do anything if dict only contains target key
    if len(d) == 1 and key in d:
        return s
    else:
        # create union of all sets (excluding the set for key)
        union_set = set.union(*(other_set for other_key, other_set in d.items()
                                if other_key != key))

        # create the modified set
        s_updated = d[key].copy()
        s_updated.update(s & union_set)
        return s_updated
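# A minimal sketch (not part of the original module) of how add_with_union_of_others
# behaves on a toy dictionary; the values below are invented for illustration only.
# The union of the sets for 'b' and 'c' is {2, 3, 4}; intersecting it with s = {3, 5}
# gives {3}, which is then added to d['a'] = {1, 2}:
#
# >>> d = {'a': {1, 2}, 'b': {2, 3}, 'c': {4}}
# >>> add_with_union_of_others(d, {3, 5}, 'a')
# {1, 2, 3}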
class Cheetah:

    # keys are fields accepted by Cheetah, values are default column names in DataFrame
    COLUMNS = {
        'title': 'title',
        'abstract': 'abstract',
        'year': 'year',
        'author_ids': 'author_ids',
        'affiliations': 'affiliations',
    }

    def __init__(self, verbose: bool = False) -> None:
        """
        Init an empty Cheetah object

        Parameters
        ----------
        verbose : bool, optional
            Verbosity flag. The default is False.

        Returns
        -------
        None
        """
        self.data = None
        self.indexed = False
        self.verbose = verbose
    @classmethod
    def find_ngram(cls, text: str, query: list, window_size: int = 5, ordered: bool = True) -> bool:
        """
        Determine if the tokens in the list query are contained within the string text
        using a sliding window algorithm with a specified window_size. If ordered is
        True then the order of tokens appearing in query and text needs to be
        maintained for a positive match. Returns True if such a match is found.

        Parameters
        ----------
        text : str
            A string of multiple tokens that are separated by whitespace
        query : list
            A list of tokens that should be checked for in text. Duplicate values in
            query are allowed and order will be maintained if ordered=True.
        window_size : int, optional
            Set the size of the sliding window. NOTE: if window_size < len(query), no
            matches can ever be found as the query cannot fit in the window. Default=5.
        ordered : bool, optional
            If True, preserve the order of tokens in query when searching for a match.
            Default=True.

        Returns
        -------
        bool
            True if ngram query was found in text. False otherwise.
        """
        window = []
        tokens = text.split()
        token_set = dict(Counter(query))  # count occurrences of tokens in query

        for token in tokens:
            window.append(token)
            if len(window) > window_size:
                window.pop(0)

            # form a list of tokens for comparison using only tokens seen in the query.
            # an additional precaution is taken to only use each token the exact number
            # of times that it is seen in the query at most
            curr_window = []
            window_set = defaultdict(int)
            for t in window:
                if t in token_set and window_set[t] < token_set[t]:
                    window_set[t] += 1
                    curr_window.append(t)

            if len(curr_window) == len(query):
                if ordered and curr_window == query:  # compare preserving order
                    return True
                elif not ordered and sorted(curr_window) == sorted(query):
                    return True
        return False
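    # A minimal sketch (not part of the original module) of find_ngram's sliding
    # window, using the toy text from the search() docstring. With window_size=4 the
    # tokens 'aa' and 'cc' co-occur within one window; with window_size=3 they never do:
    #
    # >>> Cheetah.find_ngram('aa bb bb cc cc dd', ['aa', 'cc'], window_size=4)
    # True
    # >>> Cheetah.find_ngram('aa bb bb cc cc dd', ['aa', 'cc'], window_size=3)
    # False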
    def index(self,
              data: pd.DataFrame,
              columns: dict = None,
              index_file: str = None,
              reindex: bool = False,
              verbose: bool = True) -> None:
        """
        Creates indices for selected columns in data for Cheetah search.

        author_ids and affiliations are expected to use their respective SLIC data
        structures. See an example notebook for a sample of these data structures.
        Text data such as 'title' and 'abstract' should be pre-processed using Vulture
        simple clean. The text in these columns is expected to be lowercase with
        special characters removed. Tokens are delimited with a single whitespace.

        Parameters
        ----------
        data: pd.DataFrame
            Pandas DataFrame of papers
        columns: dict, optional
            Dictionary where the keys are categories that can be mapped by Cheetah and
            the values are the corresponding column names for these categories in data.
            See Cheetah.COLUMNS for an example of the structure and all currently
            supported keys. If columns is None, Cheetah will default to the
            Cheetah.COLUMNS values.
        index_file: str, optional
            Path to a previously generated Cheetah index file. If no path is passed,
            Cheetah will generate indices for one-time use. If index_file is passed but
            the path does not exist, Cheetah will generate indices and save them for
            future use at the index_file path. If a path is passed and reindex=True,
            new indices will be generated and saved at index_file, overwriting the
            current contents of index_file if it exists.
        reindex: bool, optional
            If True, overwrite the index_file if it exists. The default is False.
        verbose: bool, optional
            Verbosity flag. The default is True.

        Returns
        -------
        None
        """
        self.data = data.copy().reset_index(drop=True)
        self.data['slic_index'] = data.index
        self.columns = columns
        self.verbose = verbose

        if index_file:
            if not os.path.exists(index_file) or reindex:
                # index data
                if self.verbose:
                    if reindex:
                        print("Overwriting existing index.")
                    else:
                        print("Indexing file not found. Creating a new index.")
                indexing_results = self._index_data(self.data)
            else:
                # load indexed data from disk
                indexing_results = pickle.load(open(index_file, 'rb'))
        else:
            indexing_results = self._index_data(self.data)

        self.abstract_index = indexing_results[0]
        self.title_index = indexing_results[1]
        self.year_index = indexing_results[2]
        self.country_index = indexing_results[3]
        self.author_index = indexing_results[4]
        self.affiliation_index = indexing_results[5]

        if index_file:
            pickle.dump(indexing_results, open(index_file, "wb"))
        self.indexed = True
        self.last_search_result = None
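    # A minimal usage sketch (not part of the original module) for index(). The
    # DataFrame below is invented; column names follow Cheetah.COLUMNS:
    #
    # >>> df = pd.DataFrame({
    # ...     'title':    ['laser deflection study', 'graph clustering'],
    # ...     'abstract': ['blue laser angle deflection', 'spectral methods'],
    # ...     'year':     [2020, 2021],
    # ... })
    # >>> cheetah = Cheetah(verbose=False)
    # >>> cheetah.index(df, columns={'title': 'title', 'abstract': 'abstract', 'year': 'year'})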
    def search(self,
               query: Union[str, list, dict, None] = None,
               and_search: bool = True,
               in_title: bool = True,
               in_abstract: bool = True,
               save_path: str = None,
               author_filter: list = [],
               affiliation_filter: list = [],
               country_filter: list = [],
               year_filter: list = [],
               ngram_window_size: int = 5,
               ngram_ordered: bool = True,
               do_results_table: bool = False,
               link_search: bool = False) -> pd.DataFrame:
        """
        Search a dataset indexed by this Cheetah object. Text can be searched using
        query, and properties of the data can be filtered using year_filter,
        country_filter, author_filter, and affiliation_filter. If both query and
        filter(s) are used, the results of the search are intersected. Note that
        trying to use a filter that was never indexed by Cheetah will result in an
        error.

        Parameters
        ----------
        query: str, list, dict, NoneType
            A string or a list of strings to lookup. n-grams for n>1 should be split
            with whitespace. Note that query will be pre-processed by converting all
            characters to lowercase and stripping all extra whitespace.

            >>> query = 'laser'                         # a single word to lookup
            >>> query = {'laser': 'node'}               # a single word with negative query
            >>> query = ['laser', 'angle deflection']   # a word and bigram to lookup
            >>> query = [{'laser': ['blue', 'green']},  # a word and bigram to lookup with multiple
            ...          'angle deflection']            # negative search terms for the unigram
            >>> query = None                            # no query to lookup (using filters only)
        and_search: bool, optional
            This option applies when multiple queries are being looked up
            simultaneously. If True, the intersection of documents that match all
            queries is returned. Otherwise, the union. Default=True.
        in_title: bool, optional
            If True, searches for queries in the indexed title text. Default=True.
            NOTE: If in_title and in_abstract are both True, the union between these
            two query searches is returned.
        in_abstract: bool, optional
            If True, searches for queries in the indexed abstract text. Default=True.
            NOTE: If in_title and in_abstract are both True, the union between these
            two query searches is returned.
        save_path: str, optional
            The path at which to save the resulting subset DataFrame. If the path is
            not defined, the result of the search is returned. The default is None.
        author_filter: list, optional
            List of author ids that papers should be affiliated with. The default is [].
        affiliation_filter: list, optional
            List of affiliation ids that papers should be affiliated with. The default is [].
        country_filter: list, optional
            List of countries that papers should be affiliated with. The default is [].
        year_filter: list, optional
            List of years that papers should be published in. The default is [].
        ngram_window_size: int, optional
            The size of the window used in Cheetah.find_ngram(). This function is
            called if one or more entries in query are n-grams for n>1.
            ngram_window_size determines how many tokens can be examined at a time.
            For example, for the text 'aa bb bb cc cc dd', the query 'aa cc' will be
            found if the window size is >= 4. Default=5. This value should be greater
            than or equal to the length of the n-gram.
        ngram_ordered: bool
            The order used in Cheetah.find_ngram(). This function is called if one or
            more entries in query are n-grams for n>1. ngram_ordered determines if the
            order of tokens in the n-gram should be preserved while looking for a
            match. Default=True.
        do_results_table: bool, optional
            Flag that determines if a results table should be generated for the
            search. If True, this table will provide explainability for why certain
            documents were selected by Cheetah. If False, None is returned as the
            second argument. Default=False.
        link_search: bool, optional
            A flag that controls if the queries should be linked in the
            positive/negative inclusion step. For example, take a document that
            contains the queried terms "A" and "B", where positive or negative
            inclusion terms partnered with "B" would exclude the document. If this
            flag is set to True, the inclusion step is ignored because another query,
            "A", has already selected the document as being on-topic (hence linking
            the search). Default=False.

        Returns
        -------
        return_data: None, pd.DataFrame
            If save_path is not defined, return the search result (pd.DataFrame
            object). However, if save_path is defined, return None and save the
            result at save_path as a CSV file.
        results_table: None, pd.DataFrame
            If do_results_table is True then this argument will provide explainability
            for Cheetah filtering. Otherwise this argument is None.
        """
        # validate that data has been loaded and indexed
        if not self.indexed:
            raise ValueError('No index found! Call Cheetah.index() first!')

        # validate input to the function
        self.query = query
        self.ngram_ordered = ngram_ordered
        self.ngram_window_size = ngram_window_size

        # begin timing the search
        start = time.time()

        # unite all possible filters into a data structure that will allow easy filtering
        filters = [('year', year_filter, self.year_index),
                   ('affiliations', country_filter, self.country_index),
                   ('author_ids', author_filter, self.author_index),
                   ('affiliations', affiliation_filter, self.affiliation_index)]

        ## 1. Filter the data
        # use the filters to generate a set of desirable document ids.
        # if the filters exclude all documents, an empty set will be used
        filter_indices_map = {}
        using_filter_flag = False
        for col_filter, filter_values, filter_index in filters:
            if filter_values:  # filter values are None or empty list
                assert col_filter in self.columns, f"Attempted {col_filter} search but {col_filter} column does not exist!"
                filter_indices_map[(col_filter, ",".join([str(x) for x in filter_values]))] = \
                    self._filter_search(col_filter, filter_values, filter_index)
                using_filter_flag = True

        # intersect the results from each filter
        if filter_indices_map:
            filter_indices = set.intersection(*list(filter_indices_map.values()))
        else:
            filter_indices = set()
        if using_filter_flag and not filter_indices:
            warnings.warn('Selected filters returned an empty set', RuntimeWarning)

        ## 2. Search by query
        # if one or more queries are given, produce the search results for these
        # queries across all documents, without regard for the filters
        query_indices_map = {}
        if self.query:
            # use reverse index map to quickly lookup 1-grams and limit search space for n-grams
            query_indices_map = self._query_search(self.query, in_abstract, in_title, link_search)

            # deal with n-grams
            query_indices_map = self._ngram_check(query_indices_map, self.query, in_abstract, in_title)

            # unite the search results inclusively or exclusively depending on and_search value
            if and_search:
                # intersect list of sets to form single set
                query_indices = set.intersection(*list(query_indices_map.values()))
            else:
                # union list of sets to form single set
                query_indices = set.union(*list(query_indices_map.values()))

            # intersect results of query search with filter results (if using filtering)
            return_indices = query_indices
            if using_filter_flag:
                return_indices &= filter_indices
        else:
            return_indices = filter_indices

        # use the determined indices to select subset of papers from whole data
        return_data = self.data.loc[list(return_indices)].copy().reset_index(drop=True)
        return_data = return_data.set_index('slic_index')
        return_data.index.name = None
        self.last_search_result = return_data.copy()  # update object's last search

        # end timing of search
        end = time.time()

        # print computation time if verbose
        if self.verbose:
            print(f"Found {len(return_data)} papers in {round(end - start, 4)} seconds", file=sys.stderr)

        results_table = None
        if do_results_table:
            results_table = self._create_results_table(filter_indices_map, query_indices_map)

        # return results of search if no save_path passed, otherwise save
        if not save_path:
            return return_data, results_table
        else:
            return_data.to_csv(save_path, index=False)
            return None, results_table
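    # A minimal usage sketch (not part of the original module) for search(),
    # continuing the toy index() example above; the queries and filters are
    # illustrative:
    #
    # >>> results, _ = cheetah.search(query='laser', year_filter=[2020])
    # >>> results, table = cheetah.search(query=[{'laser': ['green']}, 'angle deflection'],
    # ...                                 and_search=False, do_results_table=True)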
    def _create_results_table(self, filter_results, query_results):
        """
        Creates a DataFrame that explains why and how documents were selected by
        Cheetah filters.

        Parameters
        ----------
        filter_results: dict
            Map of filters and their associated paper indices
        query_results: dict
            Map of queries and their associated paper indices

        Returns
        -------
        pd.DataFrame
            DataFrame that has filters and the papers that they included/excluded
        """
        data = {
            'filter_type': [],
            'filter_value': [],
            'num_papers': [],
            'included_ids': [],
        }

        for key, indices in filter_results.items():
            filter_type, filter_value = key
            num_included = len(indices)
            if indices:
                indices = [int(x) for x in indices]
                included_ids = ';'.join([str(x) for x in self.data.loc[indices]['slic_index'].to_list()])
            else:
                included_ids = None

            # add to DataFrame
            data['filter_type'].append(filter_type)
            data['filter_value'].append(filter_value)
            data['num_papers'].append(num_included)
            data['included_ids'].append(included_ids)

        for query, indices in query_results.items():
            num_included = len(indices)
            if indices:
                indices = [int(x) for x in indices]
                included_ids = ';'.join([str(x) for x in self.data.loc[indices]['slic_index'].to_list()])
            else:
                included_ids = None

            # add to DataFrame
            data['filter_type'].append('query')
            data['filter_value'].append(query)
            data['num_papers'].append(num_included)
            data['included_ids'].append(included_ids)

        results_table_df = pd.DataFrame.from_dict(data)
        results_table_df = results_table_df.sort_values(by=['filter_type', 'filter_value']).reset_index(drop=True)
        return results_table_df

    def _filter_search(self, col: str, filters: list, indexing: dict) -> set:
        """
        Searches the index for the given filters in the specified column.

        Parameters
        ----------
        col: str
            The name of the column to be searched.
        filters: list
            A list representing the filters to be applied
        indexing: dict
            A dictionary representing the index of the text data.

        Returns
        -------
        set
            A set of record IDs that match the specified filters
        """
        # pre-process input
        filters = [str(f).lower().strip() for f in filters]

        # get the indices matching filters
        filter_indices = []
        for f in filters:
            filter_indices.append(indexing.get(f, set()))

        # take the union of the filter results.
        # if multiple values in a filter are given, we combine the results.
        # EX: filters=[2018, 2019, 2020] -> get documents published in this 3-year range
        return set.union(*filter_indices)

    def _query_search(self, queries: list, in_abstract: bool, in_title: bool, link_search: bool) -> dict:
        """
        Searches the text data for the given queries in the specified columns.

        Parameters
        ----------
        queries: list
            A list of strings representing the queries to be searched.
        in_abstract: bool
            A boolean value indicating whether to search for queries in the abstract column.
        in_title: bool
            A boolean value indicating whether to search for queries in the title column.
        link_search: bool
            A flag that controls if the queries should be linked in the
            positive/negative inclusion step. For example, take a document that
            contains the queried terms "A" and "B", where positive or negative
            inclusion terms partnered with "B" would exclude the document. If this
            flag is set to True, the inclusion step is ignored because another query,
            "A", has already selected the document as being on-topic (hence linking
            the search).

        Returns
        -------
        index_map: dict
            A dict of query to record IDs that match the specified queries. Each
            record ID is a string representing the key in the original data
            dictionary for the record.

        Raises
        ------
        AssertionError
            If `in_abstract` or `in_title` is True but the corresponding column is
            not in the data. A RuntimeWarning is issued if both `in_abstract` and
            `in_title` are False.
        """
        # validate input
        if in_abstract:
            assert "abstract" in self.columns, "Attempted abstract search but abstract column does not exist!"
        if in_title:
            assert "title" in self.columns, "Attempted title search but title column does not exist!"

        # warn user if they attempt to search without selecting any search data
        if not in_abstract and not in_title:
            warnings.warn('Attempting to search a query without any data source. '
                          'Enable search in abstracts and/or title.', RuntimeWarning)

        # init the search results map. each value in this map will be a set of ids
        # corresponding to the searched query.
        #
        # NOTE: n-gram search will be performed later. For now, search results for n-grams just
        #       mean that each document id in the set contains all tokens from the query
        index_map = {}
        for query in queries:
            query_indices = []
            for q in query:
                term, negatives, positives = q
                q_indices = set()
                if in_title:
                    q_indices |= self.title_index.get(term, set())
                    q_indices = self._inclusion_search('title', q_indices, positives, negatives)
                if in_abstract:
                    q_indices |= self.abstract_index.get(term, set())
                    q_indices = self._inclusion_search('abstract', q_indices, positives, negatives)
                query_indices.append(q_indices)

            # update results for query.
            # add set of matching ids for a single query to the output map
            query_proc = [next(iter(x)) for x in query]
            index_map[" ".join(query_proc)] = set.intersection(*query_indices)

        if link_search:
            old_index_map = index_map.copy()
            index_map = {}
            for query in queries:
                query_indices = []
                for q in query:
                    term, negatives, positives = q
                    q_indices = set()
                    if in_title:
                        q_indices |= self.title_index.get(term, set())
                    if in_abstract:
                        q_indices |= self.abstract_index.get(term, set())
                    query_indices.append(q_indices)

                # update results for query
                query_proc = " ".join([next(iter(x)) for x in query])
                query_indices = set.intersection(*query_indices)
                linked_indices = add_with_union_of_others(old_index_map, query_indices, query_proc)
                index_map[query_proc] = linked_indices
        return index_map

    def _inclusion_search(self, text_index, q_indices, positives, negatives):
        assert text_index in ('title', 'abstract'), f'Invalid text_index {text_index!r}'
        if text_index == 'title':
            index = self.title_index
        if text_index == 'abstract':
            index = self.abstract_index

        # remove any documents that contain negated terms
        for n in negatives:
            q_indices -= index.get(n, set())

        # intersect the current set with documents containing positively included terms
        if positives:
            pos_q_indices = set.union(*[index.get(x, set()) for x in positives])
            q_indices &= pos_q_indices
        return q_indices

    def _ngram_check(self, all_indices, all_queries, in_abstract, in_title) -> dict:
        """
        Check for the occurrence of n-grams in a text or document based on some criteria.

        Parameters
        ----------
        all_indices: dict
            A map of query keys and index set values
        all_queries: list
            List of queries by which documents were filtered
        in_abstract: bool
            If True, check for the occurrence of n-grams in the abstract of the document.
        in_title: bool
            If True, check for the occurrence of n-grams in the title of the document.

        Returns
        -------
        dict
            The updated map of query keys to index sets, with the documents that fail
            the n-gram check removed.
        """
        # validate input
        if in_abstract:
            assert "abstract" in self.columns, "Attempted abstract search but abstract column does not exist!"
        if in_title:
            assert "title" in self.columns, "Attempted title search but title column does not exist!"
        assert len(all_indices) == len(all_queries), \
            f"Attempted to use {len(all_indices)} indices for {len(all_queries)} queries"

        for i, q in enumerate(all_queries):
            q = [next(iter(x)) for x in q]
            joined_query = ' '.join(q)
            index, query = all_indices.get(joined_query, set()), q
            index_data = self.data.loc[list(index)]
            if len(query) > 1 and not index_data.empty:
                query_indices = list()
                if in_abstract:  # join results for title and abstract
                    query_indices.append(self._ngram_check_helper(index_data, query, 'abstract'))
                if in_title:
                    query_indices.append(self._ngram_check_helper(index_data, query, 'title'))

                query_indices = set.intersection(*query_indices)
                all_indices[joined_query] -= query_indices
        return all_indices

    def _ngram_check_helper(self, index_data: pd.DataFrame, query: list, col: str) -> set:
        """
        Perform an n-gram check on the specified DataFrame, query tokens, and column name.

        Parameters
        ----------
        index_data : pd.DataFrame
            The DataFrame containing the indexed data.
        query : list
            The list of query tokens to check for as an n-gram.
        col : str
            The name of the column to check for n-grams.

        Returns
        -------
        set
            A set of indices of the rows in the DataFrame that do NOT contain the
            specified n-gram (candidates for removal).
        """
        # validate input
        assert col in self.columns, f"Attempted {col} search but {col} column does not exist!"

        # warn user if window size is too small
        if len(query) > self.ngram_window_size:
            warnings.warn(f"Attempting to find a {len(query)}-token query with a window of size "
                          f"{self.ngram_window_size}", RuntimeWarning)

        # check if ngram is found in each id
        to_remove = set()
        for idx, text in zip(index_data.index.to_list(), index_data[self.columns[col]].to_list()):
            if not self.find_ngram(text, query, self.ngram_window_size, self.ngram_ordered):
                to_remove.add(idx)
        return to_remove

    def _index_text(self, data: pd.DataFrame, column: str) -> dict:
        """
        Indexes the text data in the specified column of a DataFrame of records.

        Parameters
        ----------
        data : pd.DataFrame
            A DataFrame containing records of data to be indexed.
        column : str
            The name of the column containing text data to be indexed.

        Returns
        -------
        dict
            A dictionary representing the mapping.

        Raises
        ------
        ValueError
            If the `col` parameter is None.
        """
        if self.verbose:
            print(f"Indexing {column}")

        col = self.columns.get(column)
        if col is None:
            raise ValueError(f"Invalid column name for '{column}' provided")

        index_map = {}
        text_list = data[col].to_list()
        for paper_idx, text in tqdm(enumerate(text_list), total=len(text_list), disable=not self.verbose):
            if pd.isna(text):
                continue

            tokens = text.split()
            for word in tokens:
                word = word.strip()  # remove newlines, empty space
                word = word.lower()  # convert to lowercase
                if not word:  # word is an empty string
                    continue
                if word not in index_map:
                    index_map[word] = set()
                index_map[word].add(paper_idx)

        for token, indices in tqdm(index_map.items(), total=len(index_map), disable=not self.verbose):
            index_map[token] = set(indices)
        return index_map
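    # A minimal sketch (not part of the original module) of the reverse index shape
    # produced by _index_text(): each token maps to the set of row positions that
    # contain it. For the toy two-row DataFrame above, one would expect entries like:
    #
    #     {'laser': {0}, 'deflection': {0}, 'graph': {1}, 'clustering': {1}, ...}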
""" if self.verbose: print("Indexing years") col = self.columns.get('year') if col is None: raise ValueError(f"Invalid column name for 'year' provided") year_index = {} year_list = data[col].to_list() for paper_idx, year in tqdm(enumerate(year_list), total=len(year_list), disable=not self.verbose): if pd.isna(year): continue year = str(int(year)).strip().lower() if year not in year_index: year_index[year] = set() year_index[year].add(paper_idx) # for year, indices in tqdm(year_index.items(), total=len(year_index), disable=not self.verbose): # year_index[year] = set(indices) return year_index def _index_author(self, data:dict) -> dict: """ Indexes the author data of a dictionary of records. Parameters ---------- data : dict A dictionary containing records of data to be indexed. Returns ------- dict A dictionary representing the author index. Raises ------ ValueError If the `col` object parameter is None. """ if self.verbose: print("Indexing author IDs") col = self.columns.get('author_ids') if col is None: raise ValueError(f"Invalid column name for 'author_ids' provided") author_index = {} author_IDs = data[col].to_list() author_index_tmp = {} for paper_idx in tqdm(range(len(author_IDs)), disable= not self.verbose): curr_info = author_IDs[paper_idx] if pd.isna(curr_info): continue for author_id in curr_info.split(";"): author_id = str(author_id).lower().strip() if author_id in author_index_tmp: author_index_tmp[author_id].append(paper_idx) else: author_index_tmp[author_id] = [paper_idx] for token, indices in tqdm( author_index_tmp.items(), total=len(author_index_tmp), disable= not self.verbose ): author_index[token] = set(indices) return author_index def _index_affiliation_country(self, data:dict) -> tuple: """ Indexes the country affiliation data of a dictionary of records. Parameters ---------- data : dict A dictionary containing records of data to be indexed. Returns ------- dict A dictionary representing the country affiliation index. Raises ------ ValueError If the `col` object parameter is None. 
""" if self.verbose: print("Indexing affiliations and countries") col = self.columns.get('affiliations') if col is None: raise ValueError(f"Invalid column name for 'affiliations' provided") country_index = {} affiliation_index = {} affiliation_information_str = data[col].to_list() affiliation_index_tmp = {} country_index_tmp = {} for paper_idx, info in tqdm(enumerate(affiliation_information_str), total=len(affiliation_information_str), disable= not self.verbose): if pd.isna(info): continue if isinstance(info, str): curr_info_dict = ast.literal_eval(info) else: curr_info_dict = info for affil_id, affil_info_dict in curr_info_dict.items(): affil_id = str(affil_id).strip().lower() country = affil_info_dict["country"].strip().lower() # affiliation if str(affil_id) in affiliation_index_tmp: affiliation_index_tmp[str(affil_id)].append(paper_idx) else: affiliation_index_tmp[str(affil_id)] = [paper_idx] # country if str(country) in country_index_tmp: country_index_tmp[str(country)].append(paper_idx) else: country_index_tmp[str(country)] = [paper_idx] for token, indices in tqdm( country_index_tmp.items(), total=len(country_index_tmp), disable= not self.verbose ): country_index[token] = set(indices) for token, indices in tqdm( affiliation_index_tmp.items(), total=len(affiliation_index_tmp), disable= not self.verbose ): affiliation_index[token] = set(indices) return affiliation_index, country_index def _index_data(self, data:dict): """ Parameters ---------- Returns ------- """ if "abstract" in self.columns: abstract_index = self._index_text(data, 'abstract') else: abstract_index = {} if "title" in self.columns: title_index = self._index_text(data, 'title') else: title_index = {} if "year" in self.columns: year_index = self._index_year(data) else: year_index = {} if "author_ids" in self.columns: author_index = self._index_author(data) else: author_index = {} if "affiliations" in self.columns: affiliation_index, country_index = self._index_affiliation_country(data) else: affiliation_index, country_index = {}, {} return ( abstract_index, title_index, year_index, country_index, author_index, affiliation_index, ) # GETTERS @property def ngram_window_size(self) -> int: """ Get the numeric size of the ngram window. Parameters ---------- None Returns ------- int ngram window size """ return self._ngram_window_size @property def ngram_ordered(self) -> bool: """ Get the status of ngram_ordered. Parameters ---------- None Returns ------- bool Status of ngram ordering """ return self._ngram_ordered @property def query(self) -> Union[list, str, None]: """ Get the last query of the object. Parameters ---------- None Returns ------- Union[list, str, None] The last query of the object, which can be either a list or a string. """ return self._query @property def columns(self) -> dict: """ Retrieve the columns. Parameters ---------- None Returns ------- dict A dictionary containing column names as keys and column values as values. """ return self._columns # SETTERS @ngram_window_size.setter def ngram_window_size(self, ngram_window_size:int) -> None: """ Set or update the size of the n-gram window of the object. Parameters ---------- ngram_window_size : int The new value for the size of the n-gram window of the object. 
Returns ------- None """ if not isinstance(ngram_window_size, int): self.__error_unexpected_type(int, ngram_window_size) if ngram_window_size < 1: raise ValueError("ngram_window_size cannot be less than 1") self._ngram_window_size = ngram_window_size @ngram_ordered.setter def ngram_ordered(self, ngram_ordered:bool) -> None: """ Set or update a flag that controls whether n-grams should be ordered or not. Parameters ---------- ngram_ordered : bool If True, n-grams will be ordered; if False, n-grams will be unordered. Returns ------- None """ if not isinstance(ngram_ordered, bool): self.__error_unexpected_type(bool, ngram_ordered) self._ngram_ordered = ngram_ordered @query.setter def query(self, query: Union[list, str, dict, None]) -> None: """ Set or update the query of the object. Parameters ---------- query : list, dict, str, None The new value for the query of the object. Returns ------- None """ def process_entry(entry): def validate_dependent_terms(terms): for term in terms: if len(term.split()) > 1: raise ValueError(f"Expected single token in dependent term, but got {term!r}") if isinstance(entry, str): return [(word, [], []) for word in entry.split()] elif isinstance(entry, dict): if len(entry) != 1: raise ValueError('Expected query dict to only contain 1 element') term = next(iter(entry)) dependents = entry[term] if isinstance(dependents, str): validate_dependent_terms([dependents]) dependents = [dependents] elif isinstance(dependents, list): validate_dependent_terms(dependents) else: self.__error_unexpected_type({str, list}, dependents) # process the dependent term(s) to determine which are being # negated and which are being included negatives = [] positives = [] for dep in dependents: if dep.startswith('+'): # positive terms are expected to start with '+' positives.append(dep[1:]) # take off the plus sign else: negatives.append(dep) # negative terms are just the term with no symbols return [(word, negatives, positives) for word in term.split()] else: self.__error_unexpected_type({str, list, dict}, entry) # query is None, an empty string, empty dict, or an empty list if not query: self._query = None else: processed = [] if isinstance(query, str) or isinstance(query, dict): processed.append(process_entry(query)) elif isinstance(query, list): for entry in query: processed.append(process_entry(entry)) else: self.__error_unexpected_type({str, list, dict, type(None)}, query) self._query = processed @columns.setter def columns(self, columns:dict) -> None: """ Set or update the columns of a data structure or object. Parameters ---------- columns : dict A dictionary containing column names as keys and column values as values. Returns ------- None """ if columns is None: columns = Cheetah.COLUMNS else: if not isinstance(columns, dict): self.__error_unexpected_type(dict, columns) del_list = [] data_columns = set(self.data.columns.to_list()) for col_name, col_value in columns.items(): if col_value not in data_columns: del_list.append(col_name) for d in del_list: if d in columns: warnings.warn(f"'{columns[d]}' not found in DataFrame. Removing the {d} index", RuntimeWarning) del columns[d] if not columns: raise ValueError('No valid columns remain to be indexed') else: self._columns = columns # UTIL @classmethod def __error_unexpected_type(cls, expected_type, var) -> ValueError: """ Raises an ValueError exception Parameters ---------- cls: expected_type: var: Returns ------- ValueError """ raise TypeError(f"Expected {expected_type} but instead got {type(var)}: {var}")
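# A minimal end-to-end sketch (not part of the original module). The file paths and
# column contents are hypothetical; the DataFrame is assumed to follow Cheetah.COLUMNS
# and to have been pre-cleaned (e.g., with Vulture simple clean):
#
# if __name__ == "__main__":
#     df = pd.read_csv("papers.csv")                      # hypothetical papers table
#     cheetah = Cheetah(verbose=True)
#     cheetah.index(df, index_file="cheetah_index.p")     # indices are cached for reuse
#     subset, table = cheetah.search(query=['laser', 'angle deflection'],
#                                    year_filter=[2020, 2021],
#                                    do_results_table=True)
#     print(subset.head())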