Source code for TELF.pre_processing.Orca.orca

import os
import ast
import copy
import pickle
import warnings 
import pandas as pd
import networkx as nx
from tqdm import tqdm 

from .AuthorMatcher import AuthorMatcher

class Orca:
    # Code path for precomputed duplicates: a pre-computed Scopus duplicates
    # file containing known duplicate entries. If no duplicates are computed
    # with DAF, this file is used instead for duplicate removal.
    # DUPLICATES_1M = 'scopus_1m_cited_collab_matches.p'

    def __init__(self, duplicates=None, s2_duplicates=None, verbose=False):
        self.slic_df = None
        self.duplicates = duplicates
        self.s2_duplicates = s2_duplicates
        self.verbose = verbose

    def _run_scopus(self, df):
        """
        Helper function for creating a SLIC map file for a dataset that only
        contains Scopus information
        """
        # generate a map of scopus ids to affiliations
        affiliations_map = self.__generate_affiliations_map(df)

        # generate author maps
        scopus_author_map = self.__generate_author_map(df, 'author_ids', 'authors')

        # correct duplicates
        duplicates = {}
        for entry in self.duplicates:
            if not entry & scopus_author_map.keys():
                continue

            entry = entry.copy()  # avoid modifying duplicates in place
            best_id = sorted(entry, key=lambda x: len(scopus_author_map.get(x, '')), reverse=True)[0]
            merged_affiliations = self.__merge_scopus_affiliations(entry, affiliations_map)
            affiliations_map[best_id] = merged_affiliations

            # remove old duplicate ids from both maps
            entry.remove(best_id)
            for x in entry:
                del scopus_author_map[x]
                del affiliations_map[x]

            # add duplicates for tracking purposes
            duplicates[best_id] = list(entry)

        ## generate SLIC IDs
        slic_df = {
            'slic_id': [],
            'slic_name': [],
            'scopus_ids': [],
            'scopus_names': [],
            'scopus_affiliations': [],
            's2_ids': [],
            's2_names': [],
        }

        for i, scopus_id in enumerate(scopus_author_map):
            scopus_name = scopus_author_map.get(scopus_id)
            scopus_affiliations = affiliations_map.get(scopus_id)
            if scopus_id in duplicates:
                scopus_id = ';'.join([scopus_id] + duplicates[scopus_id])

            slic_df['slic_id'].append(f'S{i}')
            slic_df['slic_name'].append(scopus_name)
            slic_df['scopus_ids'].append(scopus_id)
            slic_df['scopus_names'].append(scopus_name)
            slic_df['scopus_affiliations'].append(scopus_affiliations)
            slic_df['s2_ids'].append(None)
            slic_df['s2_names'].append(None)

        slic_df = pd.DataFrame.from_dict(slic_df)
        return slic_df
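    # Illustrative construction (a sketch; the ids below are hypothetical):
    #   orca = Orca(duplicates=[{'7004212771', '57190298845'}], verbose=True)
    # Each set in `duplicates` groups Scopus author ids believed to belong to
    # one person; _run_scopus merges their names and affiliations under the id
    # that carries the longest known author name.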
    def run(self, df, scopus_duplicates=None, s2_duplicates=None, known_matches=None, n_jobs=-1):
        """
        Run Orca and form SLIC ids for a given dataset

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC dataframe for which author SLIC ids need to be created
        scopus_duplicates: list(set), optional
            A list of sets where each set contains scopus author ids that refer to the same person.
            In the ideal case, each author only has one scopus id. However, this ideal does not hold
            up in practice and some authors are represented by two or more scopus ids. Duplicate
            authors can be found using the Orca.DuplicateAuthorFinder tool. If not provided, a
            pre-computed scopus duplicate map is used (pre-computed from 1 million Scopus papers).
            If provided, this map is overridden by the user input. Default is None.
        s2_duplicates: list(set), optional
            A list of sets where each set contains s2 author ids that refer to the same person.
            If not provided, s2 author ids are not scanned for duplicates and are only compared
            against scopus matches during duplicate detection. Default is None.
        known_matches: dict, optional
            A dict of s2 id keys to scopus id values. This dictionary is used to override the
            author matching if groundtruth is known. This is useful for helping the tool work
            around edge cases. Default is None.

        Returns
        -------
        slic_df: pandas.DataFrame
            The computed SLIC id map (also stored on the Orca object)
        """
        # process duplicates (if passed)
        if scopus_duplicates is not None:
            self.duplicates = scopus_duplicates
        if s2_duplicates is not None:
            self.s2_duplicates = s2_duplicates

        # make sure that the passed dataframe meets the expected standard
        valid, error = self.__verify_df(df)
        if not valid:
            # if only the S2 columns are missing, fall back to the Scopus-only pipeline
            scopus_only = {'eid', 'authors', 'author_ids', 'affiliations'} <= set(df.columns)
            if not scopus_only:
                raise ValueError(error)
            self.slic_df = self._run_scopus(df)
            return self.slic_df

        # generate a map of scopus ids to affiliations
        affiliations_map = self.__generate_affiliations_map(df)

        # generate author maps
        s2_author_map = self.__generate_author_map(df, 's2_author_ids', 's2_authors')
        scopus_author_map = self.__generate_author_map(df, 'author_ids', 'authors')

        # match scopus author ids to s2 author ids
        known_matches = {} if not known_matches else known_matches
        am = AuthorMatcher(df, n_jobs=n_jobs, verbose=self.verbose)
        am_df = am.match(known_matches=known_matches)

        # process scopus duplicates
        am_enriched = self.__add_scopus_duplicates(am_df, self.duplicates)
        am_df = pd.concat([am_df, am_enriched], axis=0, ignore_index=True)

        ## generate SLIC IDs
        slic_count = 0
        slic_df = {
            'slic_id': [],
            'slic_name': [],
            'scopus_ids': [],
            'scopus_names': [],
            'scopus_affiliations': [],
            's2_ids': [],
            's2_names': [],
        }

        # 1. assign SLIC IDs to author ids that have correspondence between s2 and scopus
        seen_s2 = set()
        seen_scopus = set()
        matches = self.__uncouple_author_matches(am_df, self.s2_duplicates)
        for entry in matches:
            s2_ids = entry['s2']
            s2_names = {s2_author_map.get(x, 'Unknown') for x in s2_ids if x in s2_author_map}
            scopus_ids = entry['scopus']
            scopus_names = {scopus_author_map[x] for x in scopus_ids if x in scopus_author_map}
            scopus_affiliations = self.__merge_scopus_affiliations(scopus_ids, affiliations_map)
            slic_name = entry['name']

            seen_s2 |= s2_ids
            seen_scopus |= scopus_ids

            slic_df['slic_id'].append(f'S{slic_count}')
            slic_df['slic_name'].append(slic_name)
            slic_df['scopus_ids'].append(';'.join(scopus_ids))
            slic_df['scopus_names'].append(';'.join(scopus_names))
            slic_df['scopus_affiliations'].append(scopus_affiliations)
            slic_df['s2_ids'].append(';'.join(s2_ids))
            slic_df['s2_names'].append(';'.join(s2_names))
            slic_count += 1

        # 2. assign SLIC IDs to scopus ids that did not have correspondence
        df_scopus_authors = {x for y in df.author_ids.to_list() if not pd.isna(y) for x in y.split(';')}
        df_scopus_authors -= seen_scopus
        for scopus_id in df_scopus_authors:
            scopus_name = scopus_author_map.get(scopus_id, None)
            slic_df['slic_id'].append(f'S{slic_count}')
            slic_df['slic_name'].append(scopus_name)
            slic_df['scopus_ids'].append(scopus_id)
            slic_df['scopus_names'].append(scopus_name)
            slic_df['scopus_affiliations'].append(affiliations_map.get(scopus_id, None))
            slic_df['s2_ids'].append(None)
            slic_df['s2_names'].append(None)
            slic_count += 1

        # 3. assign SLIC IDs to remaining s2 authors
        df_s2_authors = {x for y in df.s2_author_ids.to_list() if not pd.isna(y) for x in y.split(';')}
        df_s2_authors -= seen_s2
        for s2_id in df_s2_authors:
            if s2_id in seen_s2:
                continue

            # update the common fields
            slic_df['slic_id'].append(f'S{slic_count}')
            slic_df['scopus_ids'].append(None)
            slic_df['scopus_names'].append(None)
            slic_df['scopus_affiliations'].append(None)

            # handle s2 duplicates if they exist
            s2_dup_ids = self.s2_duplicates.get(s2_id)
            if s2_dup_ids is not None:
                s2_ids = {s2_id} | set(s2_dup_ids)
                s2_names = {s2_author_map.get(x, 'Unknown') for x in s2_ids if x in s2_author_map}
                if s2_names == {'Unknown'}:  # handle case where all names are missing
                    s2_names = set()

                # get the slic name
                try:
                    slic_name = max(s2_names, key=len)
                    slic_name = None if slic_name == 'Unknown' else slic_name
                except ValueError:
                    slic_name = None

                # update the data map
                slic_df['slic_name'].append(slic_name)
                s2_ids_str = ';'.join(s2_ids) if s2_ids else None
                slic_df['s2_ids'].append(s2_ids_str)
                s2_names = ';'.join(s2_names) if s2_names else None
                slic_df['s2_names'].append(s2_names)
                seen_s2 |= s2_ids
            else:
                s2_name = s2_author_map.get(s2_id, None)
                slic_df['slic_name'].append(s2_name)
                slic_df['s2_ids'].append(s2_id)
                slic_df['s2_names'].append(s2_name)
                seen_s2.add(s2_id)

            # increment the slic id identifier
            slic_count += 1

        slic_df = pd.DataFrame.from_dict(slic_df)
        slic_df = slic_df.loc[slic_df.slic_name != 'Unknown'].copy().reset_index(drop=True)
        self.slic_df = slic_df
        return slic_df
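    # Illustrative usage of run() (a sketch; `papers_df` is assumed to be a
    # SLIC-format DataFrame with both Scopus and S2 columns):
    #   orca = Orca(verbose=True)
    #   slic_map = orca.run(papers_df, n_jobs=4)
    #   slic_map[['slic_id', 'slic_name', 'scopus_ids', 's2_ids']].head()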
    def apply(self, df, slic_df=None):
        """
        Apply the SLIC id mapping to a SLIC papers dataframe

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC dataframe for which author SLIC ids need to be created
        slic_df: pandas.DataFrame, optional
            A pre-computed DataFrame with SLIC id mappings. This parameter is provided for the rare
            cases where a SLIC map is shared between multiple datasets (i.e. dataset B is a subset
            of A and slic_df was computed for A). Be aware that setting a value for slic_df is not
            recommended! If using this parameter, verify that all desired scopus/s2 authors have
            existing SLIC ids. To be sure of the validity of your results, use Orca.run() before
            using Orca.apply() and do not pass a value for this parameter.

        Returns
        -------
        orca_df: pandas.DataFrame
            df with standardized author information (columns for 'SLIC_ids' and 'SLIC_affiliations')
        """
        if slic_df is None and self.slic_df is None:
            raise ValueError('No SLIC ID map found. First, compute the map with Orca.run()')
        if slic_df is not None and self.slic_df is not None:
            warnings.warn('[Orca]: slic_df was passed as an argument, however this Orca object already has a '
                          'stored slic_df object.\n\t\tOverwriting stored slic_df with given argument. If this '
                          'message is unexpected, use Orca.apply() without specifying `slic_df`', RuntimeWarning)
        if slic_df is not None:
            self.slic_df = slic_df

        # verify that paper scopus ids and s2 ids are unique
        if 'eid' in df.columns and df.eid.nunique() != len(df.loc[~df.eid.isnull()]):
            df = df[~df['eid'].duplicated(keep='first') | df['eid'].isna()].copy()
            warnings.warn('[Orca]: Encountered duplicate Scopus IDs (`eid`) in df. Dropping duplicate papers.')
        if 's2id' in df.columns and df.s2id.nunique() != len(df.loc[~df.s2id.isnull()]):
            df = df[~df['s2id'].duplicated(keep='first') | df['s2id'].isna()].copy()
            warnings.warn('[Orca]: Encountered duplicate S2 IDs (`s2id`) in df. Dropping duplicate papers.')

        # replace scopus and s2 author ids respectively
        if 's2id' in df.columns and 'eid' in df.columns:
            scopus_df = self.__compute_slic_scopus(df)
            s2_df = self.__compute_slic_s2(df)

            # merge and build output dataframe
            df2 = pd.merge(df, scopus_df, on='eid', how='outer')
            df3 = pd.merge(df2, s2_df, on='s2id', how='outer')
            orca_df = df3.copy()
            orca_df['slic_author_ids'] = orca_df['slic_author_ids_x'].combine_first(orca_df['slic_author_ids_y'])
            orca_df = orca_df.drop(columns=['slic_author_ids_x', 'slic_author_ids_y'])
        elif 's2id' in df.columns:
            s2_df = self.__compute_slic_s2(df)
            orca_df = pd.merge(df, s2_df, on='s2id', how='outer')
        else:
            scopus_df = self.__compute_slic_scopus(df)
            orca_df = pd.merge(df, scopus_df, on='eid', how='outer')

        if orca_df.slic_author_ids.isna().any():
            original_len = len(orca_df)
            orca_df.dropna(subset=['slic_author_ids'], inplace=True)
            warnings.warn(f'[Orca]: Found {original_len - len(orca_df)} papers with missing '
                          'SLIC author IDs. Dropping these papers.')

        # add a column of slic author names using the matched slic ids
        slic_authors = {k: v for k, v in zip(self.slic_df.slic_id.to_list(), self.slic_df.slic_name.to_list())}

        def map_ids_to_names(ids):
            if pd.isna(ids):
                return None
            names = [slic_authors.get(str(i), '') for i in ids.split(';')]
            names = [name for name in names if name]
            return ';'.join(names)

        orca_df['slic_authors'] = orca_df['slic_author_ids'].apply(map_ids_to_names)
        return orca_df.reset_index(drop=True)
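    # Typical end-to-end flow (a sketch continuing the example above):
    #   orca_df = orca.apply(papers_df)
    #   orca_df[['slic_author_ids', 'slic_authors']].head()
    # apply() re-uses the slic_df stored by run(), so no argument is needed.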
    def __verify_df(self, df):
        """
        Verify that the given papers dataframe matches the SLIC standard and can be used with Orca

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC papers DataFrame for which author SLIC ids need to be created

        Returns
        -------
        flag: bool
            If True, df passes the test and can be used with Orca
        error: str, None
            If flag is True, None is returned. Otherwise a string with the encountered error is provided
        """
        must_have = {'eid', 'authors', 'author_ids', 'affiliations', 's2_authors', 's2_author_ids'}
        columns = set(df.columns)
        if columns & must_have != must_have:
            return False, f'The columns {list(must_have - columns)} are missing in `df`'
        return True, None

    def __verify_slic_df(self, slic_df):
        """
        Verify that the given SLIC id map DataFrame contains the expected columns and can be used with Orca

        Parameters
        ----------
        slic_df: pandas.DataFrame
            A pre-computed DataFrame with SLIC id mappings.

        Returns
        -------
        flag: bool
            If True, slic_df passes the test and can be used with Orca
        error: str, None
            If flag is True, None is returned. Otherwise a string with the encountered error is provided
        """
        must_have = {'slic_id', 'slic_name', 'scopus_ids', 'scopus_names', 's2_ids', 's2_names'}
        columns = set(slic_df.columns)
        if columns & must_have != must_have:
            return False, f'The columns {list(must_have - columns)} are missing in `slic_df`'
        return True, None

    def __add_scopus_duplicates(self, auth_df, duplicates):
        """
        Helper function that enriches the results of AuthorMatcher with any previously detected
        Scopus duplicates. These duplicates are given a shared S2 id that will be used to connect
        them in the succeeding SLIC id creation steps.

        Parameters
        ----------
        auth_df: pandas.DataFrame
            The results of AuthorMatcher on the working DataFrame
        duplicates: list(set), optional
            A list of sets where each set contains scopus author ids that refer to the same person.
            In the ideal case, each author only has one scopus id. However, this ideal does not
            hold up in practice and some authors are represented by two or more scopus ids.
            Duplicate authors can be found using the Orca.DuplicateAuthorFinder tool.

        Returns
        -------
        out_df: pandas.DataFrame
            DataFrame that matches the shape of auth_df but contains entries to flag scopus duplicates
        """
        out_df = pd.DataFrame(columns=auth_df.columns)
        all_df_ids = set(auth_df.SCOPUS_Author_ID.to_list())
        if self.verbose:
            print('[Orca]: Scanning for Scopus duplicates in dataset. . .')

        for scopus_ids in tqdm(duplicates, total=len(duplicates), disable=not self.verbose):
            if not scopus_ids & all_df_ids:
                continue

            tmp_df = auth_df.loc[auth_df.SCOPUS_Author_ID.isin(scopus_ids)]
            if len(tmp_df.SCOPUS_Author_ID.unique()) > 1:
                row = tmp_df.iloc[0].copy()
                for scopus_id in tmp_df.SCOPUS_Author_ID.unique():
                    if row.SCOPUS_Author_ID == scopus_id:
                        continue
                    new_row = row.copy()
                    new_row.SCOPUS_Author_ID = scopus_id
                    out_df = pd.concat([out_df, new_row.to_frame().T], ignore_index=True)
        return out_df

    def __add_s2_duplicates(self, duplicates):
        """
        Converts a list of sets into a dictionary such that each key in the dictionary is an
        element from a set, and its value is a list of the other elements in that set. Each set
        should contain s2 author ids that are known duplicates of each other. No two sets can
        share an author id; all known duplicates of an s2 author id should be contained within
        a single set. Sets with a single id (non-duplicates) will be ignored.

        Parameters
        ----------
        duplicates: list(set)
            A list of sets of s2 author ids to be processed.

        Returns
        -------
        dict:
            The processed s2 duplicates which will be resolved in a later processing step.

        Raises
        ------
        ValueError:
            If an id appears in more than one set within the list.

        Example
        -------
        >>> self.__add_s2_duplicates([{1,2}, {3,4}, {5}])
        {1: [2], 2: [1], 3: [4], 4: [3]}
        >>> self.__add_s2_duplicates([{1,2}, {2,3}])
        ValueError
        """
        out_dict = {}
        seen = set()
        for s in duplicates:
            if any(elem in seen for elem in s):
                raise ValueError('Detected multiple entries for s2 duplicates across sets. '
                                 'Make sure that all known duplicates are constrained to a single set')
            seen.update(s)
            if len(s) == 1:
                continue
            for element in s:
                out_dict[element] = [x for x in s if x != element]
        return out_dict

    def __propagate_duplicates(self, a_map, a_duplicates):
        """
        Propagate the ids associated with keys in a_map to their duplicate keys. For each key in
        a_map, if duplicate keys exist in a_duplicates, the associated values from a_map are
        propagated to these duplicate keys. The 'a'/'b' notation is used to keep this function
        generic so that it can be used to go from s2 --> scopus or scopus --> s2.

        Parameters
        ----------
        a_map: dict
            The main author dictionary that is to be updated.
        a_duplicates: dict
            Dictionary mapping keys to lists of their duplicates.

        Returns
        -------
        None
            a_map is modified in place.
        """
        a_map_update = {}

        # create an update map based on duplicates
        for a_id, b_ids in a_map.items():
            if a_id in a_duplicates:
                for dup_a_id in a_duplicates[a_id]:
                    if dup_a_id not in a_map_update:
                        a_map_update[dup_a_id] = set()
                    a_map_update[dup_a_id] |= set(b_ids)

        # convert set to list for each key in the update map
        for dup_a_id in a_map_update:
            a_map_update[dup_a_id] = list(a_map_update[dup_a_id])

        # update the main map in place
        a_map.update(a_map_update)
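    # Sketch of __propagate_duplicates on hypothetical ids: given
    #   a_map = {'X': {'1'}} and a_duplicates = {'X': ['Y']}
    # the call leaves a_map == {'X': {'1'}, 'Y': ['1']}, so the duplicate key
    # 'Y' inherits the ids already associated with 'X'.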
    def __uncouple_author_matches(self, auth_df, s2_duplicates):
        """
        Helper function that takes the authors DataFrame produced by AuthorMatcher (optionally
        enriched with scopus duplicates) and finds out which sets of author ids represent the same
        individual. This is done by building a graph of author id relationships.

        Take for example the following two maps. In this case letters are scopus IDs and numbers
        are S2 IDs. Each map presents the relationship between scopus and S2 from the perspective
        of the key dataset. There are only 2 authors, but they are represented by 2 scopus / 3 S2
        IDs for the first author and 1 scopus / 2 S2 IDs for the second author.

        >>> scopus_map = {'A': [1,2], 'B': [4], 'C': [3,5]}
        >>> s2_map = {1: ['A'], 2: ['A'], 3: ['C'], 4: ['A'], 5: ['C']}

        For bigger datasets, these relationships can grow complex and are best modeled by a graph.
        Both scopus and S2 IDs are nodes in this graph and their relationships can be modeled with
        edges between them. This graph is very disconnected, as there will be many unique authors
        in any given SLIC dataset. However, each weakly connected component of the graph signifies
        that all author id nodes in said component belong to the same author.

        Parameters
        ----------
        auth_df: pandas.DataFrame
            The results of AuthorMatcher on the working DataFrame
        s2_duplicates: dict
            Map of s2 author ids to lists of their known duplicate s2 ids

        Returns
        -------
        matches: list
            A list of dictionaries. Each dictionary in the list contains the keys 'scopus' and
            's2', whose values are sets of corresponding scopus/s2 ids, and 'name', the chosen
            SLIC name.
        """
        # create the two maps necessary for processing
        s2_map = auth_df.groupby('S2_Author_ID')['SCOPUS_Author_ID'].agg(set).to_dict()
        scopus_map = auth_df.groupby('SCOPUS_Author_ID')['S2_Author_ID'].agg(set).to_dict()

        # handle s2 duplicates
        self.__propagate_duplicates(s2_map, s2_duplicates)

        # ensure that no s2 id == scopus id by coincidence
        s2_map = {f'B_{k}': {f'A_{x}' for x in v} for k, v in s2_map.items()}
        scopus_map = {f'A_{k}': {f'B_{x}' for x in v} for k, v in scopus_map.items()}

        # also create a name map which we will use to pick the SLIC name
        s2_name_map = auth_df.groupby('S2_Author_ID')['S2_Author_Name'].agg(lambda x: max(x, key=len)).to_dict()

        # setup the graph
        G = nx.DiGraph()
        for k, v_set in scopus_map.items():
            for v in v_set:
                G.add_edge(k, v)
        for k, v_set in s2_map.items():
            for v in v_set:
                G.add_edge(k, v)

        # get the list of components and process them
        components = list(nx.weakly_connected_components(G))
        matches = []
        for component_set in components:
            mdict = {'scopus': set(), 's2': set()}
            for c in component_set:
                if c.startswith('A_'):
                    mdict['scopus'].add(c[2:])
                else:
                    mdict['s2'].add(c[2:])

            # get the longest s2 name to use as slic name
            str_gen = ((pid, s2_name_map[pid]) for pid in mdict['s2'] if pid in s2_name_map)
            _, name = max(str_gen, key=lambda x: len(x[1]), default=(None, 'Unknown'))
            mdict['name'] = name
            matches.append(mdict)
        return matches
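    # With the docstring's example maps, the two weakly connected components
    # would resolve to (illustrative output, component order arbitrary):
    #   [{'scopus': {'A', 'B'}, 's2': {'1', '2', '4'}, 'name': ...},
    #    {'scopus': {'C'}, 's2': {'3', '5'}, 'name': ...}]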
    def __generate_author_map(self, df, id_col, name_col):
        """
        Helper function that generates a map of author ids to author names

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC papers DataFrame for which author SLIC ids need to be created
        id_col: str
            The author ids column. Options are ['author_ids', 's2_author_ids']
        name_col: str
            The author names column. Options are ['authors', 's2_authors']

        Returns
        -------
        auth_map: dict
            Map where keys are author ids and values are author names
        """
        if self.verbose:
            print(f'[Orca]: Generating {id_col}-{name_col} map. . .')

        auth_map = {}
        tmp_df = df.dropna(subset=[id_col, name_col])
        for id_list, auth_list in tqdm(zip(tmp_df[id_col].to_list(), tmp_df[name_col].to_list()),
                                       total=len(tmp_df), disable=not self.verbose):
            for auth_id, name in zip(id_list.split(';'), auth_list.split(';')):
                if auth_id not in auth_map:
                    auth_map[auth_id] = name
        return auth_map

    def __compute_slic_scopus(self, df):
        """
        Helper function that applies the SLIC id map to papers that have scopus information

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC papers DataFrame for which author SLIC ids need to be created

        Returns
        -------
        scopus_df: pandas.DataFrame
            papers DataFrame that contains SLIC id and affiliation information
        """
        tmp_df = df.loc[~df['eid'].isnull()]  # get only scopus papers

        # create maps of scopus paper id to authors and affiliations
        scopus_authors, scopus_affiliations = {}, {}
        for eid, author_ids, affiliations in zip(tmp_df['eid'].to_list(),
                                                 tmp_df['author_ids'].to_list(),
                                                 tmp_df['affiliations'].to_list()):
            if not pd.isna(author_ids):
                scopus_authors[eid] = author_ids
            if not pd.isna(affiliations):
                if isinstance(affiliations, str):
                    affiliations = ast.literal_eval(affiliations)
                scopus_affiliations[eid] = affiliations

        scopus_df = {
            'eid': [],
            'slic_author_ids': [],
            'slic_affiliations': [],
        }

        # compute map of scopus author id to slic author id
        scopus_to_slic = {x: k for k, v in zip(self.slic_df.slic_id.to_list(),
                                               self.slic_df.scopus_ids.to_list())
                          if not pd.isna(v) for x in v.split(';')}

        missing_authors = set()
        for eid in tmp_df.eid.to_list():
            slic_author_ids = []

            # first replace author_ids information
            author_ids = scopus_authors.get(eid)
            if author_ids is not None:
                for scopus_id in author_ids.split(';'):
                    scopus_id = str(scopus_id)  # should already be a string but hard cast to make sure
                    slic_id = scopus_to_slic.get(scopus_id)
                    if slic_id is None:
                        missing_authors.add(scopus_id)
                    else:
                        slic_author_ids.append(str(slic_id))

            # next update affiliations structure
            aff_dict, del_dict = {}, []
            affiliations = scopus_affiliations.get(eid)
            if affiliations is not None:
                for aff_id, aff_info_shallow in affiliations.items():
                    del_list = []  # items to remove
                    aff_info = copy.deepcopy(aff_info_shallow)
                    for i in range(len(aff_info['authors'])):
                        scopus_id = str(aff_info['authors'][i])
                        if scopus_id not in scopus_to_slic:
                            del_list.append(aff_info['authors'][i])  # keep the original value for removal
                            missing_authors.add(scopus_id)
                        else:
                            aff_info['authors'][i] = scopus_to_slic[scopus_id]

                    for d in del_list:
                        aff_info['authors'].remove(d)
                    if not aff_info['authors']:
                        del_dict.append(aff_id)
                    aff_dict[aff_id] = aff_info

                for d in del_dict:
                    del aff_dict[d]

            scopus_df['eid'].append(eid)
            if not slic_author_ids:
                scopus_df['slic_author_ids'].append(None)
            else:
                scopus_df['slic_author_ids'].append(';'.join(slic_author_ids))
            if not aff_dict:
                scopus_df['slic_affiliations'].append(None)
            else:
                scopus_df['slic_affiliations'].append(aff_dict)

        if len(missing_authors) > 0:
            warnings.warn(f'[Orca]: {len(missing_authors)} Scopus IDs did not have a corresponding '
                          'SLIC ID and were removed')

        scopus_df = pd.DataFrame.from_dict(scopus_df)
        return scopus_df
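    # The scopus_to_slic / s2_to_slic comprehensions in these two helpers
    # flatten the SLIC map; e.g. a row with slic_id 'S0' and ids '111;222'
    # (hypothetical values) contributes {'111': 'S0', '222': 'S0'}, so every
    # duplicate id resolves to the same SLIC id.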
    def __compute_slic_s2(self, df):
        """
        Helper function that applies the SLIC id map to papers that have S2 information

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC papers DataFrame for which author SLIC ids need to be created

        Returns
        -------
        s2_df: pandas.DataFrame
            papers DataFrame that contains SLIC id information
        """
        # tmp_df = df.loc[df['eid'].isnull()]  # get only s2 papers
        tmp_df = df.loc[~df['s2id'].isnull()]

        # compute map of s2 author id to slic author id
        s2_to_slic = {x: k for k, v in zip(self.slic_df.slic_id.to_list(),
                                           self.slic_df.s2_ids.to_list())
                      if not pd.isna(v) for x in v.split(';')}

        s2_df = {
            's2id': [],
            'slic_author_ids': [],
        }

        missing_authors = set()
        for s2id, s2_author_ids in zip(tmp_df['s2id'].to_list(), tmp_df['s2_author_ids'].to_list()):
            slic_author_ids = []
            if not pd.isna(s2_author_ids):
                for s2_auth_id in s2_author_ids.split(';'):
                    if s2_auth_id not in s2_to_slic:
                        missing_authors.add(s2_auth_id)
                    else:
                        slic_author_ids.append(s2_to_slic[s2_auth_id])

            s2_df['s2id'].append(s2id)
            if slic_author_ids:
                s2_df['slic_author_ids'].append(';'.join(slic_author_ids))
            else:
                s2_df['slic_author_ids'].append(None)

        if len(missing_authors) > 0:
            warnings.warn(f'[Orca]: {len(missing_authors)} S2 IDs did not have a corresponding '
                          'SLIC ID and were removed')

        s2_df = pd.DataFrame.from_dict(s2_df)
        return s2_df

    def __load_pickle(self, fn):
        """
        Helper function for loading pickle files saved in the data package.
        If upgrading to python >=3.9, change this function to make use of importlib.resources

        Parameters
        ----------
        fn: str
            The file name to be loaded

        Returns
        -------
        object
            python object stored in the pickle file
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(current_dir, 'data', fn), 'rb') as fh:
            return pickle.load(fh)
    def __generate_affiliations_map(self, df):
        """
        Helper function for computing a map of affiliations for scopus authors.
        The output map is a dict that adheres to the following structure:
        {
            SCOPUS_AUTHOR_ID: {
                SCOPUS_AFFILIATION_ID: {
                    'name': NAME,        # the name of the affiliation
                    'country': COUNTRY,  # country associated with the affiliation
                    'first_seen': XXXX,  # year when the first known paper was published by the author with this affiliation
                    'last_seen': XXXX,   # year when the last known paper was published by the author with this affiliation
                    'papers': XXXX,      # set of known papers with this affiliation. NOT guaranteed to contain all papers
                },
                ...
            },
            ...
        }

        Parameters
        ----------
        df: pandas.DataFrame
            The SLIC papers DataFrame for which author SLIC ids need to be created

        Returns
        -------
        affiliations_map: dict
            The created map
        """
        affiliations_map = {}
        for eid, year, affiliations in zip(df.eid.to_list(), df.year.to_list(), df.affiliations.to_list()):
            # handle missing / unconverted affiliations
            if pd.isna(affiliations):
                continue
            if isinstance(affiliations, str):
                affiliations = ast.literal_eval(affiliations)

            # get the year
            if pd.isna(year):
                year = 0
            else:
                year = int(year)

            for aff_id, info in affiliations.items():
                aff_name = info.get('name', 'Unknown')
                aff_country = info.get('country', 'Unknown')
                for auth_id in info.get('authors', []):
                    if auth_id not in affiliations_map:
                        affiliations_map[auth_id] = {}
                    if aff_id not in affiliations_map[auth_id]:
                        affiliations_map[auth_id][aff_id] = {}

                    first_seen = affiliations_map[auth_id][aff_id].get('first_seen', 1 * 10**4)
                    last_seen = affiliations_map[auth_id][aff_id].get('last_seen', -1)

                    if 'name' not in affiliations_map[auth_id][aff_id]:
                        affiliations_map[auth_id][aff_id]['name'] = aff_name
                        affiliations_map[auth_id][aff_id]['country'] = aff_country
                        affiliations_map[auth_id][aff_id]['first_seen'] = year
                        affiliations_map[auth_id][aff_id]['last_seen'] = year
                        affiliations_map[auth_id][aff_id]['papers'] = {eid}
                    else:
                        affiliations_map[auth_id][aff_id]['papers'].add(eid)
                        if year < first_seen or first_seen == 0:
                            affiliations_map[auth_id][aff_id]['first_seen'] = year
                        if year > last_seen or last_seen == 0:
                            affiliations_map[auth_id][aff_id]['last_seen'] = year

        # handle the missing years
        for auth_id in affiliations_map:
            for aff_id, aff_info in affiliations_map[auth_id].items():
                if not aff_info['first_seen']:
                    affiliations_map[auth_id][aff_id]['first_seen'] = None
                if not aff_info['last_seen']:
                    affiliations_map[auth_id][aff_id]['last_seen'] = None
        return affiliations_map
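    # Illustrative affiliations_map entry (hypothetical ids and values):
    #   {'7004212771': {'60000001': {'name': 'LANL', 'country': 'United States',
    #                                'first_seen': 2015, 'last_seen': 2021,
    #                                'papers': {'eid1', 'eid2'}}}}
    # When duplicate author ids share an affiliation, __merge_scopus_affiliations
    # below keeps the earliest first_seen, the latest last_seen, and the union
    # of the papers sets.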
""" merged = {} all_keys = set(k for key in scopus_ids if key in data for k in data[key].keys()) for key in all_keys: items = [data[k][key] for k in scopus_ids if k in data and key in data[k]] names = [item['name'] for item in items if item['name'] != 'Unknown'] countries = [item['country'] for item in items if item['country'] != 'Unknown'] papers = list(set.union(*[item['papers'] for item in items])) merged[key] = { 'name': names[0] if names else 'Unknown', # use first valid affiliation name, or 'Unknown' if no valid names 'country': countries[0] if countries else 'Unknown', # use first valid affiliation country, or 'Unknown' if no valid countries 'first_seen': min((item['first_seen'] for item in items if isinstance(item['first_seen'], int)), default='Unknown'), 'last_seen': max((item['last_seen'] for item in items if isinstance(item['last_seen'], int)), default='Unknown'), 'papers': papers } merged = merged if merged else None # {} --> None return merged ### GETTERS / SETTERS @property def duplicates(self): return self._duplicates @property def s2_duplicates(self): return self._s2_duplicates @duplicates.setter def duplicates(self, duplicates): if duplicates is None: self._duplicates = [] elif isinstance(duplicates, list): self._duplicates = duplicates else: raise TypeError(f' {type(duplicates)} is an invalid type for `duplicates`') """ Code where we have a precomputed duplicates. @duplicates.setter def duplicates(self, duplicates): if duplicates is None: self._duplicates = self.__load_pickle(self.DUPLICATES_1M) elif isinstance(duplicates, list): self._duplicates = self.__load_pickle(self.DUPLICATES_1M) + duplicates else: raise TypeError(f' {type(duplicates)} is an invalid type for `duplicates`') """ @s2_duplicates.setter def s2_duplicates(self, s2_duplicates): if s2_duplicates is None: self._s2_duplicates = {} elif isinstance(s2_duplicates, list): self._s2_duplicates = self.__add_s2_duplicates(s2_duplicates) else: raise TypeError(f' {type(s2_duplicates)} is an invalid type for `s2_duplicates`')