# Source code for TELF.pre_processing.Squirrel.pruners.llm_prune

import json
import logging
import math
from pathlib import Path
from typing import Union

import pandas as pd
from tqdm.auto import tqdm

from ....helpers.llm import get_ollama_llm, vote_once, build_json_vote_prompt

log = logging.getLogger(__name__)

class LLMPruner:
    """
    Perform LLM-based refinement on an embedding-pruned dataset, annotating
    each document with a boolean acceptance column.

    NOTE(review): earlier docstrings referred to an ``llm_accept`` column,
    but the code writes the column named by ``self.NAME`` (``'llm_prune'``).
    The documentation below follows the code.
    """

    def __init__(
        self,
        llm_model_name: str,
        llm_api_url: str,
        llm_vote_trials: int,
        llm_promote_threshold: float,
        llm_temperature: float,
        verbose: bool = True,
    ):
        """
        Parameters
        ----------
        llm_model_name : str
            Ollama model identifier (e.g. ``"llama3.1:405b"``).
        llm_api_url : str
            Base URL for the Ollama API.
        llm_vote_trials : int
            Number of independent votes per document.
        llm_promote_threshold : float
            Fraction of "yes" votes required to accept a previously
            rejected document.
        llm_temperature : float
            Sampling temperature for the LLM.
        verbose : bool
            Whether to show tqdm progress bars.
        """
        self.llm_vote_trials = llm_vote_trials
        self.llm_promote_threshold = llm_promote_threshold
        self.verbose = verbose
        # Name of the boolean annotation column written onto the DataFrame.
        self.NAME = 'llm_prune'
        self.llm = get_ollama_llm(
            model=llm_model_name,
            base_url=llm_api_url,
            temperature=llm_temperature
        )

    def _vote_on_document(self, df, idx, row, ctx, keep, fout, data_column) -> None:
        """
        Run ``llm_vote_trials`` independent LLM votes on a single document,
        write the decision into ``df``, and append a JSON record to ``fout``.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame containing the dataset (mutated in place).
        idx : int
            Index of the current row in the DataFrame.
        row : pd.Series
            The current row in the DataFrame.
        ctx : list
            Context examples passed to the vote prompt.
        keep : set
            Set of accepted indices; `idx` is added on a positive decision.
        fout : file
            Open file handle the JSONL voting record is appended to.
        data_column : str
            Column name containing the text to be voted on.
        """
        prompt = build_json_vote_prompt(row[data_column], ctx)
        # Each vote is a (bool-ish answer, reason string) pair.
        votes = [vote_once(self.llm, prompt) for _ in range(self.llm_vote_trials)]
        yes_count = sum(int(v[0]) for v in votes)
        reasons = [v[1] for v in votes]
        # Promote when the "yes" fraction reaches the threshold
        # (ceil so e.g. threshold 0.5 over 3 trials requires 2 yes votes).
        threshold = math.ceil(self.llm_promote_threshold * self.llm_vote_trials)
        decision = yes_count >= threshold
        df.at[idx, self.NAME] = decision
        record = {
            "index": idx,
            "answers": ["yes" if v[0] else "no" for v in votes],
            "reasons": reasons,
            "final": decision,
        }
        fout.write(json.dumps(record) + "\n")
        if decision:
            keep.add(idx)

    def __call__(self, df, output_dir, label_column, reference_label, data_column) -> pd.DataFrame:
        """
        Run LLM voting across all rows, annotate the ``self.NAME`` column,
        save vote records and the annotated DataFrame, and return it.

        Parameters
        ----------
        df : pd.DataFrame
            The input DataFrame to be processed (annotated in place).
        output_dir : str or Path
            Directory under which an ``llm_outputs`` folder is created.
        label_column : str
            Column name indicating class labels.
        reference_label : Union[int, str]
            Rows whose ``label_column`` equals this value are accepted
            without voting.
        data_column : str
            Column name containing the text to be voted on.

        Returns
        -------
        pd.DataFrame
            The input DataFrame augmented with the boolean ``self.NAME``
            column.
        """
        output_dir = Path(output_dir) / "llm_outputs"
        output_dir.mkdir(parents=True, exist_ok=True)

        decisions_path = output_dir / "llm_decisions.jsonl"
        # Rows already carrying the reference label are accepted outright.
        keep = set(df[df[label_column] == reference_label].index)

        # FIX: the context examples do not depend on the current row, so
        # build them once instead of re-slicing the DataFrame per iteration.
        # NOTE(review): context is the first 10 rows of `df`, not the rows
        # matching `reference_label` — confirm this is intended.
        ctx = df.head(10)[data_column].tolist()

        # Append mode: repeated runs accumulate vote records in the JSONL.
        with decisions_path.open("a", encoding="utf-8") as fout:
            for idx, row in tqdm(df.iterrows(), total=len(df),
                                 disable=not self.verbose, desc="LLM voting"):
                if idx in keep:
                    df.at[idx, self.NAME] = True
                    continue
                # Perform voting and record results
                self._vote_on_document(df, idx, row, ctx, keep, fout, data_column)

        # Save the annotated DataFrame
        out_path = output_dir / "llm_pruned.csv"
        df.to_csv(out_path, index=False)
        log.info("Saved LLM-annotated DataFrame → %s", out_path)
        return df