from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline


def run_inference(
    df,
    INPUT,
    TASK,
    classifier,
    label_mapping,
    rev_map,
    task_label_mapping,
    is_sentencelevel=True,
    max_sentence_chars=800,
):
    """Run ``classifier`` over one text column of ``df`` and collect one result per row.

    Args:
        df: DataFrame holding the texts to classify.
        INPUT: Name of the column in ``df`` that contains the text.
        TASK: Key selecting the entries of ``label_mapping`` and
            ``task_label_mapping`` to use.
        classifier: Callable that takes a string and returns a sequence whose
            first element is a dict with ``"label"`` and ``"score"`` keys.
        label_mapping: ``label_mapping[TASK]`` maps the values of ``rev_map``
            to the label names used in the results.
        rev_map: Maps the classifier's ``"label"`` output to a key of
            ``label_mapping[TASK]``.
        task_label_mapping: ``task_label_mapping[TASK]`` is a
            ``(label_tracked, other_label)`` pair. Only read in
            sentence-level mode.
        is_sentencelevel: When true, each text is split on ``"."`` and every
            fragment is classified separately; otherwise the whole text is
            classified in a single call.
        max_sentence_chars: Fragments with at least this many characters are
            skipped in sentence-level mode.

    Returns:
        A list with one tuple per row of ``df``:

        * sentence-level mode: ``(ratio, confidence)`` where ``ratio`` is
          ``count(label_tracked) / (count(label_tracked) + count(other_label))``
          over the classified fragments and ``confidence`` is the mean
          classifier score. Either value is ``nan`` when it is undefined
          (no fragment was classified, or neither label was predicted).
        * otherwise: ``(label_name, score)`` for the whole text.

    Raises:
        KeyError: If ``INPUT`` is not a column of ``df`` or ``TASK`` is not a
            key of ``label_mapping``.
    """
    id_to_name = label_mapping[TASK]
    undefined = float("nan")
    inferences = []

    # Iterate the column directly instead of building a row Series per index.
    for text in tqdm(df[INPUT], total=len(df), ascii=True):
        if not is_sentencelevel:
            output = classifier(text)[0]
            inferences.append(
                (id_to_name[rev_map[output["label"]]], output["score"])
            )
            continue

        labels = []
        scores = []
        for sentence in text.split("."):
            # Splitting on "." leaves empty/whitespace fragments (for example
            # after a trailing period); classifying those as a bare "." would
            # add a spurious vote to every text, so they are skipped.
            if not sentence.strip():
                continue
            if len(sentence) >= max_sentence_chars:
                continue
            output = classifier((sentence + ".").lower())[0]
            labels.append(id_to_name[rev_map[output["label"]]])
            scores.append(output["score"])

        label_tracked, other_label = task_label_mapping[TASK]
        counts = Counter(labels)
        decided = counts[label_tracked] + counts[other_label]

        # Guard both divisions: every fragment may have been skipped, or the
        # classifier may have produced neither of the two tracked labels.
        ratio = counts[label_tracked] / decided if decided else undefined
        confidence = sum(scores) / len(scores) if scores else undefined
        inferences.append((ratio, confidence))

    return inferences


# TODO: remove when model is fixed :/
def compute_agentic_communal(df, hallucination=False):
    """Placeholder: fill ``per_ac`` and ``con_ac`` with uniform random numbers.

    This stands in for the real model-backed implementation (kept commented
    out below). The values written are NOT model outputs.

    Args:
        df: DataFrame to annotate; it is modified in place.
        hallucination: Currently ignored by this placeholder.

    Returns:
        The same ``df`` object with ``per_ac`` and ``con_ac`` columns set to
        independent draws from ``np.random.rand``.
    """
    df['per_ac'] = np.random.rand(len(df))
    df['con_ac'] = np.random.rand(len(df))
    return df


# Need clarification on which model to load before re-enabling the
# implementation below.
# def compute_agentic_communal(df,hallucination=False):
#     model_path = "./checkpoints/checkpoint-48"
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     model = AutoModelForSequenceClassification.from_pretrained(model_path)
#     classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
#     rev_map = {v: k for k, v in model.config.id2label.items()}
#     if hallucination:
#         INPUT = "hallucination"
#     else:
#         # need to tell users what this should be called
#         # TODO: change this to the correct column name
#         INPUT = "TEXT"
#     TASK = "ac_classifier"
#     task_label_mapping = {
#         # Track percentage agentic / percentage agentic + percentage communal
#         "ac_classifier": ("agentic", "communal"),
#     }
#     label_mapping = {
#         "ac_classifier": {
#             0: "communal",
#             1: "agentic",
#         }
#     }
#     inferences = run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping)
#     df["per_ac"] = [i[0] for i in inferences]
#     df["con_ac"] = [i[1] for i in inferences]
#     return df