File size: 4,206 Bytes
691ae9d 7fb5b91 691ae9d 57bf1c3 691ae9d 57bf1c3 691ae9d 7fb5b91 691ae9d 57bf1c3 7fb5b91 691ae9d 7fb5b91 691ae9d 57bf1c3 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 0e439ee 691ae9d 57bf1c3 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 57bf1c3 691ae9d 7fb5b91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
def load_data(file_obj):
    """Read an uploaded Excel workbook into a DataFrame.

    ``file_obj`` may be a filesystem path or a file-like object (e.g. a
    Gradio upload); ``pd.read_excel`` accepts both directly.
    """
    frame = pd.read_excel(file_obj)
    return frame
def initialize_models():
    """Load and return the sentence-embedding model used by the pipeline."""
    # Same checkpoint as before; downloaded/cached by sentence-transformers.
    embedding_model = SentenceTransformer("all-mpnet-base-v2")
    return embedding_model
def generate_embeddings(df, model, Column):
    """Encode each row's text into a sentence embedding.

    For every row whose ``Column`` value is a string, the text is encoded
    (prefixed by the row's ``Title`` when that column exists and the value is
    also a string) and the resulting tensor is stored in a new ``Embeddings``
    column.  Rows without string content get ``np.nan`` instead, so downstream
    code can filter on ``isinstance(x, torch.Tensor)``.

    Args:
        df: Input DataFrame; mutated in place (``Embeddings`` column added).
        model: Any object exposing ``encode(text, convert_to_tensor=True)``,
            e.g. a SentenceTransformer.
        Column: Name of the column holding the text to embed.

    Returns:
        The same DataFrame with the ``Embeddings`` column attached.
    """
    embeddings_list = []
    # Loop-invariant: check for the optional Title column once, not per row.
    has_title = 'Title' in df.columns
    for _, row in df.iterrows():
        text = row[Column]
        if isinstance(text, str):
            # Prepend the title for richer context when it is available.
            if has_title and isinstance(row['Title'], str):
                content = row['Title'] + "\n" + text
            else:
                content = text
            embeddings_list.append(model.encode(content, convert_to_tensor=True))
        else:
            # Non-string cells (NaN, numbers, ...) cannot be embedded.
            embeddings_list.append(np.nan)
    df['Embeddings'] = embeddings_list
    return df
def process_categories(categories, model):
    """Build a category DataFrame with one embedding per description.

    Args:
        categories: Records convertible to a DataFrame; each must carry a
            ``description`` field.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        DataFrame of the categories plus an ``Embeddings`` column.
    """
    df_cate = pd.DataFrame(categories)
    # Encode every description, preserving row order.
    df_cate['Embeddings'] = [
        model.encode(description, convert_to_tensor=True)
        for description in df_cate['description']
    ]
    return df_cate
def match_categories(df, category_df, treshold=0.45):
    """Annotate each row of ``df`` with the categories its embedding matches.

    For every row embedding that is a tensor, cosine similarity against all
    category embeddings is computed; categories scoring above ``treshold``
    contribute their description, experts, topic and score.  Rows without a
    tensor embedding receive NaN placeholders (and the literal string
    ``'pas interessant'`` in the score column, kept for compatibility with
    existing consumers of the output file).

    Args:
        df: DataFrame with an ``Embeddings`` column (tensors or NaN).
        category_df: DataFrame with ``Embeddings``, ``description``,
            ``experts`` and ``topic`` columns; assumed to have a default
            RangeIndex so positional and label indexing agree.
        treshold: Minimum cosine similarity to count as a match.
            NOTE(review): parameter name spelled 'treshold' (sic) — kept so
            keyword callers keep working.

    Returns:
        The same DataFrame with ``Description``, ``Expert``, ``Topic`` and
        ``Score`` columns added.
    """
    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    # The stacked category-embedding matrix is loop-invariant: build it once
    # instead of re-stacking it for every row of df.
    category_embeddings = torch.stack(list(category_df['Embeddings']), dim=0)
    for ebd_content in df['Embeddings']:
        if isinstance(ebd_content, torch.Tensor):
            cos_scores = util.cos_sim(ebd_content, category_embeddings)[0]
            high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]
            # Collect the matching categories, experts, topics and scores.
            categories_list.append([category_df.loc[index, 'description'] for index in high_score_indices])
            experts_list.append([category_df.loc[index, 'experts'] for index in high_score_indices])
            topic_list.append([category_df.loc[index, 'topic'] for index in high_score_indices])
            scores_list.append([float(cos_scores[index]) for index in high_score_indices])
        else:
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single flat list."""
    flattened = []
    # Iterative depth-first walk; reversed pushes keep left-to-right order.
    pending = list(reversed(nested_list))
    while pending:
        item = pending.pop()
        if isinstance(item, list):
            pending.extend(reversed(item))
        else:
            flattened.append(item)
    return flattened
def save_data(df, filename):
    """Serialize the annotated DataFrame to a ``*_classified`` Excel file.

    List-valued annotation columns are joined into comma-separated strings
    (``Expert`` is recursively flattened first, since its entries may be
    nested lists); non-list cells (NaN / sentinel strings) pass through
    unchanged.  The ``Embeddings`` column is dropped before writing.

    Args:
        df: DataFrame produced by ``match_categories``; mutated in place.
        filename: Path of the source workbook; the output name is derived
            from it.

    Returns:
        Path of the written ``*_classified`` workbook.
    """
    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
    # Tensors are not serializable to Excel and are no longer needed.
    df = df.drop(columns=['Embeddings'])
    # BUG FIX: the old `filename.replace(".", "_classified.")` replaced EVERY
    # dot (e.g. "report.v2.xlsx" -> "report_classified.v2_classified.xlsx").
    # Insert the suffix before the LAST dot only.
    if '.' in filename:
        stem, _, ext = filename.rpartition('.')
        new_filename = f"{stem}_classified.{ext}"
    else:
        new_filename = filename + "_classified"
    df.to_excel(new_filename, index=False)
    return new_filename
def classification(column, file_path, categories, treshold):
    """Run the full pipeline: load, embed, match categories, save.

    Args:
        column: Name of the text column to classify.
        file_path: Path (or file-like object) of the input Excel workbook.
        categories: Category records with description/experts/topic fields.
        treshold: Cosine-similarity cutoff passed to ``match_categories``.

    Returns:
        Tuple of (path of the written classified workbook, annotated DataFrame).
    """
    data = load_data(file_path)
    encoder = initialize_models()
    data = generate_embeddings(data, encoder, column)
    categories_df = process_categories(categories, encoder)
    annotated = match_categories(data, categories_df, treshold=treshold)
    return save_data(annotated, file_path), annotated
|