import ast
import os
import pickle
import random

import pandas as pd

SPEAKER_ROLE_MAP = {"Agent": 0, "Visitor": 1}
LABEL_MAP = {
    "Curiosity": 0,
    "Obscene": 1,
    "Informative": 2,
    "Openness": 3,
    "Acceptance": 4,
    "Interest": 5,
    "Greeting": 6,
    "Disapproval": 7,
    "Denial": 8,
    "Anxious": 9,
    "Uninterested": 10,
    "Remorse": 11,
    "Confused": 12,
    "Accusatory": 13,
    "Annoyed": 14,
}

def process_user_input(input: str):
    """Parse the user input and return a list of rows, where each row is a list with
    the format `[<conversation_id>, <speaker>, <message>]`.

    Args:
        input (str): the input of the user, where each line has the format
            `<speaker>:<message>`. Only one message per line.

    Returns:
        dict: a dictionary indicating whether the input was successfully processed
            and, if so, the processed data of the input.
    """
    if not input:
        return {"success": False, "message": "Input must not be an empty string!"}

    data = []
    for line in input.split("\n"):
        if line == "":
            continue
        try:
            speaker, message = line.split(":", 1)
            if speaker != "Agent" and speaker != "Visitor":
                return {"success": False, "message": f"Invalid speaker {speaker}"}
            # Assuming there's only one input conversation,
            # give it a dummy conversation id of epik_0.
            data.append(["epik_0", speaker, message])
        except ValueError:
            # Raised when a line has no ":" separator to unpack.
            return {"success": False, "message": "Invalid Input"}
    return {
        "success": True,
        "message": "Success",
        "data": data,
    }
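
# Illustrative example (a minimal sketch): the parser keeps any whitespace that
# follows the ":" separator as part of the message.
# >>> process_user_input("Agent: Hi\nVisitor: Hello")
# {'success': True, 'message': 'Success',
#  'data': [['epik_0', 'Agent', ' Hi'], ['epik_0', 'Visitor', ' Hello']]}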

def encode_speaker_role(role):
    return SPEAKER_ROLE_MAP.get(role, 1)


def decode_speaker_role(role_numeric):
    for role, numeric_val in SPEAKER_ROLE_MAP.items():
        if role_numeric == numeric_val:
            return role
    return "Unknown Speaker"


def encode_sentiment_label(label):
    return LABEL_MAP.get(label, -1)


def decode_numeric_label(label_numeric):
    for label, numeric_val in LABEL_MAP.items():
        if label_numeric == numeric_val:
            return label
    return "Unknown Label"
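
# Illustrative round trip through the maps above (a sketch of expected values):
# >>> encode_speaker_role("Agent")
# 0
# >>> decode_numeric_label(6)
# 'Greeting'
# >>> encode_sentiment_label("NotALabel")  # unknown labels fall back to -1
# -1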

def prepare_csv(data: list[list], output_path: str, with_label: bool = False):
    """
    Process and group the speakers, messages, and labels (if any) by conversation
    ids. This function is useful for preparing the necessary csv file before
    converting it into a pickle file.

    Args:
        data (list[list]): A list containing the rows of a dataframe. Each row
            contains values representing the conversation id, speaker role, message
            (, and label if any) in this order.
        output_path (str): path to write the csv file.
        with_label (bool, optional): Whether the input data contains labels (i.e., for
            training) or not (i.e., for making predictions on a new sample).
            Defaults to False.
    """
    columns = ["ConversationId", "ParticipantRole", "Text"]
    if with_label:
        columns += ["Label"]
    df = pd.DataFrame(data=data, columns=columns)
    # encode the participant role
    df["ParticipantRoleEncoded"] = df["ParticipantRole"].apply(encode_speaker_role)
    # encode the labels
    if with_label:
        df["LabelNumeric"] = df["Label"].apply(encode_sentiment_label)
    else:
        # Give the new input dummy labels to match the model input shape
        df["LabelNumeric"] = -1
    # group the data into lists based on conversation id
    agg_params = {"Label": list} if with_label else {}
    agg_params.update(
        {
            "ParticipantRole": list,
            "ParticipantRoleEncoded": list,
            "Text": list,
            "LabelNumeric": list,
        }
    )
    grouped_df = df.groupby("ConversationId").agg(agg_params).reset_index()
    grouped_df.to_csv(output_path, index=False, encoding="ascii")
    return grouped_df
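
# Illustrative example (a sketch; "sample.csv" is a hypothetical path): turning the
# parsed user input into a grouped, one-row-per-conversation csv.
# >>> parsed = process_user_input("Agent: Hi\nVisitor: Hello")
# >>> grouped = prepare_csv(parsed["data"], "sample.csv")
# >>> grouped["Text"].iloc[0]
# [' Hi', ' Hello']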

def convert_to_pickle(
    source: str,
    dest: str,
    index_col: str = None,
    list_type_columns: list = [],
    order=[],
    exclude=[],
    single_tuple=False,
):
    """Convert a csv file into a pickle file with the format
        col1, col2, ..., coln

    Args:
        source (str): path to the csv file.
        dest (str): the location where the pickle file will be stored.
        index_col (str): the column with unique ids that serves as the index.
            Defaults to None.
        list_type_columns (list, optional): columns stored as string representations
            of lists that should be converted back to lists. Defaults to [].
        order (list, optional): specify the order for one or many columns from left to
            right, followed by the columns not in `order`.
        exclude (list, optional): columns to be excluded from the result.
            Defaults to [].
        single_tuple (bool): whether or not to keep the output as a tuple when there
            is only one single column. Defaults to False.
    """
    df = pd.read_csv(source)
    df = df.drop(columns=exclude)
    # convert columns from the string representation of a list back to a list
    for col in list_type_columns:
        if col in df.columns:
            df[col] = df[col].fillna("[]").apply(ast.literal_eval)
    if index_col is not None:
        df = df.set_index(index_col)
    # reorder the columns
    if order != []:
        left = df[order]
        right = df[[col for col in df.columns if col not in order]]
        df = pd.concat([left, right], axis=1)
    output = ()
    for col in df.columns:
        output += (df[col].to_dict(),)
    if not single_tuple and len(output) == 1:
        output = output[0]
    with open(dest, "wb") as f:
        pickle.dump(output, f)
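
# Illustrative example (a sketch; file names are hypothetical and the columns assume
# the csv written by prepare_csv): the resulting pickle holds one dict per column,
# keyed by the index column.
# >>> convert_to_pickle(
# ...     "sample.csv",
# ...     "sample.pkl",
# ...     index_col="ConversationId",
# ...     list_type_columns=["ParticipantRoleEncoded", "Text", "LabelNumeric"],
# ...     order=["Text"],
# ...     exclude=["ParticipantRole"],
# ... )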

def split_and_save_ids(
    ids, train_ratio=0.8, test_ratio=0.1, valid_ratio=0.1, dir=".", seed=None
):
    """
    Randomly split a list of IDs into training, testing, and validation sets and save
    them to text files.

    Args:
        ids (list): List of IDs to be split.
        train_ratio (float): Ratio of IDs for the training set (default is 0.8).
        test_ratio (float): Ratio of IDs for the testing set (default is 0.1).
        valid_ratio (float): Ratio of IDs for the validation set (default is 0.1).
        dir (str): Path to the directory where the ID files are saved (default is ".").
        seed (int): Seed for randomization (default is None).

    Returns:
        train_set (list): List of IDs in the training set.
        test_set (list): List of IDs in the testing set.
        valid_set (list): List of IDs in the validation set.
    """
    # Check that the ratios add up to 1.0 (allowing for floating-point error,
    # since 0.8 + 0.1 + 0.1 is not exactly 1.0 in floating point)
    assert abs(train_ratio + test_ratio + valid_ratio - 1.0) < 1e-9, (
        "Ratios should add up to 1.0"
    )
    # Set the random seed for reproducibility
    if seed is not None:
        random.seed(seed)
    # Shuffle a copy of the list of IDs so the caller's list is left untouched
    ids = list(ids)
    random.shuffle(ids)
    # Calculate the split points
    train_split = int(len(ids) * train_ratio)
    test_split = train_split + int(len(ids) * test_ratio)
    # Split the IDs
    train_set = ids[:train_split]
    test_set = ids[train_split:test_split]
    valid_set = ids[test_split:]
    # Save the sets to text files
    def save_to_txt(file_path, id_set):
        with open(file_path, "w") as file:
            id_strings = [str(conv_id) for conv_id in id_set]
            file.write("\n".join(id_strings))

    save_to_txt(os.path.join(dir, "train_set.txt"), train_set)
    save_to_txt(os.path.join(dir, "test_set.txt"), test_set)
    save_to_txt(os.path.join(dir, "validation_set.txt"), valid_set)
    return train_set, test_set, valid_set
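
# Illustrative example (a sketch): with ten IDs and the default 0.8/0.1/0.1 ratios the
# split is 8/1/1, and train_set.txt, test_set.txt, and validation_set.txt are written
# to the current directory.
# >>> ids = [f"epik_{i}" for i in range(10)]
# >>> train, test, valid = split_and_save_ids(ids, seed=42)
# >>> len(train), len(test), len(valid)
# (8, 1, 1)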

def merge_pkl_with_ids(pickle_src: str, ids_files: list, dir: str = "."):
    """Merge an existing pickle file with id files, resulting in a pickle file with 3
    more fields: train_ids, test_ids, and valid_ids.

    Args:
        pickle_src (str): the path to the pickle file.
        ids_files (list): list of files that contain ids. Example:
            ["train_set.txt", "test_set.txt", "validation_set.txt"]. Each file should
            contain one single unique id on each line.
        dir (str, optional): the directory for ids_files. Defaults to ".".
    """
    ids_set = ()
    for filename in ids_files:
        ids = []
        path = os.path.join(dir, filename)
        with open(path, "r") as file:
            for line in file:
                ids.append(line.strip())
        ids_set += (ids,)
    # Append the ID lists to the existing pickled tuple and write it back in place
    with open(pickle_src, "rb") as file:
        data = pickle.load(file)
    data += ids_set
    with open(pickle_src, "wb") as file:
        pickle.dump(data, file)
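
# Illustrative example (a sketch; paths are hypothetical): appending the three ID
# lists produced by split_and_save_ids to the tuple stored in an existing pickle file.
# >>> merge_pkl_with_ids(
# ...     "sample.pkl",
# ...     ["train_set.txt", "test_set.txt", "validation_set.txt"],
# ...     dir=".",
# ... )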