TestApp / components /data_loading.py
menikev's picture
Upload 9 files
d2ed505 verified
import pandas as pd
import torch
def preparing_data(text: str, domain: int):
    """Wrap a single user example in a two-row DataFrame.

    A fixed placeholder row is placed first and the caller's example second.
    Per the original author's note, the downstream model cannot run inference
    on a single example, hence the dummy row.

    Args:
        text (_str_): input text from the user
        domain (_int_): output domain from the domain identification pipeline

    Returns:
        _DataFrame_: frame with columns ``text`` and ``domain``; row 0 is the
        placeholder, row 1 is the supplied example
    """
    # Placeholder example that always occupies the first row.
    placeholder_text, placeholder_domain = 'hello world', 0

    return pd.DataFrame(
        {
            'text': [placeholder_text, text],
            'domain': [placeholder_domain, domain],
        }
    )
def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every row of *df* and collect the results into tensors.

    Args:
        tokenizer: callable invoked as ``tokenizer(text)`` for each text. Its
            result is indexed with the keys ``"token_id"`` and ``"mask"``, and
            both values must be tensors that ``torch.cat`` can join along
            dim 0 (presumably shaped ``(1, seq_len)`` with a common
            ``seq_len`` -- confirm against the tokenizer in use).
        df: DataFrame with a ``text`` column and a ``domain`` column. It must
            contain at least one row, since ``torch.cat`` cannot concatenate
            an empty list.

    Returns:
        tuple: ``(input_ids, input_masks, input_domains)`` -- the per-row
        token ids and masks concatenated along dim 0, and a tensor built from
        the ``domain`` column, all in row order.
    """
    # Read the columns by position. Integer indexing on a Series
    # (``series[i]``) is label-based, so it fails or picks the wrong rows on a
    # filtered or re-indexed DataFrame.
    texts = df["text"].to_numpy()
    domains = df["domain"].to_numpy()

    ids = []
    masks = []
    for text in texts:
        token = tokenizer(text)
        ids.append(token["token_id"])
        masks.append(token["mask"])

    input_ids = torch.cat(ids, dim=0)
    input_masks = torch.cat(masks, dim=0)
    input_domains = torch.tensor(list(domains))
    return input_ids, input_masks, input_domains