File size: 1,290 Bytes
d2ed505 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import pandas as pd
import torch
def preparing_data(text: str, domain: int):
    """
    Wrap a single user request into the two-row DataFrame used for inference.

    Args:
        text (_str_): input text from the user
        domain (_int_): domain id produced by the domain identification pipeline

    Returns:
        _DataFrame_: frame with a "text" and a "domain" column; the first row
        is a fixed placeholder example and the second row is the given input
    """
    # The model cannot run inference on a single example, so a placeholder
    # row is always placed ahead of the real one.
    return pd.DataFrame(
        {
            "text": ["hello world", text],
            "domain": [0, domain],
        }
    )
def loading_data(tokenizer, df: pd.DataFrame):
    """
    Tokenize every row of ``df`` and pack the results into batch tensors.

    Args:
        tokenizer (_callable_): called once per text; must return a mapping
            with a "token_id" and a "mask" entry, each a tensor that can be
            concatenated along dim 0 with those of the other rows
            (presumably shape (1, seq_len) with a fixed seq_len -- confirm
            against the tokenizer implementation)
        df (_DataFrame_): frame with a "text" and a "domain" column

    Returns:
        _tuple_: (input_ids, input_masks, input_domains) where the first two
        are the per-row tokenizer outputs concatenated along dim 0 and the
        last is a tensor built from the "domain" column, all in row order

    Note:
        ``df`` must contain at least one row; ``torch.cat`` cannot
        concatenate an empty list of tensors.
    """
    # Read the columns as plain arrays so rows are visited by position.
    # Integer indexing on a Series (``series[i]``) is a label lookup and
    # breaks when the frame's index is not the default 0..n-1
    # (e.g. after filtering or slicing).
    texts = df["text"].to_numpy()
    domains = df["domain"].to_numpy()

    ids = []
    masks = []
    domain_list = []
    for text, domain in zip(texts, domains):
        encoded = tokenizer(text)
        ids.append(encoded["token_id"])
        masks.append(encoded["mask"])
        domain_list.append(domain)

    input_ids = torch.cat(ids, dim=0)
    input_masks = torch.cat(masks, dim=0)
    input_domains = torch.tensor(domain_list)
    return input_ids, input_masks, input_domains
|