|
import pandas as pd |
|
import torch |
|
def preparing_data(text: str, domain: int) -> pd.DataFrame:
    """Build a two-row DataFrame holding a fixed seed row and the user input.

    The first row is always the constant pair ``('hello world', 0)``; the
    second row carries the supplied ``text`` and ``domain``.
    NOTE(review): the seed row presumably exists so downstream code always
    receives more than one sample -- confirm before removing it.

    Args:
        text: Input text from the user.
        domain: Output domain from the domain identification pipeline.

    Returns:
        DataFrame with the columns ``"text"`` and ``"domain"``, in that
        order, containing the seed row followed by the user row.
    """
    return pd.DataFrame(
        {
            "text": ["hello world", text],
            "domain": [0, domain],
        }
    )
|
|
|
|
|
def loading_data(tokenizer, df: pd.DataFrame):
    """Tokenize every row of ``df`` and gather the results into tensors.

    An empty ``df`` is not handled specially: ``torch.cat`` is then called
    with an empty list.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text)`` once per row. Its
            result must support ``["token_id"]`` and ``["mask"]`` lookups
            returning tensors that ``torch.cat`` can join along dim 0.
            NOTE(review): presumably each is shaped (1, seq_len) with a
            common seq_len -- confirm against the tokenizer in use.
        df: DataFrame providing a ``"text"`` column and a ``"domain"``
            column. Any index is accepted; rows are read in stored order.

    Returns:
        Tuple ``(input_ids, input_masks, input_domains)``: the per-row
        ``"token_id"`` tensors concatenated along dim 0, the per-row
        ``"mask"`` tensors concatenated along dim 0, and a tensor built
        from the ``"domain"`` column values.
    """
    ids = []
    masks = []

    # Iterate the column values rather than indexing with range(len(df)):
    # Series[i] is a label lookup and breaks on a non-default index.
    for text in df["text"]:
        token = tokenizer(text)
        ids.append(token["token_id"])
        masks.append(token["mask"])

    input_ids = torch.cat(ids, dim=0)
    input_masks = torch.cat(masks, dim=0)
    input_domains = torch.tensor(df["domain"].tolist())

    return input_ids, input_masks, input_domains
|
|
|
|