|
# Korean <-> English round-trip translation demo using NLLB-200.
# Each input sentence is translated ko->en, back-translated en->ko,
# and scored against the original with a simple BLEU helper.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

from utils.simple_bleu import simple_score

# Load the distilled 1.3B NLLB-200 checkpoint in bfloat16 and let
# `device_map="auto"` place it on the available device(s).
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-1.3B", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")


def translate_ko2en(text):
    # The NLLB tokenizer defaults to eng_Latn; mark the source as Korean so the
    # correct source-language token is prepended.
    tokenizer.src_lang = "kor_Hang"
    batched_input = [text]
    inputs = tokenizer(batched_input, return_tensors="pt", padding=True)

    translated_tokens = model.generate(
        **inputs.to(model.device),
        # Force English as the target language. convert_tokens_to_ids also works on
        # newer transformers releases where `lang_code_to_id` was removed.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_new_tokens=2048,  # allow long outputs, matching translate_en2ko
    )
    result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return result


def translate_en2ko(text):
    # Source language for this direction is English.
    tokenizer.src_lang = "eng_Latn"
    batched_input = [text]
    inputs = tokenizer(batched_input, return_tensors="pt", padding=True)

    translated_tokens = model.generate(
        **inputs.to(model.device),
        # Force Korean (Hangul script) as the target language.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("kor_Hang"),
        max_new_tokens=2048,
    )
    result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return result


def main():
    # Interactive loop: read Korean text, translate to English, back-translate to
    # Korean, and print a simple round-trip similarity score.
    while True:
        text = input('>')
        en_text = translate_ko2en(text)
        ko_text = translate_en2ko(en_text)
        print('en_text', en_text)
        print('ko_text', ko_text)
        print('score', simple_score(text, ko_text))
        """
        >>? 3천만 개가 넘는 파일과 250억 개의 토큰이 있습니다. Phi1.5의 데이터 세트 구성에 접근했지만 오픈 소스 모델인 Mixtral 8x7B를 사용하고 Apache2.0 라이선스에 따라 라이선스가 부여됩니다.
        en_text There are over 30 million files and 250 billion tokens. Phi1.5's data set configuration is accessible but uses the open source model Mixtral 8x7B and is licensed under the Apache 2.0 license.
        ko_text 300만 개 이상의 파일과 25억 개의 토큰이 있습니다. Phi1.5의 데이터 세트 구성은 액세스 가능하지만 오픈 소스 모델 Mixtral 8x7B를 사용하고 Apache 2.0 라이선스에 따라 라이선스됩니다.
        score 0.3090015909429233
        """


if __name__ == "__main__":
    main()