from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils.simple_bleu import simple_score
import torch

model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-1.3B", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")


def translate_ko2en(text):
    # Tag the input as Korean so the tokenizer prepends the correct
    # source-language token (the tokenizer defaults to eng_Latn otherwise).
    tokenizer.src_lang = "kor_Hang"
    batched_input = [text]
    inputs = tokenizer(batched_input, return_tensors="pt", padding=True)
    translated_tokens = model.generate(
        **inputs.to(model.device),
        # Force English as the decoding target. convert_tokens_to_ids works on
        # NLLB language codes and survives newer transformers releases, where
        # the lang_code_to_id mapping was removed.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
        max_new_tokens=2048,
    )
    result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return result


def translate_en2ko(text):
    # English input this time.
    tokenizer.src_lang = "eng_Latn"
    batched_input = [text]
    inputs = tokenizer(batched_input, return_tensors="pt", padding=True)
    translated_tokens = model.generate(
        **inputs.to(model.device),
        # Force Korean (Hangul script) as the decoding target.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("kor_Hang"),
        max_new_tokens=2048,
    )
    result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return result


def main():
    # Round-trip check: Korean -> English -> Korean, then score the
    # round-tripped text against the original input.
    while True:
        text = input('>')
        en_text = translate_ko2en(text)
        ko_text = translate_en2ko(en_text)
        print('en_text', en_text)
        print('ko_text', ko_text)
        print('score', simple_score(text, ko_text))
"""
>>? 3์ฒ๋ง ๊ฐ๊ฐ ๋๋ ํ์ผ๊ณผ 250์ต ๊ฐ์ ํ ํฐ์ด ์์ต๋๋ค. Phi1.5์ ๋ฐ์ดํฐ ์ธํธ ๊ตฌ์ฑ์ ์ ๊ทผํ์ง๋ง ์คํ ์์ค ๋ชจ๋ธ์ธ Mixtral 8x7B๋ฅผ ์ฌ์ฉํ๊ณ Apache2.0 ๋ผ์ด์ ์ค์ ๋ฐ๋ผ ๋ผ์ด์ ์ค๊ฐ ๋ถ์ฌ๋ฉ๋๋ค.
en_text There are over 30 million files and 250 billion tokens. Phi1.5's data set configuration is accessible but uses the open source model Mixtral 8x7B and is licensed under the Apache 2.0 license.
ko_text 300๋ง ๊ฐ ์ด์์ ํ์ผ๊ณผ 25์ต ๊ฐ์ ํ ํฐ์ด ์์ต๋๋ค. Phi1.5์ ๋ฐ์ดํฐ ์ธํธ ๊ตฌ์ฑ์ ์ก์ธ์ค ๊ฐ๋ฅํ์ง๋ง ์คํ ์์ค ๋ชจ๋ธ Mixtral 8x7B๋ฅผ ์ฌ์ฉํ๊ณ Apache 2.0 ๋ผ์ด์ ์ค์ ๋ฐ๋ผ ๋ผ์ด์ ์ค๋ฉ๋๋ค.
score 0.3090015909429233
"""
if __name__ == "__main__":
    main()
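
The imported utils.simple_bleu.simple_score is a project-local helper whose source isn't shown here; judging from the transcript, it returns a BLEU-style similarity on a 0-1 scale. A minimal stand-in sketch, assuming sacrebleu is installed (pip install sacrebleu) and that the first argument is the reference, matching the simple_score(text, ko_text) call above; the repo's actual helper may differ:

import sacrebleu

def simple_score(reference: str, hypothesis: str) -> float:
    # Hypothetical stand-in for utils.simple_bleu.simple_score.
    # sacrebleu.sentence_bleu returns a 0-100 BLEU score; scale it to 0-1
    # to match the ~0.31 value in the transcript above. Note that the default
    # tokenizer is not Korean-aware, so absolute values will differ.
    return sacrebleu.sentence_bleu(hypothesis, [reference]).score / 100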