|
--- |
|
datasets: |
|
- alexjerpelea/AroTranslate-rup-ron-dataset |
|
language: |
|
- ro |
|
- en |
|
license: cc-by-nc-4.0 |
|
tags: |
|
- aromanian |
|
- macedo-romanian |
|
--- |
|
This model is an extension of [the first coherent Aromanian translator](https://huggingface.co/alexjerpelea/NLLB-aromanian-romanian-v1). <br> |
|
It is an [NLLB-200-600M](https://huggingface.co/facebook/nllb-200-distilled-600M) model fine-tuned to translate between any two of Aromanian, Romanian and English, using this [dataset](https://huggingface.co/datasets/alexjerpelea/aromanian-romanian-MT-corpus), which was artificially extended with the Google Translate API. |
|
|
|
Read more about AroTranslate at [this GitHub repository](https://github.com/lolismek/AroTranslate.git). |
|
|
|
We present the following results: |
|
| | ron -> rup | rup -> ron | rup -> eng | eng -> rup | ron -> eng | eng -> ron |
|
|:----|:-----|:-----|:----|:-----|:-----|:-----| |
|
| BLEU | 33.18 | 54.36 | 51.25 | 25.16 | 66.96 | 52.16 |
|
| ChrF2++ | 59.47 | 68.54 | 66.13 | 52.68 | 78.84 | 70.34 |
|
|
|
|
|
Note: |
|
* As Aromanian does not have a standard writing system, please see the code below for text normalization. |
|
* This model was trained for production use and can handle the absence of diacritics. We do, however, recommend using them. |
|
|
|
How to use the model: |
|
```py |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer |
|
import re |
|
|
|
# load model and tokenizer:
# Both artifacts come from the same Hub repository, so name it once.
MODEL_NAME = 'alexjerpelea/NLLB-aromanian-romanian-english'

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
|
# translate function: |
|
def translate(
    text, src_lang='ron_Latn', tgt_lang='rup_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Input passed straight to the tokenizer.
        src_lang: Source language code set on the tokenizer
            (e.g. ``'ron_Latn'``, ``'rup_Latn'``, ``'eng_Latn'``).
        tgt_lang: Target language code; its token id is forced as the
            first generated token.
        a: Constant term of the output-length budget.
        b: Per-input-token factor of the output-length budget.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam width forwarded to ``model.generate``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        The output of ``tokenizer.batch_decode`` with special tokens
        stripped (one decoded string per input sequence).
    """
    # Point the shared tokenizer at the requested language pair.
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )

    model.eval()

    target_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    # Output budget grows linearly with the (padded) input length.
    new_token_budget = int(a + b * encoded.input_ids.shape[1])

    generated = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=target_lang_id,
        max_new_tokens=new_token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
|
|
|
|
|
# Romanian: legacy cedilla letters -> standard comma-below letters.
_RON_REPLACEMENTS = (
    ('Ş', 'Ș'),
    ('ş', 'ș'),
    ('Ţ', 'Ț'),
    ('ţ', 'ț'),
)

# Aromanian: fold the competing orthographies into one ASCII-digraph spelling.
# Applied in order, exactly like a chain of str.replace calls.
_RUP_REPLACEMENTS = (
    ('ş', 'sh'),
    ('ș', 'sh'),
    ('ţ', 'ts'),
    ('ț', 'ts'),
    ('Ş', 'Sh'),
    ('Ș', 'Sh'),
    ('Ţ', 'Ts'),
    ('Ț', 'Ts'),

    ('ľ', 'lj'),
    ('Ľ', 'Lj'),  # was 'L', which dropped the palatal marker

    ("l'", 'lj'),
    ('l’', 'lj'),
    ("L'", 'Lj'),
    ('L’', 'Lj'),

    ('ḑ', 'dz'),
    ('Ḑ', 'Dz'),  # was lowercase 'dz' for the uppercase letter
    ('ḍ', 'dz'),
    ('Ḍ', 'Dz'),

    # TODO: add n'
    ('ń', 'nj'),
    ('Ń', 'Nj'),
    ('ñ', 'nj'),
    ('Ñ', 'Nj'),

    ('ă', 'ã'),
    ('Â', 'Ã'),
    ('â', 'ã'),
    ('Ă', 'Ã'),
    ('á', 'ã'),
    ('à', 'ã'),
    ('Á', 'Ã'),
    ('À', 'Ã'),

    ('Î', 'Ã'),
    ('î', 'ã'),

    # weird foreign characters
    ('ŭ', 'u'),
    ('ς', 'c'),
    ('é', 'e'),
    ('í', 'i'),
    ('ū', 'u'),
    ('ì', 'i'),
    ('ā', 'a'),
    ('ĭ', 'i'),
    ('γ', 'y'),
    ('ï', 'i'),
    ('ó', 'o'),
    ('θ', 'O'),
)

# Punctuation clean-up applied to every language.
# Non-breaking spaces and newlines need no entry here: the initial r'\s+'
# collapse has already turned them into plain spaces.
_COMMON_REPLACEMENTS = (
    ('—', '-'),
    ('–', '-'),
    ('…', '...'),
    ('*', ''),
    ('<', ''),
    ('>', ''),
    ('„', '"'),
    ('”', '"'),
    ('“', '"'),
    ('\ufeff', ''),
)


def clean_text(text, lang):
    """Normalize ``text`` before it is passed to the translator.

    Args:
        text: The string to normalize. A ``float`` is returned unchanged
            (presumably to let NaN placeholders for missing values pass
            through -- confirm against the data pipeline).
        lang: ``'ron'`` applies the Romanian rules; any other value applies
            the Aromanian rules.

    Returns:
        The normalized string, or ``text`` itself when it is a ``float``.
    """
    if isinstance(text, float):
        return text

    # consecutive spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # old romanian î in the middle of the word
    text = re.sub(r'(?<=\w)î(?=\w)', 'â', text)

    language_rules = _RON_REPLACEMENTS if lang == 'ron' else _RUP_REPLACEMENTS
    for old, new in language_rules:
        text = text.replace(old, new)

    # for both languages:
    for old, new in _COMMON_REPLACEMENTS:
        text = text.replace(old, new)

    return text
|
|
|
# Aromanian to Romanian:
t = '''Trã atsea cãdzu pri mare cripare, shi tutã dzua stãtea ãnvirinat.'''
t = clean_text(t, 'rup')
print(translate(t, 'rup_Latn', 'ron_Latn'))

# Romanian to Aromanian:
# The source text is Romanian, so it is cleaned with the 'ron' rules and
# translated ron -> rup.
t = '''Apoi se opri puțin, o sorbi din ochi, o sărută și - când începu să scâncească, îi cântă iar:'''
t = clean_text(t, 'ron')
print(translate(t, 'ron_Latn', 'rup_Latn'))

# Aromanian to English:
t = '''Cã a ta boatsi e birbil ti suflitu a meu.'''
t = clean_text(t, 'rup')
print(translate(t, 'rup_Latn', 'eng_Latn'))

# English to Aromanian:
# clean_text is not called here, exactly as in the original example.
t = '''That your voice is the nightingale of my soul.'''
print(translate(t, 'eng_Latn', 'rup_Latn'))
|
``` |
|
|
|
## License |
|
<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>. When using this work, please mention its name as "AroTranslate" and the author. |
|
|