from typing import Optional, Tuple

import numpy as np
import torch
from transformers import GenerationConfig, Pipeline

# Dialect tags prepended to the source text. Cyrillic letters are used so the
# tags cannot collide with English input or Coptic output characters.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"


GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=128,  # bound output by newly generated tokens
    min_new_tokens=1,
    early_stopping=True,
    do_sample=True,  # sample within beam search instead of pure argmax beams
    num_beams=5,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    output_scores=True,  # keep per-step scores so _forward can estimate confidence
    return_dict_in_generate=True,
)


class EnglishCopticPipeline(Pipeline):
    """Translate English text into Sahidic or Bohairic Coptic."""

    def _sanitize_parameters(self, **kwargs):
        # Route user-facing kwargs to the pipeline stages that consume them.
        preprocess_kwargs = {}
        if kwargs.get("to_bohairic"):
            preprocess_kwargs["to_bohairic"] = True

        forward_kwargs = {}
        if kwargs.get("output_confidence"):
            forward_kwargs["output_confidence"] = True

        return preprocess_kwargs, forward_kwargs, {}

    def preprocess(self, text, to_bohairic=False):
        # Prepend the dialect tag the model expects at the start of the source text.
        tag = BOHAIRIC_TAG if to_bohairic else SAHIDIC_TAG
        return self.tokenizer.encode(f"{tag} {text}", return_tensors="pt")

    def _forward(self, input_tensors, output_confidence=False) -> Tuple[str, Optional[float]]:
        outputs = self.model.generate(
            # Truncate inputs that exceed the model's maximum sequence length.
            input_tensors[:, : self.tokenizer.model_max_length],
            generation_config=GENERATION_CONFIG,
        )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if output_confidence:
            # outputs.scores holds one logit tensor per decoding step (across
            # all beams); the max softmax value approximates the probability of
            # the token chosen at that step.
            confidences = [
                torch.softmax(score, dim=-1).max().item() for score in outputs.scores
            ]
            # Geometric mean of the per-step probabilities, normalized by word
            # count (guarded against empty output) so longer translations are
            # not penalized for having more tokens.
            num_words = max(len(translated_text.split()), 1)
            scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
            return translated_text, scaled_probability

        return translated_text, None

    def postprocess(self, outputs):
        text, confidence = outputs
        # The model emits Greek (and some Latin) letters; map them back to Coptic.
        text = degreekify(text)

        if confidence is None:
            return {"translation": text}
        return {"translation": text, "confidence": confidence}


# Greek lowercase letters map to their Coptic equivalents; the Latin letters at
# the end stand in for the Coptic letters that have no Greek counterpart.
GREEK_TO_COPTIC = {
    "α": "ⲁ", "β": "ⲃ", "γ": "ⲅ", "δ": "ⲇ", "ε": "ⲉ", "ϛ": "ⲋ",
    "ζ": "ⲍ", "η": "ⲏ", "θ": "ⲑ", "ι": "ⲓ", "κ": "ⲕ", "λ": "ⲗ",
    "μ": "ⲙ", "ν": "ⲛ", "ξ": "ⲝ", "ο": "ⲟ", "π": "ⲡ", "ρ": "ⲣ",
    "σ": "ⲥ", "τ": "ⲧ", "υ": "ⲩ", "φ": "ⲫ", "χ": "ⲭ", "ψ": "ⲯ",
    "ω": "ⲱ",
    "s": "ϣ", "f": "ϥ", "k": "ϧ", "h": "ϩ", "j": "ϫ", "c": "ϭ", "t": "ϯ",
}


def degreekify(greek_text):
    """Lowercase the text and convert Greek/Latin stand-ins to Coptic script."""
    return "".join(GREEK_TO_COPTIC.get(c.lower(), c.lower()) for c in greek_text)
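

# A minimal usage sketch: registering the class as a custom pipeline task and
# invoking it. Assumptions are flagged inline; the task name "english-to-coptic"
# and the checkpoint id are illustrative placeholders, not confirmed values.
if __name__ == "__main__":
    from transformers import AutoModelForSeq2SeqLM, pipeline
    from transformers.pipelines import PIPELINE_REGISTRY

    PIPELINE_REGISTRY.register_pipeline(
        "english-to-coptic",  # hypothetical task name
        pipeline_class=EnglishCopticPipeline,
        pt_model=AutoModelForSeq2SeqLM,
    )

    # Placeholder checkpoint id; substitute the actual fine-tuned model.
    translator = pipeline("english-to-coptic", model="your-org/english-coptic")
    result = translator("In the beginning", to_bohairic=True, output_confidence=True)
    print(result)  # e.g. {"translation": "...", "confidence": ...}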