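# Custom transformers Pipeline for English -> Coptic translation. A control
# tag selects the target dialect (Sahidic or Bohairic), and the model's
# Greek-alphabet output is converted back to Coptic script in postprocessing.
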
from typing import Optional, Tuple

import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, GenerationConfig, Pipeline, pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Single-character dialect tags (Cyrillic letters used as control tokens)
# prepended to the input to select the target Coptic dialect.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

GENERATION_CONFIG = GenerationConfig(
    # max_new_tokens supersedes max_length (and min_new_tokens supersedes
    # min_length), so only the new-token bounds are set here.
    max_new_tokens=128,
    min_new_tokens=1,
    early_stopping=True,
    # Beam search combined with sampling over the top-k / nucleus-filtered
    # distribution at each step.
    do_sample=True,
    num_beams=5,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    # Return per-step scores so _forward can derive a confidence estimate.
    output_scores=True,
    return_dict_in_generate=True,
)


class EnglishCopticPipeline(Pipeline):
    """Translate English text to Coptic (Sahidic by default, Bohairic when
    called with to_bohairic=True), optionally reporting a confidence score."""

    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if kwargs.get("to_bohairic"):
            preprocess_kwargs["to_bohairic"] = True
        forward_kwargs = {}
        if kwargs.get("output_confidence"):
            forward_kwargs["output_confidence"] = True

        return preprocess_kwargs, forward_kwargs, {}

    def preprocess(self, text, to_bohairic=False):
        if to_bohairic:
            text = f"{BOHAIRIC_TAG} {text}"
        else:
            text = f"{SAHIDIC_TAG} {text}"

        return self.tokenizer.encode(text, return_tensors="pt")

    def _forward(
        self, input_tensors, output_confidence=False
    ) -> Tuple[str, Optional[float]]:
        outputs = self.model.generate(
            input_tensors[:, : self.tokenizer.model_max_length],
            generation_config=GENERATION_CONFIG,
        )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if output_confidence:
            # Highest token probability at each generation step.
            confidences = [
                torch.softmax(score, dim=-1).max().item() for score in outputs.scores
            ]
            # Scale the prediction probability by the number of words in the
            # sentence (a geometric-mean-style length normalization, guarded
            # against empty output).
            num_words = max(len(translated_text.split()), 1)
            scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
            return translated_text, scaled_probability

        return translated_text, None

    def postprocess(self, outputs):
        text, confidence = outputs
        text = degreekify(text)

        if confidence is None:
            return {
                "translation": text,
            }
        return {
            "translation": text,
            "confidence": confidence,
        }


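# Maps lowercase Greek letters (plus a few Latin stand-ins for the letters
# Coptic does not share with Greek) to the corresponding Coptic characters.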
GREEK_TO_COPTIC = {
    "α": "ⲁ",
    "β": "ⲃ",
    "γ": "ⲅ",
    "δ": "ⲇ",
    "ε": "ⲉ",
    "ϛ": "ⲋ",
    "ζ": "ⲍ",
    "η": "ⲏ",
    "θ": "ⲑ",
    "ι": "ⲓ",
    "κ": "ⲕ",
    "λ": "ⲗ",
    "μ": "ⲙ",
    "ν": "ⲛ",
    "ξ": "ⲝ",
    "ο": "ⲟ",
    "π": "ⲡ",
    "ρ": "ⲣ",
    "σ": "ⲥ",
    "τ": "ⲧ",
    "υ": "ⲩ",
    "φ": "ⲫ",
    "χ": "ⲭ",
    "ψ": "ⲯ",
    "ω": "ⲱ",
    "s": "ϣ",
    "f": "ϥ",
    "k": "ϧ",
    "h": "ϩ",
    "j": "ϫ",
    "c": "ϭ",
    "t": "ϯ",
}


def degreekify(greek_text):
    """Lowercase each character and map it to its Coptic equivalent;
    characters without a mapping pass through (lowercased)."""
    chars = []
    for c in greek_text:
        l_c = c.lower()
        chars.append(GREEK_TO_COPTIC.get(l_c, l_c))
    return "".join(chars)