english-coptic-translator / english_coptic_pipeline.py

Upload 12 files

e086fdb verified 7 months ago

3.31 kB

	from typing import Dict
	import numpy as np
	import torch
	from transformers import Pipeline
	from transformers.utils import ModelOutput
	from transformers import pipeline
	from transformers.pipelines import PIPELINE_REGISTRY
	from transformers import AutoModelForSeq2SeqLM
	from huggingface_hub import Repository

	# Dialect tags. `EnglishCopticPipeline.preprocess` prepends exactly one of
	# them, followed by a single space, to the input text before tokenization.
	# NOTE: both literals are Cyrillic letters (U+0437 and U+0431), not Latin or
	# Greek look-alikes; keep them byte-for-byte when editing.
	# Tag used when `to_bohairic` is false (the default).
	SAHIDIC_TAG = "з"
	# Tag used when `to_bohairic` is true.
	BOHAIRIC_TAG = "б"

	from transformers import GenerationConfig

	# Generation settings shared by every `EnglishCopticPipeline._forward` call.
	GENERATION_CONFIG = GenerationConfig(
	    # --- Output length bounds ---
	    # NOTE(review): both `max_length` and `max_new_tokens` are set. The
	    # transformers documentation says `max_new_tokens` overrides
	    # `max_length` when both are given; confirm `max_length=20` is intended.
	    min_length=0,
	    min_new_tokens=1,
	    max_length=20,
	    max_new_tokens=128,
	    # --- Beam search ---
	    num_beams=5,
	    num_beam_groups=1,
	    diversity_penalty=0.0,
	    early_stopping=True,
	    # --- Sampling (enabled together with beam search) ---
	    do_sample=True,
	    temperature=1.0,
	    top_k=50,
	    top_p=0.95,
	    # --- Output format ---
	    # `_forward` reads `outputs.sequences` and `outputs.scores`, so a
	    # structured result including the per-step scores is requested.
	    return_dict_in_generate=True,
	    output_scores=True,
	)


	class EnglishCopticPipeline(Pipeline):
	    """Custom `transformers` pipeline that translates text into Coptic.

	    Processing steps:

	    1. ``preprocess`` prefixes the input with a dialect tag (``SAHIDIC_TAG``
	       or ``BOHAIRIC_TAG``) and tokenizes it.
	    2. ``_forward`` generates with the module-level ``GENERATION_CONFIG`` and
	       decodes the first returned sequence.
	    3. ``postprocess`` passes the decoded text through ``degreekify`` and
	       packs the result into a dict.

	    Call-time (or construction-time) options:
	        to_bohairic: when true, use the Bohairic tag instead of the Sahidic
	            one.
	        output_confidence: when true, add a ``"confidence"`` entry to the
	            result.
	    """

	    def _sanitize_parameters(self, **kwargs):
	        """Split user kwargs into (preprocess, forward, postprocess) kwargs.

	        A flag is forwarded whenever the caller supplied a non-``None``
	        value, including an explicit ``False``. `Pipeline.__call__` merges
	        these dicts over the ones captured at construction time, so dropping
	        ``False`` would make it impossible to switch off a flag that was
	        enabled when the pipeline was created.
	        """
	        preprocess_kwargs = {}
	        forward_kwargs = {}

	        to_bohairic = kwargs.get("to_bohairic")
	        if to_bohairic is not None:
	            preprocess_kwargs["to_bohairic"] = bool(to_bohairic)

	        output_confidence = kwargs.get("output_confidence")
	        if output_confidence is not None:
	            forward_kwargs["output_confidence"] = bool(output_confidence)

	        # postprocess takes no options.
	        return preprocess_kwargs, forward_kwargs, {}

	    def preprocess(self, text, to_bohairic=False):
	        """Prefix ``text`` with the dialect tag and return its token ids.

	        Returns whatever ``self.tokenizer.encode(..., return_tensors="pt")``
	        produces for the tagged text.
	        """
	        dialect_tag = BOHAIRIC_TAG if to_bohairic else SAHIDIC_TAG
	        return self.tokenizer.encode(f"{dialect_tag} {text}", return_tensors="pt")

	    def _forward(self, input_tensors, output_confidence=False) -> tuple:
	        """Generate a translation and, optionally, a confidence estimate.

	        Returns:
	            A ``(translated_text, confidence)`` tuple. ``confidence`` is
	            ``None`` unless ``output_confidence`` is true. When requested and
	            the decoded text contains no words, it is ``0.0``.
	        """
	        # Clip over-long inputs to the tokenizer's declared maximum length.
	        clipped_inputs = input_tensors[:, : self.tokenizer.model_max_length]
	        outputs = self.model.generate(
	            clipped_inputs,
	            generation_config=GENERATION_CONFIG,
	        )

	        translated_text = self.tokenizer.decode(
	            outputs.sequences[0], skip_special_tokens=True
	        )

	        if not output_confidence:
	            return translated_text, None

	        # One value per generation step: the largest softmax probability found
	        # in that step's score tensor.
	        # NOTE(review): `.max()` reduces over the whole tensor, so with
	        # num_beams > 1 this is the maximum across all beams, not necessarily
	        # the probability of the token that ended up in the returned
	        # sequence. Kept as the existing heuristic.
	        step_confidences = [
	            torch.softmax(step_scores, dim=-1).max().item()
	            for step_scores in outputs.scores
	        ]

	        num_words = len(translated_text.split())
	        if num_words == 0:
	            # An empty translation would otherwise divide by zero below.
	            return translated_text, 0.0

	        # Scale the prediction probability by the number of words in the
	        # sentence: exp(sum(log p_i) / num_words).
	        total_log_probability = np.sum(np.log(step_confidences))
	        scaled_probability = float(np.exp(total_log_probability / num_words))
	        return translated_text, scaled_probability

	    def postprocess(self, outputs):
	        """Convert the decoded text with ``degreekify`` and build the result.

	        Returns:
	            ``{"translation": str}``, plus a ``"confidence"`` key when
	            ``_forward`` supplied a confidence value.
	        """
	        text, confidence = outputs
	        result = {"translation": degreekify(text)}
	        if confidence is not None:
	            result["confidence"] = confidence
	        return result


	# Character table used by `degreekify`: each key is a lowercase character
	# expected in the decoded model output, each value is the Coptic letter that
	# replaces it.
	GREEK_TO_COPTIC = {
	    # Greek letters -> Coptic letters.
	    "α": "ⲁ", "β": "ⲃ", "γ": "ⲅ", "δ": "ⲇ", "ε": "ⲉ",
	    "ϛ": "ⲋ", "ζ": "ⲍ", "η": "ⲏ", "θ": "ⲑ", "ι": "ⲓ",
	    "κ": "ⲕ", "λ": "ⲗ", "μ": "ⲙ", "ν": "ⲛ", "ξ": "ⲝ",
	    "ο": "ⲟ", "π": "ⲡ", "ρ": "ⲣ", "σ": "ⲥ", "τ": "ⲧ",
	    "υ": "ⲩ", "φ": "ⲫ", "χ": "ⲭ", "ψ": "ⲯ", "ω": "ⲱ",
	    # ASCII Latin letters -> the remaining Coptic letters.
	    "s": "ϣ", "f": "ϥ", "k": "ϧ", "h": "ϩ",
	    "j": "ϫ", "c": "ϭ", "t": "ϯ",
	}


	def degreekify(greek_text):
	    """Return ``greek_text`` rewritten in Coptic letters.

	    Each character is lowercased on its own and then replaced through
	    ``GREEK_TO_COPTIC``. Characters without an entry in the table are kept in
	    their lowercased form.
	    """
	    lowered_symbols = (symbol.lower() for symbol in greek_text)
	    return "".join(
	        GREEK_TO_COPTIC.get(symbol, symbol) for symbol in lowered_symbols
	    )