|
import config |
|
import sys |
|
import re |
|
import string as stringmodule |
|
|
|
|
|
import srx_segmenter |
|
|
|
from MTUOC_misc import printLOG |
|
from MTUOC_misc import capitalizeMTUOC |
|
if config.MTUOCServer_MTengine=="GoogleTranslate": |
|
from MTUOC_GoogleTranslate import Google_translate |
|
if config.MTUOCServer_MTengine=="DeepL": |
|
from MTUOC_DeepL import DeepL_translate |
|
if config.MTUOCServer_MTengine=="Lucy": |
|
from MTUOC_Lucy import Lucy_translate |
|
from MTUOC_Marian import translate_segment_Marian |
|
from MTUOC_Moses import translate_segment_Moses |
|
|
|
from MTUOC_preprocess import preprocess_segment |
|
from MTUOC_preprocess import postprocess_segment |
|
|
|
from MTUOC_preprocess import tokenizationSL |
|
from MTUOC_preprocess import tokenizationTL |
|
from MTUOC_preprocess import detokenizationSL |
|
from MTUOC_preprocess import detokenizationTL |
|
|
|
def segmenta(cadena):
    """Segment a text with the SRX rules selected in the configuration.

    Args:
        cadena: The text to segment.

    Returns:
        Whatever ``SrxSegmenter.extract()`` returns for ``cadena``.
        ``translate_para`` unpacks it as a ``(segments, separators)`` pair.
    """
    segmenter = srx_segmenter.SrxSegmenter(config.rules[config.SRXlang], cadena)
    return segmenter.extract()
|
|
|
def is_first_letter_upper(segment):
    """Tell whether the first cased letter of ``segment`` is upper case.

    Characters that are not alphabetic, or that are alphabetic but neither
    upper nor lower case, are skipped.

    Args:
        segment: The text to inspect.

    Returns:
        True when the first cased letter is upper case; False when it is
        lower case or when the text contains no cased letter at all.
    """
    first_cased = next(
        (ch for ch in segment if ch.isalpha() and (ch.isupper() or ch.islower())),
        None,
    )
    return first_cased is not None and first_cased.isupper()
|
|
|
def upper_case_first_letter(segment):
    """Upper-case the first cased letter of ``segment``.

    Leading characters that are not cased letters (digits, punctuation,
    spaces, ...) are left untouched and skipped.

    Args:
        segment: The text to modify.

    Returns:
        The text with its first lower-case letter upper-cased, or the text
        unchanged when the first cased letter is already upper case or when
        there is no cased letter.
    """
    for index, char in enumerate(segment):
        if not char.isalpha():
            continue
        if char.islower():
            return segment[:index] + char.upper() + segment[index + 1:]
        if char.isupper():
            return segment
    return segment
|
|
|
|
|
|
|
def findEMAILs(string):
    """Return the e-mail-like tokens found in ``string``.

    A token is any run of non-space characters containing ``@`` with at
    least one character on each side. A single trailing ASCII punctuation
    character (for example the full stop closing a sentence) is stripped;
    leading punctuation is kept.

    Args:
        string: The text to scan.

    Returns:
        A list with the matched tokens, in order of appearance.
    """
    emails = []
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    for candidate in re.findall(r'\S+@\S+', string):
        if candidate[-1] in stringmodule.punctuation:
            candidate = candidate[:-1]
        emails.append(candidate)
    return emails
|
|
|
def findURLs(string):
    """Return the URLs found in ``string``.

    Matching is case-insensitive and accepts candidates that start with
    ``http://``/``https://``, with ``www`` (optionally followed by up to
    three digits) and a dot, or with a domain-like prefix followed by ``/``.

    Args:
        string: The text to scan.

    Returns:
        A list with the matched URLs, in order of appearance.
    """
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # Group 1 spans the whole URL; the inner groups only serve the matching.
    return [match.group(1) for match in re.finditer(pattern, string)]
|
|
|
def replace_EMAILs(string, code="@EMAIL@"):
    """Replace every e-mail found by ``findEMAILs`` with a placeholder.

    Args:
        string: The text to process.
        code: Placeholder written in place of each e-mail.

    Returns:
        The text with all occurrences of each detected e-mail replaced.
    """
    result = string
    for address in findEMAILs(string):
        result = result.replace(address, code)
    return result
|
|
|
def replace_URLs(string, code="@URL@"):
    """Replace every URL found by ``findURLs`` with a placeholder.

    Args:
        string: The text to process.
        code: Placeholder written in place of each URL.

    Returns:
        The text with all occurrences of each detected URL replaced.
    """
    result = string
    for address in findURLs(string):
        result = result.replace(address, code)
    return result
|
|
|
# Numeric expression: a run of digits, commas and full stops.
re_num = re.compile(r'[\d,\.]+')


def replace_NUMs(segment, code="@NUM@"):
    """Replace numeric expressions in ``segment`` with a placeholder.

    Every match of ``re_num`` other than a lone "." or "," is replaced,
    one occurrence per match, in order of appearance. Note that runs made
    only of punctuation (for example "...") also match ``re_num`` and are
    replaced as well.

    Args:
        segment: The text to process.
        code: Placeholder written in place of each numeric expression.

    Returns:
        The text with the numeric expressions replaced.
    """
    # Matches are collected on the unmodified text before any replacement.
    expressions = [match.group(0) for match in re_num.finditer(segment)]
    for expression in expressions:
        if expression in (".", ","):
            continue
        segment = segment.replace(expression, code, 1)
    return segment
|
|
|
def splitnumbers(segment, joiner=""):
    """Split every numeric expression of ``segment`` into its characters.

    Each match of ``re_num`` is rewritten with ``joiner`` plus a space
    between consecutive characters (with the default joiner, "1,5" becomes
    "1 , 5"). The rewrite is done in one positional pass, so a number whose
    text is contained in another one ("12" and "123") cannot break it.

    Args:
        segment: The text to process.
        joiner: Text inserted, followed by a space, between the characters.

    Returns:
        A ``(segment, equil)`` tuple: the rewritten text and a dict mapping
        each split form that contains a space to its original form. Keys
        are inserted longest first, so applying the replacements in dict
        order never lets a short key corrupt a longer one.
    """
    separator = joiner + " "
    found = re_num.findall(segment)
    segment = re_num.sub(lambda match: separator.join(match.group(0)), segment)
    equil = {}
    # dict.fromkeys drops duplicates keeping first-seen order; the sort is
    # stable, so equal-length numbers keep their order of appearance.
    for number in sorted(dict.fromkeys(found), key=len, reverse=True):
        spaced = separator.join(number)
        if " " in spaced:
            equil[spaced] = number
    return segment, equil
|
|
|
def desplitnumbers(segment, equil):
    """Undo ``splitnumbers`` by restoring the original numeric expressions.

    Args:
        segment: Text containing split numbers.
        equil: Dict mapping each split form to its original form.

    Returns:
        The text with every occurrence of each split form replaced by the
        original form.
    """
    # Longest keys first: otherwise "1 2" would be rewritten inside
    # "1 2 3" before the longer key gets a chance to match.
    for split_form in sorted(equil, key=len, reverse=True):
        segment = segment.replace(split_form, equil[split_form])
    return segment
|
|
|
def restore_EMAILs(stringA, stringB, code="@EMAIL@"):
    """Put the e-mails of ``stringA`` back into ``stringB``.

    Args:
        stringA: Text from which the e-mails are extracted.
        stringB: Text containing ``code`` placeholders.
        code: Placeholder to be replaced.

    Returns:
        ``stringB`` where, for each e-mail found in ``stringA`` and in that
        order, the first remaining placeholder has been replaced by it.
    """
    restored = stringB
    for address in findEMAILs(stringA):
        restored = restored.replace(code, address, 1)
    return restored
|
|
|
def restore_URLs(stringA, stringB, code="@URL@"):
    """Put the URLs of ``stringA`` back into ``stringB``.

    Args:
        stringA: Text from which the URLs are extracted.
        stringB: Text containing ``code`` placeholders.
        code: Placeholder to be replaced.

    Returns:
        ``stringB`` where, for each URL found in ``stringA`` and in that
        order, the first remaining placeholder has been replaced by it.
    """
    restored = stringB
    for address in findURLs(stringA):
        restored = restored.replace(code, address, 1)
    return restored
|
|
|
def restore_NUMs(segmentSL, segmentTL, code="@NUM@"):
    """Put the numeric expressions of ``segmentSL`` back into ``segmentTL``.

    The same matches that ``replace_NUMs`` replaces are used: every match
    of ``re_num`` in ``segmentSL`` other than a lone "." or ",".

    Args:
        segmentSL: Text from which the numeric expressions are extracted.
        segmentTL: Text containing ``code`` placeholders.
        code: Placeholder to be replaced.

    Returns:
        ``segmentTL`` where, for each numeric expression and in order of
        appearance, the first remaining placeholder has been replaced.
    """
    for trobat in re_num.finditer(segmentSL):
        expression = trobat.group(0)
        if expression not in (".", ","):
            segmentTL = segmentTL.replace(code, expression, 1)
    return segmentTL
|
|
|
|
|
def translate_para(paragraph):
    """Translate a paragraph, optionally segment by segment.

    When ``config.segment_input`` is set, the paragraph is segmented with
    ``segmenta``, each segment is translated with ``translate_segment``
    (and passed through ``fix_xml_tags`` when ``config.fix_xml`` is set),
    and the result is rebuilt by interleaving separators and translations,
    starting with a separator. Otherwise the whole paragraph is translated
    as a single segment.

    Args:
        paragraph: The text to translate.

    Returns:
        The translated text.
    """
    if not config.segment_input:
        return translate_segment(paragraph)

    (segments, separators) = segmenta(paragraph)
    translations = []
    for segment in segments:
        translation = translate_segment(segment)
        if config.fix_xml:
            translation = config.tagrestorer.fix_xml_tags(translation)
        translations.append(translation)

    resultat = []
    for index, separator in enumerate(separators):
        resultat.append(separator)
        # There may be more separators than segments (e.g. a trailing
        # separator); a separator without a segment is emitted alone.
        if index < len(translations):
            resultat.append(translations[index])
    return "".join(resultat)
|
|
|
|
|
|
|
|
|
def restore_tags_translation_candidates(translation_candidates):
    """Fill ``translation_candidates["translationTAGS"]`` for each candidate.

    When ``translation_candidates["segmentTAGS"]`` has no tags, the
    untagged translations are used as they are. Otherwise, and when
    ``config.tag_restoration`` is set, the tags are projected onto each
    candidate with ``config.tagrestorer.restore_tags`` using the candidate's
    alignment; a candidate whose restoration fails falls back to its own
    translation without tags.

    Args:
        translation_candidates: Dict read for the keys "segmentTAGS",
            "segmentOrig", "segmentPreTAGS", "translationNOTAGSPre" and
            "alignments". The keys "segmentTAGS", "segmentNOTAGS" and
            "translationTAGS" are (re)written.

    Returns:
        The same dict, updated in place.
    """
    hastags = config.tagrestorer.has_tags(translation_candidates["segmentTAGS"])
    if not hastags:
        translation_candidates["segmentNOTAGS"] = translation_candidates["segmentTAGS"]
        translation_candidates["translationTAGS"] = translation_candidates["translationNOTAGSPre"]
        return translation_candidates

    (translation_candidates["segmentTAGS"], equil) = config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
    printLOG(3, "replace_tags", translation_candidates["segmentTAGS"])
    printLOG(3, "equil", equil)
    # NOTE(review): this call starts again from "segmentOrig", so the
    # replace_tags result stored just above is overwritten rather than
    # refined (translate_segment chains both calls). Kept as in the
    # original; confirm the intent.
    (translation_candidates["segmentTAGS"], tagInici, tagFinal) = config.tagrestorer.remove_start_end_tag(translation_candidates["segmentOrig"])
    printLOG(3, "remove_start_end_tag", translation_candidates["segmentTAGS"])
    printLOG(3, "TAG initial:", tagInici)
    printLOG(3, "TAG final:", tagFinal)
    translation_candidates["segmentNOTAGS"] = config.tagrestorer.remove_tags(translation_candidates["segmentTAGS"])
    segmentNOTAGSTOK = tokenizationSL(translation_candidates["segmentNOTAGS"])
    segmentTAGSTOK = tokenizationSL(translation_candidates["segmentPreTAGS"])

    candidates = translation_candidates["translationNOTAGSPre"]
    translation_candidates["translationTAGS"] = [None] * len(candidates)
    for i in range(len(candidates)):
        try:
            if config.tag_restoration:
                # Fallback for this candidate. It is set before anything can
                # fail, so a failure never reuses the previous candidate's
                # text and never hits an unbound name.
                translationNOTAGSTOK = candidates[i]
                try:
                    alignment = translation_candidates["alignments"][i]
                    translationNOTAGSTOK = tokenizationTL(candidates[i])
                    translation_candidates["translationTAGS"][i] = config.tagrestorer.restore_tags(segmentNOTAGSTOK, segmentTAGSTOK, alignment, translationNOTAGSTOK)
                except Exception:
                    printLOG(3, "ERROR restoring tags:", sys.exc_info())
                    translation_candidates["translationTAGS"][i] = translationNOTAGSTOK
            else:
                # Restoration disabled: use the untagged translations (the
                # very same list object) as the tagged ones.
                translation_candidates["translationTAGS"] = translation_candidates["translationNOTAGSPre"]
            printLOG(3, "translationTAGS:", translation_candidates["translationTAGS"][i])
        except Exception:
            # Best effort: one failing candidate must not abort the others,
            # but the failure is logged instead of being silently dropped.
            printLOG(3, "ERROR processing translation candidate:", sys.exc_info())
    return translation_candidates
|
|
|
|
|
def translate_segment(segment):
    """Translate one segment with the configured MT engine.

    For the "GoogleTranslate", "DeepL" and "Lucy" engines the segment is
    handed to the engine function and its result returned directly. For
    "Marian" and "Moses" the steps are:

    1. apply the configured input replacements;
    2. detect tags and decide on truecasing;
    3. return the segment untranslated when it is not translatable or is
       shorter than ``config.min_chars_segment``;
    4. truecase and tokenize, separate the tags, and replace e-mails, URLs
       and numbers with placeholders as configured;
    5. translate, restore the tags, and post-process every candidate;
    6. select the best candidate and apply the configured output and
       translation replacements.

    Args:
        segment: The source text to translate.

    Returns:
        The translated text, or the (possibly pre-processed) source text
        when it is considered not translatable or too short.

    Raises:
        ValueError: If ``config.MTUOCServer_MTengine`` is none of the
            engines handled here.
    """
    printLOG(3, "translate_segment", segment)
    if config.MTUOCServer_MTengine == "GoogleTranslate":
        return Google_translate(segment)
    elif config.MTUOCServer_MTengine == "DeepL":
        return DeepL_translate(segment)
    elif config.MTUOCServer_MTengine == "Lucy":
        return Lucy_translate(segment)

    segmentOrig = segment

    # --- Input replacements -------------------------------------------
    if not config.change_input_files[0] == "None":
        printLOG(3, "CHANGES INPUT:")
        printLOG(3, "ORIGINAL:", segmentOrig)
        for change in config.changes_input:
            tofind = change[0]
            tochange = change[1]
            regexp = "\\b" + tofind + "\\b"
            if re.findall(regexp, segment):
                segment = re.sub(regexp, tochange, segment)
                printLOG(3, tofind, tochange)
        printLOG(3, "CHANGED:", segment)

    # --- Tag detection and casing decisions ---------------------------
    hastags = config.tagrestorer.has_tags(segment)
    originaltags = config.tagrestorer.get_tags(segment)
    printLOG(3, "hastags", hastags)
    printLOG(3, "originaltags", originaltags)

    totruecase = False
    toupperfinal = False
    if config.truecase == "all":
        totruecase = True
    segmentnotags = config.tagrestorer.remove_tags(segment)
    # Fully upper-case input is truecased for translation and the output
    # is upper-cased again at the end.
    if (config.truecase in ["upper", "all"] and segmentnotags.isupper()
            and not segment == "@URL@" and not segment == "@EMAIL@"):
        totruecase = True
        toupperfinal = True

    # --- Translatability check ----------------------------------------
    if config.checkistranslatable:
        # The two replacements are chained (e-mails first, as in the main
        # path below); previously the second call started again from
        # `segment` and discarded the result of the first one.
        segmentNOTAGS = replace_EMAILs(segment, config.code_EMAILs)
        segmentNOTAGS = replace_URLs(segmentNOTAGS, config.code_URLs)
        tokens = tokenizationSL(segmentNOTAGS)
        if not is_translatable(tokens):
            return segment

    # --- Truecasing, tokenization and tag separation ------------------
    if totruecase:
        segment = config.truecaser.truecase(segment)
    if config.tokenizerSL:
        segment = config.tokenizerSL.tokenize(segment)
    if hastags:
        segmentTAGS = segment
        (segmentTAGS, equil) = config.tagrestorer.replace_tags(segmentTAGS)
        printLOG(3, "segmentTAGS:", segmentTAGS)
        printLOG(3, "equil:", equil)
        (segmentTAGS, tagInici, tagFinal) = config.tagrestorer.remove_start_end_tag(segmentTAGS)
        printLOG(3, "TAG initial:", tagInici)
        printLOG(3, "TAG final:", tagFinal)
        segmentNOTAGS = config.tagrestorer.remove_tags(segment)
    else:
        segmentTAGS = segment
        segmentNOTAGS = segment
    # NOTE(review): at this point `segment` may already be truecased and
    # tokenized, and that form is what gets returned for short segments.
    # Confirm whether the original text was meant instead.
    if len(segmentNOTAGS) < config.min_chars_segment:
        return segment

    # --- Placeholders --------------------------------------------------
    # The configured codes are passed explicitly so that they match the
    # ones used by restore_EMAILs / restore_URLs further down.
    if config.MTUOCServer_EMAILs:
        segmentTAGS = replace_EMAILs(segmentTAGS, code=config.code_EMAILs)
        segmentNOTAGS = replace_EMAILs(segmentNOTAGS, code=config.code_EMAILs)
        printLOG(3, "Replace EMAILs:", segmentTAGS)
    if config.MTUOCServer_URLs:
        segmentTAGS = replace_URLs(segmentTAGS, code=config.code_URLs)
        segmentNOTAGS = replace_URLs(segmentNOTAGS, code=config.code_URLs)
        printLOG(3, "Replace URLs:", segmentTAGS)
    if config.pre_replace_NUMs:
        segmentTAGS = replace_NUMs(segmentTAGS, code=config.code_NUMs)
        segmentNOTAGS = replace_NUMs(segmentNOTAGS, code=config.code_NUMs)
        printLOG(3, "Replace NUMs:", segmentTAGS)
    if config.pre_split_NUMs:
        # The equivalence tables are not used: see the note in the
        # candidate loop about the disabled number restoration.
        (segmentTAGS, _) = splitnumbers(segmentTAGS)
        (segmentNOTAGS, _) = splitnumbers(segmentNOTAGS)
        printLOG(3, "Split NUMs:", segmentTAGS)

    # --- Translation ----------------------------------------------------
    segmentPre = preprocess_segment(segmentNOTAGS)
    segmentPreTAGS = preprocess_segment(segmentTAGS)
    if config.MTUOCServer_MTengine == "Marian":
        translation_candidates = translate_segment_Marian(segmentPre)
    elif config.MTUOCServer_MTengine == "Moses":
        translation_candidates = translate_segment_Moses(segmentPre)
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("Unsupported MT engine: " + str(config.MTUOCServer_MTengine))

    translation_candidates["segment"] = segment
    translation_candidates["segmentOrig"] = segmentOrig
    translation_candidates["segmentTAGS"] = segmentTAGS
    translation_candidates["segmentPre"] = segmentPre
    translation_candidates["segmentPreTAGS"] = segmentPreTAGS
    translation_candidates["segmentNOTAGS"] = segmentNOTAGS
    printLOG(3, "*****1", translation_candidates)
    if hastags:
        translation_candidates = restore_tags_translation_candidates(translation_candidates)
    else:
        translation_candidates["translationTAGS"] = translation_candidates["translationNOTAGSPre"]
    printLOG(3, "*****2", translation_candidates)

    # --- Post-processing of every candidate -----------------------------
    for i in range(0, len(translation_candidates["translationNOTAGSPre"])):
        translation_candidates["translationTAGS"][i] = postprocess_segment(translation_candidates["translationTAGS"][i])

        if hastags:
            # Re-attach the tags that wrapped the whole segment and turn
            # the tag placeholders back into the real tags.
            if tagInici:
                translation_candidates["translationTAGS"][i] = tagInici + translation_candidates["translationTAGS"][i]
            if tagFinal:
                translation_candidates["translationTAGS"][i] = translation_candidates["translationTAGS"][i] + tagFinal
            for t in equil:
                translation_candidates["translationTAGS"][i] = translation_candidates["translationTAGS"][i].replace(t, equil[t], 1)

        printLOG(3, "*****3", translation_candidates)
        if totruecase:
            translation_candidates["translationTAGS"][i] = capitalizeMTUOC(translation_candidates["translationTAGS"][i])
        if toupperfinal:
            translation_candidates["translationTAGS"][i] = translation_candidates["translationTAGS"][i].upper()

        # Upper-casing may have altered the tags themselves: bring back the
        # lower-case form of any tag that exists as such in the source.
        hastagstranslation = config.tagrestorer.has_tags(translation_candidates["translationTAGS"][i])
        if hastagstranslation:
            translationtags = config.tagrestorer.get_tags(translation_candidates["translationTAGS"][i])
            for tt in translationtags:
                if not tt in originaltags and tt.lower() in originaltags:
                    translation_candidates["translationTAGS"][i] = translation_candidates["translationTAGS"][i].replace(tt, tt.lower())

        if config.MTUOCServer_EMAILs:
            translation_candidates["translationTAGS"][i] = restore_EMAILs(segmentOrig, translation_candidates["translationTAGS"][i], code=config.code_EMAILs)
        if config.MTUOCServer_URLs:
            translation_candidates["translationTAGS"][i] = restore_URLs(segmentOrig, translation_candidates["translationTAGS"][i], code=config.code_URLs)

        # NOTE(review): restoring the number placeholders (restore_NUMs),
        # re-joining split numbers (desplitnumbers) and upper-casing the
        # first character were present here only as disabled code and
        # remain disabled.

        if config.tokenizerSL and config.tokenizerTL is not None:
            translation_candidates["translationTAGS"][i] = config.tokenizerTL.detokenize(translation_candidates["translationTAGS"][i])

        translation_candidates["translationTAGS"][i] = config.tagrestorer.repairSpacesTags(translation_candidates["segmentOrig"], translation_candidates["translationTAGS"][i])
        printLOG(3, "SELECTED TRANSLATION REAL TAGS", translation_candidates["translationTAGS"][i])

    translation = select_best_candidate(translation_candidates, config.translation_selection_strategy)

    # --- Output replacements --------------------------------------------
    if not config.change_output_files[0] == "None":
        printLOG(3, "CHANGES OUTPUT:")
        printLOG(3, "ORIGINAL:", translation)
        for change in config.changes_output:
            tofind = change[0]
            tochange = change[1]
            regexp = "\\b" + tofind + "\\b"
            if re.findall(regexp, translation):
                translation = re.sub(regexp, tochange, translation)
                printLOG(3, tofind, tochange)
        printLOG(3, "CHANGED:", translation)

    # --- Replacements conditioned on both source and target -------------
    if not config.change_translation_files[0] == "None":
        printLOG(3, "CHANGES TRANSLATION:")
        printLOG(3, "ORIGINAL SOURCE:", segmentOrig)
        printLOG(3, "ORIGINAL TARGET:", translation)
        for change in config.changes_translation:
            tofindSOURCE = change[0]
            tofindTARGET = change[1]
            tochange = change[2]
            regexpSOURCE = "\\b" + tofindSOURCE + "\\b"
            regexpTARGET = "\\b" + tofindTARGET + "\\b"
            trobatSOURCE = re.findall(regexpSOURCE, segmentOrig)
            trobatTARGET = re.findall(regexpTARGET, translation)
            if trobatSOURCE and trobatTARGET:
                translation = re.sub(regexpTARGET, tochange, translation)
                printLOG(3, tofindTARGET, tochange)
        printLOG(3, "CHANGED TARGET:", translation)

    return translation
|
|
|
def is_translatable_old(tokens):
    """Older translatability test: is any token purely alphabetic?

    Args:
        tokens: Tokenized text, with tokens separated by single spaces.

    Returns:
        True when at least one token consists only of alphabetic
        characters, False otherwise.
    """
    return any(token.isalpha() for token in tokens.split(" "))
|
|
|
def is_translatable(tokens):
    """Tell whether the tokenized text has something to translate.

    Args:
        tokens: Tokenized text, with tokens separated by whitespace.

    Returns:
        True when at least one token contains none of the digits 0-9
        (a punctuation-only token also counts), False otherwise,
        including when there are no tokens.
    """
    digits = "0123456789"
    for token in tokens.split():
        if not any(character in digits for character in token):
            return True
    return False
|
|
|
def select_best_candidate(translation_candidates, strategy):
    """Select the best translation among the candidates.

    Several strategies are meant to be implemented; for now only "First"
    exists, and any other value also yields the first candidate instead of
    failing on an unassigned variable.

    Args:
        translation_candidates: Dict whose "translationTAGS" entry holds
            the list of candidate translations.
        strategy: Name of the selection strategy.

    Returns:
        The selected translation.
    """
    if strategy == "First":
        return translation_candidates["translationTAGS"][0]
    # No other strategy is implemented yet: fall back to the first one.
    return translation_candidates["translationTAGS"][0]
|
|
|
|
|
|
|
|
|
|