MTUOC-paracrawl6.0-spa-eng / MTUOC_translate.py

Upload 39 files

9fa4f9e verified 10 months ago

18.2 kB

	import config
	import sys
	import re
	import string as stringmodule


	import srx_segmenter

	from MTUOC_misc import printLOG
	from MTUOC_misc import capitalizeMTUOC
	if config.MTUOCServer_MTengine=="GoogleTranslate":
	from MTUOC_GoogleTranslate import Google_translate
	if config.MTUOCServer_MTengine=="DeepL":
	from MTUOC_DeepL import DeepL_translate
	if config.MTUOCServer_MTengine=="Lucy":
	from MTUOC_Lucy import Lucy_translate
	from MTUOC_Marian import translate_segment_Marian
	from MTUOC_Moses import translate_segment_Moses

	from MTUOC_preprocess import preprocess_segment
	from MTUOC_preprocess import postprocess_segment

	from MTUOC_preprocess import tokenizationSL
	from MTUOC_preprocess import tokenizationTL
	from MTUOC_preprocess import detokenizationSL
	from MTUOC_preprocess import detokenizationTL

	def segmenta(cadena):
		"""Segment *cadena* with the SRX rules selected by ``config.SRXlang``.

		Returns the result of ``SrxSegmenter.extract()`` unchanged; the caller
		in this module unpacks it as ``(segments, separators)``.
		"""
		segmenter = srx_segmenter.SrxSegmenter(config.rules[config.SRXlang], cadena)
		return segmenter.extract()

	def is_first_letter_upper(segment):
		"""Tell whether the first cased letter of *segment* is upper case.

		Non-letters, and letters that are neither upper nor lower case, are
		skipped. Returns False when the segment holds no cased letter.
		"""
		first_cased = next(
			(ch for ch in segment if ch.isalpha() and (ch.isupper() or ch.islower())),
			None,
		)
		return first_cased is not None and first_cased.isupper()

	def upper_case_first_letter(segment):
		"""Return *segment* with its first cased letter converted to upper case.

		Leading non-letters are left alone. If the first cased letter is already
		upper case, or there is none, the segment is returned unchanged.
		"""
		for index, char in enumerate(segment):
			if not char.isalpha():
				continue
			if char.islower():
				# Rebuild around the position so one-to-many upper-casing still works.
				return segment[:index] + char.upper() + segment[index + 1:]
			if char.isupper():
				return segment
		return segment

	###URLs EMAILs

	def findEMAILs(string):
		"""Return the e-mail-like tokens found in *string*, in order of appearance.

		A token is any run of non-space characters containing ``@`` with at
		least one character on each side. One trailing punctuation character
		(as listed in ``string.punctuation``) is stripped from each token, so
		"john@example.com." yields "john@example.com".
		"""
		emails = []
		# Raw string: a plain '\S' is an invalid escape sequence in a normal literal.
		for candidate in re.findall(r'\S+@\S+', string):
			if candidate[-1] in stringmodule.punctuation:
				candidate = candidate[:-1]
			emails.append(candidate)
		return emails

	def findURLs(string):
		"""Return the URLs found in *string*, in order of appearance.

		Matches ``http(s)://`` URLs, ``www.`` hosts and ``domain.tld/`` paths,
		including balanced parentheses inside the URL, and leaves trailing
		sentence punctuation out of the match.
		"""
		# The alternations must be plain '|': written as '\|' they match a literal
		# pipe character and no ordinary URL is ever found.
		regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
		# findall yields one tuple per match; group 1 (index 0) is the whole URL.
		return [groups[0] for groups in re.findall(regex, string)]

	def replace_EMAILs(string,code="@EMAIL@"):
		"""Return *string* with every address reported by findEMAILs replaced by *code*."""
		masked = string
		for address in findEMAILs(string):
			# Replaces all occurrences of this address, not only the first one.
			masked = masked.replace(address, code)
		return masked

	def replace_URLs(string,code="@URL@"):
		"""Return *string* with every URL reported by findURLs replaced by *code*."""
		masked = string
		for link in findURLs(string):
			# Replaces all occurrences of this URL, not only the first one.
			masked = masked.replace(link, code)
		return masked

	# Numeric expression: any run of digits, commas and periods.
	re_num = re.compile(r'[\d,\.]+')

	def replace_NUMs(segment,code="@NUM@"):
		"""Replace each numeric expression in *segment* with *code*.

		Matches of ``re_num`` are processed left to right; a match that is
		exactly "." or "," is ordinary punctuation and is left alone.
		NOTE(review): longer punctuation-only runs such as "..." also match
		``re_num`` and are replaced by *code*.
		"""
		for match in re_num.finditer(segment):
			number = match.group(0)
			if number in (".", ","):
				continue
			# Only the first remaining occurrence is replaced on each pass.
			segment = segment.replace(number, code, 1)
		return segment

	def splitnumbers(segment,joiner=""):
		"""Split every numeric expression of *segment* into separate characters.

		The characters of each ``re_num`` match are joined with *joiner*
		followed by a space ("123" -> "1 2 3" with the default joiner).

		Returns a tuple ``(new_segment, equil)`` where ``equil`` maps each split
		form that contains a space back to the original expression, as consumed
		by desplitnumbers.
		NOTE(review): punctuation-only runs such as "..." also match ``re_num``
		and are split too.
		"""
		separator = joiner + " "
		equil = {}

		def _split(match):
			xifra = match.group(0)
			xifra2 = separator.join(xifra)
			if " " in xifra2:
				equil[xifra2] = xifra
			return xifra2

		# Substitute match by match: a global str.replace() per number corrupts a
		# later number that contains an earlier one ("12 123" -> "1 2 1 23").
		segment = re_num.sub(_split, segment)
		return (segment, equil)

	def desplitnumbers(segment,equil):
		"""Undo splitnumbers: replace each split form in *segment* by its original.

		*equil* maps split forms to original expressions; entries are applied
		in the dictionary's iteration order, every occurrence at once.
		"""
		for spaced, compact in equil.items():
			segment = segment.replace(spaced, compact)
		return segment

	def restore_EMAILs(stringA,stringB,code="@EMAIL@"):
		"""Put the e-mail addresses of *stringA* back into *stringB*.

		For each address that findEMAILs reports in *stringA*, in order, the
		first remaining occurrence of *code* in *stringB* is replaced by it.
		"""
		restored = stringB
		for address in findEMAILs(stringA):
			restored = restored.replace(code, address, 1)
		return restored

	def restore_URLs(stringA,stringB,code="@URL@"):
		"""Put the URLs of *stringA* back into *stringB*.

		For each URL that findURLs reports in *stringA*, in order, the first
		remaining occurrence of *code* in *stringB* is replaced by it.
		"""
		restored = stringB
		for link in findURLs(stringA):
			restored = restored.replace(code, link, 1)
		return restored

	def restore_NUMs(segmentSL,segmentTL,code="@NUM@"):
		"""Put the numeric expressions of *segmentSL* back into *segmentTL*.

		Each ``re_num`` match in the source segment, in order, replaces the
		first remaining occurrence of *code* in the target segment. Matches
		that are exactly "." or "," are skipped, mirroring replace_NUMs.
		"""
		for trobat in re_num.finditer(segmentSL):
			xifra = trobat.group(0)
			if xifra not in (".", ","):
				segmentTL = segmentTL.replace(code, xifra, 1)
		return segmentTL


	def translate_para(paragraph):
		"""Translate *paragraph* and return the translated text.

		When ``config.segment_input`` is set, the paragraph is split with
		segmenta, each segment is translated with translate_segment (and passed
		through ``config.tagrestorer.fix_xml_tags`` when ``config.fix_xml`` is
		set), and the result is rebuilt by interleaving the original separators
		with the translations. Otherwise the whole paragraph is translated as a
		single segment, without the XML fix.
		"""
		if not config.segment_input:
			return translate_segment(paragraph)

		(segments, separators) = segmenta(paragraph)
		translations = []
		for segment in segments:
			translation = translate_segment(segment)
			if config.fix_xml:
				translation = config.tagrestorer.fix_xml_tags(translation)
			translations.append(translation)

		# Rebuild as separator, translation, separator, ... Separators may
		# outnumber translations, so check bounds explicitly instead of
		# swallowing every exception.
		resultat = []
		for i, separator in enumerate(separators):
			resultat.append(separator)
			if i < len(translations):
				resultat.append(translations[i])
		return "".join(resultat)




	def restore_tags_translation_candidates(translation_candidates):
	"""Fill translation_candidates["translationTAGS"] and return the dictionary.

	Reads the keys "segmentTAGS", "segmentOrig", "segmentPreTAGS",
	"translationNOTAGSPre" and "alignments"; writes "segmentTAGS",
	"segmentNOTAGS" and "translationTAGS".
	NOTE(review): indentation was lost in this copy of the file; the nesting
	described in the comments below is inferred from the if/else/try/except
	keywords and should be confirmed against the upstream source.
	"""
	hastags=config.tagrestorer.has_tags(translation_candidates["segmentTAGS"])
	if hastags:

	# NOTE(review): the "segmentTAGS" value produced by replace_tags here is
	# overwritten a few lines below by remove_start_end_tag, which is again
	# called on "segmentOrig" rather than on this result.
	(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
	printLOG(3,"replace_tags",translation_candidates["segmentTAGS"])
	printLOG(3,"equil",equil)
	(translation_candidates["segmentTAGS"],tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(translation_candidates["segmentOrig"])
	printLOG(3,"remove_start_end_tag",translation_candidates["segmentTAGS"])
	printLOG(3,"TAG initial:",tagInici)
	printLOG(3,"TAG final:",tagFinal)
	translation_candidates["segmentNOTAGS"]=config.tagrestorer.remove_tags(translation_candidates["segmentTAGS"])
	# originaltags is assigned but not read again in this function.
	originaltags=config.tagrestorer.get_tags(translation_candidates["segmentTAGS"])
	# Tokenized source without tags and tokenized preprocessed source with tags;
	# both are passed to restore_tags for every candidate.
	segmentNOTAGSTOK=tokenizationSL(translation_candidates["segmentNOTAGS"])
	segmentTAGSTOK=tokenizationSL(translation_candidates["segmentPreTAGS"])
	# One output slot per candidate translation.
	translation_candidates["translationTAGS"]=[None] * len(translation_candidates["translationNOTAGSPre"])
	for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
	try:
	if hastags and config.tag_restoration:
	try:
	alignment=translation_candidates["alignments"][i]
	translationNOTAGSTOK=tokenizationTL(translation_candidates["translationNOTAGSPre"][i])
	translation_candidates["translationTAGS"][i]=config.tagrestorer.restore_tags(segmentNOTAGSTOK, segmentTAGSTOK, alignment, translationNOTAGSTOK)
	# The triple-quoted block that follows is a string literal, i.e. disabled
	# code: tagInici, tagFinal and equil are not applied in this function.
	'''
	if tagInici:
	translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
	if tagFinal:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
	printLOG(3,"SELECTED TRANSLATION SIMPLE TAGS",translation_candidates["translationTAGS"][i])
	for t in equil:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
	'''
	except:
	# Fallback: log the error and use the tokenized candidate without tags.
	# NOTE(review): if the failure happens before translationNOTAGSTOK is
	# assigned in this iteration, the name is unbound or stale here.
	printLOG(3,"ERROR restoring tags:",sys.exc_info())
	translation_candidates["translationTAGS"][i]=translationNOTAGSTOK

	else:
	# Tag restoration disabled: this rebinds the whole "translationTAGS" list
	# to the "translationNOTAGSPre" list (not just element i).
	translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
	printLOG(3,"translationTAGS:",translation_candidates["translationTAGS"][i])



	except:
	# NOTE(review): bare except silently discards any error raised above.
	pass
	else:
	# No tags in the segment: tagged and untagged forms are the same objects.
	translation_candidates["segmentNOTAGS"]=translation_candidates["segmentTAGS"]
	translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]




	return(translation_candidates)


	def translate_segment(segment):
	"""Translate one segment with the engine named by config.MTUOCServer_MTengine.

	For "GoogleTranslate", "DeepL" and "Lucy" the segment is handed straight
	to the engine function and its result is returned. Otherwise the segment
	goes through the configured input changes, truecasing, tokenization, tag
	handling and e-mail/URL/number masking, is translated with Marian or
	Moses, and the candidates are post-processed before one is selected and
	the configured output changes are applied.
	NOTE(review): indentation was lost in this copy of the file; the nesting
	described in the comments below is inferred from the keywords and should
	be confirmed against the upstream source.
	"""
	printLOG(3,"translate_segment",segment)
	# External engines: delegate and return immediately.
	if config.MTUOCServer_MTengine=="GoogleTranslate":
	translation=Google_translate(segment)
	return(translation)
	elif config.MTUOCServer_MTengine=="DeepL":
	translation=DeepL_translate(segment)
	return(translation)
	elif config.MTUOCServer_MTengine=="Lucy":
	translation=Lucy_translate(segment)
	return(translation)
	# Keep the untouched input; it is used later to restore e-mails/URLs and
	# to repair spaces around tags.
	segmentOrig=segment
	# Configured replacements on the input, applied as whole-word regex matches.
	if not config.change_input_files[0]=="None":
	printLOG(3,"CHANGES INPUT:")
	printLOG(3,"ORIGINAL:",segmentOrig)
	for change in config.changes_input:
	tofind=change[0]
	tochange=change[1]
	# NOTE(review): tofind is inserted into the pattern without re.escape.
	regexp="\\b"+tofind+"\\b"
	trobat=re.findall(regexp,segment)
	if trobat:
	segment=re.sub(regexp, tochange, segment)
	printLOG(3,tofind,tochange)
	printLOG(3,"CHANGED:",segment)
	hastags=config.tagrestorer.has_tags(segment)
	originaltags=config.tagrestorer.get_tags(segment)
	printLOG(3,"hastags",hastags)
	printLOG(3,"originaltags",originaltags)
	#truecasing
	# totruecase: truecase the input and re-capitalize the output.
	# toupperfinal: the input (without tags) was all upper case, so the output
	# is upper-cased at the end.
	totruecase=False
	toupperfinal=False
	if not config.truecase==None and config.truecase=="all": totruecase=True
	segmentnotags=config.tagrestorer.remove_tags(segment)
	if not config.truecase==None and config.truecase in ["upper","all"] and segmentnotags.isupper() and not segment=="@URL@" and not segment=="@EMAIL@":
	totruecase=True
	toupperfinal=True
	if config.checkistranslatable:
	# NOTE(review): the second call is made on `segment`, so the result of
	# replace_URLs is overwritten and only e-mails end up masked here.
	segmentNOTAGS=replace_URLs(segment,config.code_URLs)
	segmentNOTAGS=replace_EMAILs(segment,config.code_EMAILs)
	tokens=tokenizationSL(segmentNOTAGS)
	# Segments judged not translatable are returned as they are.
	if not is_translatable(tokens):
	return(segment)
	if totruecase:
	segment=config.truecaser.truecase(segment)
	if config.tokenizerSL:
	segment=config.tokenizerSL.tokenize(segment)
	if hastags:
	segmentTAGS=segment

	# equil, tagInici and tagFinal are used after translation to put the
	# original tags back.
	(segmentTAGS,equil)=config.tagrestorer.replace_tags(segmentTAGS)
	printLOG(3,"segmentTAGS:",segmentTAGS)
	printLOG(3,"equil:",equil)
	printLOG(3,"segmentTAGS:",segmentTAGS)
	(segmentTAGS,tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(segmentTAGS)
	printLOG(3,"TAG initial:",tagInici)
	printLOG(3,"TAG final:",tagFinal)


	segmentNOTAGS=config.tagrestorer.remove_tags(segment)
	else:
	segmentTAGS=segment
	segmentNOTAGS=segment
	# Segments shorter than config.min_chars_segment are returned untranslated.
	if len(segmentNOTAGS)<config.min_chars_segment:
	return(segment)



	# Mask e-mails and URLs in both the tagged and untagged variants.
	# NOTE(review): the default codes "@EMAIL@"/"@URL@" are used here, while
	# restoration below passes config.code_EMAILs/config.code_URLs.
	if config.MTUOCServer_EMAILs:
	segmentTAGS=replace_EMAILs(segmentTAGS)
	segmentNOTAGS=replace_EMAILs(segmentNOTAGS)
	printLOG(3,"Replace EMAILs:",segmentTAGS)
	if config.MTUOCServer_URLs:
	segmentTAGS=replace_URLs(segmentTAGS)
	segmentNOTAGS=replace_URLs(segmentNOTAGS)
	printLOG(3,"Replace URLs:",segmentTAGS)

	# Optional number handling: replace numbers by a code and/or split digits.
	if config.pre_replace_NUMs:
	segmentTAGS=replace_NUMs(segmentTAGS,code=config.code_NUMs)
	segmentNOTAGS=replace_NUMs(segmentNOTAGS,code=config.code_NUMs)
	printLOG(3,"Replace NUMs:",segmentTAGS)
	if config.pre_split_NUMs:
	(segmentTAGS,equilSplitNum)=splitnumbers(segmentTAGS)
	(segmentNOTAGS,equilSplitNum2)=splitnumbers(segmentNOTAGS)
	printLOG(3,"Split NUMs:",segmentTAGS)


	#leading and trailing spaces
	# NOTE(review): leading_spaces and trailing_spaces are computed but not
	# read again in this function.
	leading_spaces=len(segment)-len(segment.lstrip())
	trailing_spaces=len(segment)-len(segment.rstrip())-1
	segmentPre=preprocess_segment(segmentNOTAGS)
	segmentPreTAGS=preprocess_segment(segmentTAGS)
	# NOTE(review): translation_candidates is only assigned for "Marian" and
	# "Moses"; any other engine name reaching this point leaves it unbound.
	if config.MTUOCServer_MTengine=="Marian":
	translation_candidates=translate_segment_Marian(segmentPre)
	elif config.MTUOCServer_MTengine=="Moses":
	translation_candidates=translate_segment_Moses(segmentPre)

	# Attach every variant of the source segment to the candidates dictionary.
	translation_candidates["segment"]=segment
	translation_candidates["segmentOrig"]=segmentOrig
	translation_candidates["segmentTAGS"]=segmentTAGS
	translation_candidates["segmentPre"]=segmentPre
	translation_candidates["segmentPreTAGS"]=segmentPreTAGS
	translation_candidates["segmentNOTAGS"]=segmentNOTAGS
	# NOTE(review): debugging output written to stdout, not through printLOG.
	print("*****1",translation_candidates)
	if hastags:
	translation_candidates=restore_tags_translation_candidates(translation_candidates)
	#(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])

	else:
	translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
	print("*****2",translation_candidates)
	# Post-process every candidate translation in place.
	for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
	translation_candidates["translationTAGS"][i]=postprocess_segment(translation_candidates["translationTAGS"][i])

	# Re-attach the start/end tags and map placeholder tags back to the originals.
	if hastags:
	if tagInici:
	translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
	if tagFinal:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
	for t in equil:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
	#if not config.truecaser==None and is_first_letter_upper(segmentOrig):
	# translation_candidates["translationTAGS"][i]=upper_case_first_letter(translation_candidates["translationTAGS"][i])
	print("*****3",translation_candidates)
	# Undo truecasing on the output; fully upper-case it if the input was.
	if totruecase:
	translation_candidates["translationTAGS"][i]=capitalizeMTUOC(translation_candidates["translationTAGS"][i])
	if toupperfinal:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].upper()
	###LOWERCASE UPPERCASED TAGS
	# A tag that is not among the original tags but whose lower-case form is
	# gets lower-cased again.
	hastagstranslation=config.tagrestorer.has_tags(translation_candidates["translationTAGS"][i])
	if hastagstranslation:
	translationtags=config.tagrestorer.get_tags(translation_candidates["translationTAGS"][i])
	for tt in translationtags:
	if not tt in originaltags and tt.lower() in originaltags:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(tt,tt.lower())

	# Put the original e-mails and URLs back in place of their codes.
	if config.MTUOCServer_EMAILs:
	translation_candidates["translationTAGS"][i]=restore_EMAILs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_EMAILs)
	if config.MTUOCServer_URLs:
	translation_candidates["translationTAGS"][i]=restore_URLs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_URLs)
	# The triple-quoted block that follows is a string literal, i.e. disabled
	# code: number restoration, de-splitting and the detruecase step it
	# contains are not executed.
	'''
	#config.pre_replace_NUMs
	if config.pre_replace_NUMs:
	translation_candidates["translationTAGS"][i]=restore_NUMs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_NUMs)
	#config.pre_split_NUMs
	if config.pre_split_NUMs:
	translation_candidates["translationTAGS"][i]=desplitnumbers(translation_candidates["translationTAGS"][i],equilSplitNum)
	#detruecase
	if totruecase:
	translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i][0].upper()+translation_candidates["translationTAGS"][i][1:]
	#detokenize
	'''
	# NOTE(review): the condition tests config.tokenizerSL but the call uses
	# config.tokenizerTL.
	if config.tokenizerSL and not config.tokenizerTL==None:
	translation_candidates["translationTAGS"][i]=config.tokenizerTL.detokenize(translation_candidates["translationTAGS"][i])



	translation_candidates["translationTAGS"][i]=config.tagrestorer.repairSpacesTags(translation_candidates["segmentOrig"],translation_candidates["translationTAGS"][i])
	printLOG(3,"SELECTED TRANSLATION REAL TAGS",translation_candidates["translationTAGS"][i])
	# Pick one candidate according to the configured selection strategy.
	best_translation=select_best_candidate(translation_candidates,config.translation_selection_strategy)

	translation=best_translation

	# Configured replacements on the output, applied as whole-word regex matches.
	if not config.change_output_files[0]=="None":
	printLOG(3,"CHANGES OUTPUT:")
	printLOG(3,"ORIGINAL:",translation)
	for change in config.changes_output:
	tofind=change[0]
	tochange=change[1]
	regexp="\\b"+tofind+"\\b"
	trobat=re.findall(regexp,translation)
	if trobat:
	translation=re.sub(regexp, tochange, translation)
	printLOG(3,tofind,tochange)
	printLOG(3,"CHANGED:",translation)

	# Replacements on the output that apply only when a source pattern is found
	# in the original segment and a target pattern is found in the translation.
	if not config.change_translation_files[0]=="None":
	printLOG(3,"CHANGES TRANSLATION:")
	printLOG(3,"ORIGINAL SOURCE:",segmentOrig)
	printLOG(3,"ORIGINAL TARGET:",translation)
	for change in config.changes_translation:
	tofindSOURCE=change[0]
	tofindTARGET=change[1]
	tochange=change[2]
	regexpSOURCE="\\b"+tofindSOURCE+"\\b"
	regexpTARGET="\\b"+tofindTARGET+"\\b"
	trobatSOURCE=re.findall(regexpSOURCE,segmentOrig)
	trobatTARGET=re.findall(regexpTARGET,translation)
	if trobatSOURCE and trobatTARGET:
	translation=re.sub(regexpTARGET, tochange, translation)
	printLOG(3,tofindTARGET,tochange)
	printLOG(3,"CHANGED TARGET:",translation)

	return(translation)

	def is_translatable_old(tokens):
		"""Older translatability check: True if any token is purely alphabetic.

		*tokens* is a string of tokens separated by single spaces.
		"""
		return any(token.isalpha() for token in tokens.split(" "))

	def is_translatable(tokens):
		"""Tell whether a tokenized string holds something worth translating.

		*tokens* is split on whitespace; the result is True as soon as one
		token contains none of the ASCII digits 0-9, and False otherwise
		(including for an empty string).
		"""
		digits = set("0123456789")
		return any(digits.isdisjoint(token) for token in tokens.split())

	def select_best_candidate(translation_candidates,strategy):
		"""Select the best translation among the candidates.

		Several strategies are meant to be implemented; for now only "First"
		exists, which returns the first entry of
		``translation_candidates["translationTAGS"]``. Any other value of
		*strategy* falls back to the same behavior instead of failing on an
		unassigned result.
		"""
		# TODO: dispatch on `strategy` once more strategies are implemented.
		return translation_candidates["translationTAGS"][0]