import sys
import pickle
import argparse
import importlib
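
# Assumed layout of the pickled truecasing model (as consumed by
# Truecaser.truecase() below): a dict mapping each lowercased token to the
# counts of the casings observed in training, e.g.
#   {"london": {"lc": 3, "u1": 950, "uc": 2}}
# where "lc" = all-lowercase, "u1" = initial uppercase, "uc" = all-uppercase.
# The example entry is illustrative, not taken from a real model.
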
class Truecaser:

    def __init__(self, MTUOCPath=".", tokenizer=None, tc_model=None):
        # Punctuation symbols that may legitimately open a segment.
        self.initchars = ["¡", "¿", "-", "*", "+", "'", '"', "«", "»", "—", "‘", "’", "“", "”", "„"]

        if tokenizer is None:
            self.tokenizer = None
        else:
            sys.path.append(MTUOCPath)
            if tokenizer.endswith(".py"):
                tokenizer = tokenizer.replace(".py", "")
            self.module = importlib.import_module(tokenizer)
            self.tokenizer = self.module.Tokenizer()
        if tc_model:
            with open(tc_model, "rb") as modelfile:
                self.tc_model = pickle.load(modelfile)
        else:
            self.tc_model = {}

    def set_tc_model(self, tc_model):
        with open(tc_model, "rb") as modelfile:
            self.tc_model = pickle.load(modelfile)

    def set_tokenizer(self, tokenizer):
        if tokenizer.endswith(".py"):
            tokenizer = tokenizer.replace(".py", "")
        self.module = importlib.import_module(tokenizer)
        self.tokenizer = self.module.Tokenizer()

    def set_MTUOCPath(self, path):
        sys.path.append(path)

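    # Note: the tokenizer module loaded by __init__/set_tokenizer is expected
    # to expose a Tokenizer class implementing tokenize(), tokenize_s() and
    # detokenize_s() (space-separated output carrying "▁" split marks), and
    # tokenize_j()/detokenize_j() ("■" join marks), as the methods below assume.
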
    def isinitsymbol(self, token):
        # True for a single-character symbol that may precede the first word.
        return len(token) == 1 and token in self.initchars

    def detect_type(self, segment):
        # Classifies a segment: "titled" if it has at least five alphabetic
        # tokens, half or more of them carrying uppercase, and it is not
        # entirely uppercased; "uppercased" if it is all caps; else "regular".
        if self.tokenizer is None:
            tokens = segment.split(" ")
        else:
            tokens = self.tokenizer.tokenize(segment)
        ntok = 0
        utokens = 0
        for token in tokens:
            # Strip split ("▁") and join ("■") marks before inspecting case.
            token = token.replace("▁", "").replace("■", "")
            if token.isalpha():
                ntok += 1
                if token != token.lower():
                    utokens += 1
        if ntok >= 5 and utokens >= ntok / 2 and not segment == segment.upper():
            tipus = "titled"
        elif segment == segment.upper():
            tipus = "uppercased"
        else:
            tipus = "regular"
        return tipus

    def truecase(self, line, ucf=False, restoreCase=False):
        # Truecases a line token by token using the counts in tc_model. With
        # ucf=True the first word of the result is capitalized; with
        # restoreCase=True lowercase tokens are also re-cased (case restoration).
        if self.tokenizer:
            tokens = self.tokenizer.tokenize_s(line).split(" ")
        else:
            tokens = line.split(" ")
        nsegment = []
        for token in tokens:
            try:
                # Remember split/join marks so they can be restored once the
                # bare token has been re-cased.
                leadingsplitter = token.startswith("▁")
                trailingsplitter = token.endswith("▁")
                leadingjoiner = token.startswith("■")
                trailingjoiner = token.endswith("■")
                token = token.replace("▁", "").replace("■", "")
                counts = self.tc_model.get(token.lower(), {})
                nlc = counts.get("lc", 0)  # times seen all-lowercase
                nu1 = counts.get("u1", 0)  # times seen with initial uppercase
                nuc = counts.get("uc", 0)  # times seen all-uppercase
                proceed = token != token.lower() or restoreCase
                if proceed:
                    # Apply the most frequent casing observed in training.
                    if nlc > 0 and nlc >= nu1 and nlc >= nuc:
                        token = token.lower()
                    elif nu1 > 0 and nu1 > nlc and nu1 > nuc:
                        token = token.lower().capitalize()
                    elif nuc > 0 and nuc > nlc and nuc > nu1:
                        token = token.upper()
                if leadingsplitter:
                    token = "▁" + token
                if trailingsplitter:
                    token = token + "▁"
                if leadingjoiner:
                    token = "■" + token
                if trailingjoiner:
                    token = token + "■"
                nsegment.append(token)
            except Exception:
                print("ERROR", sys.exc_info())
                nsegment.append(token)
        if self.tokenizer:
            nsegment = self.tokenizer.detokenize_s(" ".join(nsegment))
        else:
            nsegment = " ".join(nsegment)
        if ucf and nsegment:
            # Capitalize the first word, skipping one leading symbol such as
            # "¡" or "«" if present.
            firstchar = 1 if self.isinitsymbol(nsegment[0]) else 0
            try:
                nsegment = nsegment[:firstchar] + nsegment[firstchar].upper() + nsegment[firstchar + 1:]
            except IndexError:
                pass
        return nsegment

    def detruecase_old(self, line, tokenizer):
        # Capitalizes the first alphabetic token and, if a tokenizer is given,
        # detokenizes the result.
        if tokenizer:
            tokens = tokenizer.tokenize_j(line).split(" ")
        else:
            tokens = line.split(" ")
        new = []
        yet = False
        for token in tokens:
            if not yet and token.isalpha():
                yet = True
                new.append(token[0].upper() + token[1:])
            else:
                new.append(token)
        line = " ".join(new)
        if tokenizer:
            line = tokenizer.detokenize_j(line)
        return line

    def detruecase(self, line):
        # Uppercases the first character, leaving the rest of the line intact.
        if line:
            line = line[0].upper() + line[1:]
        return line


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MTUOC program for truecasing.')
    parser.add_argument('-m', '--model', action="store", dest="model", help='The truecasing model to use.', required=True)
    parser.add_argument('-t', '--tokenizer', action="store", dest="tokenizer", help='The tokenizer to use.', required=False)
    parser.add_argument('-u', '--ucf', action="store_true", dest="ucf", help='Set if you want the first word capitalized.', required=False)
    parser.add_argument('-r', '--restore', action="store_true", dest="restore", help='Set if you want to restore case (re-case lowercased tokens too).', required=False)
    parser.add_argument('--mtuoc', '--MTUOC', action="store", dest="MTUOC", help='The path to the MTUOC components.', required=False)

    args = parser.parse_args()
    model = args.model
    ucf = args.ucf
    restore = args.restore

    if args.MTUOC:
        MTUOCPath = args.MTUOC
    else:
        MTUOCPath = "."

    truecaser = Truecaser(MTUOCPath, args.tokenizer, model)
    for line in sys.stdin:
        line = line.strip()
        tcline = truecaser.truecase(line, ucf, restore)
        print(tcline)
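
# A minimal usage sketch (file names are illustrative, assuming this script is
# saved as MTUOC_truecaser.py, with a pickled model "tc-eng.pickle" and an
# MTUOC tokenizer module "MTUOC_tokenizer_eng.py" in the current directory):
#
#   cat input.txt | python3 MTUOC_truecaser.py -m tc-eng.pickle -t MTUOC_tokenizer_eng.py -u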