Aleph-Weo-Webeta

Running

App Files Files Community

Aleph-Weo-Webeta / soni_translate /translate_segments.py

r3gm

v0.5.0

b152010 6 months ago

raw

history blame contribute delete

15.3 kB

	from tqdm import tqdm
	from deep_translator import GoogleTranslator
	from itertools import chain
	import copy
	from .language_configuration import fix_code_language, INVERTED_LANGUAGES
	from .logging_setup import logger
	import re
	import json
	import time

	# Every value translate_text() accepts for `translation_process`.
	# The "*_batch" entries send several segments per request; the plain
	# entries translate one segment at a time.
	TRANSLATION_PROCESS_OPTIONS = [
	    "google_translator_batch",
	    "google_translator",
	    "gpt-3.5-turbo-0125_batch",
	    "gpt-3.5-turbo-0125",
	    "gpt-4-turbo-preview_batch",
	    "gpt-4-turbo-preview",
	    "disable_translation",
	]

	# Same choices without the "*_batch" variants.
	# NOTE(review): not referenced in this file; presumably consumed by the
	# document-translation flow -- confirm against the caller.
	DOCS_TRANSLATION_PROCESS_OPTIONS = [
	    "google_translator",
	    "gpt-3.5-turbo-0125",
	    "gpt-4-turbo-preview",
	    "disable_translation",
	]


	def translate_iterative(segments, target, source=None):
	    """
	    Translate every segment on its own with Google Translate.

	    Parameters:
	    - segments (list): Dictionaries holding the segment text under the
	        'text' key.
	    - target (str): Language code to translate into.
	    - source (str, optional): Language code of the input text. A falsy
	        value is replaced by "auto".

	    Returns:
	    - list: A deep copy of `segments` whose 'text' values were replaced by
	        the translator output; the input list itself is not modified.

	    Notes:
	    - One translator request is issued per segment, with the text
	        stripped of surrounding whitespace first.

	    Example:
	        segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
	        translated_segments = translate_iterative(segments, 'es')
	    """
	    result = copy.deepcopy(segments)

	    if not source:
	        logger.debug("No source language")
	        source = "auto"

	    translator = GoogleTranslator(source=source, target=target)

	    # tqdm wraps the list directly, so the bar advances once per segment.
	    for segment in tqdm(result):
	        segment["text"] = translator.translate(segment["text"].strip())

	    return result


	def verify_translate(
	    segments,
	    segments_copy,
	    translated_lines,
	    target,
	    source
	):
	    """
	    Store the translated lines in `segments_copy`, or re-translate
	    everything iteratively when the lines cannot be trusted.

	    Parameters:
	    - segments (list): Original segments; only read here, and handed to
	        `translate_iterative()` on fallback.
	    - segments_copy (list): Working copy whose 'text' values are
	        overwritten in place with the translations.
	    - translated_lines (list): One translated string per segment, in
	        segment order.
	    - target (str): Target language code, used only by the fallback.
	    - source (str): Source language code, used only by the fallback.

	    Returns:
	    - list: `segments_copy` with updated 'text' values, or the result of
	        `translate_iterative(segments, target, source)` on fallback.

	    Notes:
	    - The fallback is taken when the number of lines does not match the
	        number of segments, or when any line is not a string (it could
	        not be stripped and would otherwise raise AttributeError).
	    - Tabs and newlines are removed from every stored translation.
	    """
	    lines_usable = (
	        len(segments) == len(segments_copy) == len(translated_lines)
	        and all(isinstance(line, str) for line in translated_lines)
	    )

	    if not lines_usable:
	        logger.error(
	            "The translation failed, switching to google_translate iterative. "
	            f"{len(segments), len(translated_lines)}"
	        )
	        return translate_iterative(segments, target, source)

	    # Validation is complete before the first write, so the copy is never
	    # left half-updated.
	    for segment, translated in zip(segments_copy, translated_lines):
	        logger.debug(
	            f"{segment['text']} >> "
	            f"{translated.strip()}"
	        )
	        segment["text"] = translated.replace(
	            "\t", "").replace("\n", "").strip()
	    return segments_copy


	def translate_batch(segments, target, chunk_size=2000, source=None):
	    """
	    Translate a batch of text segments into the specified language in chunks,
	    respecting the character limit.

	    Parameters:
	    - segments (list): List of dictionaries with 'text' as a key for segment
	        text.
	    - target (str): Target language code.
	    - chunk_size (int, optional): Maximum character limit for each
	        translation chunk (default is 2000; max 5000).
	    - source (str, optional): Source language code. A falsy value is
	        replaced by "auto".

	    Returns:
	    - list: Translated text segments in the target language.

	    Notes:
	    - Segment texts are joined with a " ||||| " marker so that one request
	        carries several segments; the reply is split on "|||||" again.
	    - The marker length counts towards `chunk_size`. A single segment
	        longer than `chunk_size` is still sent, alone in its own chunk.
	    - When a reply does not split into the expected number of parts, only
	        that chunk is re-translated segment by segment.
	    - If any request raises, the whole input is re-translated with
	        `translate_iterative()`.

	    Example:
	        segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
	        translated = translate_batch(
	            segments, 'es', chunk_size=4000, source='en'
	        )
	    """
	    separator = "|||||"
	    joiner = f" {separator} "

	    segments_copy = copy.deepcopy(segments)

	    if not source:
	        logger.debug("No source language")
	        source = "auto"

	    # An empty segment becomes a single space so it still occupies a slot
	    # between two separators.
	    text_lines = [
	        segment["text"].strip() or " " for segment in segments_copy
	    ]

	    # Group the lines into chunks whose joined length fits in chunk_size.
	    chunks = []
	    current_lines = []
	    current_length = 0
	    for one_line in text_lines:
	        added_length = len(one_line)
	        if current_lines:
	            added_length += len(joiner)
	        if current_lines and current_length + added_length > chunk_size:
	            chunks.append(current_lines)
	            current_lines = [one_line]
	            current_length = len(one_line)
	        else:
	            current_lines.append(one_line)
	            current_length += added_length
	    if current_lines:
	        chunks.append(current_lines)

	    # translate chunks
	    progress_bar = tqdm(total=len(segments), desc="Translating")
	    translator = GoogleTranslator(source=source, target=target)
	    split_list = []
	    try:
	        for chunk_lines in chunks:
	            merged_text = joiner.join(chunk_lines)
	            translated_line = translator.translate(merged_text.strip())
	            split_text = translated_line.split(separator)
	            if len(split_text) == len(chunk_lines):
	                progress_bar.update(len(split_text))
	            else:
	                logger.debug(
	                    "Chunk fixing iteratively. Len chunk: "
	                    f"{len(split_text)}, expected: {len(chunk_lines)}"
	                )
	                split_text = []
	                for txt_iter in chunk_lines:
	                    translated_txt = translator.translate(txt_iter.strip())
	                    split_text.append(translated_txt)
	                    progress_bar.update(1)
	            split_list.append(split_text)
	        progress_bar.close()
	    except Exception as error:
	        # Close this bar before the fallback opens its own one.
	        progress_bar.close()
	        logger.error(str(error))
	        logger.warning(
	            "The translation in chunks failed, switching to iterative."
	            " Related: too many request"
	        )  # use proxy or less chunk size
	        return translate_iterative(segments, target, source)

	    # un chunk
	    translated_lines = list(chain.from_iterable(split_list))

	    return verify_translate(
	        segments, segments_copy, translated_lines, target, source
	    )


	def call_gpt_translate(
	    client,
	    model,
	    system_prompt,
	    user_prompt,
	    original_text=None,
	    batch_lines=None,
	):
	    """
	    Send one chat-completion request in JSON mode and extract the
	    translation from the reply.

	    Parameters:
	    - client: Object exposing `chat.completions.create(...)`.
	    - model (str): Model name forwarded to the request.
	    - system_prompt (str): Content of the system message.
	    - user_prompt (str): Content of the user message.
	    - original_text (dict, optional): The batch that was sent, shaped as
	        {"conversation": [{code: text}, ...]}. Required when `batch_lines`
	        is set; used to skip reply values that merely echo the input.
	    - batch_lines (int, optional): Number of lines expected back. A falsy
	        value selects single-text mode.

	    Returns:
	    - list: In batch mode, a list of single-item {code: text} dicts.
	    - str: In single-text mode, the translated text.

	    Raises:
	    - ValueError: The reply has no text content, no usable JSON object, or
	        no string translation (json.JSONDecodeError is a ValueError).
	    - Other exceptions from the client, or from an unexpected reply shape,
	        propagate unchanged.
	    """

	    # https://platform.openai.com/docs/guides/text-generation/json-mode
	    response = client.chat.completions.create(
	        model=model,
	        response_format={"type": "json_object"},
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_prompt}
	        ]
	    )
	    result = response.choices[0].message.content
	    logger.debug(f"Result: {str(result)}")

	    if not isinstance(result, str):
	        raise ValueError(f"No valid response received: {str(result)}")

	    try:
	        translation = json.loads(result)
	    except json.JSONDecodeError as error:
	        logger.error(str(error))
	        translation = None
	        # Try the widest "{...}" span first so nested or multi-line
	        # objects survive, then the shortest span as a last resort.
	        for pattern in (r'\{.*\}', r'\{.*?\}'):
	            match_result = re.search(pattern, result, re.DOTALL)
	            if match_result is None:
	                break
	            try:
	                translation = json.loads(match_result.group(0))
	                break
	            except json.JSONDecodeError:
	                continue
	        if translation is None:
	            raise error

	    # Get valid data
	    if batch_lines:
	        if not isinstance(translation, dict) or not translation:
	            raise ValueError(
	                f"No valid response received: {str(translation)}"
	            )

	        first_original = list(
	            original_text["conversation"][0].values()
	        )[0].strip()

	        # Pick the first value that is not an echo of the input and has
	        # the expected length; otherwise the last value inspected is kept.
	        for conversation in translation.values():
	            if isinstance(conversation, dict):
	                conversation = list(conversation.values())[0]
	            if first_original == list(conversation[0].values())[0].strip():
	                continue
	            if len(conversation) == batch_lines:
	                break

	        # One dict per translated line, even if the model merged several
	        # lines into one dict.
	        fix_conversation_length = [
	            {speaker_code: text_tr}
	            for line in conversation
	            for speaker_code, text_tr in line.items()
	        ]

	        logger.debug(f"Data batch: {str(fix_conversation_length)}")
	        logger.debug(
	            f"Lines Received: {len(fix_conversation_length)},"
	            f" expected: {batch_lines}"
	        )

	        return fix_conversation_length

	    # Single-text mode: unwrap {"key": value} and [value] containers.
	    if isinstance(translation, dict):
	        translation = next(iter(translation.values()), None)
	    if isinstance(translation, list):
	        translation = translation[0] if translation else None
	    if not isinstance(translation, str):
	        raise ValueError(f"No valid response received: {str(translation)}")

	    return translation


	def gpt_sequential(segments, model, target, source=None):
	    """
	    Translate the segments one by one with an OpenAI chat model.

	    Parameters:
	    - segments (list): Dictionaries providing at least the 'text' and
	        'start' keys.
	    - model (str): Model name forwarded to `call_gpt_translate()`.
	    - target (str): Target language code; must be a key of
	        INVERTED_LANGUAGES.
	    - source (str, optional): Source language code; when given it must be
	        a key of INVERTED_LANGUAGES.

	    Returns:
	    - list: A deep copy of `segments` with translated 'text' values.

	    Notes:
	    - Waits 0.5 seconds before every request.
	    - A segment whose GPT request fails is translated with Google
	        Translate instead; an error in that fallback propagates.
	    """
	    from openai import OpenAI

	    translated_segments = copy.deepcopy(segments)

	    client = OpenAI()

	    # Language names with any "(...)" suffix removed, for use in the prompt.
	    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
	    lang_sc = ""
	    if source:
	        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()

	    fixed_target = fix_code_language(target)
	    fixed_source = fix_code_language(source) if source else "auto"

	    system_prompt = "Machine translation designed to output the translated_text JSON."

	    # Created on the first failure only and reused afterwards.
	    fallback_translator = None

	    progress_bar = tqdm(total=len(segments), desc="Translating")
	    try:
	        for line in translated_segments:
	            text = line["text"].strip()
	            start = line["start"]
	            user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"

	            time.sleep(0.5)

	            try:
	                translated_text = call_gpt_translate(
	                    client,
	                    model,
	                    system_prompt,
	                    user_prompt,
	                )

	            except Exception as error:
	                logger.error(
	                    f"{str(error)} >> The text of segment {start} "
	                    "is being corrected with Google Translate"
	                )
	                if fallback_translator is None:
	                    fallback_translator = GoogleTranslator(
	                        source=fixed_source, target=fixed_target
	                    )
	                translated_text = fallback_translator.translate(text)

	            line["text"] = translated_text.strip()
	            progress_bar.update(1)
	    finally:
	        # Close the bar even when an error propagates.
	        progress_bar.close()

	    return translated_segments


	def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
	    """
	    Translate the segments with an OpenAI chat model, several per request.

	    Parameters:
	    - segments (list): Dictionaries providing at least the 'text',
	        'speaker' and 'start' keys.
	    - model (str): Model name forwarded to `call_gpt_translate()`.
	    - target (str): Target language code; must be a key of
	        INVERTED_LANGUAGES.
	    - token_batch_limit (int, optional): Token budget per request. The
	        value used for input text is max(100, (limit - 40) // 2).
	    - source (str, optional): Source language code; when given it must be
	        a key of INVERTED_LANGUAGES.

	    Returns:
	    - list: The result of `verify_translate()` for the collected lines.

	    Notes:
	    - Each line is sent as {code: text}, where code is a speaker letter
	        followed by a per-batch counter (A1, A2, B1, ...). Only the values
	        of the reply are used.
	    - A batch that fails, or comes back with fewer lines than sent, is
	        translated line by line with Google Translate instead.
	    """
	    from openai import OpenAI
	    import tiktoken

	    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
	    segments_copy = copy.deepcopy(segments)
	    encoding = tiktoken.get_encoding("cl100k_base")
	    client = OpenAI()

	    # Language names with any "(...)" suffix removed, for use in the prompt.
	    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
	    lang_sc = ""
	    if source:
	        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()

	    fixed_target = fix_code_language(target)
	    fixed_source = fix_code_language(source) if source else "auto"

	    name_speaker = "ABCDEFGHIJKL"

	    translated_lines = []
	    text_data_dict = []
	    num_tokens = 0
	    count_sk = dict.fromkeys(name_speaker, 0)
	    # Created on the first failed batch only and reused afterwards.
	    fallback_translator = None
	    last_index = len(segments_copy) - 1

	    progress_bar = tqdm(total=len(segments), desc="Translating")
	    try:
	        for i, line in enumerate(segments_copy):
	            text = line["text"]
	            speaker = line["speaker"]
	            last_start = line["start"]
	            # assumes speaker labels end in a two-digit index such as
	            # "SPEAKER_00" -- TODO confirm against the diarization output
	            index_sk = int(speaker[-2:])
	            # Wrap around so indexes past the last letter do not raise;
	            # the letter is only a label inside the prompt.
	            character_sk = name_speaker[index_sk % len(name_speaker)]
	            count_sk[character_sk] += 1
	            code_sk = character_sk + str(count_sk[character_sk])
	            text_data_dict.append({code_sk: text})
	            num_tokens += len(encoding.encode(text)) + 7

	            if num_tokens < token_batch_limit and i != last_index:
	                continue

	            # The batch is full or this is the last segment: freeze it and
	            # reset the accumulators for the next one.
	            batch_lines = len(text_data_dict)
	            batch_conversation = {"conversation": text_data_dict}
	            num_tokens = 0
	            text_data_dict = []
	            count_sk = dict.fromkeys(name_speaker, 0)

	            try:
	                # Process translation
	                # https://arxiv.org/pdf/2309.03409.pdf
	                system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
	                user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
	                logger.debug(f"Prompt: {str(user_prompt)}")

	                conversation = call_gpt_translate(
	                    client,
	                    model,
	                    system_prompt,
	                    user_prompt,
	                    original_text=batch_conversation,
	                    batch_lines=batch_lines,
	                )

	                if len(conversation) < batch_lines:
	                    raise ValueError(
	                        "Incomplete result received. Batch lines: "
	                        f"{len(conversation)}, expected: {batch_lines}"
	                    )

	                # Collected separately so that a failure here cannot leave
	                # part of the batch in translated_lines before the fallback
	                # adds the whole batch again. Extra lines are dropped.
	                batch_translation = [
	                    list(translated_item.values())[0]
	                    for translated_item in conversation[:batch_lines]
	                ]

	            except Exception as error:
	                logger.error(str(error))

	                first_start = segments_copy[
	                    max(0, i - (batch_lines - 1))
	                ]["start"]
	                logger.warning(
	                    f"The batch from {first_start} to {last_start} "
	                    "failed, is being corrected with Google Translate"
	                )

	                if fallback_translator is None:
	                    fallback_translator = GoogleTranslator(
	                        source=fixed_source,
	                        target=fixed_target
	                    )

	                for txt_source in batch_conversation["conversation"]:
	                    translated_txt = fallback_translator.translate(
	                        list(txt_source.values())[0].strip()
	                    )
	                    translated_lines.append(translated_txt.strip())
	                    progress_bar.update(1)

	            else:
	                translated_lines.extend(batch_translation)
	                progress_bar.update(batch_lines)
	    finally:
	        # Close the bar even when an error propagates.
	        progress_bar.close()

	    return verify_translate(
	        segments, segments_copy, translated_lines, fixed_target, fixed_source
	    )


	def translate_text(
	    segments,
	    target,
	    translation_process="google_translator_batch",
	    chunk_size=4500,
	    source=None,
	    token_batch_limit=1000,
	):
	    """
	    Dispatch the segments to the back-end named by `translation_process`.

	    Parameters:
	    - segments (list): Segments handed to the selected back-end.
	    - target (str): Target language code. The Google back-ends receive
	        it after `fix_code_language()`; the GPT back-ends receive it as is.
	    - translation_process (str): Name of the back-end to use.
	    - chunk_size (int): Forwarded to `translate_batch()` only.
	    - source (str, optional): Source language code, treated like `target`.
	    - token_batch_limit (int): Forwarded to `gpt_batch()` only.

	    Returns:
	    - list: Whatever the selected back-end returns. With
	        "disable_translation" the very same `segments` object is returned.

	    Raises:
	    - ValueError: `translation_process` is not a known back-end.
	    """
	    sequential_models = ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]
	    batch_models = ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch"]

	    if translation_process == "google_translator_batch":
	        return translate_batch(
	            segments,
	            fix_code_language(target),
	            chunk_size,
	            fix_code_language(source),
	        )

	    if translation_process == "google_translator":
	        return translate_iterative(
	            segments,
	            fix_code_language(target),
	            fix_code_language(source),
	        )

	    if translation_process in sequential_models:
	        return gpt_sequential(segments, translation_process, target, source)

	    if translation_process in batch_models:
	        # The model name is the option without its "_batch" suffix.
	        model_name = translation_process.replace("_batch", "")
	        return gpt_batch(
	            segments,
	            model_name,
	            target,
	            token_batch_limit,
	            source,
	        )

	    if translation_process == "disable_translation":
	        return segments

	    raise ValueError("No valid translation process")