# Page-header residue from the hosting site (not part of the module): "Spaces: Running"
from tqdm import tqdm | |
from deep_translator import GoogleTranslator | |
from itertools import chain | |
import copy | |
from .language_configuration import fix_code_language, INVERTED_LANGUAGES | |
from .logging_setup import logger | |
import re | |
import json | |
import time | |
# Identifiers of the translation back ends accepted for segment translation.
# Entries ending in "_batch" name the batched variant of the same back end.
TRANSLATION_PROCESS_OPTIONS = [
    # Google Translate
    "google_translator_batch",
    "google_translator",
    # OpenAI chat models
    "gpt-3.5-turbo-0125_batch",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview_batch",
    "gpt-4-turbo-preview",
    # Pass-through
    "disable_translation",
]

# Subset without the "_batch" variants.
# NOTE(review): presumably the choices offered for document translation --
# confirm against the caller.
DOCS_TRANSLATION_PROCESS_OPTIONS = [
    "google_translator",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview",
    "disable_translation",
]
def translate_iterative(segments, target, source=None):
    """
    Translate every segment on its own, one request per segment.

    Parameters:
    - segments (list): Dictionaries holding the segment text under the
        'text' key. The input list is not modified.
    - target (str): Target language code.
    - source (str, optional): Source language code. A falsy value is
        replaced by "auto".

    Returns:
    - list: A deep copy of `segments` whose 'text' values were replaced by
        the translator's output.

    Notes:
    - Uses a single GoogleTranslator instance for all segments.
    - The text is stripped before it is sent to the translator.

    Example:
    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
    translated_segments = translate_iterative(segments, 'es')
    """
    translated_segments = copy.deepcopy(segments)

    if not source:
        logger.debug("No source language")
        source = "auto"

    translator = GoogleTranslator(source=source, target=target)

    # The copies are mutated in place while the progress bar iterates them.
    for segment in tqdm(translated_segments):
        segment["text"] = translator.translate(segment["text"].strip())

    return translated_segments
def verify_translate(
    segments,
    segments_copy,
    translated_lines,
    target,
    source
):
    """
    Write the translated lines into the segment copies when the line count
    matches; otherwise redo the whole translation segment by segment.

    Parameters:
    - segments (list): Original segments; used for the length check and as
        input of the iterative fallback.
    - segments_copy (list): Copy of the segments that is updated in place
        and returned on success.
    - translated_lines (list): Translated strings, one per segment.
    - target (str): Target language code for the fallback.
    - source (str): Source language code for the fallback.

    Returns:
    - list: `segments_copy` with tabs and newlines removed from each
        translated text, or the result of `translate_iterative()`.
    """
    if len(segments) != len(translated_lines):
        logger.error(
            "The translation failed, switching to google_translate iterative. "
            f"{len(segments), len(translated_lines)}"
        )
        return translate_iterative(segments, target, source)

    for index, segment in enumerate(segments_copy):
        new_text = translated_lines[index]
        logger.debug(
            f"{segment['text']} >> "
            f"{new_text.strip()}"
        )
        without_tabs = new_text.replace("\t", "")
        segment["text"] = without_tabs.replace("\n", "").strip()

    return segments_copy
def _merge_lines_into_chunks(text_lines, chunk_size, separator=" ||||| "):
    """Group lines into separator-joined chunks of at most `chunk_size` chars.

    Returns two parallel lists: the joined chunk strings and, for each chunk,
    the list of lines it was built from. The separator length is counted
    against the limit. A line longer than `chunk_size` gets a chunk of its
    own, so no empty chunk is ever produced.
    """
    merged_chunks = []
    grouped_lines = []
    current_lines = []
    current_length = 0

    for one_line in text_lines:
        extra = len(one_line) + (len(separator) if current_lines else 0)
        if current_lines and current_length + extra > chunk_size:
            # Flush the chunk in progress and start a new one with this line.
            merged_chunks.append(separator.join(current_lines))
            grouped_lines.append(current_lines)
            current_lines = []
            current_length = 0
            extra = len(one_line)
        current_lines.append(one_line)
        current_length += extra

    if current_lines:
        merged_chunks.append(separator.join(current_lines))
        grouped_lines.append(current_lines)

    return merged_chunks, grouped_lines


def translate_batch(segments, target, chunk_size=2000, source=None):
    """
    Translate a batch of text segments into the specified language in chunks,
    respecting the character limit.

    Parameters:
    - segments (list): List of dictionaries with 'text' as a key for segment
        text. The input list is not modified.
    - target (str): Target language code.
    - chunk_size (int, optional): Maximum character limit for each translation
        chunk, separators included (default is 2000; max 5000).
    - source (str, optional): Source language code. A falsy value is replaced
        by "auto".

    Returns:
    - list: Translated text segments in the target language.

    Notes:
    - Segment texts are joined with " ||||| " into chunks no longer than
        `chunk_size`; a single segment longer than the limit is sent alone.
    - Each chunk is translated with Google Translate and split again on
        "|||||". If the number of pieces differs from the number of lines
        sent, only that chunk is retranslated line by line.
    - If any request raises, the whole job switches to
        `translate_iterative()`.
    - The final line count is checked by `verify_translate()`.

    Example:
    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
    translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
    """
    segments_copy = copy.deepcopy(segments)

    if not source:
        logger.debug("No source language")
        source = "auto"

    # Blank segments are sent as a single space so that they still occupy a
    # slot between separators.
    text_lines = [segment["text"].strip() or " " for segment in segments_copy]

    text_merge, global_text_list = _merge_lines_into_chunks(
        text_lines, chunk_size
    )

    # translate chunks
    progress_bar = tqdm(total=len(segments), desc="Translating")
    translator = GoogleTranslator(source=source, target=target)
    split_list = []
    try:
        for text, text_iterable in zip(text_merge, global_text_list):
            translated_line = translator.translate(text.strip())
            split_text = translated_line.split("|||||")
            if len(split_text) == len(text_iterable):
                progress_bar.update(len(split_text))
            else:
                # The separators did not survive translation: redo this
                # chunk one line at a time.
                logger.debug(
                    "Chunk fixing iteratively. Len chunk: "
                    f"{len(split_text)}, expected: {len(text_iterable)}"
                )
                split_text = []
                for txt_iter in text_iterable:
                    translated_txt = translator.translate(txt_iter.strip())
                    split_text.append(translated_txt)
                    progress_bar.update(1)
            split_list.append(split_text)
        progress_bar.close()
    except Exception as error:
        # Close the bar before the fallback starts its own progress bar.
        progress_bar.close()
        logger.error(str(error))
        logger.warning(
            "The translation in chunks failed, switching to iterative."
            " Related: too many request"
        )  # use proxy or less chunk size
        return translate_iterative(segments, target, source)

    # un chunk
    translated_lines = list(chain.from_iterable(split_list))

    return verify_translate(
        segments, segments_copy, translated_lines, target, source
    )
def call_gpt_translate(
    client,
    model,
    system_prompt,
    user_prompt,
    original_text=None,
    batch_lines=None,
):
    """
    Request a JSON-mode chat completion and extract the translation from it.

    Parameters:
    - client: Object exposing `chat.completions.create(...)`.
    - model (str): Model name passed to the completion request.
    - system_prompt (str): Content of the system message.
    - user_prompt (str): Content of the user message.
    - original_text (dict, optional): The batch that was sent, shaped as
        {"conversation": [{code: text}, ...]}. Read only in batch mode, where
        it is used to skip a candidate whose first line echoes the source.
    - batch_lines (int, optional): Expected number of lines. A truthy value
        selects batch mode.

    Returns:
    - list: In batch mode, one single-key dictionary {code: text} per line
        found in the selected candidate (the caller checks the count).
    - str: Otherwise, the translated text.

    Raises:
    - ValueError: If the reply holds no usable translation (this includes
        JSON decoding errors, which are ValueError subclasses).
    """
    # https://platform.openai.com/docs/guides/text-generation/json-mode
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    result = response.choices[0].message.content
    logger.debug(f"Result: {str(result)}")

    try:
        translation = json.loads(result)
    except Exception as error:
        # The reply is not pure JSON: try to recover an embedded object.
        # The greedy match (outermost braces) is tried first because batch
        # replies are nested; the lazy match is kept as a second attempt.
        matches = (
            re.search(r'\{.*\}', result, re.DOTALL),
            re.search(r'\{.*?\}', result),
        )
        candidates = [match.group(0) for match in matches if match]
        if not candidates:
            raise error
        logger.error(str(error))
        recovery_error = None
        for json_str in candidates:
            try:
                translation = json.loads(json_str)
            except ValueError as decode_error:
                recovery_error = decode_error
            else:
                break
        else:
            raise recovery_error

    # Get valid data
    if batch_lines:
        if not isinstance(translation, dict) or not translation:
            raise ValueError(f"No valid response received: {str(translation)}")

        first_original = list(
            original_text["conversation"][0].values()
        )[0].strip()

        # Pick the first value that has the expected number of lines and is
        # not an echo of the source; if none qualifies, the last value
        # inspected is used as is.
        for conversation in translation.values():
            if isinstance(conversation, dict):
                conversation = list(conversation.values())[0]
            if first_original == list(conversation[0].values())[0].strip():
                continue
            if len(conversation) == batch_lines:
                break

        # Flatten to exactly one {code: text} pair per list item.
        fix_conversation_length = []
        for line in conversation:
            for speaker_code, text_tr in line.items():
                fix_conversation_length.append({speaker_code: text_tr})

        logger.debug(f"Data batch: {str(fix_conversation_length)}")
        logger.debug(
            f"Lines Received: {len(fix_conversation_length)},"
            f" expected: {batch_lines}"
        )

        return fix_conversation_length

    # Single text: unwrap {"key": value} and then [value, ...]. Empty
    # containers are left alone so they reach the ValueError below.
    if isinstance(translation, dict) and translation:
        translation = list(translation.values())[0]
    if isinstance(translation, list) and translation:
        translation = translation[0]
    if not isinstance(translation, str):
        raise ValueError(f"No valid response received: {str(translation)}")

    return translation
def gpt_sequential(segments, model, target, source=None):
    """
    Translate the segments one by one with an OpenAI chat model.

    Parameters:
    - segments (list): Dictionaries with at least the 'text' and 'start'
        keys. The input list is not modified.
    - model (str): Model name forwarded to `call_gpt_translate()`.
    - target (str): Target language code; must be a key of
        INVERTED_LANGUAGES.
    - source (str, optional): Source language code; when given it must be a
        key of INVERTED_LANGUAGES.

    Returns:
    - list: A deep copy of `segments` with the translated, stripped text.

    Notes:
    - Waits 0.5 seconds before every request.
    - When a request raises, that segment is translated with Google
        Translate instead, using source "auto" if no source was given.
    """
    from openai import OpenAI

    translated_segments = copy.deepcopy(segments)

    client = OpenAI()
    progress_bar = tqdm(total=len(segments), desc="Translating")

    # Language names for the prompt, with any "(...)" part removed.
    parenthesized = r'\([^)]*\)'
    lang_tg = re.sub(parenthesized, '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = (
        re.sub(parenthesized, '', INVERTED_LANGUAGES[source]).strip()
        if source
        else ""
    )

    # Language codes for the Google Translate fallback.
    fixed_target = fix_code_language(target)
    fixed_source = "auto" if not source else fix_code_language(source)

    system_prompt = "Machine translation designed to output the translated_text JSON."

    for segment in translated_segments:
        text = segment["text"].strip()
        start = segment["start"]
        user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"

        time.sleep(0.5)

        try:
            translated_text = call_gpt_translate(
                client, model, system_prompt, user_prompt
            )
        except Exception as error:
            logger.error(
                f"{str(error)} >> The text of segment {start} "
                "is being corrected with Google Translate"
            )
            fallback = GoogleTranslator(
                source=fixed_source, target=fixed_target
            )
            translated_text = fallback.translate(text.strip())

        segment["text"] = translated_text.strip()
        progress_bar.update(1)

    progress_bar.close()

    return translated_segments
def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
    """
    Translate the segments with an OpenAI chat model, several per request.

    Parameters:
    - segments (list): Dictionaries with at least the 'text', 'speaker' and
        'start' keys. The last two characters of 'speaker' are parsed as an
        integer index into "ABCDEFGHIJKL" (presumably labels such as
        "SPEAKER_00" -- confirm against the caller).
    - model (str): Model name forwarded to `call_gpt_translate()`.
    - target (str): Target language code; must be a key of
        INVERTED_LANGUAGES.
    - token_batch_limit (int, optional): Token budget per request. The
        effective per-batch input limit is
        max(100, (token_batch_limit - 40) // 2).
    - source (str, optional): Source language code; when given it must be a
        key of INVERTED_LANGUAGES.

    Returns:
    - list: Result of `verify_translate()` for the collected lines.

    Notes:
    - Each line is sent as {code: text}, where the code is the speaker
        letter followed by a per-batch counter (A1, B1, A2, ...).
    - Tokens are counted with the "cl100k_base" encoding plus 7 per line.
    - A batch is accepted only as a whole: if the request raises, returns
        fewer lines than sent, or returns a non-string text, the entire
        batch is translated with Google Translate instead.
    """
    from openai import OpenAI
    import tiktoken

    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
    progress_bar = tqdm(total=len(segments), desc="Translating")
    segments_copy = copy.deepcopy(segments)
    encoding = tiktoken.get_encoding("cl100k_base")
    client = OpenAI()

    # Language names for the prompt, with any "(...)" part removed.
    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = ""
    if source:
        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()

    # Language codes for the Google Translate fallback.
    fixed_target = fix_code_language(target)
    fixed_source = fix_code_language(source) if source else "auto"

    name_speaker = "ABCDEFGHIJKL"

    translated_lines = []
    text_data_dict = []
    num_tokens = 0
    count_sk = {char: 0 for char in name_speaker}
    last_index = len(segments_copy) - 1

    for i, line in enumerate(segments_copy):
        text = line["text"]
        speaker = line["speaker"]
        last_start = line["start"]
        index_sk = int(speaker[-2:])
        character_sk = name_speaker[index_sk]
        count_sk[character_sk] += 1
        code_sk = character_sk + str(count_sk[character_sk])
        text_data_dict.append({code_sk: text})
        num_tokens += len(encoding.encode(text)) + 7

        # Keep accumulating until the budget is reached or input ends.
        if num_tokens < token_batch_limit and i != last_index:
            continue

        batch_lines = len(text_data_dict)
        batch_conversation = {"conversation": copy.deepcopy(text_data_dict)}
        # Reset vars
        num_tokens = 0
        text_data_dict = []
        count_sk = {char: 0 for char in name_speaker}

        try:
            # Process translation
            # https://arxiv.org/pdf/2309.03409.pdf
            system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
            user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
            logger.debug(f"Prompt: {str(user_prompt)}")

            conversation = call_gpt_translate(
                client,
                model,
                system_prompt,
                user_prompt,
                original_text=batch_conversation,
                batch_lines=batch_lines,
            )

            if len(conversation) < batch_lines:
                raise ValueError(
                    "Incomplete result received. Batch lines: "
                    f"{len(conversation)}, expected: {batch_lines}"
                )

            # Collect into a batch-local list first, so a failure here
            # cannot leave a partial batch in translated_lines (the
            # fallback below re-adds every line of the batch).
            batch_translation = []
            for translated_item in conversation[:batch_lines]:
                translated_text = list(translated_item.values())[0]
                if not isinstance(translated_text, str):
                    # Non-text values would crash the final clean-up.
                    raise ValueError(
                        f"No valid response received: {str(translated_item)}"
                    )
                batch_translation.append(translated_text)

            translated_lines.extend(batch_translation)
            progress_bar.update(batch_lines)

        except Exception as error:
            logger.error(str(error))

            first_start = segments_copy[max(0, i-(batch_lines-1))]["start"]
            logger.warning(
                f"The batch from {first_start} to {last_start} "
                "failed, is being corrected with Google Translate"
            )

            translator = GoogleTranslator(
                source=fixed_source,
                target=fixed_target
            )

            for txt_source in batch_conversation["conversation"]:
                translated_txt = translator.translate(
                    list(txt_source.values())[0].strip()
                )
                translated_lines.append(translated_txt.strip())
                progress_bar.update(1)

    progress_bar.close()

    return verify_translate(
        segments, segments_copy, translated_lines, fixed_target, fixed_source
    )
def translate_text(
    segments,
    target,
    translation_process="google_translator_batch",
    chunk_size=4500,
    source=None,
    token_batch_limit=1000,
):
    """
    Translate text segments with the selected translation process.

    Parameters:
    - segments (list): Segments to translate.
    - target (str): Target language code.
    - translation_process (str, optional): Name of the back end to use.
    - chunk_size (int, optional): Character limit per chunk, used only by
        "google_translator_batch".
    - source (str, optional): Source language code.
    - token_batch_limit (int, optional): Token limit per batch, used only by
        the "*_batch" GPT processes.

    Returns:
    - list: The translated segments, or `segments` itself when the process
        is "disable_translation".

    Raises:
    - ValueError: If `translation_process` is not a known process.
    """
    if translation_process == "google_translator_batch":
        return translate_batch(
            segments,
            fix_code_language(target),
            chunk_size,
            fix_code_language(source)
        )

    if translation_process == "google_translator":
        return translate_iterative(
            segments,
            fix_code_language(target),
            fix_code_language(source)
        )

    # The GPT helpers receive the unmodified language codes.
    if translation_process in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]:
        return gpt_sequential(segments, translation_process, target, source)

    if translation_process in [
        "gpt-3.5-turbo-0125_batch",
        "gpt-4-turbo-preview_batch",
    ]:
        # The model name is the process name without the "_batch" suffix.
        model_name = translation_process.replace("_batch", "")
        return gpt_batch(
            segments, model_name, target, token_batch_limit, source
        )

    if translation_process == "disable_translation":
        return segments

    raise ValueError("No valid translation process")