from tqdm import tqdm from deep_translator import GoogleTranslator from itertools import chain import copy from .language_configuration import fix_code_language, INVERTED_LANGUAGES from .logging_setup import logger import re import json import time TRANSLATION_PROCESS_OPTIONS = [ "google_translator_batch", "google_translator", "gpt-3.5-turbo-0125_batch", "gpt-3.5-turbo-0125", "gpt-4-turbo-preview_batch", "gpt-4-turbo-preview", "disable_translation", ] DOCS_TRANSLATION_PROCESS_OPTIONS = [ "google_translator", "gpt-3.5-turbo-0125", "gpt-4-turbo-preview", "disable_translation", ] def translate_iterative(segments, target, source=None): """ Translate text segments individually to the specified language. Parameters: - segments (list): A list of dictionaries with 'text' as a key for segment text. - target (str): Target language code. - source (str, optional): Source language code. Defaults to None. Returns: - list: Translated text segments in the target language. Notes: - Translates each segment using Google Translate. Example: segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] translated_segments = translate_iterative(segments, 'es') """ segments_ = copy.deepcopy(segments) if ( not source ): logger.debug("No source language") source = "auto" translator = GoogleTranslator(source=source, target=target) for line in tqdm(range(len(segments_))): text = segments_[line]["text"] translated_line = translator.translate(text.strip()) segments_[line]["text"] = translated_line return segments_ def verify_translate( segments, segments_copy, translated_lines, target, source ): """ Verify integrity and translate segments if lengths match, otherwise switch to iterative translation. """ if len(segments) == len(translated_lines): for line in range(len(segments_copy)): logger.debug( f"{segments_copy[line]['text']} >> " f"{translated_lines[line].strip()}" ) segments_copy[line]["text"] = translated_lines[ line].replace("\t", "").replace("\n", "").strip() return segments_copy else: logger.error( "The translation failed, switching to google_translate iterative. " f"{len(segments), len(translated_lines)}" ) return translate_iterative(segments, target, source) def translate_batch(segments, target, chunk_size=2000, source=None): """ Translate a batch of text segments into the specified language in chunks, respecting the character limit. Parameters: - segments (list): List of dictionaries with 'text' as a key for segment text. - target (str): Target language code. - chunk_size (int, optional): Maximum character limit for each translation chunk (default is 2000; max 5000). - source (str, optional): Source language code. Defaults to None. Returns: - list: Translated text segments in the target language. Notes: - Splits input segments into chunks respecting the character limit for translation. - Translates the chunks using Google Translate. - If chunked translation fails, switches to iterative translation using `translate_iterative()`. Example: segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] translated = translate_batch(segments, 'es', chunk_size=4000, source='en') """ segments_copy = copy.deepcopy(segments) if ( not source ): logger.debug("No source language") source = "auto" # Get text text_lines = [] for line in range(len(segments_copy)): text = segments_copy[line]["text"].strip() text_lines.append(text) # chunk limit text_merge = [] actual_chunk = "" global_text_list = [] actual_text_list = [] for one_line in text_lines: one_line = " " if not one_line else one_line if (len(actual_chunk) + len(one_line)) <= chunk_size: if actual_chunk: actual_chunk += " ||||| " actual_chunk += one_line actual_text_list.append(one_line) else: text_merge.append(actual_chunk) actual_chunk = one_line global_text_list.append(actual_text_list) actual_text_list = [one_line] if actual_chunk: text_merge.append(actual_chunk) global_text_list.append(actual_text_list) # translate chunks progress_bar = tqdm(total=len(segments), desc="Translating") translator = GoogleTranslator(source=source, target=target) split_list = [] try: for text, text_iterable in zip(text_merge, global_text_list): translated_line = translator.translate(text.strip()) split_text = translated_line.split("|||||") if len(split_text) == len(text_iterable): progress_bar.update(len(split_text)) else: logger.debug( "Chunk fixing iteratively. Len chunk: " f"{len(split_text)}, expected: {len(text_iterable)}" ) split_text = [] for txt_iter in text_iterable: translated_txt = translator.translate(txt_iter.strip()) split_text.append(translated_txt) progress_bar.update(1) split_list.append(split_text) progress_bar.close() except Exception as error: progress_bar.close() logger.error(str(error)) logger.warning( "The translation in chunks failed, switching to iterative." " Related: too many request" ) # use proxy or less chunk size return translate_iterative(segments, target, source) # un chunk translated_lines = list(chain.from_iterable(split_list)) return verify_translate( segments, segments_copy, translated_lines, target, source ) def call_gpt_translate( client, model, system_prompt, user_prompt, original_text=None, batch_lines=None, ): # https://platform.openai.com/docs/guides/text-generation/json-mode response = client.chat.completions.create( model=model, response_format={"type": "json_object"}, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ] ) result = response.choices[0].message.content logger.debug(f"Result: {str(result)}") try: translation = json.loads(result) except Exception as error: match_result = re.search(r'\{.*?\}', result) if match_result: logger.error(str(error)) json_str = match_result.group(0) translation = json.loads(json_str) else: raise error # Get valid data if batch_lines: for conversation in translation.values(): if isinstance(conversation, dict): conversation = list(conversation.values())[0] if ( list( original_text["conversation"][0].values() )[0].strip() == list(conversation[0].values())[0].strip() ): continue if len(conversation) == batch_lines: break fix_conversation_length = [] for line in conversation: for speaker_code, text_tr in line.items(): fix_conversation_length.append({speaker_code: text_tr}) logger.debug(f"Data batch: {str(fix_conversation_length)}") logger.debug( f"Lines Received: {len(fix_conversation_length)}," f" expected: {batch_lines}" ) return fix_conversation_length else: if isinstance(translation, dict): translation = list(translation.values())[0] if isinstance(translation, list): translation = translation[0] if isinstance(translation, set): translation = list(translation)[0] if not isinstance(translation, str): raise ValueError(f"No valid response received: {str(translation)}") return translation def gpt_sequential(segments, model, target, source=None): from openai import OpenAI translated_segments = copy.deepcopy(segments) client = OpenAI() progress_bar = tqdm(total=len(segments), desc="Translating") lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() lang_sc = "" if source: lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() fixed_target = fix_code_language(target) fixed_source = fix_code_language(source) if source else "auto" system_prompt = "Machine translation designed to output the translated_text JSON." for i, line in enumerate(translated_segments): text = line["text"].strip() start = line["start"] user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}" time.sleep(0.5) try: translated_text = call_gpt_translate( client, model, system_prompt, user_prompt, ) except Exception as error: logger.error( f"{str(error)} >> The text of segment {start} " "is being corrected with Google Translate" ) translator = GoogleTranslator( source=fixed_source, target=fixed_target ) translated_text = translator.translate(text.strip()) translated_segments[i]["text"] = translated_text.strip() progress_bar.update(1) progress_bar.close() return translated_segments def gpt_batch(segments, model, target, token_batch_limit=900, source=None): from openai import OpenAI import tiktoken token_batch_limit = max(100, (token_batch_limit - 40) // 2) progress_bar = tqdm(total=len(segments), desc="Translating") segments_copy = copy.deepcopy(segments) encoding = tiktoken.get_encoding("cl100k_base") client = OpenAI() lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() lang_sc = "" if source: lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() fixed_target = fix_code_language(target) fixed_source = fix_code_language(source) if source else "auto" name_speaker = "ABCDEFGHIJKL" translated_lines = [] text_data_dict = [] num_tokens = 0 count_sk = {char: 0 for char in "ABCDEFGHIJKL"} for i, line in enumerate(segments_copy): text = line["text"] speaker = line["speaker"] last_start = line["start"] # text_data_dict.append({str(int(speaker[-1])+1): text}) index_sk = int(speaker[-2:]) character_sk = name_speaker[index_sk] count_sk[character_sk] += 1 code_sk = character_sk+str(count_sk[character_sk]) text_data_dict.append({code_sk: text}) num_tokens += len(encoding.encode(text)) + 7 if num_tokens >= token_batch_limit or i == len(segments_copy)-1: try: batch_lines = len(text_data_dict) batch_conversation = {"conversation": copy.deepcopy(text_data_dict)} # Reset vars num_tokens = 0 text_data_dict = [] count_sk = {char: 0 for char in "ABCDEFGHIJKL"} # Process translation # https://arxiv.org/pdf/2309.03409.pdf system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items." user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}" logger.debug(f"Prompt: {str(user_prompt)}") conversation = call_gpt_translate( client, model, system_prompt, user_prompt, original_text=batch_conversation, batch_lines=batch_lines, ) if len(conversation) < batch_lines: raise ValueError( "Incomplete result received. Batch lines: " f"{len(conversation)}, expected: {batch_lines}" ) for i, translated_text in enumerate(conversation): if i+1 > batch_lines: break translated_lines.append(list(translated_text.values())[0]) progress_bar.update(batch_lines) except Exception as error: logger.error(str(error)) first_start = segments_copy[max(0, i-(batch_lines-1))]["start"] logger.warning( f"The batch from {first_start} to {last_start} " "failed, is being corrected with Google Translate" ) translator = GoogleTranslator( source=fixed_source, target=fixed_target ) for txt_source in batch_conversation["conversation"]: translated_txt = translator.translate( list(txt_source.values())[0].strip() ) translated_lines.append(translated_txt.strip()) progress_bar.update(1) progress_bar.close() return verify_translate( segments, segments_copy, translated_lines, fixed_target, fixed_source ) def translate_text( segments, target, translation_process="google_translator_batch", chunk_size=4500, source=None, token_batch_limit=1000, ): """Translates text segments using a specified process.""" match translation_process: case "google_translator_batch": return translate_batch( segments, fix_code_language(target), chunk_size, fix_code_language(source) ) case "google_translator": return translate_iterative( segments, fix_code_language(target), fix_code_language(source) ) case model if model in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]: return gpt_sequential(segments, model, target, source) case model if model in ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch",]: return gpt_batch( segments, translation_process.replace("_batch", ""), target, token_batch_limit, source ) case "disable_translation": return segments case _: raise ValueError("No valid translation process")