DocChat_n_Talk / openai_tts_tool.py
capradeepgujaran's picture
Update openai_tts_tool.py
251214c verified
raw
history blame
6.33 kB
# openai_tts_tool.py
from openai import OpenAI
import os
from langdetect import detect, DetectorFactory
import logging
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
# Ensure consistent results from langdetect
DetectorFactory.seed = 0
# Simple in-memory cache for translations
translation_cache = {}
def translate_text(api_key, text, target_language):
"""
Translate text to the target language using OpenAI's API with gpt-4o-mini model.
Args:
api_key (str): OpenAI API key
text (str): Text to translate
target_language (str): Target language code (e.g., 'en' for English)
Returns:
str: Translated text or error message
"""
cache_key = (text, target_language)
if cache_key in translation_cache:
logging.info("Fetching translation from cache.")
return translation_cache[cache_key]
try:
logging.info("Starting translation process.")
client = OpenAI(api_key=api_key)
prompt = f"Translate the following text to {target_language}:\n\n{text}"
response = client.completions.create(
model="gpt-4o-mini", # Updated model name
prompt=prompt,
max_tokens=1000,
temperature=0.3
)
translated_text = response.choices[0].text.strip()
logging.info("Translation successful.")
# Cache the translation
translation_cache[cache_key] = translated_text
return translated_text
except Exception as e:
logging.error(f"Error in translation: {str(e)}")
return f"Error in translation: {str(e)}"
def generate_audio_and_text(api_key, input_text, model_name, voice_type, voice_speed, language, output_option):
"""
Generate audio and text files from input text using OpenAI's TTS API.
Args:
api_key (str): OpenAI API key
input_text (str): Text to convert to speech
model_name (str): OpenAI model name
voice_type (str): Voice type for TTS
voice_speed (float): Speed of speech
language (str): Language code for synthesis
output_option (str): Output type ('audio', 'script_text', or 'both')
Returns:
tuple: (audio_file_path, script_file_path, status_message)
"""
if not input_text:
logging.warning("No input text provided.")
return None, None, "No input text provided"
if not api_key:
logging.warning("No API key provided.")
return None, None, "No API key provided"
try:
logging.info("Initializing OpenAI client.")
client = OpenAI(api_key=api_key)
# Create temp directory if it doesn't exist
temp_dir = os.path.join(os.getcwd(), 'temp')
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
logging.info(f"Created temporary directory at {temp_dir}.")
# Detect input language
try:
detected_language = detect(input_text)
logging.info(f"Detected input language: {detected_language}")
except Exception as e:
logging.error(f"Error detecting language: {str(e)}")
return None, None, f"Error detecting language: {str(e)}"
# Map language codes if necessary (langdetect uses ISO 639-1 codes)
target_language = language.lower()[:2] # e.g., 'en' for English
# If detected language is different from target, translate
if detected_language != target_language:
logging.info("Input language differs from target language. Proceeding to translate.")
translated_text = translate_text(api_key, input_text, target_language)
if translated_text.startswith("Error in translation:"):
return None, None, translated_text
else:
logging.info("Input language matches target language. No translation needed.")
translated_text = input_text
# Generate audio file
audio_file = None
if output_option in ["audio", "both"]:
try:
logging.info("Starting audio generation.")
speech_response = client.audio.speech.create(
model="tts-1",
voice=voice_type,
input=translated_text,
speed=float(voice_speed)
)
# Save the audio to a temporary file
audio_filename = f"output_{hash(translated_text)}_{target_language}.mp3"
audio_path = os.path.join(temp_dir, audio_filename)
with open(audio_path, "wb") as f:
for chunk in speech_response.iter_bytes():
f.write(chunk)
logging.info(f"Audio file saved at {audio_path}.")
audio_file = audio_path
except Exception as e:
logging.error(f"Error during audio generation: {str(e)}")
return None, None, f"Error during audio generation: {str(e)}"
# Save the (translated) text as a script file
script_file = None
if output_option in ["script_text", "both"]:
try:
logging.info("Starting script text generation.")
script_text = translated_text
script_filename = f"script_{hash(script_text)}_{target_language}.txt"
script_path = os.path.join(temp_dir, script_filename)
with open(script_path, "w", encoding='utf-8') as f:
f.write(script_text)
logging.info(f"Script file saved at {script_path}.")
script_file = script_path
except Exception as e:
logging.error(f"Error during script text generation: {str(e)}")
return None, None, f"Error during script text generation: {str(e)}"
status_message = f"Generation completed successfully in {language}!"
logging.info(status_message)
return audio_file, script_file, status_message
except Exception as e:
logging.error(f"Unexpected error: {str(e)}")
return None, None, f"Error: {str(e)}"