capradeepgujaran committed
Commit 78decde
Parent(s): e72b30a
Create openai_tts_tool.py

openai_tts_tool.py +211 -0
openai_tts_tool.py
ADDED
@@ -0,0 +1,211 @@
import os
import openai
from deep_translator import GoogleTranslator
from dotenv import load_dotenv
import tiktoken
import pytesseract
import fitz  # PyMuPDF for PDF processing
import docx  # For processing DOCX files
from PIL import Image

# Load environment variables
load_dotenv()

# Initialize OpenAI client
openai_api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=openai_api_key)

# Define model specifications
MODEL_SPECS = {
    'gpt-4o': {
        'max_context_tokens': 128000,
        'max_output_tokens': 4096,
    },
    'gpt-4o-mini': {
        'max_context_tokens': 128000,
        'max_output_tokens': 16384,
    },
    'gpt-4': {
        'max_context_tokens': 8192,
        'max_output_tokens': 8192,
    },
    # Add other models as needed
}

# Set the path for Tesseract OCR (only needed on Windows; adjust the path to your install)
if os.name == 'nt':
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Extract text from a PDF, falling back to OCR for scanned pages
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        page_text = page.get_text()

        # If the page has no text layer (i.e., a scanned PDF), render it and run OCR
        if not page_text.strip():
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_text = pytesseract.image_to_string(img)

        text += page_text
    return text

# Load the text of a .docx file
def load_docx_file(docx_path):
    doc = docx.Document(docx_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

# Load the text of a .txt file
def load_txt_file(txt_path):
    with open(txt_path, 'r', encoding='utf-8') as f:
        return f.read()

# Dispatch to the right loader based on file extension
def load_file_based_on_extension(file_path):
    if file_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith('.docx'):
        return load_docx_file(file_path)
    elif file_path.endswith('.txt'):
        return load_txt_file(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_path}")

# Process a folder, concatenating the text of every supported file in it
def process_folder(folder_path):
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            try:
                text = load_file_based_on_extension(file_path)
                documents.append(text)
            except ValueError as e:
                print(f"Skipping unsupported file: {file_path} ({e})")
    return ' '.join(documents)  # Combine the text of all documents

# Count tokens with the tokenizer that matches the model
def count_tokens(text, model_name):
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(text))

# Split text into chunks of at most max_tokens tokens
def split_text_into_chunks(text, max_tokens, model_name):
    encoding = tiktoken.encoding_for_model(model_name)
    tokens = encoding.encode(text)
    chunks = []
    start = 0
    text_length = len(tokens)

    while start < text_length:
        end = start + max_tokens
        chunk_tokens = tokens[start:end]
        chunks.append(encoding.decode(chunk_tokens))
        start = end

    return chunks

# Summarize text, recursively chunking when the input exceeds the model's context window.
# Note: `length` serves both as a rough word target in the prompt and as the max_tokens
# cap on the completion, so the two are only loosely aligned.
def summarize_text(text, length, model_name, additional_prompt):
    model_specs = MODEL_SPECS.get(model_name)
    if not model_specs:
        raise ValueError(f"Model specifications not found for model {model_name}")

    max_output_tokens = model_specs['max_output_tokens']
    max_context_tokens = model_specs['max_context_tokens']

    length = min(length, max_output_tokens)

    input_token_count = count_tokens(text, model_name)
    buffer_tokens = 500  # Headroom for the prompt scaffolding

    if input_token_count + buffer_tokens + length > max_context_tokens:
        # Too long for one call: summarize each chunk, then summarize the combined summaries
        max_chunk_tokens = max_context_tokens - buffer_tokens - length
        if max_chunk_tokens <= 0:
            raise ValueError("Requested summary length leaves no room for input in the context window.")
        chunks = split_text_into_chunks(text, max_chunk_tokens, model_name)
        summaries = [summarize_text(chunk, length, model_name, additional_prompt) for chunk in chunks]
        combined_summary = ' '.join(summaries)
        return summarize_text(combined_summary, length, model_name, additional_prompt)
    else:
        prompt = (
            f"Please provide a clear and concise summary of the following text in approximately {length} words. "
            "Ensure that the summary does not include any special characters, symbols, or markdown formatting. "
            "Use plain language and proper punctuation."
        )
        if additional_prompt:
            prompt += f"\n\nAdditional instructions: {additional_prompt}"
        prompt += f"\n\nText to summarize:\n{text}"

        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt}
            ],
            max_tokens=length
        )
        return completion.choices[0].message.content.strip()

# Calculate summary length (in words) from the desired audio duration
def calculate_summary_length_by_duration(duration_minutes, voice_speed):
    words_per_minute = 150 if voice_speed == 'normal' else 120
    return int(duration_minutes * words_per_minute)

# Translate the summarized text using deep-translator
def translate_text(text, target_language):
    return GoogleTranslator(source='auto', target=target_language).translate(text)

# Estimate audio duration in seconds from the word count
def estimate_audio_duration(text, voice_speed):
    word_count = len(text.split())
    words_per_minute = 150 if voice_speed == 'normal' else 120
    duration_minutes = word_count / words_per_minute
    return duration_minutes * 60

# Convert text to audio with the OpenAI TTS API
def text_to_speech_openai(text, audio_path, voice, speed):
    # `speed` was accepted but unused in the original; map the descriptive setting onto
    # the API's numeric `speed` parameter (assumed mapping: anything other than
    # 'normal' is treated as slow, 1.0 being normal speed)
    numeric_speed = 1.0 if speed == 'normal' else 0.85
    response = client.audio.speech.create(
        model="tts-1-hd",
        voice=voice,
        input=text,
        speed=numeric_speed
    )
    response.stream_to_file(audio_path)

def process_input(pdf_path=None, input_text=None, summary_length=None, voice=None, language=None, voice_speed=None, model_name=None, additional_prompt=None, generate_audio=True, folder_path=None):
    if folder_path:
        extracted_text = process_folder(folder_path)
    elif pdf_path:
        extracted_text = load_file_based_on_extension(pdf_path)
    elif input_text:
        extracted_text = input_text
    else:
        raise ValueError("No input provided for processing.")

    summary_text = summarize_text(extracted_text, summary_length, model_name, additional_prompt)
    translated_summary = translate_text(summary_text, language)
    estimated_audio_duration = estimate_audio_duration(translated_summary, voice_speed)

    os.makedirs('uploads', exist_ok=True)  # Make sure the output directory exists
    base_filename = os.path.splitext(os.path.basename(pdf_path or 'document'))[0]
    audio_file_path = os.path.join('uploads', f"{base_filename}_audio_{language}.mp3")
    summary_file_path = os.path.join('uploads', f"{base_filename}_summary_{language}.txt")

    with open(summary_file_path, "w", encoding="utf-8") as summary_file:
        summary_file.write(translated_summary)

    if generate_audio:
        text_to_speech_openai(translated_summary, audio_file_path, voice, voice_speed)

    return translated_summary, audio_file_path if generate_audio else None, summary_file_path, estimated_audio_duration
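
For reference, a minimal usage sketch of the `process_input` entry point defined above. This is not part of the committed file: the input filename ("example.pdf"), voice ("alloy"), model, and target duration are illustrative assumptions, and it presumes OPENAI_API_KEY is available via the environment or a .env file.

# usage_example.py - hypothetical driver for openai_tts_tool.py (not in the commit)
from openai_tts_tool import process_input, calculate_summary_length_by_duration

if __name__ == "__main__":
    duration_minutes = 2        # Target length of the generated audio
    voice_speed = "normal"      # 'normal' maps to ~150 words per minute
    summary_length = calculate_summary_length_by_duration(duration_minutes, voice_speed)

    summary, audio_path, summary_path, est_seconds = process_input(
        pdf_path="example.pdf",  # Illustrative input file
        summary_length=summary_length,
        voice="alloy",           # One of the OpenAI TTS voices
        language="en",
        voice_speed=voice_speed,
        model_name="gpt-4o-mini",
        additional_prompt=None,
        generate_audio=True,
    )
    print(f"Summary saved to {summary_path}")
    print(f"Estimated audio duration: {est_seconds:.0f} seconds")
    if audio_path:
        print(f"Audio saved to {audio_path}")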