# DocChat_n_Talk / app.py
# capradeepgujaran's picture
# Update app.py
# e6032f2 verified
# raw
# history blame
# 7.34 kB
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract
import gradio as gr
from pdf2image import convert_from_path
import PyPDF2
from llama_index.core import VectorStoreIndex, Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import get_response_synthesizer
from sentence_transformers import SentenceTransformer, util
import logging
from openai_tts_tool import generate_audio_and_text
import tempfile
# Set up logging configuration: INFO level, "timestamp | level | message" lines
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
# Initialize global variables
# vector_index is read by query_app, which refuses to answer while it is None.
# NOTE(review): nothing in the visible code assigns a real index to it.
vector_index = None
# query_log is declared here and named in query_app's `global` statement, but
# no visible code appends to it or reads it.
query_log = []
# Sentence-embedding model, constructed once at import time.
# NOTE(review): no visible code uses this object -- confirm it is still needed.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# Tesseract language codes offered when the installed language list cannot
# be obtained from the `tesseract` executable itself.
DEFAULT_LANGS = [
    "eng",      # English
    "fra",      # French
    "deu",      # German
    "spa",      # Spanish
    "ita",      # Italian
    "por",      # Portuguese
    "nld",      # Dutch
    "pol",      # Polish
    "tur",      # Turkish
    "rus",      # Russian
    "ara",      # Arabic
    "hin",      # Hindi
    "jpn",      # Japanese
    "kor",      # Korean
    "chi_sim",  # Simplified Chinese
    "chi_tra",  # Traditional Chinese
]
def get_available_languages(command=("tesseract", "--list-langs")):
    """Return the sorted OCR language codes reported by Tesseract.

    Runs ``command`` without a shell, discards the first line of its standard
    output (the "List of available languages" header) and returns the
    remaining non-blank lines, stripped and sorted.

    The module-level ``DEFAULT_LANGS`` list is returned instead when the
    command cannot be started, exits with a non-zero status, times out, or
    reports no languages.

    Args:
        command: Argument vector to execute. Defaults to
            ``("tesseract", "--list-langs")``.

    Returns:
        A list of language-code strings.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    try:
        completed = subprocess.run(
            list(command),
            capture_output=True,
            text=True,
            timeout=15,   # never let a hung executable block app start-up
            check=True,   # a non-zero exit status is treated as a failure
        )
    except (OSError, subprocess.SubprocessError) as e:
        logging.warning(f"Could not get Tesseract languages: {e}")
        return DEFAULT_LANGS

    # Skip the header line; splitlines() copes with a missing trailing
    # newline, so the last language is never dropped.
    reported = (line.strip() for line in completed.stdout.splitlines()[1:])
    langs = sorted(code for code in reported if code)

    # Fallback to default languages when nothing usable was reported
    return langs if langs else DEFAULT_LANGS
# Get available languages once at startup (import time); this list is later
# used as the choices of the "Select OCR Language" dropdown.
AVAILABLE_LANGUAGES = get_available_languages()
def create_temp_dir():
    """Return the path of the ``temp`` directory, creating it if necessary.

    The directory is located directly under the current working directory.

    Returns:
        The absolute path of the directory as a string.

    Raises:
        OSError: If the directory cannot be created (for example because a
            regular file named ``temp`` already exists at that path).
    """
    temp_dir = os.path.join(os.getcwd(), 'temp')
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another caller creating the directory between check and creation.
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
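# NOTE: the document-processing helper functions belong here but are not included in this file; in particular `process_upload`, which the "Upload and Index" button calls, must be defined before the interface is built.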
def create_summary_file(summary_text):
    """Write ``summary_text`` to a UTF-8 text file and return the file's path.

    The file is placed in the directory returned by ``create_temp_dir()`` and
    named ``summary_<digest>.txt``, where ``<digest>`` is derived from the
    text with SHA-256. Unlike the built-in ``hash()``, which is salted per
    process and may be negative, this yields the same clean file name for the
    same text on every run, so identical summaries reuse a single file.

    Args:
        summary_text: The summary to store.

    Returns:
        The path of the written file, or ``None`` when ``summary_text`` is
        empty or ``None``.
    """
    if not summary_text:
        return None

    # Local import so this function does not depend on the file's import block.
    import hashlib

    digest = hashlib.sha256(summary_text.encode('utf-8')).hexdigest()[:16]
    summary_file = os.path.join(create_temp_dir(), f"summary_{digest}.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(summary_text)
    return summary_file
def query_app(query, model_name, use_similarity_check, api_key):
    """Answer ``query`` against the indexed documents.

    Args:
        query: The user's question.
        model_name: Name of the OpenAI model passed to the LLM wrapper.
        use_similarity_check: Accepted for interface compatibility; the
            current implementation does not use it.
        api_key: OpenAI API key passed to the LLM wrapper.

    Returns:
        A ``(answer, text_for_generation)`` tuple. On success both elements
        are the generated response text. On any failure the first element is
        a human-readable message and the second is ``None``.
    """
    # Validate the cheap inputs first so a blank question never reaches the
    # (billable) LLM call.
    if not query or not query.strip():
        return "Please enter a question.", None
    if vector_index is None:
        return "No documents indexed yet. Please upload documents first.", None
    if not api_key:
        return "Please provide a valid OpenAI API Key.", None

    try:
        llm = OpenAI(model=model_name, api_key=api_key)
        response_synthesizer = get_response_synthesizer(llm=llm)
        query_engine = vector_index.as_query_engine(
            llm=llm,
            response_synthesizer=response_synthesizer,
        )
        generated_response = query_engine.query(query).response
        return generated_response, generated_response
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising, and
        # keep the traceback in the log for diagnosis.
        logging.exception(f"Error during query processing: {e}")
        return f"Error during query processing: {str(e)}", None
def create_gradio_interface():
    """Build the Gradio Blocks UI and connect its event handlers.

    The layout is a shared API-key textbox followed by three tabs: document
    upload, question answering, and audio/summary generation. The answer
    produced on the question tab is also copied into the generation tab's
    text box.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="Document Processing and TTS App") as demo:
        gr.Markdown("# 📄 Document Processing, Text & Audio Generation App")

        # Store API key at the top level to share across tabs
        api_key_input = gr.Textbox(
            label="Enter OpenAI API Key",
            placeholder="Paste your OpenAI API Key here",
            type="password",
        )

        with gr.Tab("📤 Upload Documents"):
            file_upload = gr.File(
                label="Upload Files",
                file_count="multiple",
                type="filepath",
            )
            lang_dropdown = gr.Dropdown(
                choices=AVAILABLE_LANGUAGES,
                label="Select OCR Language",
                value='eng',
                info="Select the primary language of your documents",
            )
            upload_button = gr.Button("Upload and Index")
            upload_status = gr.Textbox(label="Status", interactive=False)

        with gr.Tab("❓ Ask a Question"):
            query_input = gr.Textbox(label="Enter your question")
            model_dropdown = gr.Dropdown(
                choices=["gpt-4-0125-preview", "gpt-3.5-turbo-0125"],
                label="Select Model",
                value="gpt-3.5-turbo-0125",
            )
            similarity_checkbox = gr.Checkbox(label="Use Similarity Check", value=False)
            query_button = gr.Button("Ask")
            answer_output = gr.Textbox(label="Answer", interactive=False)

        with gr.Tab("🗣️ Generate Audio and Text"):
            text_input = gr.Textbox(label="Enter text for generation")
            voice_type = gr.Dropdown(
                choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                label="Voice Type",
                value="alloy",
            )
            voice_speed = gr.Slider(
                minimum=0.25,
                maximum=4.0,
                value=1.0,
                label="Voice Speed",
            )
            language = gr.Dropdown(
                choices=["en", "ar", "de", "hi", "es", "fr", "it", "ja", "ko", "pt"],
                label="Language",
                value="en",
            )
            output_option = gr.Radio(
                choices=["audio", "summary_text", "both"],
                label="Output Option",
                value="both",
            )
            summary_length = gr.Slider(
                minimum=50,
                maximum=500,
                value=100,
                step=10,
                label="Summary Length (words)",
            )
            additional_prompt = gr.Textbox(label="Additional Prompt (Optional)")
            generate_button = gr.Button("Generate")
            with gr.Row():
                audio_output = gr.Audio(label="Generated Audio")
                summary_output = gr.File(label="Generated Summary Text")

        # Wire up the components
        # NOTE(review): `process_upload` is not defined in the visible part of
        # this file; it must exist at module level before this function runs.
        upload_button.click(
            fn=process_upload,
            inputs=[api_key_input, file_upload, lang_dropdown],
            outputs=[upload_status],
        )

        # The second output copies the answer into the generation tab's text
        # box so it can be turned into audio without retyping.
        query_button.click(
            fn=query_app,
            inputs=[query_input, model_dropdown, similarity_checkbox, api_key_input],
            outputs=[answer_output, text_input],
        )

        def process_generation(*args):
            """Run generation and wrap the summary text in a downloadable file."""
            # Arguments are forwarded unchanged, in the order of the `inputs`
            # list registered on the Generate button.
            audio_file, summary_text = generate_audio_and_text(*args)
            summary_file = create_summary_file(summary_text) if summary_text else None
            return audio_file, summary_file

        generate_button.click(
            fn=process_generation,
            inputs=[
                api_key_input, text_input, model_dropdown, voice_type,
                voice_speed, language, output_option, summary_length,
                additional_prompt,
            ],
            outputs=[audio_output, summary_output],
        )

    return demo
# The interface is built whether this module is imported or executed, so a
# module-level `demo` always exists; the server is started only when the file
# is run directly as a script.
demo = create_gradio_interface()
if __name__ == "__main__":
    demo.launch()