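"""Jarvis: a Gradio app combining ollama chat, llava image description,
Stable Diffusion text-to-image, speech input/output, PDF summarization,
and document question answering."""
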
import os
import re

# Disable Gradio analytics (set before gradio is imported so the flag takes effect)
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

import gradio as gr
import PyPDF2
import torch  # needed to check for CUDA before moving the diffusion model to the GPU
from tqdm import tqdm
from gtts import gTTS
from transformers import pipeline
from diffusers import StableDiffusionPipeline
import speech_recognition as sr
import ollama

history = []
recognizer = sr.Recognizer()

# Load the text-to-image pipeline; fall back to CPU when no CUDA GPU is available
text_to_image = StableDiffusionPipeline.from_pretrained('CompVis/stable-diffusion-v1-4')
text_to_image.to("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Initialize the question-answering pipeline (pinned to the library's default extractive QA model)
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Variable to store extracted text from PDF
extracted_text = ""

def clean_text(text):
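    """Strip punctuation, emojis, and other non-word characters so gTTS reads the text cleanly."""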
    # Keep only word characters and whitespace
    return re.sub(r'[^\w\s]', '', text)

def generate_response(prompt, image_path=None, audio=None, text_to_image_prompt=None):
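    """Dispatch a request based on which inputs were provided.

    Audio is transcribed to a text prompt first; then an uploaded image is
    described, a text-to-image prompt generates an image, or the prompt is
    answered as a chat turn. Returns (response_text, audio_path, image_path).
    """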
    if audio:
        with tqdm(total=100, desc="Processing Audio") as pbar:
            with sr.AudioFile(audio) as source:
                audio_data = recognizer.record(source)
                pbar.update(50)
            try:
                prompt = recognizer.recognize_google(audio_data)
                pbar.update(50)
            except sr.UnknownValueError:
                pbar.update(50)
                return "Sorry, I could not understand the audio.", None, None
            except sr.RequestError as e:
                pbar.update(50)
                return f"Speech recognition request failed: {e}", None, None
    if image_path:
        try:
            with tqdm(total=100, desc="Describing Image") as pbar:
                res = ollama.chat(
                    model="llava",
                    messages=[
                        {
                            'role': 'user',
                            'content': 'Describe this image:',
                            'images': [image_path]
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error describing image: {str(e)}"
    elif text_to_image_prompt:
        try:
            with tqdm(total=50, desc="Generating Image") as pbar:
                images = text_to_image(text_to_image_prompt, num_inference_steps=50).images
                pbar.update(50)  # diffusers runs all steps internally, so update the bar once
            image_path = "generated_image.png"
            images[0].save(image_path)
            response_text = f"Generated an image for the prompt: {text_to_image_prompt}"
        except Exception as e:
            response_text = f"Error generating image: {str(e)}"
    else:
        # Keep a running transcript of user prompts so the model sees earlier turns
        history.append(prompt)
        final_prompt = "\n".join(history)
        try:
            with tqdm(total=100, desc="Generating Text") as pbar:
                res = ollama.chat(
                    model="gemma2",
                    messages=[
                        {
                            'role': 'user',
                            'content': final_prompt
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error generating text: {str(e)}"
    # Clean the response text for voice output; gTTS rejects empty input, so guard against it
    cleaned_response_text = clean_text(response_text)
    audio_path = None
    if cleaned_response_text.strip():
        with tqdm(total=100, desc="Generating Voice Output") as pbar:
            tts = gTTS(cleaned_response_text)
            tts.save("response.mp3")
            pbar.update(100)
        audio_path = "response.mp3"
    return response_text, audio_path, image_path if text_to_image_prompt else None

# Function to handle document summarization
def summarize_document(document):
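    """Extract text from an uploaded PDF, summarize it chunk by chunk, and return the combined summary."""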
    global extracted_text  # Use the global variable to store extracted text
    try:
        reader = PyPDF2.PdfReader(document.name)
        full_text = ""
        for page in reader.pages:
            # extract_text() can return None for pages with no extractable text
            full_text += page.extract_text() or ""
        extracted_text = full_text  # Store the extracted text for question answering
        # Split the text into manageable chunks
        chunk_size = 1000  # Characters per chunk; adjust this size based on your needs
        chunks = [full_text[i:i + chunk_size] for i in range(0, len(full_text), chunk_size)]
        # Summarize each chunk; the context manager closes the progress bar even on errors
        summaries = []
        with tqdm(total=len(chunks), desc="Summarizing Document") as pbar:
            for chunk in chunks:
                summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                summaries.append(summary)
                pbar.update(1)
        # Combine the summaries
        return " ".join(summaries)
    except Exception as e:
        return f"Error summarizing document: {str(e)}"

# Function to handle question answering
def answer_question(question):
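    """Answer a question against the extracted document text, asking ollama to elaborate on brief answers."""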
    try:
        if not extracted_text:
            return "Please upload a document first."
        response = qa_model(question=question, context=extracted_text)
        answer = response['answer']
        # Extractive answers are often terse; if the answer is brief, have the chat model elaborate
        if len(answer.split()) < 20:  # Adjust the threshold as needed
            explanation_res = ollama.chat(
                model="gemma2",
                messages=[
                    {
                        'role': 'user',
                        'content': f"Explain the answer to this question in more detail: {question}"
                    }
                ]
            )
            explanation = explanation_res['message']['content']
            return f"Answer: {answer}\nExplanation: {explanation}"
        else:
            return f"Answer: {answer}"
    except Exception as e:
        return f"Error answering question: {str(e)}"

# Define Gradio interface for chat functionality
chat_interface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=4, placeholder="Enter your Prompt", label="Text Input"),
        gr.Image(type="filepath", label="Upload an Image"),
        gr.Audio(type="filepath", label="Voice Input"),
        gr.Textbox(lines=2, placeholder="Enter text to generate an image", label="Text to Image Input")
    ],
    outputs=[
        "text",
        gr.Audio(type="filepath", label="Voice Output"),
        gr.Image(type="filepath", label="Generated Image Output")
    ],
    title="Jarvis",
    description="Enter a text prompt, upload an image to describe it, use your voice, or generate an image from text."
)

# Define a separate interface for document summarization
document_interface = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(label="Upload a Document"),
    outputs="text",
    title="Document Summarizer",
    description="Upload a document and get a summarized version of its content."
)

# Define a separate interface for question answering
qa_interface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question", label="Question"),
    outputs="text",
    title="Document Question Answering",
    description="Ask questions based on the uploaded document. If the answer is brief, an explanation will be provided."
)

# Combine all interfaces ("light" is not a built-in Gradio theme, so the default theme is used)
combined_interface = gr.TabbedInterface(
    [chat_interface, document_interface, qa_interface],
    ["Chat Interface", "Document Summarizer", "Document Q&A"],
    title="Jarvis - AI Assistant"
)

# Launch the interface
combined_interface.launch(share=True)