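"""Jarvis: a multimodal Gradio assistant.

Tabs: a chat interface (text prompts via Ollama/gemma2, image description via
LLaVA, voice input via Google Speech Recognition, and Stable Diffusion
text-to-image), a PDF summarizer, and question answering over the uploaded
document. Responses are also spoken aloud with gTTS.
"""
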
import os
import re

import gradio as gr
import ollama
import PyPDF2
import speech_recognition as sr
import torch
from diffusers import StableDiffusionPipeline
from gtts import gTTS
from tqdm import tqdm
from transformers import pipeline

# Disable Gradio analytics
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

# Shared state: the running chat history and a speech recognizer instance
history = []
recognizer = sr.Recognizer()

# Load the text-to-image pipeline, moving it to the GPU only when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
text_to_image.to(device)

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Initialize the question-answering pipeline (pinning the default SQuAD model
# explicitly keeps behaviour stable across transformers releases)
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Variable to store extracted text from PDF
extracted_text = ""

def clean_text(text):
    # Strip punctuation, emojis, and other non-word characters so gTTS
    # doesn't try to read them aloud
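    # e.g. clean_text("Hello, world!") -> "Hello world"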
    return re.sub(r'[^\w\s]', '', text)

def generate_response(prompt, image_path=None, audio=None, text_to_image_prompt=None):
    if audio:
        with tqdm(total=100, desc="Processing Audio") as pbar:
            with sr.AudioFile(audio) as source:
                audio_data = recognizer.record(source)
                pbar.update(50)
                try:
                    prompt = recognizer.recognize_google(audio_data)
                    pbar.update(50)
                except sr.UnknownValueError:
                    pbar.update(50)
                    return "Sorry, I could not understand the audio.", None, None

    if image_path:
        try:
            with tqdm(total=100, desc="Describing Image") as pbar:
                res = ollama.chat(
                    model="llava",
                    messages=[
                        {
                            'role': 'user',
                            'content': 'Describe this image:',
                            'images': [image_path]
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error describing image: {str(e)}"
    elif text_to_image_prompt:
        try:
            # Diffusers shows its own per-step progress bar during generation,
            # so no misleading outer progress loop is needed here
            images = text_to_image(text_to_image_prompt, num_inference_steps=50).images
            image_path = "generated_image.png"
            images[0].save(image_path)
            response_text = f"Generated an image for the prompt: {text_to_image_prompt}"
        except Exception as e:
            response_text = f"Error generating image: {str(e)}"
    else:
        if prompt:  # guard against an empty text submission
            history.append(prompt)
        final_prompt = "\n".join(history)
        try:
            with tqdm(total=100, desc="Generating Text") as pbar:
                res = ollama.chat(
                    model="gemma2",
                    messages=[
                        {
                            'role': 'user',
                            'content': final_prompt
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error generating text: {str(e)}"
    
    # Clean the response text for voice output; gTTS raises on empty input,
    # so fall back to the raw response if cleaning strips everything
    cleaned_response_text = clean_text(response_text)
    if not cleaned_response_text.strip():
        cleaned_response_text = response_text or "No response was generated."

    with tqdm(total=100, desc="Generating Voice Output") as pbar:
        tts = gTTS(cleaned_response_text)
        tts.save("response.mp3")
        pbar.update(100)

    return response_text, "response.mp3", image_path if text_to_image_prompt else None
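
# Example (assumes a local Ollama server with the gemma2 model available):
#   text, audio_path, image = generate_response("Tell me a joke")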

# Function to handle document summarization
def summarize_document(document):
    global extracted_text  # Use the global variable to store extracted text
    try:
        reader = PyPDF2.PdfReader(document.name)
        full_text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            full_text += page.extract_text() or ""
        
        extracted_text = full_text  # Store the extracted text

        # Split the text into manageable chunks
        chunk_size = 1000  # You can adjust this size based on your needs
        chunks = [full_text[i:i + chunk_size] for i in range(0, len(full_text), chunk_size)]
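        # Note: facebook/bart-large-cnn accepts inputs of roughly 1024 tokens,
        # so ~1000-character chunks stay comfortably under that limit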

        # Initialize progress bar
        pbar = tqdm(total=len(chunks), desc="Summarizing Document")

        # Summarize each chunk
        summaries = []
        for chunk in chunks:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)
            pbar.update(1)
        
        # Combine the summaries
        combined_summary = " ".join(summaries)
        pbar.close()
        
        return combined_summary
    except Exception as e:
        return f"Error summarizing document: {str(e)}"

# Function to handle question answering
def answer_question(question):
    try:
        if not extracted_text:
            return "Please upload a document first."
        
        response = qa_model(question=question, context=extracted_text)
        answer = response['answer']

        # Check if the answer is brief or insufficient
        if len(answer.split()) < 20:  # Adjust the threshold as needed
            # Generate explanation using AI model
            explanation_res = ollama.chat(
                model="gemma2",
                messages=[
                    {
                        'role': 'user',
                        'content': f"Why {question}?"
                    }
                ]
            )
            explanation = explanation_res['message']['content']
            return f"Answer: {answer}\nExplanation: {explanation}"
        else:
            return f"Answer: {answer}"
    except Exception as e:
        return f"Error answering question: {str(e)}"

# Define Gradio interface for chat functionality
chat_interface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=4, placeholder="Enter your Prompt", label="Text Input"),
        gr.Image(type="filepath", label="Upload an Image"),
        gr.Audio(type="filepath", label="Voice Input"),
        gr.Textbox(lines=2, placeholder="Enter text to generate an image", label="Text to Image Input")
    ], 
    outputs=[
        "text",
        gr.Audio(type="filepath", label="Voice Output"),
        gr.Image(type="filepath", label="Generated Image Output")
    ],
    title="Jarvis",
    description="Enter a text prompt, upload an image to describe it, use your voice, or generate an image from text."
)

# Define a separate interface for document summarization
document_interface = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(label="Upload a Document"),
    outputs="text",
    title="Document Summarizer",
    description="Upload a document and get a summarized version of its content."
)

# Define a separate interface for question answering
qa_interface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question", label="Question"),
    outputs="text",
    title="Document Question Answering",
    description="Ask questions based on the uploaded document. If the answer is brief, an explanation will be provided."
)

# Combine all interfaces into one tabbed app. Light/dark mode is a browser-side
# setting in Gradio; append ?__theme=light to the URL to force light mode.
combined_interface = gr.TabbedInterface(
    [chat_interface, document_interface, qa_interface],
    ["Chat Interface", "Document Summarizer", "Document Q&A"],
    title="Jarvis - AI Assistant"
)

# Launch the app with a public share link when run as a script
if __name__ == "__main__":
    combined_interface.launch(share=True)
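
# A minimal sketch of calling the chat tab programmatically once the app is
# running (assumes gradio_client is installed and that "/predict" is the chat
# endpoint's default name; check the app's "Use via API" page to confirm):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   text, audio, image = client.predict(
#       "Hello, Jarvis!",  # text prompt
#       None,              # image upload
#       None,              # voice input
#       None,              # text-to-image prompt
#       api_name="/predict",
#   )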