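"""Jarvis: a multimodal Gradio assistant.

Tabs: a chat interface (text prompts via Ollama/gemma2, image description via
LLaVA, voice input via Google Speech Recognition, and Stable Diffusion
text-to-image), a PDF summarizer, and question answering over the uploaded
document. Responses are also spoken aloud with gTTS.
"""
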
import os
import re

import gradio as gr
import ollama
import PyPDF2
import speech_recognition as sr
import torch
from diffusers import StableDiffusionPipeline
from gtts import gTTS
from tqdm import tqdm
from transformers import pipeline

# Disable Gradio analytics
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

# Shared state: the running chat history and a speech recognizer instance
history = []
recognizer = sr.Recognizer()

# Load the text-to-image pipeline, moving it to the GPU only when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
text_to_image.to(device)

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Initialize the question-answering pipeline (pinning the default SQuAD model
# explicitly keeps behaviour stable across transformers releases)
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Variable to store extracted text from PDF
extracted_text = ""

def clean_text(text):
    # Strip punctuation, emojis, and other non-word characters so gTTS
    # doesn't try to read them aloud
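    # e.g. clean_text("Hello, world!") -> "Hello world"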
    return re.sub(r'[^\w\s]', '', text)

def generate_response(prompt, image_path=None, audio=None, text_to_image_prompt=None):
    if audio:
        with tqdm(total=100, desc="Processing Audio") as pbar:
            with sr.AudioFile(audio) as source:
                audio_data = recognizer.record(source)
                pbar.update(50)
                try:
                    prompt = recognizer.recognize_google(audio_data)
                    pbar.update(50)
                except sr.UnknownValueError:
                    pbar.update(50)
                    return "Sorry, I could not understand the audio.", None, None

    if image_path:
        try:
            with tqdm(total=100, desc="Describing Image") as pbar:
                res = ollama.chat(
                    model="llava",
                    messages=[
                        {
                            'role': 'user',
                            'content': 'Describe this image:',
                            'images': [image_path]
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error describing image: {str(e)}"
    elif text_to_image_prompt:
        try:
            # Diffusers shows its own per-step progress bar during generation,
            # so no misleading outer progress loop is needed here
            images = text_to_image(text_to_image_prompt, num_inference_steps=50).images
            image_path = "generated_image.png"
            images[0].save(image_path)
            response_text = f"Generated an image for the prompt: {text_to_image_prompt}"
        except Exception as e:
            response_text = f"Error generating image: {str(e)}"
    else:
        if prompt:  # guard against an empty text submission
            history.append(prompt)
        final_prompt = "\n".join(history)
        try:
            with tqdm(total=100, desc="Generating Text") as pbar:
                res = ollama.chat(
                    model="gemma2",
                    messages=[
                        {
                            'role': 'user',
                            'content': final_prompt
                        }
                    ]
                )
                pbar.update(100)
            response_text = res['message']['content']
        except Exception as e:
            response_text = f"Error generating text: {str(e)}"
    
    # Clean the response text for voice output; gTTS raises on empty input,
    # so fall back to the raw response if cleaning strips everything
    cleaned_response_text = clean_text(response_text)
    if not cleaned_response_text.strip():
        cleaned_response_text = response_text or "No response was generated."

    with tqdm(total=100, desc="Generating Voice Output") as pbar:
        tts = gTTS(cleaned_response_text)
        tts.save("response.mp3")
        pbar.update(100)

    return response_text, "response.mp3", image_path if text_to_image_prompt else None
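
# Example (assumes a local Ollama server with the gemma2 model available):
#   text, audio_path, image = generate_response("Tell me a joke")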

# Function to handle document summarization
def summarize_document(document):
    global extracted_text  # Use the global variable to store extracted text
    try:
        reader = PyPDF2.PdfReader(document.name)
        full_text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            full_text += page.extract_text() or ""
        
        extracted_text = full_text  # Store the extracted text

        # Split the text into manageable chunks
        chunk_size = 1000  # You can adjust this size based on your needs
        chunks = [full_text[i:i + chunk_size] for i in range(0, len(full_text), chunk_size)]
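        # Note: facebook/bart-large-cnn accepts inputs of roughly 1024 tokens,
        # so ~1000-character chunks stay comfortably under that limit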

        # Initialize progress bar
        pbar = tqdm(total=len(chunks), desc="Summarizing Document")

        # Summarize each chunk
        summaries = []
        for chunk in chunks:
            summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)
            pbar.update(1)
        
        # Combine the summaries
        combined_summary = " ".join(summaries)
        pbar.close()
        
        return combined_summary
    except Exception as e:
        return f"Error summarizing document: {str(e)}"

# Function to handle question answering
def answer_question(question):
    try:
        if not extracted_text:
            return "Please upload a document first."
        
        response = qa_model(question=question, context=extracted_text)
        answer = response['answer']

        # Check if the answer is brief or insufficient
        if len(answer.split()) < 20:  # Adjust the threshold as needed
            # Generate explanation using AI model
            explanation_res = ollama.chat(
                model="gemma2",
                messages=[
                    {
                        'role': 'user',
                        'content': f"Why {question}?"
                    }
                ]
            )
            explanation = explanation_res['message']['content']
            return f"Answer: {answer}\nExplanation: {explanation}"
        else:
            return f"Answer: {answer}"
    except Exception as e:
        return f"Error answering question: {str(e)}"

# Define Gradio interface for chat functionality
chat_interface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=4, placeholder="Enter your Prompt", label="Text Input"),
        gr.Image(type="filepath", label="Upload an Image"),
        gr.Audio(type="filepath", label="Voice Input"),
        gr.Textbox(lines=2, placeholder="Enter text to generate an image", label="Text to Image Input")
    ], 
    outputs=[
        "text",
        gr.Audio(type="filepath", label="Voice Output"),
        gr.Image(type="filepath", label="Generated Image Output")
    ],
    title="Jarvis",
    description="Enter a text prompt, upload an image to describe it, use your voice, or generate an image from text."
)

# Define a separate interface for document summarization
document_interface = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(label="Upload a Document"),
    outputs="text",
    title="Document Summarizer",
    description="Upload a document and get a summarized version of its content."
)

# Define a separate interface for question answering
qa_interface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question", label="Question"),
    outputs="text",
    title="Document Question Answering",
    description="Ask questions based on the uploaded document. If the answer is brief, an explanation will be provided."
)

# Combine all interfaces into one tabbed app. Light/dark mode is a browser-side
# setting in Gradio; append ?__theme=light to the URL to force light mode.
combined_interface = gr.TabbedInterface(
    [chat_interface, document_interface, qa_interface],
    ["Chat Interface", "Document Summarizer", "Document Q&A"],
    title="Jarvis - AI Assistant"
)

# Launch the app with a public share link when run as a script
if __name__ == "__main__":
    combined_interface.launch(share=True)
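
# A minimal sketch of calling the chat tab programmatically once the app is
# running (assumes gradio_client is installed and that "/predict" is the chat
# endpoint's default name; check the app's "Use via API" page to confirm):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   text, audio, image = client.predict(
#       "Hello, Jarvis!",  # text prompt
#       None,              # image upload
#       None,              # voice input
#       None,              # text-to-image prompt
#       api_name="/predict",
#   )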