# SemanticSearch / app.py
import gradio as gr
import fitz  # PyMuPDF for reading PDFs
import numpy as np
from bokeh.plotting import figure, output_file, save
from bokeh.models import HoverTool, ColumnDataSource
import umap
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import tempfile
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the sentence-embedding model once at startup
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")


def process_pdf(pdf_path):
    logging.info(f"Processing PDF: {pdf_path}")
    # Open the PDF and extract the text of every page
    doc = fitz.open(pdf_path)
    texts = [page.get_text() for page in doc]
    doc.close()
    logging.info("PDF processed successfully.")
    return " ".join(texts)


def create_embeddings(text):
    logging.info("Creating embeddings.")
    # A simple split; consider a more robust sentence splitter for messy PDFs
    sentences = [s for s in text.split(". ") if s.strip()]
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences
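
# A more robust alternative to the '. ' split above, sketched here as a comment; it assumes
# nltk and its 'punkt' tokenizer data are installed:
#   from nltk.tokenize import sent_tokenize
#   sentences = sent_tokenize(text)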


def generate_plot(query, pdf_file):
    logging.info("Generating plot.")

    # gr.File may supply a file-like object or a plain path string, depending on the Gradio version
    pdf_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file

    # Generate an embedding for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings for its sentences
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # Stack sentence and query embeddings so they share one UMAP projection
    all_embeddings = np.vstack([embeddings, query_embedding])
    all_sentences = sentences + [query]

    # UMAP transformation to 2D
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Rank sentences by cosine similarity to the query and highlight the top 5
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = similarities.argsort()[-5:][::-1]  # Adjust the number as needed
    # Prepare data for plotting; the query point is the last row, so exclude it
    data = {
        'x': umap_embeddings[:-1, 0],
        'y': umap_embeddings[:-1, 1],
        'content': all_sentences[:-1],
        'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))],
    }
    source = ColumnDataSource(data)

    # Create the Bokeh plot with hover tooltips that show each sentence
    p = figure(title="UMAP Projection of Sentences", width=700, height=700)
    p.scatter('x', 'y', color='color', source=source)
    hover = HoverTool(tooltips=[("Content", "@content")])
    p.add_tools(hover)
    logging.info("Plot created successfully.")

    # Save the plot to a standalone HTML file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
    temp_file.close()  # Close the handle; Bokeh writes to the same path
    logging.info(f"Temp file is {temp_file.name}")
    output_file(temp_file.name)
    save(p)
    logging.info("Plot saved to file.")
    return temp_file.name


def gradio_interface(pdf_file, query):
    logging.info("Gradio interface called.")
    plot_path = generate_plot(query, pdf_file)
    with open(plot_path, "r") as f:
        html_content = f.read()
    logging.info("Returning HTML content.")
    return html_content


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.HTML(label="Visualization"),
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize its sentences and highlight those most similar to the query.",
)

if __name__ == "__main__":
    iface.launch()
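
# To run this app locally, the following dependencies are assumed:
#   pip install gradio pymupdf numpy bokeh umap-learn scikit-learn sentence-transformers
# then start it with:
#   python app.py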