Spaces:
Sleeping
Sleeping
import gradio as gr | |
import fitz # PyMuPDF for reading PDFs | |
import numpy as np | |
from bokeh.plotting import figure, output_file, save | |
from bokeh.models import HoverTool, ColumnDataSource | |
import umap | |
import pandas as pd | |
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances | |
from sentence_transformers import SentenceTransformer | |
import tempfile | |
import logging | |
# Set up logging | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
# Initialize the model globally | |
model = SentenceTransformer('all-MiniLM-L6-v2') | |
logging.info("Model loaded successfully.") | |
def process_pdf(pdf_path): | |
logging.info(f"Processing PDF: {pdf_path}") | |
# Open the PDF | |
doc = fitz.open(pdf_path) | |
texts = [page.get_text() for page in doc] | |
print("PDF processed successfully.") | |
return " ".join(texts) | |
def create_embeddings(text): | |
print("Creating embeddings.") | |
sentences = text.split(". ") # A simple split; consider a more robust sentence splitter | |
embeddings = model.encode(sentences) | |
print("Embeddings created successfully.") | |
return embeddings, sentences | |
def generate_plot(query, pdf_file): | |
logging.info("Generating plot.") | |
# Generate embeddings for the query | |
query_embedding = model.encode([query])[0] | |
# Process the PDF and create embeddings | |
text = process_pdf(pdf_file.name) | |
embeddings, sentences = create_embeddings(text) | |
logging.info("Data prepared for UMAP.") | |
# Prepare the data for UMAP and visualization | |
all_embeddings = np.vstack([embeddings, query_embedding]) | |
all_sentences = sentences + [query] | |
# UMAP transformation | |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42) | |
umap_embeddings = umap_transform.fit_transform(all_embeddings) | |
logging.info("UMAP transformation completed.") | |
# Find the closest sentences to the query | |
distances = cosine_similarity([query_embedding], embeddings)[0] | |
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed | |
# Prepare data for plotting | |
data = { | |
'x': umap_embeddings[:-1, 0], # Exclude the query point itself | |
'y': umap_embeddings[:-1, 1], # Exclude the query point itself | |
'content': all_sentences[:-1], # Exclude the query sentence itself | |
'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))], | |
} | |
source = ColumnDataSource(data) | |
# Create the Bokeh plot | |
p = figure(title="UMAP Projection of Sentences", width=700, height=700) | |
p.scatter('x', 'y', color='color', source=source) | |
hover = HoverTool(tooltips=[("Content", "@content")]) | |
p.add_tools(hover) | |
logging.info("Plot created successfully.") | |
# Save the plot to an HTML file | |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html") | |
logging.info(f"temp file is {temp_file.name}") | |
output_file(temp_file.name) | |
save(p) | |
logging.info("Plot saved to file.") | |
return temp_file.name | |
def gradio_interface(pdf_file, query): | |
logging.info("Gradio interface called.") | |
plot_path = generate_plot(query, pdf_file) | |
with open(plot_path, "r") as f: | |
html_content = f.read() | |
logging.info("Returning HTML content.") | |
return html_content | |
iface = gr.Interface( | |
fn=gradio_interface, | |
inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")], | |
outputs=gr.HTML(label="Visualization"), | |
title="PDF Content Visualizer", | |
description="Upload a PDF and enter a query to visualize the content." | |
) | |
if __name__ == "__main__": | |
iface.launch() | |