import gradio as gr import fitz # PyMuPDF for reading PDFs import numpy as np from bokeh.plotting import figure, output_file, save from bokeh.models import HoverTool, ColumnDataSource import umap import pandas as pd from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances from sentence_transformers import SentenceTransformer import tempfile import logging # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # Initialize the model globally model = SentenceTransformer('all-MiniLM-L6-v2') logging.info("Model loaded successfully.") def process_pdf(pdf_path): logging.info(f"Processing PDF: {pdf_path}") # Open the PDF doc = fitz.open(pdf_path) texts = [page.get_text() for page in doc] print("PDF processed successfully.") return " ".join(texts) def create_embeddings(text): print("Creating embeddings.") sentences = text.split(". ") # A simple split; consider a more robust sentence splitter embeddings = model.encode(sentences) print("Embeddings created successfully.") return embeddings, sentences def generate_plot(query, pdf_file): logging.info("Generating plot.") # Generate embeddings for the query query_embedding = model.encode([query])[0] # Process the PDF and create embeddings text = process_pdf(pdf_file.name) embeddings, sentences = create_embeddings(text) logging.info("Data prepared for UMAP.") # Prepare the data for UMAP and visualization all_embeddings = np.vstack([embeddings, query_embedding]) all_sentences = sentences + [query] # UMAP transformation umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42) umap_embeddings = umap_transform.fit_transform(all_embeddings) logging.info("UMAP transformation completed.") # Find the closest sentences to the query distances = cosine_similarity([query_embedding], embeddings)[0] closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed # Prepare data for plotting data = { 'x': umap_embeddings[:-1, 0], # Exclude the query point itself 'y': umap_embeddings[:-1, 1], # Exclude the query point itself 'content': all_sentences[:-1], # Exclude the query sentence itself 'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))], } source = ColumnDataSource(data) # Create the Bokeh plot p = figure(title="UMAP Projection of Sentences", width=700, height=700) p.scatter('x', 'y', color='color', source=source) hover = HoverTool(tooltips=[("Content", "@content")]) p.add_tools(hover) logging.info("Plot created successfully.") # Save the plot to an HTML file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html") output_file(temp_file.name) save(p) logging.info("Plot saved to file.") return temp_file.name def gradio_interface(pdf_file, query): logging.info("Gradio interface called.") plot_path = generate_plot(query, pdf_file) with open(plot_path, "r") as f: html_content = f.read() logging.info("Returning HTML content.") return html_content iface = gr.Interface( fn=gradio_interface, inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")], outputs=gr.HTML(label="Visualization"), title="PDF Content Visualizer", description="Upload a PDF and enter a query to visualize the content." ) if __name__ == "__main__": iface.launch()