"""Gradio app: embed the sentences of an uploaded PDF and plot them in 2-D.

The PDF text is split into sentences, each sentence is embedded with a
SentenceTransformer model, the embeddings (plus the user's query) are
projected to two dimensions, and the result is rendered as a Bokeh scatter
plot that is returned to Gradio as HTML.
"""

import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import numpy as np
from bokeh.embed import file_html
from bokeh.io import export_png
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_file, save
from bokeh.resources import CDN
from sentence_transformers import SentenceTransformer

# Load the model once at import time so every request reuses it.
model = SentenceTransformer('all-MiniLM-L6-v2')


def process_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, space-joined."""
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text() for page in doc)


def create_embeddings(text):
    """Split *text* into sentences and embed each one.

    Returns:
        A tuple ``(embeddings, sentences)``. ``sentences`` is the list of
        non-empty, stripped fragments and ``embeddings`` holds one row per
        sentence. When *text* contains no usable fragment, ``embeddings`` is
        an empty array and ``sentences`` is an empty list.
    """
    # Simplistic split on "."; consider using a real sentence splitter.
    # Blank fragments (e.g. the one after the final period) are dropped so
    # they are not embedded and plotted as meaningless points.
    fragments = (piece.strip() for piece in text.split("."))
    sentences = [piece for piece in fragments if piece]
    if not sentences:
        return np.empty((0, 0)), sentences
    embeddings = model.encode(sentences)
    return embeddings, sentences


def _project_to_2d(vectors):
    """Project the rows of *vectors* onto their first two principal axes.

    Plain PCA via SVD, using only numpy. The original comments mention UMAP,
    which this file does not import; swap it in here if it is available.
    Always returns an array of shape ``(n_rows, 2)``.
    """
    vectors = np.asarray(vectors, dtype=float)
    centered = vectors - vectors.mean(axis=0)
    # Rows of ``vt`` are the principal axes, ordered by explained variance.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    coords = centered @ vt[:2].T
    if coords.shape[1] < 2:
        # Fewer than two axes exist (e.g. a single point): pad with zeros.
        padding = np.zeros((coords.shape[0], 2 - coords.shape[1]))
        coords = np.hstack([coords, padding])
    return coords


def generate_plot(query, pdf_file):
    """Build the embedding scatter plot for a PDF and save it as HTML.

    Args:
        query: Free-text query; when non-empty it is embedded and drawn as a
            highlighted marker among the sentence points.
        pdf_file: Path to the PDF file to visualize.

    Returns:
        The path of the saved Bokeh HTML file (``"plot.html"``).
    """
    # Process the PDF and create embeddings.
    text = process_pdf(pdf_file)
    embeddings, sentences = create_embeddings(text)

    p = figure(
        title="PDF sentence embeddings",
        tools="pan,wheel_zoom,box_zoom,reset,save",
        tooltips=[("text", "@text")],
    )

    query = (query or "").strip()
    if sentences:
        vectors = np.asarray(embeddings)
        if query:
            # Project the query together with the sentences so that all
            # points share the same 2-D coordinate system.
            vectors = np.vstack([vectors, model.encode([query])])
        coords = _project_to_2d(vectors)

        count = len(sentences)
        sentence_source = ColumnDataSource(
            data={
                "x": coords[:count, 0],
                "y": coords[:count, 1],
                "text": sentences,
            }
        )
        p.scatter(
            "x",
            "y",
            source=sentence_source,
            size=8,
            alpha=0.6,
            legend_label="PDF sentences",
        )
        if query:
            query_source = ColumnDataSource(
                data={
                    "x": coords[count:, 0],
                    "y": coords[count:, 1],
                    "text": [query],
                }
            )
            p.scatter(
                "x",
                "y",
                source=query_source,
                size=14,
                color="red",
                marker="triangle",
                legend_label="Query",
            )
    else:
        # Nothing to plot (e.g. a scanned PDF without a text layer).
        p.title.text = "No extractable text found in the PDF"

    output_file("plot.html")
    save(p)
    # Alternatively, export_png(p, filename="plot.png") saves a PNG instead.

    # Return the path to the saved file.
    return "plot.html"


def gradio_interface(pdf_file, query):
    """Gradio handler: return the plot for the uploaded PDF as an HTML string.

    Args:
        pdf_file: The uploaded file as passed by Gradio, or ``None`` when
            nothing was uploaded.
        query: The query text entered by the user.
    """
    if pdf_file is None:
        return "<p>Please upload a PDF file.</p>"

    # Gradio passes a file wrapper exposing ``.name``; a plain path string is
    # tolerated as well (presumably what newer Gradio versions pass - confirm).
    pdf_path = getattr(pdf_file, "name", pdf_file)
    plot_path = generate_plot(query, pdf_path)

    # Bokeh writes its HTML as UTF-8, so read it back with the same encoding
    # rather than the platform default.
    # NOTE(review): gr.HTML may not execute the <script> tags in Bokeh's
    # output; if the plot renders blank, consider embedding it in an iframe.
    with open(plot_path, "r", encoding="utf-8") as f:
        return f.read()


# Set up the Gradio app.
# NOTE(review): the ``gr.inputs`` / ``gr.outputs`` namespaces are the legacy
# component API; recent Gradio releases expose ``gr.File``, ``gr.Textbox``
# and ``gr.HTML`` instead. Confirm against the installed Gradio version.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.inputs.File(label="Upload PDF"), gr.inputs.Textbox(label="Query")],
    outputs=gr.outputs.HTML(label="Visualization"),  # Use gr.outputs.Image for image output
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content."
)

# Run the app
iface.launch()