umarigan committed on
Commit
e932fdf
1 Parent(s): f810f7b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from sentence_transformers import SentenceTransformer
3
+ import fitz # PyMuPDF
4
+ import numpy as np
5
+ from bokeh.plotting import figure, output_file, save
6
+ from bokeh.io import export_png
7
+ from bokeh.embed import file_html
8
+ from bokeh.resources import CDN
9
+ import tempfile
10
+ import os
11
+
12
# Name of the pretrained sentence-transformers checkpoint to load.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiated once at import time and reused by the functions below.
model = SentenceTransformer(MODEL_NAME)
14
+
15
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages, in page order, joined with single spaces.
    """
    # Use the document as a context manager so it is closed even when text
    # extraction fails part-way; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text() for page in doc)
22
+
23
def create_embeddings(text):
    """Split *text* into sentence-like chunks and embed each chunk.

    The split is a naive one on ``"."``; chunks are stripped of surrounding
    whitespace and blank chunks are discarded so that no empty string is
    sent to the model (a trailing period or consecutive periods would
    otherwise produce them).

    Args:
        text: The raw text to embed.

    Returns:
        A ``(embeddings, sentences)`` tuple. ``sentences`` is the list of
        non-empty chunks and ``embeddings`` is whatever ``model.encode``
        returns for that list. When no chunk survives, ``embeddings`` is an
        empty array with zero rows and ``sentences`` is ``[]``.
    """
    stripped = (chunk.strip() for chunk in text.split("."))
    sentences = [chunk for chunk in stripped if chunk]

    if not sentences:
        # Nothing to encode: return a well-shaped empty result instead of
        # relying on how the model handles an empty batch.
        width = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, width)), sentences

    embeddings = model.encode(sentences)
    return embeddings, sentences
29
+
30
def _project_to_2d(vectors):
    """Project row vectors onto their first two principal components."""
    if vectors.size == 0:
        return np.zeros((0, 2))
    count = vectors.shape[0]
    centered = vectors - vectors.mean(axis=0)
    # Rows of ``vt`` are the principal directions, strongest first.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    coords = centered @ vt[:2].T
    if coords.shape[1] < 2:
        # A single row (or 1-D data) yields fewer than two components.
        padding = np.zeros((count, 2 - coords.shape[1]))
        coords = np.hstack([coords, padding])
    return coords


def _cosine_similarity(vectors, query_vector):
    """Return the cosine similarity of each row of *vectors* to the query."""
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vector)
    # Avoid dividing by zero for all-zero vectors.
    norms = np.where(norms == 0, 1.0, norms)
    return (vectors @ query_vector) / norms


def _similarity_colors(scores):
    """Map scores to hex colours from blue (lowest) to red (highest)."""
    if len(scores) == 0:
        return []
    low = scores.min()
    span = scores.max() - low
    weights = (scores - low) / span if span > 0 else np.zeros_like(scores)
    return [
        f"#{int(round(255 * w)):02x}50{int(round(255 * (1 - w))):02x}"
        for w in weights
    ]


def generate_plot(query, pdf_file):
    """Render the sentences of a PDF as a 2-D scatter plot saved as HTML.

    The PDF text is embedded sentence by sentence, the embeddings are
    reduced to two dimensions with a PCA computed through numpy's SVD
    (no UMAP implementation is imported in this file), and each point is
    coloured by its cosine similarity to *query*.

    Args:
        query: Free-text query; when empty or blank every point gets the
            same colour and a similarity of 0.
        pdf_file: Path of the PDF file to visualise.

    Returns:
        Path of the generated standalone HTML file. A fresh temporary file
        is created on every call so that concurrent calls do not overwrite
        each other's output; the file is not deleted by this function.
    """
    # Imported here because the file-level imports do not include it.
    from bokeh.models import ColumnDataSource

    # Process the PDF and create embeddings
    text = process_pdf(pdf_file)
    embeddings, sentences = create_embeddings(text)

    vectors = np.asarray(embeddings, dtype=float)
    coords = _project_to_2d(vectors)
    count = coords.shape[0]

    if count and query and query.strip():
        query_vector = np.asarray(model.encode([query])[0], dtype=float)
        scores = _cosine_similarity(vectors, query_vector)
    else:
        scores = np.zeros(count)

    source = ColumnDataSource(
        data={
            "x": coords[:, 0].tolist(),
            "y": coords[:, 1].tolist(),
            "text": list(sentences)[:count],
            "similarity": scores.tolist(),
            "point_color": _similarity_colors(scores),
        }
    )

    p = figure(
        title="Sentence embeddings (PCA projection)",
        tooltips=[("similarity", "@similarity{0.00}"), ("text", "@text")],
    )
    p.scatter(
        "x", "y", source=source, size=8, color="point_color", alpha=0.8
    )

    # mkstemp gives a unique path; close the descriptor because Bokeh
    # reopens the file by name.
    fd, plot_path = tempfile.mkstemp(prefix="plot_", suffix=".html")
    os.close(fd)
    save(p, filename=plot_path, resources=CDN, title="PDF Content Visualizer")

    # Return the path to the saved file
    return plot_path
48
+
49
def gradio_interface(pdf_file, query):
    """Gradio callback: build the visualization for an uploaded PDF.

    Args:
        pdf_file: The uploaded file as handed over by Gradio. Either an
            object exposing the path as ``.name`` or the path string
            itself is accepted; ``None`` means nothing was uploaded.
        query: The text entered in the query box.

    Returns:
        An HTML string. When no file was uploaded this is a short notice;
        otherwise it is an ``<iframe>`` whose ``srcdoc`` holds the
        generated plot document.
    """
    from html import escape

    if pdf_file is None:
        return "<p>Please upload a PDF file.</p>"

    # Older Gradio versions pass a temp-file wrapper, newer ones a plain
    # path string; support both.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    plot_path = generate_plot(query, pdf_path)

    # Bokeh writes its HTML as UTF-8, so do not rely on the locale default.
    with open(plot_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    # The plot is a complete HTML document with <script> tags. Scripts
    # injected into an existing page as markup are not executed, so embed
    # the document in an iframe where it is loaded as a page of its own.
    return (
        '<iframe style="width: 100%; height: 650px; border: none;" '
        f'srcdoc="{escape(html_content, quote=True)}"></iframe>'
    )
59
+
60
# Set up the Gradio app.
# Components are taken from the top-level ``gr`` namespace: the
# ``gr.inputs`` / ``gr.outputs`` modules are deprecated in Gradio 3 and
# no longer exist in Gradio 4.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.HTML(label="Visualization"),  # Use gr.Image for image output
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content.",
)

# Run the app
iface.launch()