umarigan committed on
Commit
0164e97
1 Parent(s): c38bbc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -1
app.py CHANGED
@@ -8,22 +8,32 @@ import pandas as pd
8
  from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
9
  from sentence_transformers import SentenceTransformer
10
  import tempfile
 
 
 
 
11
 
12
  # Initialize the model globally
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
 
14
 
15
  def process_pdf(pdf_path):
 
16
  # Open the PDF
17
  doc = fitz.open(pdf_path)
18
  texts = [page.get_text() for page in doc]
 
19
  return " ".join(texts)
20
 
21
  def create_embeddings(text):
 
22
  sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
23
  embeddings = model.encode(sentences)
 
24
  return embeddings, sentences
25
 
26
  def generate_plot(query, pdf_file):
 
27
  # Generate embeddings for the query
28
  query_embedding = model.encode([query])[0]
29
 
@@ -31,6 +41,7 @@ def generate_plot(query, pdf_file):
31
  text = process_pdf(pdf_file.name)
32
  embeddings, sentences = create_embeddings(text)
33
 
 
34
  # Prepare the data for UMAP and visualization
35
  all_embeddings = np.vstack([embeddings, query_embedding])
36
  all_sentences = sentences + [query]
@@ -39,6 +50,7 @@ def generate_plot(query, pdf_file):
39
  umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
40
  umap_embeddings = umap_transform.fit_transform(all_embeddings)
41
 
 
42
  # Find the closest sentences to the query
43
  distances = cosine_similarity([query_embedding], embeddings)[0]
44
  closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
@@ -59,16 +71,20 @@ def generate_plot(query, pdf_file):
59
  hover = HoverTool(tooltips=[("Content", "@content")])
60
  p.add_tools(hover)
61
 
 
62
  # Save the plot to an HTML file
63
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
64
  output_file(temp_file.name)
65
  save(p)
 
66
  return temp_file.name
67
 
68
  def gradio_interface(pdf_file, query):
 
69
  plot_path = generate_plot(query, pdf_file)
70
  with open(plot_path, "r") as f:
71
  html_content = f.read()
 
72
  return html_content
73
 
74
  iface = gr.Interface(
@@ -80,4 +96,4 @@ iface = gr.Interface(
80
  )
81
 
82
  if __name__ == "__main__":
83
- iface.launch()
 
8
  from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
9
  from sentence_transformers import SentenceTransformer
10
  import tempfile
11
+ import logging
12
+
13
+ # Set up logging
14
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
 
16
  # Initialize the model globally
17
  model = SentenceTransformer('all-MiniLM-L6-v2')
18
+ logging.info("Model loaded successfully.")
19
 
20
  def process_pdf(pdf_path):
21
+ logging.info(f"Processing PDF: {pdf_path}")
22
  # Open the PDF
23
  doc = fitz.open(pdf_path)
24
  texts = [page.get_text() for page in doc]
25
+ logging.info("PDF processed successfully.")
26
  return " ".join(texts)
27
 
28
  def create_embeddings(text):
29
+ logging.info("Creating embeddings.")
30
  sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
31
  embeddings = model.encode(sentences)
32
+ logging.info("Embeddings created successfully.")
33
  return embeddings, sentences
34
 
35
  def generate_plot(query, pdf_file):
36
+ logging.info("Generating plot.")
37
  # Generate embeddings for the query
38
  query_embedding = model.encode([query])[0]
39
 
 
41
  text = process_pdf(pdf_file.name)
42
  embeddings, sentences = create_embeddings(text)
43
 
44
+ logging.info("Data prepared for UMAP.")
45
  # Prepare the data for UMAP and visualization
46
  all_embeddings = np.vstack([embeddings, query_embedding])
47
  all_sentences = sentences + [query]
 
50
  umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
51
  umap_embeddings = umap_transform.fit_transform(all_embeddings)
52
 
53
+ logging.info("UMAP transformation completed.")
54
  # Find the closest sentences to the query
55
  distances = cosine_similarity([query_embedding], embeddings)[0]
56
  closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
 
71
  hover = HoverTool(tooltips=[("Content", "@content")])
72
  p.add_tools(hover)
73
 
74
+ logging.info("Plot created successfully.")
75
  # Save the plot to an HTML file
76
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
77
  output_file(temp_file.name)
78
  save(p)
79
+ logging.info("Plot saved to file.")
80
  return temp_file.name
81
 
82
  def gradio_interface(pdf_file, query):
83
+ logging.info("Gradio interface called.")
84
  plot_path = generate_plot(query, pdf_file)
85
  with open(plot_path, "r") as f:
86
  html_content = f.read()
87
+ logging.info("Returning HTML content.")
88
  return html_content
89
 
90
  iface = gr.Interface(
 
96
  )
97
 
98
  if __name__ == "__main__":
99
+ iface.launch()