Spaces:

umarigan
/

SemanticSearch

Sleeping

App Files Files Community

SemanticSearch / app.py

umarigan

Update app.py

0ce5ee0 verified 7 months ago

raw

history blame contribute delete

4.85 kB

	import gradio as gr
	import fitz # PyMuPDF for reading PDFs
	import numpy as np
	import pandas as pd
	import logging
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
	from sklearn.metrics.pairwise import linear_kernel as dot_similarity # For dot product
	import umap
	import plotly.graph_objects as go

	# Configure the root logger once for the whole app: INFO level, timestamped records.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	# Load the sentence-embedding model a single time at import so every request reuses it.
	model = SentenceTransformer('all-MiniLM-L6-v2')
	logging.info("Model loaded successfully.")

	def process_pdf(pdf_path):
	    """Extract the text of every page of a PDF and join it into one string.

	    :param pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
	    :return: The text of all pages, in page order, separated by single spaces.
	    """
	    logging.info(f"Processing PDF: {pdf_path}")
	    # Open the document as a context manager so its file handle is released
	    # even if text extraction raises part-way through the pages.
	    with fitz.open(pdf_path) as doc:
	        texts = [page.get_text() for page in doc]
	    logging.info("PDF processed successfully.")
	    return " ".join(texts)

	def create_embeddings(text):
	    """Split ``text`` into sentence-like chunks and embed each one.

	    :param text: The full document text.
	    :return: Tuple ``(embeddings, sentences)`` where ``sentences`` is the list of
	        chunks and ``embeddings`` is what ``model.encode`` returns for that list.
	    """
	    logging.info("Creating embeddings.")
	    # Naive splitter: breaks on a period followed by a space only.
	    # Consider a more robust sentence splitter.
	    chunks = text.split(". ")
	    vectors = model.encode(chunks)
	    logging.info("Embeddings created successfully.")
	    return vectors, chunks

	def calculate_distances(embeddings, query_embedding, metric):
	    """Compute the distance from every embedding to the query embedding.

	    :param embeddings: Sentence embeddings, one row per sentence.
	    :param query_embedding: Single embedding vector for the query.
	    :param metric: One of "cosine", "euclidean", "manhattan" or "dot".
	    :return: Flattened array with one value per row of ``embeddings``;
	        a smaller value means closer to the query.
	    :raises ValueError: If ``metric`` is not one of the supported names.
	    """
	    # The pairwise helpers compare two collections, so wrap the single query vector.
	    query_rows = [query_embedding]
	    if metric == "cosine":
	        # Turn similarity into a distance so that smaller is closer.
	        distances = 1 - cosine_similarity(embeddings, query_rows)
	    elif metric == "euclidean":
	        distances = euclidean_distances(embeddings, query_rows)
	    elif metric == "manhattan":
	        distances = manhattan_distances(embeddings, query_rows)
	    elif metric == "dot":
	        # Negated for consistency with the other metrics (smaller is closer).
	        distances = -dot_similarity(embeddings, query_rows)
	    else:
	        # Previously an unknown (or None) metric fell through every branch and
	        # failed with an UnboundLocalError on `distances`.
	        raise ValueError(
	            f"Unsupported metric: {metric!r}. "
	            "Expected one of: cosine, euclidean, manhattan, dot."
	        )
	    return distances.flatten()

	def wrap_text(text, width=40):
	    """
	    Break text into fixed-width pieces joined by HTML line breaks, for Plotly hover text.
	    :param text: The text to wrap.
	    :param width: Number of characters per piece; cuts are made by position, not at word boundaries.
	    :return: The pieces joined with '<br>'.
	    """
	    offsets = range(0, len(text), width)
	    return '<br>'.join(text[start:start + width] for start in offsets)

	def generate_plotly_figure(query, pdf_file, metric):
	    """Build a 2-D UMAP scatter plot of the PDF's sentences together with the query.

	    :param query: Query text to embed and plot.
	    :param pdf_file: The PDF to read: either an object exposing its path as
	        ``.name`` (an upload handle) or a plain path (``str`` / ``os.PathLike``).
	    :param metric: Distance metric name forwarded to ``calculate_distances``.
	    :return: Plotly figure in which the five sentences closest to the query are
	        green, the remaining sentences are blue and the query itself is red.
	    :raises ValueError: If ``pdf_file`` is ``None``.
	    """
	    import os  # local import: only needed to recognise path-like arguments

	    logging.info("Generating plot with Plotly.")
	    if pdf_file is None:
	        # Fail with a clear message instead of an AttributeError on `.name`.
	        raise ValueError("No PDF file was provided.")
	    # Plain paths are used as-is; upload handles expose their path as `.name`.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = pdf_file
	    else:
	        pdf_path = pdf_file.name

	    query_embedding = model.encode([query])[0]
	    text = process_pdf(pdf_path)
	    embeddings, sentences = create_embeddings(text)

	    # Hover labels, wrapped so long sentences stay readable in the tooltip.
	    hover_sentences = [wrap_text(sentence) for sentence in sentences]
	    hover_query = wrap_text(query)

	    # Project sentences and query together; the query is the last row.
	    all_embeddings = np.vstack([embeddings, query_embedding])
	    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
	    umap_embeddings = umap_transform.fit_transform(all_embeddings)
	    sentence_points = umap_embeddings[:-1]
	    query_point = umap_embeddings[-1]

	    # Distances are computed in the original embedding space, not the 2-D projection.
	    distances = calculate_distances(embeddings, query_embedding, metric)
	    closest_indices = set(np.argsort(distances)[:5].tolist())  # 5 closest sentences
	    colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))]

	    fig = go.Figure(data=go.Scatter(
	        x=sentence_points[:, 0], y=sentence_points[:, 1], mode='markers',
	        marker=dict(color=colors), text=hover_sentences,
	        name='Chunks', hoverinfo='text'))
	    fig.add_trace(go.Scatter(
	        x=[query_point[0]], y=[query_point[1]], mode='markers',
	        marker=dict(color='red'), text=[hover_query], name='Query', hoverinfo='text'))
	    fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

	    logging.info("Plotly figure created successfully.")
	    return fig

	def gradio_interface(pdf_file, query, metric):
	    """Gradio callback: check the form inputs and return the Plotly figure.

	    :param pdf_file: Value of the "Upload a PDF" input.
	    :param query: Value of the "Query" textbox.
	    :param metric: Value of the "Choose Distance Metric" radio group.
	    :return: The figure produced by ``generate_plotly_figure``.
	    :raises gr.Error: If no PDF was uploaded or no metric was selected.
	    """
	    # Lazy %-formatting: building the message with "+" raised a TypeError
	    # whenever the metric was not a string (e.g. None when nothing is selected).
	    logging.info("Gradio interface called with metric: %s", metric)
	    # Surface missing inputs as readable messages in the UI rather than as
	    # low-level exceptions from deeper in the pipeline.
	    if pdf_file is None:
	        raise gr.Error("Please upload a PDF file.")
	    if not metric:
	        raise gr.Error("Please choose a distance metric.")
	    fig = generate_plotly_figure(query, pdf_file, metric)
	    logging.info("Returning Plotly figure.")
	    return fig

	# Gradio UI definition. The order of `inputs` must match the parameter order
	# of `gradio_interface` (pdf_file, query, metric).
	iface = gr.Interface(
	    title="Semantic Search Visualizer",
	    description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
	as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
	within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
	to explore different aspects of textual similarity.
	umap args: n_neighbors=15, min_dist=0.0,
	Green dots are the closest vectors
	""",
	    fn=gradio_interface,
	    inputs=[
	        gr.File(label="Upload a PDF"),
	        gr.Textbox(label="Query"),
	        gr.Radio(
	            label="Choose Distance Metric",
	            choices=["cosine", "euclidean", "manhattan", "dot"],
	        ),
	    ],
	    outputs=gr.Plot(),
	)

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()