Spaces:

aquibmoin
/

NASA-SMD-SIMILARITY-CHECKER

Sleeping

App Files Files Community

NASA-SMD-SIMILARITY-CHECKER / app.py

aquibmoin

Update app.py

01437c3 verified about 1 month ago

raw

history blame

2.65 kB

	import gradio as gr
	import fitz # PyMuPDF for extracting text from PDFs
	from transformers import AutoTokenizer, AutoModel
	import torch
	from sklearn.metrics.pairwise import cosine_similarity

	# Load the NASA-specific bi-encoder model and tokenizer.
	# Both objects are created once at module level and are reused by
	# generate_embedding() for every call, so they are not reloaded per request.
	bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
	bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
	bi_model = AutoModel.from_pretrained(bi_encoder_model_name)

	# Function to extract text from a PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_file: Passed straight to ``fitz.open``; presumably the file
	            path supplied by the Gradio upload widget -- TODO confirm
	            against the installed Gradio version.

	    Returns:
	        One string holding the text of all pages in document order. The
	        string is empty when no page yields any text.
	    """
	    with fitz.open(pdf_file) as doc:
	        # Build the result in a single join instead of growing a string
	        # page by page; the generator is consumed before the document
	        # is closed.
	        return "".join(page.get_text() for page in doc)

	# Function to generate embeddings from the text using the NASA Bi-Encoder
	def generate_embedding(text):
	    """Encode ``text`` into a single vector with the NASA bi-encoder.

	    The text is tokenized with truncation at 512 tokens, so content past
	    that limit does not influence the result. The last hidden state is
	    averaged over dimension 1 (a plain mean, with no attention-mask
	    weighting), squeezed, and converted to a NumPy array.

	    Args:
	        text: Input handed to the tokenizer; the caller in this file
	            passes a single string.

	    Returns:
	        A NumPy array with the pooled embedding.
	    """
	    encoded = bi_tokenizer(
	        text,
	        return_tensors="pt",
	        padding=True,
	        truncation=True,
	        max_length=512,
	    )

	    # Inference only: no gradients are needed for the forward pass.
	    with torch.no_grad():
	        model_output = bi_model(**encoded)

	    # Mean pooling across dimension 1 of the last hidden state.
	    token_states = model_output.last_hidden_state
	    pooled = token_states.mean(dim=1)
	    return pooled.squeeze().numpy()

	# Function to compute the cosine similarity between two embeddings
	def compute_cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity of two embeddings as a scalar.

	    Each embedding is reshaped to a single row so that the pairwise
	    ``cosine_similarity`` helper sees exactly one sample per side; the
	    only entry of the resulting matrix is returned.
	    """
	    row_a = embedding1.reshape(1, -1)
	    row_b = embedding2.reshape(1, -1)
	    similarity_matrix = cosine_similarity(row_a, row_b)
	    return similarity_matrix[0][0]

	# Function to handle the full workflow: extract text, generate embeddings, and compute similarity
	def compare_pdfs(pdf1, pdf2):
	    """Compare two uploaded PDFs and describe their cosine similarity.

	    Workflow: extract the text of each PDF, embed both texts with the
	    NASA bi-encoder, and compute the cosine similarity of the embeddings.

	    Args:
	        pdf1: First uploaded PDF, or ``None`` when nothing was uploaded.
	        pdf2: Second uploaded PDF, or ``None`` when nothing was uploaded.

	    Returns:
	        A human-readable string. Normally it reports the similarity score
	        to four decimal places; when an upload is missing or a document
	        yields no text it explains the problem instead of reporting a
	        score computed from empty input.
	    """
	    # Guard: without both files there is nothing meaningful to compare.
	    if pdf1 is None or pdf2 is None:
	        return "Please upload both PDF documents before comparing."

	    # Extract text from both PDFs
	    text1 = extract_text_from_pdf(pdf1)
	    text2 = extract_text_from_pdf(pdf2)

	    # Guard: a PDF without extractable text would otherwise be embedded
	    # as an empty string and produce a misleading score.
	    if not text1.strip() or not text2.strip():
	        return "No extractable text was found in one or both PDF documents."

	    # Generate embeddings for both texts using the NASA Bi-Encoder
	    embedding1 = generate_embedding(text1)
	    embedding2 = generate_embedding(text2)

	    # Compute cosine similarity between the two embeddings
	    similarity_score = compute_cosine_similarity(embedding1, embedding2)

	    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"

	# Gradio interface: accept two PDF files and output cosine similarity score.
	# The two File inputs are passed positionally to compare_pdfs(pdf1, pdf2),
	# and its returned string is shown in the Textbox.
	inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]
	outputs = gr.Textbox(label="Cosine Similarity Score")

	# Set up the Gradio interface and launch it immediately; this runs as a
	# module-level statement, i.e. whenever this file is executed or imported.
	gr.Interface(fn=compare_pdfs, inputs=inputs, outputs=outputs, title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder").launch()