Spaces:

aquibmoin
/

NASA-SMD-SIMILARITY-CHECKER

Sleeping

App Files Files Community

aquibmoin committed on Oct 22

Commit

a3c1cd7

•

1 Parent(s): f560271

Create app.py

Browse files

Files changed (1) hide show

app.py +62 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import fitz  # PyMuPDF for extracting text from PDFs
+import gradio as gr
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer, AutoModel
+# Load the NASA-specific bi-encoder model and tokenizer
+bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
+bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
+bi_model = AutoModel.from_pretrained(bi_encoder_model_name)
+# Function to extract text from a PDF
+def extract_text_from_pdf(pdf_file):
+    text = ""
+    with fitz.open(pdf_file) as doc:
+        for page in doc:
+            text += page.get_text()  # Extract text from each page
+    return text
+# Function to generate embeddings from the text using the NASA Bi-Encoder
+def generate_embedding(text):
+    # Tokenize the text and create input tensors
+    inputs = bi_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Use torch.no_grad() to disable gradient calculation during inference
+    with torch.no_grad():
+        # Pass inputs to the model to generate embeddings
+        outputs = bi_model(**inputs)
+    # Mean pooling to get the final embedding for the text
+    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+    return embedding
+# Function to compute the cosine similarity between two embeddings
+def compute_cosine_similarity(embedding1, embedding2):
+    # Reshape the embeddings and calculate cosine similarity
+    embedding1 = embedding1.reshape(1, -1)
+    embedding2 = embedding2.reshape(1, -1)
+    return cosine_similarity(embedding1, embedding2)[0][0]
+# Function to handle the full workflow: extract text, generate embeddings, and compute similarity
+def compare_pdfs(pdf1, pdf2):
+    # Extract text from both PDFs
+    text1 = extract_text_from_pdf(pdf1)
+    text2 = extract_text_from_pdf(pdf2)
+    # Generate embeddings for both texts using the NASA Bi-Encoder
+    embedding1 = generate_embedding(text1)
+    embedding2 = generate_embedding(text2)
+    # Compute cosine similarity between the two embeddings
+    similarity_score = compute_cosine_similarity(embedding1, embedding2)
+    # Return the similarity score
+    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"
+# Gradio interface: accept two PDF files and output cosine similarity score
+inputs = [gr.inputs.File(label="Upload Human SCDD"), gr.inputs.File(label="Upload AI SCDD")]
+outputs = gr.outputs.Textbox(label="Cosine Similarity")
+# Set up the Gradio interface
+gr.Interface(fn=compare_pdfs, inputs=inputs, outputs=outputs, title="PDF Cosine Similarity with NASA Bi-Encoder").launch()