Spaces:

aquibmoin
/

NASA-SMD-SIMILARITY-CHECKER

Sleeping

File size: 2,652 Bytes

import gradio as gr
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the NASA-specific bi-encoder model and tokenizer.
# Both are created once, at module import, from the same checkpoint name and
# are used as module-level globals by generate_embedding() below.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The PDF to read, in a form accepted by ``fitz.open``
            (this app passes the value produced by a Gradio file upload).

    Returns:
        The text of all pages joined in page order; an empty string when
        no page yields any text.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` or empty, i.e. nothing was
            uploaded.
    """
    # Without this guard a missing upload would be handed to fitz.open(),
    # which treats "no filename" as a request for a new, empty document and
    # would silently yield empty text instead of reporting the problem.
    if not pdf_file:
        raise ValueError("No PDF file was provided.")

    with fitz.open(pdf_file) as doc:
        # Join once rather than growing a string with += for every page.
        return "".join(page.get_text() for page in doc)

# Function to generate embeddings from the text using the NASA Bi-Encoder
def generate_embedding(text):
    """Embed ``text`` with the NASA bi-encoder using mean pooling.

    The text is tokenized with ``truncation=True`` and ``max_length=512``,
    so only the leading 512 tokens of a long input contribute to the
    result.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the mean of the model's last hidden state
        over dimension 1, with size-1 dimensions squeezed out.
        (Assumes a single string is passed so the result is a 1-D
        vector -- TODO confirm against callers.)
    """
    encoded = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        model_output = bi_model(**encoded)

    # Average the per-token hidden states into one vector.
    token_states = model_output.last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Function to compute the cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding; must support ``reshape(1, -1)``.
        embedding2: Second embedding; must support ``reshape(1, -1)``.

    Returns:
        The single entry of the 1x1 similarity matrix computed by
        ``cosine_similarity``.
    """
    # cosine_similarity works on 2-D inputs, so turn each vector into one row.
    row_a, row_b = (vec.reshape(1, -1) for vec in (embedding1, embedding2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Function to handle the full workflow: extract text, generate embeddings, and compute similarity
def compare_pdfs(pdf1, pdf2):
    """Compare two uploaded PDFs and report their cosine similarity.

    Runs the full workflow: extract text from both PDFs, embed each text
    with the NASA bi-encoder, and compute the cosine similarity of the
    two embeddings.

    Args:
        pdf1: First uploaded PDF, as supplied by the Gradio file input.
        pdf2: Second uploaded PDF, as supplied by the Gradio file input.

    Returns:
        A message string for the output text box: the similarity score
        formatted to four decimals, or an explanation when an upload is
        missing or no text could be extracted.
    """
    # A score is only meaningful with two real documents; without this
    # guard a missing upload would flow through and still produce a number.
    if not pdf1 or not pdf2:
        return "Please upload both PDF documents before comparing."

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # PDFs without a text layer (e.g. scanned images) yield no text, and an
    # embedding of empty text would give a misleading similarity.
    if not text1.strip() or not text2.strip():
        return "Could not extract any text from one or both PDF documents."

    # Generate embeddings for both texts using the NASA Bi-Encoder
    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)

    # Compute cosine similarity between the two embeddings
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"

# Gradio UI wiring: two PDF uploads in, one text box with the result message out.
inputs = [
    gr.File(label="Upload Human SCDD"),
    gr.File(label="Upload AI SCDD"),
]
outputs = gr.Textbox(label="Cosine Similarity Score")

# Build the interface around compare_pdfs and start serving it right away.
gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
).launch()