File size: 2,652 Bytes
33f989b
a3c1cd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01437c3
 
a3c1cd7
 
01437c3
34aa3d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
import fitz  # PyMuPDF for extracting text from PDFs
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the NASA-specific bi-encoder model and tokenizer.
# These statements run once at import time, so every call to
# generate_embedding() reuses the same module-level tokenizer and model
# instead of loading them again per request.
# NOTE(review): from_pretrained() presumably resolves the identifier against
# the Hugging Face Hub (or a local cache) -- confirm before running offline.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as its first argument
            (in this app, the uploaded PDF handed over by Gradio).

    Returns:
        One string holding the text of all pages. It is empty when the
        document has no pages or no page yields any text.
    """
    with fitz.open(pdf_file) as doc:
        # Join once instead of growing a string with ``+=`` per page, which
        # can degrade to quadratic copying on long documents.
        return "".join(page.get_text() for page in doc)

# Function to generate embeddings from the text using the NASA Bi-Encoder
def generate_embedding(text):
    """Embed ``text`` with the module-level NASA bi-encoder.

    The text is tokenized with truncation at 512 tokens, so anything past
    that limit does not contribute to the embedding.

    Returns:
        The last hidden state averaged over the token axis, with singleton
        dimensions removed, converted to a NumPy array.
    """
    encoded = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no_grad() avoids tracking gradients for the forward pass.
    with torch.no_grad():
        model_output = bi_model(**encoded)

    # Mean-pool across tokens (dim=1), then squeeze away size-1 dimensions.
    token_states = model_output.last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Function to compute the cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Each embedding is reshaped into a single-row 2-D array before being
    handed to ``cosine_similarity``; the only entry of the resulting 1x1
    matrix is the score that gets returned.
    """
    first_row, second_row = (
        vector.reshape(1, -1) for vector in (embedding1, embedding2)
    )
    score_matrix = cosine_similarity(first_row, second_row)
    return score_matrix[0][0]

# Function to handle the full workflow: extract text, generate embeddings, and compute similarity
def compare_pdfs(pdf1, pdf2):
    """Compare two PDFs and report the cosine similarity of their embeddings.

    Workflow: extract the text of each PDF, embed both texts with the NASA
    bi-encoder, then compute the cosine similarity of the two embeddings.

    Args:
        pdf1: First uploaded PDF, or ``None`` when nothing was uploaded.
        pdf2: Second uploaded PDF, or ``None`` when nothing was uploaded.

    Returns:
        A human-readable message: the similarity formatted to four decimal
        places, or an explanation of why no comparison could be made.
    """
    # An empty upload slot reaches this function as None; there is nothing
    # meaningful to compare, so say so instead of scoring missing input.
    if pdf1 is None or pdf2 is None:
        return "Please upload both PDF documents before comparing."

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # A PDF without a text layer (e.g. scanned images) yields no text; an
    # embedding of empty text would produce a misleading similarity score.
    if not text1.strip() or not text2.strip():
        return "No extractable text was found in at least one of the PDF documents."

    # Generate embeddings for both texts using the NASA Bi-Encoder
    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)

    # Compute cosine similarity between the two embeddings
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"

# Gradio UI: two file uploads go in, one textbox with the similarity message comes out
inputs = [
    gr.File(label="Upload Human SCDD"),
    gr.File(label="Upload AI SCDD"),
]
outputs = gr.Textbox(label="Cosine Similarity Score")

# Wire compare_pdfs into an Interface, then start serving it
demo = gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
)
demo.launch()