import os

import fitz  # PyMuPDF for extracting text from PDFs
import gradio as gr
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# Load the NASA-specific bi-encoder model and tokenizer once at import time so
# that every comparison request reuses the same objects.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)


def _resolve_upload_path(uploaded):
    """Return something ``fitz.open`` can use as a filename.

    Depending on the Gradio version/configuration, a ``gr.File`` input hands
    the callback either a path string or a tempfile-like wrapper that exposes
    the on-disk location as ``.name``. Paths are returned untouched; for any
    other object the ``.name`` attribute is used when present.
    """
    if isinstance(uploaded, (str, os.PathLike)):
        return uploaded
    return getattr(uploaded, "name", uploaded)


def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path as ``.name``.

    Returns:
        The text of all pages concatenated in page order (no separator is
        inserted between pages).
    """
    with fitz.open(_resolve_upload_path(pdf_file)) as doc:
        # join() avoids the repeated re-allocation of ``text += ...``.
        return "".join(page.get_text() for page in doc)


def generate_embedding(text):
    """Embed ``text`` with the NASA bi-encoder using mean pooling.

    NOTE: the tokenizer is called with ``truncation=True, max_length=512``, so
    only the leading tokens of a long document contribute to the embedding.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the mean of the model's last hidden states over
        the token dimension (1-D when ``text`` is a single string).
    """
    # Tokenize the text and create input tensors.
    inputs = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: disable gradient tracking.
    with torch.no_grad():
        outputs = bi_model(**inputs)

    # Mean pooling over the token dimension gives one vector per input.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding (array with a ``reshape`` method).
        embedding2: Second embedding, same number of elements as the first.

    Returns:
        The scalar cosine similarity of the two vectors.
    """
    # scikit-learn expects 2-D inputs of shape (n_samples, n_features).
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)
    return cosine_similarity(embedding1, embedding2)[0][0]


def compare_pdfs(pdf1, pdf2):
    """Run the full workflow: extract text, embed, and compare two PDFs.

    Args:
        pdf1: First uploaded PDF (as delivered by ``gr.File``).
        pdf2: Second uploaded PDF (as delivered by ``gr.File``).

    Returns:
        A human-readable string with the cosine similarity, or an explanatory
        message when an upload is missing or no text could be extracted.
    """
    # Gradio passes None for an input the user left empty.
    if pdf1 is None or pdf2 is None:
        return "Please upload both PDF documents before comparing."

    # Extract text from both PDFs.
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # Without any text the score would be meaningless, so report that instead
    # of embedding empty strings.
    if not text1.strip() or not text2.strip():
        return (
            "No extractable text was found in one or both PDF documents, "
            "so a similarity score cannot be computed."
        )

    # Generate embeddings for both texts using the NASA Bi-Encoder.
    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)

    # Compute cosine similarity between the two embeddings.
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    # Return the similarity score.
    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"


# Gradio interface: accept two PDF files and output cosine similarity score.
inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]
outputs = gr.Textbox(label="Cosine Similarity Score")

# Set up the Gradio interface and start serving it.
demo = gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
)
demo.launch()