|
import gradio as gr |
|
import fitz |
|
from transformers import AutoTokenizer, AutoModel |
|
import torch |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
# Hugging Face model id of the bi-encoder used to embed document text.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"

# Load the tokenizer and the encoder once at import time; the functions in
# this module read these two module-level objects on every call.
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Passed straight to ``fitz.open`` as its first argument
            (presumably a file path or uploaded-file handle; confirm
            against the caller).

    Returns:
        A single string with the text of all pages joined without any
        separator. An empty string is returned when no page yields text.
    """
    with fitz.open(pdf_file) as doc:
        # Join once instead of growing a string with ``+=`` per page, which
        # can degrade to quadratic copying on documents with many pages.
        return "".join(page.get_text() for page in doc)
|
|
|
|
|
def generate_embedding(text):
    """Embed *text* with the module-level bi-encoder via mean pooling.

    The input is tokenized with truncation to at most 512 tokens, so only
    the leading part of a long document contributes to the embedding.

    Args:
        text: The text to embed. A single string is the expected case; a
            list of strings is also tokenized as a padded batch.

    Returns:
        A NumPy array with the mean-pooled token representation. Singleton
        dimensions are squeezed away, so a single input string yields a
        1-D vector.
    """
    inputs = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = bi_model(**inputs)

    # Weight every token by its attention mask so that padding positions
    # (present when texts of different lengths are batched together) do not
    # dilute the average. For a single, unpadded text the mask is all ones
    # and this is the same as a plain mean over the token axis.
    token_states = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against /0
    pooled = summed / token_counts

    # ``.cpu()`` is a no-op on CPU tensors and keeps ``.numpy()`` valid if
    # the model has been moved to an accelerator.
    embedding = pooled.squeeze().cpu().numpy()
    return embedding
|
|
|
|
|
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding; must provide ``reshape`` (e.g. a NumPy
            array).
        embedding2: Second embedding, same requirements as ``embedding1``.

    Returns:
        The single similarity value taken from the 1x1 result matrix.
    """
    # cosine_similarity works on 2-D (samples x features) inputs, so each
    # vector is turned into a one-row matrix first.
    row_a, row_b = (vec.reshape(1, -1) for vec in (embedding1, embedding2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
|
|
|
|
|
def compare_pdfs(pdf1, pdf2):
    """Compare two PDF documents and describe their cosine similarity.

    The text of each PDF is extracted, embedded, and the two embeddings are
    compared with cosine similarity.

    Args:
        pdf1: First uploaded PDF, forwarded to ``extract_text_from_pdf``.
        pdf2: Second uploaded PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        A human-readable message. On success it contains the similarity
        score formatted to four decimal places; if an upload is missing or
        a document has no extractable text, it explains the problem
        instead of reporting a meaningless score.
    """
    # A missing upload arrives as an empty value; comparing it would produce
    # a score that does not correspond to any real document.
    if not pdf1 or not pdf2:
        return "Please upload both PDF documents before comparing."

    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # A PDF without a text layer (e.g. scanned pages) yields no text, and an
    # embedding of empty input says nothing about the document.
    if not text1.strip() or not text2.strip():
        return "Could not extract any text from at least one of the PDF documents."

    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)

    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"
|
|
|
|
|
# Two upload widgets, one for each document to compare.
inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]

# Single text field that shows the message returned by compare_pdfs.
outputs = gr.Textbox(label="Cosine Similarity Score")

# Build the web UI around compare_pdfs and start serving it right away.
gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="AI-Human SCDD Similarity Checker with NASA Bi-Encoder",
).launch()
|
|
|
|