aquibmoin committed on
Commit
a3c1cd7
1 Parent(s): f560271

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF for extracting text from PDFs
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
# Hugging Face Hub id of the NASA bi-encoder checkpoint used for embeddings.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"

# Tokenizer and model are loaded once at import time (tokenizer first, then
# model) and shared by every call that generates an embedding.
bi_tokenizer, bi_model = (
    AutoTokenizer.from_pretrained(bi_encoder_model_name),
    AutoModel.from_pretrained(bi_encoder_model_name),
)
10
+
11
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Passed straight to ``fitz.open``; typically the path of
            the PDF on disk.

    Returns:
        str: The text of all pages in page order, with nothing inserted
        between pages. An empty string if the document yields no text.
    """
    with fitz.open(pdf_file) as doc:
        # join() assembles the result in a single pass rather than growing
        # a string with repeated ``+=``.
        return "".join(page.get_text() for page in doc)
18
+
19
def generate_embedding(text, max_length=512):
    """Embed text with the NASA bi-encoder using mask-aware mean pooling.

    Args:
        text: A string (or list of strings) to embed.
        max_length: Maximum number of tokens kept per input. Longer inputs
            are truncated, so only the leading ``max_length`` tokens of a
            long document contribute to its embedding.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions squeezed
        out (a 1-D vector for a single input string).
    """
    # Tokenize the text and create input tensors
    inputs = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = bi_model(**inputs)

    token_states = outputs.last_hidden_state

    # Weight each token by its attention mask so padding tokens (present
    # when several texts of different lengths are batched) do not dilute
    # the mean. For a single text the mask is all ones and this reduces to
    # a plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against divide-by-zero

    return (summed / token_counts).squeeze().numpy()
33
+
34
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding arrays as a scalar.

    Each array is reshaped into a single row so that the pairwise
    ``cosine_similarity`` helper produces a 1x1 matrix, whose only entry
    is returned.
    """
    row_a, row_b = (vec.reshape(1, -1) for vec in (embedding1, embedding2))
    score_matrix = cosine_similarity(row_a, row_b)
    return score_matrix[0][0]
40
+
41
def compare_pdfs(pdf1, pdf2):
    """Compare two PDFs and report the cosine similarity of their embeddings.

    Workflow: extract text from each PDF, embed each text with the
    bi-encoder, then compute the cosine similarity of the two embeddings.

    Args:
        pdf1: First PDF, in whatever form ``extract_text_from_pdf`` accepts.
        pdf2: Second PDF, same form as ``pdf1``.

    Returns:
        str: A sentence containing the similarity score formatted to four
        decimals, or an explanatory message when an input is missing or
        no text could be extracted.
    """
    # A file input left empty arrives as None; comparing against a missing
    # document would produce a meaningless score, so stop early.
    if pdf1 is None or pdf2 is None:
        return "Please upload both PDF files before comparing."

    # Extract text from both PDFs
    text1 = extract_text_from_pdf(pdf1)
    text2 = extract_text_from_pdf(pdf2)

    # PDFs without extractable text (e.g. image-only scans) would otherwise
    # be embedded as empty strings and yield a misleading score.
    if not text1.strip() or not text2.strip():
        return (
            "Could not extract any text from one or both PDFs, "
            "so no similarity was computed."
        )

    # Generate embeddings for both texts using the NASA Bi-Encoder
    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)

    # Compute cosine similarity between the two embeddings
    similarity_score = compute_cosine_similarity(embedding1, embedding2)

    return f"The cosine similarity between the two PDF documents is: {similarity_score:.4f}"
56
+
57
# Gradio interface: accept two PDF files and output the cosine similarity text.
# The original script used `gr` without ever importing it (NameError at
# startup); the import is placed next to its only use so this section is
# self-contained.
import gradio as gr

# `gr.inputs.*` / `gr.outputs.*` are legacy namespaces that current Gradio
# releases no longer provide; the top-level components are the replacement.
inputs = [gr.File(label="Upload Human SCDD"), gr.File(label="Upload AI SCDD")]
outputs = gr.Textbox(label="Cosine Similarity")

# Build the interface around compare_pdfs and start the web server.
gr.Interface(
    fn=compare_pdfs,
    inputs=inputs,
    outputs=outputs,
    title="PDF Cosine Similarity with NASA Bi-Encoder",
).launch()