Spaces:

nafees369
/

NER

Sleeping

App Files Files Community

nafees369 committed on Sep 1

Commit

3e35703

•

1 Parent(s): bf5a713

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -39

app.py CHANGED Viewed

@@ -1,74 +1,115 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import fitz  # PyMuPDF for PDF handling
-# Load a pre-trained NER model
 model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
-model = AutoModelForTokenClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
-# Function to extract text from a PDF file
 def extract_text_from_pdf(file_path):
-    doc = fitz.open(file_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return text.strip()
-# Function to map recognized entities to custom labels
-def map_labels(entity_label, label_map):
-    for custom_label, ner_labels in label_map.items():
-        if entity_label in ner_labels:
-            return custom_label
     return None
 # Function to process the text and extract entities based on custom labels
 def process_text(file, labels):
-    # Extract text from the PDF file
-    text = extract_text_from_pdf(file.name)
     # Define the custom label mapping
     label_map = {
         "Name": ["PER"],
         "Organization": ["ORG"],
         "Location": ["LOC"],
         "Project": ["MISC"],
         "Education": ["MISC"],
     }
-    # Split the custom labels provided by the user
-    requested_labels = [label.strip() for label in labels.split(",")]
-    # Perform NER on the extracted text
-    ner_results = ner_pipeline(text)
     # Initialize a dictionary to hold the extracted information
     extracted_info = {label: [] for label in requested_labels}
     # Process the NER results
     for entity in ner_results:
-        # Remove subword tokens (##) and map the entity to the custom labels
         entity_text = entity['word'].replace("##", "")
-        mapped_label = map_labels(entity['entity_group'], label_map)
-        # If the mapped label is in the requested labels, store the entity
-        if mapped_label in extracted_info:
-            extracted_info[mapped_label].append(entity_text)
     # Format the output
     output = ""
     for label, entities in extracted_info.items():
         if entities:
-            output += f"{label}: {', '.join(sorted(set(entities)))}\n"
         else:
             output += f"{label}: No information found.\n"
     return output.strip()
 # Create Gradio components
-file_input = gr.File(label="Upload a PDF file")
 label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
 output_text = gr.Textbox(label="Extracted Information")
@@ -77,9 +118,9 @@ iface = gr.Interface(
     fn=process_text,
     inputs=[file_input, label_input],
     outputs=output_text,
-    title="NER with Custom Labels from PDF",
-    description="Upload a PDF file and extract entities based on custom labels."
 )
 # Launch the Gradio interface
-iface.launch()

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+from sentence_transformers import SentenceTransformer, util
 import fitz  # PyMuPDF for PDF handling
+import torch
+import docx  # For DOCX handling
+# Load pre-trained models
+# These run once at import time; the functions below use ner_pipeline and
+# embedding_model as module-level globals.
+# Checkpoint name shared by the token-classification model and its tokenizer.
 model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
+ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# process_text reads the 'word' and 'entity_group' keys of each result.
+ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
+# Sentence-embedding model used by calculate_similarity to compare label names.
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Function to extract text from a PDF file with error handling
 def extract_text_from_pdf(file_path):
+    """Return the concatenated, stripped text of every page of a PDF.
+
+    Args:
+        file_path: Path of the PDF file, passed to ``fitz.open``.
+
+    Returns:
+        The extracted text. On any failure, a message starting with
+        "Error extracting text from PDF:" is returned instead of raising;
+        process_text detects a failed extraction by that "Error" prefix.
+    """
+    try:
+        # The context manager closes the document even if a page fails to
+        # parse, so the file handle is never leaked.
+        with fitz.open(file_path) as doc:
+            return "".join(page.get_text() for page in doc).strip()
+    except Exception as e:
+        return f"Error extracting text from PDF: {str(e)}"
+# Function to extract text from a DOCX file
+def extract_text_from_docx(file_path):
+    """Return the text of a DOCX file, one paragraph per line, stripped.
+
+    Args:
+        file_path: Path of the DOCX file, passed to ``docx.Document``.
+
+    Returns:
+        The paragraph texts joined with newlines. On any failure, a message
+        starting with "Error extracting text from DOCX:" is returned
+        instead of raising.
+    """
+    try:
+        paragraphs = docx.Document(file_path).paragraphs
+        joined = "\n".join(paragraph.text for paragraph in paragraphs)
+    except Exception as e:
+        return f"Error extracting text from DOCX: {str(e)}"
+    return joined.strip()
+# Function to calculate cosine similarity
+def calculate_similarity(input_label, predefined_labels):
+    """Find the predefined label whose embedding is closest to *input_label*.
+
+    Args:
+        input_label: The label text to look up.
+        predefined_labels: List of candidate label strings.
+
+    Returns:
+        A ``(label, score)`` tuple: the candidate with the highest cosine
+        similarity to ``input_label`` and that similarity as a float.
+    """
+    query_vector = embedding_model.encode(input_label, convert_to_tensor=True)
+    candidate_vectors = embedding_model.encode(predefined_labels, convert_to_tensor=True)
+    # Score matrix has one row (the query) and one column per candidate.
+    scores = util.pytorch_cos_sim(query_vector, candidate_vectors)
+    winner = torch.argmax(scores).item()
+    winning_score = scores[0][winner].item()
+    return predefined_labels[winner], winning_score
+# Function to map a user-supplied label to the closest predefined label using cosine similarity
+def map_labels_with_similarity(input_label, label_map, threshold=0.7):
+    """Return the key of *label_map* that best matches *input_label*.
+
+    Args:
+        input_label: Label text supplied by the user.
+        label_map: Dict whose keys are the predefined label names.
+        threshold: Similarity the best candidate must exceed to count as a
+            match. Defaults to 0.7.
+
+    Returns:
+        The best-matching key, or None when ``label_map`` is empty or no
+        key scores above ``threshold``.
+    """
+    predefined_labels = list(label_map)
+    if not predefined_labels:
+        # Nothing to compare against; argmax over an empty score matrix
+        # would raise.
+        return None
+    best_match_label, similarity_score = calculate_similarity(input_label, predefined_labels)
+    if similarity_score > threshold:  # Threshold for considering a match
+        return best_match_label
     return None
 # Function to process the text and extract entities based on custom labels
 def process_text(file, labels):
+    """Extract the requested entity types from an uploaded PDF or DOCX file.
+
+    Args:
+        file: Uploaded file object whose ``name`` attribute is passed to the
+            extractors as the file path. May be None when nothing was uploaded.
+        labels: Comma-separated label names typed by the user.
+
+    Returns:
+        One "<label>: <entities>" line per requested label (or
+        "<label>: No information found."), or a message explaining why
+        nothing could be extracted.
+    """
+    if file is None:
+        return "No file uploaded. Please upload a PDF or DOCX file."
+    # Determine the file type and extract text accordingly. The extension is
+    # compared case-insensitively so ".PDF" and ".Docx" are accepted too.
+    lowered_name = file.name.lower()
+    if lowered_name.endswith(".pdf"):
+        text = extract_text_from_pdf(file.name)
+    elif lowered_name.endswith(".docx"):
+        text = extract_text_from_docx(file.name)
+    else:
+        return "Unsupported file type. Please upload a PDF or DOCX file."
+    # The extractors report failure as a message with one of these prefixes.
+    # Matching the full prefix (not just "Error") keeps a document whose own
+    # text begins with the word "Error" from being mistaken for a failure.
+    if text.startswith(("Error extracting text from PDF:", "Error extracting text from DOCX:")):
+        return text  # Return the error message if text extraction failed
     # Define the custom label mapping
     label_map = {
         "Name": ["PER"],
         "Organization": ["ORG"],
         "Location": ["LOC"],
+        "Address": ["LOC"],  # Address mapped to Location
         "Project": ["MISC"],
         "Education": ["MISC"],
     }
+    # Split the custom labels provided by the user and handle potential input issues
+    requested_labels = [label.strip().capitalize() for label in (labels or "").split(",") if label.strip()]
+    if not requested_labels:
+        return "No valid labels provided. Please enter valid labels to extract."
+    # Resolve every requested label to its NER groups once, up front: the
+    # match does not depend on the entity, and each lookup runs the
+    # embedding model.
+    resolved_groups = {}
+    for input_label in requested_labels:
+        best_match_label = map_labels_with_similarity(input_label, label_map)
+        if best_match_label:
+            resolved_groups[input_label] = label_map[best_match_label]
     # Initialize a dictionary to hold the extracted information
     extracted_info = {label: [] for label in requested_labels}
+    # Perform NER on the extracted text
+    ner_results = ner_pipeline(text)
     # Process the NER results
     for entity in ner_results:
         entity_text = entity['word'].replace("##", "")
+        entity_group = entity['entity_group']
+        for input_label, ner_groups in resolved_groups.items():
+            if entity_group in ner_groups:
+                extracted_info[input_label].append(entity_text)
     # Format the output
+    lines = []
     for label, entities in extracted_info.items():
         if entities:
+            # Remove duplicates and clean up the entities
+            unique_entities = sorted(set(entities))
+            lines.append(f"{label}: {', '.join(unique_entities)}")
         else:
+            lines.append(f"{label}: No information found.")
+    return "\n".join(lines).strip()
 # Create Gradio components
+file_input = gr.File(label="Upload a PDF or DOCX file")
 label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
 output_text = gr.Textbox(label="Extracted Information")
 # Build the interface: process_text receives the file and the label text
 # and its return value is shown in the output textbox.
 iface = gr.Interface(
     fn=process_text,
     inputs=[file_input, label_input],
     outputs=output_text,
+    title="NER with Custom Labels from PDF or DOCX",
+    description="Upload a PDF or DOCX file and extract entities based on custom labels."
 )
 # Launch the Gradio interface
+iface.launch()