|
|
|
|
|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline |
|
from sentence_transformers import SentenceTransformer, util |
|
import fitz |
|
import torch |
|
import docx |
|
|
|
|
|
# Checkpoint used for named-entity recognition; the tokenizer and the model
# are both loaded from this same identifier.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)

# aggregation_strategy="simple" is requested so that results carry the
# "entity_group" / "word" keys that process_text reads.
ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Sentence-embedding model used to match user-typed labels against the
# predefined label names.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
|
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Args:
        file_path: Path of the PDF document to read.

    Returns:
        The text of all pages joined together with surrounding whitespace
        stripped. Failures are reported in-band rather than raised: if
        anything goes wrong while opening or reading the document, a string
        starting with ``"Error extracting text from PDF: "`` is returned.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # when reading a page fails part-way through.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        # Callers detect failure by the message prefix, so keep it stable.
        return f"Error extracting text from PDF: {str(e)}"
    return text.strip()
|
|
|
|
|
def extract_text_from_docx(file_path):
    """Return the paragraph text of the DOCX file at *file_path*.

    Paragraphs are joined with newlines and the result is stripped of
    surrounding whitespace. Any exception raised while opening or reading
    the document is converted into a string that starts with
    ``"Error extracting text from DOCX: "``.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        joined = "\n".join(paragraph_texts)
        return joined.strip()
    except Exception as exc:
        return f"Error extracting text from DOCX: {str(exc)}"
|
|
|
|
|
def calculate_similarity(input_label, predefined_labels):
    """Pick the predefined label most similar to *input_label*.

    Both the input label and the candidate labels are embedded with the
    module-level ``embedding_model`` and compared by cosine similarity.

    Args:
        input_label: The label text to match.
        predefined_labels: Sequence of candidate label strings.

    Returns:
        A ``(label, score)`` tuple: the best-scoring candidate and its
        cosine similarity as a Python float.
    """
    query_embedding = embedding_model.encode(input_label, convert_to_tensor=True)
    candidate_embeddings = embedding_model.encode(
        predefined_labels, convert_to_tensor=True
    )
    scores = util.pytorch_cos_sim(query_embedding, candidate_embeddings)

    # argmax over the whole score tensor; the score itself is read back from
    # the first row at that index.
    winner = torch.argmax(scores).item()
    winner_score = scores[0][winner].item()
    return predefined_labels[winner], winner_score
|
|
|
|
|
def map_labels_with_similarity(input_label, label_map, threshold=0.7):
    """Map a user-supplied label onto the closest key of *label_map*.

    Args:
        input_label: Label text typed by the user.
        label_map: Mapping whose keys are the predefined label names.
        threshold: Similarity a candidate must strictly exceed to count as a
            match. Defaults to 0.7.

    Returns:
        The best-matching key of *label_map*, or ``None`` when the map is
        empty or the best similarity does not exceed *threshold*.
    """
    predefined_labels = list(label_map)
    if not predefined_labels:
        # Nothing to compare against; avoid embedding an empty candidate list.
        return None

    best_match_label, similarity_score = calculate_similarity(
        input_label, predefined_labels
    )
    return best_match_label if similarity_score > threshold else None
|
|
|
|
|
def process_text(file, labels):
    """Extract named entities from an uploaded document for the given labels.

    Args:
        file: The uploaded document, given either as a path string or as an
            object exposing the path through a ``name`` attribute. ``None``
            (nothing uploaded) is reported as a message instead of crashing.
        labels: Comma-separated label names to extract. Each label is
            stripped and capitalized, then matched against the predefined
            labels by embedding similarity.

    Returns:
        A string with one ``"<label>: <entities>"`` line per requested
        label (entities de-duplicated and sorted), or a single message line
        when the file is missing or unsupported, extraction failed, or no
        labels were given.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF or DOCX file."

    file_path = file if isinstance(file, str) else file.name

    # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        return "Unsupported file type. Please upload a PDF or DOCX file."

    # The extractors report failures in-band. Match their full message prefix
    # so a document that merely begins with the word "Error" is still
    # processed.
    if text.startswith("Error extracting text from "):
        return text

    # Predefined label names mapped to the entity groups that count as a hit
    # (compared against each NER result's "entity_group").
    label_map = {
        "Name": ["PER"],
        "Organization": ["ORG"],
        "Location": ["LOC"],
        "Address": ["LOC"],
        "Project": ["MISC"],
        "Education": ["MISC"],
    }

    requested_labels = [
        part.strip().capitalize()
        for part in (labels or "").split(",")
        if part.strip()
    ]
    if not requested_labels:
        return "No valid labels provided. Please enter valid labels to extract."

    # Keyed by requested label; dict ordering keeps first-seen order and
    # collapses duplicate labels.
    extracted_info = {label: set() for label in requested_labels}

    # Resolve each requested label to its accepted entity groups once, up
    # front: the mapping does not depend on the entities, and each lookup
    # runs the embedding model.
    accepted_groups = {}
    for label in extracted_info:
        matched_label = map_labels_with_similarity(label, label_map)
        accepted_groups[label] = (
            frozenset(label_map[matched_label]) if matched_label else frozenset()
        )

    # Nothing to tag in an empty document, so skip the model call entirely.
    ner_results = ner_pipeline(text) if text else []

    for entity in ner_results:
        # Drop any word-piece continuation markers left in the entity text.
        entity_text = entity["word"].replace("##", "")
        entity_group = entity["entity_group"]
        for label, groups in accepted_groups.items():
            if entity_group in groups:
                extracted_info[label].add(entity_text)

    lines = []
    for label, entities in extracted_info.items():
        if entities:
            lines.append(f"{label}: {', '.join(sorted(entities))}")
        else:
            lines.append(f"{label}: No information found.")

    return "\n".join(lines).strip()
|
|
|
|
|
# Input and output widgets for the web UI.
file_input = gr.File(label="Upload a PDF or DOCX file")
label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
output_text = gr.Textbox(label="Extracted Information")

# Wire process_text to the widgets: (file, labels) in, result text out.
iface = gr.Interface(
    fn=process_text,
    inputs=[file_input, label_input],
    outputs=output_text,
    title="NER with Custom Labels from PDF or DOCX",
    description="Upload a PDF or DOCX file and extract entities based on custom labels.",
)

# Start the app as soon as the module is executed.
iface.launch()
|
|