Extraction_Detail_Page_Summary

Sleeping

App Files Files Community

andreeabodea committed on Apr 16

Commit

3b44ece

•

1 Parent(s): 536f374

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -67

app.py CHANGED Viewed

@@ -1,62 +1,82 @@
 import os
 import pdfplumber
 import re
-import gradio as gr
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
-from io import BytesIO
-import torch
-"""
-Extract the text from a section of a PDF file  between 'wanted_section' and 'next_section'.
-Parameters:
-- path (str): The file path to the PDF file.
-- wanted_section (str): The section to start extracting text from.
-- next_section (str): The section to stop extracting text at.
-Returns:
-- text (str): The extracted text from the specified section range.
-"""
def get_section(path, wanted_section, next_section):
    """
    Extract the text of the pages spanning 'wanted_section' to 'next_section'.

    Parameters:
    - path (bytes): Raw PDF content; it is wrapped in BytesIO before being opened.
    - wanted_section (str): Heading searched for (case-insensitively) on every page.
    - next_section (str): Heading that marks where the wanted section ends.

    Returns:
    - text (str): Text of every page from the last page containing
      'wanted_section' through the last page containing 'next_section',
      joined with spaces and with newlines replaced by spaces.

    Raises:
    - ValueError: If either heading is not found on any page.
    """
    print(wanted_section)
    # The context manager closes the PDF even when extraction fails.
    with pdfplumber.open(BytesIO(path)) as doc:
        start_pages = []
        end_pages = []
        for page_num, page in enumerate(doc.pages):
            if page.search(wanted_section, return_chars=False, case=False):
                start_pages.append(page_num)
            if page.search(next_section, return_chars=False, case=False):
                end_pages.append(page_num)
        # Fail with a readable message instead of max() complaining about an empty list.
        if not start_pages:
            raise ValueError(f"Section heading not found in PDF: {wanted_section!r}")
        if not end_pages:
            raise ValueError(f"Section heading not found in PDF: {next_section!r}")
        # The last occurrence of each heading is used, exactly as before.
        page_texts = [
            doc.pages[page_num].extract_text() or ""
            for page_num in range(max(start_pages), max(end_pages) + 1)
        ]
    return " ".join(page_texts).replace("\n", " ")
def extract_between(big_string, start_string, end_string):
    """
    Return the text enclosed by two literal marker strings.

    Parameters:
    - big_string (str): Text to search.
    - start_string (str): Literal marker that precedes the wanted text.
    - end_string (str): Literal marker that follows the wanted text.

    Returns:
    - str or None: The shortest span between the first 'start_string' and the
      next 'end_string' (markers excluded, newlines allowed), or None when
      the markers are not found in that order.
    """
    found = re.search(
        re.escape(start_string) + '(.*?)' + re.escape(end_string),
        big_string,
        re.DOTALL,
    )
    return found.group(1) if found else None
 def format_section1(section1_text):
     result_section1_dict = {}
     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
@@ -65,69 +85,66 @@ def format_section1(section1_text):
     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
     return result_section1_dict
def answer_questions(text,language="de"):
    """
    Ask a fixed list of German questions about 'text' with a QA model.

    Parameters:
    - text (str): Context passed to the question-answering pipeline.
    - language (str): Accepted but not used by this function.

    Returns:
    - dict: Maps each question string to the pipeline's 'answer' field.
    """
    checkpoint = "deepset/gelectra-large-germanquad"
    qa_pipeline = pipeline(
        "question-answering",
        model=AutoModelForQuestionAnswering.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
    questions = (
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        "Zu welchem Program oder EZ-Programm gehort das Projekt?",
    )
    # One pipeline call per question, in list order.
    return {
        question: qa_pipeline(question=question, context=text)['answer']
        for question in questions
    }
def process_pdf(path):
    """
    Answer the predefined questions about section 1 of the uploaded PDF.

    Parameters:
    - path: PDF input forwarded unchanged to get_section.

    Returns:
    - dict: The question-to-answer mapping produced by answer_questions.
    """
    # Section 1 runs from its own heading up to the heading of section 2.
    short_description = get_section(
        path, "1. Kurzbeschreibung", "2. Einordnung des Moduls"
    )
    return answer_questions(short_description)
def get_first_page_text(file_data):
    """
    Return the extracted text of the first page of a PDF.

    Parameters:
    - file_data (bytes): Raw PDF content; it is wrapped in BytesIO before being opened.

    Returns:
    - str or None: Text of page 0, or None when the document has no pages.
    """
    # The context manager releases the PDF handle that was previously leaked.
    with pdfplumber.open(BytesIO(file_data)) as doc:
        if not doc.pages:
            return None
        return doc.pages[0].extract_text()
 if __name__ == "__main__":
-    # Define the Gradio interface
-    # iface = gr.Interface(fn=process_pdf,
-    demo = gr.Interface(fn=process_pdf,
                      inputs=gr.File(type="binary", label="Upload PDF"),
                      outputs=gr.Textbox(label="Extracted Text"),
                      title="PDF Text Extractor",
                      description="Upload a PDF file to extract.")
     demo.launch()

+import gradio as gr
+from io import BytesIO
+import torch
 import os
 import pdfplumber
 import re
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+from transformers import BertTokenizer, EncoderDecoderModel
def process_pdf(path):
    """
    Read the report sections of interest from a PDF.

    Parameters:
    - path: PDF input forwarded unchanged to read_section.

    Returns:
    - dict: Maps each full section title to the string returned by
      read_section for that section, in document order.
    """
    # (result key, heading that opens the section, heading that closes it)
    # Sections 2, 2.2 and 6 are intentionally not extracted.
    section_bounds = (
        ("1. Kurzbeschreibung",
         "1. Kurzbeschreibung",
         "2. Einordnung des Moduls"),
        ("3. Entwicklungen im Interventionsbereich",
         "3. Entwicklungen im Interventionsbereich",
         "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren"),
        ("4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren",
         "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren",
         "4.2"),
        ("4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums",
         "4.2",
         "4.3"),
        ("4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit des Vorhabens",
         "4.3",
         "4.4 Laufzeit und Zeitplan"),
        ("4.4 Laufzeit und Zeitplan",
         "4.4 Laufzeit und Zeitplan",
         "4.5"),
        ("4.5 Entstandene Kosten und Kostenverschiebungen",
         "4.5",
         "4.6"),
        ("4.6 Bewertung der Wirkungen und Risiken",
         "4.6",
         "5. Übergeordnete Empfehlungen"),
        ("5. Übergeordnete Empfehlungen",
         "5. Übergeordnete Empfehlungen",
         "5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige"),
        ("5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige EZ-Programme interessant sein könnten",
         "5.2 Lernerfahrungen",
         "6. Testat"),
    )
    results_dict = {}
    for title, start_heading, end_heading in section_bounds:
        results_dict[title] = read_section(path, start_heading, end_heading)
    return results_dict
def read_section(path, wanted_section, next_section):
    """
    Extract one section of a PDF, delimited by two headings.

    Parameters:
    - path (str, bytes or file object): PDF location, raw PDF bytes, or an
      open binary file. Raw bytes are wrapped in BytesIO before opening.
    - wanted_section (str): Heading that opens the section (matched literally,
      case-insensitively, when locating pages).
    - next_section (str): Heading that closes the section.

    Returns:
    - str: 'wanted_section' followed by the text found between the two
      headings, with newlines replaced by spaces. When the headings cannot be
      matched in the extracted text, only 'wanted_section' is returned.

    Raises:
    - ValueError: If either heading is not found on any page.
    """
    # The app's gr.File(type="binary") input hands over bytes, which
    # pdfplumber.open cannot take directly.
    source = BytesIO(path) if isinstance(path, (bytes, bytearray)) else path
    # Page.search treats a string as a regex; escape it so "4.2" only matches "4.2".
    start_pattern = re.escape(wanted_section)
    end_pattern = re.escape(next_section)

    # The context manager closes the PDF even when extraction fails.
    with pdfplumber.open(source) as doc:
        start_pages = []
        end_pages = []
        for page_num, page in enumerate(doc.pages):
            if page.search(start_pattern, return_chars=False, case=False):
                start_pages.append(page_num)
            if page.search(end_pattern, return_chars=False, case=False):
                end_pages.append(page_num)
        # Fail with a readable message instead of max() complaining about an empty list.
        if not start_pages:
            raise ValueError(f"Section heading not found in PDF: {wanted_section!r}")
        if not end_pages:
            raise ValueError(f"Section heading not found in PDF: {next_section!r}")
        # The last occurrence of each heading is used, exactly as before.
        page_texts = [
            doc.pages[page_num].extract_text() or ""
            for page_num in range(max(start_pages), max(end_pages) + 1)
        ]

    # str.replace returns a new string; the result was previously thrown away.
    text = " ".join(page_texts).replace("\n", " ")
    body = extract_between(text, wanted_section, next_section)
    # Avoid appending the literal text "None" when nothing was matched.
    return wanted_section + (body if body is not None else "")
def extract_between(text, start_string, end_string):
    """
    Return the text enclosed by two literal marker strings.

    Parameters:
    - text (str): Text to search.
    - start_string (str): Literal marker that precedes the wanted text.
    - end_string (str): Literal marker that follows the wanted text.

    Returns:
    - str or None: The shortest span between the first 'start_string' and the
      next 'end_string' (markers excluded, newlines allowed), or None when
      the markers are not found in that order.
    """
    delimited = re.escape(start_string) + '(.*?)' + re.escape(end_string)
    found = re.search(delimited, text, re.DOTALL)
    if found is None:
        return None
    return found.group(1)
 def format_section1(section1_text):
     result_section1_dict = {}
     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
     return result_section1_dict
def initialize_question_answering():
    """
    Build the question-answering pipeline used by the app.

    Returns:
    - A transformers "question-answering" pipeline backed by the
      "deepset/gelectra-large-germanquad" model and tokenizer, both of
      which are loaded on every call.
    """
    checkpoint = "deepset/gelectra-large-germanquad"
    qa_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    qa_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)
def answer_questions_section_1(text, language="de"):
    """
    Ask a fixed list of German questions about the text of section 1.

    Every question and its answer are also printed to stdout.

    Parameters:
    - text (str): Context passed to the question-answering pipeline.
    - language (str): Accepted but not used by this function.

    Returns:
    - dict: Maps each question string to the pipeline's 'answer' field.
    """
    qa_pipeline = initialize_question_answering()
    questions = (
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        "Zu welchem Program oder Programm gehort das Projekt?",
        "Wurde das Ziel des Moduls erreicht?",
        "Welche ist die Risikoeinschätzung des Moduls?",
        "Ist die Maßnahme im Zeitplan?",
    )
    answers_dict = {}
    for question in questions:
        answer = qa_pipeline(question=question, context=text)['answer']
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")
        answers_dict[question] = answer
    return answers_dict
def summarize_german_text(text):
    """
    Summarize German text with a BERT encoder-decoder model.

    Parameters:
    - text (str): Text to summarize; it is truncated to 512 tokens.

    Returns:
    - str: The generated summary (beam search with 4 beams, at most 200
      tokens), or an empty string when 'text' is None or blank.
    """
    # Nothing to summarize; skip loading the model altogether.
    if text is None or not text.strip():
        return ""

    model_name = "mrm8488/bert2bert_shared-german-finetuned-summarization"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = EncoderDecoderModel.from_pretrained(model_name)
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    summary_ids = model.generate(
        inputs['input_ids'],
        # The input is padded to max_length, so the mask is needed to keep
        # the encoder from attending to padding tokens.
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=200,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def extact_details(path):
    """
    Run the full extraction on an uploaded PDF and format the result as text.

    Section 1 is queried with the question-answering model; sections 4.1,
    4.2, 4.6 and 5 are summarized.

    Parameters:
    - path: The uploaded PDF as delivered by the Gradio File input; it is
      forwarded unchanged to process_pdf. The function previously took no
      argument and read an undefined name.

    Returns:
    - str: One "key: value" line per answered question and per section
      summary, for display in the output text box.
    """
    sections_dict = process_pdf(path)
    results = answer_questions_section_1(sections_dict["1. Kurzbeschreibung"])

    # (label shown in the output, key of the section text to summarize)
    summary_sources = (
        ("Section 4.1 summary", "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren"),
        ("Section 4.2 summary", "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums"),
        ("Section 4.6 summary", "4.6 Bewertung der Wirkungen und Risiken"),
        ("Section 5.1 summary", "5. Übergeordnete Empfehlungen"),
    )
    for label, section_title in summary_sources:
        results[label] = summarize_german_text(sections_dict[section_title])

    # Without a return value the Gradio text box would stay empty.
    return "\n".join(f"{key}: {answer}" for key, answer in results.items())
 if __name__ == "__main__":
+    demo = gr.Interface(fn=extact_details,
                      inputs=gr.File(type="binary", label="Upload PDF"),
                      outputs=gr.Textbox(label="Extracted Text"),
                      title="PDF Text Extractor",
                      description="Upload a PDF file to extract.")
     demo.launch()