Spaces:

digitiamosrl
/

document_info_extractor

Build error

App Files Community

MrFeelgoood committed on May 18, 2023

Commit

20f0ac1

•

1 Parent(s): 2de9223

Redesigned interface

Browse files

Rebuilt the Gradio interface

Files changed (1) hide show

app.py +51 -17

app.py CHANGED Viewed

@@ -214,9 +214,7 @@ def format_output(extracted_values):
     return output
-def pdf_ocr(file):
     # Convert PDF to image
     with tempfile.TemporaryDirectory() as path:
         with open(file, "rb") as f:
@@ -240,28 +238,64 @@ def pdf_ocr(file):
                 # Clear the image list to free up memory
                 del images
-    # Call extractor_clean and format_output functions
     ks = ('mq', 'metri quadri', 'm2')
-    tra = 'it5/it5-base-question-answering'
     quest = "Quanti metri quadri misura la superficie?"
     totalK = ['totale', 'complessivo', 'complessiva']
-    extracted_values = extractor_clean(text=text, k_words=ks, transformer=tra, question=quest, total_kwords=totalK, return_text=True)
-    values_output = extracted_values[0][0]  # Join values with '\n'
-    total_output = extracted_values[0][1]
     text_output = extracted_values[2]
-    return values_output, total_output, text_output
-def ocr_interface(pdf_file):
     # Call the pdf_ocr function
-    values, total, text = pdf_ocr(pdf_file.name)
     return values, total, text
-pdf_input = gr.inputs.File(label="PDF File")
-values_output = gr.outputs.Textbox(label="Mq. Values")
-total_output = gr.outputs.Textbox(label="Total")
-text_output = gr.outputs.Textbox(label="Ref. Text")
-iface = gr.Interface(fn=ocr_interface, inputs=pdf_input, title="PDF MQ EXTRACTOR", examples=["Example1.pdf", "Example2.pdf"], outputs=[values_output, total_output, text_output])
-iface.launch()

     return output
+def pdf_ocr(file, model_t, question):
     # Convert PDF to image
     with tempfile.TemporaryDirectory() as path:
         with open(file, "rb") as f:
                 # Clear the image list to free up memory
                 del images
     ks = ('mq', 'metri quadri', 'm2')
     quest = "Quanti metri quadri misura la superficie?"
     totalK = ['totale', 'complessivo', 'complessiva']
+    extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t, question=question, total_kwords=totalK, return_text=True)
+    values_output = extracted_values[0][0]
+    total_output = f'{extracted_values[0][1]}  Mq'
     text_output = extracted_values[2]
+    immobile_values = [f'{i + 1}. Immobile :  {value}  Mq\n' for i, value in enumerate(values_output)]
+    immobile_values = '\n'.join(immobile_values)
+    return immobile_values, total_output, text_output
+def ocr_interface(pdf_file, model_t, question):
     # Call the pdf_ocr function
+    values, total, text = pdf_ocr(pdf_file.name, model_t, question)
     return values, total, text
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+    '''
+    # PDF Mq Extractor
+    Set the params and switch the tabs to see the output.
+    ''')
+    with gr.Tab("Extractor", scroll_to_output = True):
+      with gr.Row():
+        pdf_input = gr.inputs.File(label="PDF File")
+      with gr.Row():
+          model_input = gr.inputs.Dropdown(['it5/it5-base-question-answering', 'it5/it5-small-question-answering'], label = 'Select model')
+          question_input = gr.inputs.Dropdown(["Quanti metri quadri misura l'immobile?"], label = 'Question')
+      with gr.Column():
+          gr.Markdown(
+          '''
+          # Output values
+          Values extracted from the pdf document
+          ''')
+      with gr.Row():
+          values_output = gr.outputs.Textbox(label="Area Values")
+          total_output = gr.outputs.Textbox(label="Total")
+      with gr.Row():
+          extract_button = gr.Button("Extract")
+    with gr.Tab("Ref. Text"):
+        text_output = gr.outputs.Textbox(label="Ref. Text")
+    extract_button.click(fn = ocr_interface, inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])
+demo.launch()