Spaces:

digitiamosrl
/

document_info_extractor

Build error

MrFeelgoood committed on May 17, 2023

Commit

5d57f7a

•

1 Parent(s): 4b491a9

Modified graphics of the UI

Files changed (1) hide show

app.py CHANGED Viewed

@@ -205,17 +205,6 @@ def extractor_clean(text, k_words, transformer, question, total_kwords, return_t
-def format_output(extracted_values):
-    output = f"Valori: {extracted_values[0][0]}\n"
-    output += f"Totale: {extracted_values[0][1]}\n"
-    if extracted_values[1] == True:
-        output += "-------------------\n"
-        output += f"Rif. Testo:\n{extracted_values[2]}"
-    return output
 def pdf_ocr(file):
     # Convert PDF to image
     with tempfile.TemporaryDirectory() as path:
@@ -243,22 +232,25 @@ def pdf_ocr(file):
     # Call extractor_clean and format_output functions
     ks = ('mq', 'metri quadri', 'm2')
     tra = 'it5/it5-base-question-answering'
-    quest = "Quanti metri quadri misura l'immobile?"
     totalK = ['totale', 'complessivo', 'complessiva']
     extracted_values = extractor_clean(text=text, k_words=ks, transformer=tra, question=quest, total_kwords=totalK, return_text=True)
-    output = format_output(extracted_values=extracted_values)
-    return output
 def ocr_interface(pdf_file):
     # Call the pdf_ocr function
-    ocr_output = pdf_ocr(pdf_file.name)
-    return ocr_output
 pdf_input = gr.inputs.File(label="PDF File")
-output_text = gr.outputs.Textbox(label="Output")
-iface = gr.Interface(fn=ocr_interface, inputs=pdf_input, outputs=output_text)
-iface.launch()

 def pdf_ocr(file):
     # Convert PDF to image
     with tempfile.TemporaryDirectory() as path:
     # Call extractor_clean and format_output functions
     ks = ('mq', 'metri quadri', 'm2')
     tra = 'it5/it5-base-question-answering'
+    quest = "Quanti metri quadri misura la superficie?"
     totalK = ['totale', 'complessivo', 'complessiva']
     extracted_values = extractor_clean(text=text, k_words=ks, transformer=tra, question=quest, total_kwords=totalK, return_text=True)
+    values_output = extracted_values[0][0]  # Join values with '\n'
+    total_output = extracted_values[0][1]
+    text_output = extracted_values[2]
+    return values_output, total_output, text_output
 def ocr_interface(pdf_file):
     # Call the pdf_ocr function
+    values, total, text = pdf_ocr(pdf_file.name)
+    return values, total, text
 pdf_input = gr.inputs.File(label="PDF File")
+# One textbox per element of the (values, total, text) tuple returned by
+# ocr_interface; Gradio maps them positionally onto the `outputs` list.
+values_output = gr.outputs.Textbox(label="Mq. Values")
+total_output = gr.outputs.Textbox(label="Total")
+text_output = gr.outputs.Textbox(label="Ref. Text")
+# Do not pass `preprocess=format_output`: format_output no longer exists in
+# this module, so referencing it raises NameError when the app is imported,
+# and `preprocess` is not a documented gr.Interface argument.
+iface = gr.Interface(fn=ocr_interface, inputs=pdf_input, title="PDF MQ EXTRACTOR", outputs=[values_output, total_output, text_output])
+iface.launch()