Spaces:
Build error
Build error
MrFeelgoood
committed on
Commit
•
20f0ac1
1
Parent(s):
2de9223
Redesigned interface
Browse files
Rebuilt the gradio interface
app.py
CHANGED
@@ -214,9 +214,7 @@ def format_output(extracted_values):
|
|
214 |
return output
|
215 |
|
216 |
|
217 |
-
|
218 |
-
|
219 |
-
def pdf_ocr(file):
|
220 |
# Convert PDF to image
|
221 |
with tempfile.TemporaryDirectory() as path:
|
222 |
with open(file, "rb") as f:
|
@@ -240,28 +238,64 @@ def pdf_ocr(file):
|
|
240 |
# Clear the image list to free up memory
|
241 |
del images
|
242 |
|
243 |
-
# Call extractor_clean and format_output functions
|
244 |
ks = ('mq', 'metri quadri', 'm2')
|
245 |
-
tra = 'it5/it5-base-question-answering'
|
246 |
quest = "Quanti metri quadri misura la superficie?"
|
247 |
totalK = ['totale', 'complessivo', 'complessiva']
|
248 |
|
249 |
-
extracted_values = extractor_clean(text=text, k_words=ks, transformer=
|
250 |
-
values_output = extracted_values[0][0]
|
251 |
-
total_output = extracted_values[0][1]
|
252 |
text_output = extracted_values[2]
|
253 |
|
254 |
-
|
|
|
255 |
|
256 |
-
|
|
|
|
|
257 |
# Call the pdf_ocr function
|
258 |
-
values, total, text = pdf_ocr(pdf_file.name)
|
259 |
return values, total, text
|
260 |
|
261 |
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
return output
|
215 |
|
216 |
|
217 |
+
def pdf_ocr(file, model_t, question):
|
|
|
|
|
218 |
# Convert PDF to image
|
219 |
with tempfile.TemporaryDirectory() as path:
|
220 |
with open(file, "rb") as f:
|
|
|
238 |
# Clear the image list to free up memory
|
239 |
del images
|
240 |
|
|
|
241 |
ks = ('mq', 'metri quadri', 'm2')
|
|
|
242 |
quest = "Quanti metri quadri misura la superficie?"
|
243 |
totalK = ['totale', 'complessivo', 'complessiva']
|
244 |
|
245 |
+
extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t, question=question, total_kwords=totalK, return_text=True)
|
246 |
+
values_output = extracted_values[0][0]
|
247 |
+
total_output = f'{extracted_values[0][1]} Mq'
|
248 |
text_output = extracted_values[2]
|
249 |
|
250 |
+
immobile_values = [f'{i + 1}. Immobile : {value} Mq\n' for i, value in enumerate(values_output)]
|
251 |
+
immobile_values = '\n'.join(immobile_values)
|
252 |
|
253 |
+
return immobile_values, total_output, text_output
|
254 |
+
|
255 |
+
def ocr_interface(pdf_file, model_t, question):
|
256 |
# Call the pdf_ocr function
|
257 |
+
values, total, text = pdf_ocr(pdf_file.name, model_t, question)
|
258 |
return values, total, text
|
259 |
|
260 |
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
265 |
+
|
266 |
+
gr.Markdown(
|
267 |
+
'''
|
268 |
+
# PDF Mq Extractor
|
269 |
+
Set the params and switch the tabs to see the output.
|
270 |
+
''')
|
271 |
+
with gr.Tab("Extractor", scroll_to_output = True):
|
272 |
+
with gr.Row():
|
273 |
+
pdf_input = gr.inputs.File(label="PDF File")
|
274 |
+
|
275 |
+
with gr.Row():
|
276 |
+
model_input = gr.inputs.Dropdown(['it5/it5-base-question-answering', 'it5/it5-small-question-answering'], label = 'Select model')
|
277 |
+
question_input = gr.inputs.Dropdown(["Quanti metri quadri misura l'immobile?"], label = 'Question')
|
278 |
+
|
279 |
+
with gr.Column():
|
280 |
+
gr.Markdown(
|
281 |
+
'''
|
282 |
+
# Output values
|
283 |
+
Values extracted from the pdf document
|
284 |
+
''')
|
285 |
+
|
286 |
+
with gr.Row():
|
287 |
+
|
288 |
+
values_output = gr.outputs.Textbox(label="Area Values")
|
289 |
+
total_output = gr.outputs.Textbox(label="Total")
|
290 |
+
with gr.Row():
|
291 |
+
extract_button = gr.Button("Extract")
|
292 |
+
|
293 |
+
with gr.Tab("Ref. Text"):
|
294 |
+
text_output = gr.outputs.Textbox(label="Ref. Text")
|
295 |
+
|
296 |
+
extract_button.click(fn = ocr_interface, inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])
|
297 |
+
|
298 |
+
|
299 |
+
demo.launch()
|
300 |
+
|
301 |
+
|