dinhquangson committed on
Commit
816de41
1 Parent(s): 48a3716

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -1
app.py CHANGED
@@ -269,7 +269,7 @@ async def download_database():
269
  return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
270
 
271
  @app.post("/pdf2text/")
272
- async def create_upload_file(file: UploadFile = File(...)):
273
  import pytesseract
274
  from pdf2image import convert_from_path
275
 
@@ -288,6 +288,78 @@ async def create_upload_file(file: UploadFile = File(...)):
288
  text=text+ocr_text+'\n'
289
 
290
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  @app.get("/")
292
  def api_home():
293
  return {'detail': 'Welcome to FastAPI Qdrant importer!'}
 
269
  return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
270
 
271
  @app.post("/pdf2text/")
272
+ async def convert_upload_file(file: UploadFile = File(...)):
273
  import pytesseract
274
  from pdf2image import convert_from_path
275
 
 
288
  text=text+ocr_text+'\n'
289
 
290
  return text
291
+
292
def get_type_name(element):
    """Return the name of ``element``'s class as a string."""
    element_class = type(element)
    return element_class.__name__
294
+
295
def filter_by_type(elements, type):
    """Return the elements whose class name equals ``type``.

    Args:
        elements: Iterable of objects to filter.
        type: Class name (a string) compared against ``get_type_name`` of
            each element.

    Returns:
        A new list with the matching elements, in their original order.
    """
    wanted_name = type
    matching = []
    for candidate in elements:
        if get_type_name(candidate) == wanted_name:
            matching.append(candidate)
    return matching
297
+
298
+ import re
299
+
300
def extract_value_from_text(text, format):
    """Return the first substring of ``text`` matched by the regex ``format``.

    Args:
        text: String to search.
        format: Regular expression describing the value to extract.

    Returns:
        The full text of the first match, or ``None`` when nothing matches.
    """
    found = re.search(format, text)
    if found is None:
        return None
    # group(0) is the entire match, regardless of any capture groups.
    return found.group(0)
307
+
308
def filter_by_labels(elements, labels, format):
    """Extract a value from the first labelled element that actually holds one.

    An element is a candidate when its ``text`` contains any of ``labels``
    (case-insensitive). The value is the first substring of that text matched
    by the regex ``format``.

    Args:
        elements: Iterable of objects exposing a ``text`` string attribute.
        labels: Iterable of label strings to look for.
        format: Regular expression describing the value to extract.

    Returns:
        The matched substring, or ``None`` when no labelled element contains
        a value matching ``format``.
    """
    # Loop-invariant work: compile the pattern and lowercase the labels once.
    pattern = re.compile(format)
    lowered_labels = [label.lower() for label in labels]

    for element in elements:
        text = element.text
        lowered_text = text.lower()
        if not any(label in lowered_text for label in lowered_labels):
            continue
        match = pattern.search(text)
        if match:
            return match.group(0)
        # The label is present but no value matched here; keep looking
        # instead of giving up on the first labelled element.
    return None
314
+
315
def filter_by_values(elements, values):
    """Return the first of ``values`` found in any element's text.

    Elements are scanned in order; for each element the candidate values are
    tried in order, compared case-insensitively as substrings of
    ``element.text``.

    Args:
        elements: Iterable of objects exposing a ``text`` string attribute.
        values: Candidate strings to look for.

    Returns:
        The matching entry of ``values`` (as given, not as found in the
        text), or ``None`` when nothing matches.
    """
    hits = (
        value
        for element in elements
        for value in values
        if value.lower() in element.text.lower()
    )
    return next(hits, None)
321
+
322
def get_elements_by_schemas(elements, schemas):
    """Extract one value per schema from a list of parsed document elements.

    Each schema is a dict read as follows:

    * ``'name'``: key under which the extracted value is reported.
    * ``'layout_type'``: class name used to pre-filter ``elements``.
    * ``'labels'`` (with ``'format'``): if present, the value is extracted
      via ``filter_by_labels`` using the regex in ``'format'``.
    * ``'values'``: otherwise, if present, the value comes from
      ``filter_by_values``.
    * neither: the value is the text of the first element of that type.

    Args:
        elements: Parsed elements; each exposes a ``text`` attribute.
        schemas: Iterable of schema dicts as described above.

    Returns:
        A list of single-entry dicts ``{name: value}`` in schema order.
        Schemas for which no value was found are omitted.
    """
    result_elements = []
    for schema in schemas:
        typed_elements = filter_by_type(elements, schema['layout_type'])

        if 'labels' in schema:
            value = filter_by_labels(
                typed_elements, schema['labels'], schema['format']
            )
        elif 'values' in schema:
            value = filter_by_values(typed_elements, schema['values'])
        elif typed_elements:
            value = typed_elements[0].text
        else:
            # No element of the requested type. filter_by_type returns a
            # list (never None), so an emptiness check is what guards the
            # [0] access above from raising IndexError.
            value = None

        if value is not None:
            result_elements.append({schema['name']: value})

    return result_elements
343
+
344
+
345
@app.post("/pdf2metadata/")
async def extract_upload_file(file: UploadFile = File(...)):
    """Save an uploaded PDF under ``temp_path`` and return extracted metadata.

    The file is partitioned with ``unstructured`` (Vietnamese language hint)
    and the resulting elements are matched against a fixed list of schemas.

    Args:
        file: The uploaded PDF.

    Returns:
        The list produced by ``get_elements_by_schemas`` for the schemas
        defined below.
    """
    from os.path import basename

    from unstructured.partition.pdf import partition_pdf

    # file.filename is client-controlled: keep only its last path component
    # so names containing directory parts cannot escape temp_path.
    safe_name = basename(file.filename or "") or "upload.pdf"
    file_savePath = join(temp_path, safe_name)

    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)

    # Returns a List[Element] present in the pages of the parsed pdf document
    elements = partition_pdf(file_savePath, languages=["vie"])

    # NOTE(review): get_elements_by_schemas reads 'labels' (plural) together
    # with a 'format' regex; the 'label' key below is not read, so 'number'
    # currently resolves to the text of the first 'Text' element. 'position'
    # and 'from_last' are not read there either. Confirm the intended regex
    # before switching to 'labels'.
    schemas = [
        {'name': 'publisher', 'layout_type': 'Title', 'position': 0, 'from_last': False},
        {'name': 'number', 'layout_type': 'Text', 'position': 0, 'from_last': False, 'label': ['Số', 'Luật số']},
    ]

    # The list was previously bound to `schema` while `schemas` was passed
    # here, which raised NameError on every request.
    return get_elements_by_schemas(elements, schemas)
362
+
363
  @app.get("/")
364
  def api_home():
365
  return {'detail': 'Welcome to FastAPI Qdrant importer!'}