Spaces:
Sleeping
Sleeping
dinhquangson
committed on
Commit
•
816de41
1
Parent(s):
48a3716
Update app.py
Browse files
app.py
CHANGED
@@ -269,7 +269,7 @@ async def download_database():
|
|
269 |
return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
|
270 |
|
271 |
@app.post("/pdf2text/")
|
272 |
-
async def
|
273 |
import pytesseract
|
274 |
from pdf2image import convert_from_path
|
275 |
|
@@ -288,6 +288,78 @@ async def create_upload_file(file: UploadFile = File(...)):
|
|
288 |
text=text+ocr_text+'\n'
|
289 |
|
290 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
@app.get("/")
|
292 |
def api_home():
    """Return the static welcome payload for the API."""
    welcome = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome
|
|
|
269 |
return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
|
270 |
|
271 |
@app.post("/pdf2text/")
|
272 |
+
async def convert_upload_file(file: UploadFile = File(...)):
|
273 |
import pytesseract
|
274 |
from pdf2image import convert_from_path
|
275 |
|
|
|
288 |
text=text+ocr_text+'\n'
|
289 |
|
290 |
return text
|
291 |
+
|
292 |
+
def get_type_name(element):
    """Return the name of *element*'s class as a string."""
    element_class = type(element)
    return element_class.__name__
|
294 |
+
|
295 |
+
def filter_by_type(elements, type):
    """Return the elements whose class name equals *type*, in input order.

    Args:
        elements: Iterable of objects to inspect.
        type: Class name to keep, compared against ``get_type_name(element)``.

    Returns:
        A new list holding the matching elements.
    """
    # The ``type`` parameter shadows the builtin here, so the class name
    # must be obtained through get_type_name() rather than type(...).
    matching = []
    for candidate in elements:
        if get_type_name(candidate) == type:
            matching.append(candidate)
    return matching
|
297 |
+
|
298 |
+
import re
|
299 |
+
|
300 |
+
def extract_value_from_text(text, format):
    """Return the first substring of *text* matching the regex *format*.

    Args:
        text: String to search.
        format: Regular expression pattern to search for.

    Returns:
        The entire matched substring, or ``None`` when nothing matches.
    """
    found = re.search(format, text)
    if found is None:
        return None
    # group(0) is the whole match, regardless of any capture groups.
    return found.group(0)
|
307 |
+
|
308 |
+
def filter_by_labels(elements, labels, format):
    """Extract a value from the first element whose text mentions a label.

    Elements are scanned in order. For the first element whose ``text``
    contains any of *labels* (case-insensitive), the result of
    ``extract_value_from_text(element.text, format)`` is returned, even if
    that result is ``None``.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute.
        labels: Iterable of label strings to look for.
        format: Regular expression handed to ``extract_value_from_text``.

    Returns:
        The extracted value, or ``None`` when no element mentions a label.
    """
    for element in elements:
        mentions_label = any(
            label.lower() in element.text.lower() for label in labels
        )
        if mentions_label:
            return extract_value_from_text(element.text, format)
    return None
|
314 |
+
|
315 |
+
def filter_by_values(elements, values):
    """Return the first candidate value that occurs in an element's text.

    Elements are scanned in order, and for each element the candidates are
    tried in order; matching is a case-insensitive substring test.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute.
        values: Iterable of candidate strings.

    Returns:
        The matching candidate exactly as given in *values*, or ``None``
        when no candidate occurs in any element.
    """
    hits = (
        value
        for element in elements
        for value in values
        if value.lower() in element.text.lower()
    )
    return next(hits, None)
|
321 |
+
|
322 |
+
def get_elements_by_schemas(elements, schemas):
    """Extract one named value per schema from a list of document elements.

    Each schema is a dict that must contain ``name`` and ``layout_type``.
    Elements are first narrowed to those whose class name equals
    ``layout_type``; the value is then chosen as follows:

    * ``labels`` present: ``filter_by_labels`` is applied with the schema's
      ``labels`` and ``format`` (``format`` is required in this case).
    * otherwise ``values`` present: ``filter_by_values`` is applied.
    * otherwise: the ``text`` of the first element of that layout type.

    Any other schema keys are ignored by this function.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute.
        schemas: Iterable of schema dicts as described above.

    Returns:
        A list of single-entry dicts ``{schema['name']: value}``, in schema
        order. Schemas that yield no value are omitted.

    Raises:
        KeyError: If a schema lacks ``name``/``layout_type``, or has
            ``labels`` without ``format``.
    """
    result_elements = []
    for schema in schemas:
        candidates = filter_by_type(elements, schema['layout_type'])

        if 'labels' in schema:
            value = filter_by_labels(candidates, schema['labels'], schema['format'])
            if value is None:
                continue
        elif 'values' in schema:
            value = filter_by_values(candidates, schema['values'])
            if value is None:
                continue
        elif candidates:
            # filter_by_type always returns a list, so emptiness (not None)
            # is what signals "no element of this layout type"; indexing an
            # empty list here used to raise IndexError.
            value = candidates[0].text
        else:
            continue

        result_elements.append({schema['name']: value})

    return result_elements
|
343 |
+
|
344 |
+
|
345 |
+
@app.post("/pdf2metadata/")
|
346 |
+
async def extract_upload_file(file: UploadFile = File(...)):
    """Save an uploaded PDF, partition it, and return schema-based metadata.

    The upload is written under ``temp_path``, parsed with
    ``unstructured``'s ``partition_pdf`` (``languages=["vie"]``), and the
    resulting elements are passed to ``get_elements_by_schemas`` together
    with a fixed list of schemas.

    Args:
        file: The uploaded file received from the request.

    Returns:
        Whatever ``get_elements_by_schemas`` returns for the parsed elements.
    """
    from os.path import basename

    from unstructured.partition.pdf import partition_pdf

    # The client controls file.filename; keep only its final path component
    # so a name such as "../../x" cannot escape temp_path.
    file_savePath = join(temp_path, basename(file.filename))

    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)

    # Returns a List[Element] present in the pages of the parsed pdf document
    elements = partition_pdf(file_savePath, languages=["vie"])

    # NOTE(review): the second schema uses the key 'label', but
    # get_elements_by_schemas only looks for 'labels' (plus 'format'), so this
    # schema currently yields the text of the first 'Text' element. 'position'
    # and 'from_last' are not read there either. Confirm the intended keys.
    schemas = [
        {'name': 'publisher', 'layout_type': 'Title', 'position': 0, 'from_last': False},
        {'name': 'number', 'layout_type': 'Text', 'position': 0, 'from_last': False, 'label': ['Số', 'Luật số']},
    ]

    # The list was previously bound to ``schema`` while ``schemas`` was passed,
    # which raised NameError on every request.
    return get_elements_by_schemas(elements, schemas)
|
362 |
+
|
363 |
@app.get("/")
|
364 |
def api_home():
    """Return the static welcome payload for the API."""
    welcome = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome
|