dinhquangson committed on
Commit
f47a6ae
1 Parent(s): 768def6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -101
app.py CHANGED
@@ -4,7 +4,6 @@ from datasets import load_dataset
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import pdfplumber
6
  import pytesseract
7
- from transformers import AutoModel, AutoTokenizer
8
 
9
  from models import Invoice
10
 
@@ -27,23 +26,8 @@ app.add_middleware(
27
  allow_methods=["*"],
28
  allow_headers=["*"],
29
  )
30
- # Load model
31
- model_path = 'openbmb/MiniCPM-Llama3-V-2_5'
32
- if 'int4' in model_path:
33
- if device == 'mps':
34
- print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
35
- exit()
36
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
37
- else:
38
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16)
39
- model = model.to(device=device)
40
- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
41
- model.eval()
42
-
43
-
44
-
45
- ERROR_MSG = "Error, please retry"
46
- model_name = 'MiniCPM-Llama3-V 2.5'
47
  NUM_PROC = os.cpu_count()
48
  parent_path = dirname(getcwd())
49
 
@@ -358,89 +342,6 @@ async def convert_upload_file(file: UploadFile = File(...)):
358
  )
359
 
360
  return {'content':text,'metadate':completion.choices[0].message.content}
361
-
362
- @app.post("/large_pdf2text/")
363
- async def upload_large_file(file: UploadFile = File(...)):
364
- import pytesseract
365
- from pdf2image import convert_from_path
366
- from octoai.client import OctoAI
367
- from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
368
- from pathlib import Path
369
-
370
- from haystack.components.generators import OpenAIGenerator
371
- from haystack.utils import Secret
372
- from haystack.components.builders import PromptBuilder
373
- from haystack import Document
374
- from haystack import Pipeline
375
- from haystack.document_stores.in_memory import InMemoryDocumentStore
376
- from haystack.components.converters.txt import TextFileToDocument
377
- from haystack.components.preprocessors import DocumentCleaner
378
- from haystack.components.preprocessors import DocumentSplitter
379
- from haystack.components.writers import DocumentWriter
380
-
381
- prompt_builder = PromptBuilder(template=template)
382
- generator = OpenAIGenerator(
383
- api_key=Secret.from_env_var("OCTOAI_TOKEN"),
384
- api_base_url="https://text.octoai.run/v1",
385
- model="meta-llama-3-70b-instruct",
386
- generation_kwargs = {"max_tokens": 512}
387
- )
388
- document_store = InMemoryDocumentStore()
389
- p = Pipeline()
390
- p.add_component(instance=TextFileToDocument(), name="text_file_converter")
391
- p.add_component(instance=DocumentCleaner(), name="cleaner")
392
- p.add_component(instance=DocumentSplitter(split_by="passage", split_length=2), name="splitter")
393
- p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
394
- p.add_component("prompt_builder", prompt_builder)
395
- p.add_component("llm", generator)
396
- p.connect("text_file_converter.documents", "cleaner.documents")
397
- p.connect("cleaner.documents", "splitter.documents")
398
- p.connect("splitter.documents", "writer.documents")
399
- p.connect("writer.documents", "prompt_builder.documents")
400
- p.connect("prompt_builder", "llm")
401
-
402
- file_savePath = join(temp_path,file.filename)
403
-
404
- with open(file_savePath,'wb') as f:
405
- shutil.copyfileobj(file.file, f)
406
- # convert PDF to image
407
- images = convert_from_path(file_savePath)
408
-
409
- text=""
410
- first_page = ""
411
-
412
- # Extract text from images
413
- for image in images:
414
- ocr_text = pytesseract.image_to_string(image,lang='vie')
415
- if first_page=="":
416
- first_page = truncate_text(ocr_text)
417
- text=text+ocr_text+'\n'
418
-
419
- path = file_savePath+".txt"
420
- with open(path,'wb') as f:
421
- f.write(text)
422
-
423
- files = [path]
424
- p.run({"text_file_converter": {"sources": files},
425
- "prompt_builder": {"question": "Sử dụng tiếng Việt để trích thông tin từ hóa đơn sau đó trả ra dưới dạng JSON, Trong bảng chi tiết hóa đơn bỏ qua dòng có các ô [A,B,C,1,2,3=1x2]"}})
426
- client = OctoAI()
427
-
428
- completion = client.text_gen.create_chat_completion(
429
- model="meta-llama-3-70b-instruct",
430
- messages=[
431
- ChatMessage(role="system", content="You are a helpful assistant."),
432
- ChatMessage(role="user", content=first_page),
433
- ],
434
- presence_penalty=0,
435
- temperature=0.1,
436
- top_p=0.9,
437
- response_format=ChatCompletionResponseFormat(
438
- type="json_object",
439
- schema=Invoice.model_json_schema(),
440
- ),
441
- )
442
-
443
- return {'content':text,'metadate':completion.choices[0].message.content}
444
 
445
  def image2metadata(image):
446
  try:
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
  import pdfplumber
6
  import pytesseract
 
7
 
8
  from models import Invoice
9
 
 
26
  allow_methods=["*"],
27
  allow_headers=["*"],
28
  )
29
+
30
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  NUM_PROC = os.cpu_count()
32
  parent_path = dirname(getcwd())
33
 
 
342
  )
343
 
344
  return {'content':text,'metadate':completion.choices[0].message.content}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  def image2metadata(image):
347
  try: