dinhquangson committed on
Commit
768def6
1 Parent(s): a7ec673

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -1
app.py CHANGED
@@ -210,7 +210,7 @@ def search(prompt: str):
210
 
211
 
212
  template = """
213
- Với thông tin sau, hãy trả lời câu hỏi bằng tiếng ViệtViệt.
214
 
215
  Bối cảnh: {% for document in documents %}
216
 
@@ -359,6 +359,89 @@ async def convert_upload_file(file: UploadFile = File(...)):
359
 
360
  return {'content':text,'metadate':completion.choices[0].message.content}
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  def image2metadata(image):
363
  try:
364
  image = image.convert('RGB')
 
210
 
211
 
212
  template = """
213
+ Với thông tin sau, hãy trả lời câu hỏi bằng tiếng Việt.
214
 
215
  Bối cảnh: {% for document in documents %}
216
 
 
359
 
360
  return {'content':text,'metadate':completion.choices[0].message.content}
361
 
362
@app.post("/large_pdf2text/")
async def upload_large_file(file: UploadFile = File(...)):
    """OCR an uploaded PDF and extract invoice metadata from its first page.

    The upload is saved under ``temp_path``, rendered to page images, and each
    page is OCR'd with Tesseract using the Vietnamese language pack. The
    combined text is written next to the PDF as ``<name>.txt`` and fed through
    a Haystack pipeline (convert -> clean -> split -> store -> prompt -> LLM).
    Separately, the truncated text of the first page is sent to OctoAI with a
    JSON response format constrained by ``Invoice.model_json_schema()``.

    Args:
        file: The uploaded PDF.

    Returns:
        A dict with ``'content'`` (the OCR text of all pages, one trailing
        newline per page) and ``'metadate'`` (the message content of the first
        OctoAI completion choice).
    """
    import pytesseract
    from pdf2image import convert_from_path
    from octoai.client import OctoAI
    from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
    from pathlib import Path

    from haystack.components.generators import OpenAIGenerator
    from haystack.utils import Secret
    from haystack.components.builders import PromptBuilder
    from haystack import Pipeline
    from haystack.document_stores.in_memory import InMemoryDocumentStore
    from haystack.components.converters.txt import TextFileToDocument
    from haystack.components.preprocessors import DocumentCleaner
    from haystack.components.preprocessors import DocumentSplitter
    from haystack.components.writers import DocumentWriter

    prompt_builder = PromptBuilder(template=template)
    generator = OpenAIGenerator(
        api_key=Secret.from_env_var("OCTOAI_TOKEN"),
        api_base_url="https://text.octoai.run/v1",
        model="meta-llama-3-70b-instruct",
        generation_kwargs={"max_tokens": 512},
    )
    document_store = InMemoryDocumentStore()

    p = Pipeline()
    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
    p.add_component(instance=DocumentCleaner(), name="cleaner")
    p.add_component(instance=DocumentSplitter(split_by="passage", split_length=2), name="splitter")
    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
    p.add_component("prompt_builder", prompt_builder)
    p.add_component("llm", generator)
    p.connect("text_file_converter.documents", "cleaner.documents")
    p.connect("cleaner.documents", "splitter.documents")
    p.connect("splitter.documents", "writer.documents")
    # DocumentWriter only outputs `documents_written` (a count), so the prompt
    # builder takes its documents straight from the splitter instead.
    p.connect("splitter.documents", "prompt_builder.documents")
    p.connect("prompt_builder", "llm")

    # Keep only the final path component so a crafted filename such as
    # "../../x" cannot make us write outside temp_path.
    safe_name = Path(file.filename or "").name
    if safe_name in ("", ".", ".."):
        safe_name = "upload.pdf"
    file_savePath = join(temp_path, safe_name)

    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)

    # convert PDF to image
    images = convert_from_path(file_savePath)

    # Extract text from images
    page_texts = []
    first_page = ""
    for image in images:
        ocr_text = pytesseract.image_to_string(image, lang='vie')
        # Stays empty until a page yields some truncated text, so a blank
        # leading page falls through to the next one.
        if first_page == "":
            first_page = truncate_text(ocr_text)
        page_texts.append(ocr_text)
    text = "".join(page + '\n' for page in page_texts)

    # The OCR result is a str, so the file must be opened in text mode; UTF-8
    # keeps the Vietnamese diacritics intact for TextFileToDocument.
    path = file_savePath + ".txt"
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

    files = [path]
    # NOTE(review): the pipeline's output is not used in the response below;
    # confirm whether the LLM reply should be returned or the run dropped.
    p.run({"text_file_converter": {"sources": files},
           "prompt_builder": {"question": "Sử dụng tiếng Việt để trích thông tin từ hóa đơn sau đó trả ra dưới dạng JSON, Trong bảng chi tiết hóa đơn bỏ qua dòng có các ô [A,B,C,1,2,3=1x2]"}})

    client = OctoAI()
    completion = client.text_gen.create_chat_completion(
        model="meta-llama-3-70b-instruct",
        messages=[
            ChatMessage(role="system", content="You are a helpful assistant."),
            ChatMessage(role="user", content=first_page),
        ],
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
        response_format=ChatCompletionResponseFormat(
            type="json_object",
            schema=Invoice.model_json_schema(),
        ),
    )

    return {'content': text, 'metadate': completion.choices[0].message.content}
444
+
445
  def image2metadata(image):
446
  try:
447
  image = image.convert('RGB')