Spaces:
Sleeping
Sleeping
dinhquangson
committed on
Commit
•
768def6
1
Parent(s):
a7ec673
Update app.py
Browse files
app.py
CHANGED
@@ -210,7 +210,7 @@ def search(prompt: str):
|
|
210 |
|
211 |
|
212 |
template = """
|
213 |
-
Với thông tin sau, hãy trả lời câu hỏi bằng tiếng
|
214 |
|
215 |
Bối cảnh: {% for document in documents %}
|
216 |
|
@@ -359,6 +359,89 @@ async def convert_upload_file(file: UploadFile = File(...)):
|
|
359 |
|
360 |
return {'content':text,'metadate':completion.choices[0].message.content}
|
361 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
362 |
def image2metadata(image):
|
363 |
try:
|
364 |
image = image.convert('RGB')
|
|
|
210 |
|
211 |
|
212 |
template = """
|
213 |
+
Với thông tin sau, hãy trả lời câu hỏi bằng tiếng Việt.
|
214 |
|
215 |
Bối cảnh: {% for document in documents %}
|
216 |
|
|
|
359 |
|
360 |
return {'content':text,'metadate':completion.choices[0].message.content}
|
361 |
|
362 |
+
@app.post("/large_pdf2text/")
async def upload_large_file(file: UploadFile = File(...)):
    """OCR an uploaded PDF and extract invoice metadata from its first page.

    The upload is saved under ``temp_path``, rendered to page images and
    OCR'd with Tesseract (``lang='vie'``). The concatenated text is written
    to a ``.txt`` file next to the upload and fed through a Haystack
    pipeline (convert -> clean -> split -> write -> prompt -> LLM). The
    first page that yields text (after ``truncate_text``) is sent to OctoAI
    with a JSON response format constrained by ``Invoice``'s JSON schema.

    Args:
        file: The uploaded PDF.

    Returns:
        A dict with ``'content'`` (OCR text of every page, each followed by
        a newline) and ``'metadate'`` (message content of the first
        chat-completion choice).
    """
    import pytesseract
    from pdf2image import convert_from_path
    from octoai.client import OctoAI
    from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
    from pathlib import Path

    from haystack.components.generators import OpenAIGenerator
    from haystack.utils import Secret
    from haystack.components.builders import PromptBuilder
    from haystack import Pipeline
    from haystack.document_stores.in_memory import InMemoryDocumentStore
    from haystack.components.converters.txt import TextFileToDocument
    from haystack.components.preprocessors import DocumentCleaner
    from haystack.components.preprocessors import DocumentSplitter
    from haystack.components.writers import DocumentWriter

    # NOTE(review): `template` is not defined in this function; it has to be
    # available at module scope for this line to work -- confirm.
    prompt_builder = PromptBuilder(template=template)
    generator = OpenAIGenerator(
        api_key=Secret.from_env_var("OCTOAI_TOKEN"),
        api_base_url="https://text.octoai.run/v1",
        model="meta-llama-3-70b-instruct",
        generation_kwargs={"max_tokens": 512},
    )
    document_store = InMemoryDocumentStore()

    p = Pipeline()
    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
    p.add_component(instance=DocumentCleaner(), name="cleaner")
    p.add_component(
        instance=DocumentSplitter(split_by="passage", split_length=2),
        name="splitter",
    )
    p.add_component(
        instance=DocumentWriter(document_store=document_store),
        name="writer",
    )
    p.add_component("prompt_builder", prompt_builder)
    p.add_component("llm", generator)
    p.connect("text_file_converter.documents", "cleaner.documents")
    p.connect("cleaner.documents", "splitter.documents")
    p.connect("splitter.documents", "writer.documents")
    # DocumentWriter only outputs `documents_written` (a count), so the
    # prompt builder takes its documents straight from the splitter.
    p.connect("splitter.documents", "prompt_builder.documents")
    p.connect("prompt_builder", "llm")

    # Keep only the final path component of the client-supplied name so a
    # crafted filename (e.g. "../../x") cannot escape temp_path.
    safe_name = Path(file.filename or "").name
    if safe_name in ("", ".", ".."):
        safe_name = "upload.pdf"
    file_savePath = join(temp_path, safe_name)

    with open(file_savePath, 'wb') as f:
        shutil.copyfileobj(file.file, f)

    # Convert the PDF to one image per page.
    images = convert_from_path(file_savePath)

    page_texts = []
    first_page = ""

    # Extract text from the page images.
    for image in images:
        ocr_text = pytesseract.image_to_string(image, lang='vie')
        # Stays "" until some page produces non-empty truncated text.
        if first_page == "":
            first_page = truncate_text(ocr_text)
        page_texts.append(ocr_text)
    text = "".join(page + '\n' for page in page_texts)

    # OCR output is str, so the file must be opened in text mode; writing it
    # to a 'wb' handle raises TypeError.
    path = file_savePath + ".txt"
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

    files = [path]
    # NOTE(review): the pipeline result is not used in the response below.
    p.run({"text_file_converter": {"sources": files},
           "prompt_builder": {"question": "Sử dụng tiếng Việt để trích thông tin từ hóa đơn sau đó trả ra dưới dạng JSON, Trong bảng chi tiết hóa đơn bỏ qua dòng có các ô [A,B,C,1,2,3=1x2]"}})

    client = OctoAI()

    completion = client.text_gen.create_chat_completion(
        model="meta-llama-3-70b-instruct",
        messages=[
            ChatMessage(role="system", content="You are a helpful assistant."),
            ChatMessage(role="user", content=first_page),
        ],
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
        response_format=ChatCompletionResponseFormat(
            type="json_object",
            schema=Invoice.model_json_schema(),
        ),
    )

    # The 'metadate' key spelling is kept: the other endpoint in this file
    # returns the same key, so clients may depend on it.
    return {'content': text, 'metadate': completion.choices[0].message.content}
|
444 |
+
|
445 |
def image2metadata(image):
|
446 |
try:
|
447 |
image = image.convert('RGB')
|