dinhquangson committed
Commit: f47a6ae
Parent(s): 768def6
Update app.py

app.py CHANGED
@@ -4,7 +4,6 @@ from datasets import load_dataset
 from fastapi.middleware.cors import CORSMiddleware
 import pdfplumber
 import pytesseract
-from transformers import AutoModel, AutoTokenizer
 
 from models import Invoice
 
@@ -27,23 +26,8 @@ app.add_middleware(
     allow_methods=["*"],
     allow_headers=["*"],
 )
-
-
-if 'int4' in model_path:
-    if device == 'mps':
-        print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
-        exit()
-    model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
-else:
-    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16)
-model = model.to(device=device)
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-model.eval()
-
-
-
-ERROR_MSG = "Error, please retry"
-model_name = 'MiniCPM-Llama3-V 2.5'
+
+
 NUM_PROC = os.cpu_count()
 parent_path = dirname(getcwd())
 
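For reference, the deleted block above follows the usual transformers loading pattern for MiniCPM-Llama3-V 2.5. A minimal self-contained sketch of that pattern is given below; `model_path` and `device` are assumed placeholder values (neither is defined in this hunk), and the `torch` import the original code relied on is added explicitly.

# Sketch only, not the app's actual configuration: model_path and device are assumptions.
import torch
from transformers import AutoModel, AutoTokenizer

model_path = 'openbmb/MiniCPM-Llama3-V-2_5'  # assumed checkpoint id
device = 'cuda'                              # assumed target device

if 'int4' in model_path:
    # int4 checkpoints load through bitsandbytes, which does not support Apple Silicon (mps)
    if device == 'mps':
        raise SystemExit('running an int4 model with bitsandbytes on Mac is not supported')
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
else:
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16)

model = model.to(device=device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.eval()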
@@ -358,89 +342,6 @@ async def convert_upload_file(file: UploadFile = File(...)):
     )
 
     return {'content':text,'metadate':completion.choices[0].message.content}
-
-@app.post("/large_pdf2text/")
-async def upload_large_file(file: UploadFile = File(...)):
-    import pytesseract
-    from pdf2image import convert_from_path
-    from octoai.client import OctoAI
-    from octoai.text_gen import ChatCompletionResponseFormat, ChatMessage
-    from pathlib import Path
-
-    from haystack.components.generators import OpenAIGenerator
-    from haystack.utils import Secret
-    from haystack.components.builders import PromptBuilder
-    from haystack import Document
-    from haystack import Pipeline
-    from haystack.document_stores.in_memory import InMemoryDocumentStore
-    from haystack.components.converters.txt import TextFileToDocument
-    from haystack.components.preprocessors import DocumentCleaner
-    from haystack.components.preprocessors import DocumentSplitter
-    from haystack.components.writers import DocumentWriter
-
-    prompt_builder = PromptBuilder(template=template)
-    generator = OpenAIGenerator(
-        api_key=Secret.from_env_var("OCTOAI_TOKEN"),
-        api_base_url="https://text.octoai.run/v1",
-        model="meta-llama-3-70b-instruct",
-        generation_kwargs = {"max_tokens": 512}
-    )
-    document_store = InMemoryDocumentStore()
-    p = Pipeline()
-    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
-    p.add_component(instance=DocumentCleaner(), name="cleaner")
-    p.add_component(instance=DocumentSplitter(split_by="passage", split_length=2), name="splitter")
-    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
-    p.add_component("prompt_builder", prompt_builder)
-    p.add_component("llm", generator)
-    p.connect("text_file_converter.documents", "cleaner.documents")
-    p.connect("cleaner.documents", "splitter.documents")
-    p.connect("splitter.documents", "writer.documents")
-    p.connect("writer.documents", "prompt_builder.documents")
-    p.connect("prompt_builder", "llm")
-
-    file_savePath = join(temp_path,file.filename)
-
-    with open(file_savePath,'wb') as f:
-        shutil.copyfileobj(file.file, f)
-    # convert PDF to image
-    images = convert_from_path(file_savePath)
-
-    text=""
-    first_page = ""
-
-    # Extract text from images
-    for image in images:
-        ocr_text = pytesseract.image_to_string(image,lang='vie')
-        if first_page=="":
-            first_page = truncate_text(ocr_text)
-        text=text+ocr_text+'\n'
-
-    path = file_savePath+".txt"
-    with open(path,'wb') as f:
-        f.write(text)
-
-    files = [path]
-    p.run({"text_file_converter": {"sources": files},
-           "prompt_builder": {"question": "Sử dụng tiếng Việt để trích thông tin từ hóa đơn sau đó trả ra dưới dạng JSON, Trong bảng chi tiết hóa đơn bỏ qua dòng có các ô [A,B,C,1,2,3=1x2]"}})
-    client = OctoAI()
-
-    completion = client.text_gen.create_chat_completion(
-        model="meta-llama-3-70b-instruct",
-        messages=[
-            ChatMessage(role="system", content="You are a helpful assistant."),
-            ChatMessage(role="user", content=first_page),
-        ],
-        presence_penalty=0,
-        temperature=0.1,
-        top_p=0.9,
-        response_format=ChatCompletionResponseFormat(
-            type="json_object",
-            schema=Invoice.model_json_schema(),
-        ),
-    )
-
-    return {'content':text,'metadate':completion.choices[0].message.content}
 
 def image2metadata(image):
     try:
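The removed /large_pdf2text/ route accepted a multipart PDF upload and returned the OCR text plus LLM-extracted metadata. Below is a hedged client-side sketch of how it could have been called before this commit; the base URL and the sample file name are assumptions, while the response keys match the handler's return statement.

# Sketch only: calls the (now removed) /large_pdf2text/ route of this FastAPI app.
# The base URL and the sample PDF path are assumptions.
import requests

url = "http://localhost:8000/large_pdf2text/"
with open("invoice.pdf", "rb") as f:
    resp = requests.post(url, files={"file": ("invoice.pdf", f, "application/pdf")})
resp.raise_for_status()

payload = resp.json()
print(payload["content"][:500])  # OCR'd text of the uploaded PDF
print(payload["metadate"])       # JSON metadata from the LLM (key is spelled 'metadate' in the handler)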