Update app.py
app.py CHANGED
@@ -1,48 +1,80 @@
-import
-)
-)
-    args=training_args,
-    train_dataset= tokenized_datasets["train"],
-    tokenizer=tokenizer,
-    compute_metrics=compute_metrics,
-)
+import fitz  # PyMuPDF
+import os
+import pytesseract
+from PIL import Image
+import io
+from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex
+from transformers import pipeline
+from fastapi import FastAPI, UploadFile, File, Form
+
+app = FastAPI()
+
+# Endpoint to check for PDF files in the current directory
+@app.get("/check_files")
+async def check_files():
+    files = os.listdir(".")
+    if any(file.endswith(".pdf") for file in files):
+        return {"message": "Files found in directory. You can proceed."}
+    else:
+        return {"message": "No PDF found. Select a PDF file to work on."}
+
+# Function to extract text from a PDF, including OCR on embedded images
+def extract_text_from_pdf(pdf_bytes):
+    # Open the PDF from raw bytes so uploaded files can be processed directly
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    text = ""
+
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+
+        # Extract text from the page
+        text += page.get_text()
+
+        # Extract images from the page
+        image_list = page.get_images(full=True)
+
+        for img_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+            image_ext = base_image["ext"]
+
+            # Load image
+            image = Image.open(io.BytesIO(image_bytes))
+
+            # Use Tesseract to do OCR on the image
+            image_text = pytesseract.image_to_string(image)
+
+            # Append OCR text to the main text
+            text += image_text
+
+    return text
+
+# Function to create the index
+def create_index(text):
+    # Save the extracted text to a file so it can be ingested by llama_index
+    with open("pdf_text.txt", "w") as file:
+        file.write(text)
+
+    # Index only the freshly written text file rather than the whole directory
+    reader = SimpleDirectoryReader(input_files=["pdf_text.txt"])
+    index = GPTVectorStoreIndex.from_documents(reader.load_data())
+    return index
+
+# Function to answer a question
+def answer_question(question, index, qa_pipeline):
+    # Retrieve the most relevant section from the index
+    response = index.query(question)
+
+    # Use the Hugging Face QA pipeline to find the answer in the retrieved text
+    result = qa_pipeline(question=question, context=str(response))

+    return result['answer']

+# Use a Hugging Face model for question answering
+qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

+@app.post("/upload")
+async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
+    # Read the uploaded PDF into memory and pass its raw bytes to the extractor
+    pdf_bytes = await file.read()
+    pdf_text = extract_text_from_pdf(pdf_bytes)
+    index = create_index(pdf_text)
+    answer = answer_question(question, index, qa_pipeline)
+    return {"answer": answer}
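
A minimal client sketch for exercising the endpoints added above, assuming the app is served locally with uvicorn (uvicorn app:app --reload) on the default port 8000 and that a file named sample.pdf is available; the base URL, port, file name, and question are illustrative assumptions, not part of the commit.

import requests

BASE_URL = "http://localhost:8000"  # assumed local dev address

# Check whether any PDF files are present next to the app
print(requests.get(f"{BASE_URL}/check_files").json())

# Upload a PDF and ask a question about it; sample.pdf is a placeholder name
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/upload",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"question": "What is this document about?"},
    )
print(resp.json())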