import io
import os
import tempfile

import fitz
import pytesseract
from fastapi import FastAPI, UploadFile, File, Form
from llama_index import *
from PIL import Image
from transformers import pipeline


# Application object; the route decorators in this module register handlers on it.
app = FastAPI()


@app.get("/check_files")
async def check_files():
    """Report whether the current working directory contains a ``.pdf`` file.

    Returns:
        A dict with a single ``"message"`` key: a go-ahead message when at
        least one directory entry ends with ``.pdf``, otherwise a prompt to
        select a PDF.
    """
    has_pdf = False
    for entry in os.listdir("."):
        if entry.endswith(".pdf"):
            has_pdf = True
            break

    if not has_pdf:
        return {"message": "Select a file to work on in PDF."}
    return {"message": "Files found in directory. You can proceed."}


def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, including OCR text from its embedded images.

    For every page, the page's own text comes first, followed by the
    ``pytesseract`` output for each image listed by ``page.get_images``,
    in listing order.

    Args:
        pdf_path: Value handed to ``fitz.open`` to locate the document.

    Returns:
        One string concatenating all extracted text in page order.
    """
    chunks = []
    # The context manager closes the document even if extraction or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            chunks.append(page.get_text())

            for img in page.get_images(full=True):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                # Close the decoded image as soon as OCR is done with it.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    chunks.append(pytesseract.image_to_string(image))

    # A single join avoids repeated string reallocation on large documents.
    return "".join(chunks)


def create_index(text):
    """Build a vector index from the given text.

    The text is written to ``pdf_text.txt`` in the current working directory
    and the index is built from that file only.

    Args:
        text: The text to index.

    Returns:
        The ``GPTVectorStoreIndex`` built from the written file.
    """
    text_path = "pdf_text.txt"

    # Explicit UTF-8: extracted/OCR text may contain characters that the
    # platform default encoding cannot represent.
    with open(text_path, "w", encoding="utf-8") as file:
        file.write(text)

    # Load only the file just written. Pointing the reader at "." would also
    # pull every unrelated file in the working directory into the index.
    reader = SimpleDirectoryReader(input_files=[text_path])
    return GPTVectorStoreIndex.from_documents(reader.load_data())


def answer_question(question, index, qa_pipeline):
    """Answer a question by querying the index, then running extractive QA.

    Args:
        question: The question text.
        index: Object exposing either ``as_query_engine()`` (whose result has
            a ``query(question)`` method) or a direct ``query(question)``
            method.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``"answer"`` key.

    Returns:
        The value stored under ``"answer"`` in the pipeline result.
    """
    # Prefer the query-engine interface when the index offers it; otherwise
    # fall back to calling ``query`` on the index itself.
    if hasattr(index, "as_query_engine"):
        response = index.as_query_engine().query(question)
    else:
        response = index.query(question)

    # The QA pipeline needs plain text as context, not a response object.
    context = str(response)

    result = qa_pipeline(question=question, context=context)
    return result["answer"]


# Question-answering pipeline, constructed once when the module is loaded.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-uncased-distilled-squad",
)


@app.post("/upload")
async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
    """Answer a question about an uploaded PDF.

    The upload is written to a temporary ``.pdf`` file so that
    ``extract_text_from_pdf`` receives a filesystem path; the temporary file
    is removed once extraction finishes, whether or not it succeeded.

    Args:
        file: The uploaded PDF (multipart form field).
        question: The question to answer (form field).

    Returns:
        A dict with a single ``"answer"`` key.
    """
    pdf_bytes = await file.read()

    # delete=False plus an explicit unlink: the file has to be closed before
    # it is reopened by path, and must still exist at that point.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name

    try:
        pdf_text = extract_text_from_pdf(tmp_path)
    finally:
        os.unlink(tmp_path)

    index = create_index(pdf_text)
    answer = answer_question(question, index, qa_pipeline)
    return {"answer": answer}