import io
import os
import tempfile

# Third-party imports keep their original relative order: the wildcard import
# below could shadow names bound before it, so moving it would risk changing
# which definitions win.
import fitz
import pytesseract
from PIL import Image
from llama_index import *
from transformers import pipeline
from fastapi import FastAPI, UploadFile, File, Form
# Application instance that the route decorators below register against.
app = FastAPI()
@app.get("/check_files")
async def check_files():
    """Report whether the current working directory holds a ``.pdf`` entry.

    Returns:
        A dict with a single ``"message"`` key: a "proceed" message when at
        least one directory entry name ends with ``.pdf``, otherwise a prompt
        to select a PDF file.
    """
    has_pdf = False
    for entry in os.listdir("."):
        if entry.endswith(".pdf"):
            has_pdf = True
            break

    message = (
        "Files found in directory. You can proceed."
        if has_pdf
        else "Select a file to work on in PDF."
    )
    return {"message": message}
def extract_text_from_pdf(pdf_path):
    """Extract the text layer and OCR'd image text from a PDF.

    Args:
        pdf_path: Filesystem path of the PDF. Raw PDF bytes, or a binary
            file-like object exposing ``read()``, are accepted as well so
            that callers holding an open upload stream do not have to write
            it to disk first.

    Returns:
        A single string: for each page in order, the page's own text followed
        by the OCR output of every image referenced on that page.
    """
    source = pdf_path
    if hasattr(source, "read"):
        # An open stream: pull its bytes so it can be opened from memory.
        source = source.read()

    if isinstance(source, (bytes, bytearray)):
        doc = fitz.open(stream=source, filetype="pdf")
    else:
        doc = fitz.open(source)

    parts = []
    try:
        for page in doc:
            parts.append(page.get_text())
            for img in page.get_images(full=True):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    parts.append(pytesseract.image_to_string(image))
    finally:
        # Always release the document, even if OCR or decoding fails midway.
        doc.close()

    return "".join(parts)
def create_index(text):
    """Build a vector index over *text*.

    The text is written to ``pdf_text.txt`` in the current working directory
    and that single file is then loaded and indexed.

    Args:
        text: The text to index.

    Returns:
        The ``GPTVectorStoreIndex`` built from the loaded document(s).
    """
    text_path = "pdf_text.txt"

    # Explicit UTF-8 so non-ASCII text (e.g. OCR output) does not depend on
    # the platform's default codec.
    with open(text_path, "w", encoding="utf-8") as file:
        file.write(text)

    # Load only the file just written. Pointing the reader at "." would pull
    # every file in the working directory into the index.
    reader = SimpleDirectoryReader(input_files=[text_path])
    return GPTVectorStoreIndex.from_documents(reader.load_data())
def answer_question(question, index, qa_pipeline):
    """Answer *question* using context retrieved from *index*.

    Args:
        question: The question text.
        index: An index exposing either ``as_query_engine()`` (whose result
            has ``query(question)``) or a direct ``query(question)`` method.
        qa_pipeline: Callable accepting ``question=`` and ``context=``
            keyword arguments and returning a mapping with an ``"answer"``
            key.

    Returns:
        The ``"answer"`` value produced by *qa_pipeline*.
    """
    if hasattr(index, "as_query_engine"):
        response = index.as_query_engine().query(question)
    else:
        # Older index objects expose ``query`` directly.
        response = index.query(question)

    # The index returns a response object, while the QA pipeline needs plain
    # text as context, so render the response to a string first.
    result = qa_pipeline(question=question, context=str(response))
    return result["answer"]
# Built once at import time so every request reuses the same QA pipeline.
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
@app.post("/upload")
async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
    """Answer *question* about an uploaded PDF.

    Args:
        file: The uploaded PDF.
        question: The question to answer from the PDF's contents.

    Returns:
        A dict with a single ``"answer"`` key.
    """
    payload = await file.read()

    # extract_text_from_pdf takes a path, so persist the upload to a temporary
    # file. delete=False lets the file be reopened by name after this handle
    # is closed; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    try:
        pdf_text = extract_text_from_pdf(tmp_path)
    finally:
        os.remove(tmp_path)

    # NOTE(review): extraction, indexing and QA are synchronous calls made
    # from inside this coroutine.
    index = create_index(pdf_text)
    answer = answer_question(question, index, qa_pipeline)
    return {"answer": answer}