"""FastAPI service that answers questions about an uploaded PDF.

Text is pulled from the PDF (embedded text plus OCR of embedded images),
indexed with llama_index, and the query result is handed to a Hugging Face
extractive question-answering pipeline.
"""

import io
import os
import tempfile

import fitz  # PyMuPDF
import pytesseract
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
# NOTE(review): star import kept so every name this module already relies on
# stays in scope; prefer importing GPTVectorStoreIndex / SimpleDirectoryReader
# explicitly once the installed llama_index version is pinned.
from llama_index import *  # noqa: F401,F403
from PIL import Image
from transformers import pipeline

app = FastAPI()


@app.get("/check_files")
async def check_files():
    """Report whether the current working directory contains any PDF file.

    Returns:
        A dict with a single ``"message"`` key telling the client whether it
        can proceed or still has to select a PDF.
    """
    # Compare case-insensitively so "REPORT.PDF" is recognised as well.
    has_pdf = any(name.lower().endswith(".pdf") for name in os.listdir("."))
    if has_pdf:
        return {"message": "Files found in directory. You can proceed."}
    return {"message": "Select a file to work on in PDF."}


def _open_pdf(pdf_source):
    """Open *pdf_source* (path, raw bytes, or binary file object) with PyMuPDF."""
    if isinstance(pdf_source, (bytes, bytearray)):
        return fitz.open(stream=bytes(pdf_source), filetype="pdf")
    if hasattr(pdf_source, "read"):
        # File-like objects (e.g. UploadFile.file) must be handed over as an
        # in-memory stream; fitz.open() does not take them as a filename.
        return fitz.open(stream=pdf_source.read(), filetype="pdf")
    return fitz.open(pdf_source)


def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, including OCR text of embedded images.

    Args:
        pdf_path: Filesystem path of the PDF. Raw ``bytes`` or a binary
            file-like object holding the PDF are accepted as well.

    Returns:
        One string: for every page, the page's own text followed by the
        Tesseract OCR output of each image referenced by that page.
    """
    parts = []
    # The context manager closes the document even if OCR raises.
    with _open_pdf(pdf_path) as doc:
        for page in doc:
            parts.append(page.get_text())
            for img in page.get_images(full=True):
                xref = img[0]  # first entry of the image tuple is its xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    parts.append(pytesseract.image_to_string(image))
    return "".join(parts)


def create_index(text):
    """Build a vector index over *text*.

    The text is written to ``pdf_text.txt`` inside a private temporary
    directory and loaded from there, so the index contains only this text
    (not every file in the working directory) and concurrent requests cannot
    overwrite each other's file.

    Args:
        text: The document text to index.

    Returns:
        The ``GPTVectorStoreIndex`` built from the text.
    """
    with tempfile.TemporaryDirectory() as workdir:
        text_path = os.path.join(workdir, "pdf_text.txt")
        # Explicit UTF-8: OCR output may contain characters the platform's
        # default encoding cannot represent.
        with open(text_path, "w", encoding="utf-8") as fh:
            fh.write(text)
        # load_data() reads the file contents before the directory is removed.
        documents = SimpleDirectoryReader(workdir).load_data()
    return GPTVectorStoreIndex.from_documents(documents)


def answer_question(question, index, qa_pipeline):
    """Answer *question* using the index and an extractive QA pipeline.

    Args:
        question: The user's question.
        index: Index returned by ``create_index``.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``"answer"`` key.

    Returns:
        The value stored under ``"answer"`` in the pipeline result.
    """
    # NOTE(review): newer llama_index releases expose querying through
    # as_query_engine() while older ones have index.query(); support both —
    # confirm against the pinned version.
    if hasattr(index, "as_query_engine"):
        response = index.as_query_engine().query(question)
    else:
        response = index.query(question)
    # The QA pipeline needs plain text, not the llama_index response object.
    context = str(response)
    result = qa_pipeline(question=question, context=context)
    return result["answer"]


# Use a Hugging Face model for question answering (loaded once at import time).
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
)


def _answer_from_pdf_bytes(pdf_bytes, question):
    """Run the blocking extract -> index -> answer pipeline for one upload."""
    pdf_text = extract_text_from_pdf(pdf_bytes)
    if not pdf_text.strip():
        raise HTTPException(
            status_code=422,
            detail="No text could be extracted from the uploaded PDF.",
        )
    index = create_index(pdf_text)
    return answer_question(question, index, qa_pipeline)


@app.post("/upload")
async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
    """Answer *question* about the uploaded PDF.

    Args:
        file: The uploaded PDF (multipart form field ``file``).
        question: The question to answer (form field ``question``).

    Returns:
        A dict ``{"answer": ...}`` with the extracted answer.

    Raises:
        HTTPException: 400 if the upload is empty, 422 if the PDF yields no
            text to search.
    """
    pdf_bytes = await file.read()
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")
    # OCR, indexing and model inference are blocking; run them off the event
    # loop so other requests keep being served.
    answer = await run_in_threadpool(_answer_from_pdf_bytes, pdf_bytes, question)
    return {"answer": answer}