Juslin
/

ORTHOSageAi

Model card Files Files and versions Community

ORTHOSageAi / app.py

Juslin's picture

Update app.py

2f54821 verified 5 months ago

2.53 kB

	import io
	import os
	import tempfile

	import fitz # PyMuPDF
	import pytesseract
	from PIL import Image
	from llama_index import *
	from transformers import pipeline
	from fastapi import FastAPI, UploadFile, File, Form

	# FastAPI application instance; the @app.get / @app.post decorators in this
	# file register their endpoints on it.
	app = FastAPI()

	# Endpoint reporting whether the working directory already holds a PDF
	@app.get("/check_files")
	async def check_files():
	    """Tell the client whether any ``.pdf`` file exists in the current directory.

	    Returns:
	        A dict with a single ``"message"`` key: a "proceed" message when at
	        least one directory entry name ends with ``.pdf``, otherwise a
	        prompt asking for a PDF to be selected.
	    """
	    for entry in os.listdir("."):
	        if entry.endswith(".pdf"):
	            return {"message": "Files found in directory. You can proceed."}
	    return {"message": "Select a file to work on in PDF."}

	# Function to extract text from PDF with OCR
	def extract_text_from_pdf(pdf_path):
	    """Return the text of a PDF, including OCR text from embedded images.

	    For every page, the page's own text layer is collected first; then each
	    image embedded in the page is run through Tesseract and the recognised
	    text is appended.

	    Args:
	        pdf_path: Path of the PDF to read. Raw PDF ``bytes`` and binary
	            file-like objects (anything with a synchronous ``read()``, such
	            as the ``file`` attribute of an upload) are accepted as well.

	    Returns:
	        All extracted text concatenated in page order.
	    """
	    if hasattr(pdf_path, "read"):
	        # fitz.open() cannot take a file object as a filename; hand it the
	        # raw bytes through the in-memory stream interface instead.
	        doc = fitz.open(stream=pdf_path.read(), filetype="pdf")
	    elif isinstance(pdf_path, (bytes, bytearray)):
	        doc = fitz.open(stream=pdf_path, filetype="pdf")
	    else:
	        doc = fitz.open(pdf_path)

	    chunks = []
	    try:
	        for page in doc:
	            # Text layer of the page
	            chunks.append(page.get_text())

	            # OCR every image embedded in the page
	            for img in page.get_images(full=True):
	                xref = img[0]  # first field of an image entry is its xref
	                image_bytes = doc.extract_image(xref)["image"]
	                with Image.open(io.BytesIO(image_bytes)) as image:
	                    chunks.append(pytesseract.image_to_string(image))
	    finally:
	        # Release the document handle even if extraction fails part-way.
	        doc.close()

	    return "".join(chunks)

	# Function to create the index
	def create_index(text):
	    """Build a vector index over *text* only.

	    The text is written to ``pdf_text.txt`` inside a private temporary
	    directory and that directory is what gets loaded, so the index contains
	    nothing but the supplied text. Loading the current working directory
	    instead would also index source files and any unrelated documents lying
	    there, and concurrent calls would overwrite one shared file.

	    Args:
	        text: The document text to index.

	    Returns:
	        A ``GPTVectorStoreIndex`` built from the loaded text.
	    """
	    with tempfile.TemporaryDirectory() as workdir:
	        text_path = os.path.join(workdir, "pdf_text.txt")
	        # Explicit encoding: OCR output may contain non-ASCII characters.
	        with open(text_path, "w", encoding="utf-8") as file:
	            file.write(text)

	        # Load while the directory still exists; documents are held in memory.
	        documents = SimpleDirectoryReader(workdir).load_data()

	    return GPTVectorStoreIndex.from_documents(documents)

	# Function to answer a question
	def answer_question(question, index, qa_pipeline):
	    """Answer *question* from the indexed document text.

	    The index is queried for the most relevant passage, then the QA pipeline
	    picks the answer out of that passage.

	    Args:
	        question: The user's question.
	        index: An index object exposing ``as_query_engine()`` or, failing
	            that, a ``query()`` method.
	        qa_pipeline: Callable accepting ``question=`` and ``context=``
	            keyword arguments and returning a mapping with an ``"answer"``
	            key.

	    Returns:
	        The ``"answer"`` value from the QA pipeline result.
	    """
	    # Prefer the query-engine interface when the index offers it; otherwise
	    # fall back to calling query() on the index itself.
	    make_engine = getattr(index, "as_query_engine", None)
	    querier = make_engine() if callable(make_engine) else index
	    response = querier.query(question)

	    # The QA pipeline needs plain text, but the index returns a response
	    # object, so convert it before passing it on as the context.
	    context = response if isinstance(response, str) else str(response)
	    result = qa_pipeline(question=question, context=context)

	    return result["answer"]

	# Use a Hugging Face model for question answering.
	# Built once at module level and handed to answer_question() by the /upload
	# handler, so every request reuses this same pipeline object.
	qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

	@app.post("/upload")
	async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
	    """Answer *question* about an uploaded PDF.

	    The upload is spooled to a temporary ``.pdf`` file because
	    ``extract_text_from_pdf`` is written against a filesystem path, not the
	    upload's file object. The temporary file is removed once its text has
	    been extracted.

	    Args:
	        file: The uploaded PDF (multipart form field ``file``).
	        question: The question to answer (form field ``question``).

	    Returns:
	        A dict with a single ``"answer"`` key.
	    """
	    pdf_bytes = await file.read()

	    # delete=False so the file can be reopened by name after this handle is
	    # closed; it is removed explicitly below.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	        tmp.write(pdf_bytes)
	        tmp_path = tmp.name

	    try:
	        pdf_text = extract_text_from_pdf(tmp_path)
	    finally:
	        os.remove(tmp_path)

	    index = create_index(pdf_text)
	    answer = answer_question(question, index, qa_pipeline)
	    return {"answer": answer}