Update app.py
app.py CHANGED
@@ -1,48 +1,80 @@
-import
-)
-)
-    args=training_args,
-    train_dataset= tokenized_datasets["train"],
-    tokenizer=tokenizer,
-    compute_metrics=compute_metrics,
-)
+import fitz  # PyMuPDF
+import os
+import pytesseract
+from PIL import Image
+import io
+from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex
+from transformers import pipeline
+from fastapi import FastAPI, UploadFile, File, Form
+
+app = FastAPI()
+
+# Endpoint to check for PDF files in the current directory
+@app.get("/check_files")
+async def check_files():
+    files = os.listdir(".")
+    if any(file.endswith(".pdf") for file in files):
+        return {"message": "Files found in directory. You can proceed."}
+    else:
+        return {"message": "No PDF found. Select a PDF file to work on."}
+
+# Function to extract text from a PDF, including OCR on embedded images
+def extract_text_from_pdf(pdf_bytes):
+    # Open the PDF from raw bytes so uploaded files can be processed directly
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    text = ""
+
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+
+        # Extract text from the page
+        text += page.get_text()
+
+        # Extract images from the page
+        image_list = page.get_images(full=True)
+
+        for img_index, img in enumerate(image_list):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+            image_ext = base_image["ext"]
+
+            # Load image
+            image = Image.open(io.BytesIO(image_bytes))
+
+            # Use Tesseract to do OCR on the image
+            image_text = pytesseract.image_to_string(image)
+
+            # Append OCR text to the main text
+            text += image_text
+
+    return text
+
+# Function to create the index
+def create_index(text):
+    # Save the extracted text to a file so it can be ingested by llama_index
+    with open("pdf_text.txt", "w") as file:
+        file.write(text)
+
+    # Index only the freshly written text file rather than the whole directory
+    reader = SimpleDirectoryReader(input_files=["pdf_text.txt"])
+    index = GPTVectorStoreIndex.from_documents(reader.load_data())
+    return index
+
+# Function to answer a question
+def answer_question(question, index, qa_pipeline):
+    # Retrieve the most relevant section from the index
+    response = index.query(question)
+
+    # Use the Hugging Face QA pipeline to find the answer in the retrieved text
+    result = qa_pipeline(question=question, context=str(response))

+    return result['answer']

+# Use a Hugging Face model for question answering
+qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

+@app.post("/upload")
+async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
+    # Read the uploaded PDF into memory and pass its raw bytes to the extractor
+    pdf_bytes = await file.read()
+    pdf_text = extract_text_from_pdf(pdf_bytes)
+    index = create_index(pdf_text)
+    answer = answer_question(question, index, qa_pipeline)
+    return {"answer": answer}
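
A minimal client sketch for exercising the endpoints added above, assuming the app is served locally with uvicorn (uvicorn app:app --reload) on the default port 8000 and that a file named sample.pdf is available; the base URL, port, file name, and question are illustrative assumptions, not part of the commit.

import requests

BASE_URL = "http://localhost:8000"  # assumed local dev address

# Check whether any PDF files are present next to the app
print(requests.get(f"{BASE_URL}/check_files").json())

# Upload a PDF and ask a question about it; sample.pdf is a placeholder name
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/upload",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"question": "What is this document about?"},
    )
print(resp.json())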