Juslin committed on
Commit 2f54821
1 Parent(s): d36b4ee

Update app.py

Files changed (1)
  1. app.py +77 -45
app.py CHANGED
@@ -1,48 +1,80 @@
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
- import torch
-
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
- model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
-
- train_dataset = ("squad")
-
- def tokenize_function(examples):
-     return tokenizer(
-         examples["questions"],
-         examples["context"],
-         truncation="only_second",
-         max_length=512,
-         padding="max_length",
-         stride=128,
-         return_overflowing_tokens=True,
-         return_offsets_mapping=True,
-         return_attention_mask=True,
-         return_token_type_ids=True,
-     )
-
- tokenized_datasets = dataset.map(
-     tokenize_function,
-     batched=True,
-     remove_columns=["id", "title", "question", "context"],
- )
-
- training_args = TrainingArguments(
-     per_device_train_batch_size=8,
-     num_train_epochs=3,
-     logging_dir='./logs'
- )
-
- def compute_metrics(p):
-     return {}
-
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_datasets["train"],
-     tokenizer=tokenizer,
-     compute_metrics=compute_metrics,
- )
-
- trainer.train()
+ import fitz  # PyMuPDF
+ import os
+ import pytesseract
+ from PIL import Image
+ import io
+ from llama_index import *
+ from transformers import pipeline
+ from fastapi import FastAPI, UploadFile, File, Form
+
+ app = FastAPI()
+
+ # Endpoint to check for files in the current directory
+ @app.get("/check_files")
+ async def check_files():
+     files = os.listdir(".")
+     if any(file.endswith(".pdf") for file in files):
+         return {"message": "Files found in directory. You can proceed."}
+     else:
+         return {"message": "Select a file to work on in PDF."}
+
+ # Function to extract text from PDF with OCR
+ def extract_text_from_pdf(pdf_path):
+     doc = fitz.open(pdf_path)
+     text = ""
+
+     for page_num in range(len(doc)):
+         page = doc.load_page(page_num)
+
+         # Extract text from the page
+         text += page.get_text()
+
+         # Extract images from the page
+         image_list = page.get_images(full=True)
+
+         for img_index, img in enumerate(image_list):
+             xref = img[0]
+             base_image = doc.extract_image(xref)
+             image_bytes = base_image["image"]
+             image_ext = base_image["ext"]
+
+             # Load image
+             image = Image.open(io.BytesIO(image_bytes))
+
+             # Use Tesseract to do OCR on the image
+             image_text = pytesseract.image_to_string(image)
+
+             # Append OCR text to the main text
+             text += image_text
+
+     return text
+
+ # Function to create the index
+ def create_index(text):
+     # Save text to a file
+     with open("pdf_text.txt", "w") as file:
+         file.write(text)
+
+     reader = SimpleDirectoryReader(".")
+     index = GPTVectorStoreIndex.from_documents(reader.load_data())
+     return index
+
+ # Function to answer a question
+ def answer_question(question, index, qa_pipeline):
+     # Retrieve the most relevant section from the index
+     response = index.query(question)
+
+     # Use the Hugging Face QA pipeline to find the answer in the retrieved text
+     result = qa_pipeline(question=question, context=response)
+
+     return result['answer']
+
+ # Use a Hugging Face model for question answering
+ qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+
+ @app.post("/upload")
+ async def upload_file(file: UploadFile = File(...), question: str = Form(...)):
+     pdf_text = extract_text_from_pdf(file.file)
+     index = create_index(pdf_text)
+     answer = answer_question(question, index, qa_pipeline)
+     return {"answer": answer}
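For a quick local check of the new /upload endpoint, a minimal client sketch follows. It assumes the app is served with "uvicorn app:app" on the default port 8000 and that a local file named sample.pdf exists; the port and the file name are placeholders, not part of the commit.

# Hypothetical smoke test for the /upload endpoint (start the server first: uvicorn app:app)
import requests

with open("sample.pdf", "rb") as f:  # placeholder PDF path, not part of the commit
    resp = requests.post(
        "http://127.0.0.1:8000/upload",
        files={"file": ("sample.pdf", f, "application/pdf")},   # parsed as UploadFile
        data={"question": "What is this document about?"},      # parsed as Form field
    )

print(resp.json())  # expected shape: {"answer": "..."}

As a side note, PyMuPDF can also open a document from in-memory bytes via fitz.open(stream=pdf_bytes, filetype="pdf"), which is one way to feed it the raw upload contents rather than a file path.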