Spaces:
Running
Running
Geetansh
committed on
Commit
•
6604d8f
1
Parent(s):
811f4a6
initial commit
Browse files- .gitignore +18 -0
- README.md +4 -4
- app.py +45 -0
- image_to_text.py +10 -0
- ml_engine/model_functions.py +26 -0
- ml_engine/saved-model/added_tokens.json +3 -0
- ml_engine/saved-model/config.json +41 -0
- ml_engine/saved-model/model.safetensors +3 -0
- ml_engine/saved-model/special_tokens_map.json +15 -0
- ml_engine/saved-model/spm.model +3 -0
- ml_engine/saved-model/tokenizer.json +0 -0
- ml_engine/saved-model/tokenizer_config.json +58 -0
- ml_engine/saved-model/training_args.bin +3 -0
- packages.txt +1 -0
- pdf_to_image.py +40 -0
- requirements.txt +12 -0
.gitignore
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Virtual environment folders
|
2 |
+
MLvenv/
|
3 |
+
venv/
|
4 |
+
|
5 |
+
# Node.js dependencies
|
6 |
+
node_modules/
|
7 |
+
|
8 |
+
# Python cache
|
9 |
+
__pycache__/
|
10 |
+
*.py[cod]
|
11 |
+
*.pyo
|
12 |
+
|
13 |
+
# Other common Python and IDE ignores
|
14 |
+
*.ipynb_checkpoints
|
15 |
+
*.env
|
16 |
+
.DS_Store
|
17 |
+
.idea/
|
18 |
+
.vscode/
|
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title: Ebookify
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.4.0
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
+
title: Ebookify Backend
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.4.0
|
8 |
app_file: app.py
|
app.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import pdf_to_image
import image_to_text
from ml_engine.model_functions import is_it_title

def process_pdf(pdf):
    """Split an uploaded PDF into logical pages, using detected titles as breaks.

    Args:
        pdf: the value from gr.File — either a NamedString/TempFile with a
             `.name` attribute or a plain filesystem path string.

    Returns:
        list[str]: one string of extracted text per detected section.
    """
    # Accept both a tempfile-like object (has .name) and a raw path string;
    # the original `pdf.name` crashed when Gradio supplied a plain str path.
    pdf_path = getattr(pdf, "name", pdf)

    pdf_pages_images = pdf_to_image.pdfToImg2(pdf_path)

    pages = []
    curr_pg = ""

    for img in pdf_pages_images:
        text = image_to_text.img2string(img)
        for line in text.split("\n"):
            if not line:
                continue
            if is_it_title(line):
                # A new title starts a new page; flush the accumulated one.
                if curr_pg:
                    pages.append(curr_pg)
                    curr_pg = ""
            curr_pg = curr_pg + line + "\n"
    # Flush the trailing page only when non-empty, so a blank PDF does not
    # yield a list containing a single empty string.
    if curr_pg:
        pages.append(curr_pg)
    return pages  # Returning a list of strings

# Gradio interface using latest syntax
with gr.Blocks() as demo:
    gr.Markdown("# PDF to Pages Processor")
    gr.Markdown("Upload a PDF and get a list of extracted pages as output.")

    pdf_input = gr.File(label="Upload a PDF")
    output = gr.JSON(label="Extracted Pages")

    submit_button = gr.Button("Process PDF")

    # Define interaction
    submit_button.click(fn=process_pdf, inputs=pdf_input, outputs=output)

if __name__ == "__main__":
    demo.launch()
image_to_text.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import shutil

from PIL import Image
import pytesseract

# On Windows dev machines tesseract may not be on PATH, so point pytesseract
# at the local install. The original set this Windows path unconditionally,
# which broke the Linux deployment (e.g. Hugging Face Spaces) where the
# executable is installed system-wide and found on PATH.
if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = r'C:\TesseractOCR\tesseract'

# Simple image to string
def img2string(imgPath):
    """Run OCR on an image (a path or a PIL.Image) and return the text."""
    textOfImage = pytesseract.image_to_string(imgPath)
    return textOfImage
ml_engine/model_functions.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Disk path where saved model & tokenizer are located — relative to the
# "ebookify-backend/" root directory of the backend.
save_dir = r"./ml_engine/saved-model"

# Load the saved model and tokenizer from disk once at import time so that
# repeated is_it_title() calls do not reload them.
loaded_tokeniser = AutoTokenizer.from_pretrained(save_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)

def is_it_title(string):
    """Return True when the classifier scores `string` as a title.

    The saved model has a single-logit head; a logit >= 0.6 is treated as
    a title (threshold chosen by the original author — TODO confirm against
    training/eval).
    """
    # Named `inputs` to avoid shadowing the builtin `input`.
    inputs = loaded_tokeniser(string, return_tensors='pt')

    # Inference only — no gradients needed.
    with torch.no_grad():
        logit = loaded_model(**inputs).logits.item()

    return logit >= 0.6

if __name__ == "__main__":
    print(is_it_title("Secret to Success lies in hardwork and nothing else!"))
ml_engine/saved-model/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"[MASK]": 128000
|
3 |
+
}
|
ml_engine/saved-model/config.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "microsoft/deberta-v3-small",
|
3 |
+
"architectures": [
|
4 |
+
"DebertaV2ForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"hidden_act": "gelu",
|
8 |
+
"hidden_dropout_prob": 0.1,
|
9 |
+
"hidden_size": 768,
|
10 |
+
"id2label": {
|
11 |
+
"0": "LABEL_0"
|
12 |
+
},
|
13 |
+
"initializer_range": 0.02,
|
14 |
+
"intermediate_size": 3072,
|
15 |
+
"label2id": {
|
16 |
+
"LABEL_0": 0
|
17 |
+
},
|
18 |
+
"layer_norm_eps": 1e-07,
|
19 |
+
"max_position_embeddings": 512,
|
20 |
+
"max_relative_positions": -1,
|
21 |
+
"model_type": "deberta-v2",
|
22 |
+
"norm_rel_ebd": "layer_norm",
|
23 |
+
"num_attention_heads": 12,
|
24 |
+
"num_hidden_layers": 6,
|
25 |
+
"pad_token_id": 0,
|
26 |
+
"pooler_dropout": 0,
|
27 |
+
"pooler_hidden_act": "gelu",
|
28 |
+
"pooler_hidden_size": 768,
|
29 |
+
"pos_att_type": [
|
30 |
+
"p2c",
|
31 |
+
"c2p"
|
32 |
+
],
|
33 |
+
"position_biased_input": false,
|
34 |
+
"position_buckets": 256,
|
35 |
+
"relative_attention": true,
|
36 |
+
"share_att_key": true,
|
37 |
+
"torch_dtype": "float32",
|
38 |
+
"transformers_version": "4.46.0",
|
39 |
+
"type_vocab_size": 0,
|
40 |
+
"vocab_size": 128100
|
41 |
+
}
|
ml_engine/saved-model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e941da07dd4a4f884aae6082850988e36acfdb9a10cffa21922bd68c7bf20606
|
3 |
+
size 567595468
|
ml_engine/saved-model/special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "[CLS]",
|
3 |
+
"cls_token": "[CLS]",
|
4 |
+
"eos_token": "[SEP]",
|
5 |
+
"mask_token": "[MASK]",
|
6 |
+
"pad_token": "[PAD]",
|
7 |
+
"sep_token": "[SEP]",
|
8 |
+
"unk_token": {
|
9 |
+
"content": "[UNK]",
|
10 |
+
"lstrip": false,
|
11 |
+
"normalized": true,
|
12 |
+
"rstrip": false,
|
13 |
+
"single_word": false
|
14 |
+
}
|
15 |
+
}
|
ml_engine/saved-model/spm.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
|
3 |
+
size 2464616
|
ml_engine/saved-model/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ml_engine/saved-model/tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[CLS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[SEP]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[UNK]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": true,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"128000": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"bos_token": "[CLS]",
|
45 |
+
"clean_up_tokenization_spaces": false,
|
46 |
+
"cls_token": "[CLS]",
|
47 |
+
"do_lower_case": false,
|
48 |
+
"eos_token": "[SEP]",
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"model_max_length": 1000000000000000019884624838656,
|
51 |
+
"pad_token": "[PAD]",
|
52 |
+
"sep_token": "[SEP]",
|
53 |
+
"sp_model_kwargs": {},
|
54 |
+
"split_by_punct": false,
|
55 |
+
"tokenizer_class": "DebertaV2Tokenizer",
|
56 |
+
"unk_token": "[UNK]",
|
57 |
+
"vocab_type": "spm"
|
58 |
+
}
|
ml_engine/saved-model/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4999cb1241c88dc93c2687149f19169521292eb1e5ca325d3a244469bb1602f9
|
3 |
+
size 5176
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
poppler-utils
|
pdf_to_image.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pdf2image import convert_from_path
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)

# NOTE: earlier versions passed an explicit Windows `poppler_path`; for
# deployment on Hugging Face Spaces, poppler-utils (see packages.txt) is on
# PATH, so the default lookup is used. The commented-out duplicates of both
# functions have been removed as dead code.

def pdfToImg(pdfPath, outputPath):
    '''
    Render every page of the PDF at 200 DPI as JPEGs into `outputPath`.

    Returns the list of paths to the stored images.
    '''
    images_paths = convert_from_path(pdfPath, 200, outputPath, fmt="jpeg", paths_only=True)
    return images_paths

def pdfToImg2(pdfPath):
    '''
    Render every page of the PDF at 200 DPI.

    Returns a list of Pillow images, one per page.
    '''
    images = convert_from_path(pdfPath, 200, fmt="jpeg")
    return images
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
packaging==24.1
|
2 |
+
pdf2image==1.17.0
|
3 |
+
pillow==10.4.0
|
4 |
+
pytesseract==0.3.13
|
5 |
+
datasets==3.0.2
|
6 |
+
transformers==4.46.0
|
7 |
+
pandas==2.2.3
|
8 |
+
numpy==2.0.2
|
9 |
+
sentencepiece==0.2.0
|
10 |
+
tiktoken==0.8.0
|
11 |
+
torch==2.5.1
|
12 |
+
gradio==5.4.0
|