Spaces:

Geetansh01
/

ebookify-backend2

Running

App Files Files Community

Geetansh committed 27 days ago

Commit

6604d8f

•

1 Parent(s): 811f4a6

initial commit

Browse files

Files changed (16) hide show

.gitignore +18 -0
README.md +4 -4
app.py +45 -0
image_to_text.py +10 -0
ml_engine/model_functions.py +26 -0
ml_engine/saved-model/added_tokens.json +3 -0
ml_engine/saved-model/config.json +41 -0
ml_engine/saved-model/model.safetensors +3 -0
ml_engine/saved-model/special_tokens_map.json +15 -0
ml_engine/saved-model/spm.model +3 -0
ml_engine/saved-model/tokenizer.json +0 -0
ml_engine/saved-model/tokenizer_config.json +58 -0
ml_engine/saved-model/training_args.bin +3 -0
packages.txt +1 -0
pdf_to_image.py +40 -0
requirements.txt +12 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,18 @@

+# Virtual environment folders
+MLvenv/
+venv/
+# Node.js dependencies
+node_modules/
+# Python cache
+__pycache__/
+*.py[cod]
+*.pyo
+# Other common Python and IDE ignores
+*.ipynb_checkpoints
+*.env
+.DS_Store
+.idea/
+.vscode/

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Ebookify Backend2
-emoji: 🌖
-colorFrom: purple
-colorTo: red
 sdk: gradio
 sdk_version: 5.4.0
 app_file: app.py

 ---
+title: Ebookify Backend
+emoji: ⚡
+colorFrom: green
+colorTo: green
 sdk: gradio
 sdk_version: 5.4.0
 app_file: app.py

app.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import gradio as gr
+import pdf_to_image
+import image_to_text
+from ml_engine.model_functions import is_it_title
+def process_pdf(pdf):
+    # Ensure we get the correct path to the uploaded file
+    pdf_path = pdf.name  # `pdf` is now a NamedString/TempFile with a `.name` attribute
+    pdf_pages_images = pdf_to_image.pdfToImg2(pdf_path)
+    pages = []
+    curr_pg = ""
+    for img in pdf_pages_images:
+        text = image_to_text.img2string(img)
+        for line in text.split("\n"):
+            if(len(line) == 0): continue
+            if(is_it_title(line)):
+                # print(f"TITLE FOUND: {line}") #Debug statement
+                if(len(curr_pg) != 0):
+                    pages.append(curr_pg)
+                    curr_pg = ""
+            curr_pg = (curr_pg + line + "\n")
+    pages.append(curr_pg)
+    # print(pages)
+    return pages  # Returning a list of strings
+# Gradio interface using latest syntax
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF to Pages Processor")
+    gr.Markdown("Upload a PDF and get a list of extracted pages as output.")
+    # pdf_input = gr.File(label="Upload a PDF", file_types=[".pdf"])
+    pdf_input = gr.File(label="Upload a PDF")
+    output = gr.JSON(label="Extracted Pages")
+    submit_button = gr.Button("Process PDF")
+    # Define interaction
+    submit_button.click(fn=process_pdf, inputs=pdf_input, outputs=output)
+if __name__ == "__main__":
+    demo.launch()

image_to_text.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from PIL import Image
+import pytesseract
+# If you don't have tesseract executable in your PATH, include the following:
+pytesseract.pytesseract.tesseract_cmd = r'C:\TesseractOCR\tesseract'
+# Simple image to string
+def img2string(imgPath):
+    textOfImage = pytesseract.image_to_string(imgPath)
+    return textOfImage

ml_engine/model_functions.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Disk path where saved model & tokenizer is located
+save_dir = (r"./ml_engine/saved-model") #relative path acc. to "ebookify-backend/" directory (i.e the root directory of the backend)
+# Load the saved model and tokeniser from the disk
+loaded_tokeniser = AutoTokenizer.from_pretrained(save_dir)
+loaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)
+def is_it_title(string):
+    # Input
+    input = loaded_tokeniser(string, return_tensors='pt')
+    with torch.no_grad():
+        output = loaded_model(**input).logits.item()
+    # print(output.logits.item())
+    if(output >= 0.6):
+        return True
+    else:
+        return False
+if __name__ == "__main__":
+    print(is_it_title("Secret to Success lies in hardwork and nothing else!"))

ml_engine/saved-model/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "[MASK]": 128000
+}

ml_engine/saved-model/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "microsoft/deberta-v3-small",
+  "architectures": [
+    "DebertaV2ForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.0",
+  "type_vocab_size": 0,
+  "vocab_size": 128100
+}

ml_engine/saved-model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e941da07dd4a4f884aae6082850988e36acfdb9a10cffa21922bd68c7bf20606
+size 567595468

ml_engine/saved-model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

ml_engine/saved-model/spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

ml_engine/saved-model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ml_engine/saved-model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}

ml_engine/saved-model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4999cb1241c88dc93c2687149f19169521292eb1e5ca325d3a244469bb1602f9
+size 5176

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


+poppler-utils

pdf_to_image.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from pdf2image import convert_from_path
+from pdf2image.exceptions import (
+    PDFInfoNotInstalledError,
+    PDFPageCountError,
+    PDFSyntaxError
+)
+# poppler_path = r"./Poppler/poppler-24.07.0/Library/bin"
+# def pdfToImg(pdfPath, outputPath):
+#     '''
+#     1)Images stored in output folder
+#     2)It returns path to stored images
+#     '''
+#     images_paths = convert_from_path(pdfPath, 200, outputPath, fmt="jpeg", poppler_path=poppler_path, paths_only=True)
+#     return images_paths
+# def pdfToImg2(pdfPath):
+#     '''
+#     1)Returns a list of Pillow images
+#     '''
+#     images = convert_from_path(pdfPath, 200, fmt="jpeg", poppler_path=poppler_path)
+#     return images
+# Changed version of above code for deployment on huggingface spaces
+def pdfToImg(pdfPath, outputPath):
+    '''
+    1)Images stored in output folder
+    2)It returns path to stored images
+    '''
+    images_paths = convert_from_path(pdfPath, 200, outputPath, fmt="jpeg", paths_only=True)
+    return images_paths
+def pdfToImg2(pdfPath):
+    '''
+    1)Returns a list of Pillow images
+    '''
+    images = convert_from_path(pdfPath, 200, fmt="jpeg")
+    return images

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+packaging==24.1
+pdf2image==1.17.0
+pillow==10.4.0
+pytesseract==0.3.13
+datasets==3.0.2
+transformers==4.46.0
+pandas==2.2.3
+numpy==2.0.2
+sentencepiece==0.2.0
+tiktoken==0.8.0
+torch==2.5.1
+gradio==5.4.0