Jordan Legg committed
Commit bbed54b
1 parent: b153fc4

refactor: retrieve title and desc from markdown, improve UI for more responsive usage

Files changed (1):
1. app.py  +75 -91
app.py CHANGED
@@ -5,87 +5,76 @@ import os
import base64
import spaces
import io
- import tempfile
from PIL import Image
- import io
-
-
- title = """# 🙋🏻‍♂️Welcome to Tonic's🫴🏻📸GOT-OCR"""
- description = """"
- The GOT-OCR model is a revolutionary step in the evolution of OCR systems, boasting 580M parameters and the ability to process various forms of "characters." It features a high-compression encoder and a long-context decoder, making it well-suited for both scene- and document-style images. The model also supports multi-page and dynamic resolution OCR for added practicality.
-
- The model can output results in a variety of formats, including plain text, markdown, and even complex outputs like TikZ diagrams or molecular SMILES strings. Interactive OCR allows users to specify regions of interest for OCR using coordinates or colors.
-
- ## Features
- - **Plain Text OCR**: Recognizes and extracts plain text from images.
- - **Formatted Text OCR**: Extracts text while preserving its formatting (tables, formulas, etc.).
- - **Fine-grained OCR**: Box-based and color-based OCR for precise text extraction from specific regions.
- - **Multi-crop OCR**: Processes multiple cropped regions within an image.
- - **Rendered Formatted OCR Results**: Outputs OCR results in markdown, TikZ, SMILES, or other formats with rendered formatting.
-
- GOT-OCR-2.0 can handle:
- - Plain text
- - Math/molecular formulas
- - Tables
- - Charts
- - Sheet music
- - Geometric shapes
-
- ## How to Use
- 1. Select a task from the dropdown menu.
- 2. Upload an image.
- 3. (Optional) Fill in additional parameters based on the task.
- 4. Click **Process** to see the results.
- ---
- ### Join us :
- 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
- """
+ import numpy as np
+ import yaml
+ import markdown
+ from pathlib import Path
+
+ # Function to extract title and description from the markdown file
+ def extract_title_description(md_file_path):
+     with open(md_file_path, 'r') as f:
+         lines = f.readlines()
+
+     # Extract frontmatter (YAML) for title
+     frontmatter = []
+     content_start = 0
+     if lines[0].strip() == '---':
+         for idx, line in enumerate(lines[1:], 1):
+             if line.strip() == '---':
+                 content_start = idx + 1
+                 break
+             frontmatter.append(line)
+
+     frontmatter_yaml = yaml.safe_load(''.join(frontmatter))
+     title = frontmatter_yaml.get('title', 'Title Not Found')
+
+     # Extract content (description)
+     description_md = ''.join(lines[content_start:])
+     description = markdown.markdown(description_md)
+
+     return title, description
+
+ # Path to the markdown file
+ md_file_path = 'content/index.md'
+
+ # Extract title and description from the markdown file
+ title, description = extract_title_description(md_file_path)

+ # Rest of the script continues as before
model_name = 'ucaslcl/GOT-OCR2_0'

-
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()
model.config.pad_token_id = tokenizer.eos_token_id

- def save_image_to_temp_file(image):
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-         image.save(temp_file, format="PNG")
-     return temp_file.name
+ def image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="PNG")
+     return base64.b64encode(buffered.getvalue()).decode()

@spaces.GPU
- def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None):
-     try:
-         if image is None:
-             return "No image provided", None
-
-         temp_image_path = save_image_to_temp_file(image)
-
-         if task == "Plain Text OCR":
-             res = model.chat(tokenizer, temp_image_path, ocr_type='ocr')
-         elif task == "Format Text OCR":
-             res = model.chat(tokenizer, temp_image_path, ocr_type='format')
-         elif task == "Fine-grained OCR (Box)":
-             res = model.chat(tokenizer, temp_image_path, ocr_type=ocr_type, ocr_box=ocr_box)
-         elif task == "Fine-grained OCR (Color)":
-             res = model.chat(tokenizer, temp_image_path, ocr_type=ocr_type, ocr_color=ocr_color)
-         elif task == "Multi-crop OCR":
-             res = model.chat_crop(tokenizer, image_file=temp_image_path)
-         elif task == "Render Formatted OCR":
-             res = model.chat(tokenizer, temp_image_path, ocr_type='format', render=True, save_render_file='./results/demo.html')
-             with open('./results/demo.html', 'r') as f:
-                 html_content = f.read()
-             os.remove(temp_image_path)
-             return res, html_content
-
-         # Clean up
-         os.remove(temp_image_path)
-
-         return res, None
-     except Exception as e:
-         return str(e), None
+ def process_image(image, task, ocr_type=None, ocr_box=None, ocr_color=None, render=False):
+     if task == "Plain Text OCR":
+         res = model.chat(tokenizer, image, ocr_type='ocr')
+     elif task == "Format Text OCR":
+         res = model.chat(tokenizer, image, ocr_type='format')
+     elif task == "Fine-grained OCR (Box)":
+         res = model.chat(tokenizer, image, ocr_type=ocr_type, ocr_box=ocr_box)
+     elif task == "Fine-grained OCR (Color)":
+         res = model.chat(tokenizer, image, ocr_type=ocr_type, ocr_color=ocr_color)
+     elif task == "Multi-crop OCR":
+         res = model.chat_crop(tokenizer, image_file=image)
+     elif task == "Render Formatted OCR":
+         res = model.chat(tokenizer, image, ocr_type='format', render=True, save_render_file='./demo.html')
+         with open('./demo.html', 'r') as f:
+             html_content = f.read()
+         return res, html_content

+     return res, None
+
def update_inputs(task):
    if task == "Plain Text OCR" or task == "Format Text OCR" or task == "Multi-crop OCR":
        return [gr.update(visible=False)] * 4
@@ -105,22 +94,25 @@ def update_inputs(task):
        ]
    elif task == "Render Formatted OCR":
        return [gr.update(visible=False)] * 3 + [gr.update(visible=True)]
-

def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
-     result = process_image(image, task, ocr_type, ocr_box, ocr_color)
-     if isinstance(result, tuple) and len(result) == 2:
-         res, html_content = result
-         if html_content:
-             return res, html_content
-     return result, None
+     res, html_content = process_image(image, task, ocr_type, ocr_box, ocr_color)
+     if html_content:
+         return res, html_content
+     return res, None
+
+ import gradio as gr

with gr.Blocks() as demo:
-     gr.Markdown(title)
-     gr.Markdown(description)
    with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(type="pil", label="Input Image")
+         # Left Column: Description
+         with gr.Column(scale=1):
+             gr.Markdown(f"# {title}")
+             gr.Markdown(description)
+
+         # Right Column: App Inputs and Outputs
+         with gr.Column(scale=3):
+             image_input = gr.Image(type="filepath", label="Input Image")
            task_dropdown = gr.Dropdown(
                choices=[
                    "Plain Text OCR",
@@ -153,27 +145,19 @@ with gr.Blocks() as demo:
                visible=False
            )
            submit_button = gr.Button("Process")
-
-         with gr.Column():
+
+             # OCR Result below the Submit button
            output_text = gr.Textbox(label="OCR Result")
            output_html = gr.HTML(label="Rendered HTML Output")

-     gr.Markdown("""## GOT-OCR 2.0
-
- This small **330M parameter** model powerful OCR model can handle various text recognition tasks with high accuracy.
-
- ### Model Information
- - **Model Name**: GOT-OCR 2.0
- - **Hugging Face Repository**: [ucaslcl/GOT-OCR2_0](https://huggingface.co/ucaslcl/GOT-OCR2_0)
- - **Environment**: CUDA 11.8 + PyTorch 2.0.1
-     """)
-
+     # Update inputs dynamically based on task selection
    task_dropdown.change(
        update_inputs,
        inputs=[task_dropdown],
        outputs=[ocr_type_dropdown, ocr_box_input, ocr_color_dropdown, render_checkbox]
    )

+     # Process OCR on button click
    submit_button.click(
        ocr_demo,
        inputs=[image_input, task_dropdown, ocr_type_dropdown, ocr_box_input, ocr_color_dropdown],
@@ -181,4 +165,4 @@
    )

if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
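
For context, the new `extract_title_description` helper expects `content/index.md` to begin with a YAML frontmatter block containing a `title` key, followed by a markdown body that becomes the description. That file is not part of this diff, so the sketch below is a minimal, hypothetical example of the assumed format, with the same frontmatter-splitting steps repeated standalone (it needs the `pyyaml` and `markdown` packages, the same dependencies the new imports rely on).

```python
# Standalone sketch: the sample markdown content is hypothetical;
# only the parsing steps mirror the new extract_title_description helper.
import tempfile
from pathlib import Path

import markdown
import yaml

SAMPLE_MD = """\
---
title: GOT-OCR Demo
---
The GOT-OCR model supports plain text, formatted text, fine-grained,
and multi-crop OCR tasks.
"""

def split_frontmatter(md_text):
    # Split a leading '---' ... '---' YAML block from the markdown body.
    lines = md_text.splitlines(keepends=True)
    frontmatter, content_start = [], 0
    if lines and lines[0].strip() == '---':
        for idx, line in enumerate(lines[1:], 1):
            if line.strip() == '---':
                content_start = idx + 1
                break
            frontmatter.append(line)
    meta = yaml.safe_load(''.join(frontmatter)) or {}
    title = meta.get('title', 'Title Not Found')
    description = markdown.markdown(''.join(lines[content_start:]))
    return title, description

if __name__ == "__main__":
    md_path = Path(tempfile.mkdtemp()) / "index.md"   # stand-in for content/index.md
    md_path.write_text(SAMPLE_MD)
    title, description = split_frontmatter(md_path.read_text())
    print(title)        # GOT-OCR Demo
    print(description)  # <p>The GOT-OCR model supports ...</p>
```

Running the sketch prints the extracted title followed by the rendered HTML body, which is what the app passes to `gr.Markdown(f"# {title}")` and `gr.Markdown(description)` in the left column.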