Aekanun committed on
Commit 78744e1
1 Parent(s): 069ee6d
Files changed (1)
  1. app.py +53 -108
app.py CHANGED
@@ -1,123 +1,68 @@
  import os
- import warnings
  import torch
- import gc
- from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
  from PIL import Image
  import gradio as gr
- from huggingface_hub import login
-
- warnings.filterwarnings('ignore')
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-
- # Global variables
- model = None
- processor = None

- if torch.cuda.is_available():
-     torch.cuda.empty_cache()
-     gc.collect()
-     print("CUDA cache cleared")

- def load_model_and_processor():
-     """Load the model and processor"""
-     global model, processor
-     print("Loading model and processor...")

-     try:
-         # Define paths
-         base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-         hub_model_path = "Aekanun/thai-handwriting-llm"
-
-         # Configure BitsAndBytes
-         bnb_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_use_double_quant=True,
-             bnb_4bit_quant_type="nf4",
-             bnb_4bit_compute_dtype=torch.bfloat16
-         )
-
-         # Load the processor from the base model
-         processor = AutoProcessor.from_pretrained(
-             base_model_path,
-             token=os.environ.get('HUGGING_FACE_HUB_TOKEN')
-         )
-
-         # Load the model from the Hub
-         print("Loading the model from the Hub...")
-         model = AutoModelForVision2Seq.from_pretrained(
-             hub_model_path,
-             device_map="auto",
-             torch_dtype=torch.bfloat16,
-             quantization_config=bnb_config,
-             token=os.environ.get('HUGGING_FACE_HUB_TOKEN')
-         )
-         print("Model loaded from the Hub!")
-
-         return True
-     except Exception as e:
-         print(f"Error loading the model: {str(e)}")
-         return False
-
- def process_handwriting(image):
-     """Handler function for the Gradio interface"""
-     global model, processor

      if image is None:
          return "กรุณาอัพโหลดรูปภาพ"

-     try:
-         # Ensure image is in PIL format
-         if not isinstance(image, Image.Image):
-             image = Image.fromarray(image)
-
-         # Convert to RGB if needed
-         if image.mode != "RGB":
-             image = image.convert("RGB")
-
-         prompt = """Transcribe the Thai handwritten text from the provided image.
- Only return the transcription in Thai language."""
-
-         messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {"type": "text", "text": prompt},
-                     {"type": "image", "image": image}
-                 ],
-             }
-         ]
-
-         text = processor.apply_chat_template(messages, tokenize=False)
-         inputs = processor(text=text, images=image, return_tensors="pt")
-         inputs = {k: v.to(model.device) for k, v in inputs.items()}

-         with torch.no_grad():
-             outputs = model.generate(
-                 **inputs,
-                 max_new_tokens=256,
-                 do_sample=False,
-                 pad_token_id=processor.tokenizer.pad_token_id
-             )

-         transcription = processor.decode(outputs[0], skip_special_tokens=True)
-         return transcription.strip()
-
-     except Exception as e:
-         return f"เกิดข้อผิดพลาด: {str(e)}"

- # Initialize application
- print("Starting the application...")
- if load_model_and_processor():
-     demo = gr.Interface(
-         fn=process_handwriting,
-         inputs=gr.Image(type="pil", label="อัพโหลดรูปลายมือเขียนภาษาไทย"),
-         outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
-         title="Thai Handwriting Recognition",
-         description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ"
-     )

-     if __name__ == "__main__":
-         demo.launch()
- else:
-     print("Could not start the application")
  import os
  import torch
+ from transformers import AutoModelForVision2Seq, AutoProcessor
  from PIL import Image
  import gradio as gr

+ # Login to Hugging Face Hub
+ from huggingface_hub import login
+ token = os.environ.get('HUGGING_FACE_HUB_TOKEN')
+ if token:
+     login(token=token)

+ def load_model():
+     base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+     hub_model_path = "Aekanun/thai-handwriting-llm"

+     processor = AutoProcessor.from_pretrained(base_model_path, token=token)
+     model = AutoModelForVision2Seq.from_pretrained(hub_model_path, token=token)

+     return model, processor
+
+ model, processor = load_model()
+
+ def process_image(image):
      if image is None:
          return "กรุณาอัพโหลดรูปภาพ"
+
+     if not isinstance(image, Image.Image):
+         image = Image.fromarray(image)

+     if image.mode != "RGB":
+         image = image.convert("RGB")

+     prompt = "Transcribe the Thai handwritten text from the provided image.\nOnly return the transcription in Thai language."
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image", "image": image}
+             ],
+         }
+     ]

+     text = processor.apply_chat_template(messages, tokenize=False)
+     inputs = processor(text=text, images=image, return_tensors="pt")
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=256,
+             do_sample=False,
+             pad_token_id=processor.tokenizer.pad_token_id
+         )
+
+     transcription = processor.decode(outputs[0], skip_special_tokens=True)
+     return transcription.strip()

+ demo = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil"),
+     outputs="text",
+     title="Thai Handwriting OCR",
+ )

+ if __name__ == "__main__":
+     demo.launch()
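
Note: compared with the previous version, the new app.py drops the 4-bit BitsAndBytes quantization, device_map="auto", and the explicit move of the processor outputs onto the model's device before generate(). As committed, the full-precision 11B vision model is loaded on CPU, which is likely to be slow and memory-hungry. A minimal sketch (not part of this commit) of how those pieces from the previous version could be restored, assuming the Space has a CUDA GPU and the bitsandbytes package installed:

# Sketch only: quantized loading and device placement as in the previous app.py.
import os
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

token = os.environ.get('HUGGING_FACE_HUB_TOKEN')

# 4-bit NF4 quantization config (assumes bitsandbytes is available).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

processor = AutoProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct", token=token
)
model = AutoModelForVision2Seq.from_pretrained(
    "Aekanun/thai-handwriting-llm",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    token=token
)

# Inside process_image(), the processor outputs would then be moved onto the
# model's device before calling generate():
# inputs = {k: v.to(model.device) for k, v in inputs.items()}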