Spaces:

Aekanun
/

Thai-HandWriting-to-Text

Running on Zero

File size: 4,624 Bytes

1a517f1
21cb9fc
592ad8f
21cb9fc
76059b0
59ee2e8
592ad8f
 
79ec84c
b31bef1
79ec84c
21cb9fc
 
a187193
21cb9fc
 
 
 
79ec84c
21cb9fc
76059b0
 
f9d68b0
78744e1
79ec84c
 
76059b0
 
79ec84c
76059b0
79ec84c
21cb9fc
76059b0
 
 
 
 
 
59ee2e8
f9d68b0
76059b0
 
 
f9d68b0
59ee2e8
 
 
 
76059b0
 
 
 
 
59ee2e8
 
 
 
 
 
 
 
 
 
 
76059b0
 
 
 
 
78744e1
21cb9fc
76059b0
 
 
 
 
 
 
 
 
 
 
 
 
21cb9fc
76059b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21cb9fc
b5217a9
21cb9fc
 
76059b0
 
 
 
 
 
 
 
 
 
 
 
21cb9fc
76059b0

import os
import warnings
import torch
import gc
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
from PIL import Image
import gradio as gr
from huggingface_hub import login

# Runtime configuration: silence library warnings and expose only GPU 0
warnings.filterwarnings('ignore')
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Shared state, filled in by load_model_and_processor()
model = processor = None

# Release cached GPU memory when CUDA is present; the status line below is
# printed whether or not a CUDA device was found.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()
print("เคลียร์ CUDA cache เรียบร้อยแล้ว")

# Authenticate against the Hugging Face Hub when a token is configured
_hf_token = os.environ.get('HUGGING_FACE_HUB_TOKEN')
if _hf_token is None:
    print("คำเตือน: ไม่พบ HUGGING_FACE_HUB_TOKEN")
else:
    print("กำลังเข้าสู่ระบบ Hugging Face Hub...")
    login(token=_hf_token)

def load_model_and_processor():
    """Load the processor, the base vision model and the adapter.

    On success the module-level ``model`` and ``processor`` globals are
    set. They are assigned only after every step has succeeded, so a
    failure part-way through never leaves them half-initialised.

    Returns:
        bool: ``True`` when everything loaded; ``False`` if any step
        raised, in which case the error message is printed and the
        globals are left unchanged.
    """
    global model, processor
    print("กำลังโหลดโมเดลและ processor...")
    try:
        # Model paths
        base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
        adapter_path = "Aekanun/thai-handwriting-llm"

        # Load processor from base model.
        # ``token=True`` (use the stored Hub credentials) replaces the
        # deprecated ``use_auth_token=True`` keyword.
        print("กำลังโหลด processor...")
        loaded_processor = AutoProcessor.from_pretrained(
            base_model_path,
            token=True,
        )

        # Load base model
        print("กำลังโหลด base model...")
        base_model = AutoModelForVision2Seq.from_pretrained(
            base_model_path,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            # NOTE(review): kept from the original; this allows execution
            # of code shipped in the model repo -- confirm it is needed.
            trust_remote_code=True,
            token=True,
        )

        # Load adapter on top of the base model
        print("กำลังโหลด adapter...")
        loaded_model = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=True,
        )

        # Publish both objects together, only once loading fully succeeded.
        processor = loaded_processor
        model = loaded_model

        print("โหลดโมเดลสำเร็จ!")
        return True
    except Exception as e:
        # Start-up boundary: report the failure and let the caller decide.
        print(f"เกิดข้อผิดพลาดในการโหลดโมเดล: {str(e)}")
        return False

def process_handwriting(image):
    """Transcribe Thai handwriting from an image (Gradio callback).

    Builds a single-turn chat prompt holding the instruction and the
    image, runs greedy generation with the module-level ``model`` and
    ``processor``, and returns only the newly generated text.

    Args:
        image: A ``PIL.Image.Image``, any array accepted by
            ``PIL.Image.fromarray``, or ``None`` when nothing was uploaded.

    Returns:
        str: The stripped transcription; a Thai message asking for an
        upload when ``image`` is ``None``; or a Thai error message
        containing the exception text if processing raises.
    """
    if image is None:
        return "กรุณาอัพโหลดรูปภาพ"

    try:
        # Ensure image is in PIL format
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Create prompt
        prompt = """Transcribe the Thai handwritten text from the provided image.
Only return the transcription in Thai language."""

        # Create model inputs
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image", "image": image}
                ],
            }
        ]

        # Render the chat template with the assistant header appended so
        # the model starts answering instead of continuing the user turn.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already carries the special tokens, so the
        # processor must not add them a second time.
        inputs = processor(
            text=text,
            images=image,
            add_special_tokens=False,
            return_tensors="pt",
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[-1]

        # Generate (greedy decoding)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id
            )

        # generate() returns prompt + continuation; keep only the new
        # tokens so the instruction text is not echoed back to the user.
        generated_tokens = outputs[0][prompt_length:]
        transcription = processor.decode(
            generated_tokens, skip_special_tokens=True
        )
        return transcription.strip()
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        return f"เกิดข้อผิดพลาด: {str(e)}"

# Application start-up: the UI is built only when the model loaded
print("กำลังเริ่มต้นแอปพลิเคชัน...")
_model_ready = load_model_and_processor()
if not _model_ready:
    print("ไม่สามารถเริ่มต้นแอปพลิเคชันได้")
else:
    # Input and output widgets for the Gradio interface
    _image_input = gr.Image(type="pil", label="อัพโหลดรูปลายมือเขียนภาษาไทย")
    _text_output = gr.Textbox(label="ข้อความที่แปลงได้")

    demo = gr.Interface(
        fn=process_handwriting,
        inputs=_image_input,
        outputs=_text_output,
        title="Thai Handwriting Recognition",
        description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ",
        examples=[["example1.jpg"], ["example2.jpg"]],
    )

    # Serve the interface only when this file is run as a script
    if __name__ == "__main__":
        demo.launch()