Spaces:

Aekanun
/

Thai-HandWriting-to-Text

Running on Zero

App Files Community

Aekanun committed 15 days ago

Commit

79ec84c

•

1 Parent(s): b5217a9

fixed app.py

Browse files

Files changed (2) hide show

app.py +25 -15
config.json +0 -9

app.py CHANGED Viewed

@@ -5,7 +5,9 @@ import gc
 from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
 warnings.filterwarnings('ignore')
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@@ -13,22 +15,30 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 model = None
 processor = None
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
     gc.collect()
     print("เคลียร์ CUDA cache เรียบร้อยแล้ว")
 def load_model_and_processor():
     """โหลดโมเดลและ processor"""
     global model, processor
     print("กำลังโหลดโมเดลและ processor...")
     try:
-        # กำหนด paths
         base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
         hub_model_path = "Aekanun/thai-handwriting-llm"
-        # ตั้งค่า BitsAndBytes
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=True,
@@ -36,21 +46,19 @@ def load_model_and_processor():
             bnb_4bit_compute_dtype=torch.bfloat16
         )
-        # โหลด processor จาก base model
-        print("Loading processor...")
         processor = AutoProcessor.from_pretrained(base_model_path)
-        # โหลดโมเดลจาก Hub
-        print("Loading model...")
         model = AutoModelForVision2Seq.from_pretrained(
             hub_model_path,
             device_map="auto",
             torch_dtype=torch.bfloat16,
             quantization_config=bnb_config,
-            trust_remote_code=True,
-            force_download=True  # เพิ่มมาเพื่อให้โหลดใหม่
         )
-        print("Model loaded successfully!")
         return True
     except Exception as e:
@@ -68,14 +76,12 @@ def process_handwriting(image):
         # Ensure image is in PIL format
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
-        # Convert to RGB if needed
-        if image.mode != "RGB":
-            image = image.convert("RGB")
         prompt = """Transcribe the Thai handwritten text from the provided image.
 Only return the transcription in Thai language."""
         messages = [
             {
                 "role": "user",
@@ -86,10 +92,12 @@ Only return the transcription in Thai language."""
             }
         ]
         text = processor.apply_chat_template(messages, tokenize=False)
         inputs = processor(text=text, images=image, return_tensors="pt")
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -98,6 +106,7 @@ Only return the transcription in Thai language."""
                 pad_token_id=processor.tokenizer.pad_token_id
             )
         transcription = processor.decode(outputs[0], skip_special_tokens=True)
         return transcription.strip()
@@ -113,7 +122,8 @@ if load_model_and_processor():
         inputs=gr.Image(type="pil", label="อัพโหลดรูปลายมือเขียนภาษาไทย"),
         outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
         title="Thai Handwriting Recognition",
-        description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ"
     )
     if __name__ == "__main__":

 from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
+from huggingface_hub import login
+# Basic settings
 warnings.filterwarnings('ignore')
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 model = None
 processor = None
+# Clear CUDA cache
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
     gc.collect()
     print("เคลียร์ CUDA cache เรียบร้อยแล้ว")
+# Login to Hugging Face Hub
+if 'HUGGING_FACE_HUB_TOKEN' in os.environ:
+    print("กำลังเข้าสู่ระบบ Hugging Face Hub...")
+    login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])
+else:
+    print("คำเตือน: ไม่พบ HUGGING_FACE_HUB_TOKEN")
 def load_model_and_processor():
     """โหลดโมเดลและ processor"""
     global model, processor
     print("กำลังโหลดโมเดลและ processor...")
     try:
+        # Model paths
         base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
         hub_model_path = "Aekanun/thai-handwriting-llm"
+        # BitsAndBytes config
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_compute_dtype=torch.bfloat16
         )
+        # Load processor from base model
         processor = AutoProcessor.from_pretrained(base_model_path)
+        # Load model from Hub
+        print("กำลังโหลดโมเดลจาก Hub...")
         model = AutoModelForVision2Seq.from_pretrained(
             hub_model_path,
             device_map="auto",
             torch_dtype=torch.bfloat16,
             quantization_config=bnb_config,
+            trust_remote_code=True
         )
+        print("โหลดโมเดลสำเร็จ!")
         return True
     except Exception as e:
         # Ensure image is in PIL format
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
+        # Create prompt
         prompt = """Transcribe the Thai handwritten text from the provided image.
 Only return the transcription in Thai language."""
+        # Create model inputs
         messages = [
             {
                 "role": "user",
             }
         ]
+        # Process with model
         text = processor.apply_chat_template(messages, tokenize=False)
         inputs = processor(text=text, images=image, return_tensors="pt")
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 pad_token_id=processor.tokenizer.pad_token_id
             )
+        # Decode output
         transcription = processor.decode(outputs[0], skip_special_tokens=True)
         return transcription.strip()
         inputs=gr.Image(type="pil", label="อัพโหลดรูปลายมือเขียนภาษาไทย"),
         outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
         title="Thai Handwriting Recognition",
+        description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ",
+        examples=[["example1.jpg"], ["example2.jpg"]]
     )
     if __name__ == "__main__":

config.json DELETED Viewed

@@ -1,9 +0,0 @@
-{
-    "architectures": ["LlamaForCausalLM"],
-    "model_type": "llama",
-    "tokenizer_class": "PreTrainedTokenizerFast",
-    "model_max_length": 131072,
-    "megatron_core": "megatron.core",
-    "task_type": "CAUSAL_LM",
-    "target_modules": ["q_proj", "v_proj"]
-}