Aekanun committed on
Commit
11aa08b
1 Parent(s): 4c21d22

revised inference code

Files changed (1)
  1. README.md +76 -57
README.md CHANGED
@@ -27,65 +27,84 @@ A LoRA-adapted vision-language model based on Llama-3.2-11B-Vision-Instruct that
### Single Image
```python
import torch
- from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from PIL import Image

- def load_model_and_processor():
-     model_path = "Aekanun/thai-handwriting-llm"
-     base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-
-     # BitsAndBytes config
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_use_double_quant=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )
-
-     # Load processor from base model
-     processor = AutoProcessor.from_pretrained(base_model_path)
-
-     # Load fine-tuned model
-     model = AutoModelForVision2Seq.from_pretrained(
-         model_path,
-         device_map="auto",
-         torch_dtype=torch.bfloat16,
-         quantization_config=bnb_config
-     )
-     return model, processor

- def process_image(image_path, model, processor):
-     image = Image.open(image_path)
-
-     prompt = """Transcribe the Thai handwritten text from the provided image.
Only return the transcription in Thai language."""

-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": prompt},
-                 {"type": "image", "image": image}
-             ],
-         }
-     ]
-
-     text = processor.apply_chat_template(messages, tokenize=False)
-     inputs = processor(text=text, images=image, return_tensors="pt")
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=256,
-             do_sample=False,
-             pad_token_id=processor.tokenizer.pad_token_id
-         )
-
-     transcription = processor.decode(outputs[0], skip_special_tokens=True)
-     return transcription
-
- # Usage
- model, processor = load_model_and_processor()
- result = process_image("path/to/image.jpg", model, processor)
- print(result)

### Single Image
```python
import torch
+ from transformers import AutoModelForVision2Seq, AutoProcessor
+ from peft import PeftModel
from PIL import Image

+ def load_model():
+     # Model paths
+     base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+     adapter_path = "Aekanun/thai-handwriting-llm"
+
+     # Load processor
+     processor = AutoProcessor.from_pretrained(
+         base_model_path,
+         use_auth_token=True
+     )
+
+     # Load base model
+     base_model = AutoModelForVision2Seq.from_pretrained(
+         base_model_path,
+         device_map="auto",
+         torch_dtype=torch.float16,
+         trust_remote_code=True,
+         use_auth_token=True
+     )
+
+     # Load adapter
+     model = PeftModel.from_pretrained(
+         base_model,
+         adapter_path,
+         device_map="auto",
+         torch_dtype=torch.float16,
+         use_auth_token=True
+     )
+
+     return model, processor

+ def transcribe_thai_handwriting(image_path, model, processor):
+     # Load and prepare image
+     image = Image.open(image_path)
+
+     # Create prompt
+     prompt = """Transcribe the Thai handwritten text from the provided image.
Only return the transcription in Thai language."""
+
+     # Prepare inputs
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image", "image": image}
+             ],
+         }
+     ]
+
+     # Process with model
+     text = processor.apply_chat_template(messages, tokenize=False)
+     inputs = processor(text=text, images=image, return_tensors="pt")
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+     # Generate
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=512,
+             do_sample=False,
+             pad_token_id=processor.tokenizer.pad_token_id
+         )
+
+     # Decode output
+     transcription = processor.decode(outputs[0], skip_special_tokens=True)
+     return transcription.strip()

+ # Example usage
+ if __name__ == "__main__":
+     # Load model
+     model, processor = load_model()
+
+     # Transcribe image
+     image_path = "path/to/your/image.jpg"
+     result = transcribe_thai_handwriting(image_path, model, processor)
+     print(f"Transcription: {result}")