Aekanun
/

thai-handwriting-llm

@@ -18,93 +18,32 @@ pipeline_tag: image-to-text
 A LoRA-adapted vision-language model based on Llama-3.2-11B-Vision-Instruct that transcribes Thai handwritten text from images.
-## Model Architecture
-- Base: Llama-3.2-11B-Vision-Instruct
-## Inference
-### Single Image
-```python
-import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
-from peft import PeftModel
-from PIL import Image
-def load_model():
-    # Model paths
-    base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-    adapter_path = "Aekanun/thai-handwriting-llm"
-    # Load processor
-    processor = AutoProcessor.from_pretrained(
-        base_model_path,
-        use_auth_token=True
-    )
-    # Load base model
-    base_model = AutoModelForVision2Seq.from_pretrained(
-        base_model_path,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        use_auth_token=True
-    )
-    # Load adapter
-    model = PeftModel.from_pretrained(
-        base_model,
-        adapter_path,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        use_auth_token=True
-    )
-    return model, processor
-def transcribe_thai_handwriting(image_path, model, processor):
-    # Load and prepare image
-    image = Image.open(image_path)
-    # Create prompt
-    prompt = """Transcribe the Thai handwritten text from the provided image.
-Only return the transcription in Thai language."""
-    # Prepare inputs
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image", "image": image}
-            ],
-        }
-    ]
-    # Build the chat-formatted prompt and tensorize text + image with the processor
-    text = processor.apply_chat_template(messages, tokenize=False)
-    inputs = processor(text=text, images=image, return_tensors="pt")
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    # Generate
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            do_sample=False,
-            pad_token_id=processor.tokenizer.pad_token_id
-        )
-    # Decode output
-    transcription = processor.decode(outputs[0], skip_special_tokens=True)
-    return transcription.strip()
-# Example usage
-if __name__ == "__main__":
-    # Load model
-    model, processor = load_model()
-    # Transcribe image
-    image_path = "path/to/your/image.jpg"
-    result = transcribe_thai_handwriting(image_path, model, processor)
-    print(f"Transcription: {result}")

 A LoRA-adapted vision-language model based on Llama-3.2-11B-Vision-Instruct that transcribes Thai handwritten text from images.
+## Model Description
+- Base Model: Llama-3.2-11B-Vision-Instruct
+- Training Technique: LoRA adaptation
+- Quantization: Supports 4-bit inference
+- Dataset: iapp/thai_handwriting_dataset
+## Demo
+Try the model via our web interface:
+🔗 [Thai-HandWriting-to-Text](https://huggingface.co/spaces/Aekanun/Thai-HandWriting-to-Text)
+### Features
+- Supports both general handwriting and medical prescriptions
+- Simple drag-and-drop interface
+- Real-time text recognition
+- No setup required
+### Example Use Cases
+1. General Thai handwriting transcription
+2. Medical prescription reading
+3. Handwritten document digitization
+## Limitations
+- Designed specifically for Thai handwriting
+- Performance may vary with image quality
+- Requires clear handwriting for best results
+## License
+This model is released under the Apache 2.0 license.