#!/usr/bin/env python3
"""
Example usage of MonkeyOCR Vision GGUF model
"""

from llama_cpp import Llama
import base64
from PIL import Image
import io


def encode_image_to_base64(image_path):
    """Convert image to base64 string"""
    with Image.open(image_path) as img:
        # Convert to RGB if necessary
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Save to bytes
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG')
        img_bytes = buffer.getvalue()

        # Encode to base64
        return base64.b64encode(img_bytes).decode('utf-8')


def main():
    # Initialize the model
    print("Loading MonkeyOCR Vision model...")
    llm = Llama(
        model_path="MonkeyOCR-pro-1.2B-Recognition.gguf",
        chat_format="qwen2vl",  # Important: specify vision chat format
        n_ctx=2048,
        verbose=False
    )
    print("Model loaded successfully!")

    # Example 1: Text-only query
    print("\n=== Text-only Example ===")
    response = llm.create_chat_completion(
        messages=[{
            "role": "user",
            "content": "Hello! What can you help me with?"
        }]
    )
    print("Response:", response['choices'][0]['message']['content'])

    # Example 2: Vision + text query (uncomment if you have an image)
    """
    print("\n=== Vision Example ===")
    # Replace with your image path
    image_path = "your_image.jpg"

    response = llm.create_chat_completion(
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What text do you see in this image? Please extract all visible text."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path)}"}}
            ]
        }]
    )
    print("OCR Response:", response['choices'][0]['message']['content'])
    """


if __name__ == "__main__":
    main()
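
# ---------------------------------------------------------------------------
# Note (a hedged sketch, not part of the original example): in many
# llama-cpp-python versions, image input for GGUF vision models requires
# loading the model's multimodal projector (mmproj) through a chat handler,
# not just setting `chat_format`. Llava15ChatHandler is the handler documented
# by llama-cpp-python for LLaVA-style models; whether it applies to this
# Qwen2-VL-based model, and the mmproj filename below, are assumptions --
# check which handlers your installed llama_cpp.llama_chat_format provides:
#
#     from llama_cpp.llama_chat_format import Llava15ChatHandler
#
#     chat_handler = Llava15ChatHandler(
#         clip_model_path="mmproj-MonkeyOCR.gguf"  # assumed filename
#     )
#     llm = Llama(
#         model_path="MonkeyOCR-pro-1.2B-Recognition.gguf",
#         chat_handler=chat_handler,
#         n_ctx=2048,
#         verbose=False,
#     )
#
# With a handler in place, the commented-out vision example above should work
# unchanged, since it already sends the image as a base64 data URL.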