#!/usr/bin/env python3
"""
Working InternVL3-8B implementation based on documentation
"""

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Exactly as in the doc (lines 62-82)
model_path = "/media/jerem/641C8D6C1C8D3A56/hf_cache/models--OpenGVLab--InternVL3-8B-hf/snapshots/259a3b64a14623c0ec91a045cb43f7c5af5fa6af"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Test with leboncoin screenshot
image = Image.open("./Screenshot from 2025-08-14 09-50-26.png")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is the price of this apartment?"},
        ],
    }
]

# Process exactly as in doc: render the chat template to a prompt string,
# then run the processor over the text and image together
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(
    text=text,
    images=image,
    return_tensors="pt",
)

# Move to device AND convert pixel_values to bfloat16 to fix dtype mismatch
inputs = inputs.to(model.device)
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Generate
output = model.generate(**inputs, max_new_tokens=100)
response = processor.decode(output[0], skip_special_tokens=True)
print(f"Response: {response}")
print("\n✅ If this works, we know the exact API to use!")
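
# Optional: the decode above echoes the whole sequence, prompt included,
# because generate() returns prompt + completion for decoder-only models.
# A minimal sketch for printing only the newly generated tokens, assuming
# inputs["input_ids"] holds the full prompt (this slicing step is not in
# the original doc):
prompt_len = inputs["input_ids"].shape[1]
generated_only = processor.decode(output[0][prompt_len:], skip_special_tokens=True)
print(f"Generated only: {generated_only}")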