#!/usr/bin/env python3
"""
Working InternVL3-8B implementation based on the documentation
"""
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
# Exactly as in the doc (lines 62-82)
model_path = "/media/jerem/641C8D6C1C8D3A56/hf_cache/models--OpenGVLab--InternVL3-8B-hf/snapshots/259a3b64a14623c0ec91a045cb43f7c5af5fa6af"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
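# Optional sanity check (a sketch, assuming the usual transformers model
# attributes): confirm the weights actually landed on the GPU in bfloat16
# before running inference.
print(f"Model device: {model.device}, weight dtype: {next(model.parameters()).dtype}")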
# Test with a leboncoin screenshot
image = Image.open("./Screenshot from 2025-08-14 09-50-26.png")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is the price of this apartment?"},
        ],
    }
]
# Process exactly as in the doc
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = processor(
    text=text,
    images=image,
    return_tensors="pt",
)
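# Debug aid (an assumption, not part of the documented flow): list what the
# processor returned, so shape/dtype issues are visible before generate() runs.
for key, value in inputs.items():
    if torch.is_tensor(value):
        print(f"{key}: shape={tuple(value.shape)}, dtype={value.dtype}")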
# Move to device AND convert pixel_values to bfloat16 to fix the dtype mismatch
inputs = inputs.to(model.device)
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
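# Broader alternative (an assumption, not from the doc): cast every
# floating-point tensor to the model's dtype, which also covers processors
# that return extra image tensors beyond pixel_values.
# for key, value in inputs.items():
#     if torch.is_tensor(value) and value.is_floating_point():
#         inputs[key] = value.to(model.dtype)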
# Generate
output = model.generate(**inputs, max_new_tokens=100)
# Decode only the newly generated tokens, not the echoed prompt
response = processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(f"Response: {response}")
print("\n✅ If this works, we know the exact API to use!")