#!/usr/bin/env python3
"""
Working InternVL3-8B implementation based on documentation
"""

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Exactly as in the doc (lines 62-82)
model_path = "/media/jerem/641C8D6C1C8D3A56/hf_cache/models--OpenGVLab--InternVL3-8B-hf/snapshots/259a3b64a14623c0ec91a045cb43f7c5af5fa6af"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Test with leboncoin screenshot
image = Image.open("./Screenshot from 2025-08-14 09-50-26.png")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is the price of this apartment?"},
        ],
    }
]

# Process exactly as in doc: render the chat template to a prompt string,
# then run the processor over the text and image together
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(
    text=text,
    images=image,
    return_tensors="pt",
)

# Move to device AND convert pixel_values to bfloat16 to fix dtype mismatch
inputs = inputs.to(model.device)
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Generate
output = model.generate(**inputs, max_new_tokens=100)
response = processor.decode(output[0], skip_special_tokens=True)
print(f"Response: {response}")
print("\n✅ If this works, we know the exact API to use!")
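
# Optional: the decode above echoes the whole sequence, prompt included,
# because generate() returns prompt + completion for decoder-only models.
# A minimal sketch for printing only the newly generated tokens, assuming
# inputs["input_ids"] holds the full prompt (this slicing step is not in
# the original doc):
prompt_len = inputs["input_ids"].shape[1]
generated_only = processor.decode(output[0][prompt_len:], skip_special_tokens=True)
print(f"Generated only: {generated_only}")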