"""
Working InternVL3-8B implementation based on documentation
"""
|
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
|
# Local snapshot of OpenGVLab/InternVL3-8B-hf in the Hugging Face cache.
model_path = "/media/jerem/641C8D6C1C8D3A56/hf_cache/models--OpenGVLab--InternVL3-8B-hf/snapshots/259a3b64a14623c0ec91a045cb43f7c5af5fa6af"
|
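# AutoProcessor bundles the tokenizer and image preprocessor for this checkpoint.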
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) |
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # bf16 halves memory relative to fp32
    device_map="auto",           # place layers on available devices automatically
    trust_remote_code=True,
)
|
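# Test image: a screenshot of an apartment listing (queried below).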
image = Image.open("./Screenshot from 2025-08-14 09-50-26.png") |
|
# A single user turn: the image first, then the question about it.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is the price of this apartment?"},
        ],
    }
]
|
# Render the conversation into a prompt string with image placeholder tokens.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
|
# Tokenize the prompt and preprocess the image into pixel_values tensors.
inputs = processor(
    text=text,
    images=image,
    return_tensors="pt",
)
|
# Move inputs to the model's device; pixel_values must match the model's dtype.
inputs = inputs.to(model.device)
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
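# Generate up to 100 new tokens for the answer.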
output = model.generate(**inputs, max_new_tokens=100)
# Decode only the newly generated tokens, not the echoed prompt.
response = processor.decode(
    output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
|
print(f"Response: {response}") |
print("\n✅ If this works, we know the exact API to use!") |