#!/usr/bin/env python3
"""
Working InternVL3-8B implementation based on the documentation
"""
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
# Exactly as in the doc (lines 62-82)
model_path = "/media/jerem/641C8D6C1C8D3A56/hf_cache/models--OpenGVLab--InternVL3-8B-hf/snapshots/259a3b64a14623c0ec91a045cb43f7c5af5fa6af"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
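# Optional sanity check (a sketch, assuming the usual transformers model
# attributes): confirm the weights actually landed on the GPU in bfloat16
# before running inference.
print(f"Model device: {model.device}, weight dtype: {next(model.parameters()).dtype}")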
# Test with a leboncoin screenshot
image = Image.open("./Screenshot from 2025-08-14 09-50-26.png")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is the price of this apartment?"},
        ],
    }
]
# Process exactly as in the doc
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = processor(
    text=text,
    images=image,
    return_tensors="pt",
)
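# Debug aid (an assumption, not part of the documented flow): list what the
# processor returned, so shape/dtype issues are visible before generate() runs.
for key, value in inputs.items():
    if torch.is_tensor(value):
        print(f"{key}: shape={tuple(value.shape)}, dtype={value.dtype}")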
# Move to device AND convert pixel_values to bfloat16 to fix the dtype mismatch
inputs = inputs.to(model.device)
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
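# Broader alternative (an assumption, not from the doc): cast every
# floating-point tensor to the model's dtype, which also covers processors
# that return extra image tensors beyond pixel_values.
# for key, value in inputs.items():
#     if torch.is_tensor(value) and value.is_floating_point():
#         inputs[key] = value.to(model.dtype)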
# Generate
output = model.generate(**inputs, max_new_tokens=100)
# Decode only the newly generated tokens, not the echoed prompt
response = processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(f"Response: {response}")
print("\n✅ If this works, we know the exact API to use!")