GVE-3B / modeling_gve.py

Upload folder using huggingface_hub

2e31242 verified 8 days ago

9.48 kB

	import math
	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.nn import CrossEntropyLoss

	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
	from transformers.generation import GenerationMixin
	from transformers.modeling_attn_mask_utils import AttentionMaskConverter
	from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
	from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import (
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	is_flash_attn_2_available,
	is_flash_attn_greater_or_equal_2_10,
	logging,
	replace_return_docstrings,
	)
	from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
	from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration, QWEN2_5_VL_INPUTS_DOCSTRING, Qwen2_5_VLCausalLMOutputWithPast

	# Optional flash-attn symbols. All three names are always bound so that code
	# referencing them can test against `None` instead of hitting a NameError when
	# flash-attn is not installed.
	if is_flash_attn_2_available():
	    from flash_attn import flash_attn_varlen_func
	    from flash_attn.layers.rotary import apply_rotary_emb

	    from transformers.modeling_flash_attention_utils import _flash_attention_forward
	else:
	    flash_attn_varlen_func = None
	    apply_rotary_emb = None
	    # The original fallback re-assigned `flash_attn_varlen_func` here and left
	    # `_flash_attention_forward` undefined.
	    _flash_attention_forward = None

	# Module-level logger, created through the `logging` helper imported from `transformers.utils`.
	logger = logging.get_logger(__name__)

	# Config class name passed as `config_class` to `replace_return_docstrings` on `forward`.
	_CONFIG_FOR_DOC = "Qwen2_5_VLConfig"


	class Qwen25VLForEmbedding(Qwen2_5_VLForConditionalGeneration):
	    """Qwen2.5-VL subclass whose `forward` returns the decoder output as-is.

	    `forward` merges image/video features into the token embeddings, derives the
	    3-axis RoPE position ids, runs `self.model`, and returns that result
	    directly: no LM head is applied and no loss is computed in this method.
	    """

	    _tied_weights_keys = ["lm_head.weight"]
	    config_class = Qwen2_5_VLConfig
	    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]

	    def _merge_visual_features(self, input_ids, inputs_embeds, pixels, grid_thw, token_id, kind):
	        """Encode `pixels` with `self.visual` and scatter the features into the `token_id` slots.

	        Args:
	            input_ids: Token ids used to locate the placeholder positions.
	            inputs_embeds: Token embeddings that receive the visual features.
	            pixels: Pixel tensor handed to `self.visual`.
	            grid_thw: Forwarded to `self.visual` as `grid_thw`.
	            token_id: Placeholder token id to look for in `input_ids`.
	            kind: `"Image"` or `"Video"`; only used to build the error message.

	        Returns:
	            A new embeddings tensor with the placeholder positions overwritten.

	        Raises:
	            ValueError: If the number of placeholder tokens differs from the
	                number of feature rows produced by `self.visual`.
	        """
	        pixels = pixels.type(self.visual.dtype)
	        features = self.visual(pixels, grid_thw=grid_thw)

	        placeholder = input_ids == token_id
	        n_tokens = placeholder.sum().item()
	        n_features = features.shape[0]
	        if n_tokens != n_features:
	            raise ValueError(
	                f"{kind} features and {kind.lower()} tokens do not match: tokens: {n_tokens}, features {n_features}"
	            )

	        # Broadcast the per-token mask over the hidden dimension so that
	        # `masked_scatter` fills whole embedding rows.
	        scatter_mask = placeholder.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
	        features = features.to(inputs_embeds.device, inputs_embeds.dtype)
	        return inputs_embeds.masked_scatter(scatter_mask, features)

	    @add_start_docstrings_to_model_forward(QWEN2_5_VL_INPUTS_DOCSTRING)
	    @replace_return_docstrings(output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC)
	    def forward(
	        self,
	        input_ids: torch.LongTensor = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[List[torch.FloatTensor]] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        pixel_values: Optional[torch.Tensor] = None,
	        pixel_values_videos: Optional[torch.FloatTensor] = None,
	        image_grid_thw: Optional[torch.LongTensor] = None,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	        rope_deltas: Optional[torch.LongTensor] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        second_per_grid_ts: Optional[torch.Tensor] = None,
	    ) -> Union[Tuple, BaseModelOutputWithPast]:
	        r"""
	        Args:
	            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
	                Unused by this method; it is kept in the signature but never read, so no loss is computed here.

	        Returns:

	        Example:

	        ```python
	        >>> from PIL import Image
	        >>> import requests
	        >>> from transformers import AutoProcessor

	        >>> # Placeholder path: substitute the checkpoint this modeling file ships with.
	        >>> model = Qwen25VLForEmbedding.from_pretrained("path/to/checkpoint")
	        >>> processor = AutoProcessor.from_pretrained("path/to/checkpoint")

	        >>> messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image"},
	                    {"type": "text", "text": "What is shown in this image?"},
	                ],
	            },
	        ]
	        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
	        >>> image = Image.open(requests.get(url, stream=True).raw)

	        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")

	        >>> # Forward pass only; the result is whatever `self.model` returns.
	        >>> outputs = model(**inputs)
	        >>> hidden_states = outputs[0]
	        ```"""

	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # NOTE(review): visual inputs are merged only when the embeddings are built
	        # here. Caller-supplied `inputs_embeds` are used untouched, and `input_ids`
	        # (possibly None in that case) is still passed to `get_rope_index` below.
	        if inputs_embeds is None:
	            inputs_embeds = self.model.embed_tokens(input_ids)
	            if pixel_values is not None:
	                inputs_embeds = self._merge_visual_features(
	                    input_ids, inputs_embeds, pixel_values, image_grid_thw, self.config.image_token_id, "Image"
	                )

	            if pixel_values_videos is not None:
	                inputs_embeds = self._merge_visual_features(
	                    input_ids,
	                    inputs_embeds,
	                    pixel_values_videos,
	                    video_grid_thw,
	                    self.config.video_token_id,
	                    "Video",
	                )

	            if attention_mask is not None:
	                attention_mask = attention_mask.to(inputs_embeds.device)

	        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
	        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
	            # calculate RoPE index once per generation in the pre-fill stage only
	            if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
	                position_ids, rope_deltas = self.get_rope_index(
	                    input_ids,
	                    image_grid_thw,
	                    video_grid_thw,
	                    second_per_grid_ts,
	                    attention_mask,
	                )
	                # Cached on the instance so later calls can reuse the offsets.
	                self.rope_deltas = rope_deltas
	            # then use the prev pre-calculated rope-deltas to get the correct position ids
	            else:
	                batch_size, seq_length, _ = inputs_embeds.shape
	                delta = (
	                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
	                    if cache_position is not None
	                    else 0
	                )
	                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
	                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
	                if cache_position is not None:  # otherwise `delta` is an int `0`
	                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
	                position_ids = position_ids.add(delta)
	                # Same ids replicated along a leading axis of size 3.
	                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

	        # Embeddings are already built, so `input_ids` is deliberately withheld.
	        outputs = self.model(
	            input_ids=None,
	            position_ids=position_ids,
	            attention_mask=attention_mask,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	            cache_position=cache_position,
	        )
	        # Returned unchanged: no LM head projection, no loss.
	        return outputs