| import json | |
| import math | |
| import os | |
| import os.path as osp | |
| from copy import deepcopy | |
| from threading import Thread | |
| from typing import List, Optional | |
| import torch | |
| import torchvision | |
| from PIL import Image | |
| from transformers import ( | |
| AutoProcessor, | |
| PretrainedConfig, | |
| PreTrainedModel, | |
| Qwen2Config, | |
| Qwen2ForCausalLM, | |
| Qwen2PreTrainedModel, | |
| TextIteratorStreamer, | |
| ) | |
class VILAConfig(PretrainedConfig):
    """Configuration object for a VILA model.

    Every named constructor argument is stored verbatim as an attribute of
    the same name; no validation or normalisation happens here. Any extra
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "vila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        llm_cfg=None,
        vision_tower_cfg=None,
        mm_projector_cfg=None,
        architectures=None,
        resume_path=None,
        hidden_size=None,
        mm_hidden_size=None,
        image_aspect_ratio=None,
        num_video_frames=None,
        fps=None,
        mm_vision_select_layer=None,
        mm_vision_select_feature=None,
        mm_use_im_start_end=False,
        mm_use_im_patch_token=False,
        mm_projector_lr=None,
        vision_tower_lr=None,
        vision_resolution=None,
        interpolate_mode=None,
        s2=None,
        dynamic_s2=None,
        s2_scales=None,
        s2_max_split_size=None,
        s2_resize_output_to_scale_idx=0,
        min_tiles: Optional[int] = 1,
        max_tiles: Optional[int] = 12,
        num_time_tokens=None,
        time_token_format=None,
        image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
        video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
        **kwargs,
    ):
        """Store the VILA-specific settings, then initialise the base config.

        Args:
            llm_cfg, vision_tower_cfg, mm_projector_cfg: Sub-component
                configurations, stored as given (format not checked here).
            architectures: Forwarded to the base class, which owns the
                ``architectures`` attribute.
            image_encoder, video_encoder: JSON-encoded strings carrying a
                ``_target_`` key (presumably an import path for the encoder
                class -- confirm against the code that consumes them).
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

        All remaining parameters are stored unchanged under the same name.
        """
        # Sub-component configurations.
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg
        self.mm_projector_cfg = mm_projector_cfg
        self.resume_path = resume_path

        # Model / multimodal settings.
        self.hidden_size = hidden_size
        self.mm_hidden_size = mm_hidden_size
        self.image_aspect_ratio = image_aspect_ratio
        self.num_video_frames = num_video_frames
        self.fps = fps
        self.mm_vision_select_layer = mm_vision_select_layer
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_use_im_start_end = mm_use_im_start_end
        self.mm_use_im_patch_token = mm_use_im_patch_token
        self.mm_projector_lr = mm_projector_lr
        self.vision_tower_lr = vision_tower_lr
        self.vision_resolution = vision_resolution
        self.interpolate_mode = interpolate_mode

        # S2 and tiling settings.
        self.s2 = s2
        self.dynamic_s2 = dynamic_s2
        self.s2_scales = s2_scales
        self.s2_max_split_size = s2_max_split_size
        self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
        self.min_tiles = min_tiles
        self.max_tiles = max_tiles

        # Time-token and encoder settings.
        self.num_time_tokens = num_time_tokens
        self.time_token_format = time_token_format
        self.image_encoder = image_encoder
        self.video_encoder = video_encoder

        # Initialise the base class exactly once. ``architectures`` is a named
        # parameter and therefore never part of ``kwargs``; it has to be
        # forwarded explicitly, otherwise the base constructor resets
        # ``self.architectures`` to its default of ``None``.
        super().__init__(architectures=architectures, **kwargs)