yinbq committed
Commit 8834223 · verified · 1 Parent(s): 0341b51

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +6 -0
  2. assets/arch.png +3 -0
  3. assets/bagel-cot-example.png +3 -0
  4. assets/emerging_curves.png +3 -0
  5. assets/teaser.webp +3 -0
  6. assets/zebra_cot_datacard.png +3 -0
  7. data/__init__.py +2 -0
  8. data/configs/example.yaml +50 -0
  9. data/configs/example_smm_random.yaml +50 -0
  10. data/dataset_base.py +768 -0
  11. data/dataset_info.py +46 -0
  12. data/distributed_iterable_dataset.py +58 -0
  13. data/interleave_datasets/edit_dataset.py +72 -0
  14. data/interleave_datasets/interleave_t2i_dataset.py +218 -0
  15. data/interleave_datasets/think_trace_dataset.py +289 -0
  16. modeling/__init__.py +4 -0
  17. modeling/autoencoder.py +360 -0
  18. modeling/bagel/bagel.py +1068 -0
  19. modeling/bagel/modeling_utils.py +144 -0
  20. modeling/bagel/qwen2_navit.py +1157 -0
  21. modeling/bagel/siglip_navit.py +402 -0
  22. modeling/qwen2/__init__.py +68 -0
  23. modeling/qwen2/configuration_qwen2.py +179 -0
  24. modeling/qwen2/modeling_qwen2.py +929 -0
  25. modeling/qwen2/tokenization_qwen2.py +328 -0
  26. modeling/qwen2/tokenization_qwen2_fast.py +123 -0
  27. modeling/siglip/__init__.py +98 -0
  28. modeling/siglip/configuration_siglip.py +287 -0
  29. modeling/siglip/convert_siglip_to_hf.py +401 -0
  30. modeling/siglip/image_processing_siglip.py +230 -0
  31. modeling/siglip/modeling_siglip.py +1557 -0
  32. modeling/siglip/processing_siglip.py +131 -0
  33. modeling/siglip/tokenization_siglip.py +364 -0
  34. run.err +150 -0
  35. run.out +871 -0
  36. scripts/eval/eval_vlm.sh +27 -0
  37. scripts/eval/run_eval_vlm.sh +19 -0
  38. scripts/eval/run_gedit.sh +57 -0
  39. scripts/eval/run_geneval.sh +41 -0
  40. scripts/eval/run_imgedit.sh +42 -0
  41. scripts/eval/run_kris.sh +50 -0
  42. scripts/eval/run_rise.sh +30 -0
  43. scripts/eval/run_wise.sh +44 -0
  44. scripts/train.sh +48 -0
  45. scripts/train_smm.sh +57 -0
  46. scripts/train_smm_sbatch.sh +85 -0
  47. test_images/image.png +3 -0
  48. test_images/meme.jpg +0 -0
  49. test_images/octupusy.jpg +0 -0
  50. test_images/women.jpg +0 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/arch.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/bagel-cot-example.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/emerging_curves.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/teaser.webp filter=lfs diff=lfs merge=lfs -text
40
+ assets/zebra_cot_datacard.png filter=lfs diff=lfs merge=lfs -text
41
+ test_images/image.png filter=lfs diff=lfs merge=lfs -text
assets/arch.png ADDED

Git LFS Details

  • SHA256: 28affbbfede911a75884bae4e8e1d5b897b8b450fa4c7d9b68818d05492b0967
  • Pointer size: 131 Bytes
  • Size of remote file: 168 kB
assets/bagel-cot-example.png ADDED

Git LFS Details

  • SHA256: e6852144610280fec76591276f090d163479cb54b7e1064e9d9ab77f9fa5e582
  • Pointer size: 132 Bytes
  • Size of remote file: 4.43 MB
assets/emerging_curves.png ADDED

Git LFS Details

  • SHA256: 0c1ddd355742cddb52045ee59098305cc5de8174cb09afa019bb9afefd868733
  • Pointer size: 131 Bytes
  • Size of remote file: 373 kB
assets/teaser.webp ADDED

Git LFS Details

  • SHA256: d679e69a1fbdb7f9abceb59d9bc3d29ab65b7e871ba48b59aec0a7f35defa558
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
assets/zebra_cot_datacard.png ADDED

Git LFS Details

  • SHA256: 13a0df1dd68f77d535d41b2dfcb092c1f015289c4ca326d74322a9c7e98b5b17
  • Pointer size: 132 Bytes
  • Size of remote file: 3.86 MB
data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
data/configs/example.yaml ADDED
@@ -0,0 +1,50 @@
1
+ think_trace:
2
+ dataset_names:
3
+ - think_trace_dataset
4
+ jsonl_path_list: ["/dev/shm/data/Zebra-CoT/zebra_cot.jsonl"]
5
+ num_used_data: None
6
+ image_prefix_dir: "/dev/shm/data/Zebra-CoT"
7
+ image_transform_args:
8
+ image_stride: 16
9
+ max_image_size: 512
10
+ min_image_size: 512
11
+ vit_image_transform_args:
12
+ image_stride: 14
13
+ max_image_size: 512
14
+ min_image_size: 512
15
+ weight: 1.0
16
+ is_mandatory: true
17
+
18
+ # unified_edit:
19
+ # dataset_names:
20
+ # - seedxedit_multi
21
+ # image_transform_args:
22
+ # image_stride: 16
23
+ # max_image_size: 1024
24
+ # min_image_size: 512
25
+ # vit_image_transform_args:
26
+ # image_stride: 14
27
+ # max_image_size: 518
28
+ # min_image_size: 224
29
+ # is_mandatory: true
30
+ # num_used_data:
31
+ # - 10
32
+ # weight: 1
33
+
34
+ # vlm_sft:
35
+ # dataset_names:
36
+ # - llava_ov
37
+ # image_transform_args:
38
+ # image_stride: 14
39
+ # max_image_size: 980
40
+ # min_image_size: 378
41
+ # max_pixels: 2_007_040
42
+ # frame_sampler_args:
43
+ # max_num_frames: 12
44
+ # min_num_frames: 8
45
+ # is_mandatory: true
46
+ # shuffle_lines: True
47
+ # shuffle_seed: 0
48
+ # num_used_data:
49
+ # - 1000
50
+ # weight: 1
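
For orientation (an editorial sketch, not part of this commit): the top-level keys of a config like the one above select classes in data.dataset_info.DATASET_REGISTRY, and the whole mapping is handed to DataConfig/PackedDataset in data/dataset_base.py. A minimal loading sketch, assuming it is run from the repository root:

import yaml

from data.dataset_base import DataConfig

with open("data/configs/example.yaml") as f:
    grouped_datasets = yaml.safe_load(f)

# Note: `num_used_data: None` is parsed by YAML as the *string* "None"; the
# think_trace dataset converts that string back to "use all lines" on its own.
data_config = DataConfig(grouped_datasets=grouped_datasets)
print(data_config.grouped_datasets["think_trace"]["weight"])  # -> 1.0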
data/configs/example_smm_random.yaml ADDED
@@ -0,0 +1,50 @@
1
+ block_dataset_random:
2
+ dataset_names:
3
+ - block_dataset_random
4
+ jsonl_path_list: ["/scratch/by2593/project/SMM/SMM_data/random_block.jsonl"]
5
+ num_used_data: None
6
+ image_prefix_dir: "/scratch/by2593/project/SMM/random_pipeline/random_blocks"
7
+ image_transform_args:
8
+ image_stride: 16
9
+ max_image_size: 512 # VAE uses stride=16, so 512/16 = 32 patches
10
+ min_image_size: 512
11
+ vit_image_transform_args:
12
+ image_stride: 14
13
+ max_image_size: 512 # ViT uses stride=14, so 512/14 ≈ 36 patches (matches the model's capacity)
14
+ min_image_size: 512
15
+ weight: 1.0
16
+ is_mandatory: true
17
+
18
+ # unified_edit:
19
+ # dataset_names:
20
+ # - seedxedit_multi
21
+ # image_transform_args:
22
+ # image_stride: 16
23
+ # max_image_size: 1024
24
+ # min_image_size: 512
25
+ # vit_image_transform_args:
26
+ # image_stride: 14
27
+ # max_image_size: 518
28
+ # min_image_size: 224
29
+ # is_mandatory: true
30
+ # num_used_data:
31
+ # - 10
32
+ # weight: 1
33
+
34
+ # vlm_sft:
35
+ # dataset_names:
36
+ # - llava_ov
37
+ # image_transform_args:
38
+ # image_stride: 14
39
+ # max_image_size: 980
40
+ # min_image_size: 378
41
+ # max_pixels: 2_007_040
42
+ # frame_sampler_args:
43
+ # max_num_frames: 12
44
+ # min_num_frames: 8
45
+ # is_mandatory: true
46
+ # shuffle_lines: True
47
+ # shuffle_seed: 0
48
+ # num_used_data:
49
+ # - 1000
50
+ # weight: 1
data/dataset_base.py ADDED
@@ -0,0 +1,768 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+
5
+ import random
6
+ import json
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ from .data_utils import (
12
+ get_flattened_position_ids_interpolate,
13
+ get_flattened_position_ids_extrapolate,
14
+ len2weight,
15
+ patchify,
16
+ prepare_attention_mask_per_sample,
17
+ )
18
+ from .dataset_info import DATASET_INFO, DATASET_REGISTRY
19
+ from .transforms import ImageTransform
20
+ from .video_utils import FrameSampler
21
+
22
+
23
+ class DataConfig:
24
+ def __init__(
25
+ self,
26
+ grouped_datasets,
27
+ text_cond_dropout_prob=0.1,
28
+ vit_cond_dropout_prob=0.4,
29
+ vae_cond_dropout_prob=0.1,
30
+ vae_image_downsample=16,
31
+ max_latent_size=32,
32
+ vit_patch_size=14,
33
+ max_num_patch_per_side=70,
34
+ ):
35
+ self.grouped_datasets = grouped_datasets
36
+ self.text_cond_dropout_prob = text_cond_dropout_prob
37
+ self.vit_cond_dropout_prob = vit_cond_dropout_prob
38
+ self.vit_patch_size = vit_patch_size
39
+ self.max_num_patch_per_side = max_num_patch_per_side
40
+ self.vae_cond_dropout_prob = vae_cond_dropout_prob
41
+ self.vae_image_downsample = vae_image_downsample
42
+ self.max_latent_size = max_latent_size
43
+
44
+
45
+ class PackedDataset(torch.utils.data.IterableDataset):
46
+ def __init__(
47
+ self,
48
+ data_config,
49
+ tokenizer,
50
+ special_tokens,
51
+ local_rank,
52
+ world_size,
53
+ num_workers,
54
+ expected_num_tokens=32768,
55
+ max_num_tokens_per_sample=16384,
56
+ max_num_tokens=36864,
57
+ prefer_buffer_before=16384,
58
+ max_buffer_size=50,
59
+ interpolate_pos=False,
60
+ use_flex=False,
61
+ data_status=None,
62
+ ):
63
+ super().__init__()
64
+ self.expected_num_tokens = expected_num_tokens
65
+ self.max_num_tokens_per_sample = max_num_tokens_per_sample
66
+ self.prefer_buffer_before = prefer_buffer_before
67
+ self.max_num_tokens = max_num_tokens
68
+ self.max_buffer_size = max_buffer_size
69
+ self.tokenizer = tokenizer
70
+ self.local_rank = local_rank
71
+ self.world_size = world_size
72
+ self.num_workers = num_workers
73
+ self.use_flex = use_flex
74
+ for k, v in special_tokens.items():
75
+ setattr(self, k, v)
76
+
77
+ grouped_datasets, is_mandatory, grouped_weights = self.build_datasets(
78
+ data_config.grouped_datasets, data_status
79
+ )
80
+ self.grouped_datasets = grouped_datasets
81
+ self.dataset_iters = [iter(dataset) for dataset in grouped_datasets]
82
+ self.is_mandatory = is_mandatory
83
+ self.grouped_weights = grouped_weights
84
+ self.data_config = data_config
85
+ self.interpolate_pos = interpolate_pos
86
+ if self.interpolate_pos:
87
+ self.get_flattened_position_ids = get_flattened_position_ids_interpolate
88
+ else:
89
+ self.get_flattened_position_ids = get_flattened_position_ids_extrapolate
90
+
91
+ def build_datasets(self, datasets_metainfo, data_status):
92
+ datasets = []
93
+ is_mandatory = []
94
+ grouped_weights = []
95
+ for grouped_dataset_name, dataset_args in datasets_metainfo.items():
96
+ is_mandatory.append(dataset_args.pop('is_mandatory', False))
97
+ grouped_weights.append(dataset_args.pop('weight', 0.0))
98
+
99
+ if 'frame_sampler_args' in dataset_args.keys():
100
+ frame_sampler = FrameSampler(**dataset_args.pop('frame_sampler_args'))
101
+ dataset_args['frame_sampler'] = frame_sampler
102
+ if 'image_transform_args' in dataset_args.keys():
103
+ transform = ImageTransform(**dataset_args.pop('image_transform_args'))
104
+ dataset_args['transform'] = transform
105
+ if 'vit_image_transform_args' in dataset_args.keys():
106
+ vit_transform = ImageTransform(**dataset_args.pop('vit_image_transform_args'))
107
+ dataset_args['vit_transform'] = vit_transform
108
+
109
+ assert 'dataset_names' in dataset_args.keys()
110
+ dataset_names = dataset_args.pop('dataset_names')
111
+ dataset_args['data_dir_list'] = []
112
+ for item in dataset_names:
113
+ if self.local_rank == 0:
114
+ print(f'Preparing Dataset {grouped_dataset_name}/{item}')
115
+ meta_info = DATASET_INFO[grouped_dataset_name][item]
116
+ dataset_args['data_dir_list'].append(meta_info['data_dir'])
117
+
118
+ if "parquet_info_path" in meta_info.keys():
119
+ if 'parquet_info' not in dataset_args.keys():
120
+ dataset_args['parquet_info'] = {}
121
+ with open(meta_info['parquet_info_path'], 'r') as f:
122
+ parquet_info = json.load(f)
123
+ dataset_args['parquet_info'].update(parquet_info)
124
+
125
+ if 'json_dir' in meta_info.keys():
126
+ # parquet/tar with json
127
+ if 'json_dir_list' not in dataset_args.keys():
128
+ dataset_args['json_dir_list'] = [meta_info['json_dir']]
129
+ else:
130
+ dataset_args['json_dir_list'].append(meta_info['json_dir'])
131
+
132
+ if 'jsonl_path' in meta_info.keys():
133
+ # jsonl with jpeg
134
+ if 'jsonl_path_list' not in dataset_args.keys():
135
+ dataset_args['jsonl_path_list'] = [meta_info['jsonl_path']]
136
+ else:
137
+ dataset_args['jsonl_path_list'].append(meta_info['jsonl_path'])
138
+
139
+ if 'image_prefix_dir' in meta_info.keys():
140
+ dataset_args['image_prefix_dir'] = meta_info['image_prefix_dir']
141
+
142
+ resume_data_status = dataset_args.pop('resume_data_status', True)
143
+ if data_status is not None and grouped_dataset_name in data_status.keys() and resume_data_status:
144
+ data_status_per_group = data_status[grouped_dataset_name]
145
+ else:
146
+ data_status_per_group = None
147
+ dataset = DATASET_REGISTRY[grouped_dataset_name](
148
+ dataset_name=grouped_dataset_name,
149
+ tokenizer=self.tokenizer,
150
+ local_rank=self.local_rank,
151
+ world_size=self.world_size,
152
+ num_workers=self.num_workers,
153
+ data_status=data_status_per_group,
154
+ **dataset_args
155
+ )
156
+ datasets.append(dataset)
157
+
158
+ return datasets, is_mandatory, grouped_weights
159
+
160
+ def set_epoch(self, seed):
161
+ for dataset in self.grouped_datasets:
162
+ dataset.set_epoch(seed)
163
+
164
+ def set_sequence_status(self):
165
+ sequence_status = dict(
166
+ curr = 0,
167
+ sample_lens = list(),
168
+ packed_position_ids = list(),
169
+ nested_attention_masks = list(),
170
+ split_lens = list(),
171
+ attn_modes = list(),
172
+ packed_text_ids = list(),
173
+ packed_text_indexes = list(),
174
+ packed_label_ids = list(),
175
+ ce_loss_indexes = list(),
176
+ ce_loss_weights = list(),
177
+ vae_image_tensors = list(),
178
+ packed_latent_position_ids = list(),
179
+ vae_latent_shapes = list(),
180
+ packed_vae_token_indexes = list(),
181
+ packed_timesteps = list(),
182
+ mse_loss_indexes = list(),
183
+ packed_vit_tokens = list(),
184
+ vit_token_seqlens = list(),
185
+ packed_vit_position_ids = list(),
186
+ packed_vit_token_indexes = list(),
187
+ )
188
+ return sequence_status
189
+
190
+ def to_tensor(self, sequence_status):
191
+ data = dict(
192
+ sequence_length=sum(sequence_status['sample_lens']),
193
+ sample_lens=sequence_status['sample_lens'],
194
+ packed_text_ids=torch.tensor(sequence_status['packed_text_ids']),
195
+ packed_text_indexes=torch.tensor(sequence_status['packed_text_indexes']),
196
+ packed_position_ids=torch.tensor(sequence_status['packed_position_ids']),
197
+ )
198
+ if not self.use_flex:
199
+ data['nested_attention_masks'] = sequence_status['nested_attention_masks']
200
+ else:
201
+ sequence_len = data['sequence_length']
202
+ pad_len = self.max_num_tokens - sequence_len
203
+ data['split_lens'] = sequence_status['split_lens'] + [pad_len]
204
+ data['attn_modes'] = sequence_status['attn_modes'] + ['causal']
205
+ data['sample_lens'] += [pad_len]
206
+
207
+ # if the model has a convnet vae (e.g., as visual tokenizer)
208
+ if len(sequence_status['vae_image_tensors']) > 0:
209
+ image_tensors = sequence_status.pop('vae_image_tensors')
210
+ image_sizes = [item.shape for item in image_tensors]
211
+ max_image_size = [max(item) for item in list(zip(*image_sizes))]
212
+ padded_images = torch.zeros(size=(len(image_tensors), *max_image_size))
213
+ for i, image_tensor in enumerate(image_tensors):
214
+ padded_images[i, :, :image_tensor.shape[1], :image_tensor.shape[2]] = image_tensor
215
+
216
+ data['padded_images'] = padded_images
217
+ data['patchified_vae_latent_shapes'] = sequence_status['vae_latent_shapes']
218
+ data['packed_latent_position_ids'] = torch.cat(sequence_status['packed_latent_position_ids'], dim=0)
219
+ data['packed_vae_token_indexes'] = torch.tensor(sequence_status['packed_vae_token_indexes'])
220
+
221
+ # if the model has a vit (e.g., as visual tokenizer)
222
+ if len(sequence_status['packed_vit_tokens']) > 0:
223
+ data['packed_vit_tokens'] = torch.cat(sequence_status['packed_vit_tokens'], dim=0)
224
+ data['packed_vit_position_ids'] = torch.cat(sequence_status['packed_vit_position_ids'], dim=0)
225
+ data['packed_vit_token_indexes'] = torch.tensor(sequence_status['packed_vit_token_indexes'])
226
+ data['vit_token_seqlens'] = torch.tensor(sequence_status['vit_token_seqlens'])
227
+
228
+ # if the model is required to perform visual generation
229
+ if len(sequence_status['packed_timesteps']) > 0:
230
+ data['packed_timesteps'] = torch.tensor(sequence_status['packed_timesteps'])
231
+ data['mse_loss_indexes'] = torch.tensor(sequence_status['mse_loss_indexes'])
232
+
233
+ # if the model is required to perform text generation
234
+ if len(sequence_status['packed_label_ids']) > 0:
235
+ data['packed_label_ids'] = torch.tensor(sequence_status['packed_label_ids'])
236
+ data['ce_loss_indexes'] = torch.tensor(sequence_status['ce_loss_indexes'])
237
+ data['ce_loss_weights'] = torch.tensor(sequence_status['ce_loss_weights'])
238
+
239
+ # Debug printing for rank 0
240
+ # if self.local_rank == 0:
241
+ # self.print_debug_info(data, sequence_status)
242
+
243
+ return data
244
+
245
+ def print_debug_info(self, data, sequence_status):
246
+ """Print detailed debug information in an intuitive table format"""
247
+ print("\n" + "="*120)
248
+ print("DEBUG: Complete Sequence Analysis")
249
+ print("="*120)
250
+
251
+ # Basic info
252
+ print(f"Sequence Length: {data['sequence_length']}")
253
+ print(f"Sample Lengths: {data['sample_lens']}")
254
+
255
+ # Get all data
256
+ packed_text_ids = data['packed_text_ids'].tolist()
257
+ packed_text_indexes = data['packed_text_indexes'].tolist()
258
+
259
+ # Build loss mappings
260
+ ce_loss_indexes = set(data.get('ce_loss_indexes', []).tolist())
261
+ mse_loss_indexes = set(data.get('mse_loss_indexes', []).tolist())
262
+ vit_token_indexes = set(data.get('packed_vit_token_indexes', []).tolist())
263
+ vae_token_indexes = set(data.get('packed_vae_token_indexes', []).tolist())
264
+
265
+ # Build label mapping
266
+ label_mapping = {}
267
+ if 'ce_loss_indexes' in data:
268
+ ce_indexes = data['ce_loss_indexes'].tolist()
269
+ ce_labels = data['packed_label_ids'].tolist()
270
+ for i, pos in enumerate(ce_indexes):
271
+ label_mapping[pos] = ce_labels[i]
272
+
273
+ # Print raw token sequence
274
+ print(f"\n1. Raw Token IDs: {packed_text_ids}")
275
+
276
+ # Print decoded token sequence
277
+ try:
278
+ decoded_text_tokens = []
279
+ for token_id in packed_text_ids:
280
+ decoded = self.tokenizer.decode([token_id])
281
+ decoded_text_tokens.append(decoded)
282
+ print(f"2. Decoded Tokens: {decoded_text_tokens}")
283
+ except Exception as e:
284
+ print(f"2. Error decoding tokens: {e}")
285
+ decoded_text_tokens = ["<ERROR>"] * len(packed_text_ids)
286
+
287
+ # Create comprehensive sequence table
288
+ print(f"\n3. Complete Sequence Table:")
289
+ print("-" * 120)
290
+ print(f"{'Order':<6} | {'Token Type':<12} | {'Token/Content':<30} | {'Loss Type':<10} | {'Label':<30} | {'Notes':<20}")
291
+ print("-" * 120)
292
+
293
+ # Track text token index
294
+ text_token_idx = 0
295
+
296
+ for pos in range(data['sequence_length']):
297
+ # Determine token type and content
298
+ if pos in packed_text_indexes:
299
+ # This is a text token position
300
+ token_id = packed_text_ids[text_token_idx]
301
+ try:
302
+ decoded_token = self.tokenizer.decode([token_id])
303
+ token_content = f"ID:{token_id} '{decoded_token}'"
304
+ except:
305
+ token_content = f"ID:{token_id} '<ERROR>'"
306
+ token_type = "TEXT"
307
+ text_token_idx += 1
308
+
309
+ elif pos in vit_token_indexes:
310
+ token_type = "VIT_IMAGE"
311
+ token_content = "[VIT Image Patch]"
312
+
313
+ elif pos in vae_token_indexes:
314
+ token_type = "VAE_IMAGE"
315
+ token_content = "[VAE Image Latent]"
316
+
317
+ else:
318
+ token_type = "UNKNOWN"
319
+ token_content = "[Unknown Position]"
320
+
321
+ # Determine loss type
322
+ if pos in ce_loss_indexes:
323
+ loss_type = "CE"
324
+ elif pos in mse_loss_indexes:
325
+ loss_type = "MSE"
326
+ else:
327
+ loss_type = "None"
328
+
329
+ # Determine label
330
+ if pos in label_mapping:
331
+ label_id = label_mapping[pos]
332
+ try:
333
+ decoded_label = self.tokenizer.decode([label_id])
334
+ label_content = f"ID:{label_id} '{decoded_label}'"
335
+ except:
336
+ label_content = f"ID:{label_id} '<ERROR>'"
337
+ elif pos in mse_loss_indexes:
338
+ label_content = "[Image Generation Target]"
339
+ else:
340
+ label_content = "N/A"
341
+
342
+ # Additional notes
343
+ notes = ""
344
+ if pos in mse_loss_indexes and 'packed_timesteps' in data:
345
+ timestep_idx = list(mse_loss_indexes).index(pos) if pos in mse_loss_indexes else -1
346
+ if timestep_idx >= 0 and timestep_idx < len(data['packed_timesteps']):
347
+ timestep = data['packed_timesteps'][timestep_idx].item()
348
+ if timestep == float('-inf'):
349
+ notes = "No noise"
350
+ else:
351
+ notes = f"t={timestep:.3f}"
352
+
353
+ print(f"{pos:<6} | {token_type:<12} | {token_content:<30} | {loss_type:<10} | {label_content:<30} | {notes:<20}")
354
+
355
+ print("-" * 120)
356
+
357
+ # Summary statistics
358
+ total_positions = data['sequence_length']
359
+ ce_positions = len(ce_loss_indexes)
360
+ mse_positions = len(mse_loss_indexes)
361
+ vit_positions = len(vit_token_indexes)
362
+ vae_positions = len(vae_token_indexes)
363
+ text_positions = len(packed_text_indexes)
364
+ no_loss_positions = total_positions - ce_positions - mse_positions
365
+
366
+ print(f"\nSummary Statistics:")
367
+ print(f" Total positions: {total_positions}")
368
+ print(f" Text tokens: {text_positions} ({text_positions/total_positions*100:.1f}%)")
369
+ print(f" VIT image tokens: {vit_positions} ({vit_positions/total_positions*100:.1f}%)")
370
+ print(f" VAE image tokens: {vae_positions} ({vae_positions/total_positions*100:.1f}%)")
371
+ print(f" Positions with CE loss: {ce_positions} ({ce_positions/total_positions*100:.1f}%)")
372
+ print(f" Positions with MSE loss: {mse_positions} ({mse_positions/total_positions*100:.1f}%)")
373
+ print(f" Positions with no loss: {no_loss_positions} ({no_loss_positions/total_positions*100:.1f}%)")
374
+
375
+ print("="*120 + "\n")
376
+
377
+ def __iter__(self):
378
+ total_weights = sum(self.grouped_weights)
379
+ assert total_weights > 0.0
380
+ group_cumprobs = [sum(self.grouped_weights[:i + 1]) / total_weights
381
+ for i in range(len(self.grouped_weights))]
382
+ sequence_status = self.set_sequence_status()
383
+ batch_data_indexes = []
384
+
385
+ buffer = []
386
+ while True:
387
+ # Ensure at least one sample from each group
388
+ if sequence_status['curr'] == 0:
389
+ for group_index, group_iter in enumerate(self.dataset_iters):
390
+ if self.is_mandatory[group_index]:
391
+ while True:
392
+ sample = next(group_iter)
393
+ # if a sample is too long, skip it
394
+ num_tokens = sample['num_tokens'] + 2 * len(sample['sequence_plan'])
395
+ if num_tokens < self.max_num_tokens_per_sample:
396
+ sequence_status = self.pack_sequence(sample, sequence_status)
397
+ batch_data_indexes.append(sample['data_indexes'])
398
+ break
399
+ else:
400
+ print(f"skip a sample with length {num_tokens}")
401
+ continue
402
+
403
+ if sequence_status['curr'] < self.prefer_buffer_before and len(buffer) > 0:
404
+ sample = buffer.pop(0)
405
+ sample_from_buffer = True
406
+ else:
407
+ # sample normally across all groups
408
+ n = random.random()
409
+ group_index = 0
410
+ for i, cumprob in enumerate(group_cumprobs):
411
+ if n < cumprob:
412
+ group_index = i
413
+ break
414
+ sample = next(self.dataset_iters[group_index])
415
+ sample_from_buffer = False
416
+
417
+ # if a sample is too long, skip it
418
+ num_tokens = sample['num_tokens'] + 2 * len(sample['sequence_plan'])
419
+ if num_tokens > self.max_num_tokens_per_sample:
420
+ print(f"skip a sample with length {num_tokens}")
421
+ continue
422
+
423
+ if sequence_status['curr'] + num_tokens > self.max_num_tokens:
424
+ if len(buffer) < self.max_buffer_size and not sample_from_buffer:
425
+ buffer.append(sample)
426
+ else:
427
+ print(f"Yielding data with length {sum(sequence_status['sample_lens'])}")
428
+ data = self.to_tensor(sequence_status)
429
+ data['batch_data_indexes'] = batch_data_indexes
430
+ yield data
431
+ sequence_status = self.set_sequence_status()
432
+ batch_data_indexes = []
433
+ continue
434
+
435
+ sequence_status = self.pack_sequence(sample, sequence_status)
436
+ batch_data_indexes.append(sample['data_indexes'])
437
+
438
+ if sequence_status['curr'] >= self.expected_num_tokens:
439
+ data = self.to_tensor(sequence_status)
440
+ data['batch_data_indexes'] = batch_data_indexes
441
+ yield data
442
+ sequence_status = self.set_sequence_status()
443
+ batch_data_indexes = []
444
+
445
+ def pack_sequence(self, sample, sequence_status):
446
+ image_tensor_list = sample['image_tensor_list']
447
+ text_ids_list = sample['text_ids_list']
448
+ sequence_plan = sample['sequence_plan']
449
+
450
+ split_lens, attn_modes = list(), list()
451
+ curr = sequence_status['curr']
452
+ curr_rope_id = 0
453
+ sample_lens = 0
454
+
455
+ for item in sequence_plan:
456
+ split_start = item.get('split_start', True)
457
+ if split_start:
458
+ curr_split_len = 0
459
+
460
+ if item['type'] == 'text':
461
+ text_ids = text_ids_list.pop(0)
462
+ if item['enable_cfg'] == 1 and random.random() < self.data_config.text_cond_dropout_prob:
463
+ continue
464
+
465
+ shifted_text_ids = [self.bos_token_id] + text_ids
466
+ sequence_status['packed_text_ids'].extend(shifted_text_ids)
467
+ sequence_status['packed_text_indexes'].extend(range(curr, curr + len(shifted_text_ids)))
468
+ if item['loss'] == 1:
469
+ sequence_status['ce_loss_indexes'].extend(range(curr, curr + len(shifted_text_ids)))
470
+ sequence_status['ce_loss_weights'].extend(
471
+ [len2weight(len(shifted_text_ids))] * len(shifted_text_ids)
472
+ )
473
+ sequence_status['packed_label_ids'].extend(text_ids + [self.eos_token_id])
474
+ curr += len(shifted_text_ids)
475
+ curr_split_len += len(shifted_text_ids)
476
+
477
+ # add a <|im_end|> token
478
+ sequence_status['packed_text_ids'].append(self.eos_token_id)
479
+ sequence_status['packed_text_indexes'].append(curr)
480
+ if item['special_token_loss'] == 1: # <|im_end|> may have loss
481
+ sequence_status['ce_loss_indexes'].append(curr)
482
+ sequence_status['ce_loss_weights'].append(1.0)
483
+ sequence_status['packed_label_ids'].append(item['special_token_label'])
484
+ curr += 1
485
+ curr_split_len += 1
486
+
487
+ # update sequence status
488
+ attn_modes.append("causal")
489
+ sequence_status['packed_position_ids'].extend(range(curr_rope_id, curr_rope_id + curr_split_len))
490
+ curr_rope_id += curr_split_len
491
+
492
+ elif item['type'] == 'vit_image':
493
+ image_tensor = image_tensor_list.pop(0)
494
+ if item['enable_cfg'] == 1 and random.random() < self.data_config.vit_cond_dropout_prob:
495
+ curr_rope_id += 1
496
+ continue
497
+
498
+ # add a <|startofimage|> token
499
+ sequence_status['packed_text_ids'].append(self.start_of_image)
500
+ sequence_status['packed_text_indexes'].append(curr)
501
+ curr += 1
502
+ curr_split_len += 1
503
+
504
+ # preprocess image
505
+ vit_tokens = patchify(image_tensor, self.data_config.vit_patch_size)
506
+ num_img_tokens = vit_tokens.shape[0]
507
+ sequence_status['packed_vit_token_indexes'].extend(range(curr, curr + num_img_tokens))
508
+ curr += num_img_tokens
509
+ curr_split_len += num_img_tokens
510
+
511
+ sequence_status['packed_vit_tokens'].append(vit_tokens)
512
+ sequence_status['vit_token_seqlens'].append(num_img_tokens)
513
+ sequence_status['packed_vit_position_ids'].append(
514
+ self.get_flattened_position_ids(
515
+ image_tensor.size(1), image_tensor.size(2),
516
+ self.data_config.vit_patch_size,
517
+ max_num_patches_per_side=self.data_config.max_num_patch_per_side
518
+ )
519
+ )
520
+
521
+ # add a <|endofimage|> token
522
+ sequence_status['packed_text_ids'].append(self.end_of_image)
523
+ sequence_status['packed_text_indexes'].append(curr)
524
+ if item['special_token_loss'] == 1: # <|endofimage|> may have loss
525
+ sequence_status['ce_loss_indexes'].append(curr)
526
+ sequence_status['ce_loss_weights'].append(1.0)
527
+ sequence_status['packed_label_ids'].append(item['special_token_label'])
528
+ curr += 1
529
+ curr_split_len += 1
530
+
531
+ # update sequence status
532
+ attn_modes.append("full")
533
+ sequence_status['packed_position_ids'].extend([curr_rope_id] * curr_split_len)
534
+ curr_rope_id += 1
535
+
536
+ elif item['type'] == 'vae_image':
537
+ image_tensor = image_tensor_list.pop(0)
538
+ if item['enable_cfg'] == 1 and random.random() < self.data_config.vae_cond_dropout_prob:
539
+ # FIXME fix vae dropout in video2video setting.
540
+ curr_rope_id += 1
541
+ continue
542
+
543
+
544
+
545
+
546
+ # add a <|startofimage|> token
547
+ sequence_status['packed_text_ids'].append(self.start_of_image)
548
+ sequence_status['packed_text_indexes'].append(curr)
549
+
550
+ if item['special_token_loss'] == 1:
551
+ sequence_status['ce_loss_indexes'].append(curr)
552
+ sequence_status['ce_loss_weights'].append(1.0)
553
+ sequence_status['packed_label_ids'].append(item['special_token_label'])
554
+
555
+ curr += 1
556
+ curr_split_len += 1
557
+
558
+ # preprocess image
559
+ sequence_status['vae_image_tensors'].append(image_tensor)
560
+ sequence_status['packed_latent_position_ids'].append(
561
+ self.get_flattened_position_ids(
562
+ image_tensor.size(1), image_tensor.size(2),
563
+ self.data_config.vae_image_downsample,
564
+ max_num_patches_per_side=self.data_config.max_latent_size
565
+ )
566
+ )
567
+ H, W = image_tensor.shape[1:]
568
+ h = H // self.data_config.vae_image_downsample
569
+ w = W // self.data_config.vae_image_downsample
570
+ sequence_status['vae_latent_shapes'].append((h, w))
571
+
572
+ num_img_tokens = w * h
573
+ sequence_status['packed_vae_token_indexes'].extend(range(curr, curr + num_img_tokens))
574
+ if item['loss'] == 1:
575
+ sequence_status['mse_loss_indexes'].extend(range(curr, curr + num_img_tokens))
576
+ if split_start:
577
+ timestep = np.random.randn()
578
+ else:
579
+ timestep = float('-inf')
580
+
581
+ sequence_status['packed_timesteps'].extend([timestep] * num_img_tokens)
582
+ curr += num_img_tokens
583
+ curr_split_len += num_img_tokens
584
+
585
+ # add a <|endofimage|> token
586
+ sequence_status['packed_text_ids'].append(self.end_of_image)
587
+ sequence_status['packed_text_indexes'].append(curr)
588
+ # <|endofimage|> may have loss
589
+ if item['special_token_loss'] == 1:
590
+ sequence_status['ce_loss_indexes'].append(curr)
591
+ sequence_status['ce_loss_weights'].append(1.0)
592
+ sequence_status['packed_label_ids'].append(item['special_token_label'])
593
+ curr += 1
594
+ curr_split_len += 1
595
+
596
+ # update sequence status
597
+ if split_start:
598
+ if item['loss'] == 1 and 'frame_delta' not in item.keys():
599
+ attn_modes.append("noise")
600
+ else:
601
+ attn_modes.append("full")
602
+ sequence_status['packed_position_ids'].extend([curr_rope_id] * (num_img_tokens + 2))
603
+ if 'frame_delta' in item.keys():
604
+ curr_rope_id += item['frame_delta']
605
+ elif item['loss'] == 0:
606
+ curr_rope_id += 1
607
+
608
+ if item.get('split_end', True):
609
+ split_lens.append(curr_split_len)
610
+ sample_lens += curr_split_len
611
+
612
+ sequence_status['curr'] = curr
613
+ sequence_status['sample_lens'].append(sample_lens)
614
+ # prepare attention mask
615
+ if not self.use_flex:
616
+ sequence_status['nested_attention_masks'].append(
617
+ prepare_attention_mask_per_sample(split_lens, attn_modes)
618
+ )
619
+ else:
620
+ sequence_status['split_lens'].extend(split_lens)
621
+ sequence_status['attn_modes'].extend(attn_modes)
622
+
623
+ return sequence_status
624
+
625
+
626
+ class SimpleCustomBatch:
627
+ def __init__(self, batch):
628
+ data = batch[0]
629
+ self.batch_data_indexes = data['batch_data_indexes']
630
+ self.sequence_length = data["sequence_length"]
631
+ self.sample_lens = data["sample_lens"]
632
+ self.packed_text_ids = data["packed_text_ids"]
633
+ self.packed_text_indexes = data["packed_text_indexes"]
634
+ self.packed_position_ids = data["packed_position_ids"]
635
+
636
+ self.use_flex = "nested_attention_masks" not in data.keys()
637
+
638
+ if self.use_flex:
639
+ self.split_lens = data["split_lens"]
640
+ self.attn_modes = data["attn_modes"]
641
+ else:
642
+ self.nested_attention_masks = data["nested_attention_masks"]
643
+
644
+ if "padded_images" in data.keys():
645
+ self.padded_images = data["padded_images"]
646
+ self.patchified_vae_latent_shapes = data["patchified_vae_latent_shapes"]
647
+ self.packed_latent_position_ids = data["packed_latent_position_ids"]
648
+ self.packed_vae_token_indexes = data["packed_vae_token_indexes"]
649
+
650
+ if "packed_vit_tokens" in data.keys():
651
+ self.packed_vit_tokens = data["packed_vit_tokens"]
652
+ self.packed_vit_position_ids = data["packed_vit_position_ids"]
653
+ self.packed_vit_token_indexes = data["packed_vit_token_indexes"]
654
+ self.vit_token_seqlens = data["vit_token_seqlens"]
655
+
656
+ if "packed_timesteps" in data.keys():
657
+ self.packed_timesteps = data["packed_timesteps"]
658
+ self.mse_loss_indexes = data["mse_loss_indexes"]
659
+
660
+ if "packed_label_ids" in data.keys():
661
+ self.packed_label_ids = data["packed_label_ids"]
662
+ self.ce_loss_indexes = data["ce_loss_indexes"]
663
+ self.ce_loss_weights = data["ce_loss_weights"]
664
+
665
+ def pin_memory(self):
666
+ self.packed_text_ids = self.packed_text_ids.pin_memory()
667
+ self.packed_text_indexes = self.packed_text_indexes.pin_memory()
668
+ self.packed_position_ids = self.packed_position_ids.pin_memory()
669
+
670
+ if not self.use_flex:
671
+ self.nested_attention_masks = [item.pin_memory() for item in self.nested_attention_masks]
672
+
673
+ if hasattr(self, 'padded_images'):
674
+ self.padded_images = self.padded_images.pin_memory()
675
+ self.packed_vae_token_indexes = self.packed_vae_token_indexes.pin_memory()
676
+ self.packed_latent_position_ids = self.packed_latent_position_ids.pin_memory()
677
+
678
+ if hasattr(self, 'packed_timesteps'):
679
+ self.packed_timesteps = self.packed_timesteps.pin_memory()
680
+ self.mse_loss_indexes = self.mse_loss_indexes.pin_memory()
681
+
682
+ if hasattr(self, 'packed_vit_tokens'):
683
+ self.packed_vit_tokens = self.packed_vit_tokens.pin_memory()
684
+ self.packed_vit_position_ids = self.packed_vit_position_ids.pin_memory()
685
+ self.packed_vit_token_indexes = self.packed_vit_token_indexes.pin_memory()
686
+ self.vit_token_seqlens = self.vit_token_seqlens.pin_memory()
687
+
688
+ if hasattr(self, 'packed_label_ids'):
689
+ self.packed_label_ids = self.packed_label_ids.pin_memory()
690
+ self.ce_loss_indexes = self.ce_loss_indexes.pin_memory()
691
+ self.ce_loss_weights = self.ce_loss_weights.pin_memory()
692
+
693
+ return self
694
+
695
+ def cuda(self, device):
696
+ self.packed_text_ids = self.packed_text_ids.to(device)
697
+ self.packed_text_indexes = self.packed_text_indexes.to(device)
698
+ self.packed_position_ids = self.packed_position_ids.to(device)
699
+
700
+ if not self.use_flex:
701
+ self.nested_attention_masks = [item.to(device) for item in self.nested_attention_masks]
702
+
703
+ if hasattr(self, 'padded_images'):
704
+ self.padded_images = self.padded_images.to(device)
705
+ self.packed_vae_token_indexes = self.packed_vae_token_indexes.to(device)
706
+ self.packed_latent_position_ids = self.packed_latent_position_ids.to(device)
707
+
708
+ if hasattr(self, 'packed_timesteps'):
709
+ self.packed_timesteps = self.packed_timesteps.to(device)
710
+ self.mse_loss_indexes = self.mse_loss_indexes.to(device)
711
+
712
+ if hasattr(self, 'packed_vit_tokens'):
713
+ self.packed_vit_tokens = self.packed_vit_tokens.to(device)
714
+ self.packed_vit_position_ids = self.packed_vit_position_ids.to(device)
715
+ self.packed_vit_token_indexes = self.packed_vit_token_indexes.to(device)
716
+ self.vit_token_seqlens = self.vit_token_seqlens.to(device)
717
+
718
+ if hasattr(self, 'packed_label_ids'):
719
+ self.packed_label_ids = self.packed_label_ids.to(device)
720
+ self.ce_loss_indexes = self.ce_loss_indexes.to(device)
721
+ self.ce_loss_weights = self.ce_loss_weights.to(device)
722
+
723
+ return self
724
+
725
+ def to_dict(self):
726
+ data = dict(
727
+ sequence_length = self.sequence_length,
728
+ sample_lens = self.sample_lens,
729
+ packed_text_ids = self.packed_text_ids,
730
+ packed_text_indexes = self.packed_text_indexes,
731
+ packed_position_ids = self.packed_position_ids,
732
+ batch_data_indexes = self.batch_data_indexes,
733
+ )
734
+
735
+ if not self.use_flex:
736
+ data['nested_attention_masks'] = self.nested_attention_masks
737
+ else:
738
+ data['split_lens'] = self.split_lens
739
+ data['attn_modes'] = self.attn_modes
740
+
741
+ if hasattr(self, 'padded_images'):
742
+ data['padded_images'] = self.padded_images
743
+ data['patchified_vae_latent_shapes'] = self.patchified_vae_latent_shapes
744
+ data['packed_latent_position_ids'] = self.packed_latent_position_ids
745
+ data['packed_vae_token_indexes'] = self.packed_vae_token_indexes
746
+
747
+ if hasattr(self, 'packed_vit_tokens'):
748
+ data['packed_vit_tokens'] = self.packed_vit_tokens
749
+ data['packed_vit_position_ids'] = self.packed_vit_position_ids
750
+ data['packed_vit_token_indexes'] = self.packed_vit_token_indexes
751
+ data['vit_token_seqlens'] = self.vit_token_seqlens
752
+
753
+ if hasattr(self, 'packed_timesteps'):
754
+ data['packed_timesteps'] = self.packed_timesteps
755
+ data['mse_loss_indexes'] = self.mse_loss_indexes
756
+
757
+ if hasattr(self, 'packed_label_ids'):
758
+ data['packed_label_ids'] = self.packed_label_ids
759
+ data['ce_loss_indexes'] = self.ce_loss_indexes
760
+ data['ce_loss_weights'] = self.ce_loss_weights
761
+
762
+ return data
763
+
764
+
765
+ def collate_wrapper():
766
+ def collate_fn(batch):
767
+ return SimpleCustomBatch(batch)
768
+ return collate_fn
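
A usage sketch (editorial, not part of this commit) of how collate_wrapper and SimpleCustomBatch above are meant to be wired into a torch DataLoader; the worker count and device id below are placeholders:

import torch

from data.dataset_base import PackedDataset, collate_wrapper

def build_loader(packed_dataset: PackedDataset) -> torch.utils.data.DataLoader:
    # batch_size=1 because PackedDataset already yields a fully packed sequence per item.
    return torch.utils.data.DataLoader(
        packed_dataset,
        batch_size=1,
        num_workers=1,
        collate_fn=collate_wrapper(),  # wraps the packed dict in a SimpleCustomBatch
        pin_memory=True,               # triggers SimpleCustomBatch.pin_memory()
    )

def first_batch_as_dict(loader: torch.utils.data.DataLoader) -> dict:
    batch = next(iter(loader)).cuda(torch.device("cuda:0"))  # SimpleCustomBatch.cuda()
    return batch.to_dict()  # plain dict of packed tensors for the model forward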
data/dataset_info.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from .interleave_datasets import UnifiedEditIterableDataset
5
+ from .t2i_dataset import T2IIterableDataset
6
+ from .vlm_dataset import SftJSONLIterableDataset
7
+ from .interleave_datasets.think_trace_dataset import ThinkTraceJSONLIterableDataset
8
+
9
+
10
+ DATASET_REGISTRY = {
11
+ 't2i_pretrain': T2IIterableDataset,
12
+ 'vlm_sft': SftJSONLIterableDataset,
13
+ 'unified_edit': UnifiedEditIterableDataset,
14
+ 'think_trace': ThinkTraceJSONLIterableDataset,
15
+ 'block_dataset': ThinkTraceJSONLIterableDataset,
16
+ 'block_dataset_random': ThinkTraceJSONLIterableDataset,
17
+ }
18
+
19
+
20
+ DATASET_INFO = {
21
+ 'think_trace': {
22
+ 'think_trace_dataset': {
23
+ 'data_dir': '/scratch/by2593/project/SpaCU/interleaved-co3dv2/data',
24
+ 'jsonl_path': '/scratch/by2593/project/SpaCU/interleaved-co3dv2/data/merged_train.jsonl',
25
+ 'image_prefix_dir': '/scratch/by2593/project/SpaCU/restored_data2', # Base path for relative image paths
26
+ # 'num_total_samples': 100,
27
+ },
28
+ },
29
+ 'block_dataset': {
30
+ 'block_dataset': {
31
+ 'data_dir': "/scratch/by2593/project/SMM/semantic_blocks_part1",
32
+ # 'jsonl_path': '/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1_v2_reordered.jsonl',
33
+ 'jsonl_path': '/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl',
34
+ 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', # Base path for relative image paths
35
+ # 'num_total_samples': 100,
36
+ },
37
+ },
38
+ 'block_dataset_random': {
39
+ 'block_dataset_random': {
40
+ 'data_dir': "/scratch/by2593/project/SMM/random_pipeline/random_blocks",
41
+ 'jsonl_path': '/scratch/by2593/project/SMM/SMM_data/random_block.jsonl',
42
+ 'image_prefix_dir': '/scratch/by2593/project/SMM/random_pipeline/random_blocks', # Base path for relative image paths
43
+ # 'num_total_samples': 100,
44
+ },
45
+ },
46
+ }
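
To extend this registry, a new group needs an entry in both dicts plus a matching top-level key in the training YAML. A hypothetical example (the group name and all paths below are placeholders, not part of this commit):

from data.dataset_info import DATASET_INFO, DATASET_REGISTRY
from data.interleave_datasets.think_trace_dataset import ThinkTraceJSONLIterableDataset

DATASET_REGISTRY['my_trace'] = ThinkTraceJSONLIterableDataset
DATASET_INFO['my_trace'] = {
    'my_trace_subset': {
        'data_dir': '/path/to/my_trace',
        'jsonl_path': '/path/to/my_trace/train.jsonl',
        'image_prefix_dir': '/path/to/my_trace/images',  # base path for relative image paths
    },
}
# A config YAML then selects this group with a top-level `my_trace:` block whose
# `dataset_names` list contains `my_trace_subset`.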
data/distributed_iterable_dataset.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import random
5
+ import torch
6
+
7
+
8
+ class DistributedIterableDataset(torch.utils.data.IterableDataset):
9
+ def __init__(self, dataset_name, local_rank=0, world_size=1, num_workers=8):
10
+ self.dataset_name = dataset_name
11
+ self.local_rank = local_rank
12
+ self.world_size = world_size
13
+ self.num_workers = num_workers
14
+ self.rng = random.Random()
15
+ self.data_paths = None
16
+
17
+ def get_data_paths(self, *args, **kwargs):
18
+ raise NotImplementedError
19
+
20
+ def set_epoch(self, seed=42):
21
+ if self.data_paths is None:
22
+ return
23
+
24
+ if isinstance(self.data_paths[0], tuple):
25
+ data_paths = sorted(self.data_paths, key=lambda x: (x[0], x[1]))
26
+ elif isinstance(self.data_paths[0], str):
27
+ data_paths = sorted(self.data_paths)
28
+ else:
29
+ raise ValueError(f"Unknown data_paths type: {type(self.data_paths[0])}")
30
+
31
+ self.rng.seed(seed)
32
+ self.rng.shuffle(data_paths)
33
+
34
+ num_files_per_rank = len(data_paths) // self.world_size
35
+ local_start = self.local_rank * num_files_per_rank
36
+ local_end = (self.local_rank + 1) * num_files_per_rank
37
+ self.num_files_per_rank = num_files_per_rank
38
+ self.data_paths_per_rank = data_paths[local_start:local_end]
39
+
40
+ def get_data_paths_per_worker(self):
41
+ if self.data_paths is None:
42
+ return None
43
+
44
+ info = torch.utils.data.get_worker_info()
45
+ if info is None:
46
+ # Single worker: Use all files assigned to the rank
47
+ return self.data_paths_per_rank, 0
48
+
49
+ worker_id = info.id
50
+ num_files_per_worker = self.num_files_per_rank // info.num_workers
51
+ start = num_files_per_worker * worker_id
52
+ end = num_files_per_worker * (worker_id + 1)
53
+ data_paths_per_worker = self.data_paths_per_rank[start:end]
54
+
55
+ return data_paths_per_worker[::-1], worker_id
56
+
57
+ def __iter__(self):
58
+ raise NotImplementedError
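
Worth noting about the sharding above (editorial note): each rank keeps len(data_paths) // world_size files, and each DataLoader worker keeps a further 1/num_workers slice, so both integer divisions silently drop remainder files. A small sanity-check sketch:

def files_per_worker(num_files: int, world_size: int, num_workers: int) -> int:
    # Mirrors set_epoch() and get_data_paths_per_worker() above.
    return (num_files // world_size) // num_workers

# 1000 files over 8 ranks x 4 workers: 1000 // 8 = 125, 125 // 4 = 31,
# so 8 * 4 * 31 = 992 files are read per epoch and 8 are dropped.
assert files_per_worker(1000, world_size=8, num_workers=4) == 31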
data/interleave_datasets/edit_dataset.py ADDED
@@ -0,0 +1,72 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import io
5
+ import random
6
+ from PIL import Image, ImageFile, PngImagePlugin
7
+
8
+ from .interleave_t2i_dataset import InterleavedBaseIterableDataset, ParquetStandardIterableDataset
9
+ from ..data_utils import pil_img2rgb
10
+
11
+
12
+ Image.MAX_IMAGE_PIXELS = 200000000
13
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
14
+ MaximumDecompressedSize = 1024
15
+ MegaByte = 2 ** 20
16
+ PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
17
+
18
+
19
+ class UnifiedEditIterableDataset(InterleavedBaseIterableDataset, ParquetStandardIterableDataset):
20
+
21
+ def parse_row(self, row):
22
+ image_num = len(row["image_list"])
23
+ # randomly choose start and end, return [0, 1] when only two images
24
+ start_idx = random.choice(range(image_num - 1))
25
+ max_end = min(start_idx + 3, image_num)
26
+ end_idx = random.choice(range(start_idx + 1, max_end))
27
+
28
+ data = self._init_data()
29
+ data = self._add_image(
30
+ data,
31
+ pil_img2rgb(Image.open(io.BytesIO(row["image_list"][start_idx]))),
32
+ need_loss=False,
33
+ need_vae=True,
34
+ need_vit=True,
35
+ )
36
+
37
+ if end_idx - start_idx > 1 and random.random() < 0.5: # concatenate multiple instructions
38
+ if end_idx == image_num - 1:
39
+ end_idx -= 1
40
+
41
+ instruction = ""
42
+ for idx in range(start_idx + 1, end_idx + 1):
43
+ instruction += random.choice(row["instruction_list"][idx-1]) + ". "
44
+ data = self._add_text(data, instruction.rstrip(), need_loss=False)
45
+ data = self._add_image(
46
+ data,
47
+ pil_img2rgb(Image.open(io.BytesIO(row["image_list"][end_idx]))),
48
+ need_loss=True,
49
+ need_vae=False,
50
+ need_vit=False,
51
+ )
52
+ else:
53
+ for idx in range(start_idx + 1, end_idx + 1):
54
+ instruction = random.choice(row["instruction_list"][idx-1])
55
+ data = self._add_text(data, instruction, need_loss=False)
56
+ if idx != end_idx:
57
+ data = self._add_image(
58
+ data,
59
+ pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))),
60
+ need_loss=True,
61
+ need_vae=True,
62
+ need_vit=True,
63
+ )
64
+ else:
65
+ data = self._add_image(
66
+ data,
67
+ pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))),
68
+ need_loss=True,
69
+ need_vae=False,
70
+ need_vit=False,
71
+ )
72
+ return data
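
For readability, this is roughly the dict that parse_row assembles in the simplest case (one conditioning image, one instruction, one target image); an illustration with placeholder values, not captured output. The helpers _init_data / _add_text / _add_image are defined in interleave_t2i_dataset.py below.

example_sample = {
    'sequence_plan': [
        {'type': 'vae_image', 'enable_cfg': 1, 'loss': 0, 'special_token_loss': 0, 'special_token_label': None},
        {'type': 'vit_image', 'enable_cfg': 1, 'loss': 0, 'special_token_loss': 0, 'special_token_label': None},
        {'type': 'text',      'enable_cfg': 1, 'loss': 0, 'special_token_loss': 0, 'special_token_label': None},
        {'type': 'vae_image', 'enable_cfg': 0, 'loss': 1, 'special_token_loss': 0, 'special_token_label': None},
    ],
    'text_ids_list': [[151644, 3837]],  # one token-id list per 'text' entry (placeholder ids)
    'image_tensor_list': ['cond VAE tensor', 'cond ViT tensor', 'target VAE tensor'],  # 3xHxW tensors, in plan order
    'num_tokens': 1234,  # running text+image token total used for packing (placeholder)
}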
data/interleave_datasets/interleave_t2i_dataset.py ADDED
@@ -0,0 +1,218 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import pyarrow.parquet as pq
5
+
6
+ from ..distributed_iterable_dataset import DistributedIterableDataset
7
+ from ..parquet_utils import get_parquet_data_paths, init_arrow_pf_fs
8
+
9
+
10
+ class InterleavedBaseIterableDataset(DistributedIterableDataset):
11
+
12
+ def _init_data(self):
13
+ data = {
14
+ 'sequence_plan': [],
15
+ 'text_ids_list': [],
16
+ 'image_tensor_list': [],
17
+ 'num_tokens': 0,
18
+ }
19
+ return data
20
+
21
+ def _add_text(self, data, text, need_loss, enable_cfg=True, next_token_label=None):
22
+ text_ids = self.tokenizer.encode(text)
23
+ data['num_tokens'] += len(text_ids)
24
+ data['text_ids_list'].append(text_ids)
25
+
26
+ # If next_token_label is provided, the im_end token should predict it
27
+ special_token_loss = 1 if next_token_label is not None else 0
28
+
29
+ data['sequence_plan'].append(
30
+ {
31
+ 'type': 'text',
32
+ 'enable_cfg': int(enable_cfg),
33
+ 'loss': int(need_loss),
34
+ 'special_token_loss': special_token_loss,
35
+ 'special_token_label': next_token_label,
36
+ }
37
+ )
38
+ return data
39
+
40
+ def _add_image(self, data, image, need_loss, need_vae, need_vit, enable_cfg=True, special_token_label=None):
41
+ assert need_loss or need_vae or need_vit
42
+
43
+ if need_loss:
44
+ # For loss images, don't add special_token_loss on the start token
45
+ # The previous text token should predict the vision_start token
46
+ data['sequence_plan'].append(
47
+ {
48
+ 'type': 'vae_image',
49
+ 'enable_cfg': 0,
50
+ 'loss': 1,
51
+ 'special_token_loss': 0, # No loss on start token itself
52
+ 'special_token_label': None,
53
+ }
54
+ )
55
+
56
+ image_tensor = self.transform(image)
57
+ height, width = image_tensor.shape[1:]
58
+ data['num_tokens'] += width * height // self.transform.stride ** 2
59
+ data['image_tensor_list'].append(image_tensor)
60
+
61
+ if need_vae:
62
+ data['sequence_plan'].append(
63
+ {
64
+ 'type': 'vae_image',
65
+ 'enable_cfg': int(enable_cfg),
66
+ 'loss': 0,
67
+ 'special_token_loss': 0,
68
+ 'special_token_label': None,
69
+ }
70
+ )
71
+
72
+ image_tensor = self.transform(image)
73
+ height, width = image_tensor.shape[1:]
74
+ data['num_tokens'] += width * height // self.transform.stride ** 2
75
+ data['image_tensor_list'].append(image_tensor.clone())
76
+
77
+ if need_vit:
78
+ data['sequence_plan'].append(
79
+ {
80
+ 'type': 'vit_image',
81
+ 'enable_cfg': int(enable_cfg),
82
+ 'loss': 0,
83
+ 'special_token_loss': 0,
84
+ 'special_token_label': None,
85
+ },
86
+ )
87
+ vit_image_tensor = self.vit_transform(image)
88
+ height, width = vit_image_tensor.shape[1:]
89
+ data['num_tokens'] += width * height // self.vit_transform.stride ** 2
90
+ data['image_tensor_list'].append(vit_image_tensor)
91
+
92
+ return data
93
+
94
+ def _add_video(self, data, frames, frame_indexes, need_loss, need_vae, enable_cfg=True):
95
+ assert int(need_loss) + int(need_vae) == 1
96
+
97
+ if need_loss:
98
+ for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)):
99
+ current_sequence_plan = {
100
+ 'type': 'vae_image',
101
+ 'enable_cfg': 0,
102
+ 'loss': 1,
103
+ 'special_token_loss': 0,
104
+ 'special_token_label': None,
105
+ 'split_start': idx == 0,
106
+ 'split_end': idx == len(frames) - 1,
107
+ }
108
+ if idx < len(frame_indexes) - 1:
109
+ current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx
110
+ data['sequence_plan'].append(current_sequence_plan)
111
+ image_tensor = self.transform(image)
112
+ height, width = image_tensor.shape[1:]
113
+ data['image_tensor_list'].append(image_tensor)
114
+ data['num_tokens'] += width * height // self.transform.stride ** 2
115
+
116
+ elif need_vae:
117
+ for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)):
118
+ current_sequence_plan = {
119
+ 'type': 'vae_image',
120
+ 'enable_cfg': int(enable_cfg),
121
+ 'loss': 0,
122
+ 'special_token_loss': 0,
123
+ 'special_token_label': None,
124
+ 'split_start': idx == 0,
125
+ 'split_end': idx == len(frames) - 1,
126
+ }
127
+ if idx < len(frame_indexes) - 1:
128
+ current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx
129
+ data['sequence_plan'].append(current_sequence_plan)
130
+ image_tensor = self.transform(image)
131
+ height, width = image_tensor.shape[1:]
132
+ data['image_tensor_list'].append(image_tensor)
133
+ data['num_tokens'] += width * height // self.transform.stride ** 2
134
+
135
+ return data
136
+
137
+
138
+ class ParquetStandardIterableDataset(DistributedIterableDataset):
139
+
140
+ def __init__(
141
+ self, dataset_name, transform, tokenizer, vit_transform,
142
+ data_dir_list, num_used_data, parquet_info,
143
+ local_rank=0, world_size=1, num_workers=8, data_status=None,
144
+ ):
145
+ """
146
+ data_dir_list: list of data directories contains parquet files
147
+ num_used_data: list of number of sampled data paths for each data directory
148
+ vit_transform: input transform for vit model.
149
+ """
150
+ super().__init__(dataset_name, local_rank, world_size, num_workers)
151
+ self.transform = transform
152
+ self.vit_transform = vit_transform
153
+ self.tokenizer = tokenizer
154
+ self.data_status = data_status
155
+ self.data_paths = self.get_data_paths(data_dir_list, num_used_data, parquet_info)
156
+ self.set_epoch()
157
+
158
+ def get_data_paths(self, data_dir_list, num_used_data, parquet_info):
159
+ row_groups = []
160
+ for data_dir, num_data_path in zip(data_dir_list, num_used_data):
161
+ data_paths = get_parquet_data_paths([data_dir], [num_data_path])
162
+ for data_path in data_paths:
163
+ if data_path in parquet_info.keys():
164
+ num_row_groups = parquet_info[data_path]['num_row_groups']
165
+ for rg_idx in range(num_row_groups):
166
+ row_groups.append((data_path, rg_idx))
167
+ return row_groups
168
+
169
+ def parse_row(self, row):
170
+ raise NotImplementedError
171
+
172
+ def __iter__(self):
173
+ file_paths_per_worker, worker_id = self.get_data_paths_per_worker()
174
+ if self.data_status is not None:
175
+ global_row_group_start_id = self.data_status[worker_id][0]
176
+ row_start_id = self.data_status[worker_id][1] + 1
177
+ else:
178
+ global_row_group_start_id = 0
179
+ row_start_id = 0
180
+
181
+ print(
182
+ f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: "
183
+ f"resuming data at global_rg#{global_row_group_start_id}, row#{row_start_id}"
184
+ )
185
+
186
+ while True:
187
+ file_paths_per_worker_ = file_paths_per_worker[global_row_group_start_id:]
188
+ for global_row_group_idx, (parquet_file_path, row_group_id) in enumerate(
189
+ file_paths_per_worker_, start=global_row_group_start_id
190
+ ):
191
+ fs = init_arrow_pf_fs(parquet_file_path)
192
+ with fs.open_input_file(parquet_file_path) as f:
193
+ try:
194
+ fr = pq.ParquetFile(f)
195
+ df = fr.read_row_group(row_group_id).to_pandas()
196
+ df = df.iloc[row_start_id:]
197
+ except Exception as e:
198
+ print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}')
199
+ continue
200
+
201
+ for row_idx, row in df.iterrows():
202
+ try:
203
+ data = self.parse_row(row)
204
+ if len(data) == 0:
205
+ continue
206
+ data['data_indexes'] = {
207
+ "data_indexes": [global_row_group_idx, row_idx],
208
+ "worker_id": worker_id,
209
+ "dataset_name": self.dataset_name,
210
+ }
211
+ except Exception as e:
212
+ print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}')
213
+ continue
214
+ yield data
215
+
216
+ row_start_id = 0
217
+ global_row_group_start_id = 0
218
+ print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}")
data/interleave_datasets/think_trace_dataset.py ADDED
@@ -0,0 +1,289 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import traceback
5
+ from PIL import Image, ImageFile, PngImagePlugin
6
+
7
+ from .interleave_t2i_dataset import InterleavedBaseIterableDataset
8
+ from ..data_utils import pil_img2rgb
9
+ from ..distributed_iterable_dataset import DistributedIterableDataset
10
+
11
+
12
+ Image.MAX_IMAGE_PIXELS = 200000000
13
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
14
+ MaximumDecompressedSize = 1024
15
+ MegaByte = 2 ** 20
16
+ PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
17
+
18
+
19
+ class ThinkTraceJSONLIterableDataset(InterleavedBaseIterableDataset, DistributedIterableDataset):
20
+ def __init__(
21
+ self,
22
+ dataset_name,
23
+ transform,
24
+ tokenizer,
25
+ vit_transform,
26
+ jsonl_path_list,
27
+ data_dir_list,
28
+ num_used_data,
29
+ local_rank=0,
30
+ world_size=1,
31
+ num_workers=8,
32
+ data_status=None,
33
+ shuffle_lines=True,
34
+ shuffle_seed=0,
35
+ image_prefix_dir=None,
36
+ ):
37
+ """
38
+ Dataset for think-trace style JSONL files with interleaved text and images.
39
+
40
+ Args:
41
+ dataset_name: Name of the dataset
42
+ transform: Transform for VAE images
43
+ tokenizer: Text tokenizer
44
+ vit_transform: Transform for VIT images
45
+ jsonl_path_list: List of JSONL file paths
46
+ data_dir_list: List of base directories (should match jsonl_path_list)
47
+ num_used_data: List of number of samples to use from each JSONL. If a value is None or non-positive, all data from that JSONL will be used.
48
+ image_prefix_dir: Absolute path to prepend to relative image paths
49
+ Other args: Standard distributed dataset args
50
+ """
51
+ DistributedIterableDataset.__init__(self, dataset_name, local_rank, world_size, num_workers)
52
+ self.transform = transform
53
+ self.vit_transform = vit_transform
54
+ self.tokenizer = tokenizer
55
+ self.data_status = data_status
56
+ self.image_prefix_dir = image_prefix_dir or ""
57
+
58
+ self.start_of_image = tokenizer.convert_tokens_to_ids('<|vision_start|>')
59
+ self.end_of_image = tokenizer.convert_tokens_to_ids('<|vision_end|>')
60
+ self.im_start = tokenizer.convert_tokens_to_ids('<|im_start|>')
61
+
62
+ self.data_paths = self.get_data_paths(
63
+ jsonl_path_list,
64
+ num_used_data,
65
+ shuffle_lines,
66
+ shuffle_seed,
67
+ )
68
+ self.set_epoch()
69
+
70
+ def get_data_paths(self, jsonl_path_list, num_used_data, shuffle_lines, shuffle_seed):
71
+ data_paths = []
72
+ if not isinstance(num_used_data, list):
73
+ num_used_data = [num_used_data] * len(jsonl_path_list)
74
+
75
+ for jsonl_path, num_data_point in zip(jsonl_path_list, num_used_data):
76
+ with open(jsonl_path, 'r') as f:
77
+ raw_data = f.readlines()
78
+ if shuffle_lines:
79
+ self.rng.seed(shuffle_seed)
80
+ self.rng.shuffle(raw_data)
81
+
82
+ # Convert 'None' string to None type
83
+ if num_data_point == 'None':
84
+ num_data_point = None
85
+
86
+ if num_data_point is not None and int(num_data_point) > 0:
87
+ raw_data = raw_data[:int(num_data_point)]
88
+
89
+ data_paths.extend(raw_data)
90
+ return data_paths
91
+
92
+ def extract_image_references(self, text):
93
+ """Extract image references from text like <image_start>[problem_image_1]<image_end>"""
94
+ pattern = r'<image_start>\[([^\]]+)\]<image_end>'
95
+ matches = re.findall(pattern, text)
96
+ return matches
97
+
98
+ def replace_image_references(self, text):
99
+ """Replace image references with placeholder tokens for processing"""
100
+ pattern = r'<image_start>\[([^\]]+)\]<image_end>'
101
+ # Replace with a special placeholder that we'll process later
102
+ return re.sub(pattern, '<IMAGE_PLACEHOLDER>', text)
103
+
104
+ def remove_thought_patterns(self, text):
105
+ """Remove THOUGHT x: patterns from text"""
106
+ # Remove patterns like "THOUGHT 1:", "THOUGHT 2:", etc.
107
+ pattern = r'THOUGHT\s*\d+:\s*'
108
+ return re.sub(pattern, '', text)
109
+
110
+ def load_image_safely(self, data_item, image_key):
111
+ """Load image with null checking and path resolution"""
112
+ if image_key not in data_item or data_item[image_key] is None:
113
+ return None
114
+
115
+ image_path = data_item[image_key]
116
+ full_path = os.path.join(self.image_prefix_dir, image_path)
117
+
118
+ try:
119
+ return pil_img2rgb(Image.open(full_path))
120
+ except Exception as e:
121
+ print(f"Failed to load image {full_path}: {e}")
122
+ return None
123
+
124
+ def parse_row(self, json_line):
125
+ """Parse a single JSON line into the required format"""
126
+ try:
127
+ data_item = json.loads(json_line.strip())
128
+ except Exception:
129
+ traceback.print_exc()
130
+ return {}
131
+
132
+ # Extract the main fields
133
+ prompt = "You are an AI reasoning assistant capable of step-by-step interleaved text and visual chain of thought. Think step by step and generate visual aids to enhance your problem-solving. You should first think about the reasoning and planning process in the mind before generating visual aids. Wrap your text reasoning with <think></think> tokens, and wrap your final conclusion with <answer></answer> tokens. Provide your final conclusion clearly in the format of '<answer>Final Answer: <answer here></answer>'"
134
+ question = data_item.get('Question', '')
135
+ question = f'Question: {question}'
136
+ reasoning_trace = data_item.get('Text Reasoning Trace', '')
137
+ reasoning_trace = f'{reasoning_trace}'
138
+ final_answer = data_item.get('Final Answer', '')
139
+ final_answer = f'<answer>Final Answer: {final_answer}</answer>'
140
+
141
+ if not data_item.get('Question') or not data_item.get('Text Reasoning Trace') or not data_item.get('Final Answer'):  # check the raw fields; the formatted strings above are never empty
142
+ return {}
143
+
144
+ # Build the sequence
145
+ data = self._init_data()
146
+
147
+ # 0. Add prompt
148
+ data = self._add_text(data, prompt, need_loss=False, enable_cfg=True)
149
+
150
+ # 1. Add question (with image parsing)
151
+ question_image_refs = self.extract_image_references(question)
152
+ if question_image_refs:
153
+ clean_question = self.replace_image_references(question)
154
+ question_text_parts = clean_question.split('<IMAGE_PLACEHOLDER>')
155
+
156
+ if len(question_text_parts) != len(question_image_refs) + 1:
157
+ print(f"Mismatch in question: text parts {len(question_text_parts)}, images {len(question_image_refs)}")
158
+ return {}
159
+
160
+ question_images = []
161
+ for image_ref in question_image_refs:
162
+ image = self.load_image_safely(data_item, image_ref)
163
+ if image is None:
164
+ print(f"Skipping sample due to missing image in question: {image_ref}")
165
+ return {}
166
+ question_images.append(image)
167
+
168
+
169
+ for i, text_part in enumerate(question_text_parts):
170
+ if text_part.strip():
171
+ # Question text has no loss, so no need for vision start prediction
172
+ data = self._add_text(data, text_part.strip(), need_loss=False, enable_cfg=True)
173
+ if i < len(question_images):
174
+ data = self._add_image(
175
+ data, question_images[i],
176
+ need_loss=False, # No loss for question images
177
+ need_vae=False, # no VAE (generation) conditioning for question images
178
+ need_vit=True, # VIT understanding
179
+ enable_cfg=True,
180
+ )
181
+ else:
182
+ # Original behavior if no images in question
183
+ data = self._add_text(data, question, need_loss=False, enable_cfg=True)
184
+
185
+ # 2. Load the images referenced in the reasoning trace
186
+ image_refs = self.extract_image_references(reasoning_trace)
187
+
188
+ loaded_images = []
189
+ for image_ref in image_refs:
190
+ image = self.load_image_safely(data_item, image_ref)
191
+ if image is not None:
192
+ loaded_images.append(image)
193
+ else:
194
+ # If image fails to load, skip this sample
195
+ print(f"Skipping sample due to missing image: {image_ref}")
196
+ return {}
197
+
198
+ # Clean reasoning trace by removing image references for text processing
199
+ clean_reasoning_trace = self.replace_image_references(reasoning_trace)
200
+
201
+ # Remove THOUGHT patterns from the reasoning trace
202
+ clean_reasoning_trace = self.remove_thought_patterns(clean_reasoning_trace)
203
+
204
+ # Append final answer to the reasoning trace
205
+ # clean_reasoning_trace += f"\n\nFinal Answer: {final_answer}"
206
+
207
+ # Split reasoning trace by image placeholders to interleave text and images
208
+ text_parts = clean_reasoning_trace.split('<IMAGE_PLACEHOLDER>')
209
+
210
+ if len(text_parts) != len(loaded_images) + 1:
211
+ print(f"Mismatch between text parts ({len(text_parts)}) and images ({len(loaded_images)})")
212
+ return {}
213
+
214
+ # 3. Interleave text parts and images from the reasoning trace
215
+ for i, text_part in enumerate(text_parts):
216
+ # Add text part if not empty
217
+ if text_part.strip():
218
+ # Wrap reasoning text with <think></think> tokens
219
+ wrapped_text = f"<think>{text_part.strip()}</think>"
220
+
221
+ # Determine what the im_end token should predict
222
+ if i < len(loaded_images):
223
+ # If this text part is followed by an image, predict vision_start
224
+ next_token_label = self.start_of_image
225
+ elif i == len(text_parts) - 1:
226
+ # If this is the last text part, predict im_start for final answer
227
+ next_token_label = self.im_start
228
+ else:
229
+ next_token_label = None
230
+
231
+ data = self._add_text(data, wrapped_text, need_loss=True, enable_cfg=True, next_token_label=next_token_label)
232
+
233
+ # Add image if available
234
+ if i < len(loaded_images):
235
+ # Add image with both VAE and VIT processing for full capability
236
+ data = self._add_image(
237
+ data,
238
+ loaded_images[i],
239
+ need_loss=True, # VAE generation loss
240
+ need_vae=True, # VAE conditioning
241
+ need_vit=True, # VIT understanding
242
+ enable_cfg=True,
243
+ )
244
+
245
+ # 4. Add the final answer
246
+ data = self._add_text(data, final_answer, need_loss=True, enable_cfg=True)  # loss on the final answer; set need_loss=False to exclude it
247
+
248
+ return data
249
+
250
+ def __iter__(self):
251
+ data_paths_per_worker, worker_id = self.get_data_paths_per_worker()
252
+ if self.data_status is not None:
253
+ row_start_id = self.data_status[worker_id] + 1
254
+ else:
255
+ row_start_id = 0
256
+
257
+ print(
258
+ f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: "
259
+ f"resuming data at row#{row_start_id}"
260
+ )
261
+
262
+ while True:
263
+ data_paths_per_worker_ = data_paths_per_worker[row_start_id:]
264
+ for row_idx, json_line in enumerate(data_paths_per_worker_, start=row_start_id):
265
+ try:
266
+ data = self.parse_row(json_line)
267
+ if len(data) == 0:
268
+ continue
269
+
270
+ # Check if we have any loss
271
+ has_loss = any(item['loss'] for item in data['sequence_plan'])
272
+ if not has_loss:
273
+ print('No loss defined, skipped.')
274
+ continue
275
+
276
+ data['data_indexes'] = {
277
+ "data_indexes": row_idx,
278
+ "worker_id": worker_id,
279
+ "dataset_name": self.dataset_name,
280
+ }
281
+ yield data
282
+
283
+ except Exception as e:
284
+ print(f"Error processing row {row_idx}: {e}")
285
+ traceback.print_exc()
286
+ continue
287
+
288
+ row_start_id = 0
289
+ print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}")
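A minimal sketch of the image-reference convention ThinkTraceJSONLIterableDataset parses above: references of the form <image_start>[key]<image_end> are extracted with a regex, replaced by a placeholder, and the text is split so that there is always exactly one more text part than image. The trace string and keys below are invented for illustration:

import re

trace = (
    "First, inspect the grid. <image_start>[reasoning_image_1]<image_end> "
    "Then count the shaded cells. <image_start>[reasoning_image_2]<image_end> Done."
)

pattern = r'<image_start>\[([^\]]+)\]<image_end>'
image_keys = re.findall(pattern, trace)        # ['reasoning_image_1', 'reasoning_image_2']
text_parts = re.sub(pattern, '<IMAGE_PLACEHOLDER>', trace).split('<IMAGE_PLACEHOLDER>')

assert len(text_parts) == len(image_keys) + 1  # parse_row() drops the sample otherwise
for text, key in zip(text_parts, image_keys + [None]):
    print(repr(text.strip()), '->', key)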
modeling/__init__.py ADDED
@@ -0,0 +4 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from . import bagel, qwen2, siglip, autoencoder
modeling/autoencoder.py ADDED
@@ -0,0 +360 @@
1
+ # Copyright (c) 2024 Black Forest Labs.
2
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20.
6
+ #
7
+ # Original file was released under Apache-2.0, with the full license text
8
+ # available at https://github.com/black-forest-labs/flux/blob/main/LICENSE.
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ from dataclasses import dataclass
13
+
14
+ import torch
15
+ from einops import rearrange
16
+ from torch import Tensor, nn
17
+ from safetensors.torch import load_file as load_sft
18
+
19
+
20
+ @dataclass
21
+ class AutoEncoderParams:
22
+ resolution: int
23
+ in_channels: int
24
+ downsample: int
25
+ ch: int
26
+ out_ch: int
27
+ ch_mult: list[int]
28
+ num_res_blocks: int
29
+ z_channels: int
30
+ scale_factor: float
31
+ shift_factor: float
32
+
33
+
34
+ def swish(x: Tensor) -> Tensor:
35
+ return x * torch.sigmoid(x)
36
+
37
+
38
+ class AttnBlock(nn.Module):
39
+ def __init__(self, in_channels: int):
40
+ super().__init__()
41
+ self.in_channels = in_channels
42
+
43
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
44
+
45
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
46
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
47
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
48
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
49
+
50
+ def attention(self, h_: Tensor) -> Tensor:
51
+ h_ = self.norm(h_)
52
+ q = self.q(h_)
53
+ k = self.k(h_)
54
+ v = self.v(h_)
55
+
56
+ b, c, h, w = q.shape
57
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
58
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
59
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
60
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
61
+
62
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
63
+
64
+ def forward(self, x: Tensor) -> Tensor:
65
+ return x + self.proj_out(self.attention(x))
66
+
67
+
68
+ class ResnetBlock(nn.Module):
69
+ def __init__(self, in_channels: int, out_channels: int):
70
+ super().__init__()
71
+ self.in_channels = in_channels
72
+ out_channels = in_channels if out_channels is None else out_channels
73
+ self.out_channels = out_channels
74
+
75
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
76
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
77
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
78
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
79
+ if self.in_channels != self.out_channels:
80
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
81
+
82
+ def forward(self, x):
83
+ h = x
84
+ h = self.norm1(h)
85
+ h = swish(h)
86
+ h = self.conv1(h)
87
+
88
+ h = self.norm2(h)
89
+ h = swish(h)
90
+ h = self.conv2(h)
91
+
92
+ if self.in_channels != self.out_channels:
93
+ x = self.nin_shortcut(x)
94
+
95
+ return x + h
96
+
97
+
98
+ class Downsample(nn.Module):
99
+ def __init__(self, in_channels: int):
100
+ super().__init__()
101
+ # no asymmetric padding in torch conv, must do it ourselves
102
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
103
+
104
+ def forward(self, x: Tensor):
105
+ pad = (0, 1, 0, 1)
106
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
107
+ x = self.conv(x)
108
+ return x
109
+
110
+
111
+ class Upsample(nn.Module):
112
+ def __init__(self, in_channels: int):
113
+ super().__init__()
114
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
115
+
116
+ def forward(self, x: Tensor):
117
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
118
+ x = self.conv(x)
119
+ return x
120
+
121
+
122
+ class Encoder(nn.Module):
123
+ def __init__(
124
+ self,
125
+ resolution: int,
126
+ in_channels: int,
127
+ ch: int,
128
+ ch_mult: list[int],
129
+ num_res_blocks: int,
130
+ z_channels: int,
131
+ ):
132
+ super().__init__()
133
+ self.ch = ch
134
+ self.num_resolutions = len(ch_mult)
135
+ self.num_res_blocks = num_res_blocks
136
+ self.resolution = resolution
137
+ self.in_channels = in_channels
138
+ # downsampling
139
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
140
+
141
+ curr_res = resolution
142
+ in_ch_mult = (1,) + tuple(ch_mult)
143
+ self.in_ch_mult = in_ch_mult
144
+ self.down = nn.ModuleList()
145
+ block_in = self.ch
146
+ for i_level in range(self.num_resolutions):
147
+ block = nn.ModuleList()
148
+ attn = nn.ModuleList()
149
+ block_in = ch * in_ch_mult[i_level]
150
+ block_out = ch * ch_mult[i_level]
151
+ for _ in range(self.num_res_blocks):
152
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
153
+ block_in = block_out
154
+ down = nn.Module()
155
+ down.block = block
156
+ down.attn = attn
157
+ if i_level != self.num_resolutions - 1:
158
+ down.downsample = Downsample(block_in)
159
+ curr_res = curr_res // 2
160
+ self.down.append(down)
161
+
162
+ # middle
163
+ self.mid = nn.Module()
164
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
165
+ self.mid.attn_1 = AttnBlock(block_in)
166
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
167
+
168
+ # end
169
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
170
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
171
+
172
+ def forward(self, x: Tensor) -> Tensor:
173
+ # downsampling
174
+ hs = [self.conv_in(x)]
175
+ for i_level in range(self.num_resolutions):
176
+ for i_block in range(self.num_res_blocks):
177
+ h = self.down[i_level].block[i_block](hs[-1])
178
+ if len(self.down[i_level].attn) > 0:
179
+ h = self.down[i_level].attn[i_block](h)
180
+ hs.append(h)
181
+ if i_level != self.num_resolutions - 1:
182
+ hs.append(self.down[i_level].downsample(hs[-1]))
183
+
184
+ # middle
185
+ h = hs[-1]
186
+ h = self.mid.block_1(h)
187
+ h = self.mid.attn_1(h)
188
+ h = self.mid.block_2(h)
189
+ # end
190
+ h = self.norm_out(h)
191
+ h = swish(h)
192
+ h = self.conv_out(h)
193
+ return h
194
+
195
+
196
+ class Decoder(nn.Module):
197
+ def __init__(
198
+ self,
199
+ ch: int,
200
+ out_ch: int,
201
+ ch_mult: list[int],
202
+ num_res_blocks: int,
203
+ in_channels: int,
204
+ resolution: int,
205
+ z_channels: int,
206
+ ):
207
+ super().__init__()
208
+ self.ch = ch
209
+ self.num_resolutions = len(ch_mult)
210
+ self.num_res_blocks = num_res_blocks
211
+ self.resolution = resolution
212
+ self.in_channels = in_channels
213
+ self.ffactor = 2 ** (self.num_resolutions - 1)
214
+
215
+ # compute in_ch_mult, block_in and curr_res at lowest res
216
+ block_in = ch * ch_mult[self.num_resolutions - 1]
217
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
218
+ self.z_shape = (1, z_channels, curr_res, curr_res)
219
+
220
+ # z to block_in
221
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
222
+
223
+ # middle
224
+ self.mid = nn.Module()
225
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
226
+ self.mid.attn_1 = AttnBlock(block_in)
227
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
228
+
229
+ # upsampling
230
+ self.up = nn.ModuleList()
231
+ for i_level in reversed(range(self.num_resolutions)):
232
+ block = nn.ModuleList()
233
+ attn = nn.ModuleList()
234
+ block_out = ch * ch_mult[i_level]
235
+ for _ in range(self.num_res_blocks + 1):
236
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
237
+ block_in = block_out
238
+ up = nn.Module()
239
+ up.block = block
240
+ up.attn = attn
241
+ if i_level != 0:
242
+ up.upsample = Upsample(block_in)
243
+ curr_res = curr_res * 2
244
+ self.up.insert(0, up) # prepend to get consistent order
245
+
246
+ # end
247
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
248
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
249
+
250
+ def forward(self, z: Tensor) -> Tensor:
251
+ # z to block_in
252
+ h = self.conv_in(z)
253
+
254
+ # middle
255
+ h = self.mid.block_1(h)
256
+ h = self.mid.attn_1(h)
257
+ h = self.mid.block_2(h)
258
+
259
+ # upsampling
260
+ for i_level in reversed(range(self.num_resolutions)):
261
+ for i_block in range(self.num_res_blocks + 1):
262
+ h = self.up[i_level].block[i_block](h)
263
+ if len(self.up[i_level].attn) > 0:
264
+ h = self.up[i_level].attn[i_block](h)
265
+ if i_level != 0:
266
+ h = self.up[i_level].upsample(h)
267
+
268
+ # end
269
+ h = self.norm_out(h)
270
+ h = swish(h)
271
+ h = self.conv_out(h)
272
+ return h
273
+
274
+
275
+ class DiagonalGaussian(nn.Module):
276
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
277
+ super().__init__()
278
+ self.sample = sample
279
+ self.chunk_dim = chunk_dim
280
+
281
+ def forward(self, z: Tensor) -> Tensor:
282
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
283
+ if self.sample:
284
+ std = torch.exp(0.5 * logvar)
285
+ return mean + std * torch.randn_like(mean)
286
+ else:
287
+ return mean
288
+
289
+
290
+ class AutoEncoder(nn.Module):
291
+ def __init__(self, params: AutoEncoderParams):
292
+ super().__init__()
293
+ self.encoder = Encoder(
294
+ resolution=params.resolution,
295
+ in_channels=params.in_channels,
296
+ ch=params.ch,
297
+ ch_mult=params.ch_mult,
298
+ num_res_blocks=params.num_res_blocks,
299
+ z_channels=params.z_channels,
300
+ )
301
+ self.decoder = Decoder(
302
+ resolution=params.resolution,
303
+ in_channels=params.in_channels,
304
+ ch=params.ch,
305
+ out_ch=params.out_ch,
306
+ ch_mult=params.ch_mult,
307
+ num_res_blocks=params.num_res_blocks,
308
+ z_channels=params.z_channels,
309
+ )
310
+ self.reg = DiagonalGaussian()
311
+
312
+ self.scale_factor = params.scale_factor
313
+ self.shift_factor = params.shift_factor
314
+
315
+ def encode(self, x: Tensor) -> Tensor:
316
+ z = self.reg(self.encoder(x))
317
+ z = self.scale_factor * (z - self.shift_factor)
318
+ return z
319
+
320
+ def decode(self, z: Tensor) -> Tensor:
321
+ z = z / self.scale_factor + self.shift_factor
322
+ return self.decoder(z)
323
+
324
+ def forward(self, x: Tensor) -> Tensor:
325
+ return self.decode(self.encode(x))
326
+
327
+
328
+ def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
329
+ if len(missing) > 0 and len(unexpected) > 0:
330
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
331
+ print("\n" + "-" * 79 + "\n")
332
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
333
+ elif len(missing) > 0:
334
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
335
+ elif len(unexpected) > 0:
336
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
337
+
338
+
339
+ def load_ae(local_path: str) -> tuple[AutoEncoder, AutoEncoderParams]:
340
+ ae_params = AutoEncoderParams(
341
+ resolution=256,
342
+ in_channels=3,
343
+ downsample=8,
344
+ ch=128,
345
+ out_ch=3,
346
+ ch_mult=[1, 2, 4, 4],
347
+ num_res_blocks=2,
348
+ z_channels=16,
349
+ scale_factor=0.3611,
350
+ shift_factor=0.1159,
351
+ )
352
+
353
+ # Loading the autoencoder
354
+ ae = AutoEncoder(ae_params)
355
+
356
+ if local_path is not None:
357
+ sd = load_sft(local_path)
358
+ missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
359
+ print_load_warning(missing, unexpected)
360
+ return ae, ae_params
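As a quick sanity check of the latent scaling above: encode() applies z -> scale * (z - shift) and decode() inverts that affine map before running the decoder, so the two cancel exactly. A minimal sketch using the same constants as load_ae (the random tensor is only a stand-in for a real encoder latent):

import torch

scale_factor, shift_factor = 0.3611, 0.1159             # constants from load_ae above

z = torch.randn(1, 16, 32, 32)                          # stand-in for the raw encoder latent
z_scaled = scale_factor * (z - shift_factor)            # what AutoEncoder.encode applies
z_restored = z_scaled / scale_factor + shift_factor     # what AutoEncoder.decode undoes first

print(torch.allclose(z, z_restored))                    # True: the affine maps cancel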
modeling/bagel/bagel.py ADDED
@@ -0,0 +1068 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import copy
5
+ from typing import List, Tuple, Optional
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+ from torch.nn.attention.flex_attention import create_block_mask
11
+ from transformers.configuration_utils import PretrainedConfig
12
+ from transformers.modeling_utils import PreTrainedModel
13
+
14
+ from data.data_utils import (
15
+ create_sparse_mask,
16
+ get_flattened_position_ids_extrapolate,
17
+ get_flattened_position_ids_interpolate,
18
+ patchify,
19
+ )
20
+ from .qwen2_navit import NaiveCache
21
+ from .modeling_utils import MLPconnector, TimestepEmbedder, PositionEmbedding
22
+
23
+ from tqdm import tqdm
24
+
25
+
26
+ class BagelConfig(PretrainedConfig):
27
+ def __init__(
28
+ self,
29
+ visual_gen=True,
30
+ visual_und=True,
31
+ llm_config=None,
32
+ vit_config=None,
33
+ vae_config=None,
34
+ latent_patch_size=2,
35
+ max_latent_size=32,
36
+ vit_max_num_patch_per_side=70,
37
+ connector_act="gelu_pytorch_tanh",
38
+ interpolate_pos=False,
39
+ timestep_shift=1.0,
40
+ **kwargs
41
+ ):
42
+ super().__init__(**kwargs)
43
+ self.visual_gen = visual_gen
44
+ self.visual_und = visual_und
45
+ self.llm_config = llm_config
46
+ self.vit_config = vit_config
47
+ self.vae_config = vae_config
48
+ self.latent_patch_size = latent_patch_size
49
+ self.max_latent_size = max_latent_size
50
+ self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
51
+ self.connector_act = connector_act
52
+ self.interpolate_pos = interpolate_pos
53
+ self.timestep_shift = timestep_shift
54
+
55
+
56
+ class Bagel(PreTrainedModel):
57
+ config_class = BagelConfig
58
+ base_model_prefix = 'bagel'
59
+
60
+ def __init__(self, language_model, vit_model, config: BagelConfig):
61
+ super().__init__(config)
62
+ self.language_model = language_model
63
+ self.hidden_size = config.llm_config.hidden_size
64
+ self.use_moe = "Mo" in config.llm_config.layer_module
65
+ self.num_heads = config.llm_config.num_attention_heads
66
+
67
+ if config.visual_gen:
68
+ self.latent_patch_size = config.latent_patch_size
69
+ self.timestep_shift = config.timestep_shift
70
+ self.latent_downsample = config.vae_config.downsample * config.latent_patch_size
71
+ self.max_latent_size = config.max_latent_size
72
+ self.latent_channel = config.vae_config.z_channels
73
+ self.patch_latent_dim = self.latent_patch_size ** 2 * self.latent_channel
74
+ self.time_embedder = TimestepEmbedder(self.hidden_size)
75
+ self.vae2llm = nn.Linear(self.patch_latent_dim, self.hidden_size)
76
+ self.llm2vae = nn.Linear(self.hidden_size, self.patch_latent_dim)
77
+ self.latent_pos_embed = PositionEmbedding(self.max_latent_size, self.hidden_size)
78
+
79
+ if config.visual_und:
80
+ self.vit_model = vit_model
81
+ self.vit_patch_size = config.vit_config.patch_size
82
+ self.vit_max_num_patch_per_side = config.vit_max_num_patch_per_side
83
+ self.vit_hidden_size = config.vit_config.hidden_size
84
+ self.connector = MLPconnector(self.vit_hidden_size, self.hidden_size, config.connector_act)
85
+ self.vit_pos_embed = PositionEmbedding(self.vit_max_num_patch_per_side, self.hidden_size)
86
+
87
+ if config.interpolate_pos:
88
+ self.get_flattened_position_ids = get_flattened_position_ids_interpolate
89
+ else:
90
+ self.get_flattened_position_ids = get_flattened_position_ids_extrapolate
91
+
92
+ self.config = config
93
+ self._init_weights()
94
+
95
+ def _init_weights(self):
96
+ if self.config.visual_gen:
97
+ nn.init.constant_(self.llm2vae.weight, 0)
98
+ nn.init.constant_(self.llm2vae.bias, 0)
99
+
100
+ def forward(
101
+ self,
102
+ sequence_length: int,
103
+ packed_text_ids: torch.LongTensor,
104
+ packed_text_indexes: torch.LongTensor,
105
+ sample_lens: List[int],
106
+ packed_position_ids: torch.LongTensor,
107
+ nested_attention_masks: List[torch.Tensor] = None,
108
+ split_lens: List[int] = None,
109
+ attn_modes: List[str] = None,
110
+ # for visual understanding
111
+ ce_loss_indexes: Optional[torch.BoolTensor] = None,
112
+ packed_label_ids: Optional[torch.LongTensor] = None,
113
+ packed_vit_tokens: Optional[torch.Tensor] = None,
114
+ packed_vit_token_indexes: Optional[torch.LongTensor] = None,
115
+ packed_vit_position_ids: Optional[torch.LongTensor] = None,
116
+ vit_token_seqlens: Optional[torch.IntTensor] = None,
117
+ # for visual generation
118
+ padded_latent: Optional[torch.Tensor] = None,
119
+ patchified_vae_latent_shapes: Optional[List[Tuple[int, int]]] = None,
120
+ packed_latent_position_ids: Optional[torch.LongTensor] = None,
121
+ packed_vae_token_indexes: Optional[torch.LongTensor] = None,
122
+ packed_timesteps: Optional[torch.LongTensor] = None,
123
+ mse_loss_indexes: Optional[torch.BoolTensor] = None,
124
+ ) -> torch.Tensor:
125
+ """
126
+ Args:
127
+ sequence_length: length of sequence.
128
+ packed_text_ids: 1-D int tensor, packed text token ids.
129
+ packed_text_indexes: 1-D int tensor, packed text token indexes in sequence.
130
+ sample_lens: A list of N ints, length of each sample in packed_sequence.
131
+ nested_attention_masks: A list of N 2-D float tensors, where 0.0 means attend and
132
+ -inf means ignore.
133
+ packed_position_ids: packed 1-D positions; an image has only one global position shared
134
+ by all latent tokens.
135
+
136
+ packed_vit_tokens: packed patchified image tokens for vit model.
137
+ packed_vit_position_ids: 1-D int tensor, the position of each token for vit model.
138
+ packed_vit_token_indexes: 1-D int tensor, packed vit token indexes in sequence.
139
+ vit_token_seqlens: 1-D int tensor, the length of each image tokens for vit model.
140
+ packed_label_ids: 1-D int tensor, packed label token ids.
141
+ ce_loss_indexes: 1-D bool tensor, where to compute ce loss.
142
+
143
+ padded_latent: padded latent from VAE encoder.
144
+ patchified_vae_latent_shapes: A list of (h, w) tuples, the patchified latent shape of each image.
145
+ packed_latent_position_ids: 1-D int tensor, the position of each token for latent.
146
+ packed_vae_token_indexes: 1-D int tensor, packed image token indexes in sequence.
147
+ packed_timesteps: 1-D float tensor, flow timesteps. 0 indicates use clean image.
148
+ mse_loss_indexes: 1-D bool tensor, where to compute mse loss.
149
+ """
150
+ packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
151
+ packed_sequence = packed_text_embedding.new_zeros(size=(sequence_length, self.hidden_size))
152
+ packed_sequence[packed_text_indexes] = packed_text_embedding
153
+
154
+ if nested_attention_masks is None:
155
+ sparse_mask = create_sparse_mask(sample_lens, split_lens, attn_modes, packed_text_embedding.device)
156
+ seqlen = sum(sample_lens)
157
+ block_mask = create_block_mask(
158
+ sparse_mask, B=1, H=self.num_heads, Q_LEN=seqlen, KV_LEN=seqlen,
159
+ device=packed_text_embedding.device, BLOCK_SIZE=128, _compile=True
160
+ )
161
+ attention_mask = block_mask
162
+ else:
163
+ attention_mask = nested_attention_masks
164
+
165
+ # if self.config.visual_und and vit_token_seqlens is not None:
166
+ if self.config.visual_und:
167
+ cu_seqlens = torch.nn.functional.pad(torch.cumsum(vit_token_seqlens, dim=0), (1, 0))
168
+ cu_seqlens = cu_seqlens.to(torch.int32)
169
+ max_seqlen = torch.max(vit_token_seqlens).item()
170
+ packed_vit_token_embed = self.vit_model(
171
+ packed_pixel_values=packed_vit_tokens,
172
+ packed_flattened_position_ids=packed_vit_position_ids,
173
+ cu_seqlens=cu_seqlens,
174
+ max_seqlen=max_seqlen,
175
+ )
176
+ packed_vit_token_embed = self.connector(packed_vit_token_embed)
177
+ vit_token_pos_emb = self.vit_pos_embed(packed_vit_position_ids)
178
+ packed_vit_token_embed = packed_vit_token_embed + vit_token_pos_emb
179
+ packed_sequence[packed_vit_token_indexes] = packed_vit_token_embed
180
+
181
+ if self.config.visual_gen:
182
+ p = self.latent_patch_size
183
+ packed_latent = []
184
+ for latent, (h, w) in zip(padded_latent, patchified_vae_latent_shapes):
185
+ latent = latent[:, :h * p, :w * p].reshape(self.latent_channel, h, p, w, p)
186
+ latent = torch.einsum("chpwq->hwpqc", latent).reshape(-1, p * p * self.latent_channel)
187
+ packed_latent.append(latent)
188
+ packed_latent_clean = torch.cat(packed_latent, dim=0)
189
+
190
+ noise = torch.randn_like(packed_latent_clean)
191
+ packed_timesteps = torch.sigmoid(packed_timesteps)
192
+ packed_timesteps = self.timestep_shift * packed_timesteps / (1 + (self.timestep_shift - 1) * packed_timesteps)
193
+ packed_latent = (1 - packed_timesteps[:, None]) * packed_latent_clean + packed_timesteps[:, None] * noise
194
+ packed_timestep_embeds = self.time_embedder(packed_timesteps)
195
+ latent_token_pos_emb = self.latent_pos_embed(packed_latent_position_ids)
196
+ packed_latent = self.vae2llm(packed_latent) + packed_timestep_embeds + latent_token_pos_emb
197
+ packed_sequence[packed_vae_token_indexes] = packed_latent
198
+
199
+ extra_inputs = {}
200
+ if self.use_moe:
201
+ packed_und_token_indexes = packed_text_indexes
202
+ if packed_vit_token_indexes is not None:
203
+ packed_und_token_indexes=torch.cat([packed_text_indexes, packed_vit_token_indexes], dim=0)
204
+ extra_inputs.update(
205
+ packed_und_token_indexes=packed_und_token_indexes,
206
+ packed_gen_token_indexes=packed_vae_token_indexes,
207
+ )
208
+
209
+ last_hidden_state = self.language_model(
210
+ packed_sequence=packed_sequence,
211
+ sample_lens=sample_lens,
212
+ attention_mask=attention_mask,
213
+ packed_position_ids=packed_position_ids,
214
+ **extra_inputs,
215
+ )
216
+
217
+ mse = None
218
+ if self.config.visual_gen:
219
+ packed_mse_preds = self.llm2vae(last_hidden_state[mse_loss_indexes])
220
+ target = noise - packed_latent_clean # NOTE: v_t=dx_t/dt=x_1-x_0, pointing from data to noise
221
+ has_mse = packed_timesteps > 0
222
+ mse = (packed_mse_preds - target[has_mse]) ** 2
223
+
224
+ ce = None
225
+ if ce_loss_indexes is not None:
226
+ packed_ce_preds = self.language_model.lm_head(last_hidden_state[ce_loss_indexes])
227
+ ce = F.cross_entropy(packed_ce_preds, packed_label_ids, reduction="none")
228
+
229
+ return dict(mse=mse, ce=ce)
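# Worked sketch of the flow parameterization used in forward() above (tensors are illustrative):
# x_t = (1 - t) * x_0 + t * noise, so v_t = dx_t/dt = noise - x_0, which matches the MSE
# regression target (noise - packed_latent_clean).
#
#   import torch
#   x0, x1 = torch.randn(4, 8), torch.randn(4, 8)   # clean latent, noise
#   t = torch.full((4, 1), 0.3)
#   x_t = (1 - t) * x0 + t * x1
#   v_target = x1 - x0                              # what llm2vae is trained to predict
#   assert torch.allclose(x_t + (1 - t) * v_target, x1, atol=1e-6)  # stepping to t=1 recovers the noise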
230
+
231
+
232
+ def prepare_prompts(self, curr_kvlens, curr_rope, prompts, tokenizer, new_token_ids):
233
+ packed_text_ids = list()
234
+ packed_text_position_ids = list()
235
+ text_token_lens = list()
236
+ packed_text_indexes = list()
237
+ packed_key_value_indexes = list()
238
+
239
+ curr = 0
240
+ newlens, new_rope = list(), list()
241
+ for prompt, curr_kvlen, curr_position_id in zip(prompts, curr_kvlens, curr_rope):
242
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
243
+ curr += curr_kvlen
244
+
245
+ text_ids = tokenizer.encode(prompt)
246
+ text_ids = [new_token_ids['bos_token_id']] + text_ids + [new_token_ids['eos_token_id']]
247
+ text_token_lens.append(len(text_ids))
248
+ packed_text_ids.extend(text_ids)
249
+ packed_text_position_ids.extend(range(curr_position_id, curr_position_id + len(text_ids)))
250
+ packed_text_indexes.extend(range(curr, curr + len(text_ids)))
251
+ newlens.append(curr_kvlen + len(text_ids))
252
+ new_rope.append(curr_position_id + len(text_ids))
253
+ curr += len(text_ids)
254
+
255
+ generation_input = {
256
+ "text_token_lens": torch.tensor(text_token_lens, dtype=torch.int),
257
+ "packed_text_ids": torch.tensor(packed_text_ids, dtype=torch.long),
258
+ "packed_text_position_ids": torch.tensor(packed_text_position_ids, dtype=torch.long),
259
+ "packed_text_indexes": torch.tensor(packed_text_indexes, dtype=torch.long),
260
+ "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
261
+ "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
262
+ }
263
+
264
+ return generation_input, newlens, new_rope
265
+
266
+ @torch.no_grad
267
+ def forward_cache_update_text(
268
+ self,
269
+ past_key_values: NaiveCache,
270
+ packed_text_ids: torch.IntTensor,
271
+ packed_text_position_ids: torch.LongTensor,
272
+ text_token_lens: torch.LongTensor,
273
+ packed_text_indexes: torch.LongTensor,
274
+ packed_key_value_indexes: torch.LongTensor,
275
+ key_values_lens: torch.IntTensor,
276
+ ):
277
+ packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
278
+
279
+ extra_inputs = {}
280
+ if self.use_moe:
281
+ extra_inputs = {"mode": "und"}
282
+
283
+ output = self.language_model.forward_inference(
284
+ packed_query_sequence=packed_text_embedding,
285
+ query_lens=text_token_lens,
286
+ packed_query_position_ids=packed_text_position_ids,
287
+ packed_query_indexes=packed_text_indexes,
288
+ past_key_values=past_key_values,
289
+ packed_key_value_indexes=packed_key_value_indexes,
290
+ key_values_lens=key_values_lens,
291
+ update_past_key_values=True,
292
+ is_causal=True,
293
+ **extra_inputs,
294
+ )
295
+ past_key_values = output.past_key_values
296
+
297
+ return past_key_values
298
+
299
+ def prepare_vit_images(self, curr_kvlens, curr_rope, images, transforms, new_token_ids):
300
+ packed_vit_token_indexes = list()
301
+ vit_token_seqlens, packed_vit_tokens, packed_vit_position_ids = list(), list(), list()
302
+ packed_text_ids, packed_text_indexes = list(), list()
303
+ packed_seqlens, packed_position_ids, packed_indexes = list(), list(), list()
304
+ packed_key_value_indexes = list()
305
+
306
+ _curr = curr = 0
307
+ newlens, new_rope = list(), list()
308
+ for image, curr_kvlen, curr_position_id in zip(images, curr_kvlens, curr_rope):
309
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
310
+ curr += curr_kvlen
311
+
312
+ packed_text_ids.append(new_token_ids['start_of_image'])
313
+ packed_text_indexes.append(_curr)
314
+ packed_indexes.append(curr)
315
+ curr += 1
316
+ _curr += 1
317
+
318
+ image_tensor = transforms(image)
319
+ vit_position_ids = self.get_flattened_position_ids(
320
+ image_tensor.size(1), image_tensor.size(2),
321
+ self.vit_patch_size,
322
+ max_num_patches_per_side=self.vit_max_num_patch_per_side
323
+ )
324
+ vit_tokens = patchify(image_tensor, self.vit_patch_size)
325
+ packed_vit_tokens.append(vit_tokens)
326
+ num_img_tokens = vit_tokens.shape[0]
327
+ packed_vit_position_ids.append(vit_position_ids)
328
+ vit_token_seqlens.append(num_img_tokens)
329
+ packed_vit_token_indexes.extend(range(_curr, _curr + num_img_tokens))
330
+ packed_indexes.extend(range(curr, curr + num_img_tokens))
331
+ curr += num_img_tokens
332
+ _curr += num_img_tokens
333
+
334
+ packed_text_ids.append(new_token_ids['end_of_image'])
335
+ packed_text_indexes.append(_curr)
336
+ packed_indexes.append(curr)
337
+ curr += 1
338
+ _curr += 1
339
+
340
+ packed_position_ids.extend([curr_position_id] * (num_img_tokens + 2))
341
+ packed_seqlens.append(num_img_tokens + 2)
342
+ newlens.append(curr_kvlen + num_img_tokens + 2)
343
+ new_rope.append(curr_position_id + 1)
344
+
345
+ generation_input = {
346
+ "packed_text_ids": torch.tensor(packed_text_ids, dtype=torch.long),
347
+ "packed_text_indexes": torch.tensor(packed_text_indexes, dtype=torch.long),
348
+ "vit_token_seqlens": torch.tensor(vit_token_seqlens, dtype=torch.int),
349
+ "packed_vit_tokens": torch.cat(packed_vit_tokens, dim=0),
350
+ "packed_vit_position_ids": torch.cat(packed_vit_position_ids, dim=0),
351
+ "packed_vit_token_indexes": torch.tensor(packed_vit_token_indexes, dtype=torch.long),
352
+ "packed_position_ids": torch.tensor(packed_position_ids, dtype=torch.long),
353
+ "packed_seqlens": torch.tensor(packed_seqlens, dtype=torch.int),
354
+ "packed_indexes": torch.tensor(packed_indexes, dtype=torch.long),
355
+ "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
356
+ "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
357
+ }
358
+
359
+ return generation_input, newlens, new_rope
360
+
361
+ @torch.no_grad
362
+ def forward_cache_update_vit(
363
+ self,
364
+ past_key_values: NaiveCache,
365
+ packed_text_ids: torch.LongTensor,
366
+ packed_text_indexes: torch.LongTensor,
367
+ packed_vit_tokens: torch.Tensor,
368
+ packed_vit_token_indexes: torch.LongTensor,
369
+ packed_vit_position_ids: torch.LongTensor,
370
+ vit_token_seqlens: torch.IntTensor,
371
+ packed_position_ids: torch.LongTensor,
372
+ packed_seqlens: torch.IntTensor,
373
+ packed_indexes: torch.LongTensor,
374
+ packed_key_value_indexes: torch.LongTensor,
375
+ key_values_lens: torch.IntTensor,
376
+ ):
377
+ packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
378
+ packed_sequence = packed_text_embedding.new_zeros((sum(packed_seqlens), self.hidden_size))
379
+ packed_sequence[packed_text_indexes] = packed_text_embedding
380
+
381
+ cu_seqlens = torch.nn.functional.pad(torch.cumsum(vit_token_seqlens, dim=0), (1, 0))
382
+ cu_seqlens = cu_seqlens.to(torch.int32)
383
+ max_seqlen = torch.max(vit_token_seqlens).item()
384
+ packed_vit_token_embed = self.vit_model(
385
+ packed_pixel_values=packed_vit_tokens,
386
+ packed_flattened_position_ids=packed_vit_position_ids,
387
+ cu_seqlens=cu_seqlens,
388
+ max_seqlen=max_seqlen,
389
+ )
390
+ packed_vit_token_embed = self.connector(packed_vit_token_embed)
391
+ pos_emb = self.vit_pos_embed(packed_vit_position_ids)
392
+ packed_vit_token_embed = packed_vit_token_embed + pos_emb
393
+ if packed_vit_token_embed.dtype != packed_sequence.dtype:
394
+ packed_vit_token_embed = packed_vit_token_embed.to(packed_sequence.dtype)
395
+ packed_sequence[packed_vit_token_indexes] = packed_vit_token_embed
396
+
397
+ extra_inputs = {}
398
+ if self.use_moe:
399
+ extra_inputs = {"mode": "und"}
400
+
401
+ output = self.language_model.forward_inference(
402
+ packed_query_sequence=packed_sequence,
403
+ query_lens=packed_seqlens,
404
+ packed_query_position_ids=packed_position_ids,
405
+ packed_query_indexes=packed_indexes,
406
+ past_key_values=past_key_values,
407
+ packed_key_value_indexes=packed_key_value_indexes,
408
+ key_values_lens=key_values_lens,
409
+ update_past_key_values=True,
410
+ is_causal=False,
411
+ **extra_inputs,
412
+ )
413
+ past_key_values = output.past_key_values
414
+
415
+ return past_key_values
416
+
417
+ def prepare_vae_images(self, curr_kvlens, curr_rope, images, transforms, new_token_ids, timestep=0):
418
+ patchified_vae_latent_shapes, packed_vae_position_ids = list(), list()
419
+ packed_vae_token_indexes = list()
420
+ packed_text_ids, packed_text_indexes = list(), list()
421
+ packed_seqlens, packed_position_ids, packed_indexes = list(), list(), list()
422
+ packed_key_value_indexes = list()
423
+
424
+ _curr = curr = 0
425
+ vae_image_tensors = list()
426
+ newlens, new_rope = list(), list()
427
+ for image, curr_kvlen, curr_position_id in zip(images, curr_kvlens, curr_rope):
428
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
429
+ curr += curr_kvlen
430
+
431
+ packed_text_ids.append(new_token_ids['start_of_image'])
432
+ packed_text_indexes.append(_curr)
433
+ packed_indexes.append(curr)
434
+ curr += 1
435
+ _curr += 1
436
+
437
+ image_tensor = transforms(image)
438
+ vae_image_tensors.append(image_tensor)
439
+ vae_position_ids = self.get_flattened_position_ids(
440
+ image_tensor.size(1), image_tensor.size(2),
441
+ self.latent_downsample,
442
+ max_num_patches_per_side=self.max_latent_size
443
+ )
444
+ packed_vae_position_ids.append(vae_position_ids)
445
+ H, W = image_tensor.shape[1:]
446
+ h = H // self.latent_downsample
447
+ w = W // self.latent_downsample
448
+ patchified_vae_latent_shapes.append((h, w))
449
+
450
+ num_img_tokens = w * h
451
+ packed_vae_token_indexes.extend(range(_curr, _curr + num_img_tokens))
452
+ packed_indexes.extend(range(curr, curr + num_img_tokens))
453
+ curr += num_img_tokens
454
+ _curr += num_img_tokens
455
+
456
+ packed_text_ids.append(new_token_ids['end_of_image'])
457
+ packed_text_indexes.append(_curr)
458
+ packed_indexes.append(curr)
459
+ curr += 1
460
+ _curr += 1
461
+
462
+ packed_position_ids.extend([curr_position_id] * (num_img_tokens + 2))
463
+ packed_seqlens.append(num_img_tokens + 2)
464
+ newlens.append(curr_kvlen + num_img_tokens + 2)
465
+ new_rope.append(curr_position_id + 1)
466
+
467
+ image_sizes = [item.shape for item in vae_image_tensors]
468
+ max_image_size = [max(item) for item in list(zip(*image_sizes))]
469
+ padded_images = torch.zeros(size=(len(vae_image_tensors), *max_image_size))
470
+ for i, image_tensor in enumerate(vae_image_tensors):
471
+ padded_images[i, :, :image_tensor.shape[1], :image_tensor.shape[2]] = image_tensor
472
+
473
+ generation_input = {
474
+ "padded_images": padded_images,
475
+ "patchified_vae_latent_shapes": patchified_vae_latent_shapes,
476
+ "packed_vae_position_ids": torch.cat(packed_vae_position_ids, dim=0),
477
+ "packed_timesteps": torch.tensor([timestep]),
478
+ "packed_vae_token_indexes": torch.tensor(packed_vae_token_indexes, dtype=torch.long),
479
+ "packed_text_ids": torch.tensor(packed_text_ids, dtype=torch.long),
480
+ "packed_text_indexes": torch.tensor(packed_text_indexes, dtype=torch.long),
481
+ "packed_position_ids": torch.tensor(packed_position_ids, dtype=torch.long),
482
+ "packed_seqlens": torch.tensor(packed_seqlens, dtype=torch.int),
483
+ "packed_indexes": torch.tensor(packed_indexes, dtype=torch.long),
484
+ "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
485
+ "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
486
+ }
487
+
488
+ return generation_input, newlens, new_rope
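# Note on the token count above (sizes illustrative): with the VAE downsample of 8 (see load_ae)
# and the default latent_patch_size of 2, latent_downsample = 16, so a 512x768 image gives
# h, w = 32, 48 and num_img_tokens = 1536, i.e. 1538 packed positions once the
# start_of_image / end_of_image tokens are included.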
489
+
490
+ @torch.no_grad
491
+ def forward_cache_update_vae(
492
+ self,
493
+ vae_model,
494
+ past_key_values: NaiveCache,
495
+ padded_images: torch.Tensor,
496
+ patchified_vae_latent_shapes: List,
497
+ packed_vae_position_ids: torch.LongTensor,
498
+ packed_timesteps: torch.Tensor,
499
+ packed_vae_token_indexes: torch.LongTensor,
500
+ packed_text_ids: torch.LongTensor,
501
+ packed_text_indexes: torch.LongTensor,
502
+ packed_position_ids: torch.LongTensor,
503
+ packed_seqlens: torch.IntTensor,
504
+ packed_indexes: torch.LongTensor,
505
+ key_values_lens: torch.IntTensor,
506
+ packed_key_value_indexes: torch.Tensor,
507
+ ):
508
+ packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
509
+ packed_sequence = packed_text_embedding.new_zeros((sum(packed_seqlens), self.hidden_size))
510
+ packed_sequence[packed_text_indexes] = packed_text_embedding
511
+
512
+ padded_latent = vae_model.encode(padded_images)
513
+
514
+ p = self.latent_patch_size
515
+ packed_latent = list()
516
+ for latent, (h, w) in zip(padded_latent, patchified_vae_latent_shapes):
517
+ latent = latent[:, :h * p, :w * p].reshape(self.latent_channel, h, p, w, p)
518
+ latent = torch.einsum("chpwq->hwpqc", latent).reshape(-1, p * p * self.latent_channel)
519
+ packed_latent.append(latent)
520
+ packed_latent = torch.cat(packed_latent, dim=0)
521
+ packed_pos_embed = self.latent_pos_embed(packed_vae_position_ids)
522
+ packed_timestep_embeds = self.time_embedder(packed_timesteps)
523
+ packed_latent = self.vae2llm(packed_latent) + packed_timestep_embeds + packed_pos_embed
524
+ if packed_latent.dtype != packed_sequence.dtype:
525
+ packed_latent = packed_latent.to(packed_sequence.dtype)
526
+ packed_sequence[packed_vae_token_indexes] = packed_latent
527
+
528
+ extra_inputs = {}
529
+ if self.use_moe:
530
+ extra_inputs = {
531
+ "mode": "gen",
532
+ "packed_vae_token_indexes": packed_vae_token_indexes,
533
+ "packed_text_indexes": packed_text_indexes
534
+ }
535
+
536
+ output = self.language_model.forward_inference(
537
+ packed_query_sequence=packed_sequence,
538
+ query_lens=packed_seqlens,
539
+ packed_query_position_ids=packed_position_ids,
540
+ packed_query_indexes=packed_indexes,
541
+ past_key_values=past_key_values,
542
+ key_values_lens=key_values_lens,
543
+ packed_key_value_indexes=packed_key_value_indexes,
544
+ update_past_key_values=True,
545
+ is_causal=False,
546
+ **extra_inputs,
547
+ )
548
+ past_key_values = output.past_key_values
549
+
550
+ return past_key_values
551
+
552
+ def prepare_vae_latent(self, curr_kvlens, curr_rope, image_sizes, new_token_ids):
553
+ packed_text_ids, packed_text_indexes = list(), list()
554
+ packed_vae_position_ids, packed_vae_token_indexes, packed_init_noises = list(), list(), list()
555
+ packed_position_ids, packed_seqlens, packed_indexes = list(), list(), list()
556
+ packed_key_value_indexes = list()
557
+
558
+ query_curr = curr = 0
559
+ for (H, W), curr_kvlen, curr_position_id in zip(image_sizes, curr_kvlens, curr_rope):
560
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
561
+ curr += curr_kvlen
562
+
563
+ packed_text_ids.append(new_token_ids['start_of_image'])
564
+ packed_text_indexes.append(query_curr)
565
+ packed_indexes.append(curr)
566
+ curr += 1
567
+ query_curr += 1
568
+
569
+ vae_position_ids = self.get_flattened_position_ids(
570
+ H, W,
571
+ self.latent_downsample,
572
+ max_num_patches_per_side=self.max_latent_size
573
+ )
574
+ packed_vae_position_ids.append(vae_position_ids)
575
+
576
+ h, w = H // self.latent_downsample, W // self.latent_downsample
577
+ num_image_tokens = h * w
578
+ packed_init_noises.append(
579
+ torch.randn(num_image_tokens, self.latent_channel * self.latent_patch_size ** 2)
580
+ )
581
+ packed_vae_token_indexes.extend(range(query_curr, query_curr + num_image_tokens))
582
+ packed_indexes.extend(range(curr, curr + num_image_tokens))
583
+ curr += num_image_tokens
584
+ query_curr += num_image_tokens
585
+
586
+ packed_text_ids.append(new_token_ids['end_of_image'])
587
+ packed_text_indexes.append(query_curr)
588
+ packed_indexes.append(curr)
589
+ curr += 1
590
+ query_curr += 1
591
+
592
+ packed_position_ids.extend([curr_position_id] * (num_image_tokens + 2))
593
+ packed_seqlens.append(num_image_tokens + 2)
594
+
595
+ generation_input = {
596
+ "packed_text_ids": torch.tensor(packed_text_ids, dtype=torch.long),
597
+ "packed_text_indexes": torch.tensor(packed_text_indexes, dtype=torch.long),
598
+ "packed_init_noises": torch.cat(packed_init_noises, dim=0),
599
+ "packed_vae_position_ids": torch.cat(packed_vae_position_ids, dim=0),
600
+ "packed_vae_token_indexes": torch.tensor(packed_vae_token_indexes, dtype=torch.long),
601
+ "packed_seqlens": torch.tensor(packed_seqlens, dtype=torch.int),
602
+ "packed_position_ids": torch.tensor(packed_position_ids, dtype=torch.long),
603
+ "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
604
+ "packed_indexes": torch.tensor(packed_indexes, dtype=torch.long),
605
+ "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
606
+ }
607
+
608
+ return generation_input
609
+
610
+ def prepare_vae_latent_cfg(self, curr_kvlens, curr_rope, image_sizes):
611
+ packed_position_ids, packed_indexes, packed_key_value_indexes = list(), list(), list()
612
+
613
+ query_curr = curr = 0
614
+ for (H, W), curr_kvlen, curr_position_id in zip(image_sizes, curr_kvlens, curr_rope):
615
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
616
+ curr += curr_kvlen
617
+
618
+ packed_indexes.append(curr)
619
+ curr += 1
620
+ query_curr += 1
621
+
622
+ h, w = H // self.latent_downsample, W // self.latent_downsample
623
+ num_image_tokens = h * w
624
+ packed_indexes.extend(range(curr, curr + num_image_tokens))
625
+ curr += num_image_tokens
626
+ query_curr += num_image_tokens
627
+
628
+ packed_indexes.append(curr)
629
+ curr += 1
630
+ query_curr += 1
631
+
632
+ packed_position_ids.extend([curr_position_id] * (num_image_tokens + 2))
633
+
634
+ generation_input = {
635
+ "cfg_packed_position_ids": torch.tensor(packed_position_ids, dtype=torch.long),
636
+ "cfg_key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
637
+ "cfg_packed_query_indexes": torch.tensor(packed_indexes, dtype=torch.long),
638
+ "cfg_packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
639
+ }
640
+
641
+ return generation_input
642
+
643
+ @torch.no_grad
644
+ def generate_image(
645
+ self,
646
+ packed_text_ids: torch.LongTensor,
647
+ packed_text_indexes: torch.LongTensor,
648
+ packed_init_noises: torch.Tensor,
649
+ packed_vae_position_ids: torch.LongTensor,
650
+ packed_vae_token_indexes: torch.LongTensor,
651
+ packed_seqlens: torch.IntTensor,
652
+ packed_position_ids: torch.LongTensor,
653
+ packed_indexes: torch.LongTensor,
654
+ past_key_values: NaiveCache,
655
+ key_values_lens: torch.IntTensor,
656
+ packed_key_value_indexes: torch.LongTensor,
657
+ num_timesteps: int = 24,
658
+ timestep_shift: float = 1.0,
659
+ cfg_renorm_min: float = 0.0,
660
+ cfg_renorm_type: str = "global",
661
+ cfg_interval: Optional[Tuple[float, float]] = (0.0, 1.0),
662
+ # cfg_text
663
+ cfg_text_scale: float = 1.0,
664
+ cfg_text_packed_query_indexes: Optional[torch.LongTensor] = None,
665
+ cfg_text_packed_position_ids: Optional[torch.LongTensor] = None,
666
+ cfg_text_past_key_values: Optional[NaiveCache] = None,
667
+ cfg_text_key_values_lens: Optional[torch.IntTensor] = None,
668
+ cfg_text_packed_key_value_indexes: Optional[torch.LongTensor] = None,
669
+ # cfg_img
670
+ cfg_img_scale: float = 1.0,
671
+ cfg_img_packed_query_indexes: Optional[torch.LongTensor] = None,
672
+ cfg_img_packed_position_ids: Optional[torch.LongTensor] = None,
673
+ cfg_img_past_key_values: Optional[NaiveCache] = None,
674
+ cfg_img_key_values_lens: Optional[torch.IntTensor] = None,
675
+ cfg_img_packed_key_value_indexes: Optional[torch.LongTensor] = None,
676
+ cfg_type: str = "parallel",
677
+ ):
678
+ x_t = packed_init_noises
679
+
680
+ timesteps = torch.linspace(1, 0, num_timesteps, device=x_t.device)
681
+ timesteps = timestep_shift * timesteps / (1 + (timestep_shift - 1) * timesteps)
682
+ dts = timesteps[:-1] - timesteps[1:]
683
+ timesteps = timesteps[:-1]
684
+
685
+ for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
686
+
687
+ timestep = torch.tensor([t] * x_t.shape[0], device=x_t.device)
688
+ if t > cfg_interval[0] and t <= cfg_interval[1]:
689
+ cfg_text_scale_ = cfg_text_scale
690
+ cfg_img_scale_ = cfg_img_scale
691
+ else:
692
+ cfg_text_scale_ = 1.0
693
+ cfg_img_scale_ = 1.0
694
+ v_t = self._forward_flow(
695
+ x_t=x_t,
696
+ timestep=timestep,
697
+ packed_vae_token_indexes=packed_vae_token_indexes,
698
+ packed_vae_position_ids=packed_vae_position_ids,
699
+ packed_text_ids=packed_text_ids,
700
+ packed_text_indexes=packed_text_indexes,
701
+ packed_position_ids=packed_position_ids,
702
+ packed_indexes=packed_indexes,
703
+ packed_seqlens=packed_seqlens,
704
+ key_values_lens=key_values_lens,
705
+ past_key_values=past_key_values,
706
+ packed_key_value_indexes=packed_key_value_indexes,
707
+ cfg_renorm_min=cfg_renorm_min,
708
+ cfg_renorm_type=cfg_renorm_type,
709
+ # cfg_text
710
+ cfg_text_scale=cfg_text_scale_,
711
+ cfg_text_packed_position_ids=cfg_text_packed_position_ids,
712
+ cfg_text_packed_query_indexes=cfg_text_packed_query_indexes,
713
+ cfg_text_key_values_lens=cfg_text_key_values_lens,
714
+ cfg_text_past_key_values=cfg_text_past_key_values,
715
+ cfg_text_packed_key_value_indexes=cfg_text_packed_key_value_indexes,
716
+ # cfg_img
717
+ cfg_img_scale=cfg_img_scale_,
718
+ cfg_img_packed_position_ids=cfg_img_packed_position_ids,
719
+ cfg_img_packed_query_indexes=cfg_img_packed_query_indexes,
720
+ cfg_img_key_values_lens=cfg_img_key_values_lens,
721
+ cfg_img_past_key_values=cfg_img_past_key_values,
722
+ cfg_img_packed_key_value_indexes=cfg_img_packed_key_value_indexes,
723
+ cfg_type=cfg_type,
724
+ )
725
+
726
+ x_t = x_t - v_t.to(x_t.device) * dts[i] # velocity pointing from data to noise
727
+
728
+ unpacked_latent = x_t.split((packed_seqlens - 2).tolist())
729
+ return unpacked_latent
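# Sketch of the shifted schedule and Euler update used in generate_image above (numbers illustrative):
#
#   import torch
#   num_timesteps, timestep_shift = 5, 3.0
#   t = torch.linspace(1, 0, num_timesteps)
#   t = timestep_shift * t / (1 + (timestep_shift - 1) * t)  # same warp as above; keeps t=1 and t=0 fixed
#   dts = t[:-1] - t[1:]                                     # positive step sizes
#   # each step applies x_t <- x_t - v_t * dt, walking from pure noise (t=1) toward the clean latent (t=0)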
730
+
731
+ @torch.no_grad
732
+ def _forward_flow(
733
+ self,
734
+ x_t: torch.Tensor,
735
+ timestep: torch.LongTensor,
736
+ packed_vae_token_indexes: torch.LongTensor,
737
+ packed_vae_position_ids: torch.LongTensor,
738
+ packed_text_ids: torch.LongTensor,
739
+ packed_text_indexes: torch.LongTensor,
740
+ packed_indexes: torch.LongTensor,
741
+ packed_position_ids: torch.LongTensor,
742
+ packed_seqlens: torch.IntTensor,
743
+ key_values_lens: torch.IntTensor,
744
+ past_key_values: NaiveCache,
745
+ packed_key_value_indexes: torch.LongTensor,
746
+ cfg_renorm_min: float = 0.0,
747
+ cfg_renorm_type: str = "global",
748
+ # cfg_text
749
+ cfg_text_scale: float = 1.0,
750
+ cfg_text_packed_position_ids: Optional[torch.LongTensor] = None,
751
+ cfg_text_packed_query_indexes: Optional[torch.LongTensor] = None,
752
+ cfg_text_key_values_lens: Optional[torch.Tensor] = None,
753
+ cfg_text_past_key_values: Optional[NaiveCache] = None,
754
+ cfg_text_packed_key_value_indexes: Optional[torch.LongTensor] = None,
755
+ # cfg_img
756
+ cfg_img_scale: float = 1.0,
757
+ cfg_img_packed_position_ids: Optional[torch.LongTensor] = None,
758
+ cfg_img_packed_query_indexes: Optional[torch.LongTensor] = None,
759
+ cfg_img_key_values_lens: Optional[torch.Tensor] = None,
760
+ cfg_img_past_key_values: Optional[NaiveCache] = None,
761
+ cfg_img_packed_key_value_indexes: Optional[torch.LongTensor] = None,
762
+ cfg_type: str = "parallel",
763
+ ):
764
+ packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
765
+ packed_sequence = packed_text_embedding.new_zeros((sum(packed_seqlens), self.hidden_size))
766
+ packed_sequence[packed_text_indexes] = packed_text_embedding
767
+
768
+ assert timestep.unique().shape[0] == 1
769
+ packed_pos_embed = self.latent_pos_embed(packed_vae_position_ids)
770
+ packed_timestep_embeds = self.time_embedder(timestep)
771
+ x_t = self.vae2llm(x_t) + packed_timestep_embeds + packed_pos_embed
772
+ if x_t.dtype != packed_sequence.dtype:
773
+ x_t = x_t.to(packed_sequence.dtype)
774
+ packed_sequence[packed_vae_token_indexes] = x_t
775
+
776
+ extra_inputs = {}
777
+ if self.use_moe:
778
+ extra_inputs = {
779
+ "mode": "gen",
780
+ "packed_vae_token_indexes": packed_vae_token_indexes,
781
+ "packed_text_indexes": packed_text_indexes
782
+ }
783
+
784
+ output = self.language_model.forward_inference(
785
+ packed_query_sequence=packed_sequence,
786
+ query_lens=packed_seqlens,
787
+ packed_query_position_ids=packed_position_ids,
788
+ packed_query_indexes=packed_indexes,
789
+ past_key_values=past_key_values,
790
+ key_values_lens=key_values_lens,
791
+ packed_key_value_indexes=packed_key_value_indexes,
792
+ update_past_key_values=False,
793
+ is_causal=False,
794
+ **extra_inputs,
795
+ )
796
+ v_t = self.llm2vae(output.packed_query_sequence)
797
+ v_t = v_t[packed_vae_token_indexes]
798
+
799
+ if cfg_text_scale > 1.0:
800
+ cfg_text_output = self.language_model.forward_inference(
801
+ packed_query_sequence=packed_sequence,
802
+ query_lens=packed_seqlens,
803
+ packed_query_position_ids=cfg_text_packed_position_ids,
804
+ packed_query_indexes=cfg_text_packed_query_indexes,
805
+ past_key_values=cfg_text_past_key_values,
806
+ key_values_lens=cfg_text_key_values_lens,
807
+ packed_key_value_indexes=cfg_text_packed_key_value_indexes,
808
+ update_past_key_values=False,
809
+ is_causal=False,
810
+ **extra_inputs,
811
+ )
812
+ cfg_text_v_t = self.llm2vae(cfg_text_output.packed_query_sequence)
813
+ cfg_text_v_t = cfg_text_v_t[packed_vae_token_indexes]
814
+
815
+ if cfg_img_scale > 1.0:
816
+ cfg_img_output = self.language_model.forward_inference(
817
+ packed_query_sequence=packed_sequence,
818
+ query_lens=packed_seqlens,
819
+ packed_query_position_ids=cfg_img_packed_position_ids,
820
+ packed_query_indexes=cfg_img_packed_query_indexes,
821
+ past_key_values=cfg_img_past_key_values,
822
+ key_values_lens=cfg_img_key_values_lens,
823
+ packed_key_value_indexes=cfg_img_packed_key_value_indexes,
824
+ update_past_key_values=False,
825
+ is_causal=False,
826
+ **extra_inputs,
827
+ )
828
+ cfg_img_v_t = self.llm2vae(cfg_img_output.packed_query_sequence)
829
+ cfg_img_v_t = cfg_img_v_t[packed_vae_token_indexes]
830
+
831
+ if cfg_text_scale > 1.0:
832
+ if cfg_renorm_type == "text_channel":
833
+ v_t_text_ = cfg_text_v_t + cfg_text_scale * (v_t - cfg_text_v_t)
834
+ norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
835
+ norm_v_t_text_ = torch.norm(v_t_text_, dim=-1, keepdim=True)
836
+ scale = (norm_v_t / (norm_v_t_text_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
837
+ v_t_text = v_t_text_ * scale
838
+ if cfg_img_scale > 1.0:
839
+ v_t = cfg_img_v_t + cfg_img_scale * (v_t_text - cfg_img_v_t)
840
+ else:
841
+ v_t = v_t_text
842
+ else:
843
+ v_t_text_ = cfg_text_v_t + cfg_text_scale * (v_t - cfg_text_v_t)
844
+
845
+ if cfg_img_scale > 1.0:
846
+ v_t_ = cfg_img_v_t + cfg_img_scale * (v_t_text_ - cfg_img_v_t)
847
+ else:
848
+ v_t_ = v_t_text_
849
+
850
+ # NOTE norm is computed over all dimensions, thus currently only supports batch_size = 1 with navit
851
+ if cfg_renorm_type == "global":
852
+ norm_v_t = torch.norm(v_t)
853
+ norm_v_t_ = torch.norm(v_t_)
854
+ elif cfg_renorm_type == "channel":
855
+ norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
856
+ norm_v_t_ = torch.norm(v_t_, dim=-1, keepdim=True)
857
+ else:
858
+ raise NotImplementedError(f"{cfg_renorm_type} is not supported")
859
+ scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
860
+ v_t = v_t_ * scale
861
+ else:
862
+ # No CFG
863
+ pass
864
+
865
+ return v_t
866
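A compact sketch of the guidance-plus-renormalization pattern applied above, shown for a single conditional/unconditional pair (the method itself chains text and image guidance):

    import torch

    def guide_and_renorm(v_cond, v_uncond, scale, renorm_type="global", renorm_min=0.0):
        v_guided = v_uncond + scale * (v_cond - v_uncond)
        if renorm_type == "global":
            norm, norm_g = torch.norm(v_cond), torch.norm(v_guided)
        elif renorm_type == "channel":
            norm = torch.norm(v_cond, dim=-1, keepdim=True)
            norm_g = torch.norm(v_guided, dim=-1, keepdim=True)
        else:
            raise NotImplementedError(f"{renorm_type} is not supported")
        rescale = (norm / (norm_g + 1e-8)).clamp(min=renorm_min, max=1.0)
        return v_guided * rescale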
+
867
+ def prepare_start_tokens(self, curr_kvlens, curr_rope, new_token_ids):
868
+ packed_start_tokens, packed_key_value_indexes = list(), list()
869
+ packed_query_position_ids = list()
870
+
871
+ curr = 0
872
+ for curr_kvlen, curr_position_id in zip(curr_kvlens, curr_rope):
873
+ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
874
+ packed_start_tokens.append(new_token_ids['bos_token_id'])
875
+ packed_query_position_ids.append(curr_position_id)
876
+ curr += curr_kvlen
877
+
878
+ generation_input = {
879
+ "packed_start_tokens": torch.tensor(packed_start_tokens, dtype=torch.long),
880
+ "packed_query_position_ids": torch.tensor(packed_query_position_ids, dtype=torch.long),
881
+ "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int),
882
+ "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long),
883
+ }
884
+
885
+ return generation_input
886
+
887
+ @torch.no_grad
888
+ def generate_text(
889
+ self,
890
+ past_key_values: NaiveCache,
891
+ packed_key_value_indexes: torch.LongTensor,
892
+ key_values_lens: torch.IntTensor,
893
+ packed_start_tokens: torch.LongTensor,
894
+ packed_query_position_ids: torch.LongTensor,
895
+ max_length: int,
896
+ do_sample: bool = False,
897
+ temperature: float = 1.0,
898
+ end_token_id: int = None,
899
+ ):
900
+ step = 0
901
+ generated_sequence = []
902
+ curr_tokens = packed_start_tokens
903
+ while step < max_length:
904
+ generated_sequence.append(curr_tokens)
905
+ packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens)
906
+ query_lens = torch.ones_like(curr_tokens)
907
+ packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange(
908
+ 0, len(key_values_lens),
909
+ device=key_values_lens.device,
910
+ dtype=key_values_lens.dtype
911
+ )
912
+
913
+ uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
914
+ for i in range(len(uppacked)):
915
+ uppacked[i] += i
916
+ packed_key_value_indexes = torch.cat(uppacked, dim=0)
917
+
918
+ extra_inputs = {}
919
+ if self.use_moe:
920
+ extra_inputs = {"mode": "und"}
921
+
922
+ output = self.language_model.forward_inference(
923
+ packed_query_sequence=packed_text_embedding,
924
+ query_lens=query_lens,
925
+ packed_query_position_ids=packed_query_position_ids,
926
+ packed_query_indexes=packed_query_indexes,
927
+ past_key_values=past_key_values,
928
+ key_values_lens=key_values_lens,
929
+ packed_key_value_indexes=packed_key_value_indexes,
930
+ update_past_key_values=True,
931
+ is_causal=True,
932
+ **extra_inputs,
933
+ )
934
+ past_key_values = output.past_key_values
935
+ packed_query_sequence = output.packed_query_sequence
936
+ pred_logits = self.language_model.lm_head(packed_query_sequence)
937
+
938
+ if do_sample:
939
+ probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
940
+ curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
941
+ else:
942
+ curr_tokens = torch.argmax(pred_logits, dim=-1)
943
+
944
+ uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
945
+ for i in range(len(uppacked)):
946
+ uppacked[i] = torch.cat(
947
+ [uppacked[i], torch.tensor([uppacked[i][-1] + 1], device=uppacked[i].device)], dim=0
948
+ )
949
+ packed_key_value_indexes = torch.cat(uppacked, dim=0)
950
+ key_values_lens = key_values_lens + 1
951
+ packed_query_position_ids = packed_query_position_ids + 1
952
+ step += 1
953
+
954
+ if end_token_id is not None and curr_tokens[0] == end_token_id: # only support batch=1
955
+ # Check if next token would be vision_start (151652)
956
+ generated_sequence.append(curr_tokens) # Add the end token
957
+
958
+ # Generate one more token to check if it's vision_start
959
+ packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens)
960
+
961
+ uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
962
+ for i in range(len(uppacked)):
963
+ uppacked[i] += i
964
+ packed_key_value_indexes = torch.cat(uppacked, dim=0)
965
+
966
+ output = self.language_model.forward_inference(
967
+ packed_query_sequence=packed_text_embedding,
968
+ query_lens=query_lens,
969
+ packed_query_position_ids=packed_query_position_ids,
970
+ packed_query_indexes=packed_query_indexes,
971
+ past_key_values=past_key_values,
972
+ key_values_lens=key_values_lens,
973
+ packed_key_value_indexes=packed_key_value_indexes,
974
+ update_past_key_values=False,
975
+ is_causal=True,
976
+ **extra_inputs,
977
+ )
978
+
979
+ pred_logits = self.language_model.lm_head(output.packed_query_sequence)
980
+ if do_sample:
981
+ probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
982
+ next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
983
+ else:
984
+ next_token = torch.argmax(pred_logits, dim=-1)
985
+
986
+ # If next token is vision_start (151652), include it
987
+ if next_token[0] == 151652:
988
+ generated_sequence.append(next_token)
989
+
990
+ break
991
+
992
+ output_device = generated_sequence[0].device
993
+ return torch.stack([i.to(output_device) for i in generated_sequence], dim=0)
994
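The per-step token choice in `generate_text` reduces to temperature sampling or greedy argmax over the head logits; a minimal sketch:

    import torch
    from torch import nn

    def select_next_token(pred_logits, do_sample=False, temperature=1.0):
        if do_sample:
            probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
            return torch.multinomial(probs, num_samples=1).squeeze(1)
        return torch.argmax(pred_logits, dim=-1)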
+
995
+ # for evaluation
996
+ @torch.no_grad()
997
+ def chat(
998
+ self,
999
+ tokenizer,
1000
+ new_token_ids,
1001
+ image_transform,
1002
+ images,
1003
+ prompt,
1004
+ max_length: int,
1005
+ do_sample: bool = False,
1006
+ temperature: float = 1.0,
1007
+ ):
1008
+ device = next(self.parameters()).device
1009
+
1010
+ if isinstance(new_token_ids, dict):
1011
+ for k, v in new_token_ids.items():
1012
+ if torch.is_tensor(v):
1013
+ new_token_ids[k] = v.to(device)
1014
+ elif torch.is_tensor(new_token_ids):
1015
+ new_token_ids = new_token_ids.to(device)
1016
+
1017
+ # prefill
1018
+ past_key_values = NaiveCache(self.config.llm_config.num_hidden_layers)
1019
+ newlens = [0]
1020
+ new_rope = [0]
1021
+
1022
+ # add images
1023
+ for image in images:
1024
+ generation_input, newlens, new_rope = self.prepare_vit_images(
1025
+ curr_kvlens=newlens,
1026
+ curr_rope=new_rope,
1027
+ images=[image],
1028
+ transforms=image_transform,
1029
+ new_token_ids=new_token_ids,
1030
+ )
1031
+ for k, v in generation_input.items():
1032
+ if torch.is_tensor(v):
1033
+ generation_input[k] = v.to(device)
1034
+ with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
1035
+ past_key_values = self.forward_cache_update_vit(past_key_values, **generation_input)
1036
+
1037
+ # add text
1038
+ generation_input, newlens, new_rope = self.prepare_prompts(
1039
+ curr_kvlens=newlens,
1040
+ curr_rope=new_rope,
1041
+ prompts=[prompt],
1042
+ tokenizer=tokenizer,
1043
+ new_token_ids=new_token_ids,
1044
+ )
1045
+ for k, v in generation_input.items():
1046
+ if torch.is_tensor(v):
1047
+ generation_input[k] = v.to(device)
1048
+ with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
1049
+ past_key_values = self.forward_cache_update_text(past_key_values, **generation_input)
1050
+
1051
+ # decode
1052
+ generation_input = self.prepare_start_tokens(newlens, new_rope, new_token_ids)
1053
+ for k, v in generation_input.items():
1054
+ if torch.is_tensor(v):
1055
+ generation_input[k] = v.to(device)
1056
+ with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
1057
+ unpacked_latent = self.generate_text(
1058
+ past_key_values=past_key_values,
1059
+ max_length=max_length,
1060
+ do_sample=do_sample,
1061
+ temperature=temperature,
1062
+ end_token_id=new_token_ids['eos_token_id'],
1063
+ **generation_input,
1064
+ )
1065
+ output = tokenizer.decode(unpacked_latent[:,0])
1066
+ output = output.split('<|im_end|>')[0].split('<|im_start|>')[1]
1067
+
1068
+ return output
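A hypothetical usage sketch for `chat`; the `model`, `tokenizer`, `new_token_ids`, and `image_transform` objects are assumed to come from the usual inference setup and are not defined here:

    from PIL import Image

    images = [Image.open("test_images/meme.jpg").convert("RGB")]
    answer = model.chat(
        tokenizer=tokenizer,
        new_token_ids=new_token_ids,
        image_transform=image_transform,
        images=images,
        prompt="Describe this image.",
        max_length=256,
        do_sample=False,
    )
    print(answer)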
modeling/bagel/modeling_utils.py ADDED
@@ -0,0 +1,144 @@
1
+ # Copyright (c) 2022 Facebook, Inc. and its affiliates.
2
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: CC BY-NC 4.0
4
+ #
5
+ # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20.
6
+ #
7
+ # Original file was released under CC BY-NC 4.0, with the full license text
8
+ # available at https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt.
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import math
13
+
14
+ import numpy as np
15
+ import torch
16
+ from torch import nn
17
+ from transformers.activations import ACT2FN
18
+
19
+ # --------------------------------------------------------
20
+ # 2D sine-cosine position embedding
21
+ # References:
22
+ # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
23
+ # --------------------------------------------------------
24
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
25
+ grid_h = np.arange(grid_size, dtype=np.float32)
26
+ grid_w = np.arange(grid_size, dtype=np.float32)
27
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
28
+ grid = np.stack(grid, axis=0)
29
+
30
+ grid = grid.reshape([2, 1, grid_size, grid_size])
31
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
32
+ if cls_token and extra_tokens > 0:
33
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
34
+ return pos_embed
35
+
36
+
37
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
38
+ assert embed_dim % 2 == 0
39
+
40
+ # use half of dimensions to encode grid_h
41
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
42
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
43
+
44
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
45
+ return emb
46
+
47
+
48
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
49
+ """
50
+ embed_dim: output dimension for each position
51
+ pos: a list of positions to be encoded: size (M,)
52
+ out: (M, D)
53
+ """
54
+ assert embed_dim % 2 == 0
55
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
56
+ omega /= embed_dim / 2.
57
+ omega = 1. / 10000**omega # (D/2,)
58
+
59
+ pos = pos.reshape(-1) # (M,)
60
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
61
+
62
+ emb_sin = np.sin(out) # (M, D/2)
63
+ emb_cos = np.cos(out) # (M, D/2)
64
+
65
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
66
+ return emb
67
+
68
+
69
+ # --------------------------------------------------------
70
+ # TimestepEmbedder
71
+ # Reference:
72
+ # DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
73
+ # --------------------------------------------------------
74
+ class TimestepEmbedder(nn.Module):
75
+ """
76
+ Embeds scalar timesteps into vector representations.
77
+ """
78
+ def __init__(self, hidden_size, frequency_embedding_size=256):
79
+ super().__init__()
80
+ self.mlp = nn.Sequential(
81
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
82
+ nn.SiLU(),
83
+ nn.Linear(hidden_size, hidden_size, bias=True),
84
+ )
85
+ self.frequency_embedding_size = frequency_embedding_size
86
+
87
+ @staticmethod
88
+ def timestep_embedding(t, dim, max_period=10000):
89
+ """
90
+ Create sinusoidal timestep embeddings.
91
+ :param t: a 1-D Tensor of N indices, one per batch element.
92
+ These may be fractional.
93
+ :param dim: the dimension of the output.
94
+ :param max_period: controls the minimum frequency of the embeddings.
95
+ :return: an (N, D) Tensor of positional embeddings.
96
+ """
97
+ half = dim // 2
98
+ freqs = torch.exp(
99
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
100
+ ).to(device=t.device)
101
+ args = t[:, None].float() * freqs[None]
102
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
103
+ if dim % 2:
104
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
105
+ return embedding
106
+
107
+ def forward(self, t):
108
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
109
+ t_emb = self.mlp(t_freq)
110
+ return t_emb
111
+
112
+
113
+ class MLPconnector(nn.Module):
114
+ def __init__(self, in_dim: int, out_dim: int, hidden_act: str):
115
+ super().__init__()
116
+ self.activation_fn = ACT2FN[hidden_act]
117
+ self.fc1 = nn.Linear(in_dim, out_dim)
118
+ self.fc2 = nn.Linear(out_dim, out_dim)
119
+
120
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
121
+ hidden_states = self.fc1(hidden_states)
122
+ hidden_states = self.activation_fn(hidden_states)
123
+ hidden_states = self.fc2(hidden_states)
124
+ return hidden_states
125
+
126
+
127
+ class PositionEmbedding(nn.Module):
128
+ def __init__(self, max_num_patch_per_side, hidden_size):
129
+ super().__init__()
130
+ self.max_num_patch_per_side = max_num_patch_per_side
131
+ self.hidden_size = hidden_size
132
+ self.pos_embed = nn.Parameter(
133
+ torch.zeros(max_num_patch_per_side ** 2, hidden_size),
134
+ requires_grad=False
135
+ )
136
+ self._init_weights()
137
+
138
+ def _init_weights(self):
139
+ # Initialize (and freeze) pos_embed by sin-cos embedding:
140
+ pos_embed = get_2d_sincos_pos_embed(self.hidden_size, self.max_num_patch_per_side)
141
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float())
142
+
143
+ def forward(self, position_ids):
144
+ return self.pos_embed[position_ids]
modeling/bagel/qwen2_navit.py ADDED
@@ -0,0 +1,1157 @@
1
+ # Copyright (c) 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20.
6
+ #
7
+ # Original file was released under Apache-2.0, with the full license text
8
+ # available at https://github.com/huggingface/transformers/blob/main/LICENSE.
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+
13
+ from dataclasses import dataclass
14
+ from functools import partial
15
+ from typing import List, Optional, Tuple
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn.attention import SDPBackend, sdpa_kernel
20
+ from torch.nn.attention.flex_attention import flex_attention
21
+ from torch.nn.functional import scaled_dot_product_attention
22
+ from transformers.utils import ModelOutput
23
+
24
+ from flash_attn import flash_attn_varlen_func
25
+ from modeling.qwen2.modeling_qwen2 import (
26
+ Qwen2Attention,
27
+ Qwen2MLP,
28
+ Qwen2PreTrainedModel,
29
+ Qwen2RMSNorm,
30
+ Qwen2RotaryEmbedding,
31
+ apply_rotary_pos_emb,
32
+ )
33
+
34
+ from modeling.qwen2.configuration_qwen2 import Qwen2Config as _Qwen2Config
35
+
36
+
37
+ torch._dynamo.config.cache_size_limit = 512
38
+ torch._dynamo.config.accumulated_cache_size_limit = 4096
39
+ # flex_attention = torch.compile(flex_attention) # , dynamic=True, mode='max-autotune'
40
+ flex_attention = torch.compile(flex_attention)
41
+
42
+
43
+ class Qwen2Config(_Qwen2Config):
44
+ r"""
45
+ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
46
+ Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
47
+ with the defaults will yield a similar configuration to that of
48
+ Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
49
+
50
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
51
+ documentation from [`PretrainedConfig`] for more information.
52
+
53
+ Args:
54
+ vocab_size (`int`, *optional*, defaults to 151936):
55
+ Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
56
+ `inputs_ids` passed when calling [`Qwen2Model`]
57
+ hidden_size (`int`, *optional*, defaults to 4096):
58
+ Dimension of the hidden representations.
59
+ intermediate_size (`int`, *optional*, defaults to 22016):
60
+ Dimension of the MLP representations.
61
+ num_hidden_layers (`int`, *optional*, defaults to 32):
62
+ Number of hidden layers in the Transformer encoder.
63
+ num_attention_heads (`int`, *optional*, defaults to 32):
64
+ Number of attention heads for each attention layer in the Transformer encoder.
65
+ num_key_value_heads (`int`, *optional*, defaults to 32):
66
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
67
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
68
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
69
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
70
+ by meanpooling all the original heads within that group. For more details checkout [this
71
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
72
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
73
+ The non-linear activation function (function or string) in the decoder.
74
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
75
+ The maximum sequence length that this model might ever be used with.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
79
+ The epsilon used by the rms normalization layers.
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
82
+ relevant if `config.is_decoder=True`.
83
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
84
+ Whether the model's input and output word embeddings should be tied.
85
+ rope_theta (`float`, *optional*, defaults to 10000.0):
86
+ The base period of the RoPE embeddings.
87
+ rope_scaling (`Dict`, *optional*):
88
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
89
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
90
+ accordingly.
91
+ Expected contents:
92
+ `rope_type` (`str`):
93
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
94
+ 'llama3'], with 'default' being the original RoPE implementation.
95
+ `factor` (`float`, *optional*):
96
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
97
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
98
+ original maximum pre-trained length.
99
+ `original_max_position_embeddings` (`int`, *optional*):
100
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
101
+ pretraining.
102
+ `attention_factor` (`float`, *optional*):
103
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
104
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
105
+ `factor` field to infer the suggested value.
106
+ `beta_fast` (`float`, *optional*):
107
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
108
+ ramp function. If unspecified, it defaults to 32.
109
+ `beta_slow` (`float`, *optional*):
110
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
111
+ ramp function. If unspecified, it defaults to 1.
112
+ `short_factor` (`List[float]`, *optional*):
113
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
114
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
115
+ size divided by the number of attention heads divided by 2
116
+ `long_factor` (`List[float]`, *optional*):
117
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
118
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
119
+ size divided by the number of attention heads divided by 2
120
+ `low_freq_factor` (`float`, *optional*):
121
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
122
+ `high_freq_factor` (`float`, *optional*):
123
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
124
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
125
+ Whether to use sliding window attention.
126
+ sliding_window (`int`, *optional*, defaults to 4096):
127
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
128
+ max_window_layers (`int`, *optional*, defaults to 28):
129
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
130
+ attention_dropout (`float`, *optional*, defaults to 0.0):
131
+ The dropout ratio for the attention probabilities.
132
+
133
+ ```python
134
+ >>> from transformers import Qwen2Model, Qwen2Config
135
+
136
+ >>> # Initializing a Qwen2 style configuration
137
+ >>> configuration = Qwen2Config()
138
+
139
+ >>> # Initializing a model from the Qwen2-7B style configuration
140
+ >>> model = Qwen2Model(configuration)
141
+
142
+ >>> # Accessing the model configuration
143
+ >>> configuration = model.config
144
+ ```"""
145
+
146
+ model_type = "qwen2"
147
+ keys_to_ignore_at_inference = ["past_key_values"]
148
+
149
+ def __init__(
150
+ self,
151
+ vocab_size=151936,
152
+ hidden_size=4096,
153
+ intermediate_size=22016,
154
+ num_hidden_layers=32,
155
+ num_attention_heads=32,
156
+ num_key_value_heads=32,
157
+ hidden_act="silu",
158
+ max_position_embeddings=32768,
159
+ initializer_range=0.02,
160
+ rms_norm_eps=1e-6,
161
+ use_cache=True,
162
+ tie_word_embeddings=False,
163
+ rope_theta=10000.0,
164
+ rope_scaling=None,
165
+ use_sliding_window=False,
166
+ sliding_window=4096,
167
+ max_window_layers=28,
168
+ attention_dropout=0.0,
169
+ is_causal=True,
170
+ _attn_implementation="flash_attention_2",
171
+ qk_norm=True,
172
+ layer_module="Qwen2DecoderLayer",
173
+ freeze_und=False,
174
+ **kwargs,
175
+ ):
176
+ super().__init__(
177
+ vocab_size=vocab_size,
178
+ hidden_size=hidden_size,
179
+ intermediate_size=intermediate_size,
180
+ num_hidden_layers=num_hidden_layers,
181
+ num_attention_heads=num_attention_heads,
182
+ num_key_value_heads=num_key_value_heads,
183
+ hidden_act=hidden_act,
184
+ max_position_embeddings=max_position_embeddings,
185
+ initializer_range=initializer_range,
186
+ rms_norm_eps=rms_norm_eps,
187
+ use_cache=use_cache,
188
+ tie_word_embeddings=tie_word_embeddings,
189
+ rope_theta=rope_theta,
190
+ rope_scaling=rope_scaling,
191
+ use_sliding_window=use_sliding_window,
192
+ sliding_window=sliding_window,
193
+ max_window_layers=max_window_layers,
194
+ attention_dropout=attention_dropout,
195
+ is_causal=is_causal,
196
+ _attn_implementation=_attn_implementation,
197
+ **kwargs,
198
+ )
199
+ self.qk_norm = qk_norm
200
+ self.layer_module = layer_module
201
+ self.freeze_und = freeze_und
202
+
203
+
204
+ class NaiveCache:
205
+ def __init__(self, num_layers):
206
+ self.key_cache = {k: None for k in range(num_layers)}
207
+ self.value_cache = {k: None for k in range(num_layers)}
208
+
209
+ @property
210
+ def num_layers(self):
211
+ return len(self.key_cache)
212
+
213
+ @property
214
+ def seq_lens(self):
215
+ if self.key_cache[0] is not None:
216
+ return self.key_cache[0].shape[0]
217
+ else:
218
+ return 0
219
+
220
+
221
+ @dataclass
222
+ class BaseNavitOutputWithPast(ModelOutput):
223
+ packed_query_sequence: torch.FloatTensor = None
224
+ past_key_values: Optional[NaiveCache] = None
225
+
226
+
227
+ def pad_sequence(tensor, pad_size):
228
+ H, L, D = tensor.shape
229
+ pad_tensor = tensor.new_zeros((H, pad_size, D))
230
+ return torch.cat([tensor, pad_tensor], dim=1)
231
+
232
+
233
+ class PackedAttention(Qwen2Attention):
234
+ def __init__(self, config, layer_idx: Optional[int] = None):
235
+ super().__init__(config, layer_idx)
236
+ if self.config.qk_norm:
237
+ self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
238
+ self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
239
+ else:
240
+ self.q_norm = nn.Identity()
241
+ self.k_norm = nn.Identity()
242
+
243
+ def forward(self, *args, **kwargs):
244
+ if self.training:
245
+ return self.forward_train(*args, **kwargs)
246
+ else:
247
+ return self.forward_inference(*args, **kwargs)
248
+
249
+ def forward_train(
250
+ self,
251
+ packed_sequence: torch.Tensor,
252
+ sample_lens: List[int],
253
+ attention_mask: List[torch.Tensor],
254
+ packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
255
+ ):
256
+ packed_query_states = self.q_proj(packed_sequence).view(-1, self.num_heads, self.head_dim)
257
+ packed_key_states = self.k_proj(packed_sequence).view(-1, self.num_key_value_heads, self.head_dim)
258
+ packed_value_states = self.v_proj(packed_sequence).view(-1, self.num_key_value_heads, self.head_dim)
259
+
260
+ packed_query_states = self.q_norm(packed_query_states)
261
+ packed_key_states = self.k_norm(packed_key_states)
262
+
263
+ packed_cos, packed_sin = packed_position_embeddings
264
+ packed_query_states, packed_key_states = apply_rotary_pos_emb(
265
+ packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1
266
+ )
267
+
268
+ if isinstance(attention_mask, List):
269
+ packed_key_states = packed_key_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
270
+ packed_key_states = packed_key_states.reshape(-1, self.num_heads, self.head_dim)
271
+ packed_value_states = packed_value_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
272
+ packed_value_states = packed_value_states.reshape(-1, self.num_heads, self.head_dim)
273
+
274
+ unpacked_query_states = packed_query_states.transpose(0, 1).split(sample_lens, dim=1)
275
+ unpacked_key_states = packed_key_states.transpose(0, 1).split(sample_lens, dim=1)
276
+ unpacked_value_states = packed_value_states.transpose(0, 1).split(sample_lens, dim=1)
277
+ upacked_attn_output = []
278
+ for query_states, key_states, value_states, attention_mask_per_sample in zip(
279
+ unpacked_query_states, unpacked_key_states, unpacked_value_states, attention_mask
280
+ ):
281
+ with sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
282
+ attn_output = scaled_dot_product_attention(
283
+ query_states.to(torch.bfloat16).unsqueeze(0),
284
+ key_states.to(torch.bfloat16).unsqueeze(0),
285
+ value_states.to(torch.bfloat16).unsqueeze(0),
286
+ attention_mask_per_sample.to(torch.bfloat16).unsqueeze(0),
287
+ )
288
+ upacked_attn_output.append(attn_output.squeeze(0))
289
+ packed_attn_output = torch.cat(upacked_attn_output, dim=1)
290
+ else:
291
+ pad_size = sum(sample_lens) - packed_query_states.shape[0]
292
+ packed_query_states = pad_sequence(packed_query_states.permute(1, 0, 2), pad_size)
293
+ packed_key_states = pad_sequence(packed_key_states.permute(1, 0, 2), pad_size)
294
+ packed_value_states = pad_sequence(packed_value_states.permute(1, 0, 2), pad_size)
295
+ packed_attn_output = flex_attention(
296
+ packed_query_states.unsqueeze(0),
297
+ packed_key_states.unsqueeze(0),
298
+ packed_value_states.unsqueeze(0),
299
+ enable_gqa=True,
300
+ block_mask=attention_mask,
301
+ )
302
+ end_index = packed_attn_output.shape[2] - pad_size
303
+ packed_attn_output = packed_attn_output[0, :, :end_index, :]
304
+
305
+ packed_attn_output = packed_attn_output.transpose(0, 1).reshape(-1, self.hidden_size)
306
+ packed_attn_output = self.o_proj(packed_attn_output)
307
+
308
+ return packed_attn_output
309
+
310
+ def forward_inference(
311
+ self,
312
+ packed_query_sequence: torch.Tensor,
313
+ query_lens: torch.Tensor,
314
+ packed_query_position_embeddings: torch.Tensor,
315
+ packed_query_indexes: torch.Tensor,
316
+ past_key_values: Optional[NaiveCache] = None,
317
+ key_values_lens: Optional[torch.Tensor] = None,
318
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
319
+ update_past_key_values=True,
320
+ is_causal=True,
321
+ ):
322
+ packed_query_states = self.q_proj(packed_query_sequence).view(-1, self.num_heads, self.head_dim)
323
+ packed_key_states = self.k_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
324
+ packed_value_states = self.v_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
325
+
326
+ packed_query_states = self.q_norm(packed_query_states)
327
+ packed_key_states = self.k_norm(packed_key_states)
328
+
329
+ packed_cos, packed_sin = packed_query_position_embeddings
330
+ packed_query_states, packed_key_states = apply_rotary_pos_emb(
331
+ packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1
332
+ )
333
+
334
+ packed_query_states = packed_query_states.to(torch.bfloat16)
335
+ packed_key_states = packed_key_states.to(torch.bfloat16)
336
+ packed_value_states = packed_value_states.to(torch.bfloat16)
337
+
338
+ if past_key_values is not None and past_key_values.key_cache[self.layer_idx] is not None:
339
+ past_key_states = past_key_values.key_cache[self.layer_idx]
340
+ past_value_states = past_key_values.value_cache[self.layer_idx]
341
+
342
+ seqlens = sum(query_lens) + sum(key_values_lens)
343
+ merged_key_states = past_key_states.new_zeros((seqlens, self.num_key_value_heads, self.head_dim))
344
+ merged_value_states = past_key_states.new_zeros((seqlens, self.num_key_value_heads, self.head_dim))
345
+ merged_key_states[packed_query_indexes] = packed_key_states
346
+ merged_key_states[packed_key_value_indexes] = past_key_states
347
+ merged_value_states[packed_query_indexes] = packed_value_states
348
+ merged_value_states[packed_key_value_indexes] = past_value_states
349
+ key_values_lens = key_values_lens + query_lens
350
+ else:
351
+ merged_key_states = packed_key_states
352
+ merged_value_states = packed_value_states
353
+ key_values_lens = query_lens
354
+
355
+ cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(query_lens, dim=0), (1, 0))
356
+ cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(key_values_lens, dim=0), (1, 0))
357
+
358
+ packed_attn_output = flash_attn_varlen_func(
359
+ q=packed_query_states,
360
+ k=merged_key_states,
361
+ v=merged_value_states,
362
+ cu_seqlens_q=cu_seqlens_q.to(torch.int32),
363
+ cu_seqlens_k=cu_seqlens_k.to(torch.int32),
364
+ max_seqlen_q=max(query_lens).item(),
365
+ max_seqlen_k=max(key_values_lens).item(),
366
+ causal=is_causal,
367
+ )
368
+ packed_attn_output = packed_attn_output.reshape(-1, self.hidden_size)
369
+ packed_attn_output = self.o_proj(packed_attn_output)
370
+
371
+ if update_past_key_values:
372
+ past_key_values.key_cache[self.layer_idx] = merged_key_states
373
+ past_key_values.value_cache[self.layer_idx] = merged_value_states
374
+
375
+ return packed_attn_output, past_key_values
376
+
377
+
378
+ class PackedAttentionMoT(Qwen2Attention):
379
+ def __init__(self, config, layer_idx: Optional[int] = None):
380
+ super().__init__(config, layer_idx)
381
+ if self.config.qk_norm:
382
+ self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
383
+ self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
384
+ self.q_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
385
+ self.k_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
386
+ else:
387
+ self.q_norm = nn.Identity()
388
+ self.k_norm = nn.Identity()
389
+ self.q_norm_moe_gen = nn.Identity()
390
+ self.k_norm_moe_gen = nn.Identity()
391
+
392
+ self.q_proj_moe_gen = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
393
+ self.k_proj_moe_gen = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
394
+ self.v_proj_moe_gen = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
395
+ self.o_proj_moe_gen = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
396
+
397
+ def forward(self, *args, **kwargs):
398
+ if self.training:
399
+ return self.forward_train(*args, **kwargs)
400
+ else:
401
+ return self.forward_inference(*args, **kwargs)
402
+
403
+ def forward_train(
404
+ self,
405
+ packed_sequence: torch.Tensor,
406
+ sample_lens: List[int],
407
+ attention_mask,
408
+ packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
409
+ packed_und_token_indexes: torch.LongTensor,
410
+ packed_gen_token_indexes: torch.LongTensor,
411
+ ):
412
+ packed_query_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_heads * self.head_dim))
413
+ packed_key_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_key_value_heads * self.head_dim))
414
+ packed_value_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_key_value_heads * self.head_dim))
415
+
416
+ packed_sequence_und = packed_sequence[packed_und_token_indexes]
417
+ packed_sequence_gen = packed_sequence[packed_gen_token_indexes]
418
+
419
+ packed_query_states[packed_und_token_indexes] = self.q_proj(packed_sequence_und)
420
+ packed_query_states[packed_gen_token_indexes] = self.q_proj_moe_gen(packed_sequence_gen)
421
+
422
+ packed_key_states[packed_und_token_indexes] = self.k_proj(packed_sequence_und)
423
+ packed_key_states[packed_gen_token_indexes] = self.k_proj_moe_gen(packed_sequence_gen)
424
+
425
+ packed_value_states[packed_und_token_indexes] = self.v_proj(packed_sequence_und)
426
+ packed_value_states[packed_gen_token_indexes] = self.v_proj_moe_gen(packed_sequence_gen)
427
+
428
+ packed_query_states = packed_query_states.view(-1, self.num_heads, self.head_dim)
429
+ packed_key_states = packed_key_states.view(-1, self.num_key_value_heads, self.head_dim)
430
+ packed_value_states = packed_value_states.view(-1, self.num_key_value_heads, self.head_dim)
431
+ if self.config.freeze_und:
432
+ packed_value_states[packed_und_token_indexes] = packed_value_states[packed_und_token_indexes].detach()
433
+
434
+ packed_query_states_ = packed_query_states.new_zeros(packed_query_states.shape)
435
+ packed_key_states_ = packed_key_states.new_zeros(packed_key_states.shape)
436
+
437
+ packed_query_states_[packed_und_token_indexes] = self.q_norm(packed_query_states[packed_und_token_indexes])
438
+ if self.config.freeze_und:
439
+ packed_query_states_[packed_und_token_indexes] = packed_query_states_[packed_und_token_indexes].detach()
440
+ packed_query_states_[packed_gen_token_indexes] = self.q_norm_moe_gen(packed_query_states[packed_gen_token_indexes])
441
+
442
+ packed_key_states_[packed_und_token_indexes] = self.k_norm(packed_key_states[packed_und_token_indexes])
443
+ if self.config.freeze_und:
444
+ packed_key_states_[packed_und_token_indexes] = packed_key_states_[packed_und_token_indexes].detach()
445
+ packed_key_states_[packed_gen_token_indexes] = self.k_norm_moe_gen(packed_key_states[packed_gen_token_indexes])
446
+
447
+ packed_cos, packed_sin = packed_position_embeddings
448
+ packed_query_states_, packed_key_states_ = apply_rotary_pos_emb(
449
+ packed_query_states_, packed_key_states_, packed_cos, packed_sin, unsqueeze_dim=1
450
+ )
451
+
452
+ if isinstance(attention_mask, List):
453
+ packed_key_states_ = packed_key_states_[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
454
+ packed_key_states_ = packed_key_states_.reshape(-1, self.num_heads, self.head_dim)
455
+ packed_value_states = packed_value_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
456
+ packed_value_states = packed_value_states.reshape(-1, self.num_heads, self.head_dim)
457
+
458
+ unpacked_query_states = packed_query_states_.transpose(0, 1).split(sample_lens, dim=1)
459
+ unpacked_key_states = packed_key_states_.transpose(0, 1).split(sample_lens, dim=1)
460
+ unpacked_value_states = packed_value_states.transpose(0, 1).split(sample_lens, dim=1)
461
+ upacked_attn_output = []
462
+ for query_states, key_states, value_states, attention_mask_per_sample in zip(
463
+ unpacked_query_states, unpacked_key_states, unpacked_value_states, attention_mask
464
+ ):
465
+ with sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
466
+ attn_output = scaled_dot_product_attention(
467
+ query_states.to(torch.bfloat16).unsqueeze(0),
468
+ key_states.to(torch.bfloat16).unsqueeze(0),
469
+ value_states.to(torch.bfloat16).unsqueeze(0),
470
+ attention_mask_per_sample.to(torch.bfloat16).unsqueeze(0),
471
+ )
472
+ upacked_attn_output.append(attn_output.squeeze(0))
473
+ packed_attn_output = torch.cat(upacked_attn_output, dim=1)
474
+ else:
475
+ pad_size = sum(sample_lens) - packed_query_states.shape[0]
476
+ packed_query_states_ = pad_sequence(packed_query_states_.permute(1, 0, 2), pad_size)
477
+ packed_key_states_ = pad_sequence(packed_key_states_.permute(1, 0, 2), pad_size)
478
+ packed_value_states = pad_sequence(packed_value_states.permute(1, 0, 2), pad_size)
479
+ packed_attn_output = flex_attention(
480
+ packed_query_states_.unsqueeze(0), # 1, num_head, L, head_dim
481
+ packed_key_states_.unsqueeze(0),
482
+ packed_value_states.unsqueeze(0),
483
+ enable_gqa=True,
484
+ block_mask=attention_mask,
485
+ )
486
+ end_index = packed_attn_output.shape[2] - pad_size
487
+ packed_attn_output = packed_attn_output[0, :, :end_index, :]
488
+
489
+ packed_attn_output = packed_attn_output.transpose(0, 1).reshape(-1, self.num_heads * self.head_dim)
490
+ packed_attn_output_ = packed_attn_output.new_zeros(packed_attn_output.shape)
491
+ packed_attn_output_[packed_und_token_indexes] = self.o_proj(packed_attn_output[packed_und_token_indexes])
492
+ packed_attn_output_[packed_gen_token_indexes] = self.o_proj_moe_gen(packed_attn_output[packed_gen_token_indexes])
493
+
494
+ return packed_attn_output_
495
+
496
+ def forward_inference(
497
+ self,
498
+ packed_query_sequence: torch.Tensor,
499
+ query_lens: torch.Tensor,
500
+ packed_query_position_embeddings: torch.Tensor,
501
+ packed_query_indexes: torch.Tensor,
502
+ past_key_values: Optional[NaiveCache] = None,
503
+ key_values_lens: Optional[torch.Tensor] = None,
504
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
505
+ update_past_key_values=True,
506
+ is_causal=True,
507
+ mode="und",
508
+ packed_vae_token_indexes=None,
509
+ packed_text_indexes=None,
510
+ ):
511
+ if mode == 'und':
512
+ packed_query_states = self.q_proj(packed_query_sequence).view(-1, self.num_heads, self.head_dim)
513
+ packed_key_states = self.k_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
514
+ packed_value_states = self.v_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
515
+ packed_query_states = self.q_norm(packed_query_states)
516
+ packed_key_states = self.k_norm(packed_key_states)
517
+ elif mode == 'gen':
518
+ packed_query_sequence = packed_query_sequence.to(torch.bfloat16)
519
+ packed_query_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_heads * self.head_dim))
520
+ packed_key_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_key_value_heads * self.head_dim))
521
+ packed_value_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_key_value_heads * self.head_dim))
522
+
523
+ packed_text_query_sequence = packed_query_sequence[packed_text_indexes]
524
+ packed_vae_query_sequence = packed_query_sequence[packed_vae_token_indexes]
525
+
526
+ packed_query_states[packed_text_indexes] = self.q_proj(packed_text_query_sequence)
527
+ packed_query_states[packed_vae_token_indexes] = self.q_proj_moe_gen(packed_vae_query_sequence)
528
+
529
+ packed_key_states[packed_text_indexes] = self.k_proj(packed_text_query_sequence)
530
+ packed_key_states[packed_vae_token_indexes] = self.k_proj_moe_gen(packed_vae_query_sequence)
531
+
532
+ packed_value_states[packed_text_indexes] = self.v_proj(packed_text_query_sequence)
533
+ packed_value_states[packed_vae_token_indexes] = self.v_proj_moe_gen(packed_vae_query_sequence)
534
+
535
+ packed_query_states = packed_query_states.view(-1, self.num_heads, self.head_dim)
536
+ packed_key_states = packed_key_states.view(-1, self.num_key_value_heads, self.head_dim)
537
+ packed_value_states = packed_value_states.view(-1, self.num_key_value_heads, self.head_dim)
538
+
539
+ packed_query_states = packed_query_states.to(torch.float32)
540
+ packed_query_states[packed_text_indexes] = self.q_norm(packed_query_states[packed_text_indexes])
541
+ packed_query_states[packed_vae_token_indexes] = self.q_norm_moe_gen(packed_query_states[packed_vae_token_indexes])
542
+
543
+ packed_key_states = packed_key_states.to(torch.float32)
544
+ packed_key_states[packed_text_indexes] = self.k_norm(packed_key_states[packed_text_indexes])
545
+ packed_key_states[packed_vae_token_indexes] = self.k_norm_moe_gen(packed_key_states[packed_vae_token_indexes])
546
+
547
+ packed_cos, packed_sin = packed_query_position_embeddings
548
+ packed_query_states, packed_key_states = apply_rotary_pos_emb(
549
+ packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1
550
+ )
551
+
552
+ packed_query_states = packed_query_states.to(torch.bfloat16)
553
+ packed_key_states = packed_key_states.to(torch.bfloat16)
554
+ packed_value_states = packed_value_states.to(torch.bfloat16)
555
+
556
+ if past_key_values is not None and past_key_values.key_cache[self.layer_idx] is not None:
557
+ past_key_states = past_key_values.key_cache[self.layer_idx]
558
+ past_value_states = past_key_values.value_cache[self.layer_idx]
559
+
560
+ seqlens = sum(query_lens) + sum(key_values_lens)
561
+ merged_key_states = past_key_states.new_zeros(size=[seqlens, self.num_key_value_heads, self.head_dim])
562
+ merged_value_states = past_key_states.new_zeros(size=[seqlens, self.num_key_value_heads, self.head_dim])
563
+ merged_key_states[packed_query_indexes] = packed_key_states
564
+ merged_key_states[packed_key_value_indexes] = past_key_states
565
+ merged_value_states[packed_query_indexes] = packed_value_states
566
+ merged_value_states[packed_key_value_indexes] = past_value_states
567
+ key_values_lens = key_values_lens + query_lens
568
+ else:
569
+ merged_key_states = packed_key_states
570
+ merged_value_states = packed_value_states
571
+ key_values_lens = query_lens
572
+
573
+ cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(query_lens, dim=0), (1, 0))
574
+ cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(key_values_lens, dim=0), (1, 0))
575
+
576
+ packed_attn_output = flash_attn_varlen_func(
577
+ q=packed_query_states,
578
+ k=merged_key_states,
579
+ v=merged_value_states,
580
+ cu_seqlens_q=cu_seqlens_q.to(torch.int32),
581
+ cu_seqlens_k=cu_seqlens_k.to(torch.int32),
582
+ max_seqlen_q=max(query_lens).item(),
583
+ max_seqlen_k=max(key_values_lens).item(),
584
+ causal=is_causal,
585
+ )
586
+ packed_attn_output = packed_attn_output.reshape(-1, self.hidden_size)
587
+ if mode == 'und':
588
+ packed_attn_output = self.o_proj(packed_attn_output)
589
+ elif mode == 'gen':
590
+ packed_attn_output[packed_text_indexes] = self.o_proj(packed_attn_output[packed_text_indexes])
591
+ packed_attn_output[packed_vae_token_indexes] = self.o_proj_moe_gen(packed_attn_output[packed_vae_token_indexes])
592
+
593
+ if update_past_key_values:
594
+ past_key_values.key_cache[self.layer_idx] = merged_key_states
595
+ past_key_values.value_cache[self.layer_idx] = merged_value_states
596
+
597
+ return packed_attn_output, past_key_values
598
+
599
+
600
+ class Qwen2DecoderLayer(nn.Module):
601
+ def __init__(self, config, layer_idx: Optional[int] = None):
602
+ super().__init__()
603
+ self.hidden_size = config.hidden_size
604
+
605
+ self.self_attn = PackedAttention(config, layer_idx)
606
+
607
+ self.mlp = Qwen2MLP(config)
608
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
609
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
610
+
611
+ def forward(self, *args, **kwargs):
612
+ if self.training:
613
+ return self.forward_train(*args, **kwargs)
614
+ else:
615
+ return self.forward_inference(*args, **kwargs)
616
+
617
+ def forward_train(
618
+ self,
619
+ packed_sequence: torch.Tensor,
620
+ sample_lens: List[int],
621
+ attention_mask,
622
+ packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
623
+ ) -> torch.Tensor:
624
+
625
+ residual = packed_sequence
626
+ packed_sequence = self.input_layernorm(packed_sequence)
627
+
628
+ # Self Attention
629
+ packed_sequence = self.self_attn(
630
+ packed_sequence=packed_sequence,
631
+ sample_lens=sample_lens,
632
+ attention_mask=attention_mask,
633
+ packed_position_embeddings=packed_position_embeddings,
634
+ )
635
+ packed_sequence = residual + packed_sequence
636
+
637
+ # Fully Connected
638
+ residual = packed_sequence
639
+ packed_sequence = self.post_attention_layernorm(packed_sequence)
640
+ packed_sequence = self.mlp(packed_sequence)
641
+ packed_sequence = residual + packed_sequence
642
+
643
+ return packed_sequence
644
+
645
+ def forward_inference(
646
+ self,
647
+ packed_query_sequence: torch.Tensor,
648
+ query_lens: torch.Tensor,
649
+ packed_query_position_embeddings: torch.Tensor,
650
+ packed_query_indexes: torch.Tensor,
651
+ past_key_values: Optional[NaiveCache] = None,
652
+ key_values_lens: Optional[torch.Tensor] = None,
653
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
654
+ update_past_key_values=True,
655
+ is_causal=True,
656
+ ) -> BaseNavitOutputWithPast:
657
+
658
+ residual = packed_query_sequence
659
+ packed_query_sequence = self.input_layernorm(packed_query_sequence)
660
+
661
+ # Self Attention
662
+ packed_query_sequence, past_key_values = self.self_attn(
663
+ packed_query_sequence=packed_query_sequence,
664
+ query_lens=query_lens,
665
+ packed_query_position_embeddings=packed_query_position_embeddings,
666
+ packed_query_indexes=packed_query_indexes,
667
+ past_key_values=past_key_values,
668
+ key_values_lens=key_values_lens,
669
+ packed_key_value_indexes=packed_key_value_indexes,
670
+ update_past_key_values=update_past_key_values,
671
+ is_causal=is_causal,
672
+ )
673
+ packed_query_sequence = residual + packed_query_sequence
674
+
675
+ # Fully Connected
676
+ residual = packed_query_sequence
677
+ packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
678
+ packed_query_sequence = self.mlp(packed_query_sequence)
679
+ packed_query_sequence = residual + packed_query_sequence
680
+
681
+ return packed_query_sequence, past_key_values
682
+
683
+
684
+ class Qwen2MoTDecoderLayer(nn.Module):
685
+ def __init__(
686
+ self,
687
+ config,
688
+ layer_idx: Optional[int] = None,
689
+ attn_module: Optional[Qwen2Attention] = PackedAttentionMoT,
690
+ ):
691
+ super().__init__()
692
+ self.hidden_size = config.hidden_size
693
+ self.freeze_und = config.freeze_und
694
+
695
+ self.self_attn = attn_module(config, layer_idx)
696
+
697
+ self.mlp = Qwen2MLP(config)
698
+ self.mlp_moe_gen = Qwen2MLP(config)
699
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
700
+ self.input_layernorm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
701
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
702
+ self.post_attention_layernorm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
703
+
704
+ def forward(self, *args, **kwargs):
705
+ if self.training:
706
+ return self.forward_train(*args, **kwargs)
707
+ else:
708
+ return self.forward_inference(*args, **kwargs)
709
+
710
+ def forward_train(
711
+ self,
712
+ packed_sequence: torch.Tensor,
713
+ sample_lens: List[int],
714
+ attention_mask,
715
+ packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
716
+ packed_und_token_indexes: torch.LongTensor,
717
+ packed_gen_token_indexes: torch.LongTensor,
718
+ ) -> torch.Tensor:
719
+
720
+ residual = packed_sequence
721
+ packed_sequence_ = packed_sequence.new_zeros(packed_sequence.shape)
722
+ packed_sequence_[packed_und_token_indexes] = self.input_layernorm(packed_sequence[packed_und_token_indexes])
723
+ packed_sequence_[packed_gen_token_indexes] = self.input_layernorm_moe_gen(packed_sequence[packed_gen_token_indexes])
724
+
725
+ # Self Attention
726
+ packed_sequence_ = self.self_attn(
727
+ packed_sequence=packed_sequence_,
728
+ sample_lens=sample_lens,
729
+ attention_mask=attention_mask,
730
+ packed_position_embeddings=packed_position_embeddings,
731
+ packed_und_token_indexes=packed_und_token_indexes,
732
+ packed_gen_token_indexes=packed_gen_token_indexes,
733
+ )
734
+ if self.freeze_und:
735
+ packed_sequence_[packed_und_token_indexes] = packed_sequence_[packed_und_token_indexes].detach()
736
+ packed_sequence = residual + packed_sequence_
737
+
738
+ # Fully Connected
739
+ residual = packed_sequence
740
+ packed_sequence_ = packed_sequence.new_zeros(packed_sequence.shape)
741
+ packed_sequence_[packed_und_token_indexes] = self.mlp(
742
+ self.post_attention_layernorm(packed_sequence[packed_und_token_indexes])
743
+ )
744
+ if self.freeze_und:
745
+ packed_sequence_[packed_und_token_indexes] = packed_sequence_[packed_und_token_indexes].detach()
746
+
747
+ packed_sequence_[packed_gen_token_indexes] = self.mlp_moe_gen(
748
+ self.post_attention_layernorm_moe_gen(packed_sequence[packed_gen_token_indexes])
749
+ )
750
+ packed_sequence = residual + packed_sequence_
751
+
752
+ return packed_sequence
753
+
754
+ def forward_inference(
755
+ self,
756
+ packed_query_sequence: torch.Tensor,
757
+ query_lens: torch.Tensor,
758
+ packed_query_position_embeddings: torch.Tensor,
759
+ packed_query_indexes: torch.Tensor,
760
+ past_key_values: Optional[NaiveCache] = None,
761
+ key_values_lens: Optional[torch.Tensor] = None,
762
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
763
+ update_past_key_values=True,
764
+ is_causal=True,
765
+ mode="und",
766
+ packed_vae_token_indexes=None,
767
+ packed_text_indexes=None,
768
+ ) -> BaseNavitOutputWithPast:
769
+
770
+ residual = packed_query_sequence
771
+ if mode == "und":
772
+ packed_query_sequence = self.input_layernorm(packed_query_sequence)
773
+ elif mode == "gen":
774
+ packed_query_sequence_ = torch.zeros_like(packed_query_sequence)
775
+ packed_query_sequence_[packed_text_indexes] = self.input_layernorm(packed_query_sequence[packed_text_indexes])
776
+ packed_query_sequence_[packed_vae_token_indexes] = self.input_layernorm_moe_gen(packed_query_sequence[packed_vae_token_indexes])
777
+ packed_query_sequence = packed_query_sequence_
778
+
779
+ # Self Attention
780
+ packed_query_sequence, past_key_values = self.self_attn(
781
+ packed_query_sequence=packed_query_sequence,
782
+ query_lens=query_lens,
783
+ packed_query_position_embeddings=packed_query_position_embeddings,
784
+ packed_query_indexes=packed_query_indexes,
785
+ past_key_values=past_key_values,
786
+ key_values_lens=key_values_lens,
787
+ packed_key_value_indexes=packed_key_value_indexes,
788
+ update_past_key_values=update_past_key_values,
789
+ is_causal=is_causal,
790
+ mode=mode,
791
+ packed_vae_token_indexes=packed_vae_token_indexes,
792
+ packed_text_indexes=packed_text_indexes,
793
+ )
794
+ packed_query_sequence = residual + packed_query_sequence
795
+
796
+ # Fully Connected
797
+ residual = packed_query_sequence
798
+ if mode == "und":
799
+ packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
800
+ packed_query_sequence = self.mlp(packed_query_sequence)
801
+ elif mode == "gen":
802
+ packed_text_query_sequence = packed_query_sequence[packed_text_indexes]
803
+ packed_vae_query_sequence = packed_query_sequence[packed_vae_token_indexes]
804
+ packed_text_query_sequence = self.post_attention_layernorm(packed_text_query_sequence).to(torch.bfloat16)
805
+ packed_vae_query_sequence = self.post_attention_layernorm_moe_gen(packed_vae_query_sequence).to(torch.bfloat16)
806
+
807
+ packed_query_sequence_ = torch.zeros_like(packed_query_sequence).to(torch.bfloat16)
808
+ packed_query_sequence_[packed_text_indexes] = self.mlp(packed_text_query_sequence)
809
+ packed_query_sequence_[packed_vae_token_indexes] = self.mlp_moe_gen(packed_vae_query_sequence)
810
+ packed_query_sequence = packed_query_sequence_
811
+
812
+ packed_query_sequence = residual + packed_query_sequence
813
+ return packed_query_sequence, past_key_values
814
+
815
+
816
+ class Qwen2MoEDecoderLayer(nn.Module):
817
+ def __init__(self, config, layer_idx: Optional[int] = None):
818
+ super().__init__()
819
+ self.hidden_size = config.hidden_size
820
+
821
+ self.self_attn = PackedAttention(config, layer_idx)
822
+
823
+ self.mlp = Qwen2MLP(config)
824
+ self.mlp_moe_gen = Qwen2MLP(config)
825
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
826
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
827
+
828
+ def forward(self, *args, **kwargs):
829
+ if self.training:
830
+ return self.forward_train(*args, **kwargs)
831
+ else:
832
+ return self.forward_inference(*args, **kwargs)
833
+
834
+ def forward_train(
835
+ self,
836
+ packed_sequence: torch.Tensor,
837
+ sample_lens: List[int],
838
+ attention_mask,
839
+ packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
840
+ packed_und_token_indexes: torch.LongTensor,
841
+ packed_gen_token_indexes: torch.LongTensor,
842
+ ) -> torch.Tensor:
843
+
844
+ residual = packed_sequence
845
+ packed_sequence = self.input_layernorm(packed_sequence)
846
+
847
+ # Self Attention
848
+ packed_sequence = self.self_attn(
849
+ packed_sequence=packed_sequence,
850
+ sample_lens=sample_lens,
851
+ attention_mask=attention_mask,
852
+ packed_position_embeddings=packed_position_embeddings,
853
+ )
854
+ packed_sequence = residual + packed_sequence
855
+
856
+ # Fully Connected
857
+ residual = packed_sequence
858
+ packed_sequence = self.post_attention_layernorm(packed_sequence)
859
+
860
+ packed_sequence_new = packed_sequence.new_zeros(packed_sequence.shape)
861
+ packed_sequence_und = self.mlp(packed_sequence[packed_und_token_indexes])
862
+ packed_sequence_gen = self.mlp_moe_gen(packed_sequence[packed_gen_token_indexes])
863
+ packed_sequence_new[packed_und_token_indexes] = packed_sequence_und
864
+ packed_sequence_new[packed_gen_token_indexes] = packed_sequence_gen
865
+
866
+ packed_sequence = residual + packed_sequence_new
867
+
868
+ return packed_sequence
869
+
870
+ def forward_inference(
871
+ self,
872
+ packed_query_sequence: torch.Tensor,
873
+ query_lens: torch.Tensor,
874
+ packed_query_position_embeddings: torch.Tensor,
875
+ packed_query_indexes: torch.Tensor,
876
+ past_key_values: Optional[NaiveCache] = None,
877
+ key_values_lens: Optional[torch.Tensor] = None,
878
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
879
+ update_past_key_values=True,
880
+ is_causal=True,
881
+ mode="und",
882
+ packed_vae_token_indexes=None,
883
+ packed_text_indexes=None,
884
+ ) -> BaseNavitOutputWithPast:
885
+
886
+ residual = packed_query_sequence
887
+ packed_query_sequence = self.input_layernorm(packed_query_sequence)
888
+
889
+ # Self Attention
890
+ packed_query_sequence, past_key_values = self.self_attn(
891
+ packed_query_sequence=packed_query_sequence,
892
+ query_lens=query_lens,
893
+ packed_query_position_embeddings=packed_query_position_embeddings,
894
+ packed_query_indexes=packed_query_indexes,
895
+ past_key_values=past_key_values,
896
+ key_values_lens=key_values_lens,
897
+ packed_key_value_indexes=packed_key_value_indexes,
898
+ update_past_key_values=update_past_key_values,
899
+ is_causal=is_causal,
900
+ )
901
+ packed_query_sequence = residual + packed_query_sequence
902
+
903
+ # Fully Connected
904
+ residual = packed_query_sequence
905
+ packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
906
+ if mode == "und":
907
+ packed_query_sequence = self.mlp(packed_query_sequence)
908
+ elif mode == "gen":
909
+ packed_query_sequence_ = torch.zeros_like(packed_query_sequence).to(torch.bfloat16)
910
+ packed_query_sequence_[packed_text_indexes] = self.mlp(packed_query_sequence[packed_text_indexes])
911
+ packed_query_sequence_[packed_vae_token_indexes] = self.mlp_moe_gen(packed_query_sequence[packed_vae_token_indexes])
912
+ packed_query_sequence = packed_query_sequence_
913
+ packed_query_sequence = residual + packed_query_sequence
914
+
915
+ return packed_query_sequence, past_key_values
916
+
917
+
918
+ Decoder_layer_dict = {
919
+ "Qwen2DecoderLayer": Qwen2DecoderLayer,
920
+ "Qwen2MoEDecoderLayer": Qwen2MoEDecoderLayer,
921
+ "Qwen2MoTDecoderLayer": partial(Qwen2MoTDecoderLayer, attn_module=PackedAttentionMoT),
922
+ }
923
+
924
+
925
+ class Qwen2Model(Qwen2PreTrainedModel):
926
+ def __init__(self, config):
927
+ super().__init__(config)
928
+ self.padding_idx = config.pad_token_id
929
+ self.vocab_size = config.vocab_size
930
+ self.use_moe = 'Mo' in config.layer_module
931
+
932
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
933
+ layer_module = Decoder_layer_dict[config.layer_module]
934
+ self.layers = nn.ModuleList(
935
+ [layer_module(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
936
+ )
937
+
938
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
939
+ if self.use_moe:
940
+ self.norm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
941
+ self.rotary_emb = Qwen2RotaryEmbedding(config=config)
942
+
943
+ # Initialize weights and apply final processing
944
+ self.post_init()
945
+
946
+ def forward(self, *args, **kwargs):
947
+ if self.training:
948
+ return self.forward_train(*args, **kwargs)
949
+ else:
950
+ return self.forward_inference(*args, **kwargs)
951
+
952
+ def forward_train(
953
+ self,
954
+ packed_sequence: torch.Tensor,
955
+ sample_lens: List[int],
956
+ attention_mask,
957
+ packed_position_ids: torch.Tensor,
958
+ packed_und_token_indexes: Optional[torch.LongTensor] = None,
959
+ packed_gen_token_indexes: Optional[torch.LongTensor] = None,
960
+ ) -> torch.Tensor:
961
+
962
+ if self.config.freeze_und:
963
+ packed_sequence[packed_und_token_indexes] = packed_sequence[packed_und_token_indexes].detach()
964
+
965
+ # create position embeddings to be shared across the decoder layers
966
+ cos, sin = self.rotary_emb(packed_sequence, packed_position_ids.unsqueeze(0))
967
+ cos = cos.squeeze(0)
968
+ sin = sin.squeeze(0)
969
+ packed_position_embeddings = (cos, sin)
970
+
971
+ extra_inputs = {}
972
+ if self.use_moe:
973
+ assert packed_und_token_indexes is not None
974
+ if packed_gen_token_indexes is None:
975
+ packed_gen_token_indexes = packed_und_token_indexes.new_ones(size=[0])
976
+ extra_inputs.update(
977
+ packed_und_token_indexes=packed_und_token_indexes,
978
+ packed_gen_token_indexes=packed_gen_token_indexes,
979
+ )
980
+
981
+ for decoder_layer in self.layers:
982
+ packed_sequence = decoder_layer(
983
+ packed_sequence=packed_sequence,
984
+ sample_lens=sample_lens,
985
+ attention_mask=attention_mask,
986
+ packed_position_embeddings=packed_position_embeddings,
987
+ **extra_inputs
988
+ )
989
+
990
+ if self.use_moe:
991
+ packed_sequence_ = torch.zeros_like(packed_sequence)
992
+ packed_sequence_[packed_und_token_indexes] = self.norm(packed_sequence[packed_und_token_indexes])
993
+ if self.config.freeze_und:
994
+ packed_sequence_[packed_und_token_indexes] = packed_sequence_[packed_und_token_indexes].detach()
995
+ packed_sequence_[packed_gen_token_indexes] = self.norm_moe_gen(packed_sequence[packed_gen_token_indexes])
996
+ return packed_sequence_
997
+ else:
998
+ return self.norm(packed_sequence)
999
+
1000
+ def forward_inference(
1001
+ self,
1002
+ packed_query_sequence: torch.Tensor,
1003
+ query_lens: torch.Tensor,
1004
+ packed_query_position_ids: torch.Tensor,
1005
+ packed_query_indexes: torch.Tensor,
1006
+ past_key_values: Optional[NaiveCache] = None,
1007
+ key_values_lens: Optional[torch.Tensor] = None,
1008
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
1009
+ update_past_key_values=True,
1010
+ is_causal=True,
1011
+ mode="und",
1012
+ packed_vae_token_indexes=None,
1013
+ packed_text_indexes=None,
1014
+ ) -> BaseNavitOutputWithPast:
1015
+
1016
+ # create position embeddings to be shared across the decoder layers
1017
+ cos, sin = self.rotary_emb(packed_query_sequence, packed_query_position_ids.unsqueeze(0))
1018
+ cos = cos.squeeze(0)
1019
+ sin = sin.squeeze(0)
1020
+ packed_query_position_embeddings = (cos, sin)
1021
+
1022
+ extra_inputs = {}
1023
+ if self.use_moe:
1024
+ extra_inputs.update(mode=mode)
1025
+ if mode == 'gen':
1026
+ assert packed_vae_token_indexes is not None
1027
+ assert packed_text_indexes is not None
1028
+ extra_inputs.update(
1029
+ packed_vae_token_indexes=packed_vae_token_indexes,
1030
+ packed_text_indexes=packed_text_indexes,
1031
+ )
1032
+
1033
+ for decoder_layer in self.layers:
1034
+ packed_query_sequence, past_key_values = decoder_layer(
1035
+ packed_query_sequence=packed_query_sequence,
1036
+ query_lens=query_lens,
1037
+ packed_query_position_embeddings=packed_query_position_embeddings,
1038
+ packed_query_indexes=packed_query_indexes,
1039
+ past_key_values=past_key_values,
1040
+ key_values_lens=key_values_lens,
1041
+ packed_key_value_indexes=packed_key_value_indexes,
1042
+ update_past_key_values=update_past_key_values,
1043
+ is_causal=is_causal,
1044
+ **extra_inputs,
1045
+ )
1046
+
1047
+ if self.use_moe:
1048
+ if mode == "und":
1049
+ packed_query_sequence = self.norm(packed_query_sequence)
1050
+ elif mode == "gen":
1051
+ packed_query_sequence_ = torch.zeros_like(packed_query_sequence)
1052
+ packed_query_sequence_[packed_text_indexes] = self.norm(packed_query_sequence[packed_text_indexes])
1053
+ packed_query_sequence_[packed_vae_token_indexes] = self.norm_moe_gen(packed_query_sequence[packed_vae_token_indexes])
1054
+ packed_query_sequence = packed_query_sequence_
1055
+ else:
1056
+ packed_query_sequence = self.norm(packed_query_sequence)
1057
+
1058
+ return BaseNavitOutputWithPast(
1059
+ packed_query_sequence=packed_query_sequence,
1060
+ past_key_values=past_key_values,
1061
+ )
1062
+
1063
+
1064
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1065
+ _tied_weights_keys = ["lm_head.weight"]
1066
+
1067
+ def __init__(self, config):
1068
+ super().__init__(config)
1069
+ self.model = Qwen2Model(config)
1070
+ self.vocab_size = config.vocab_size
1071
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1072
+
1073
+ # Initialize weights and apply final processing
1074
+ self.post_init()
1075
+
1076
+ def init_moe(self):
1077
+ for name, param in self.named_parameters():
1078
+ if "moe_gen" in name:
1079
+ original_name = name.replace("_moe_gen", "")
1080
+ param.data.copy_(self.state_dict()[original_name].data)
1081
+
1082
+ def get_input_embeddings(self):
1083
+ return self.model.embed_tokens
1084
+
1085
+ def set_input_embeddings(self, value):
1086
+ self.model.embed_tokens = value
1087
+
1088
+ def get_output_embeddings(self):
1089
+ return self.lm_head
1090
+
1091
+ def set_output_embeddings(self, new_embeddings):
1092
+ self.lm_head = new_embeddings
1093
+
1094
+ def set_decoder(self, decoder):
1095
+ self.model = decoder
1096
+
1097
+ def get_decoder(self):
1098
+ return self.model
1099
+
1100
+ def forward(self, *args, **kwargs):
1101
+ if self.training:
1102
+ return self.forward_train(*args, **kwargs)
1103
+ else:
1104
+ return self.forward_inference(*args, **kwargs)
1105
+
1106
+ def forward_train(
1107
+ self,
1108
+ packed_sequence: torch.Tensor,
1109
+ sample_lens: List[int],
1110
+ attention_mask,
1111
+ packed_position_ids: torch.Tensor,
1112
+ packed_und_token_indexes: Optional[torch.LongTensor] = None,
1113
+ packed_gen_token_indexes: Optional[torch.LongTensor] = None,
1114
+ ) -> torch.Tensor:
1115
+
1116
+ outputs = self.model(
1117
+ packed_sequence=packed_sequence,
1118
+ sample_lens=sample_lens,
1119
+ packed_position_ids=packed_position_ids,
1120
+ attention_mask=attention_mask,
1121
+ packed_und_token_indexes=packed_und_token_indexes,
1122
+ packed_gen_token_indexes=packed_gen_token_indexes,
1123
+ )
1124
+ return outputs
1125
+
1126
+ def forward_inference(
1127
+ self,
1128
+ packed_query_sequence: torch.Tensor,
1129
+ query_lens: torch.Tensor,
1130
+ packed_query_position_ids: torch.Tensor,
1131
+ packed_query_indexes: torch.Tensor,
1132
+ past_key_values: Optional[NaiveCache] = None,
1133
+ key_values_lens: Optional[torch.Tensor] = None,
1134
+ packed_key_value_indexes: Optional[torch.Tensor] = None,
1135
+ update_past_key_values=True,
1136
+ is_causal=True,
1137
+ mode="und",
1138
+ packed_vae_token_indexes=None,
1139
+ packed_text_indexes=None,
1140
+ ) -> BaseNavitOutputWithPast:
1141
+
1142
+ outputs = self.model(
1143
+ packed_query_sequence=packed_query_sequence,
1144
+ query_lens=query_lens,
1145
+ packed_query_position_ids=packed_query_position_ids,
1146
+ packed_query_indexes=packed_query_indexes,
1147
+ past_key_values=past_key_values,
1148
+ key_values_lens=key_values_lens,
1149
+ packed_key_value_indexes=packed_key_value_indexes,
1150
+ update_past_key_values=update_past_key_values,
1151
+ is_causal=is_causal,
1152
+ mode=mode,
1153
+ packed_vae_token_indexes=packed_vae_token_indexes,
1154
+ packed_text_indexes=packed_text_indexes,
1155
+ )
1156
+
1157
+ return outputs
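Note: the MoT/MoE decoder layers above route tokens to experts purely by position in the packed sequence: understanding (text/ViT) tokens go through `input_layernorm`/`mlp`, generation (VAE) tokens through the `_moe_gen` copies, and both results are scattered back into their original slots before the residual add. Below is a minimal, self-contained sketch of that scatter-by-index pattern; all names and sizes are illustrative, not taken from the checkpoint.

    import torch
    import torch.nn as nn

    hidden = 8
    packed_sequence = torch.randn(6, hidden)             # six packed tokens from several samples
    packed_und_token_indexes = torch.tensor([0, 1, 2])   # e.g. text / ViT tokens
    packed_gen_token_indexes = torch.tensor([3, 4, 5])   # e.g. VAE latent tokens

    expert_und = nn.Linear(hidden, hidden)               # stands in for self.mlp
    expert_gen = nn.Linear(hidden, hidden)               # stands in for self.mlp_moe_gen

    routed = packed_sequence.new_zeros(packed_sequence.shape)
    routed[packed_und_token_indexes] = expert_und(packed_sequence[packed_und_token_indexes])
    routed[packed_gen_token_indexes] = expert_gen(packed_sequence[packed_gen_token_indexes])
    out = packed_sequence + routed                       # residual add, as in forward_train

The same indexing trick is what lets `freeze_und` detach only the understanding branch while gradients keep flowing through the generation experts.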
modeling/bagel/siglip_navit.py ADDED
@@ -0,0 +1,402 @@
1
+ # Copyright (c) 2024 The HuggingFace Inc. team.
2
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20.
6
+ #
7
+ # Original file was released under Apache-2.0, with the full license text
8
+ # available at https://github.com/huggingface/transformers/blob/main/LICENSE.
9
+ #
10
+ # This modified file is released under the same license.
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from transformers.activations import ACT2FN
16
+ from modeling.siglip.configuration_siglip import SiglipVisionConfig as _SiglipVisionConfig
17
+ from modeling.siglip.modeling_siglip import SiglipAttention, SiglipPreTrainedModel
18
+ from flash_attn import flash_attn_varlen_func
19
+
20
+
21
+ class SiglipVisionConfig(_SiglipVisionConfig):
22
+ r"""
23
+ This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
24
+ Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
25
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
26
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
27
+
28
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
29
+ documentation from [`PretrainedConfig`] for more information.
30
+
31
+ Args:
32
+ hidden_size (`int`, *optional*, defaults to 768):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ intermediate_size (`int`, *optional*, defaults to 3072):
35
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
36
+ num_hidden_layers (`int`, *optional*, defaults to 12):
37
+ Number of hidden layers in the Transformer encoder.
38
+ num_attention_heads (`int`, *optional*, defaults to 12):
39
+ Number of attention heads for each attention layer in the Transformer encoder.
40
+ num_channels (`int`, *optional*, defaults to 3):
41
+ Number of channels in the input images.
42
+ image_size (`int`, *optional*, defaults to 224):
43
+ The size (resolution) of each image.
44
+ patch_size (`int`, *optional*, defaults to 16):
45
+ The size (resolution) of each patch.
46
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
47
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
48
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
49
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
50
+ The epsilon used by the layer normalization layers.
51
+ attention_dropout (`float`, *optional*, defaults to 0.0):
52
+ The dropout ratio for the attention probabilities.
53
+
54
+ Example:
55
+
56
+ ```python
57
+ >>> from transformers import SiglipVisionConfig, SiglipVisionModel
58
+
59
+ >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
60
+ >>> configuration = SiglipVisionConfig()
61
+
62
+ >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
63
+ >>> model = SiglipVisionModel(configuration)
64
+
65
+ >>> # Accessing the model configuration
66
+ >>> configuration = model.config
67
+ ```"""
68
+
69
+ model_type = "siglip_vision_model"
70
+
71
+ def __init__(
72
+ self,
73
+ hidden_size=768,
74
+ intermediate_size=3072,
75
+ num_hidden_layers=12,
76
+ num_attention_heads=12,
77
+ num_channels=3,
78
+ image_size=224,
79
+ patch_size=16,
80
+ hidden_act="gelu_pytorch_tanh",
81
+ layer_norm_eps=1e-6,
82
+ attention_dropout=0.0,
83
+ rope=True,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(
87
+ hidden_size=hidden_size,
88
+ intermediate_size=intermediate_size,
89
+ num_hidden_layers=num_hidden_layers,
90
+ num_attention_heads=num_attention_heads,
91
+ num_channels=num_channels,
92
+ image_size=image_size,
93
+ patch_size=patch_size,
94
+ hidden_act=hidden_act,
95
+ layer_norm_eps=layer_norm_eps,
96
+ attention_dropout=attention_dropout,
97
+ **kwargs)
98
+
99
+ self.rope = rope
100
+
101
+
102
+ class RotaryEmbedding2D(torch.nn.Module):
103
+ def __init__(self, dim, max_h, max_w, base=10000):
104
+ super().__init__()
105
+ freq = torch.arange(0, dim, 2, dtype=torch.int64).float() / dim
106
+ inv_freq = 1.0 / (base ** freq)
107
+
108
+ grid_h = torch.arange(0, max_h)
109
+ grid_h = grid_h.to(inv_freq.dtype)
110
+ grid_h = grid_h[:, None].repeat(1, max_w)
111
+
112
+ grid_w = torch.arange(0, max_w)
113
+ grid_w = grid_w.to(inv_freq.dtype)
114
+ grid_w = grid_w[None, :].repeat(max_h, 1)
115
+
116
+ cos_h, sin_h = self._forward_one_side(grid_h, inv_freq)
117
+ cos_w, sin_w = self._forward_one_side(grid_w, inv_freq)
118
+
119
+ self.register_buffer("cos_h", cos_h)
120
+ self.register_buffer("sin_h", sin_h)
121
+ self.register_buffer("cos_w", cos_w)
122
+ self.register_buffer("sin_w", sin_w)
123
+
124
+ def _forward_one_side(self, grid, inv_freq):
125
+ freqs = grid[..., None] * inv_freq[None, None, :]
126
+ emb = torch.cat((freqs, freqs), dim=-1).flatten(0, 1)
127
+ return emb.cos(), emb.sin()
128
+
129
+
130
+ def rotate_half(x):
131
+ x1 = x[..., : x.shape[-1] // 2]
132
+ x2 = x[..., x.shape[-1] // 2 :]
133
+ return torch.cat((-x2, x1), dim=-1)
134
+
135
+
136
+ def apply_rotary_pos_emb(q, k, cos, sin):
137
+ # unsqueeze due to the head dimension
138
+ cos = cos.unsqueeze(1)
139
+ sin = sin.unsqueeze(1)
140
+ q_embed = (q * cos) + (rotate_half(q) * sin)
141
+ k_embed = (k * cos) + (rotate_half(k) * sin)
142
+ return q_embed, k_embed
143
+
144
+
145
+ class SiglipVisionEmbeddings(nn.Module):
146
+ def __init__(self, config: SiglipVisionConfig):
147
+ super().__init__()
148
+ self.config = config
149
+ self.embed_dim = config.hidden_size
150
+ self.image_size = config.image_size
151
+ self.patch_size = config.patch_size
152
+
153
+ self.patch_embedding = nn.Conv2d(
154
+ in_channels=config.num_channels,
155
+ out_channels=self.embed_dim,
156
+ kernel_size=self.patch_size,
157
+ stride=self.patch_size,
158
+ padding="valid",
159
+ )
160
+
161
+ self.num_patches_per_side = self.image_size // self.patch_size
162
+ self.num_patches = self.num_patches_per_side**2
163
+ self.num_positions = self.num_patches
164
+ if not config.rope:
165
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
166
+
167
+ def convert_conv2d_to_linear(self, config, meta=False):
168
+ if meta:
169
+ linear_patch_embedding = nn.Linear(
170
+ config.num_channels * self.patch_size ** 2, self.embed_dim, bias=True, device='meta'
171
+ )
172
+ else:
173
+ linear_patch_embedding = nn.Linear(
174
+ config.num_channels * self.patch_size ** 2, self.embed_dim, bias=True
175
+ )
176
+ W = self.patch_embedding.weight.permute(0, 2, 3, 1).reshape(
177
+ self.embed_dim, config.num_channels * self.patch_size ** 2
178
+ )
179
+ linear_patch_embedding.weight.data = W
180
+ linear_patch_embedding.bias.data = self.patch_embedding.bias.data
181
+ del self.patch_embedding
182
+ self.patch_embedding = linear_patch_embedding
183
+
184
+ def forward(
185
+ self,
186
+ packed_pixel_values: torch.FloatTensor,
187
+ packed_flattened_position_ids: torch.LongTensor
188
+ ) -> torch.Tensor:
189
+
190
+ patch_embeds = self.patch_embedding(packed_pixel_values)
191
+ if not self.config.rope:
192
+ embeddings = patch_embeds + self.position_embedding(packed_flattened_position_ids)
193
+ else:
194
+ embeddings = patch_embeds
195
+ return embeddings
196
+
197
+
198
+ class SiglipFlashAttention2(SiglipAttention):
199
+ def __init__(self, *args, **kwargs):
200
+ super().__init__(*args, **kwargs)
201
+
202
+ def forward(
203
+ self,
204
+ hidden_states: torch.Tensor,
205
+ cu_seqlens: torch.IntTensor,
206
+ max_seqlen: int,
207
+ cos_h: torch.Tensor = None,
208
+ sin_h: torch.Tensor = None,
209
+ cos_w: torch.Tensor = None,
210
+ sin_w: torch.Tensor = None,
211
+ **kwargs,
212
+ ) -> torch.Tensor:
213
+
214
+ total_q_len, _ = hidden_states.size()
215
+
216
+ query_states = self.q_proj(hidden_states)
217
+ key_states = self.k_proj(hidden_states)
218
+ value_states = self.v_proj(hidden_states)
219
+
220
+ query_states = query_states.view(total_q_len, self.num_heads, self.head_dim)
221
+ key_states = key_states.view(total_q_len, self.num_heads, self.head_dim)
222
+ value_states = value_states.view(total_q_len, self.num_heads, self.head_dim)
223
+
224
+ if self.config.rope:
225
+ qh, qw = query_states[:, :, :self.head_dim // 2], query_states[:, :, self.head_dim // 2:]
226
+ kh, kw = key_states[:, :, :self.head_dim // 2], key_states[:, :, self.head_dim // 2:]
227
+ qh, kh = apply_rotary_pos_emb(qh, kh, cos_h, sin_h)
228
+ qw, kw = apply_rotary_pos_emb(qw, kw, cos_w, sin_w)
229
+ query_states = torch.cat([qh, qw], dim=-1)
230
+ key_states = torch.cat([kh, kw], dim=-1)
231
+
232
+ attn_output = flash_attn_varlen_func(
233
+ query_states.to(torch.bfloat16),
234
+ key_states.to(torch.bfloat16),
235
+ value_states.to(torch.bfloat16),
236
+ cu_seqlens_q=cu_seqlens,
237
+ cu_seqlens_k=cu_seqlens,
238
+ max_seqlen_q=max_seqlen,
239
+ max_seqlen_k=max_seqlen,
240
+ causal=False,
241
+ )
242
+
243
+ attn_output = self.out_proj(attn_output.reshape(total_q_len, -1))
244
+ return attn_output
245
+
246
+
247
+ class SiglipMLP(nn.Module):
248
+ def __init__(self, config):
249
+ super().__init__()
250
+ self.config = config
251
+ self.activation_fn = ACT2FN[config.hidden_act]
252
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
253
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
254
+
255
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
256
+ hidden_states = self.fc1(hidden_states)
257
+ hidden_states = self.activation_fn(hidden_states)
258
+ hidden_states = self.fc2(hidden_states)
259
+ return hidden_states
260
+
261
+
262
+ class SiglipEncoderLayer(nn.Module):
263
+ def __init__(self, config: SiglipVisionConfig):
264
+ super().__init__()
265
+ self.embed_dim = config.hidden_size
266
+ self.self_attn = SiglipFlashAttention2(config)
267
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
268
+ self.mlp = SiglipMLP(config)
269
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
270
+
271
+ def forward(
272
+ self,
273
+ hidden_states: torch.Tensor,
274
+ cu_seqlens: torch.IntTensor,
275
+ max_seqlen: int,
276
+ cos_h: torch.Tensor = None,
277
+ sin_h: torch.Tensor = None,
278
+ cos_w: torch.Tensor = None,
279
+ sin_w: torch.Tensor = None
280
+ ) -> torch.Tensor:
281
+ residual = hidden_states
282
+
283
+ hidden_states = self.layer_norm1(hidden_states)
284
+ hidden_states = self.self_attn(
285
+ hidden_states=hidden_states,
286
+ cu_seqlens=cu_seqlens,
287
+ max_seqlen=max_seqlen,
288
+ cos_h=cos_h,
289
+ sin_h=sin_h,
290
+ cos_w=cos_w,
291
+ sin_w=sin_w
292
+ )
293
+ hidden_states = residual + hidden_states
294
+
295
+ residual = hidden_states
296
+ hidden_states = self.layer_norm2(hidden_states)
297
+ hidden_states = self.mlp(hidden_states)
298
+ hidden_states = residual + hidden_states
299
+
300
+ return hidden_states
301
+
302
+
303
+ class SiglipEncoder(nn.Module):
304
+ def __init__(self, config: SiglipVisionConfig):
305
+ super().__init__()
306
+ self.config = config
307
+ self.layers = nn.ModuleList(
308
+ [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
309
+ )
310
+
311
+ def forward(
312
+ self,
313
+ inputs_embeds: torch.Tensor,
314
+ cu_seqlens: torch.IntTensor,
315
+ max_seqlen: int,
316
+ cos_h: torch.Tensor = None,
317
+ sin_h: torch.Tensor = None,
318
+ cos_w: torch.Tensor = None,
319
+ sin_w: torch.Tensor = None,
320
+ ) -> torch.Tensor:
321
+
322
+ hidden_states = inputs_embeds
323
+ for encoder_layer in self.layers:
324
+ hidden_states = encoder_layer(hidden_states, cu_seqlens, max_seqlen,
325
+ cos_h=cos_h, sin_h=sin_h, cos_w=cos_w, sin_w=sin_w)
326
+
327
+ return hidden_states
328
+
329
+
330
+ class SiglipVisionTransformer(nn.Module):
331
+ def __init__(self, config: SiglipVisionConfig):
332
+ super().__init__()
333
+ self.config = config
334
+ embed_dim = config.hidden_size
335
+
336
+ self.embeddings = SiglipVisionEmbeddings(config)
337
+ if config.rope:
338
+ max_size = config.image_size // config.patch_size
339
+ dim_head = config.hidden_size // config.num_attention_heads
340
+ self.rope = RotaryEmbedding2D(dim_head // 2, max_size, max_size)
341
+
342
+ self.encoder = SiglipEncoder(config)
343
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
344
+
345
+ def forward(
346
+ self,
347
+ packed_pixel_values: torch.Tensor,
348
+ packed_flattened_position_ids: torch.LongTensor,
349
+ cu_seqlens: torch.IntTensor,
350
+ max_seqlen: int,
351
+ ) -> torch.Tensor:
352
+ hidden_states = self.embeddings(
353
+ packed_pixel_values=packed_pixel_values,
354
+ packed_flattened_position_ids=packed_flattened_position_ids
355
+ )
356
+
357
+ extra_inputs = {}
358
+ if self.config.rope:
359
+ extra_inputs.update(
360
+ cos_h = self.rope.cos_h[packed_flattened_position_ids],
361
+ sin_h = self.rope.sin_h[packed_flattened_position_ids],
362
+ cos_w = self.rope.cos_w[packed_flattened_position_ids],
363
+ sin_w = self.rope.sin_w[packed_flattened_position_ids]
364
+ )
365
+
366
+ last_hidden_state = self.encoder(
367
+ inputs_embeds=hidden_states, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
368
+ **extra_inputs
369
+ )
370
+ last_hidden_state = self.post_layernorm(last_hidden_state)
371
+ return last_hidden_state
372
+
373
+
374
+ class SiglipVisionModel(SiglipPreTrainedModel):
375
+ config_class = SiglipVisionConfig
376
+ main_input_name = "packed_pixel_values"
377
+
378
+ def __init__(self, config: SiglipVisionConfig):
379
+ super().__init__(config)
380
+
381
+ self.vision_model = SiglipVisionTransformer(config)
382
+
383
+ # Initialize weights and apply final processing
384
+ self.post_init()
385
+
386
+ def get_input_embeddings(self) -> nn.Module:
387
+ return self.vision_model.embeddings.patch_embedding
388
+
389
+ def forward(
390
+ self,
391
+ packed_pixel_values: torch.Tensor,
392
+ packed_flattened_position_ids: torch.LongTensor,
393
+ cu_seqlens: torch.IntTensor,
394
+ max_seqlen: int,
395
+ ) -> torch.Tensor:
396
+
397
+ return self.vision_model(
398
+ packed_pixel_values=packed_pixel_values,
399
+ packed_flattened_position_ids=packed_flattened_position_ids,
400
+ cu_seqlens=cu_seqlens,
401
+ max_seqlen=max_seqlen,
402
+ )
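Note: `RotaryEmbedding2D` above factorizes each patch position into a row phase and a column phase; in `SiglipFlashAttention2` the head dimension is split in half, the first half rotated with the height tables and the second with the width tables. A small sketch (with illustrative sizes) of how the flattened tables are built and then indexed by `packed_flattened_position_ids`:

    import torch

    max_h = max_w = 4                          # num_patches_per_side in this sketch
    dim = 4                                    # corresponds to head_dim // 2 in the model
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))

    grid_h = torch.arange(max_h).float()[:, None].repeat(1, max_w)   # row index of each patch
    grid_w = torch.arange(max_w).float()[None, :].repeat(max_h, 1)   # column index of each patch

    emb_h = torch.cat([grid_h[..., None] * inv_freq] * 2, dim=-1).flatten(0, 1)   # (max_h*max_w, dim)
    emb_w = torch.cat([grid_w[..., None] * inv_freq] * 2, dim=-1).flatten(0, 1)

    pos_id = 2 * max_w + 3                     # flattened id of the patch at (row=2, col=3)
    cos_h, sin_h = emb_h[pos_id].cos(), emb_h[pos_id].sin()   # varies with the row only
    cos_w, sin_w = emb_w[pos_id].cos(), emb_w[pos_id].sin()   # varies with the column only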
modeling/qwen2/__init__.py ADDED
@@ -0,0 +1,68 @@
1
+ # Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import TYPE_CHECKING
5
+
6
+ from transformers.utils import (
7
+ OptionalDependencyNotAvailable,
8
+ _LazyModule,
9
+ is_tokenizers_available,
10
+ is_torch_available,
11
+ )
12
+
13
+
14
+ _import_structure = {
15
+ "configuration_qwen2": ["Qwen2Config"],
16
+ "tokenization_qwen2": ["Qwen2Tokenizer"],
17
+ }
18
+
19
+ try:
20
+ if not is_tokenizers_available():
21
+ raise OptionalDependencyNotAvailable()
22
+ except OptionalDependencyNotAvailable:
23
+ pass
24
+ else:
25
+ _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"]
26
+
27
+ try:
28
+ if not is_torch_available():
29
+ raise OptionalDependencyNotAvailable()
30
+ except OptionalDependencyNotAvailable:
31
+ pass
32
+ else:
33
+ _import_structure["modeling_qwen2"] = [
34
+ "Qwen2ForCausalLM",
35
+ "Qwen2Model",
36
+ "Qwen2PreTrainedModel",
37
+ ]
38
+
39
+
40
+ if TYPE_CHECKING:
41
+ from .configuration_qwen2 import Qwen2Config
42
+ from .tokenization_qwen2 import Qwen2Tokenizer
43
+
44
+ try:
45
+ if not is_tokenizers_available():
46
+ raise OptionalDependencyNotAvailable()
47
+ except OptionalDependencyNotAvailable:
48
+ pass
49
+ else:
50
+ from .tokenization_qwen2_fast import Qwen2TokenizerFast
51
+
52
+ try:
53
+ if not is_torch_available():
54
+ raise OptionalDependencyNotAvailable()
55
+ except OptionalDependencyNotAvailable:
56
+ pass
57
+ else:
58
+ from .modeling_qwen2 import (
59
+ Qwen2ForCausalLM,
60
+ Qwen2Model,
61
+ Qwen2PreTrainedModel,
62
+ )
63
+
64
+
65
+ else:
66
+ import sys
67
+
68
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
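Note: with the `_LazyModule` above, importing from the package only materializes the submodules that are actually requested, and the guarded symbols are exposed only when their optional dependencies are installed. A minimal usage sketch, assuming this repository layout:

    from modeling.qwen2 import Qwen2Config, Qwen2Tokenizer   # always available
    from modeling.qwen2 import Qwen2ForCausalLM              # only if torch is installed
    from modeling.qwen2 import Qwen2TokenizerFast            # only if tokenizers is installed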
modeling/qwen2/configuration_qwen2.py ADDED
@@ -0,0 +1,179 @@
1
+ # Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Qwen2 model configuration"""
5
+
6
+ from transformers.configuration_utils import PretrainedConfig
7
+ from transformers.modeling_rope_utils import rope_config_validation
8
+ from transformers.utils import logging
9
+
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ class Qwen2Config(PretrainedConfig):
15
+ r"""
16
+ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
17
+ Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
18
+ with the defaults will yield a similar configuration to that of
19
+ Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+
25
+ Args:
26
+ vocab_size (`int`, *optional*, defaults to 151936):
27
+ Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
28
+ `inputs_ids` passed when calling [`Qwen2Model`]
29
+ hidden_size (`int`, *optional*, defaults to 4096):
30
+ Dimension of the hidden representations.
31
+ intermediate_size (`int`, *optional*, defaults to 22016):
32
+ Dimension of the MLP representations.
33
+ num_hidden_layers (`int`, *optional*, defaults to 32):
34
+ Number of hidden layers in the Transformer encoder.
35
+ num_attention_heads (`int`, *optional*, defaults to 32):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ num_key_value_heads (`int`, *optional*, defaults to 32):
38
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
39
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
40
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
41
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
42
+ by meanpooling all the original heads within that group. For more details checkout [this
43
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
45
+ The non-linear activation function (function or string) in the decoder.
46
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
47
+ The maximum sequence length that this model might ever be used with.
48
+ initializer_range (`float`, *optional*, defaults to 0.02):
49
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
50
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
51
+ The epsilon used by the rms normalization layers.
52
+ use_cache (`bool`, *optional*, defaults to `True`):
53
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
54
+ relevant if `config.is_decoder=True`.
55
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
56
+ Whether the model's input and output word embeddings should be tied.
57
+ rope_theta (`float`, *optional*, defaults to 10000.0):
58
+ The base period of the RoPE embeddings.
59
+ rope_scaling (`Dict`, *optional*):
60
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
61
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
62
+ accordingly.
63
+ Expected contents:
64
+ `rope_type` (`str`):
65
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
66
+ 'llama3'], with 'default' being the original RoPE implementation.
67
+ `factor` (`float`, *optional*):
68
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
69
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
70
+ original maximum pre-trained length.
71
+ `original_max_position_embeddings` (`int`, *optional*):
72
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
73
+ pretraining.
74
+ `attention_factor` (`float`, *optional*):
75
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
76
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
77
+ `factor` field to infer the suggested value.
78
+ `beta_fast` (`float`, *optional*):
79
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
80
+ ramp function. If unspecified, it defaults to 32.
81
+ `beta_slow` (`float`, *optional*):
82
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
83
+ ramp function. If unspecified, it defaults to 1.
84
+ `short_factor` (`List[float]`, *optional*):
85
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
86
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
87
+ size divided by the number of attention heads divided by 2
88
+ `long_factor` (`List[float]`, *optional*):
89
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
90
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
91
+ size divided by the number of attention heads divided by 2
92
+ `low_freq_factor` (`float`, *optional*):
93
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
94
+ `high_freq_factor` (`float`, *optional*):
95
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
96
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
97
+ Whether to use sliding window attention.
98
+ sliding_window (`int`, *optional*, defaults to 4096):
99
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
100
+ max_window_layers (`int`, *optional*, defaults to 28):
101
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
102
+ attention_dropout (`float`, *optional*, defaults to 0.0):
103
+ The dropout ratio for the attention probabilities.
104
+
105
+ ```python
106
+ >>> from transformers import Qwen2Model, Qwen2Config
107
+
108
+ >>> # Initializing a Qwen2 style configuration
109
+ >>> configuration = Qwen2Config()
110
+
111
+ >>> # Initializing a model from the Qwen2-7B style configuration
112
+ >>> model = Qwen2Model(configuration)
113
+
114
+ >>> # Accessing the model configuration
115
+ >>> configuration = model.config
116
+ ```"""
117
+
118
+ model_type = "qwen2"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=151936,
124
+ hidden_size=4096,
125
+ intermediate_size=22016,
126
+ num_hidden_layers=32,
127
+ num_attention_heads=32,
128
+ num_key_value_heads=32,
129
+ hidden_act="silu",
130
+ max_position_embeddings=32768,
131
+ initializer_range=0.02,
132
+ rms_norm_eps=1e-6,
133
+ use_cache=True,
134
+ tie_word_embeddings=False,
135
+ rope_theta=10000.0,
136
+ rope_scaling=None,
137
+ use_sliding_window=False,
138
+ sliding_window=4096,
139
+ max_window_layers=28,
140
+ attention_dropout=0.0,
141
+ is_causal=True,
142
+ _attn_implementation="flash_attention_2",
143
+ **kwargs,
144
+ ):
145
+ self.vocab_size = vocab_size
146
+ self.max_position_embeddings = max_position_embeddings
147
+ self.hidden_size = hidden_size
148
+ self.intermediate_size = intermediate_size
149
+ self.num_hidden_layers = num_hidden_layers
150
+ self.num_attention_heads = num_attention_heads
151
+ self.use_sliding_window = use_sliding_window
152
+ self.sliding_window = sliding_window if use_sliding_window else None
153
+ self.max_window_layers = max_window_layers
154
+
155
+ # for backward compatibility
156
+ if num_key_value_heads is None:
157
+ num_key_value_heads = num_attention_heads
158
+
159
+ self.num_key_value_heads = num_key_value_heads
160
+ self.hidden_act = hidden_act
161
+ self.initializer_range = initializer_range
162
+ self.rms_norm_eps = rms_norm_eps
163
+ self.use_cache = use_cache
164
+ self.rope_theta = rope_theta
165
+ self.rope_scaling = rope_scaling
166
+ self.attention_dropout = attention_dropout
167
+ self.is_causal = is_causal
168
+ self._attn_implementation = _attn_implementation
169
+
170
+ # Validate the correctness of rotary position embeddings parameters
171
+ # BC: if there is a 'type' field, move it to 'rope_type'.
172
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
173
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
174
+ rope_config_validation(self)
175
+
176
+ super().__init__(
177
+ tie_word_embeddings=tie_word_embeddings,
178
+ **kwargs,
179
+ )
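Note: the configuration above keeps the upstream Qwen2 fields and additionally records `is_causal` and an `_attn_implementation` default of `"flash_attention_2"`; legacy `rope_scaling` dictionaries that use the old `"type"` key are rewritten to `"rope_type"` before validation. A construction sketch with illustrative field values:

    from modeling.qwen2.configuration_qwen2 import Qwen2Config

    cfg = Qwen2Config(
        hidden_size=1024,
        intermediate_size=2816,
        num_hidden_layers=4,
        num_attention_heads=16,
        num_key_value_heads=4,                           # GQA: 16 query heads share 4 KV heads
        rope_scaling={"type": "linear", "factor": 2.0},  # legacy "type" key
    )
    print(cfg.rope_scaling["rope_type"])                 # "linear" after the backward-compat rewrite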
modeling/qwen2/modeling_qwen2.py ADDED
@@ -0,0 +1,929 @@
1
+ # Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """PyTorch Qwen2 model."""
5
+
6
+ import math
7
+ from typing import List, Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.utils.checkpoint
11
+ from torch import nn
12
+
13
+ from transformers.activations import ACT2FN
14
+ from transformers.cache_utils import Cache, DynamicCache
15
+ from transformers.generation import GenerationMixin
16
+ from transformers.modeling_outputs import (
17
+ BaseModelOutputWithPast,
18
+ CausalLMOutputWithPast,
19
+ )
20
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers.utils import (
23
+ add_start_docstrings,
24
+ add_start_docstrings_to_model_forward,
25
+ is_flash_attn_2_available,
26
+ is_flash_attn_greater_or_equal_2_10,
27
+ logging,
28
+ replace_return_docstrings,
29
+ )
30
+ from .configuration_qwen2 import Qwen2Config
31
+
32
+
33
+ if is_flash_attn_2_available():
34
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
35
+
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+
40
+ _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B"
41
+ _CONFIG_FOR_DOC = "Qwen2Config"
42
+
43
+
44
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
45
+ class Qwen2RMSNorm(nn.Module):
46
+ def __init__(self, hidden_size, eps=1e-6):
47
+ """
48
+ Qwen2RMSNorm is equivalent to T5LayerNorm
49
+ """
50
+ super().__init__()
51
+ self.weight = nn.Parameter(torch.ones(hidden_size))
52
+ self.variance_epsilon = eps
53
+
54
+ def forward(self, hidden_states):
55
+ input_dtype = hidden_states.dtype
56
+ hidden_states = hidden_states.to(torch.float32)
57
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
58
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
59
+ return self.weight * hidden_states.to(input_dtype)
60
+
61
+ def extra_repr(self):
62
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
63
+
64
+
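Note: `Qwen2RMSNorm` above normalizes by the root-mean-square of the hidden vector (computed in float32) rather than centering it, i.e. y = weight * x / sqrt(mean(x**2) + eps). A tiny equivalence sketch:

    import torch

    x = torch.randn(2, 5)
    eps, weight = 1e-6, torch.ones(5)                    # learned scale, ones at init
    y = weight * (x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + eps)).to(x.dtype)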
65
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
66
+ class Qwen2RotaryEmbedding(nn.Module):
67
+ def __init__(
68
+ self,
69
+ dim=None,
70
+ max_position_embeddings=2048,
71
+ base=10000,
72
+ device=None,
73
+ scaling_factor=1.0,
74
+ rope_type="default",
75
+ config: Optional[Qwen2Config] = None,
76
+ ):
77
+ super().__init__()
78
+ # TODO (joao): remove the `if` below, only used for BC
79
+ self.rope_kwargs = {}
80
+ if config is None:
81
+ logger.warning_once(
82
+ "`Qwen2RotaryEmbedding` can now be fully parameterized by passing the model config through the "
83
+ "`config` argument. All other arguments will be removed in v4.46"
84
+ )
85
+ self.rope_kwargs = {
86
+ "rope_type": rope_type,
87
+ "factor": scaling_factor,
88
+ "dim": dim,
89
+ "base": base,
90
+ "max_position_embeddings": max_position_embeddings,
91
+ }
92
+ self.rope_type = rope_type
93
+ self.max_seq_len_cached = max_position_embeddings
94
+ self.original_max_seq_len = max_position_embeddings
95
+ else:
96
+ # BC: "rope_type" was originally "type"
97
+ if config.rope_scaling is not None:
98
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
99
+ else:
100
+ self.rope_type = "default"
101
+ self.max_seq_len_cached = config.max_position_embeddings
102
+ self.original_max_seq_len = config.max_position_embeddings
103
+
104
+ self.config = config
105
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
106
+
107
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
108
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
109
+ self.original_inv_freq = self.inv_freq
110
+
111
+ def _dynamic_frequency_update(self, position_ids, device):
112
+ """
113
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
114
+ 1 - growing beyond the cached sequence length (allow scaling)
115
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
116
+ """
117
+ seq_len = torch.max(position_ids) + 1
118
+ if seq_len > self.max_seq_len_cached: # growth
119
+ inv_freq, self.attention_scaling = self.rope_init_fn(
120
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
121
+ )
122
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
123
+ self.max_seq_len_cached = seq_len
124
+
125
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
126
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
127
+ self.max_seq_len_cached = self.original_max_seq_len
128
+
129
+ @torch.no_grad()
130
+ def forward(self, x, position_ids):
131
+ if "dynamic" in self.rope_type:
132
+ self._dynamic_frequency_update(position_ids, device=x.device)
133
+
134
+ # Core RoPE block
135
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
136
+ position_ids_expanded = position_ids[:, None, :].float()
137
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
138
+ device_type = x.device.type
139
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
140
+ with torch.autocast(device_type=device_type, enabled=False):
141
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
142
+ emb = torch.cat((freqs, freqs), dim=-1)
143
+ cos = emb.cos()
144
+ sin = emb.sin()
145
+
146
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
147
+ cos = cos * self.attention_scaling
148
+ sin = sin * self.attention_scaling
149
+
150
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
151
+
152
+
153
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
154
+ def rotate_half(x):
155
+ """Rotates half the hidden dims of the input."""
156
+ x1 = x[..., : x.shape[-1] // 2]
157
+ x2 = x[..., x.shape[-1] // 2 :]
158
+ return torch.cat((-x2, x1), dim=-1)
159
+
160
+
161
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
162
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
163
+ """Applies Rotary Position Embedding to the query and key tensors.
164
+
165
+ Args:
166
+ q (`torch.Tensor`): The query tensor.
167
+ k (`torch.Tensor`): The key tensor.
168
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
169
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
170
+ position_ids (`torch.Tensor`, *optional*):
171
+ Deprecated and unused.
172
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
173
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
174
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
175
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
176
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
177
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
178
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
179
+ Returns:
180
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
181
+ """
182
+ cos = cos.unsqueeze(unsqueeze_dim)
183
+ sin = sin.unsqueeze(unsqueeze_dim)
184
+ q_embed = (q * cos) + (rotate_half(q) * sin)
185
+ k_embed = (k * cos) + (rotate_half(k) * sin)
186
+ return q_embed, k_embed
187
+
188
+
189
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
190
+ class Qwen2MLP(nn.Module):
191
+ def __init__(self, config):
192
+ super().__init__()
193
+ self.hidden_size = config.hidden_size
194
+ self.intermediate_size = config.intermediate_size
195
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
196
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
197
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
198
+ self.act_fn = ACT2FN[config.hidden_act]
199
+
200
+ def forward(self, hidden_state):
201
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
202
+
203
+
204
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
205
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
206
+ """
207
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
208
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
209
+ """
210
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
211
+ if n_rep == 1:
212
+ return hidden_states
213
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
214
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
215
+
216
+
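Note: `repeat_kv` above expands the grouped key/value heads so that every query head sees a matching KV head (grouped-query attention) before the eager matmul path. A small shape sketch with illustrative sizes:

    import torch

    batch, kv_heads, seq, head_dim, n_rep = 1, 2, 3, 4, 4           # 2 KV heads shared by 8 query heads
    kv = torch.randn(batch, kv_heads, seq, head_dim)
    expanded = kv[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq, head_dim)
    out = expanded.reshape(batch, kv_heads * n_rep, seq, head_dim)  # (1, 8, 3, 4) == repeat_kv(kv, 4).shape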
217
+ class Qwen2Attention(nn.Module):
218
+ """
219
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
220
+ and "Generating Long Sequences with Sparse Transformers".
221
+ """
222
+
223
+ def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
224
+ super().__init__()
225
+ self.config = config
226
+ self.layer_idx = layer_idx
227
+ if layer_idx is None:
228
+ logger.warning_once(
229
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
230
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
231
+ "when creating this class."
232
+ )
233
+
234
+ self.hidden_size = config.hidden_size
235
+ self.num_heads = config.num_attention_heads
236
+ self.head_dim = self.hidden_size // self.num_heads
237
+ self.num_key_value_heads = config.num_key_value_heads
238
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
239
+ self.max_position_embeddings = config.max_position_embeddings
240
+ self.rope_theta = config.rope_theta
241
+ self.is_causal = config.is_causal
242
+ self.attention_dropout = config.attention_dropout
243
+
244
+ if (self.head_dim * self.num_heads) != self.hidden_size:
245
+ raise ValueError(
246
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
247
+ f" and `num_heads`: {self.num_heads})."
248
+ )
249
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
250
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
251
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
252
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
253
+
254
+ def forward(
255
+ self,
256
+ hidden_states: torch.Tensor,
257
+ attention_mask: Optional[torch.Tensor] = None,
258
+ position_ids: Optional[torch.LongTensor] = None,
259
+ past_key_value: Optional[Cache] = None,
260
+ output_attentions: bool = False,
261
+ use_cache: bool = False,
262
+ cache_position: Optional[torch.LongTensor] = None,
263
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
264
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
265
+ bsz, q_len, _ = hidden_states.size()
266
+
267
+ query_states = self.q_proj(hidden_states)
268
+ key_states = self.k_proj(hidden_states)
269
+ value_states = self.v_proj(hidden_states)
270
+
271
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
272
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
273
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
274
+
275
+ if position_embeddings is None:
276
+ logger.warning_once(
277
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
278
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
279
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
280
+ "removed and `position_embeddings` will be mandatory."
281
+ )
282
+ cos, sin = self.rotary_emb(value_states, position_ids)
283
+ else:
284
+ cos, sin = position_embeddings
285
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
286
+
287
+ if past_key_value is not None:
288
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
289
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
290
+
291
+ # repeat k/v heads if n_kv_heads < n_heads
292
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
293
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
294
+
295
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
296
+ if attention_mask is not None: # no matter the length, we just slice it
297
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
298
+ attn_weights = attn_weights + causal_mask
299
+
300
+ # upcast attention to fp32
301
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
302
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
303
+ attn_output = torch.matmul(attn_weights, value_states)
304
+
305
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
306
+ raise ValueError(
307
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
308
+ f" {attn_output.size()}"
309
+ )
310
+
311
+ attn_output = attn_output.transpose(1, 2).contiguous()
312
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
313
+
314
+ attn_output = self.o_proj(attn_output)
315
+
316
+ if not output_attentions:
317
+ attn_weights = None
318
+
319
+ return attn_output, attn_weights, past_key_value
320
+
321
+
322
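# NOTE (editor): minimal, self-contained sketch -- not part of the diff above. It
# illustrates the grouped-query-attention bookkeeping used by Qwen2Attention: K/V are
# projected with `num_key_value_heads` heads and repeated `num_key_value_groups` times
# so the eager matmul sees `num_heads` heads. All sizes below are made up.
import torch

bsz, q_len, num_heads, num_kv_heads, head_dim = 2, 5, 8, 2, 16
groups = num_heads // num_kv_heads                    # num_key_value_groups == 4

q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)
v = torch.randn(bsz, num_kv_heads, q_len, head_dim)

# same effect as repeat_kv(k, groups): (bsz, num_kv_heads, L, d) -> (bsz, num_heads, L, d)
k = k.repeat_interleave(groups, dim=1)
v = v.repeat_interleave(groups, dim=1)

scores = q @ k.transpose(2, 3) / (head_dim ** 0.5)    # (bsz, num_heads, q_len, q_len)
attn = torch.softmax(scores, dim=-1) @ v              # (bsz, num_heads, q_len, head_dim)
assert attn.shape == (bsz, num_heads, q_len, head_dim)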
+ class Qwen2FlashAttention2(Qwen2Attention):
323
+ """
324
+ Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
325
+ as the weights of the module stay untouched. The only required change would be on the forward pass
326
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
327
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
328
+ config.max_window_layers layers.
329
+ """
330
+
331
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
332
+ def __init__(self, *args, **kwargs):
333
+ super().__init__(*args, **kwargs)
334
+
335
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
336
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
337
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
338
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
339
+
340
+ def forward(
341
+ self,
342
+ hidden_states: torch.Tensor,
343
+ attention_mask: Optional[torch.Tensor] = None,
344
+ position_ids: Optional[torch.LongTensor] = None,
345
+ past_key_value: Optional[Cache] = None,
346
+ output_attentions: bool = False,
347
+ use_cache: bool = False,
348
+ cache_position: Optional[torch.LongTensor] = None,
349
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
350
+ ):
351
+ bsz, q_len, _ = hidden_states.size()
352
+
353
+ query_states = self.q_proj(hidden_states)
354
+ key_states = self.k_proj(hidden_states)
355
+ value_states = self.v_proj(hidden_states)
356
+
357
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
358
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
359
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
360
+
361
+ if position_embeddings is None:
362
+ logger.warning_once(
363
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
364
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
365
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
366
+ "removed and `position_embeddings` will be mandatory."
367
+ )
368
+ cos, sin = self.rotary_emb(value_states, position_ids)
369
+ else:
370
+ cos, sin = position_embeddings
371
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
372
+
373
+ if past_key_value is not None:
374
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
375
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
376
+
377
+ # repeat k/v heads if n_kv_heads < n_heads
378
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
379
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
380
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
381
+
382
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
383
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
384
+ # cast them back in float16 just to be sure everything works as expected.
385
+ input_dtype = query_states.dtype
386
+ if input_dtype == torch.float32:
387
+ if torch.is_autocast_enabled():
388
+ target_dtype = torch.get_autocast_gpu_dtype()
389
+ # Handle the case where the model is quantized
390
+ elif hasattr(self.config, "_pre_quantization_dtype"):
391
+ target_dtype = self.config._pre_quantization_dtype
392
+ else:
393
+ target_dtype = self.q_proj.weight.dtype
394
+
395
+ logger.warning_once(
396
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
397
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
398
+ f" {target_dtype}."
399
+ )
400
+
401
+ query_states = query_states.to(target_dtype)
402
+ key_states = key_states.to(target_dtype)
403
+ value_states = value_states.to(target_dtype)
404
+
405
+ # Reshape to the expected shape for Flash Attention
406
+ query_states = query_states.transpose(1, 2)
407
+ key_states = key_states.transpose(1, 2)
408
+ value_states = value_states.transpose(1, 2)
409
+
410
+ if (
411
+ self.config.use_sliding_window
412
+ and getattr(self.config, "sliding_window", None) is not None
413
+ and self.layer_idx >= self.config.max_window_layers
414
+ ):
415
+ sliding_window = self.config.sliding_window
416
+ else:
417
+ sliding_window = None
418
+
419
+ attn_output = _flash_attention_forward(
420
+ query_states,
421
+ key_states,
422
+ value_states,
423
+ attention_mask,
424
+ q_len,
425
+ position_ids=position_ids,
426
+ dropout=dropout_rate,
427
+ sliding_window=sliding_window,
428
+ is_causal=self.is_causal,
429
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
430
+ )
431
+
432
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
433
+ attn_output = self.o_proj(attn_output)
434
+
435
+ if not output_attentions:
436
+ attn_weights = None
437
+
438
+ return attn_output, attn_weights, past_key_value
439
+
440
+
441
+ QWEN2_ATTENTION_CLASSES = {
442
+ "eager": Qwen2Attention,
443
+ "flash_attention_2": Qwen2FlashAttention2,
444
+ }
445
+
446
+
447
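# NOTE (editor): illustrative sketch, not part of the diff above. It mirrors (a) how
# `config._attn_implementation` picks an entry from QWEN2_ATTENTION_CLASSES and (b) the
# per-layer gating in Qwen2FlashAttention2.forward, where a sliding window is only
# passed to flash attention for layers with index >= max_window_layers. Values are made up.
class _ToyConfig:
    _attn_implementation = "flash_attention_2"
    use_sliding_window = True
    sliding_window = 4096
    max_window_layers = 28
    num_hidden_layers = 32

cfg = _ToyConfig()
impl_names = {"eager": "Qwen2Attention", "flash_attention_2": "Qwen2FlashAttention2"}
print("attention class:", impl_names[cfg._attn_implementation])

windowed_layers = [
    layer_idx
    for layer_idx in range(cfg.num_hidden_layers)
    # same condition as in Qwen2FlashAttention2.forward
    if cfg.use_sliding_window and cfg.sliding_window is not None and layer_idx >= cfg.max_window_layers
]
print("layers using a sliding window:", windowed_layers)  # [28, 29, 30, 31] with these toy values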
+ class Qwen2DecoderLayer(nn.Module):
448
+ def __init__(self, config: Qwen2Config, layer_idx: int):
449
+ super().__init__()
450
+ self.hidden_size = config.hidden_size
451
+
452
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
453
+ logger.warning_once(
454
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
455
+ "unexpected results may be encountered."
456
+ )
457
+ self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
458
+
459
+ self.mlp = Qwen2MLP(config)
460
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
461
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
462
+
463
+ def forward(
464
+ self,
465
+ hidden_states: torch.Tensor,
466
+ attention_mask: Optional[torch.Tensor] = None,
467
+ position_ids: Optional[torch.LongTensor] = None,
468
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
469
+ output_attentions: Optional[bool] = False,
470
+ use_cache: Optional[bool] = False,
471
+ cache_position: Optional[torch.LongTensor] = None,
472
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
473
+ **kwargs,
474
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
475
+ """
476
+ Args:
477
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
478
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
479
+ `(batch, sequence_length)` where padding elements are indicated by 0.
480
+ output_attentions (`bool`, *optional*):
481
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
482
+ returned tensors for more detail.
483
+ use_cache (`bool`, *optional*):
484
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
485
+ (see `past_key_values`).
486
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
487
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
488
+ Indices depicting the position of the input sequence tokens in the sequence.
489
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
490
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
491
+ with `head_dim` being the embedding dimension of each attention head.
492
+ kwargs (`dict`, *optional*):
493
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
494
+ into the model
495
+ """
496
+
497
+ residual = hidden_states
498
+
499
+ hidden_states = self.input_layernorm(hidden_states)
500
+
501
+ # Self Attention
502
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
503
+ hidden_states=hidden_states,
504
+ attention_mask=attention_mask,
505
+ position_ids=position_ids,
506
+ past_key_value=past_key_value,
507
+ output_attentions=output_attentions,
508
+ use_cache=use_cache,
509
+ cache_position=cache_position,
510
+ position_embeddings=position_embeddings,
511
+ )
512
+ hidden_states = residual + hidden_states
513
+
514
+ # Fully Connected
515
+ residual = hidden_states
516
+ hidden_states = self.post_attention_layernorm(hidden_states)
517
+ hidden_states = self.mlp(hidden_states)
518
+ hidden_states = residual + hidden_states
519
+
520
+ outputs = (hidden_states,)
521
+
522
+ if output_attentions:
523
+ outputs += (self_attn_weights,)
524
+
525
+ if use_cache:
526
+ outputs += (present_key_value,)
527
+
528
+ return outputs
529
+
530
+
531
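# NOTE (editor): schematic sketch of the pre-norm residual ordering implemented by
# Qwen2DecoderLayer.forward above -- not part of the diff. `attn` and `mlp` are toy
# stand-ins for the real sub-modules; only the dataflow is the point here.
import torch

def decoder_block(h, norm1, attn, norm2, mlp):
    h = h + attn(norm1(h))   # residual around (RMSNorm -> self-attention)
    h = h + mlp(norm2(h))    # residual around (RMSNorm -> MLP)
    return h

d = 8
h = torch.randn(2, 5, d)
identity = lambda x: x       # stands in for the RMSNorm layers
out = decoder_block(h, identity, torch.nn.Linear(d, d), identity, torch.nn.Linear(d, d))
assert out.shape == h.shape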
+ QWEN2_START_DOCSTRING = r"""
532
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
533
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
534
+ etc.)
535
+
536
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
537
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
538
+ and behavior.
539
+
540
+ Parameters:
541
+ config ([`Qwen2Config`]):
542
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
543
+ load the weights associated with the model, only the configuration. Check out the
544
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
545
+ """
546
+
547
+
548
+ @add_start_docstrings(
549
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
550
+ QWEN2_START_DOCSTRING,
551
+ )
552
+ class Qwen2PreTrainedModel(PreTrainedModel):
553
+ config_class = Qwen2Config
554
+ base_model_prefix = "model"
555
+ supports_gradient_checkpointing = True
556
+ _no_split_modules = ["Qwen2DecoderLayer"]
557
+ _skip_keys_device_placement = "past_key_values"
558
+ _supports_flash_attn_2 = True
559
+ _supports_cache_class = True
560
+ _supports_quantized_cache = True
561
+ _supports_static_cache = True
562
+
563
+ def _init_weights(self, module):
564
+ std = self.config.initializer_range
565
+ if isinstance(module, nn.Linear):
566
+ module.weight.data.normal_(mean=0.0, std=std)
567
+ if module.bias is not None:
568
+ module.bias.data.zero_()
569
+ elif isinstance(module, nn.Embedding):
570
+ module.weight.data.normal_(mean=0.0, std=std)
571
+ if module.padding_idx is not None:
572
+ module.weight.data[module.padding_idx].zero_()
573
+
574
+
575
+ QWEN2_INPUTS_DOCSTRING = r"""
576
+ Args:
577
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
578
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
579
+ it.
580
+
581
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
582
+ [`PreTrainedTokenizer.__call__`] for details.
583
+
584
+ [What are input IDs?](../glossary#input-ids)
585
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
586
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
587
+
588
+ - 1 for tokens that are **not masked**,
589
+ - 0 for tokens that are **masked**.
590
+
591
+ [What are attention masks?](../glossary#attention-mask)
592
+
593
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
594
+ [`PreTrainedTokenizer.__call__`] for details.
595
+
596
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
597
+ `past_key_values`).
598
+
599
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
600
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
601
+ information on the default strategy.
602
+
603
+ - 1 indicates the head is **not masked**,
604
+ - 0 indicates the head is **masked**.
605
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
606
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
607
+ config.n_positions - 1]`.
608
+
609
+ [What are position IDs?](../glossary#position-ids)
610
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
611
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
612
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
613
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
614
+
615
+ Two formats are allowed:
616
+ - a [`~cache_utils.Cache`] instance, see our
617
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
618
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
619
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
620
+ cache format.
621
+
622
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
623
+ legacy cache format will be returned.
624
+
625
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
626
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
627
+ of shape `(batch_size, sequence_length)`.
628
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
629
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
630
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
631
+ model's internal embedding lookup matrix.
632
+ use_cache (`bool`, *optional*):
633
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
634
+ `past_key_values`).
635
+ output_attentions (`bool`, *optional*):
636
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
637
+ tensors for more detail.
638
+ output_hidden_states (`bool`, *optional*):
639
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
640
+ more detail.
641
+ return_dict (`bool`, *optional*):
642
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
643
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
644
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
645
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
646
+ the complete sequence length.
647
+ """
648
+
649
+
650
+ @add_start_docstrings(
651
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
652
+ QWEN2_START_DOCSTRING,
653
+ )
654
+ class Qwen2Model(Qwen2PreTrainedModel):
655
+ """
656
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
657
+
658
+ Args:
659
+ config: Qwen2Config
660
+ """
661
+
662
+ def __init__(self, config: Qwen2Config):
663
+ super().__init__(config)
664
+ self.padding_idx = config.pad_token_id
665
+ self.vocab_size = config.vocab_size
666
+
667
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
668
+ self.layers = nn.ModuleList(
669
+ [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
670
+ )
671
+ self._attn_implementation = config._attn_implementation
672
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
673
+ self.rotary_emb = Qwen2RotaryEmbedding(config=config)
674
+
675
+ self.gradient_checkpointing = False
676
+ # Initialize weights and apply final processing
677
+ self.post_init()
678
+
679
+ def get_input_embeddings(self):
680
+ return self.embed_tokens
681
+
682
+ def set_input_embeddings(self, value):
683
+ self.embed_tokens = value
684
+
685
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
686
+ def forward(
687
+ self,
688
+ input_ids: torch.LongTensor = None,
689
+ attention_mask: Optional[torch.Tensor] = None,
690
+ position_ids: Optional[torch.LongTensor] = None,
691
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
692
+ inputs_embeds: Optional[torch.FloatTensor] = None,
693
+ use_cache: Optional[bool] = None,
694
+ output_attentions: Optional[bool] = None,
695
+ output_hidden_states: Optional[bool] = None,
696
+ return_dict: Optional[bool] = None,
697
+ cache_position: Optional[torch.LongTensor] = None,
698
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
699
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
700
+ output_hidden_states = (
701
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
702
+ )
703
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
704
+
705
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
706
+
707
+ if (input_ids is None) ^ (inputs_embeds is not None):
708
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
709
+
710
+ if self.gradient_checkpointing and self.training:
711
+ if use_cache:
712
+ logger.warning_once(
713
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
714
+ )
715
+ use_cache = False
716
+
717
+ # kept for BC (non `Cache` `past_key_values` inputs)
718
+ return_legacy_cache = False
719
+ if use_cache and not isinstance(past_key_values, Cache):
720
+ return_legacy_cache = True
721
+ if past_key_values is None:
722
+ past_key_values = DynamicCache()
723
+ else:
724
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
725
+ logger.warning_once(
726
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
727
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
728
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
729
+ )
730
+
731
+ if inputs_embeds is None:
732
+ inputs_embeds = self.embed_tokens(input_ids)
733
+
734
+ if cache_position is None:
735
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
736
+ cache_position = torch.arange(
737
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
738
+ )
739
+ if position_ids is None:
740
+ position_ids = cache_position.unsqueeze(0)
741
+
742
+ if attention_mask is not None and 0.0 in attention_mask:
743
+ causal_mask = attention_mask
744
+ else:
745
+ causal_mask = None
746
+
747
+ hidden_states = inputs_embeds
748
+ # create position embeddings to be shared across the decoder layers
749
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
750
+
751
+ # decoder layers
752
+ all_hidden_states = () if output_hidden_states else None
753
+ all_self_attns = () if output_attentions else None
754
+ next_decoder_cache = None
755
+
756
+ for decoder_layer in self.layers:
757
+ if output_hidden_states:
758
+ all_hidden_states += (hidden_states,)
759
+
760
+ if self.gradient_checkpointing and self.training:
761
+ layer_outputs = self._gradient_checkpointing_func(
762
+ decoder_layer.__call__,
763
+ hidden_states,
764
+ causal_mask,
765
+ position_ids,
766
+ past_key_values,
767
+ output_attentions,
768
+ use_cache,
769
+ cache_position,
770
+ position_embeddings,
771
+ )
772
+ else:
773
+ layer_outputs = decoder_layer(
774
+ hidden_states,
775
+ attention_mask=causal_mask,
776
+ position_ids=position_ids,
777
+ past_key_value=past_key_values,
778
+ output_attentions=output_attentions,
779
+ use_cache=use_cache,
780
+ cache_position=cache_position,
781
+ position_embeddings=position_embeddings,
782
+ )
783
+
784
+ hidden_states = layer_outputs[0]
785
+
786
+ if use_cache:
787
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
788
+
789
+ if output_attentions:
790
+ all_self_attns += (layer_outputs[1],)
791
+
792
+ hidden_states = self.norm(hidden_states)
793
+
794
+ # add hidden states from the last decoder layer
795
+ if output_hidden_states:
796
+ all_hidden_states += (hidden_states,)
797
+
798
+ next_cache = next_decoder_cache if use_cache else None
799
+ if return_legacy_cache:
800
+ next_cache = next_cache.to_legacy_cache()
801
+
802
+ if not return_dict:
803
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
804
+ return BaseModelOutputWithPast(
805
+ last_hidden_state=hidden_states,
806
+ past_key_values=next_cache,
807
+ hidden_states=all_hidden_states,
808
+ attentions=all_self_attns,
809
+ )
810
+
811
+
812
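# NOTE (editor): small sketch -- not part of the diff -- of how Qwen2Model.forward
# derives `cache_position` and `position_ids` when they are not supplied: positions
# simply continue from however many tokens are already in the KV cache.
import torch

past_seen_tokens = 7          # e.g. past_key_values.get_seq_length()
new_tokens = 3                # inputs_embeds.shape[1] for this decoding step

cache_position = torch.arange(past_seen_tokens, past_seen_tokens + new_tokens)
position_ids = cache_position.unsqueeze(0)   # broadcast over the batch dimension

assert cache_position.tolist() == [7, 8, 9]
assert position_ids.shape == (1, new_tokens)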
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
813
+ _tied_weights_keys = ["lm_head.weight"]
814
+
815
+ def __init__(self, config):
816
+ super().__init__(config)
817
+ self.model = Qwen2Model(config)
818
+ self.vocab_size = config.vocab_size
819
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
820
+
821
+ # Initialize weights and apply final processing
822
+ self.post_init()
823
+
824
+ def get_input_embeddings(self):
825
+ return self.model.embed_tokens
826
+
827
+ def set_input_embeddings(self, value):
828
+ self.model.embed_tokens = value
829
+
830
+ def get_output_embeddings(self):
831
+ return self.lm_head
832
+
833
+ def set_output_embeddings(self, new_embeddings):
834
+ self.lm_head = new_embeddings
835
+
836
+ def set_decoder(self, decoder):
837
+ self.model = decoder
838
+
839
+ def get_decoder(self):
840
+ return self.model
841
+
842
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
843
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
844
+ def forward(
845
+ self,
846
+ input_ids: torch.LongTensor = None,
847
+ attention_mask: Optional[torch.Tensor] = None,
848
+ position_ids: Optional[torch.LongTensor] = None,
849
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
850
+ inputs_embeds: Optional[torch.FloatTensor] = None,
851
+ labels: Optional[torch.LongTensor] = None,
852
+ use_cache: Optional[bool] = None,
853
+ output_attentions: Optional[bool] = None,
854
+ output_hidden_states: Optional[bool] = None,
855
+ return_dict: Optional[bool] = None,
856
+ cache_position: Optional[torch.LongTensor] = None,
857
+ num_logits_to_keep: int = 0,
858
+ **loss_kwargs,
859
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
860
+ r"""
861
+ Args:
862
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
863
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
864
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
865
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
866
+
867
+ num_logits_to_keep (`int`, *optional*):
868
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
869
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
870
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
871
+
872
+ Returns:
873
+
874
+ Example:
875
+
876
+ ```python
877
+ >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
878
+
879
+ >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
880
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
881
+
882
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
883
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
884
+
885
+ >>> # Generate
886
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
887
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
888
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
889
+ ```"""
890
+
891
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
892
+ output_hidden_states = (
893
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
894
+ )
895
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
896
+
897
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
898
+ outputs = self.model(
899
+ input_ids=input_ids,
900
+ attention_mask=attention_mask,
901
+ position_ids=position_ids,
902
+ past_key_values=past_key_values,
903
+ inputs_embeds=inputs_embeds,
904
+ use_cache=use_cache,
905
+ output_attentions=output_attentions,
906
+ output_hidden_states=output_hidden_states,
907
+ return_dict=return_dict,
908
+ cache_position=cache_position,
909
+ )
910
+
911
+ hidden_states = outputs[0]
912
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
913
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
914
+
915
+ loss = None
916
+ if labels is not None:
917
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
918
+
919
+ if not return_dict:
920
+ output = (logits,) + outputs[1:]
921
+ return (loss,) + output if loss is not None else output
922
+
923
+ return CausalLMOutputWithPast(
924
+ loss=loss,
925
+ logits=logits,
926
+ past_key_values=outputs.past_key_values,
927
+ hidden_states=outputs.hidden_states,
928
+ attentions=outputs.attentions,
929
+ )
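# NOTE (editor): tiny demo -- not part of the diff -- of the `num_logits_to_keep`
# slicing used in Qwen2ForCausalLM.forward: `hidden_states[:, -k:, :]` keeps the last
# k positions, and k == 0 degenerates to keeping everything (the documented special case).
import torch

hidden_states = torch.randn(2, 10, 4)
assert hidden_states[:, -1:, :].shape == (2, 1, 4)    # generation: only the last token
assert hidden_states[:, -0:, :].shape == (2, 10, 4)   # k == 0 -> all positions, since -0 == 0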
modeling/qwen2/tokenization_qwen2.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Tokenization classes for Qwen2."""
5
+
6
+ import json
7
+ import os
8
+ import unicodedata
9
+ from functools import lru_cache
10
+ from typing import Optional, Tuple
11
+
12
+ import regex as re
13
+
14
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
15
+ from transformers.utils import logging
16
+
17
+
18
+ logger = logging.get_logger(__name__)
19
+
20
+ VOCAB_FILES_NAMES = {
21
+ "vocab_file": "vocab.json",
22
+ "merges_file": "merges.txt",
23
+ }
24
+
25
+
26
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
27
+
28
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
29
+
30
+
31
+ @lru_cache()
32
+ # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
33
+ def bytes_to_unicode():
34
+ """
35
+ Returns a mapping from utf-8 bytes to unicode strings. We specifically avoid mapping to whitespace/control
36
+ characters the bpe code barfs on.
37
+
38
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
39
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
40
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
41
+ tables between utf-8 bytes and unicode strings.
42
+ """
43
+ bs = (
44
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
45
+ )
46
+ cs = bs[:]
47
+ n = 0
48
+ for b in range(2**8):
49
+ if b not in bs:
50
+ bs.append(b)
51
+ cs.append(2**8 + n)
52
+ n += 1
53
+ cs = [chr(n) for n in cs]
54
+ return dict(zip(bs, cs))
55
+
56
+
57
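# NOTE (editor): quick demonstration -- not part of the diff -- of the byte-to-unicode
# table built by `bytes_to_unicode` above: printable bytes map to themselves, while
# bytes the BPE merges would choke on (space, newline, ...) are shifted to unused code points.
byte_encoder = bytes_to_unicode()
assert byte_encoder[ord("a")] == "a"         # printable ASCII is kept as-is
assert byte_encoder[ord(" ")] == "\u0120"    # space   -> 'Ġ' (chr(256 + 32))
assert byte_encoder[ord("\n")] == "\u010a"   # newline -> 'Ċ' (chr(256 + 10))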
+ # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
58
+ def get_pairs(word):
59
+ """
60
+ Return set of symbol pairs in a word.
61
+
62
+ Word is represented as tuple of symbols (symbols being variable-length strings).
63
+ """
64
+ pairs = set()
65
+ prev_char = word[0]
66
+ for char in word[1:]:
67
+ pairs.add((prev_char, char))
68
+ prev_char = char
69
+ return pairs
70
+
71
+
72
+ class Qwen2Tokenizer(PreTrainedTokenizer):
73
+ """
74
+ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
75
+
76
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
77
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
78
+
79
+ ```python
80
+ >>> from transformers import Qwen2Tokenizer
81
+
82
+ >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
83
+ >>> tokenizer("Hello world")["input_ids"]
84
+ [9707, 1879]
85
+
86
+ >>> tokenizer(" Hello world")["input_ids"]
87
+ [21927, 1879]
88
+ ```
89
+ This is expected.
90
+
91
+ You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
92
+
93
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
94
+ this superclass for more information regarding those methods.
95
+
96
+ Args:
97
+ vocab_file (`str`):
98
+ Path to the vocabulary file.
99
+ merges_file (`str`):
100
+ Path to the merges file.
101
+ errors (`str`, *optional*, defaults to `"replace"`):
102
+ Paradigm to follow when decoding bytes to UTF-8. See
103
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
104
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
105
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
106
+ token instead.
107
+ bos_token (`str`, *optional*):
108
+ The beginning of sequence token. Not applicable for this tokenizer.
109
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
110
+ The end of sequence token.
111
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
112
+ The token used for padding, for example when batching sequences of different lengths.
113
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
114
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
115
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
116
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
117
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
118
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
119
+ ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
120
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
121
+ """
122
+
123
+ vocab_files_names = VOCAB_FILES_NAMES
124
+ model_input_names = ["input_ids", "attention_mask"]
125
+
126
+ def __init__(
127
+ self,
128
+ vocab_file,
129
+ merges_file,
130
+ errors="replace",
131
+ unk_token="<|endoftext|>",
132
+ bos_token=None,
133
+ eos_token="<|endoftext|>",
134
+ pad_token="<|endoftext|>",
135
+ clean_up_tokenization_spaces=False,
136
+ split_special_tokens=False,
137
+ **kwargs,
138
+ ):
139
+ # Qwen vocab does not contain control tokens; added tokens need to be special
140
+ bos_token = (
141
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
142
+ if isinstance(bos_token, str)
143
+ else bos_token
144
+ )
145
+ eos_token = (
146
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
147
+ if isinstance(eos_token, str)
148
+ else eos_token
149
+ )
150
+ unk_token = (
151
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
152
+ if isinstance(unk_token, str)
153
+ else unk_token
154
+ )
155
+ pad_token = (
156
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
157
+ if isinstance(pad_token, str)
158
+ else pad_token
159
+ )
160
+
161
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
162
+ self.encoder = json.load(vocab_handle)
163
+ self.decoder = {v: k for k, v in self.encoder.items()}
164
+ self.errors = errors # how to handle errors in decoding
165
+ self.byte_encoder = bytes_to_unicode()
166
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
167
+ bpe_merges = []
168
+ with open(merges_file, encoding="utf-8") as merges_handle:
169
+ for i, line in enumerate(merges_handle):
170
+ line = line.strip()
171
+ if (i == 0 and line.startswith("#version:")) or not line:
172
+ continue
173
+ bpe_merges.append(tuple(line.split()))
174
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
175
+ # NOTE: the cache can grow without bound and will get really large for long running processes
176
+ # (esp. for texts in languages that do not use spaces between words, e.g. Chinese); technically
177
+ # not a memory leak but appears as one.
178
+ # GPT2Tokenizer has the same problem, so let's be consistent.
179
+ self.cache = {}
180
+
181
+ self.pat = re.compile(PRETOKENIZE_REGEX)
182
+
183
+ if kwargs.get("add_prefix_space", False):
184
+ logger.warning_once(
185
+ f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
186
+ )
187
+
188
+ super().__init__(
189
+ errors=errors,
190
+ bos_token=bos_token,
191
+ eos_token=eos_token,
192
+ pad_token=pad_token,
193
+ unk_token=unk_token,
194
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
195
+ split_special_tokens=split_special_tokens,
196
+ **kwargs,
197
+ )
198
+
199
+ @property
200
+ def vocab_size(self) -> int:
201
+ return len(self.encoder)
202
+
203
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
204
+ def get_vocab(self):
205
+ return dict(self.encoder, **self.added_tokens_encoder)
206
+
207
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
208
+ def bpe(self, token):
209
+ if token in self.cache:
210
+ return self.cache[token]
211
+ word = tuple(token)
212
+ pairs = get_pairs(word)
213
+
214
+ if not pairs:
215
+ return token
216
+
217
+ while True:
218
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
219
+ if bigram not in self.bpe_ranks:
220
+ break
221
+ first, second = bigram
222
+ new_word = []
223
+ i = 0
224
+ while i < len(word):
225
+ try:
226
+ j = word.index(first, i)
227
+ except ValueError:
228
+ new_word.extend(word[i:])
229
+ break
230
+ else:
231
+ new_word.extend(word[i:j])
232
+ i = j
233
+
234
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
235
+ new_word.append(first + second)
236
+ i += 2
237
+ else:
238
+ new_word.append(word[i])
239
+ i += 1
240
+ new_word = tuple(new_word)
241
+ word = new_word
242
+ if len(word) == 1:
243
+ break
244
+ else:
245
+ pairs = get_pairs(word)
246
+ word = " ".join(word)
247
+ self.cache[token] = word
248
+ return word
249
+
250
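# NOTE (editor): toy walk-through -- not part of the diff -- of the merge loop in `bpe`
# above, using a hypothetical two-rule merge table (lower rank = merged earlier) and the
# `get_pairs` helper defined earlier in this file.
toy_ranks = {("l", "o"): 0, ("lo", "w"): 1}

word = ("l", "o", "w")
pairs = get_pairs(word)                               # {("l", "o"), ("o", "w")}
best = min(pairs, key=lambda p: toy_ranks.get(p, float("inf")))
assert best == ("l", "o")                             # rank 0 wins over the unranked pair
# Applying the two merges in rank order turns ("l", "o", "w") -> ("lo", "w") -> ("low",),
# which is what `bpe("low")` would return (as the string "low") if `self.bpe_ranks`
# were this toy table.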
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
251
+ def _tokenize(self, text):
252
+ """Tokenize a string."""
253
+ bpe_tokens = []
254
+ for token in re.findall(self.pat, text):
255
+ token = "".join(
256
+ self.byte_encoder[b] for b in token.encode("utf-8")
257
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
258
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
259
+ return bpe_tokens
260
+
261
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
262
+ def _convert_token_to_id(self, token):
263
+ """Converts a token (str) in an id using the vocab."""
264
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
265
+
266
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
267
+ def _convert_id_to_token(self, index):
268
+ """Converts an index (integer) in a token (str) using the vocab."""
269
+ return self.decoder.get(index)
270
+
271
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
272
+ def convert_tokens_to_string(self, tokens):
273
+ """Converts a sequence of tokens (string) in a single string."""
274
+ text = "".join(tokens)
275
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
276
+ return text
277
+
278
+ def decode(
279
+ self,
280
+ token_ids,
281
+ skip_special_tokens: bool = False,
282
+ clean_up_tokenization_spaces: Optional[bool] = False,
283
+ spaces_between_special_tokens: bool = False,
284
+ **kwargs,
285
+ ) -> str:
286
+ # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
287
+ # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
288
+ return super().decode(
289
+ token_ids,
290
+ skip_special_tokens=skip_special_tokens,
291
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
292
+ spaces_between_special_tokens=spaces_between_special_tokens,
293
+ **kwargs,
294
+ )
295
+
296
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
297
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
298
+ if not os.path.isdir(save_directory):
299
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
300
+ return
301
+ vocab_file = os.path.join(
302
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
303
+ )
304
+ merge_file = os.path.join(
305
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
306
+ )
307
+
308
+ with open(vocab_file, "w", encoding="utf-8") as f:
309
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
310
+
311
+ index = 0
312
+ with open(merge_file, "w", encoding="utf-8") as writer:
313
+ writer.write("#version: 0.2\n")
314
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
315
+ if index != token_index:
316
+ logger.warning(
317
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
318
+ " Please check that the tokenizer is not corrupted!"
319
+ )
320
+ index = token_index
321
+ writer.write(" ".join(bpe_tokens) + "\n")
322
+ index += 1
323
+
324
+ return vocab_file, merge_file
325
+
326
+ def prepare_for_tokenization(self, text, **kwargs):
327
+ text = unicodedata.normalize("NFC", text)
328
+ return (text, kwargs)
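# NOTE (editor): small illustration -- not part of the diff -- of why
# prepare_for_tokenization applies NFC: visually identical strings with different
# code-point sequences are folded to one canonical form before BPE runs.
import unicodedata

decomposed = "e\u0301"                      # 'e' + combining acute accent (2 code points)
composed = unicodedata.normalize("NFC", decomposed)
assert composed == "\u00e9" == "é"          # single precomposed code point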
modeling/qwen2/tokenization_qwen2_fast.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Tokenization classes for Qwen2."""
5
+
6
+ from typing import Optional, Tuple
7
+
8
+ from transformers.tokenization_utils import AddedToken
9
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
10
+ from transformers.utils import logging
11
+ from .tokenization_qwen2 import Qwen2Tokenizer
12
+
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+ VOCAB_FILES_NAMES = {
17
+ "vocab_file": "vocab.json",
18
+ "merges_file": "merges.txt",
19
+ "tokenizer_file": "tokenizer.json",
20
+ }
21
+
22
+
23
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
24
+
25
+
26
+ class Qwen2TokenizerFast(PreTrainedTokenizerFast):
27
+ """
28
+ Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
29
+ Byte-Pair-Encoding.
30
+
31
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
32
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
33
+
34
+ ```python
35
+ >>> from transformers import Qwen2TokenizerFast
36
+
37
+ >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
38
+ >>> tokenizer("Hello world")["input_ids"]
39
+ [9707, 1879]
40
+
41
+ >>> tokenizer(" Hello world")["input_ids"]
42
+ [21927, 1879]
43
+ ```
44
+ This is expected.
45
+
46
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
47
+ refer to this superclass for more information regarding those methods.
48
+
49
+ Args:
50
+ vocab_file (`str`, *optional*):
51
+ Path to the vocabulary file.
52
+ merges_file (`str`, *optional*):
53
+ Path to the merges file.
54
+ tokenizer_file (`str`, *optional*):
55
+ Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
56
+ contains everything needed to load the tokenizer.
57
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
58
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
59
+ token instead. Not applicable to this tokenizer.
60
+ bos_token (`str`, *optional*):
61
+ The beginning of sequence token. Not applicable for this tokenizer.
62
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
63
+ The end of sequence token.
64
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
65
+ The token used for padding, for example when batching sequences of different lengths.
66
+ """
67
+
68
+ vocab_files_names = VOCAB_FILES_NAMES
69
+ model_input_names = ["input_ids", "attention_mask"]
70
+ slow_tokenizer_class = Qwen2Tokenizer
71
+
72
+ def __init__(
73
+ self,
74
+ vocab_file=None,
75
+ merges_file=None,
76
+ tokenizer_file=None,
77
+ unk_token="<|endoftext|>",
78
+ bos_token=None,
79
+ eos_token="<|endoftext|>",
80
+ pad_token="<|endoftext|>",
81
+ **kwargs,
82
+ ):
83
+ # We need to at least pass vocab_file and merges_file to base class
84
+ # in case a slow tokenizer needs to be initialized; others can be
85
+ # configured through files.
86
+ # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token
87
+
88
+ bos_token = (
89
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
90
+ if isinstance(bos_token, str)
91
+ else bos_token
92
+ )
93
+ eos_token = (
94
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
95
+ if isinstance(eos_token, str)
96
+ else eos_token
97
+ )
98
+ unk_token = (
99
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
100
+ if isinstance(unk_token, str)
101
+ else unk_token
102
+ )
103
+ pad_token = (
104
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
105
+ if isinstance(pad_token, str)
106
+ else pad_token
107
+ )
108
+
109
+ super().__init__(
110
+ vocab_file=vocab_file,
111
+ merges_file=merges_file,
112
+ tokenizer_file=tokenizer_file,
113
+ unk_token=unk_token,
114
+ bos_token=bos_token,
115
+ eos_token=eos_token,
116
+ pad_token=pad_token,
117
+ **kwargs,
118
+ )
119
+
120
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
121
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
122
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
123
+ return tuple(files)
modeling/siglip/__init__.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import TYPE_CHECKING
5
+
6
+ from transformers.utils import (
7
+ OptionalDependencyNotAvailable,
8
+ _LazyModule,
9
+ is_sentencepiece_available,
10
+ is_torch_available,
11
+ is_vision_available,
12
+ )
13
+
14
+
15
+ _import_structure = {
16
+ "configuration_siglip": [
17
+ "SiglipConfig",
18
+ "SiglipTextConfig",
19
+ "SiglipVisionConfig",
20
+ ],
21
+ "processing_siglip": ["SiglipProcessor"],
22
+ }
23
+
24
+ try:
25
+ if not is_sentencepiece_available():
26
+ raise OptionalDependencyNotAvailable()
27
+ except OptionalDependencyNotAvailable:
28
+ pass
29
+ else:
30
+ _import_structure["tokenization_siglip"] = ["SiglipTokenizer"]
31
+
32
+
33
+ try:
34
+ if not is_vision_available():
35
+ raise OptionalDependencyNotAvailable()
36
+ except OptionalDependencyNotAvailable:
37
+ pass
38
+ else:
39
+ _import_structure["image_processing_siglip"] = ["SiglipImageProcessor"]
40
+
41
+ try:
42
+ if not is_torch_available():
43
+ raise OptionalDependencyNotAvailable()
44
+ except OptionalDependencyNotAvailable:
45
+ pass
46
+ else:
47
+ _import_structure["modeling_siglip"] = [
48
+ "SiglipModel",
49
+ "SiglipPreTrainedModel",
50
+ "SiglipTextModel",
51
+ "SiglipVisionModel",
52
+ "SiglipForImageClassification",
53
+ ]
54
+
55
+
56
+ if TYPE_CHECKING:
57
+ from .configuration_siglip import (
58
+ SiglipConfig,
59
+ SiglipTextConfig,
60
+ SiglipVisionConfig,
61
+ )
62
+ from .processing_siglip import SiglipProcessor
63
+
64
+ try:
65
+ if not is_sentencepiece_available():
66
+ raise OptionalDependencyNotAvailable()
67
+ except OptionalDependencyNotAvailable:
68
+ pass
69
+ else:
70
+ from .tokenization_siglip import SiglipTokenizer
71
+
72
+ try:
73
+ if not is_vision_available():
74
+ raise OptionalDependencyNotAvailable()
75
+ except OptionalDependencyNotAvailable:
76
+ pass
77
+ else:
78
+ from .image_processing_siglip import SiglipImageProcessor
79
+
80
+ try:
81
+ if not is_torch_available():
82
+ raise OptionalDependencyNotAvailable()
83
+ except OptionalDependencyNotAvailable:
84
+ pass
85
+ else:
86
+ from .modeling_siglip import (
87
+ SiglipForImageClassification,
88
+ SiglipModel,
89
+ SiglipPreTrainedModel,
90
+ SiglipTextModel,
91
+ SiglipVisionModel,
92
+ )
93
+
94
+
95
+ else:
96
+ import sys
97
+
98
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
modeling/siglip/configuration_siglip.py ADDED
@@ -0,0 +1,287 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Siglip model configuration"""
5
+
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class SiglipTextConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
19
+ Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
20
+ configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
21
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
22
+
23
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
24
+ documentation from [`PretrainedConfig`] for more information.
25
+
26
+ Args:
27
+ vocab_size (`int`, *optional*, defaults to 32000):
28
+ Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
29
+ the `input_ids` passed when calling [`SiglipModel`].
30
+ hidden_size (`int`, *optional*, defaults to 768):
31
+ Dimensionality of the encoder layers and the pooler layer.
32
+ intermediate_size (`int`, *optional*, defaults to 3072):
33
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
34
+ num_hidden_layers (`int`, *optional*, defaults to 12):
35
+ Number of hidden layers in the Transformer encoder.
36
+ num_attention_heads (`int`, *optional*, defaults to 12):
37
+ Number of attention heads for each attention layer in the Transformer encoder.
38
+ max_position_embeddings (`int`, *optional*, defaults to 64):
39
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
40
+ just in case (e.g., 512 or 1024 or 2048).
41
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
42
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
43
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
44
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
45
+ The epsilon used by the layer normalization layers.
46
+ attention_dropout (`float`, *optional*, defaults to 0.0):
47
+ The dropout ratio for the attention probabilities.
48
+ pad_token_id (`int`, *optional*, defaults to 1):
49
+ The id of the padding token in the vocabulary.
50
+ bos_token_id (`int`, *optional*, defaults to 49406):
51
+ The id of the beginning-of-sequence token in the vocabulary.
52
+ eos_token_id (`int`, *optional*, defaults to 49407):
53
+ The id of the end-of-sequence token in the vocabulary.
54
+
55
+ Example:
56
+
57
+ ```python
58
+ >>> from transformers import SiglipTextConfig, SiglipTextModel
59
+
60
+ >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
61
+ >>> configuration = SiglipTextConfig()
62
+
63
+ >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
64
+ >>> model = SiglipTextModel(configuration)
65
+
66
+ >>> # Accessing the model configuration
67
+ >>> configuration = model.config
68
+ ```"""
69
+
70
+ model_type = "siglip_text_model"
71
+
72
+ def __init__(
73
+ self,
74
+ vocab_size=32000,
75
+ hidden_size=768,
76
+ intermediate_size=3072,
77
+ num_hidden_layers=12,
78
+ num_attention_heads=12,
79
+ max_position_embeddings=64,
80
+ hidden_act="gelu_pytorch_tanh",
81
+ layer_norm_eps=1e-6,
82
+ attention_dropout=0.0,
83
+ # This differs from `CLIPTokenizer`'s default and from openai/siglip
84
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
85
+ pad_token_id=1,
86
+ bos_token_id=49406,
87
+ eos_token_id=49407,
88
+ **kwargs,
89
+ ):
90
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
91
+
92
+ self.vocab_size = vocab_size
93
+ self.hidden_size = hidden_size
94
+ self.intermediate_size = intermediate_size
95
+ self.num_hidden_layers = num_hidden_layers
96
+ self.num_attention_heads = num_attention_heads
97
+ self.max_position_embeddings = max_position_embeddings
98
+ self.layer_norm_eps = layer_norm_eps
99
+ self.hidden_act = hidden_act
100
+ self.attention_dropout = attention_dropout
101
+
102
+ @classmethod
103
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
104
+ cls._set_token_in_kwargs(kwargs)
105
+
106
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
107
+
108
+ # get the text config dict if we are loading from SiglipConfig
109
+ if config_dict.get("model_type") == "siglip":
110
+ config_dict = config_dict["text_config"]
111
+
112
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
113
+ logger.warning(
114
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
115
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
116
+ )
117
+
118
+ return cls.from_dict(config_dict, **kwargs)
119
+
120
+
121
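# NOTE (editor): hedged usage sketch, not part of the diff. As the branch above shows,
# calling from_pretrained on a checkpoint whose saved config is a full SiglipConfig
# (model_type == "siglip") transparently extracts the nested "text_config" section.
# Assumes the checkpoint id from the docstring above is reachable (network access required).
text_cfg = SiglipTextConfig.from_pretrained("google/siglip-base-patch16-224")
print(text_cfg.hidden_size, text_cfg.num_hidden_layers)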
+ class SiglipVisionConfig(PretrainedConfig):
122
+ r"""
123
+ This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
124
+ Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
125
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
126
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
127
+
128
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
129
+ documentation from [`PretrainedConfig`] for more information.
130
+
131
+ Args:
132
+ hidden_size (`int`, *optional*, defaults to 768):
133
+ Dimensionality of the encoder layers and the pooler layer.
134
+ intermediate_size (`int`, *optional*, defaults to 3072):
135
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
136
+ num_hidden_layers (`int`, *optional*, defaults to 12):
137
+ Number of hidden layers in the Transformer encoder.
138
+ num_attention_heads (`int`, *optional*, defaults to 12):
139
+ Number of attention heads for each attention layer in the Transformer encoder.
140
+ num_channels (`int`, *optional*, defaults to 3):
141
+ Number of channels in the input images.
142
+ image_size (`int`, *optional*, defaults to 224):
143
+ The size (resolution) of each image.
144
+ patch_size (`int`, *optional*, defaults to 16):
145
+ The size (resolution) of each patch.
146
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
147
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
148
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
149
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
150
+ The epsilon used by the layer normalization layers.
151
+ attention_dropout (`float`, *optional*, defaults to 0.0):
152
+ The dropout ratio for the attention probabilities.
153
+
154
+ Example:
155
+
156
+ ```python
157
+ >>> from transformers import SiglipVisionConfig, SiglipVisionModel
158
+
159
+ >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
160
+ >>> configuration = SiglipVisionConfig()
161
+
162
+ >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
163
+ >>> model = SiglipVisionModel(configuration)
164
+
165
+ >>> # Accessing the model configuration
166
+ >>> configuration = model.config
167
+ ```"""
168
+
169
+ model_type = "siglip_vision_model"
170
+
171
+ def __init__(
172
+ self,
173
+ hidden_size=768,
174
+ intermediate_size=3072,
175
+ num_hidden_layers=12,
176
+ num_attention_heads=12,
177
+ num_channels=3,
178
+ image_size=224,
179
+ patch_size=16,
180
+ hidden_act="gelu_pytorch_tanh",
181
+ layer_norm_eps=1e-6,
182
+ attention_dropout=0.0,
183
+ **kwargs,
184
+ ):
185
+ super().__init__(**kwargs)
186
+
187
+ self.hidden_size = hidden_size
188
+ self.intermediate_size = intermediate_size
189
+ self.num_hidden_layers = num_hidden_layers
190
+ self.num_attention_heads = num_attention_heads
191
+ self.num_channels = num_channels
192
+ self.patch_size = patch_size
193
+ self.image_size = image_size
194
+ self.attention_dropout = attention_dropout
195
+ self.layer_norm_eps = layer_norm_eps
196
+ self.hidden_act = hidden_act
197
+
198
+ @classmethod
199
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
200
+ cls._set_token_in_kwargs(kwargs)
201
+
202
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
203
+
204
+ # get the vision config dict if we are loading from SiglipConfig
205
+ if config_dict.get("model_type") == "siglip":
206
+ config_dict = config_dict["vision_config"]
207
+
208
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
209
+ logger.warning(
210
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
211
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
212
+ )
213
+
214
+ return cls.from_dict(config_dict, **kwargs)
215
+
216
+
217
+ class SiglipConfig(PretrainedConfig):
218
+ r"""
219
+ [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
220
+ instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
221
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
222
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
223
+
224
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
225
+ documentation from [`PretrainedConfig`] for more information.
226
+
227
+ Args:
228
+ text_config (`dict`, *optional*):
229
+ Dictionary of configuration options used to initialize [`SiglipTextConfig`].
230
+ vision_config (`dict`, *optional*):
231
+ Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
232
+ kwargs (*optional*):
233
+ Dictionary of keyword arguments.
234
+
235
+ Example:
236
+
237
+ ```python
238
+ >>> from transformers import SiglipConfig, SiglipModel
239
+
240
+ >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
241
+ >>> configuration = SiglipConfig()
242
+
243
+ >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
244
+ >>> model = SiglipModel(configuration)
245
+
246
+ >>> # Accessing the model configuration
247
+ >>> configuration = model.config
248
+
249
+ >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
250
+ >>> from transformers import SiglipTextConfig, SiglipVisionConfig
251
+
252
+ >>> # Initializing a SiglipText and SiglipVision configuration
253
+ >>> config_text = SiglipTextConfig()
254
+ >>> config_vision = SiglipVisionConfig()
255
+
256
+ >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
257
+ ```"""
258
+
259
+ model_type = "siglip"
260
+
261
+ def __init__(self, text_config=None, vision_config=None, **kwargs):
262
+ super().__init__(**kwargs)
263
+
264
+ if text_config is None:
265
+ text_config = {}
266
+ logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
267
+
268
+ if vision_config is None:
269
+ vision_config = {}
270
+ logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
271
+
272
+ self.text_config = SiglipTextConfig(**text_config)
273
+ self.vision_config = SiglipVisionConfig(**vision_config)
274
+
275
+ self.initializer_factor = 1.0
276
+
277
+ @classmethod
278
+ def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
279
+ r"""
280
+ Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
281
+ model configuration.
282
+
283
+ Returns:
284
+ [`SiglipConfig`]: An instance of a configuration object
285
+ """
286
+
287
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
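A minimal usage sketch of how the three configuration classes above fit together; it assumes the installed `transformers` package exposes the same SigLIP configuration API as the vendored `configuration_siglip.py` in this commit.

```python
# Compose a combined SiglipConfig from its two sub-configs and read them back.
# Assumption: the public transformers SigLIP classes behave like the vendored
# copies above (from_text_vision_configs stores typed sub-configs).
from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig

text_config = SiglipTextConfig(vocab_size=32000, hidden_size=768)
vision_config = SiglipVisionConfig(image_size=384, patch_size=16)

config = SiglipConfig.from_text_vision_configs(text_config, vision_config)

# The combined config keeps typed sub-configs, which is exactly what the two
# `from_pretrained` overrides above pull back out of a full "siglip" config dict.
assert config.vision_config.image_size == 384
assert config.text_config.hidden_size == 768
```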
modeling/siglip/convert_siglip_to_hf.py ADDED
@@ -0,0 +1,401 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Convert SigLIP checkpoints from the original repository.
5
+
6
+ URL: https://github.com/google-research/big_vision/tree/main
7
+ """
8
+
9
+ import argparse
10
+ import collections
11
+ from pathlib import Path
12
+
13
+ import numpy as np
14
+ import requests
15
+ import torch
16
+ from huggingface_hub import hf_hub_download
17
+ from numpy import load
18
+ from PIL import Image
19
+
20
+ from transformers import SiglipConfig, SiglipImageProcessor, SiglipModel, SiglipProcessor, SiglipTokenizer
21
+ from transformers.utils import logging
22
+
23
+
24
+ logging.set_verbosity_info()
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ model_name_to_checkpoint = {
29
+ # base checkpoints
30
+ "siglip-base-patch16-224": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_224_63724782.npz",
31
+ "siglip-base-patch16-256": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_256_60500360.npz",
32
+ "siglip-base-patch16-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_384_68578854.npz",
33
+ "siglip-base-patch16-512": "/Users/nielsrogge/Documents/SigLIP/webli_en_b16_512_68580893.npz",
34
+ # large checkpoints
35
+ "siglip-large-patch16-256": "/Users/nielsrogge/Documents/SigLIP/webli_en_l16_256_60552751.npz",
36
+ "siglip-large-patch16-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_l16_384_63634585.npz",
37
+ # multilingual checkpoint
38
+ "siglip-base-patch16-256-i18n": "/Users/nielsrogge/Documents/SigLIP/webli_i18n_b16_256_66117334.npz",
39
+ # so400m checkpoints
40
+ "siglip-so400m-patch14-384": "/Users/nielsrogge/Documents/SigLIP/webli_en_so400m_384_58765454.npz",
41
+ }
42
+
43
+ model_name_to_image_size = {
44
+ "siglip-base-patch16-224": 224,
45
+ "siglip-base-patch16-256": 256,
46
+ "siglip-base-patch16-384": 384,
47
+ "siglip-base-patch16-512": 512,
48
+ "siglip-large-patch16-256": 256,
49
+ "siglip-large-patch16-384": 384,
50
+ "siglip-base-patch16-256-i18n": 256,
51
+ "siglip-so400m-patch14-384": 384,
52
+ }
53
+
54
+
55
+ def get_siglip_config(model_name):
56
+ config = SiglipConfig()
57
+
58
+ vocab_size = 250000 if "i18n" in model_name else 32000
59
+ image_size = model_name_to_image_size[model_name]
60
+ patch_size = 16 if "patch16" in model_name else 14
61
+
62
+ # size of the architecture
63
+ config.vision_config.image_size = image_size
64
+ config.vision_config.patch_size = patch_size
65
+ config.text_config.vocab_size = vocab_size
66
+
67
+ if "base" in model_name:
68
+ pass
69
+ elif "large" in model_name:
70
+ config.text_config.hidden_size = 1024
71
+ config.text_config.intermediate_size = 4096
72
+ config.text_config.num_hidden_layers = 24
73
+ config.text_config.num_attention_heads = 16
74
+ config.vision_config.hidden_size = 1024
75
+ config.vision_config.intermediate_size = 4096
76
+ config.vision_config.num_hidden_layers = 24
77
+ config.vision_config.num_attention_heads = 16
78
+ elif "so400m" in model_name:
79
+ config.text_config.hidden_size = 1152
80
+ config.text_config.intermediate_size = 4304
81
+ config.text_config.num_hidden_layers = 27
82
+ config.text_config.num_attention_heads = 16
83
+ config.vision_config.hidden_size = 1152
84
+ config.vision_config.intermediate_size = 4304
85
+ config.vision_config.num_hidden_layers = 27
86
+ config.vision_config.num_attention_heads = 16
87
+ else:
88
+ raise ValueError("Model not supported")
89
+
90
+ return config
91
+
92
+
93
+ def create_rename_keys(config):
94
+ rename_keys = []
95
+ # fmt: off
96
+
97
+ # vision encoder
98
+
99
+ rename_keys.append(("params/img/embedding/kernel", "vision_model.embeddings.patch_embedding.weight"))
100
+ rename_keys.append(("params/img/embedding/bias", "vision_model.embeddings.patch_embedding.bias"))
101
+ rename_keys.append(("params/img/pos_embedding", "vision_model.embeddings.position_embedding.weight"))
102
+
103
+ for i in range(config.vision_config.num_hidden_layers):
104
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/scale", f"vision_model.encoder.layers.{i}.layer_norm1.weight"))
105
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_0/bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias"))
106
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/scale", f"vision_model.encoder.layers.{i}.layer_norm2.weight"))
107
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/LayerNorm_1/bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias"))
108
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"vision_model.encoder.layers.{i}.mlp.fc1.weight"))
109
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias"))
110
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"vision_model.encoder.layers.{i}.mlp.fc2.weight"))
111
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias"))
112
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight"))
113
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias"))
114
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight"))
115
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias"))
116
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight"))
117
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias"))
118
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight"))
119
+ rename_keys.append((f"params/img/Transformer/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias"))
120
+
121
+ rename_keys.append(("params/img/Transformer/encoder_norm/scale", "vision_model.post_layernorm.weight"))
122
+ rename_keys.append(("params/img/Transformer/encoder_norm/bias", "vision_model.post_layernorm.bias"))
123
+
124
+ rename_keys.append(("params/img/MAPHead_0/probe", "vision_model.head.probe"))
125
+ rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/scale", "vision_model.head.layernorm.weight"))
126
+ rename_keys.append(("params/img/MAPHead_0/LayerNorm_0/bias", "vision_model.head.layernorm.bias"))
127
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/kernel", "vision_model.head.mlp.fc1.weight"))
128
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_0/bias", "vision_model.head.mlp.fc1.bias"))
129
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/kernel", "vision_model.head.mlp.fc2.weight"))
130
+ rename_keys.append(("params/img/MAPHead_0/MlpBlock_0/Dense_1/bias", "vision_model.head.mlp.fc2.bias"))
131
+ rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/kernel", "vision_model.head.attention.out_proj.weight"))
132
+ rename_keys.append(("params/img/MAPHead_0/MultiHeadDotProductAttention_0/out/bias", "vision_model.head.attention.out_proj.bias"))
133
+
134
+ # text encoder
135
+
136
+ rename_keys.append(("params/txt/Embed_0/embedding", "text_model.embeddings.token_embedding.weight"))
137
+ rename_keys.append(("params/txt/pos_embedding", "text_model.embeddings.position_embedding.weight"))
138
+
139
+ for i in range(config.text_config.num_hidden_layers):
140
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/scale", f"text_model.encoder.layers.{i}.layer_norm1.weight"))
141
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_0/bias", f"text_model.encoder.layers.{i}.layer_norm1.bias"))
142
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/scale", f"text_model.encoder.layers.{i}.layer_norm2.weight"))
143
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/LayerNorm_1/bias", f"text_model.encoder.layers.{i}.layer_norm2.bias"))
144
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/kernel", f"text_model.encoder.layers.{i}.mlp.fc1.weight"))
145
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_0/bias", f"text_model.encoder.layers.{i}.mlp.fc1.bias"))
146
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/kernel", f"text_model.encoder.layers.{i}.mlp.fc2.weight"))
147
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MlpBlock_0/Dense_1/bias", f"text_model.encoder.layers.{i}.mlp.fc2.bias"))
148
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/kernel", f"text_model.encoder.layers.{i}.self_attn.k_proj.weight"))
149
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/key/bias", f"text_model.encoder.layers.{i}.self_attn.k_proj.bias"))
150
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/kernel", f"text_model.encoder.layers.{i}.self_attn.v_proj.weight"))
151
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/value/bias", f"text_model.encoder.layers.{i}.self_attn.v_proj.bias"))
152
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/kernel", f"text_model.encoder.layers.{i}.self_attn.q_proj.weight"))
153
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/query/bias", f"text_model.encoder.layers.{i}.self_attn.q_proj.bias"))
154
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/kernel", f"text_model.encoder.layers.{i}.self_attn.out_proj.weight"))
155
+ rename_keys.append((f"params/txt/Encoder_0/encoderblock_{i}/MultiHeadDotProductAttention_0/out/bias", f"text_model.encoder.layers.{i}.self_attn.out_proj.bias"))
156
+
157
+ rename_keys.append(("params/txt/Encoder_0/encoder_norm/scale", "text_model.final_layer_norm.weight"))
158
+ rename_keys.append(("params/txt/Encoder_0/encoder_norm/bias", "text_model.final_layer_norm.bias"))
159
+ rename_keys.append(("params/txt/head/kernel", "text_model.head.weight"))
160
+ rename_keys.append(("params/txt/head/bias", "text_model.head.bias"))
161
+
162
+ # learned temperature and bias
163
+ rename_keys.append(("params/t", "logit_scale"))
164
+ rename_keys.append(("params/b", "logit_bias"))
165
+
166
+ # fmt: on
167
+ return rename_keys
168
+
169
+
170
+ def rename_key(dct, old, new, config):
171
+ val = dct.pop(old)
172
+
173
+ if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
174
+ val = val.reshape(-1, config.vision_config.hidden_size)
175
+ if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
176
+ val = val.reshape(-1, config.text_config.hidden_size)
177
+
178
+ if "patch_embedding.weight" in new:
179
+ val = val.transpose(3, 2, 0, 1)
180
+ elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
181
+ val = val.T
182
+
183
+ if "position_embedding" in new and "vision" in new:
184
+ val = val.reshape(-1, config.vision_config.hidden_size)
185
+ if "position_embedding" in new and "text" in new:
186
+ val = val.reshape(-1, config.text_config.hidden_size)
187
+
188
+ if new.endswith("bias"):
189
+ val = val.reshape(-1)
190
+
191
+ dct[new] = torch.from_numpy(val)
192
+
193
+
194
+ def read_in_q_k_v_head(state_dict, config):
195
+ # read in individual input projection layers
196
+ key_proj_weight = (
197
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/kernel")
198
+ .reshape(-1, config.vision_config.hidden_size)
199
+ .T
200
+ )
201
+ key_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/key/bias").reshape(-1)
202
+ value_proj_weight = (
203
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/kernel")
204
+ .reshape(-1, config.vision_config.hidden_size)
205
+ .T
206
+ )
207
+ value_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/value/bias").reshape(-1)
208
+ query_proj_weight = (
209
+ state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/kernel")
210
+ .reshape(-1, config.vision_config.hidden_size)
211
+ .T
212
+ )
213
+ query_proj_bias = state_dict.pop("params/img/MAPHead_0/MultiHeadDotProductAttention_0/query/bias").reshape(-1)
214
+
215
+ # next, add them to the state dict as a single matrix + vector
216
+ state_dict["vision_model.head.attention.in_proj_weight"] = torch.from_numpy(
217
+ np.concatenate([query_proj_weight, key_proj_weight, value_proj_weight], axis=0)
218
+ )
219
+ state_dict["vision_model.head.attention.in_proj_bias"] = torch.from_numpy(
220
+ np.concatenate([query_proj_bias, key_proj_bias, value_proj_bias], axis=0)
221
+ )
222
+
223
+
224
+ # We will verify our results on an image of cute cats
225
+ def prepare_img():
226
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
227
+ image = Image.open(requests.get(url, stream=True).raw)
228
+ return image
229
+
230
+
231
+ def flatten_nested_dict(params, parent_key="", sep="/"):
232
+ items = []
233
+
234
+ for k, v in params.items():
235
+ new_key = parent_key + sep + k if parent_key else k
236
+
237
+ if isinstance(v, collections.abc.MutableMapping):
238
+ items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
239
+ else:
240
+ items.append((new_key, v))
241
+ return dict(items)
242
+
243
+
244
+ @torch.no_grad()
245
+ def convert_siglip_checkpoint(model_name, pytorch_dump_folder_path, verify_logits=True, push_to_hub=False):
246
+ """
247
+ Copy/paste/tweak model's weights to our SigLIP structure.
248
+ """
249
+
250
+ # define default SigLIP configuration
251
+ config = get_siglip_config(model_name)
252
+
253
+ # get checkpoint
254
+ checkpoint = model_name_to_checkpoint[model_name]
255
+
256
+ # get vocab file
257
+ if "i18n" in model_name:
258
+ vocab_file = "/Users/nielsrogge/Documents/SigLIP/multilingual_vocab/sentencepiece.model"
259
+ else:
260
+ vocab_file = "/Users/nielsrogge/Documents/SigLIP/english_vocab/sentencepiece.model"
261
+
262
+ # load original state dict
263
+ data = load(checkpoint)
264
+ state_dict = flatten_nested_dict(data)
265
+
266
+ # remove and rename some keys
267
+ rename_keys = create_rename_keys(config)
268
+ for src, dest in rename_keys:
269
+ rename_key(state_dict, src, dest, config)
270
+
271
+ # qkv matrices of attention pooling head need special treatment
272
+ read_in_q_k_v_head(state_dict, config)
273
+
274
+ # load HuggingFace model
275
+ model = SiglipModel(config).eval()
276
+ model.load_state_dict(state_dict)
277
+
278
+ # create processor
279
+ # important: make tokenizer not return attention_mask since original one doesn't require it
280
+ image_size = config.vision_config.image_size
281
+ size = {"height": image_size, "width": image_size}
282
+ image_processor = SiglipImageProcessor(size=size)
283
+ tokenizer = SiglipTokenizer(vocab_file=vocab_file, model_input_names=["input_ids"])
284
+ processor = SiglipProcessor(image_processor=image_processor, tokenizer=tokenizer)
285
+
286
+ # verify on dummy images and texts
287
+ url_1 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg"
288
+ image_1 = Image.open(requests.get(url_1, stream=True).raw).convert("RGB")
289
+ url_2 = "https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg"
290
+ image_2 = Image.open(requests.get(url_2, stream=True).raw).convert("RGB")
291
+ texts = ["an apple", "a picture of an apple"]
292
+
293
+ inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt", padding="max_length")
294
+
295
+ # verify input_ids against original ones
296
+ if image_size == 224:
297
+ filename = "siglip_pixel_values.pt"
298
+ elif image_size == 256:
299
+ filename = "siglip_pixel_values_256.pt"
300
+ elif image_size == 384:
301
+ filename = "siglip_pixel_values_384.pt"
302
+ elif image_size == 512:
303
+ filename = "siglip_pixel_values_512.pt"
304
+ else:
305
+ raise ValueError("Image size not supported")
306
+
307
+ filepath = hf_hub_download(repo_id="nielsr/test-image", filename=filename, repo_type="dataset")
308
+ original_pixel_values = torch.load(filepath)
309
+ filepath = hf_hub_download(repo_id="nielsr/test-image", filename="siglip_input_ids.pt", repo_type="dataset")
310
+ original_input_ids = torch.load(filepath)
311
+
312
+ if "i18n" not in model_name:
313
+ assert inputs.input_ids.tolist() == original_input_ids.tolist()
314
+
315
+ print("Mean of original pixel values:", original_pixel_values.mean())
316
+ print("Mean of new pixel values:", inputs.pixel_values.mean())
317
+
318
+ # note: we're testing with original pixel values here since we don't have exact pixel values
319
+ with torch.no_grad():
320
+ outputs = model(input_ids=inputs.input_ids, pixel_values=original_pixel_values)
321
+
322
+ # with torch.no_grad():
323
+ # outputs = model(input_ids=inputs.input_ids, pixel_values=inputs.pixel_values)
324
+
325
+ print(outputs.logits_per_image[:3, :3])
326
+
327
+ probs = torch.sigmoid(outputs.logits_per_image) # these are the probabilities
328
+ print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
329
+ print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'")
330
+
331
+ if verify_logits:
332
+ if model_name == "siglip-base-patch16-224":
333
+ expected_slice = torch.tensor(
334
+ [[-2.9621, -2.1672], [-0.2713, 0.2910]],
335
+ )
336
+ elif model_name == "siglip-base-patch16-256":
337
+ expected_slice = torch.tensor(
338
+ [[-3.1146, -1.9894], [-0.7312, 0.6387]],
339
+ )
340
+ elif model_name == "siglip-base-patch16-384":
341
+ expected_slice = torch.tensor(
342
+ [[-2.8098, -2.1891], [-0.4242, 0.4102]],
343
+ )
344
+ elif model_name == "siglip-base-patch16-512":
345
+ expected_slice = torch.tensor(
346
+ [[-2.7899, -2.2668], [-0.4295, -0.0735]],
347
+ )
348
+ elif model_name == "siglip-large-patch16-256":
349
+ expected_slice = torch.tensor(
350
+ [[-1.5827, -0.5801], [-0.9153, 0.1363]],
351
+ )
352
+ elif model_name == "siglip-large-patch16-384":
353
+ expected_slice = torch.tensor(
354
+ [[-2.1523, -0.2899], [-0.2959, 0.7884]],
355
+ )
356
+ elif model_name == "siglip-so400m-patch14-384":
357
+ expected_slice = torch.tensor([[-1.2441, -0.6649], [-0.7060, 0.7374]])
358
+ elif model_name == "siglip-base-patch16-256-i18n":
359
+ expected_slice = torch.tensor(
360
+ [[-0.9064, 0.1073], [-0.0299, 0.5304]],
361
+ )
362
+
363
+ assert torch.allclose(outputs.logits_per_image[:3, :3], expected_slice, atol=1e-4)
364
+ print("Looks ok!")
365
+
366
+ if pytorch_dump_folder_path is not None:
367
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
368
+ print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
369
+ model.save_pretrained(pytorch_dump_folder_path)
370
+ print(f"Saving processor to {pytorch_dump_folder_path}")
371
+ processor.save_pretrained(pytorch_dump_folder_path)
372
+
373
+ if push_to_hub:
374
+ model.push_to_hub(f"nielsr/{model_name}")
375
+ processor.push_to_hub(f"nielsr/{model_name}")
376
+
377
+
378
+ if __name__ == "__main__":
379
+ parser = argparse.ArgumentParser()
380
+ # Required parameters
381
+ parser.add_argument(
382
+ "--model_name",
383
+ default="siglip-base-patch16-224",
384
+ type=str,
385
+ choices=model_name_to_checkpoint.keys(),
386
+ help="Name of the model you'd like to convert.",
387
+ )
388
+ parser.add_argument(
389
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
390
+ )
391
+ parser.add_argument(
392
+ "--verify_logits",
393
+ action="store_false",
394
+ help="Whether to verify logits against the original implementation. Note that the action is `store_false`, so passing this flag disables verification.",
395
+ )
396
+ parser.add_argument(
397
+ "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
398
+ )
399
+
400
+ args = parser.parse_args()
401
+ convert_siglip_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub)
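The conversion above hinges on `flatten_nested_dict`, which turns the nested big_vision parameter tree into the flat `params/...` keys that `create_rename_keys` expects. A standalone sketch on toy data (not a real checkpoint):

```python
# Toy illustration of the flatten_nested_dict helper defined in the script above;
# the nested dict stands in for a big_vision checkpoint tree.
import collections.abc


def flatten_nested_dict(params, parent_key="", sep="/"):
    items = []
    for k, v in params.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


tree = {"params": {"img": {"pos_embedding": 0.1, "embedding": {"kernel": 0.2}}}}
print(flatten_nested_dict(tree))
# {'params/img/pos_embedding': 0.1, 'params/img/embedding/kernel': 0.2}
```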
modeling/siglip/image_processing_siglip.py ADDED
@@ -0,0 +1,230 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Image processor class for SigLIP."""
5
+
6
+ from typing import Dict, List, Optional, Union
7
+
8
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
9
+ from transformers.image_transforms import (
10
+ convert_to_rgb,
11
+ resize,
12
+ to_channel_dimension_format,
13
+ )
14
+ from transformers.image_utils import (
15
+ IMAGENET_STANDARD_MEAN,
16
+ IMAGENET_STANDARD_STD,
17
+ ChannelDimension,
18
+ ImageInput,
19
+ PILImageResampling,
20
+ infer_channel_dimension_format,
21
+ is_scaled_image,
22
+ make_list_of_images,
23
+ to_numpy_array,
24
+ valid_images,
25
+ validate_preprocess_arguments,
26
+ )
27
+ from transformers.utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ if is_vision_available():
34
+ import PIL
35
+
36
+
37
+ class SiglipImageProcessor(BaseImageProcessor):
38
+ r"""
39
+ Constructs a SigLIP image processor.
40
+
41
+ Args:
42
+ do_resize (`bool`, *optional*, defaults to `True`):
43
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
44
+ `do_resize` in the `preprocess` method.
45
+ size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
46
+ Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
47
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
48
+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
49
+ do_rescale (`bool`, *optional*, defaults to `True`):
50
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
51
+ the `preprocess` method.
52
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
53
+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
54
+ method.
55
+ do_normalize (`bool`, *optional*, defaults to `True`):
56
+ Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
57
+ `do_normalize` in the `preprocess` method.
58
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
59
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
60
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
61
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
62
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
63
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
64
+ Can be overridden by the `image_std` parameter in the `preprocess` method.
66
+ Whether to convert the image to RGB.
67
+ """
68
+
69
+ model_input_names = ["pixel_values"]
70
+
71
+ def __init__(
72
+ self,
73
+ do_resize: bool = True,
74
+ size: Dict[str, int] = None,
75
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
76
+ do_rescale: bool = True,
77
+ rescale_factor: Union[int, float] = 1 / 255,
78
+ do_normalize: bool = True,
79
+ image_mean: Optional[Union[float, List[float]]] = None,
80
+ image_std: Optional[Union[float, List[float]]] = None,
81
+ do_convert_rgb: bool = None,
82
+ **kwargs,
83
+ ) -> None:
84
+ super().__init__(**kwargs)
85
+ size = size if size is not None else {"height": 224, "width": 224}
86
+ image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
87
+ image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
88
+
89
+ self.do_resize = do_resize
90
+ self.size = size
91
+ self.resample = resample
92
+ self.do_rescale = do_rescale
93
+ self.rescale_factor = rescale_factor
94
+ self.do_normalize = do_normalize
95
+ self.image_mean = image_mean
96
+ self.image_std = image_std
97
+ self.do_convert_rgb = do_convert_rgb
98
+
99
+ @filter_out_non_signature_kwargs()
100
+ def preprocess(
101
+ self,
102
+ images: ImageInput,
103
+ do_resize: bool = None,
104
+ size: Dict[str, int] = None,
105
+ resample: PILImageResampling = None,
106
+ do_rescale: bool = None,
107
+ rescale_factor: float = None,
108
+ do_normalize: bool = None,
109
+ image_mean: Optional[Union[float, List[float]]] = None,
110
+ image_std: Optional[Union[float, List[float]]] = None,
111
+ return_tensors: Optional[Union[str, TensorType]] = None,
112
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
113
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
114
+ do_convert_rgb: bool = None,
115
+ ) -> PIL.Image.Image:
116
+ """
117
+ Preprocess an image or batch of images.
118
+
119
+ Args:
120
+ images (`ImageInput`):
121
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
122
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
123
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
124
+ Whether to resize the image.
125
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
126
+ Size of the image after resizing.
127
+ resample (`int`, *optional*, defaults to `self.resample`):
128
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
129
+ has an effect if `do_resize` is set to `True`.
130
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
131
+ Whether to rescale the image.
132
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
133
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
134
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
135
+ Whether to normalize the image.
136
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
137
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
138
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
139
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
140
+ `True`.
141
+ return_tensors (`str` or `TensorType`, *optional*):
142
+ The type of tensors to return. Can be one of:
143
+ - Unset: Return a list of `np.ndarray`.
144
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
145
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
146
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
147
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
148
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
149
+ The channel dimension format for the output image. Can be one of:
150
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
151
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
152
+ - Unset: Use the channel dimension format of the input image.
153
+ input_data_format (`ChannelDimension` or `str`, *optional*):
154
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
155
+ from the input image. Can be one of:
156
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
157
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
158
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
159
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
160
+ Whether to convert the image to RGB.
161
+ """
162
+ do_resize = do_resize if do_resize is not None else self.do_resize
163
+ size = size if size is not None else self.size
164
+ size = get_size_dict(size, param_name="size", default_to_square=False)
165
+ resample = resample if resample is not None else self.resample
166
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
167
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
168
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
169
+ image_mean = image_mean if image_mean is not None else self.image_mean
170
+ image_std = image_std if image_std is not None else self.image_std
171
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
172
+
173
+ images = make_list_of_images(images)
174
+
175
+ if not valid_images(images):
176
+ raise ValueError(
177
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
178
+ "torch.Tensor, tf.Tensor or jax.ndarray."
179
+ )
180
+ validate_preprocess_arguments(
181
+ do_rescale=do_rescale,
182
+ rescale_factor=rescale_factor,
183
+ do_normalize=do_normalize,
184
+ image_mean=image_mean,
185
+ image_std=image_std,
186
+ do_resize=do_resize,
187
+ size=size,
188
+ resample=resample,
189
+ )
190
+ # All transformations expect numpy arrays.
191
+ images = [to_numpy_array(image) for image in images]
192
+
193
+ if do_convert_rgb:
194
+ images = [convert_to_rgb(image) for image in images]
195
+
196
+ if is_scaled_image(images[0]) and do_rescale:
197
+ logger.warning_once(
198
+ "It looks like you are trying to rescale already rescaled images. If the input"
199
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
200
+ )
201
+
202
+ if input_data_format is None:
203
+ # We assume that all images have the same channel dimension format.
204
+ input_data_format = infer_channel_dimension_format(images[0])
205
+
206
+ if do_resize:
207
+ height, width = size["height"], size["width"]
208
+ images = [
209
+ resize(image=image, size=(height, width), resample=resample, input_data_format=input_data_format)
210
+ for image in images
211
+ ]
212
+
213
+ if do_rescale:
214
+ images = [
215
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
216
+ for image in images
217
+ ]
218
+
219
+ if do_normalize:
220
+ images = [
221
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
222
+ for image in images
223
+ ]
224
+
225
+ images = [
226
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
227
+ ]
228
+
229
+ data = {"pixel_values": images}
230
+ return BatchFeature(data=data, tensor_type=return_tensors)
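A minimal usage sketch for the processor above; it assumes `transformers`, `torch`, and `Pillow` are installed and that the public `SiglipImageProcessor` matches this vendored copy. The output shape follows from the default 224x224 resize, 1/255 rescale, and 0.5 mean/std normalization documented in the class docstring.

```python
# Preprocess a single RGB image with the default SigLIP settings.
# Assumption: the installed transformers SiglipImageProcessor matches the
# vendored class above; a random image stands in for real data.
import numpy as np
from PIL import Image
from transformers import SiglipImageProcessor

processor = SiglipImageProcessor()  # defaults: 224x224, rescale 1/255, mean/std 0.5
image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

batch = processor(images=image, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```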
modeling/siglip/modeling_siglip.py ADDED
@@ -0,0 +1,1557 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """PyTorch Siglip model."""
5
+
6
+ import math
7
+ import warnings
8
+ from dataclasses import dataclass
9
+ from typing import Any, Optional, Tuple, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ import torch.utils.checkpoint
14
+ from torch import nn
15
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
16
+ from torch.nn.init import _calculate_fan_in_and_fan_out
17
+
18
+ from transformers.activations import ACT2FN
19
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
20
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers.utils import (
23
+ ModelOutput,
24
+ add_start_docstrings,
25
+ add_start_docstrings_to_model_forward,
26
+ is_flash_attn_2_available,
27
+ is_flash_attn_greater_or_equal_2_10,
28
+ logging,
29
+ replace_return_docstrings,
30
+ torch_int,
31
+ )
32
+ from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
33
+
34
+
35
+ if is_flash_attn_2_available():
36
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+ # General docstring
42
+ _CONFIG_FOR_DOC = "SiglipConfig"
43
+ _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
44
+
45
+
46
+ def _trunc_normal_(tensor, mean, std, a, b):
47
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
48
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
49
+ def norm_cdf(x):
50
+ # Computes standard normal cumulative distribution function
51
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
52
+
53
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
54
+ warnings.warn(
55
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
56
+ "The distribution of values may be incorrect.",
57
+ stacklevel=2,
58
+ )
59
+
60
+ # Values are generated by using a truncated uniform distribution and
61
+ # then using the inverse CDF for the normal distribution.
62
+ # Get upper and lower cdf values
63
+ l = norm_cdf((a - mean) / std)
64
+ u = norm_cdf((b - mean) / std)
65
+
66
+ # Uniformly fill tensor with values from [l, u], then translate to
67
+ # [2l-1, 2u-1].
68
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
69
+
70
+ # Use inverse cdf transform for normal distribution to get truncated
71
+ # standard normal
72
+ tensor.erfinv_()
73
+
74
+ # Transform to proper mean, std
75
+ tensor.mul_(std * math.sqrt(2.0))
76
+ tensor.add_(mean)
77
+
78
+ # Clamp to ensure it's in the proper range
79
+ tensor.clamp_(min=a, max=b)
80
+
81
+
82
+ def trunc_normal_tf_(
83
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
84
+ ) -> torch.Tensor:
85
+ """Fills the input Tensor with values drawn from a truncated
86
+ normal distribution. The values are effectively drawn from the
87
+ normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
88
+ with values outside :math:`[a, b]` redrawn until they are within
89
+ the bounds. The method used for generating the random values works
90
+ best when :math:`a \\leq \\text{mean} \\leq b`.
91
+
92
+ NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
93
+ bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
94
+ and the result is subsequently scaled and shifted by the mean and std args.
95
+
96
+ Args:
97
+ tensor: an n-dimensional `torch.Tensor`
98
+ mean: the mean of the normal distribution
99
+ std: the standard deviation of the normal distribution
100
+ a: the minimum cutoff value
101
+ b: the maximum cutoff value
102
+ """
103
+ with torch.no_grad():
104
+ _trunc_normal_(tensor, 0, 1.0, a, b)
105
+ tensor.mul_(std).add_(mean)
106
+
107
+
108
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
109
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
110
+ if mode == "fan_in":
111
+ denom = fan_in
112
+ elif mode == "fan_out":
113
+ denom = fan_out
114
+ elif mode == "fan_avg":
115
+ denom = (fan_in + fan_out) / 2
116
+
117
+ variance = scale / denom
118
+
119
+ if distribution == "truncated_normal":
120
+ # constant is stddev of standard normal truncated to (-2, 2)
121
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
122
+ elif distribution == "normal":
123
+ with torch.no_grad():
124
+ tensor.normal_(std=math.sqrt(variance))
125
+ elif distribution == "uniform":
126
+ bound = math.sqrt(3 * variance)
127
+ with torch.no_grad():
128
+ tensor.uniform_(-bound, bound)
129
+ else:
130
+ raise ValueError(f"invalid distribution {distribution}")
131
+
132
+
133
+ def lecun_normal_(tensor):
134
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
135
+
136
+
137
+ def default_flax_embed_init(tensor):
138
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
139
+
140
+
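A small sketch of how the variance-scaling initializers above behave on raw weight tensors; the `modeling.siglip.modeling_siglip` import path is an assumption based on this repository's layout.

```python
# Sketch of the initializers defined above; the import path is assumed from this
# repo's layout (modeling/siglip/modeling_siglip.py).
import math

import torch

from modeling.siglip.modeling_siglip import lecun_normal_, variance_scaling_

weight = torch.empty(768, 768)  # e.g. an attention projection weight
lecun_normal_(weight)           # truncated normal, variance scaled by 1/fan_in
print(round(weight.std().item(), 3), round(math.sqrt(1 / 768), 3))  # both roughly 0.036

embedding = torch.empty(32000, 768)
variance_scaling_(embedding, mode="fan_in", distribution="normal")  # Flax embedding default
```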
141
+ @dataclass
142
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
143
+ class SiglipVisionModelOutput(ModelOutput):
144
+ """
145
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
146
+
147
+ Args:
148
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
149
+ The image embeddings obtained by applying the projection layer to the pooler_output.
150
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
151
+ Sequence of hidden-states at the output of the last layer of the model.
152
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
153
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
154
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
155
+
156
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
157
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
158
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
159
+ sequence_length)`.
160
+
161
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
162
+ heads.
163
+ """
164
+
165
+ image_embeds: Optional[torch.FloatTensor] = None
166
+ last_hidden_state: torch.FloatTensor = None
167
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
168
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
169
+
170
+
171
+ @dataclass
172
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip
173
+ class SiglipTextModelOutput(ModelOutput):
174
+ """
175
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
176
+
177
+ Args:
178
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
179
+ The text embeddings obtained by applying the projection layer to the pooler_output.
180
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
181
+ Sequence of hidden-states at the output of the last layer of the model.
182
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
183
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
184
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
185
+
186
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
187
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
188
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
189
+ sequence_length)`.
190
+
191
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
192
+ heads.
193
+ """
194
+
195
+ text_embeds: Optional[torch.FloatTensor] = None
196
+ last_hidden_state: torch.FloatTensor = None
197
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
198
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
199
+
200
+
201
+ @dataclass
202
+ # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip
203
+ class SiglipOutput(ModelOutput):
204
+ """
205
+ Args:
206
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
207
+ Contrastive loss for image-text similarity.
208
+ logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
209
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
210
+ similarity scores.
211
+ logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
212
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
213
+ similarity scores.
214
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
215
+ The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
216
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
217
+ The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
218
+ text_model_output (`BaseModelOutputWithPooling`):
219
+ The output of the [`SiglipTextModel`].
220
+ vision_model_output (`BaseModelOutputWithPooling`):
221
+ The output of the [`SiglipVisionModel`].
222
+ """
223
+
224
+ loss: Optional[torch.FloatTensor] = None
225
+ logits_per_image: torch.FloatTensor = None
226
+ logits_per_text: torch.FloatTensor = None
227
+ text_embeds: torch.FloatTensor = None
228
+ image_embeds: torch.FloatTensor = None
229
+ text_model_output: BaseModelOutputWithPooling = None
230
+ vision_model_output: BaseModelOutputWithPooling = None
231
+
232
+ def to_tuple(self) -> Tuple[Any]:
233
+ return tuple(
234
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
235
+ for k in self.keys()
236
+ )
237
+
238
+
239
+ class SiglipVisionEmbeddings(nn.Module):
240
+ def __init__(self, config: SiglipVisionConfig):
241
+ super().__init__()
242
+ self.config = config
243
+ self.embed_dim = config.hidden_size
244
+ self.image_size = config.image_size
245
+ self.patch_size = config.patch_size
246
+
247
+ self.patch_embedding = nn.Conv2d(
248
+ in_channels=config.num_channels,
249
+ out_channels=self.embed_dim,
250
+ kernel_size=self.patch_size,
251
+ stride=self.patch_size,
252
+ padding="valid",
253
+ )
254
+
255
+ self.num_patches = (self.image_size // self.patch_size) ** 2
256
+ self.num_positions = self.num_patches
257
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
258
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
259
+
260
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
261
+ """
262
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
263
+ images. This method is also adapted to support torch.jit tracing and no class embeddings.
264
+
265
+ Adapted from:
266
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
267
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
268
+ """
269
+
270
+ num_patches = embeddings.shape[1]
271
+ num_positions = self.position_embedding.weight.shape[0]
272
+
273
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
274
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
275
+ return self.position_embedding(self.position_ids)
276
+
277
+ patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
278
+
279
+ dim = embeddings.shape[-1]
280
+
281
+ new_height = height // self.patch_size
282
+ new_width = width // self.patch_size
283
+
284
+ sqrt_num_positions = torch_int(num_positions**0.5)
285
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
286
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
287
+
288
+ patch_pos_embed = nn.functional.interpolate(
289
+ patch_pos_embed,
290
+ size=(new_height, new_width),
291
+ mode="bicubic",
292
+ align_corners=False,
293
+ )
294
+
295
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
296
+ return patch_pos_embed
297
+
298
+ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
299
+ _, _, height, width = pixel_values.shape
300
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
301
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
302
+
303
+ if interpolate_pos_encoding:
304
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
305
+ else:
306
+ embeddings = embeddings + self.position_embedding(self.position_ids)
307
+ return embeddings
308
+
309
+
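Because `interpolate_pos_encoding` resamples the learned position grid, `SiglipVisionEmbeddings` can consume resolutions other than `config.image_size`. A small shape-only sketch with a randomly initialized module; the import paths are assumed from this repository's layout.

```python
# Shape check: a 224px position table is interpolated to serve a 384px input.
# Import paths are assumed from this repo's layout (modeling/siglip/...).
import torch

from modeling.siglip.configuration_siglip import SiglipVisionConfig
from modeling.siglip.modeling_siglip import SiglipVisionEmbeddings

config = SiglipVisionConfig(image_size=224, patch_size=16, hidden_size=768)
embeddings = SiglipVisionEmbeddings(config)

pixels = torch.randn(1, 3, 384, 384)  # larger than the 224x224 the table was built for
tokens = embeddings(pixels, interpolate_pos_encoding=True)
print(tokens.shape)  # torch.Size([1, 576, 768]) -> (384 // 16) ** 2 patches
```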
310
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
311
+ class SiglipTextEmbeddings(nn.Module):
312
+ def __init__(self, config: SiglipTextConfig):
313
+ super().__init__()
314
+ embed_dim = config.hidden_size
315
+
316
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
317
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
318
+
319
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
320
+ self.register_buffer(
321
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
322
+ )
323
+
324
+ def forward(
325
+ self,
326
+ input_ids: Optional[torch.LongTensor] = None,
327
+ position_ids: Optional[torch.LongTensor] = None,
328
+ inputs_embeds: Optional[torch.FloatTensor] = None,
329
+ ) -> torch.Tensor:
330
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
331
+
332
+ if position_ids is None:
333
+ position_ids = self.position_ids[:, :seq_length]
334
+
335
+ if inputs_embeds is None:
336
+ inputs_embeds = self.token_embedding(input_ids)
337
+
338
+ position_embeddings = self.position_embedding(position_ids)
339
+ embeddings = inputs_embeds + position_embeddings
340
+
341
+ return embeddings
342
+
343
+
344
+ class SiglipAttention(nn.Module):
345
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
346
+
347
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
348
+ def __init__(self, config):
349
+ super().__init__()
350
+ self.config = config
351
+ self.embed_dim = config.hidden_size
352
+ self.num_heads = config.num_attention_heads
353
+ self.head_dim = self.embed_dim // self.num_heads
354
+ if self.head_dim * self.num_heads != self.embed_dim:
355
+ raise ValueError(
356
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
357
+ f" {self.num_heads})."
358
+ )
359
+ self.scale = self.head_dim**-0.5
360
+ self.dropout = config.attention_dropout
361
+
362
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
363
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
364
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
365
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
366
+
367
+ def forward(
368
+ self,
369
+ hidden_states: torch.Tensor,
370
+ attention_mask: Optional[torch.Tensor] = None,
371
+ output_attentions: Optional[bool] = False,
372
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
373
+ """Input shape: Batch x Time x Channel"""
374
+
375
+ batch_size, q_len, _ = hidden_states.size()
376
+
377
+ query_states = self.q_proj(hidden_states)
378
+ key_states = self.k_proj(hidden_states)
379
+ value_states = self.v_proj(hidden_states)
380
+
381
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
382
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
383
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
384
+
385
+ k_v_seq_len = key_states.shape[-2]
386
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
387
+
388
+ if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
389
+ raise ValueError(
390
+ f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
391
+ f" {attn_weights.size()}"
392
+ )
393
+
394
+ if attention_mask is not None:
395
+ if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
396
+ raise ValueError(
397
+ f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
398
+ )
399
+ attn_weights = attn_weights + attention_mask
400
+
401
+ # upcast attention to fp32
402
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
403
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
404
+ attn_output = torch.matmul(attn_weights, value_states)
405
+
406
+ if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
407
+ raise ValueError(
408
+ f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
409
+ f" {attn_output.size()}"
410
+ )
411
+
412
+ attn_output = attn_output.transpose(1, 2).contiguous()
413
+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
414
+
415
+ attn_output = self.out_proj(attn_output)
416
+
417
+ return attn_output, attn_weights
418
+
419
+
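The eager path above materializes softmax(QK^T * head_dim**-0.5)V explicitly; the SDPA and flash variants further down compute the same quantity with fused kernels. A standalone sketch (hypothetical shapes) checking that equivalence against `torch.nn.functional.scaled_dot_product_attention`:

```python
# Standalone sketch: verify that the eager attention math above matches torch's fused SDPA.
# Shapes are hypothetical: batch=2, heads=4, seq=5, head_dim=8.
import torch
import torch.nn.functional as F

q, k, v = (torch.randn(2, 4, 5, 8) for _ in range(3))
scale = 8 ** -0.5
eager = torch.softmax(q @ k.transpose(-2, -1) * scale, dim=-1) @ v
fused = F.scaled_dot_product_attention(q, k, v)  # default scale is head_dim**-0.5
print(torch.allclose(eager, fused, atol=1e-5))   # True, up to numerical tolerance
```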
420
+ class SiglipFlashAttention2(SiglipAttention):
421
+ """
422
+ SiglipAttention flash attention module. This module inherits from `SiglipAttention`, as the weights of the module stay
423
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
424
+ flash attention and handle padding tokens in case the input contains any of them.
425
+ """
426
+
427
+ is_causal = False
428
+
429
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
430
+ def __init__(self, *args, **kwargs):
431
+ super().__init__(*args, **kwargs)
432
+
433
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
434
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
435
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
436
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
437
+
438
+ # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
439
+ def forward(
440
+ self,
441
+ hidden_states: torch.Tensor,
442
+ attention_mask: Optional[torch.LongTensor] = None,
443
+ output_attentions: bool = False,
444
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
445
+ output_attentions = False
446
+
447
+ batch_size, q_len, _ = hidden_states.size()
448
+
449
+ query_states = self.q_proj(hidden_states)
450
+ key_states = self.k_proj(hidden_states)
451
+ value_states = self.v_proj(hidden_states)
452
+
453
+ # Flash attention requires the input to have the shape
454
+ # batch_size x seq_length x num_heads x head_dim
455
+ # therefore we just need to keep the original shape
456
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
457
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
458
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
459
+
460
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
461
+ # to be able to avoid many of these transpose/reshape/view.
462
+ query_states = query_states.transpose(1, 2)
463
+ key_states = key_states.transpose(1, 2)
464
+ value_states = value_states.transpose(1, 2)
465
+
466
+ dropout_rate = self.dropout if self.training else 0.0
467
+
468
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
469
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
471
+ # cast them back to the correct dtype just to be sure everything works as expected.
472
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
472
+ # in fp32.
473
+
474
+ input_dtype = query_states.dtype
475
+ if input_dtype == torch.float32:
476
+ if torch.is_autocast_enabled():
477
+ target_dtype = torch.get_autocast_gpu_dtype()
478
+ # Handle the case where the model is quantized
479
+ elif hasattr(self.config, "_pre_quantization_dtype"):
480
+ target_dtype = self.config._pre_quantization_dtype
481
+ else:
482
+ target_dtype = self.q_proj.weight.dtype
483
+
484
+ logger.warning_once(
485
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
486
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
487
+ f" {target_dtype}."
488
+ )
489
+
490
+ query_states = query_states.to(target_dtype)
491
+ key_states = key_states.to(target_dtype)
492
+ value_states = value_states.to(target_dtype)
493
+
494
+ attn_output = _flash_attention_forward(
495
+ query_states,
496
+ key_states,
497
+ value_states,
498
+ attention_mask,
499
+ q_len,
500
+ dropout=dropout_rate,
501
+ is_causal=self.is_causal,
502
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
503
+ )
504
+
505
+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous()
506
+ attn_output = self.out_proj(attn_output)
507
+
508
+ if not output_attentions:
509
+ attn_weights = None
510
+
511
+ return attn_output, attn_weights
512
+
513
+
514
+ class SiglipSdpaAttention(SiglipAttention):
515
+ """
516
+ Siglip attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
517
+ `SiglipAttention`, as the weights of the module stay untouched. The only changes are in the forward pass to adapt to the
518
+ SDPA API.
519
+ """
520
+
521
+ is_causal = False
522
+
523
+ # Adapted from SiglipAttention.forward and transformers.models.llama.modeling_llama.LlamaSdpaAttention.forward
524
+ def forward(
525
+ self,
526
+ hidden_states: torch.Tensor,
527
+ attention_mask: Optional[torch.Tensor] = None,
528
+ output_attentions: Optional[bool] = False,
529
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
530
+ if output_attentions:
531
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
532
+ logger.warning_once(
533
+ "SiglipModel is using SiglipSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
534
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
535
+ )
536
+ return super().forward(
537
+ hidden_states=hidden_states,
538
+ attention_mask=attention_mask,
539
+ output_attentions=output_attentions,
540
+ )
541
+
542
+ batch_size, q_len, _ = hidden_states.size()
543
+
544
+ query_states = self.q_proj(hidden_states)
545
+ key_states = self.k_proj(hidden_states)
546
+ value_states = self.v_proj(hidden_states)
547
+
548
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
549
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
550
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
551
+
552
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
553
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
554
+ if query_states.device.type == "cuda" and attention_mask is not None:
555
+ query_states = query_states.contiguous()
556
+ key_states = key_states.contiguous()
557
+ value_states = value_states.contiguous()
558
+
559
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
560
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
561
+ is_causal = True if self.is_causal and q_len > 1 else False
562
+
563
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
564
+ query_states,
565
+ key_states,
566
+ value_states,
567
+ attn_mask=attention_mask,
568
+ dropout_p=self.dropout if self.training else 0.0,
569
+ is_causal=is_causal,
570
+ )
571
+
572
+ attn_output = attn_output.transpose(1, 2).contiguous()
573
+ attn_output = attn_output.view(batch_size, q_len, self.embed_dim)
574
+
575
+ attn_output = self.out_proj(attn_output)
576
+
577
+ return attn_output, None
578
+
579
+
580
+ SIGLIP_ATTENTION_CLASSES = {
581
+ "eager": SiglipAttention,
582
+ "flash_attention_2": SiglipFlashAttention2,
583
+ "sdpa": SiglipSdpaAttention,
584
+ }
585
+
586
+
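All three classes in the registry above share the same weights and constructor; only the forward kernel differs. A hedged usage sketch: in recent transformers releases the kernel can be chosen at load time via `attn_implementation` (exact availability depends on your installed transformers and flash-attn versions).

```python
# Hedged usage sketch: select the attention kernel that SIGLIP_ATTENTION_CLASSES will
# dispatch to by passing attn_implementation at load time.
from transformers import SiglipVisionModel

model = SiglipVisionModel.from_pretrained(
    "google/siglip-base-patch16-224",
    attn_implementation="sdpa",  # or "eager" / "flash_attention_2"
)
print(model.config._attn_implementation)  # "sdpa"
```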
587
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
588
+ class SiglipMLP(nn.Module):
589
+ def __init__(self, config):
590
+ super().__init__()
591
+ self.config = config
592
+ self.activation_fn = ACT2FN[config.hidden_act]
593
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
594
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
595
+
596
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
597
+ hidden_states = self.fc1(hidden_states)
598
+ hidden_states = self.activation_fn(hidden_states)
599
+ hidden_states = self.fc2(hidden_states)
600
+ return hidden_states
601
+
602
+
603
+ class SiglipEncoderLayer(nn.Module):
604
+ def __init__(self, config: SiglipConfig):
605
+ super().__init__()
606
+ self.embed_dim = config.hidden_size
607
+ self.self_attn = SIGLIP_ATTENTION_CLASSES[config._attn_implementation](config=config)
608
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
609
+ self.mlp = SiglipMLP(config)
610
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
611
+
612
+ # Ignore copy
613
+ def forward(
614
+ self,
615
+ hidden_states: torch.Tensor,
616
+ attention_mask: torch.Tensor,
617
+ output_attentions: Optional[bool] = False,
618
+ ) -> Tuple[torch.FloatTensor]:
619
+ """
620
+ Args:
621
+ hidden_states (`torch.FloatTensor`):
622
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
623
+ attention_mask (`torch.FloatTensor`):
624
+ Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
625
+ output_attentions (`bool`, *optional*, defaults to `False`):
626
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
627
+ returned tensors for more detail.
628
+ """
629
+ residual = hidden_states
630
+
631
+ hidden_states = self.layer_norm1(hidden_states)
632
+ hidden_states, attn_weights = self.self_attn(
633
+ hidden_states=hidden_states,
634
+ attention_mask=attention_mask,
635
+ output_attentions=output_attentions,
636
+ )
637
+ hidden_states = residual + hidden_states
638
+
639
+ residual = hidden_states
640
+ hidden_states = self.layer_norm2(hidden_states)
641
+ hidden_states = self.mlp(hidden_states)
642
+ hidden_states = residual + hidden_states
643
+
644
+ outputs = (hidden_states,)
645
+
646
+ if output_attentions:
647
+ outputs += (attn_weights,)
648
+
649
+ return outputs
650
+
651
+
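The encoder layer above is a standard pre-LayerNorm transformer block: normalize, apply the sublayer, then add the residual, for both the attention and the MLP halves. A standalone sketch with hypothetical stand-in layers:

```python
# Standalone sketch (hypothetical layers and sizes): the pre-LayerNorm residual layout
# used by SiglipEncoderLayer above.
import torch
import torch.nn as nn

dim = 768
norm1, norm2 = nn.LayerNorm(dim), nn.LayerNorm(dim)
attn = nn.MultiheadAttention(dim, 12, batch_first=True)
mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(approximate="tanh"), nn.Linear(4 * dim, dim))

x = torch.randn(2, 196, dim)
h = norm1(x)
x = x + attn(h, h, h)[0]    # attention sublayer with residual
x = x + mlp(norm2(x))       # MLP sublayer with residual
print(x.shape)              # torch.Size([2, 196, 768])
```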
652
+ class SiglipPreTrainedModel(PreTrainedModel):
653
+ """
654
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
655
+ models.
656
+ """
657
+
658
+ config_class = SiglipConfig
659
+ base_model_prefix = "siglip"
660
+ supports_gradient_checkpointing = True
661
+
662
+ _no_split_modules = [
663
+ "SiglipTextEmbeddings",
664
+ "SiglipEncoderLayer",
665
+ "SiglipVisionEmbeddings",
666
+ "SiglipEncoderLayer",
667
+ "SiglipMultiheadAttentionPoolingHead",
668
+ ]
669
+ _supports_flash_attn_2 = True
670
+ _supports_sdpa = True
671
+
672
+ def _init_weights(self, module):
673
+ """Initialize the weights"""
674
+ if isinstance(module, SiglipVisionEmbeddings):
675
+ width = (
676
+ self.config.vision_config.hidden_size
677
+ if isinstance(self.config, SiglipConfig)
678
+ else self.config.hidden_size
679
+ )
680
+ nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
681
+ elif isinstance(module, nn.Embedding):
682
+ default_flax_embed_init(module.weight)
683
+ elif isinstance(module, SiglipAttention):
684
+ nn.init.xavier_uniform_(module.q_proj.weight)
685
+ nn.init.xavier_uniform_(module.k_proj.weight)
686
+ nn.init.xavier_uniform_(module.v_proj.weight)
687
+ nn.init.xavier_uniform_(module.out_proj.weight)
688
+ nn.init.zeros_(module.q_proj.bias)
689
+ nn.init.zeros_(module.k_proj.bias)
690
+ nn.init.zeros_(module.v_proj.bias)
691
+ nn.init.zeros_(module.out_proj.bias)
692
+ elif isinstance(module, SiglipMLP):
693
+ nn.init.xavier_uniform_(module.fc1.weight)
694
+ nn.init.xavier_uniform_(module.fc2.weight)
695
+ nn.init.normal_(module.fc1.bias, std=1e-6)
696
+ nn.init.normal_(module.fc2.bias, std=1e-6)
697
+ elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
698
+ nn.init.xavier_uniform_(module.probe.data)
699
+ nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
700
+ nn.init.zeros_(module.attention.in_proj_bias.data)
701
+ elif isinstance(module, SiglipModel):
702
+ logit_scale_init = torch.log(torch.tensor(1.0))
703
+ module.logit_scale.data.fill_(logit_scale_init)
704
+ module.logit_bias.data.zero_()
705
+ elif isinstance(module, SiglipForImageClassification):
706
+ nn.init.normal_(
707
+ module.classifier.weight,
708
+ std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
709
+ )
710
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
711
+ lecun_normal_(module.weight)
712
+ if module.bias is not None:
713
+ nn.init.zeros_(module.bias)
714
+ elif isinstance(module, nn.LayerNorm):
715
+ module.bias.data.zero_()
716
+ module.weight.data.fill_(1.0)
717
+
718
+
719
+ SIGLIP_START_DOCSTRING = r"""
720
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
721
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
722
+ etc.)
723
+
724
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
725
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
726
+ and behavior.
727
+
728
+ Parameters:
729
+ config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
730
+ Initializing with a config file does not load the weights associated with the model, only the
731
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
732
+ """
733
+
734
+ SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
735
+ Args:
736
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
737
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
738
+ it.
739
+
740
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
741
+ [`PreTrainedTokenizer.__call__`] for details.
742
+
743
+ [What are input IDs?](../glossary#input-ids)
744
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
745
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
746
+
747
+ - 1 for tokens that are **not masked**,
748
+ - 0 for tokens that are **masked**.
749
+
750
+ [What are attention masks?](../glossary#attention-mask)
751
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
752
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
753
+ config.max_position_embeddings - 1]`.
754
+
755
+ [What are position IDs?](../glossary#position-ids)
756
+ output_attentions (`bool`, *optional*):
757
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
758
+ tensors for more detail.
759
+ output_hidden_states (`bool`, *optional*):
760
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
761
+ more detail.
762
+ return_dict (`bool`, *optional*):
763
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
764
+ """
765
+
766
+ SIGLIP_VISION_INPUTS_DOCSTRING = r"""
767
+ Args:
768
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
769
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
770
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
771
+ output_attentions (`bool`, *optional*):
772
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
773
+ tensors for more detail.
774
+ output_hidden_states (`bool`, *optional*):
775
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
776
+ more detail.
777
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
778
+ Whether to interpolate the pre-trained position encodings.
779
+ return_dict (`bool`, *optional*):
780
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
781
+ """
782
+
783
+ SIGLIP_INPUTS_DOCSTRING = r"""
784
+ Args:
785
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
786
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
787
+ it.
788
+
789
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
790
+ [`PreTrainedTokenizer.__call__`] for details.
791
+
792
+ [What are input IDs?](../glossary#input-ids)
793
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
794
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
795
+
796
+ - 1 for tokens that are **not masked**,
797
+ - 0 for tokens that are **masked**.
798
+
799
+ [What are attention masks?](../glossary#attention-mask)
800
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
801
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
802
+ config.max_position_embeddings - 1]`.
803
+
804
+ [What are position IDs?](../glossary#position-ids)
805
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
806
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
807
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
808
+ return_loss (`bool`, *optional*):
809
+ Whether or not to return the contrastive loss.
810
+ output_attentions (`bool`, *optional*):
811
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
812
+ tensors for more detail.
813
+ output_hidden_states (`bool`, *optional*):
814
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
815
+ more detail.
816
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
817
+ Whether to interpolate the pre-trained position encodings.
818
+ return_dict (`bool`, *optional*):
819
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
820
+ """
821
+
822
+
823
+ # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Siglip
824
+ class SiglipEncoder(nn.Module):
825
+ """
826
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
827
+ [`SiglipEncoderLayer`].
828
+
829
+ Args:
830
+ config: SiglipConfig
831
+ """
832
+
833
+ def __init__(self, config: SiglipConfig):
834
+ super().__init__()
835
+ self.config = config
836
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
837
+ self.gradient_checkpointing = False
838
+
839
+ # Ignore copy
840
+ def forward(
841
+ self,
842
+ inputs_embeds,
843
+ attention_mask: Optional[torch.Tensor] = None,
844
+ output_attentions: Optional[bool] = None,
845
+ output_hidden_states: Optional[bool] = None,
846
+ return_dict: Optional[bool] = None,
847
+ ) -> Union[Tuple, BaseModelOutput]:
848
+ r"""
849
+ Args:
850
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
851
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
852
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
853
+ than the model's internal embedding lookup matrix.
854
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
855
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
856
+
857
+ - 1 for tokens that are **not masked**,
858
+ - 0 for tokens that are **masked**.
859
+
860
+ [What are attention masks?](../glossary#attention-mask)
861
+ output_attentions (`bool`, *optional*):
862
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
863
+ returned tensors for more detail.
864
+ output_hidden_states (`bool`, *optional*):
865
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
866
+ for more detail.
867
+ return_dict (`bool`, *optional*):
868
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
869
+ """
870
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
871
+ output_hidden_states = (
872
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
873
+ )
874
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
875
+
876
+ encoder_states = () if output_hidden_states else None
877
+ all_attentions = () if output_attentions else None
878
+
879
+ hidden_states = inputs_embeds
880
+ for encoder_layer in self.layers:
881
+ if output_hidden_states:
882
+ encoder_states = encoder_states + (hidden_states,)
883
+ if self.gradient_checkpointing and self.training:
884
+ layer_outputs = self._gradient_checkpointing_func(
885
+ encoder_layer.__call__,
886
+ hidden_states,
887
+ attention_mask,
888
+ output_attentions,
889
+ )
890
+ else:
891
+ layer_outputs = encoder_layer(
892
+ hidden_states,
893
+ attention_mask,
894
+ output_attentions=output_attentions,
895
+ )
896
+
897
+ hidden_states = layer_outputs[0]
898
+
899
+ if output_attentions:
900
+ all_attentions = all_attentions + (layer_outputs[1],)
901
+
902
+ if output_hidden_states:
903
+ encoder_states = encoder_states + (hidden_states,)
904
+
905
+ if not return_dict:
906
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
907
+ return BaseModelOutput(
908
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
909
+ )
910
+
911
+
912
+ class SiglipTextTransformer(nn.Module):
913
+ def __init__(self, config: SiglipTextConfig):
914
+ super().__init__()
915
+ self.config = config
916
+ embed_dim = config.hidden_size
917
+ self.embeddings = SiglipTextEmbeddings(config)
918
+ self.encoder = SiglipEncoder(config)
919
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
920
+
921
+ self.head = nn.Linear(embed_dim, embed_dim)
922
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
923
+
924
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
925
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
926
+ def forward(
927
+ self,
928
+ input_ids: Optional[torch.Tensor] = None,
929
+ attention_mask: Optional[torch.Tensor] = None,
930
+ position_ids: Optional[torch.Tensor] = None,
931
+ output_attentions: Optional[bool] = None,
932
+ output_hidden_states: Optional[bool] = None,
933
+ return_dict: Optional[bool] = None,
934
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
935
+ r"""
936
+ Returns:
937
+
938
+ """
939
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
940
+ output_hidden_states = (
941
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
942
+ )
943
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
944
+
945
+ if input_ids is None:
946
+ raise ValueError("You have to specify input_ids")
947
+
948
+ input_shape = input_ids.size()
949
+ input_ids = input_ids.view(-1, input_shape[-1])
950
+
951
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
952
+
953
+ # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
954
+ # expand attention_mask
955
+ if attention_mask is not None and not self._use_flash_attention_2:
956
+ # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
957
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
958
+
959
+ encoder_outputs = self.encoder(
960
+ inputs_embeds=hidden_states,
961
+ attention_mask=attention_mask,
962
+ output_attentions=output_attentions,
963
+ output_hidden_states=output_hidden_states,
964
+ return_dict=return_dict,
965
+ )
966
+
967
+ last_hidden_state = encoder_outputs[0]
968
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
969
+
970
+ # Assuming "sticky" EOS tokenization, last token is always EOS.
971
+ pooled_output = last_hidden_state[:, -1, :]
972
+ pooled_output = self.head(pooled_output)
973
+
974
+ if not return_dict:
975
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
976
+
977
+ return BaseModelOutputWithPooling(
978
+ last_hidden_state=last_hidden_state,
979
+ pooler_output=pooled_output,
980
+ hidden_states=encoder_outputs.hidden_states,
981
+ attentions=encoder_outputs.attentions,
982
+ )
983
+
984
+
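The pooling above takes the hidden state of the last position, which is only meaningful because SigLIP tokenizes with `padding="max_length"` so that the final token is always EOS (the "sticky EOS" comment). A tiny sketch with hypothetical shapes:

```python
# Standalone sketch (hypothetical tensors): last-token pooling as done by
# SiglipTextTransformer above, assuming max_length padding so position -1 is EOS.
import torch

last_hidden_state = torch.randn(2, 64, 768)   # (batch, max_length, hidden)
pooled = last_hidden_state[:, -1, :]          # (batch, hidden)
print(pooled.shape)  # torch.Size([2, 768])
```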
985
+ @add_start_docstrings(
986
+ """The text model from SigLIP without any head or projection on top.""",
987
+ SIGLIP_START_DOCSTRING,
988
+ )
989
+ class SiglipTextModel(SiglipPreTrainedModel):
990
+ config_class = SiglipTextConfig
991
+
992
+ def __init__(self, config: SiglipTextConfig):
993
+ super().__init__(config)
994
+ self.text_model = SiglipTextTransformer(config)
995
+ # Initialize weights and apply final processing
996
+ self.post_init()
997
+
998
+ def get_input_embeddings(self) -> nn.Module:
999
+ return self.text_model.embeddings.token_embedding
1000
+
1001
+ def set_input_embeddings(self, value):
1002
+ self.text_model.embeddings.token_embedding = value
1003
+
1004
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1005
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
1006
+ def forward(
1007
+ self,
1008
+ input_ids: Optional[torch.Tensor] = None,
1009
+ attention_mask: Optional[torch.Tensor] = None,
1010
+ position_ids: Optional[torch.Tensor] = None,
1011
+ output_attentions: Optional[bool] = None,
1012
+ output_hidden_states: Optional[bool] = None,
1013
+ return_dict: Optional[bool] = None,
1014
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1015
+ r"""
1016
+ Returns:
1017
+
1018
+ Examples:
1019
+
1020
+ ```python
1021
+ >>> from transformers import AutoTokenizer, SiglipTextModel
1022
+
1023
+ >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
1024
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1025
+
1026
+ >>> # important: make sure to set padding="max_length" as that's how the model was trained
1027
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1028
+
1029
+ >>> outputs = model(**inputs)
1030
+ >>> last_hidden_state = outputs.last_hidden_state
1031
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
1032
+ ```"""
1033
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1034
+
1035
+ return self.text_model(
1036
+ input_ids=input_ids,
1037
+ attention_mask=attention_mask,
1038
+ position_ids=position_ids,
1039
+ output_attentions=output_attentions,
1040
+ output_hidden_states=output_hidden_states,
1041
+ return_dict=return_dict,
1042
+ )
1043
+
1044
+
1045
+ class SiglipVisionTransformer(nn.Module):
1046
+ def __init__(self, config: SiglipVisionConfig):
1047
+ super().__init__()
1048
+ self.config = config
1049
+ embed_dim = config.hidden_size
1050
+
1051
+ self.embeddings = SiglipVisionEmbeddings(config)
1052
+ self.encoder = SiglipEncoder(config)
1053
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1054
+ self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
1055
+ if self.use_head:
1056
+ self.head = SiglipMultiheadAttentionPoolingHead(config)
1057
+
1058
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1059
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
1060
+ def forward(
1061
+ self,
1062
+ pixel_values,
1063
+ output_attentions: Optional[bool] = None,
1064
+ output_hidden_states: Optional[bool] = None,
1065
+ return_dict: Optional[bool] = None,
1066
+ interpolate_pos_encoding: Optional[bool] = False,
1067
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1068
+ r"""
1069
+ Returns:
1070
+
1071
+ """
1072
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1073
+ output_hidden_states = (
1074
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1075
+ )
1076
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1077
+
1078
+ hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
1079
+
1080
+ encoder_outputs = self.encoder(
1081
+ inputs_embeds=hidden_states,
1082
+ output_attentions=output_attentions,
1083
+ output_hidden_states=output_hidden_states,
1084
+ return_dict=return_dict,
1085
+ )
1086
+
1087
+ last_hidden_state = encoder_outputs[0]
1088
+ last_hidden_state = self.post_layernorm(last_hidden_state)
1089
+
1090
+ pooler_output = self.head(last_hidden_state) if self.use_head else None
1091
+ if not return_dict:
1092
+ return (last_hidden_state, pooler_output) + encoder_outputs[1:]
1093
+
1094
+ return BaseModelOutputWithPooling(
1095
+ last_hidden_state=last_hidden_state,
1096
+ pooler_output=pooler_output,
1097
+ hidden_states=encoder_outputs.hidden_states,
1098
+ attentions=encoder_outputs.attentions,
1099
+ )
1100
+
1101
+
1102
+ class SiglipMultiheadAttentionPoolingHead(nn.Module):
1103
+ """Multihead Attention Pooling."""
1104
+
1105
+ def __init__(self, config: SiglipVisionConfig):
1106
+ super().__init__()
1107
+
1108
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
1109
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
1110
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1111
+ self.mlp = SiglipMLP(config)
1112
+
1113
+ def forward(self, hidden_state):
1114
+ batch_size = hidden_state.shape[0]
1115
+ probe = self.probe.repeat(batch_size, 1, 1)
1116
+
1117
+ hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
1118
+
1119
+ residual = hidden_state
1120
+ hidden_state = self.layernorm(hidden_state)
1121
+ hidden_state = residual + self.mlp(hidden_state)
1122
+
1123
+ return hidden_state[:, 0]
1124
+
1125
+
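The pooling head above uses a single learned probe vector as the attention query over all patch tokens, collapsing the sequence to one vector per image before a residual MLP. A minimal sketch with hypothetical sizes:

```python
# Minimal sketch (hypothetical sizes): a learned "probe" query attends over the patch
# tokens, turning (batch, num_patches, hidden) into one pooled vector per image.
import torch
import torch.nn as nn

hidden, heads = 768, 12
attn = nn.MultiheadAttention(hidden, heads, batch_first=True)
probe = nn.Parameter(torch.randn(1, 1, hidden))

patch_tokens = torch.randn(2, 196, hidden)                  # encoder output
pooled, _ = attn(probe.repeat(2, 1, 1), patch_tokens, patch_tokens)
print(pooled[:, 0].shape)  # torch.Size([2, 768])
```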
1126
+ @add_start_docstrings(
1127
+ """The vision model from SigLIP without any head or projection on top.""",
1128
+ SIGLIP_START_DOCSTRING,
1129
+ )
1130
+ class SiglipVisionModel(SiglipPreTrainedModel):
1131
+ config_class = SiglipVisionConfig
1132
+ main_input_name = "pixel_values"
1133
+
1134
+ def __init__(self, config: SiglipVisionConfig):
1135
+ super().__init__(config)
1136
+
1137
+ self.vision_model = SiglipVisionTransformer(config)
1138
+
1139
+ # Initialize weights and apply final processing
1140
+ self.post_init()
1141
+
1142
+ def get_input_embeddings(self) -> nn.Module:
1143
+ return self.vision_model.embeddings.patch_embedding
1144
+
1145
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1146
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
1147
+ def forward(
1148
+ self,
1149
+ pixel_values,
1150
+ output_attentions: Optional[bool] = None,
1151
+ output_hidden_states: Optional[bool] = None,
1152
+ return_dict: Optional[bool] = None,
1153
+ interpolate_pos_encoding: bool = False,
1154
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1155
+ r"""
1156
+ Returns:
1157
+
1158
+ Examples:
1159
+
1160
+ ```python
1161
+ >>> from PIL import Image
1162
+ >>> import requests
1163
+ >>> from transformers import AutoProcessor, SiglipVisionModel
1164
+
1165
+ >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
1166
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1167
+
1168
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1169
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1170
+
1171
+ >>> inputs = processor(images=image, return_tensors="pt")
1172
+
1173
+ >>> outputs = model(**inputs)
1174
+ >>> last_hidden_state = outputs.last_hidden_state
1175
+ >>> pooled_output = outputs.pooler_output # pooled features
1176
+ ```"""
1177
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1178
+
1179
+ return self.vision_model(
1180
+ pixel_values=pixel_values,
1181
+ output_attentions=output_attentions,
1182
+ output_hidden_states=output_hidden_states,
1183
+ return_dict=return_dict,
1184
+ interpolate_pos_encoding=interpolate_pos_encoding,
1185
+ )
1186
+
1187
+
1188
+ @add_start_docstrings(SIGLIP_START_DOCSTRING)
1189
+ class SiglipModel(SiglipPreTrainedModel):
1190
+ config_class = SiglipConfig
1191
+
1192
+ def __init__(self, config: SiglipConfig):
1193
+ super().__init__(config)
1194
+
1195
+ if not isinstance(config.text_config, SiglipTextConfig):
1196
+ raise TypeError(
1197
+ "config.text_config is expected to be of type SiglipTextConfig but is of type"
1198
+ f" {type(config.text_config)}."
1199
+ )
1200
+
1201
+ if not isinstance(config.vision_config, SiglipVisionConfig):
1202
+ raise TypeError(
1203
+ "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
1204
+ f" {type(config.vision_config)}."
1205
+ )
1206
+
1207
+ text_config = config.text_config
1208
+ vision_config = config.vision_config
1209
+
1210
+ # First, initialize the text and vision models with proper attention implementation
1211
+ text_model = SiglipTextModel._from_config(text_config)
1212
+ vision_model = SiglipVisionModel._from_config(vision_config)
1213
+
1214
+ # Second, get the text and vision submodules (for backward compatibility)
1215
+ self.text_model = text_model.text_model
1216
+ self.vision_model = vision_model.vision_model
1217
+
1218
+ self.logit_scale = nn.Parameter(torch.randn(1))
1219
+ self.logit_bias = nn.Parameter(torch.randn(1))
1220
+
1221
+ # Initialize weights and apply final processing
1222
+ self.post_init()
1223
+
1224
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1225
+ def get_text_features(
1226
+ self,
1227
+ input_ids: Optional[torch.Tensor] = None,
1228
+ attention_mask: Optional[torch.Tensor] = None,
1229
+ position_ids: Optional[torch.Tensor] = None,
1230
+ output_attentions: Optional[bool] = None,
1231
+ output_hidden_states: Optional[bool] = None,
1232
+ return_dict: Optional[bool] = None,
1233
+ ) -> torch.FloatTensor:
1234
+ r"""
1235
+ Returns:
1236
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1237
+ applying the projection layer to the pooled output of [`SiglipTextModel`].
1238
+
1239
+ Examples:
1240
+
1241
+ ```python
1242
+ >>> from transformers import AutoTokenizer, AutoModel
1243
+ >>> import torch
1244
+
1245
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1246
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1247
+
1248
+ >>> # important: make sure to set padding="max_length" as that's how the model was trained
1249
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1250
+ >>> with torch.no_grad():
1251
+ ... text_features = model.get_text_features(**inputs)
1252
+ ```"""
1253
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1254
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1255
+ output_hidden_states = (
1256
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1257
+ )
1258
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1259
+
1260
+ text_outputs = self.text_model(
1261
+ input_ids=input_ids,
1262
+ attention_mask=attention_mask,
1263
+ position_ids=position_ids,
1264
+ output_attentions=output_attentions,
1265
+ output_hidden_states=output_hidden_states,
1266
+ return_dict=return_dict,
1267
+ )
1268
+
1269
+ pooled_output = text_outputs[1]
1270
+
1271
+ return pooled_output
1272
+
1273
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1274
+ def get_image_features(
1275
+ self,
1276
+ pixel_values: Optional[torch.FloatTensor] = None,
1277
+ output_attentions: Optional[bool] = None,
1278
+ output_hidden_states: Optional[bool] = None,
1279
+ return_dict: Optional[bool] = None,
1280
+ interpolate_pos_encoding: bool = False,
1281
+ ) -> torch.FloatTensor:
1282
+ r"""
1283
+ Returns:
1284
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1285
+ applying the projection layer to the pooled output of [`SiglipVisionModel`].
1286
+
1287
+ Examples:
1288
+
1289
+ ```python
1290
+ >>> from PIL import Image
1291
+ >>> import requests
1292
+ >>> from transformers import AutoProcessor, AutoModel
1293
+ >>> import torch
1294
+
1295
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1296
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1297
+
1298
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1299
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1300
+
1301
+ >>> inputs = processor(images=image, return_tensors="pt")
1302
+
1303
+ >>> with torch.no_grad():
1304
+ ... image_features = model.get_image_features(**inputs)
1305
+ ```"""
1306
+ # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
1307
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1308
+ output_hidden_states = (
1309
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1310
+ )
1311
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1312
+
1313
+ vision_outputs = self.vision_model(
1314
+ pixel_values=pixel_values,
1315
+ output_attentions=output_attentions,
1316
+ output_hidden_states=output_hidden_states,
1317
+ return_dict=return_dict,
1318
+ interpolate_pos_encoding=interpolate_pos_encoding,
1319
+ )
1320
+
1321
+ pooled_output = vision_outputs[1]
1322
+
1323
+ return pooled_output
1324
+
1325
+ @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1326
+ @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig)
1327
+ def forward(
1328
+ self,
1329
+ input_ids: Optional[torch.LongTensor] = None,
1330
+ pixel_values: Optional[torch.FloatTensor] = None,
1331
+ attention_mask: Optional[torch.Tensor] = None,
1332
+ position_ids: Optional[torch.LongTensor] = None,
1333
+ return_loss: Optional[bool] = None,
1334
+ output_attentions: Optional[bool] = None,
1335
+ output_hidden_states: Optional[bool] = None,
1336
+ return_dict: Optional[bool] = None,
1337
+ interpolate_pos_encoding: bool = False,
1338
+ ) -> Union[Tuple, SiglipOutput]:
1339
+ r"""
1340
+ Returns:
1341
+
1342
+ Examples:
1343
+
1344
+ ```python
1345
+ >>> from PIL import Image
1346
+ >>> import requests
1347
+ >>> from transformers import AutoProcessor, AutoModel
1348
+ >>> import torch
1349
+
1350
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1351
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1352
+
1353
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1354
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1355
+
1356
+ >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
1357
+ >>> # important: we pass `padding=max_length` since the model was trained with this
1358
+ >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
1359
+
1360
+ >>> with torch.no_grad():
1361
+ ... outputs = model(**inputs)
1362
+
1363
+ >>> logits_per_image = outputs.logits_per_image
1364
+ >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
1365
+ >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
1366
+ 31.9% that image 0 is 'a photo of 2 cats'
1367
+ ```"""
1368
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1369
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1370
+ output_hidden_states = (
1371
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1372
+ )
1373
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1374
+
1375
+ vision_outputs = self.vision_model(
1376
+ pixel_values=pixel_values,
1377
+ output_attentions=output_attentions,
1378
+ output_hidden_states=output_hidden_states,
1379
+ return_dict=return_dict,
1380
+ interpolate_pos_encoding=interpolate_pos_encoding,
1381
+ )
1382
+
1383
+ text_outputs = self.text_model(
1384
+ input_ids=input_ids,
1385
+ attention_mask=attention_mask,
1386
+ position_ids=position_ids,
1387
+ output_attentions=output_attentions,
1388
+ output_hidden_states=output_hidden_states,
1389
+ return_dict=return_dict,
1390
+ )
1391
+
1392
+ image_embeds = vision_outputs[1]
1393
+ text_embeds = text_outputs[1]
1394
+
1395
+ # normalized features
1396
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1397
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1398
+
1399
+ # cosine similarity as logits
1400
+ logits_per_text = (
1401
+ torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) * self.logit_scale.exp()
1402
+ + self.logit_bias
1403
+ )
1404
+ logits_per_image = logits_per_text.t()
1405
+
1406
+ loss = None
1407
+ if return_loss:
1408
+ # Adapted from https://github.com/google-research/big_vision/blob/01edb81a4716f93a48be43b3a4af14e29cdb3a7f/big_vision/trainers/proj/image_text/siglip.py#L287
1409
+ eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
1410
+ m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
1411
+ loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
1412
+ nll = -torch.sum(loglik, dim=-1)
1413
+ loss = nll.mean()
1414
+
1415
+ if not return_dict:
1416
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1417
+ return ((loss,) + output) if loss is not None else output
1418
+
1419
+ return SiglipOutput(
1420
+ loss=loss,
1421
+ logits_per_image=logits_per_image,
1422
+ logits_per_text=logits_per_text,
1423
+ text_embeds=text_embeds,
1424
+ image_embeds=image_embeds,
1425
+ text_model_output=text_outputs,
1426
+ vision_model_output=vision_outputs,
1427
+ )
1428
+
1429
+
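The `return_loss` branch above implements SigLIP's pairwise sigmoid loss: matched text/image pairs on the diagonal get label +1, every other pair gets -1, and the loss is the mean over texts of the summed negative log-sigmoid of label times logit. A standalone sketch with hypothetical logits:

```python
# Standalone sketch of the pairwise sigmoid loss computed in SiglipModel.forward above.
# Logits are hypothetical; matched pairs sit on the diagonal.
import torch
import torch.nn.functional as F

logits_per_text = torch.randn(4, 4)               # (num_texts, num_images)
labels = 2 * torch.eye(4) - torch.ones(4, 4)      # +1 on the diagonal, -1 elsewhere
loss = -F.logsigmoid(labels * logits_per_text).sum(dim=-1).mean()
print(loss)
```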
1430
+ @add_start_docstrings(
1431
+ """
1432
+ SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
1433
+ the patch tokens) e.g. for ImageNet.
1434
+ """,
1435
+ SIGLIP_START_DOCSTRING,
1436
+ )
1437
+ class SiglipForImageClassification(SiglipPreTrainedModel):
1438
+ main_input_name = "pixel_values"
1439
+
1440
+ def __init__(self, config: SiglipConfig) -> None:
1441
+ super().__init__(config)
1442
+
1443
+ self.num_labels = config.num_labels
1444
+
1445
+ # Create the vision model with proper attention
1446
+ # and take only vision_model submodule (for backward compatibility)
1447
+ vision_model = SiglipVisionModel._from_config(config.vision_config)
1448
+ self.vision_model = vision_model.vision_model
1449
+
1450
+ # Classifier head
1451
+ self.classifier = (
1452
+ nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
1453
+ )
1454
+
1455
+ # Initialize weights and apply final processing
1456
+ self.post_init()
1457
+
1458
+ @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1459
+ @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
1460
+ def forward(
1461
+ self,
1462
+ pixel_values: Optional[torch.Tensor] = None,
1463
+ labels: Optional[torch.Tensor] = None,
1464
+ output_attentions: Optional[bool] = None,
1465
+ output_hidden_states: Optional[bool] = None,
1466
+ return_dict: Optional[bool] = None,
1467
+ interpolate_pos_encoding: bool = False,
1468
+ ) -> Union[tuple, ImageClassifierOutput]:
1469
+ r"""
1470
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1471
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
1472
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1473
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1474
+
1475
+ Returns:
1476
+
1477
+ Examples:
1478
+
1479
+ ```python
1480
+ >>> from transformers import AutoImageProcessor, SiglipForImageClassification
1481
+ >>> import torch
1482
+ >>> from PIL import Image
1483
+ >>> import requests
1484
+
1485
+ >>> torch.manual_seed(3) # doctest: +IGNORE_RESULT
1486
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1487
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1488
+
1489
+ >>> # note: we are loading a `SiglipModel` from the hub here,
1490
+ >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
1491
+ >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
1492
+ >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")
1493
+
1494
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1495
+ >>> outputs = model(**inputs)
1496
+ >>> logits = outputs.logits
1497
+ >>> # model predicts one of the two classes
1498
+ >>> predicted_class_idx = logits.argmax(-1).item()
1499
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
1500
+ Predicted class: LABEL_1
1501
+ ```"""
1502
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1503
+ output_hidden_states = (
1504
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1505
+ )
1506
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1507
+
1508
+ outputs = self.vision_model(
1509
+ pixel_values,
1510
+ output_attentions=output_attentions,
1511
+ output_hidden_states=output_hidden_states,
1512
+ return_dict=return_dict,
1513
+ interpolate_pos_encoding=interpolate_pos_encoding,
1514
+ )
1515
+
1516
+ sequence_output = outputs[0]
1517
+
1518
+ # average pool the patch tokens
1519
+ sequence_output = torch.mean(sequence_output, dim=1)
1520
+ # apply classifier
1521
+ logits = self.classifier(sequence_output)
1522
+
1523
+ loss = None
1524
+ if labels is not None:
1525
+ # move labels to correct device to enable model parallelism
1526
+ labels = labels.to(logits.device)
1527
+ if self.config.problem_type is None:
1528
+ if self.num_labels == 1:
1529
+ self.config.problem_type = "regression"
1530
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1531
+ self.config.problem_type = "single_label_classification"
1532
+ else:
1533
+ self.config.problem_type = "multi_label_classification"
1534
+
1535
+ if self.config.problem_type == "regression":
1536
+ loss_fct = MSELoss()
1537
+ if self.num_labels == 1:
1538
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1539
+ else:
1540
+ loss = loss_fct(logits, labels)
1541
+ elif self.config.problem_type == "single_label_classification":
1542
+ loss_fct = CrossEntropyLoss()
1543
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1544
+ elif self.config.problem_type == "multi_label_classification":
1545
+ loss_fct = BCEWithLogitsLoss()
1546
+ loss = loss_fct(logits, labels)
1547
+
1548
+ if not return_dict:
1549
+ output = (logits,) + outputs[2:]
1550
+ return ((loss,) + output) if loss is not None else output
1551
+
1552
+ return ImageClassifierOutput(
1553
+ loss=loss,
1554
+ logits=logits,
1555
+ hidden_states=outputs.hidden_states,
1556
+ attentions=outputs.attentions,
1557
+ )
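The classification head above simply mean-pools the patch tokens (SigLIP's vision tower has no CLS token) and applies a linear layer over the labels. A minimal sketch with hypothetical sizes:

```python
# Minimal sketch (hypothetical sizes) of the SiglipForImageClassification path above:
# mean-pool the patch tokens, then project to num_labels logits.
import torch
import torch.nn as nn

hidden, num_labels = 768, 10
classifier = nn.Linear(hidden, num_labels)

sequence_output = torch.randn(2, 196, hidden)     # vision tower output
logits = classifier(sequence_output.mean(dim=1))
print(logits.shape)  # torch.Size([2, 10])
```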
modeling/siglip/processing_siglip.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Image/Text processor class for SigLIP.
6
+ """
7
+
8
+ from typing import List, Optional, Union
9
+
10
+ from transformers.feature_extraction_utils import BatchFeature
11
+ from transformers.image_utils import ImageInput
12
+ from transformers.processing_utils import ProcessorMixin
13
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
14
+ from transformers.utils import TensorType
15
+
16
+
17
+ class SiglipProcessor(ProcessorMixin):
18
+ r"""
19
+ Constructs a Siglip processor which wraps a Siglip image processor and a Siglip tokenizer into a single processor.
20
+
21
+ [`SiglipProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`SiglipTokenizer`]. See the
22
+ [`~SiglipProcessor.__call__`] and [`~SiglipProcessor.decode`] for more information.
23
+
24
+ Args:
25
+ image_processor ([`SiglipImageProcessor`]):
26
+ The image processor is a required input.
27
+ tokenizer ([`SiglipTokenizer`]):
28
+ The tokenizer is a required input.
29
+ """
30
+
31
+ attributes = ["image_processor", "tokenizer"]
32
+ image_processor_class = "SiglipImageProcessor"
33
+ tokenizer_class = "SiglipTokenizer"
34
+
35
+ def __init__(self, image_processor, tokenizer):
36
+ super().__init__(image_processor, tokenizer)
37
+
38
+ def __call__(
39
+ self,
40
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
41
+ images: ImageInput = None,
42
+ padding: Union[bool, str, PaddingStrategy] = False,
43
+ truncation: Union[bool, str, TruncationStrategy] = None,
44
+ max_length: int = None,
45
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
46
+ ) -> BatchFeature:
47
+ """
48
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
49
+ and `kwargs` arguments to SiglipTokenizer's [`~SiglipTokenizer.__call__`] if `text` is not `None` to encode
50
+ the text. To prepare the image(s), this method forwards the `images` argument to
51
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
52
+ of the above two methods for more information.
53
+
54
+ Args:
55
+ text (`str`, `List[str]`, `List[List[str]]`):
56
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
57
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
58
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
59
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
60
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
61
+ tensor. Both channels-first and channels-last formats are supported.
62
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
63
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
64
+ index) among:
65
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
66
+ sequence is provided).
67
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
68
+ acceptable input length for the model if that argument is not provided.
69
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
70
+ lengths).
71
+ max_length (`int`, *optional*):
72
+ Maximum length of the returned list and optionally padding length (see above).
73
+ truncation (`bool`, *optional*):
74
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
75
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
76
+ If set, will return tensors of a particular framework. Acceptable values are:
77
+
78
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
79
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
80
+ - `'np'`: Return NumPy `np.ndarray` objects.
81
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
82
+
83
+ Returns:
84
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
85
+
86
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
87
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
88
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
89
+ `None`).
90
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
91
+ """
92
+
93
+ if text is None and images is None:
94
+ raise ValueError("You have to specify either text or images. Both cannot be None.")
95
+
96
+ if text is not None:
97
+ encoding = self.tokenizer(
98
+ text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
99
+ )
100
+
101
+ if images is not None:
102
+ image_features = self.image_processor(images, return_tensors=return_tensors)
103
+
104
+ if text is not None and images is not None:
105
+ encoding["pixel_values"] = image_features.pixel_values
106
+ return encoding
107
+ elif text is not None:
108
+ return encoding
109
+ else:
110
+ return BatchFeature(data=dict(**image_features), tensor_type=return_tensors)
111
+
112
+ def decode(self, *args, **kwargs):
113
+ """
114
+ This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
115
+ the docstring of this method for more information.
116
+ """
117
+ return self.tokenizer.decode(*args, **kwargs)
118
+
119
+ def batch_decode(self, *args, **kwargs):
120
+ """
121
+ This method forwards all its arguments to SiglipTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
122
+ refer to the docstring of this method for more information.
123
+ """
124
+ return self.tokenizer.batch_decode(*args, **kwargs)
125
+
126
+ @property
127
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip, T5->Siglip
128
+ def model_input_names(self):
129
+ tokenizer_input_names = self.tokenizer.model_input_names
130
+ image_processor_input_names = self.image_processor.model_input_names
131
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
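A minimal usage sketch for the processor above (not part of the committed diff). It assumes a local SentencePiece file `spiece.model`, uses one of the repo's `test_images/`, and needs the sentencepiece and protobuf backends installed; paths and defaults are illustrative assumptions rather than values taken from this commit.

# Hypothetical sketch: wire the vendored image processor and tokenizer into
# SiglipProcessor and encode one caption plus one image in a single call.
from PIL import Image

from modeling.siglip.image_processing_siglip import SiglipImageProcessor
from modeling.siglip.tokenization_siglip import SiglipTokenizer
from modeling.siglip.processing_siglip import SiglipProcessor

image_processor = SiglipImageProcessor()                # default resize/normalize settings (assumed)
tokenizer = SiglipTokenizer(vocab_file="spiece.model")  # assumed local SentencePiece vocab
processor = SiglipProcessor(image_processor, tokenizer)

inputs = processor(
    text=["a photo of a cat"],
    images=Image.open("test_images/meme.jpg").convert("RGB"),
    padding="max_length",
    return_tensors="pt",
)
# __call__ returns a BatchFeature: input_ids from the tokenizer, plus
# pixel_values whenever `images` is not None.
print(inputs.input_ids.shape, inputs.pixel_values.shape)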
modeling/siglip/tokenization_siglip.py ADDED
@@ -0,0 +1,364 @@
1
+ # Copyright 2024 The HuggingFace Inc. team.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Tokenization class for SigLIP model."""
5
+
6
+ import os
7
+ import re
8
+ import string
9
+ import warnings
10
+ from shutil import copyfile
11
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
12
+
13
+ import sentencepiece as spm
14
+
15
+ from transformers.convert_slow_tokenizer import import_protobuf
16
+ from transformers.tokenization_utils import PreTrainedTokenizer
17
+ from transformers.tokenization_utils_base import AddedToken
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from transformers.tokenization_utils_base import TextInput
22
+ from transformers.utils import logging, requires_backends
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
28
+
29
+
30
+ SPIECE_UNDERLINE = "▁"
31
+
32
+
33
+ class SiglipTokenizer(PreTrainedTokenizer):
34
+ """
35
+ Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
36
+
37
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
38
+ this superclass for more information regarding those methods.
39
+
40
+ Args:
41
+ vocab_file (`str`):
42
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
43
+ contains the vocabulary necessary to instantiate a tokenizer.
44
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
45
+ The end of sequence token.
46
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
47
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
48
+ token instead.
49
+ pad_token (`str`, *optional*, defaults to `"</s>"`):
50
+ The token used for padding, for example when batching sequences of different lengths.
51
+ additional_special_tokens (`List[str]`, *optional*):
52
+ Additional special tokens used by the tokenizer.
53
+ sp_model_kwargs (`dict`, *optional*):
54
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
55
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
56
+ to set:
57
+
58
+ - `enable_sampling`: Enable subword regularization.
59
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
60
+
61
+ - `nbest_size = {0,1}`: No sampling is performed.
62
+ - `nbest_size > 1`: samples from the nbest_size results.
63
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
64
+ using forward-filtering-and-backward-sampling algorithm.
65
+
66
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
67
+ BPE-dropout.
68
+ model_max_length (`int`, *optional*, defaults to 64):
69
+ The maximum length (in number of tokens) for model inputs.
70
+ do_lower_case (`bool`, *optional*, defaults to `True`):
71
+ Whether or not to lowercase the input when tokenizing.
72
+ """
73
+
74
+ vocab_files_names = VOCAB_FILES_NAMES
75
+ model_input_names = ["input_ids", "attention_mask"]
76
+
77
+ def __init__(
78
+ self,
79
+ vocab_file,
80
+ eos_token="</s>",
81
+ unk_token="<unk>",
82
+ pad_token="</s>",
83
+ additional_special_tokens=None,
84
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
85
+ model_max_length=64,
86
+ do_lower_case=True,
87
+ **kwargs,
88
+ ) -> None:
89
+ requires_backends(self, "protobuf")
90
+
91
+ pad_token = (
92
+ AddedToken(pad_token, rstrip=True, lstrip=True, normalized=False, special=True)
93
+ if isinstance(pad_token, str)
94
+ else pad_token
95
+ )
96
+ unk_token = (
97
+ AddedToken(unk_token, rstrip=True, lstrip=True, normalized=False, special=True)
98
+ if isinstance(unk_token, str)
99
+ else unk_token
100
+ )
101
+ eos_token = (
102
+ AddedToken(eos_token, rstrip=True, lstrip=True, normalized=False, special=True)
103
+ if isinstance(eos_token, str)
104
+ else eos_token
105
+ )
106
+
107
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
108
+
109
+ self.do_lower_case = do_lower_case
110
+ self.vocab_file = vocab_file
111
+
112
+ self.sp_model = self.get_spm_processor()
113
+ self.vocab_file = vocab_file
114
+
115
+ super().__init__(
116
+ eos_token=eos_token,
117
+ unk_token=unk_token,
118
+ pad_token=pad_token,
119
+ additional_special_tokens=additional_special_tokens,
120
+ sp_model_kwargs=self.sp_model_kwargs,
121
+ model_max_length=model_max_length,
122
+ do_lower_case=do_lower_case,
123
+ **kwargs,
124
+ )
125
+
126
+ def get_spm_processor(self):
127
+ tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
128
+ with open(self.vocab_file, "rb") as f:
129
+ sp_model = f.read()
130
+ model_pb2 = import_protobuf()
131
+ model = model_pb2.ModelProto.FromString(sp_model)
132
+ normalizer_spec = model_pb2.NormalizerSpec()
133
+ normalizer_spec.add_dummy_prefix = False
134
+ model.normalizer_spec.MergeFrom(normalizer_spec)
135
+ sp_model = model.SerializeToString()
136
+ tokenizer.LoadFromSerializedProto(sp_model)
137
+ return tokenizer
138
+
139
+ @property
140
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.vocab_size
141
+ def vocab_size(self):
142
+ return self.sp_model.get_piece_size()
143
+
144
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab
145
+ def get_vocab(self):
146
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
147
+ vocab.update(self.added_tokens_encoder)
148
+ return vocab
149
+
150
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_special_tokens_mask
151
+ def get_special_tokens_mask(
152
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
153
+ ) -> List[int]:
154
+ """
155
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
156
+ special tokens using the tokenizer `prepare_for_model` method.
157
+
158
+ Args:
159
+ token_ids_0 (`List[int]`):
160
+ List of IDs.
161
+ token_ids_1 (`List[int]`, *optional*):
162
+ Optional second list of IDs for sequence pairs.
163
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
164
+ Whether or not the token list is already formatted with special tokens for the model.
165
+
166
+ Returns:
167
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
168
+ """
169
+ if already_has_special_tokens:
170
+ return super().get_special_tokens_mask(
171
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
172
+ )
173
+
174
+ # normal case: some special tokens
175
+ if token_ids_1 is None:
176
+ return ([0] * len(token_ids_0)) + [1]
177
+ return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
178
+
179
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._add_eos_if_not_present
180
+ def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
181
+ """Do not add eos again if user already added it."""
182
+ if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
183
+ warnings.warn(
184
+ f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
185
+ " eos tokens being added."
186
+ )
187
+ return token_ids
188
+ else:
189
+ return token_ids + [self.eos_token_id]
190
+
191
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.create_token_type_ids_from_sequences
192
+ def create_token_type_ids_from_sequences(
193
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
194
+ ) -> List[int]:
195
+ """
196
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
197
+ use of token type ids, therefore a list of zeros is returned.
198
+
199
+ Args:
200
+ token_ids_0 (`List[int]`):
201
+ List of IDs.
202
+ token_ids_1 (`List[int]`, *optional*):
203
+ Optional second list of IDs for sequence pairs.
204
+
205
+ Returns:
206
+ `List[int]`: List of zeros.
207
+ """
208
+ eos = [self.eos_token_id]
209
+
210
+ if token_ids_1 is None:
211
+ return len(token_ids_0 + eos) * [0]
212
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
213
+
214
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.build_inputs_with_special_tokens
215
+ def build_inputs_with_special_tokens(
216
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
217
+ ) -> List[int]:
218
+ """
219
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
220
+ adding special tokens. A sequence has the following format:
221
+
222
+ - single sequence: `X </s>`
223
+ - pair of sequences: `A </s> B </s>`
224
+
225
+ Args:
226
+ token_ids_0 (`List[int]`):
227
+ List of IDs to which the special tokens will be added.
228
+ token_ids_1 (`List[int]`, *optional*):
229
+ Optional second list of IDs for sequence pairs.
230
+
231
+ Returns:
232
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
233
+ """
234
+ token_ids_0 = self._add_eos_if_not_present(token_ids_0)
235
+ if token_ids_1 is None:
236
+ return token_ids_0
237
+ else:
238
+ token_ids_1 = self._add_eos_if_not_present(token_ids_1)
239
+ return token_ids_0 + token_ids_1
240
+
241
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.__getstate__
242
+ def __getstate__(self):
243
+ state = self.__dict__.copy()
244
+ state["sp_model"] = None
245
+ return state
246
+
247
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.__setstate__
248
+ def __setstate__(self, d):
249
+ self.__dict__ = d
250
+
251
+ # for backward compatibility
252
+ if not hasattr(self, "sp_model_kwargs"):
253
+ self.sp_model_kwargs = {}
254
+
255
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
256
+ self.sp_model.Load(self.vocab_file)
257
+
258
+ def remove_punctuation(self, text: str) -> str:
259
+ return text.translate(str.maketrans("", "", string.punctuation))
260
+
261
+ # source: https://github.com/google-research/big_vision/blob/3b8e5ab6ad4f96e32b32826f9e1b8fd277914f9c/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94
262
+ def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
263
+ """Returns canonicalized `text` (puncuation removed).
264
+
265
+ Args:
266
+ text (`str`):
267
+ String to be canonicalized.
268
+ keep_punctuation_exact_string (`str`, *optional*):
269
+ If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
270
+ (but will still remove '{' and '}' that appear separately).
271
+ """
272
+ if keep_punctuation_exact_string:
273
+ text = keep_punctuation_exact_string.join(
274
+ self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
275
+ )
276
+ else:
277
+ text = self.remove_punctuation(text)
278
+ text = re.sub(r"\s+", " ", text)
279
+ text = text.strip()
280
+
281
+ return text
282
+
283
+ def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
284
+ """
285
+ Converts a string to a list of tokens.
286
+ """
287
+ tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
288
+
289
+ if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
290
+ tokens = tokens[1:]
291
+ return tokens
292
+
293
+ @property
294
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.unk_token_length
295
+ def unk_token_length(self):
296
+ return len(self.sp_model.encode(str(self.unk_token)))
297
+
298
+ def _tokenize(self, text, **kwargs):
299
+ """
300
+ Returns a tokenized string.
301
+
302
+ We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
303
+ SPIECE_UNDERLINE.
304
+
305
+ For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.
306
+
307
+ Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
308
+ `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
309
+ """
310
+ text = self.canonicalize_text(text, keep_punctuation_exact_string=None)
311
+ tokens = self.sp_model.encode(text, out_type=str)
312
+
313
+ # 1. Encode string + prefix ex: "<unk> Hey"
314
+ tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
315
+ # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
316
+ return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
317
+
318
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._convert_token_to_id
319
+ def _convert_token_to_id(self, token):
320
+ """Converts a token (str) in an id using the vocab."""
321
+ return self.sp_model.piece_to_id(token)
322
+
323
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._convert_id_to_token
324
+ def _convert_id_to_token(self, index):
325
+ """Converts an index (integer) in a token (str) using the vocab."""
326
+ token = self.sp_model.IdToPiece(index)
327
+ return token
328
+
329
+ def convert_tokens_to_string(self, tokens):
330
+ """Converts a sequence of tokens (string) in a single string."""
331
+ current_sub_tokens = []
332
+ out_string = ""
333
+ prev_is_special = False
334
+ for token in tokens:
335
+ # make sure that special tokens are not decoded using sentencepiece model
336
+ if token in self.all_special_tokens:
337
+ if not prev_is_special:
338
+ out_string += " "
339
+ out_string += self.sp_model.decode(current_sub_tokens) + token
340
+ prev_is_special = True
341
+ current_sub_tokens = []
342
+ else:
343
+ current_sub_tokens.append(token)
344
+ prev_is_special = False
345
+ out_string += self.sp_model.decode(current_sub_tokens)
346
+ return out_string.strip()
347
+
348
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.save_vocabulary
349
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
350
+ if not os.path.isdir(save_directory):
351
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
352
+ return
353
+ out_vocab_file = os.path.join(
354
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
355
+ )
356
+
357
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
358
+ copyfile(self.vocab_file, out_vocab_file)
359
+ elif not os.path.isfile(self.vocab_file):
360
+ with open(out_vocab_file, "wb") as fi:
361
+ content_spiece_model = self.sp_model.serialized_model_proto()
362
+ fi.write(content_spiece_model)
363
+
364
+ return (out_vocab_file,)
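A short, illustrative sketch of the tokenizer's preprocessing path documented above (not part of the diff; `spiece.model` is an assumed local SentencePiece file, the sentencepiece/protobuf backends are required, and the exact pieces depend on that vocabulary).

# Hypothetical sketch: canonicalize_text strips punctuation and collapses
# whitespace before SentencePiece encoding; a plain call then appends </s>
# via build_inputs_with_special_tokens.
from modeling.siglip.tokenization_siglip import SiglipTokenizer

tokenizer = SiglipTokenizer(vocab_file="spiece.model")  # assumed path

print(tokenizer.canonicalize_text("A photo, of a cat!!"))
# -> "A photo of a cat"  (punctuation removed, whitespace collapsed)
print(tokenizer.canonicalize_text("keep {} here", keep_punctuation_exact_string="{}"))
# -> "keep {} here"      (the exact string "{}" survives)

ids = tokenizer("a photo of a cat")["input_ids"]
assert ids[-1] == tokenizer.eos_token_id  # single sequences are encoded as `X </s>`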
run.err ADDED
@@ -0,0 +1,150 @@
1
+ W1025 21:14:01.211000 2808260 site-packages/torch/distributed/run.py:793]
2
+ W1025 21:14:01.211000 2808260 site-packages/torch/distributed/run.py:793] *****************************************
3
+ W1025 21:14:01.211000 2808260 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ W1025 21:14:01.211000 2808260 site-packages/torch/distributed/run.py:793] *****************************************
5
+ wandb: WARNING `resume` will be ignored since W&B syncing is set to `offline`. Starting a new run with run id h200-zebra-cot-20251025_211359-run0.
6
+ [rank2]:[W1025 21:14:15.369652204 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
7
+ [rank7]:[W1025 21:14:15.502472578 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
8
+ [rank5]:[W1025 21:14:15.521361526 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
9
+ [rank4]:[W1025 21:14:15.539230512 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
10
+ [rank1]:[W1025 21:14:15.559660446 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
11
+ [rank3]:[W1025 21:14:15.636618409 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
12
+ [rank6]:[W1025 21:14:15.814060558 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
13
+ wandb: Tracking run with wandb version 0.22.2
14
+ wandb: W&B syncing is set to `offline` in this directory. Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
15
+ wandb: Run data is saved locally in /scratch/by2593/Bagel-Zebra-CoT-origin/wandb/offline-run-20251025_211414-h200-zebra-cot-20251025_211359-run0
16
+ wandb: Detected [huggingface_hub.inference] in use.
17
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
18
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
19
+ [rank0]:[W1025 21:14:16.181889866 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
20
+ [2025-10-25 21:14:20] Training arguments TrainingArguments(visual_gen=True, visual_und=True, results_dir='results/', checkpoint_dir='results/checkpoints_smm_semantic_part1_v1_origin/', wandb_project='zebra-cot', wandb_name='h200-zebra-cot-20251025_211359', wandb_runid='0', wandb_resume='allow', wandb_offline=True, global_seed=4396, auto_resume=True, resume_from='/scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8', resume_model_only=True, finetune_from_ema=False, finetune_from_hf=True, log_every=1, save_every=50, total_steps=5000, warmup_steps=50, lr_scheduler='cosine', lr=2e-05, min_lr=1e-06, beta1=0.9, beta2=0.95, eps=1e-08, ema=0.9999, max_grad_norm=1.0, timestep_shift=1.0, mse_weight=1.0, ce_weight=1.0, ce_loss_reweighting=False, expected_num_tokens=40000, num_replicate=1, num_shard=8, sharding_strategy='HYBRID_SHARD', backward_prefetch='BACKWARD_PRE', cpu_offload=True, freeze_llm=False, freeze_vit=False, freeze_vae=True, freeze_und=False, copy_init_moe=True, use_flex=False)
21
+ [2025-10-25 21:14:20] Model arguments ModelArguments(model_path='/scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8', llm_path='hf/Qwen2.5-0.5B-Instruct/', llm_qk_norm=True, tie_word_embeddings=False, layer_module='Qwen2MoTDecoderLayer', vae_path='flux/vae/ae.safetensors', vit_path='hf/siglip-so400m-14-980-flash-attn2-navit/', max_latent_size=64, latent_patch_size=2, vit_patch_size=14, vit_max_num_patch_per_side=70, connector_act='gelu_pytorch_tanh', interpolate_pos=False, vit_select_layer=-2, vit_rope=False, text_cond_dropout_prob=0.1, vae_cond_dropout_prob=0.3, vit_cond_dropout_prob=0.3)
22
+ [2025-10-25 21:14:20] Data arguments DataArguments(dataset_config_file='./data/configs/example_smm_semantic.yaml', prefetch_factor=2, num_workers=1, max_num_tokens_per_sample=40000, max_num_tokens=40000, prefer_buffer_before=10000, max_buffer_size=50, data_seed=42)
23
+ [2025-10-25 21:16:50] Loading checkpoint from /scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8.
24
+ [2025-10-25 21:18:10] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed', 'vit_pos_embed.pos_embed'], unexpected_keys=[])
25
+ [2025-10-25 21:18:10] replicaing ema model from /scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8/model_bf16.safetensors.
26
+ [2025-10-25 21:18:20] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed', 'vit_pos_embed.pos_embed'], unexpected_keys=[])
27
+ [2025-10-25 21:18:51] Training for 5000 steps, starting at 0...
28
+ [2025-10-25 21:20:20] (step=0000000) Train Loss mse: 0.0185, Train Loss ce: 1.8625, Train Steps/Sec: 0.01,
29
+ [2025-10-25 21:20:57] (step=0000001) Train Loss mse: 0.0168, Train Loss ce: 1.8560, Train Steps/Sec: 0.03,
30
+ [2025-10-25 21:21:32] (step=0000002) Train Loss mse: 0.0208, Train Loss ce: 1.8139, Train Steps/Sec: 0.03,
31
+ [2025-10-25 21:22:13] (step=0000003) Train Loss mse: 0.0200, Train Loss ce: 1.6772, Train Steps/Sec: 0.02,
32
+ [2025-10-25 21:22:49] (step=0000004) Train Loss mse: 0.0164, Train Loss ce: 1.7684, Train Steps/Sec: 0.03,
33
+ [2025-10-25 21:23:31] (step=0000005) Train Loss mse: 0.0199, Train Loss ce: 1.8439, Train Steps/Sec: 0.02,
34
+ [2025-10-25 21:24:04] (step=0000006) Train Loss mse: 0.0166, Train Loss ce: 1.6152, Train Steps/Sec: 0.03,
35
+ [2025-10-25 21:24:40] (step=0000007) Train Loss mse: 0.0181, Train Loss ce: 1.7539, Train Steps/Sec: 0.03,
36
+ [2025-10-25 21:25:15] (step=0000008) Train Loss mse: 0.0164, Train Loss ce: 1.7400, Train Steps/Sec: 0.03,
37
+ [2025-10-25 21:25:49] (step=0000009) Train Loss mse: 0.0167, Train Loss ce: 1.8076, Train Steps/Sec: 0.03,
38
+ [2025-10-25 21:26:25] (step=0000010) Train Loss mse: 0.0233, Train Loss ce: 1.4616, Train Steps/Sec: 0.03,
39
+ [2025-10-25 21:26:56] (step=0000011) Train Loss mse: 0.0168, Train Loss ce: 1.6259, Train Steps/Sec: 0.03,
40
+ [2025-10-25 21:27:37] (step=0000012) Train Loss mse: 0.0170, Train Loss ce: 1.5824, Train Steps/Sec: 0.02,
41
+ [2025-10-25 21:28:08] (step=0000013) Train Loss mse: 0.0189, Train Loss ce: 1.5811, Train Steps/Sec: 0.03,
42
+ [2025-10-25 21:28:42] (step=0000014) Train Loss mse: 0.0221, Train Loss ce: 1.2260, Train Steps/Sec: 0.03,
43
+ [2025-10-25 21:29:16] (step=0000015) Train Loss mse: 0.0140, Train Loss ce: 1.1394, Train Steps/Sec: 0.03,
44
+ [2025-10-25 21:29:49] (step=0000016) Train Loss mse: 0.0163, Train Loss ce: 1.1381, Train Steps/Sec: 0.03,
45
+ [2025-10-25 21:30:26] (step=0000017) Train Loss mse: 0.0229, Train Loss ce: 1.0493, Train Steps/Sec: 0.03,
46
+ [2025-10-25 21:31:02] (step=0000018) Train Loss mse: 0.0169, Train Loss ce: 1.0484, Train Steps/Sec: 0.03,
47
+ [2025-10-25 21:31:43] (step=0000019) Train Loss mse: 0.0187, Train Loss ce: 0.5945, Train Steps/Sec: 0.02,
48
+ [2025-10-25 21:32:19] (step=0000020) Train Loss mse: 0.0158, Train Loss ce: 0.6128, Train Steps/Sec: 0.03,
49
+ [2025-10-25 21:33:00] (step=0000021) Train Loss mse: 0.0157, Train Loss ce: 0.4668, Train Steps/Sec: 0.02,
50
+ [2025-10-25 21:33:33] (step=0000022) Train Loss mse: 0.0181, Train Loss ce: 0.4042, Train Steps/Sec: 0.03,
51
+ [2025-10-25 21:34:07] (step=0000023) Train Loss mse: 0.0209, Train Loss ce: 0.2930, Train Steps/Sec: 0.03,
52
+ [2025-10-25 21:34:40] (step=0000024) Train Loss mse: 0.0190, Train Loss ce: 0.2934, Train Steps/Sec: 0.03,
53
+ [2025-10-25 21:35:16] (step=0000025) Train Loss mse: 0.0144, Train Loss ce: 0.2189, Train Steps/Sec: 0.03,
54
+ [2025-10-25 21:35:49] (step=0000026) Train Loss mse: 0.0185, Train Loss ce: 0.1414, Train Steps/Sec: 0.03,
55
+ [2025-10-25 21:36:22] (step=0000027) Train Loss mse: 0.0166, Train Loss ce: 0.1090, Train Steps/Sec: 0.03,
56
+ [2025-10-25 21:36:59] (step=0000028) Train Loss mse: 0.0202, Train Loss ce: 0.1350, Train Steps/Sec: 0.03,
57
+ [2025-10-25 21:37:36] (step=0000029) Train Loss mse: 0.0175, Train Loss ce: 0.1263, Train Steps/Sec: 0.03,
58
+ [2025-10-25 21:38:11] (step=0000030) Train Loss mse: 0.0165, Train Loss ce: 0.0860, Train Steps/Sec: 0.03,
59
+ [2025-10-25 21:38:47] (step=0000031) Train Loss mse: 0.0169, Train Loss ce: 0.0864, Train Steps/Sec: 0.03,
60
+ [2025-10-25 21:39:20] (step=0000032) Train Loss mse: 0.0218, Train Loss ce: 0.0792, Train Steps/Sec: 0.03,
61
+ [2025-10-25 21:39:57] (step=0000033) Train Loss mse: 0.0203, Train Loss ce: 0.0852, Train Steps/Sec: 0.03,
62
+ [2025-10-25 21:40:30] (step=0000034) Train Loss mse: 0.0200, Train Loss ce: 0.0734, Train Steps/Sec: 0.03,
63
+ [2025-10-25 21:41:07] (step=0000035) Train Loss mse: 0.0166, Train Loss ce: 0.0830, Train Steps/Sec: 0.03,
64
+ [2025-10-25 21:41:42] (step=0000036) Train Loss mse: 0.0167, Train Loss ce: 0.0776, Train Steps/Sec: 0.03,
65
+ [2025-10-25 21:42:14] (step=0000037) Train Loss mse: 0.0175, Train Loss ce: 0.0556, Train Steps/Sec: 0.03,
66
+ [2025-10-25 21:42:51] (step=0000038) Train Loss mse: 0.0176, Train Loss ce: 0.0520, Train Steps/Sec: 0.03,
67
+ [2025-10-25 21:43:23] (step=0000039) Train Loss mse: 0.0144, Train Loss ce: 0.0607, Train Steps/Sec: 0.03,
68
+ [2025-10-25 21:43:59] (step=0000040) Train Loss mse: 0.0151, Train Loss ce: 0.0683, Train Steps/Sec: 0.03,
69
+ [2025-10-25 21:44:32] (step=0000041) Train Loss mse: 0.0180, Train Loss ce: 0.0456, Train Steps/Sec: 0.03,
70
+ [2025-10-25 21:45:08] (step=0000042) Train Loss mse: 0.0157, Train Loss ce: 0.0620, Train Steps/Sec: 0.03,
71
+ [2025-10-25 21:45:51] (step=0000043) Train Loss mse: 0.0167, Train Loss ce: 0.0552, Train Steps/Sec: 0.02,
72
+ [2025-10-25 21:46:28] (step=0000044) Train Loss mse: 0.0143, Train Loss ce: 0.0522, Train Steps/Sec: 0.03,
73
+ [2025-10-25 21:47:08] (step=0000045) Train Loss mse: 0.0159, Train Loss ce: 0.0494, Train Steps/Sec: 0.02,
74
+ [2025-10-25 21:47:41] (step=0000046) Train Loss mse: 0.0160, Train Loss ce: 0.0484, Train Steps/Sec: 0.03,
75
+ [2025-10-25 21:48:14] (step=0000047) Train Loss mse: 0.0187, Train Loss ce: 0.0599, Train Steps/Sec: 0.03,
76
+ [2025-10-25 21:48:52] (step=0000048) Train Loss mse: 0.0173, Train Loss ce: 0.0629, Train Steps/Sec: 0.03,
77
+ [2025-10-25 21:49:26] (step=0000049) Train Loss mse: 0.0167, Train Loss ce: 0.0466, Train Steps/Sec: 0.03,
78
+ [2025-10-25 21:50:00] (step=0000050) Train Loss mse: 0.0150, Train Loss ce: 0.0540, Train Steps/Sec: 0.03,
79
+ [2025-10-25 21:50:01] Saving checkpoint to results/checkpoints_smm_semantic_part1_v1_origin/0000050.
80
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
81
+ warnings.warn(
82
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
83
+ warnings.warn(
84
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
85
+ warnings.warn(
86
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
87
+ warnings.warn(
88
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
89
+ warnings.warn(
90
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
91
+ warnings.warn(
92
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
93
+ warnings.warn(
94
+ /scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
95
+ warnings.warn(
96
+ [2025-10-25 21:55:05] Sorted checkpoint directories: ['0000050']
97
+ [2025-10-25 21:55:40] (step=0000051) Train Loss mse: 0.0139, Train Loss ce: 0.0539, Train Steps/Sec: 0.00,
98
+ [2025-10-25 21:56:13] (step=0000052) Train Loss mse: 0.0176, Train Loss ce: 0.0495, Train Steps/Sec: 0.03,
99
+ [2025-10-25 21:56:51] (step=0000053) Train Loss mse: 0.0168, Train Loss ce: 0.0485, Train Steps/Sec: 0.03,
100
+ [2025-10-25 21:57:23] (step=0000054) Train Loss mse: 0.0151, Train Loss ce: 0.0446, Train Steps/Sec: 0.03,
101
+ [2025-10-25 21:58:00] (step=0000055) Train Loss mse: 0.0144, Train Loss ce: 0.0490, Train Steps/Sec: 0.03,
102
+ [2025-10-25 21:58:37] (step=0000056) Train Loss mse: 0.0143, Train Loss ce: 0.0461, Train Steps/Sec: 0.03,
103
+ [2025-10-25 21:59:11] (step=0000057) Train Loss mse: 0.0152, Train Loss ce: 0.0459, Train Steps/Sec: 0.03,
104
+ [2025-10-25 21:59:48] (step=0000058) Train Loss mse: 0.0152, Train Loss ce: 0.0402, Train Steps/Sec: 0.03,
105
+ [2025-10-25 22:00:22] (step=0000059) Train Loss mse: 0.0145, Train Loss ce: 0.0566, Train Steps/Sec: 0.03,
106
+ [2025-10-25 22:00:59] (step=0000060) Train Loss mse: 0.0174, Train Loss ce: 0.0509, Train Steps/Sec: 0.03,
107
+ [rank6]: Traceback (most recent call last):
108
+ [rank6]: File "/scratch/by2593/Bagel-Zebra-CoT-origin/train/pretrain_unified_navit.py", line 727, in <module>
109
+ [rank6]: main()
110
+ [rank6]: File "/scratch/by2593/Bagel-Zebra-CoT-origin/train/pretrain_unified_navit.py", line 609, in main
111
+ [rank6]: assert not training_args.visual_und
112
+ [rank6]: AssertionError
113
+ [rank6]:[W1025 22:01:04.973896433 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
114
+ W1025 22:01:11.227000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808294 closing signal SIGTERM
115
+ W1025 22:01:11.264000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808295 closing signal SIGTERM
116
+ W1025 22:01:11.265000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808296 closing signal SIGTERM
117
+ W1025 22:01:11.271000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808297 closing signal SIGTERM
118
+ W1025 22:01:11.314000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808298 closing signal SIGTERM
119
+ W1025 22:01:11.332000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808299 closing signal SIGTERM
120
+ W1025 22:01:11.357000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2808301 closing signal SIGTERM
121
+ E1025 22:01:37.654000 2808260 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 6 (pid: 2808300) of binary: /scratch/by2593/miniconda3/envs/bagel/bin/python3.10
122
+ Traceback (most recent call last):
123
+ File "/scratch/by2593/miniconda3/envs/bagel/bin/torchrun", line 7, in <module>
124
+ sys.exit(main())
125
+ File "/scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
126
+ return f(*args, **kwargs)
127
+ File "/scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
128
+ run(args)
129
+ File "/scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
130
+ elastic_launch(
131
+ File "/scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
132
+ return launch_agent(self._config, self._entrypoint, list(args))
133
+ File "/scratch/by2593/miniconda3/envs/bagel/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
134
+ raise ChildFailedError(
135
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
136
+ ============================================================
137
+ train/pretrain_unified_navit.py FAILED
138
+ ------------------------------------------------------------
139
+ Failures:
140
+ <NO_OTHER_FAILURES>
141
+ ------------------------------------------------------------
142
+ Root Cause (first observed failure):
143
+ [0]:
144
+ time : 2025-10-25_22:01:11
145
+ host : gh129.hpc.nyu.edu
146
+ rank : 6 (local_rank: 6)
147
+ exitcode : 1 (pid: 2808300)
148
+ error_file: <N/A>
149
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
150
+ ============================================================
run.out ADDED
@@ -0,0 +1,871 @@
1
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
2
+ rank-3 worker-0 dataset-block_dataset: resuming data at row#0
3
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
4
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
5
+ rank-6 worker-0 dataset-block_dataset: resuming data at row#0
6
+ rank-4 worker-0 dataset-block_dataset: resuming data at row#0
7
+ FullyShardedDataParallel(
8
+ (_fsdp_wrapped_module): Bagel(
9
+ (language_model): Qwen2ForCausalLM(
10
+ (model): Qwen2Model(
11
+ (embed_tokens): Embedding(152064, 3584)
12
+ (layers): ModuleList(
13
+ (0-27): 28 x FullyShardedDataParallel(
14
+ (_fsdp_wrapped_module): CheckpointWrapper(
15
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
16
+ (self_attn): PackedAttentionMoT(
17
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
18
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
19
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
20
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
21
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
22
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
23
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
24
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
25
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
26
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
27
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
28
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
29
+ )
30
+ (mlp): Qwen2MLP(
31
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
+ (act_fn): SiLU()
35
+ )
36
+ (mlp_moe_gen): Qwen2MLP(
37
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
38
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
39
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
40
+ (act_fn): SiLU()
41
+ )
42
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
43
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
44
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
45
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
+ )
47
+ )
48
+ )
49
+ )
50
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
51
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
52
+ (rotary_emb): Qwen2RotaryEmbedding()
53
+ )
54
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
55
+ )
56
+ (time_embedder): FullyShardedDataParallel(
57
+ (_fsdp_wrapped_module): TimestepEmbedder(
58
+ (mlp): Sequential(
59
+ (0): Linear(in_features=256, out_features=3584, bias=True)
60
+ (1): SiLU()
61
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
62
+ )
63
+ )
64
+ )
65
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
66
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
67
+ (latent_pos_embed): FullyShardedDataParallel(
68
+ (_fsdp_wrapped_module): PositionEmbedding()
69
+ )
70
+ (vit_model): SiglipVisionModel(
71
+ (vision_model): FullyShardedDataParallel(
72
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
73
+ (embeddings): SiglipVisionEmbeddings(
74
+ (position_embedding): Embedding(4900, 1152)
75
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
76
+ )
77
+ (encoder): SiglipEncoder(
78
+ (layers): ModuleList(
79
+ (0-25): 26 x FullyShardedDataParallel(
80
+ (_fsdp_wrapped_module): CheckpointWrapper(
81
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
82
+ (self_attn): SiglipFlashAttention2(
83
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
84
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
85
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
86
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
87
+ )
88
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
89
+ (mlp): SiglipMLP(
90
+ (activation_fn): PytorchGELUTanh()
91
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
92
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
93
+ )
94
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
95
+ )
96
+ )
97
+ )
98
+ )
99
+ )
100
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
101
+ )
102
+ )
103
+ )
104
+ (connector): FullyShardedDataParallel(
105
+ (_fsdp_wrapped_module): CheckpointWrapper(
106
+ (_checkpoint_wrapped_module): MLPconnector(
107
+ (activation_fn): PytorchGELUTanh()
108
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
109
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
110
+ )
111
+ )
112
+ )
113
+ (vit_pos_embed): FullyShardedDataParallel(
114
+ (_fsdp_wrapped_module): PositionEmbedding()
115
+ )
116
+ )
117
+ )
118
+ _flat_param True
119
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
+ time_embedder._fsdp_wrapped_module._flat_param True
148
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
149
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
150
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
172
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
173
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
174
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
175
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
176
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
177
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
178
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
179
+ Preparing Dataset block_dataset/block_dataset
180
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
181
+ rank-0 worker-0 dataset-block_dataset: resuming data at row#0
182
+ rank-7 worker-0 dataset-block_dataset: resuming data at row#0
183
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
184
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
185
+ rank-2 worker-0 dataset-block_dataset: resuming data at row#0
186
+ rank-5 worker-0 dataset-block_dataset: resuming data at row#0
187
+ {'block_dataset': {'dataset_names': ['block_dataset'], 'jsonl_path_list': ['/scratch/by2593/project/SMM/SMM_data/semantic_block_train_part1.jsonl'], 'num_used_data': 'None', 'image_prefix_dir': '/scratch/by2593/project/SMM/semantic_blocks_part1', 'image_transform_args': {'image_stride': 16, 'max_image_size': 512, 'min_image_size': 512}, 'vit_image_transform_args': {'image_stride': 14, 'max_image_size': 512, 'min_image_size': 512}, 'weight': 1.0, 'is_mandatory': True}}
188
+ rank-1 worker-0 dataset-block_dataset: resuming data at row#0
189
+ skip a sample with length 43202
190
+ skip a sample with length 48060
191
+ skip a sample with length 41094
192
+ skip a sample with length 43245
193
+ skip a sample with length 57756
194
+ skip a sample with length 41160
195
+ skip a sample with length 44611
196
+ skip a sample with length 41094
197
+ skip a sample with length 48060
198
+ skip a sample with length 50787
199
+ skip a sample with length 44611
200
+ skip a sample with length 43245
201
+ skip a sample with length 41106
202
+ skip a sample with length 41160
203
+ skip a sample with length 57756
204
+ skip a sample with length 42480
205
+ skip a sample with length 42486
206
+ skip a sample with length 42486
207
+ skip a sample with length 50787
208
+ skip a sample with length 43202
209
+ skip a sample with length 42480
210
+ block_dataset repeat in rank-3 worker-0
211
+ block_dataset repeat in rank-4 worker-0
212
+ block_dataset repeat in rank-6 worker-0
213
+ block_dataset repeat in rank-7 worker-0
214
+ block_dataset repeat in rank-0 worker-0
215
+ block_dataset repeat in rank-5 worker-0
216
+ block_dataset repeat in rank-2 worker-0
217
+ skip a sample with length 41106
218
+ skip a sample with length 48060
219
+ skip a sample with length 43202
220
+ block_dataset repeat in rank-1 worker-0
221
+ skip a sample with length 41094
222
+ skip a sample with length 57756
223
+ Yielding data with length 31517
224
+ skip a sample with length 43245
225
+ skip a sample with length 41160
226
+ skip a sample with length 44611
227
+ Yielding data with length 33637
228
+ Yielding data with length 33154
229
+ Yielding data with length 15542
230
+ skip a sample with length 50787
231
+ Yielding data with length 35486
232
+ Yielding data with length 12716
233
+ skip a sample with length 48060
234
+ skip a sample with length 41094
235
+ skip a sample with length 43245
236
+ skip a sample with length 41160
237
+ skip a sample with length 44611
238
+ Yielding data with length 26172
239
+ Yielding data with length 23933
240
+ skip a sample with length 41106
241
+ skip a sample with length 57756
242
+ Yielding data with length 32737
243
+ Yielding data with length 27691
244
+ Yielding data with length 31628
245
+ Yielding data with length 36149
246
+ skip a sample with length 42486
247
+ Yielding data with length 30708
248
+ Yielding data with length 13411
249
+ Yielding data with length 18973
250
+ Yielding data with length 27959
251
+ Yielding data with length 23821
252
+ skip a sample with length 50787
253
+ Yielding data with length 27474
254
+ Yielding data with length 7870
255
+ skip a sample with length 42486
256
+ Yielding data with length 37241
257
+ Yielding data with length 27998
258
+ Yielding data with length 13811
259
+ Yielding data with length 20795
260
+ Yielding data with length 32169
261
+ Yielding data with length 16921
262
+ Yielding data with length 16202
263
+ Yielding data with length 21081
264
+ Yielding data with length 21217
265
+ Yielding data with length 26994
266
+ Yielding data with length 17856
267
+ Yielding data with length 33309
268
+ Yielding data with length 31064
269
+ Yielding data with length 23492
270
+ Yielding data with length 20761
271
+ Yielding data with length 31378
272
+ Yielding data with length 23451
273
+ Yielding data with length 25220
274
+ Yielding data with length 26611
275
+ Yielding data with length 27250
276
+ Yielding data with length 35216
277
+ skip a sample with length 42480
278
+ Yielding data with length 13720
279
+ Yielding data with length 19578
280
+ Yielding data with length 25498
281
+ Yielding data with length 22109
282
+ Yielding data with length 19619
283
+ Yielding data with length 23415
284
+ Yielding data with length 30332
285
+ Yielding data with length 34858
286
+ block_dataset repeat in rank-0 worker-0
287
+ block_dataset repeat in rank-6 worker-0
288
+ block_dataset repeat in rank-5 worker-0
289
+ Yielding data with length 19720
290
+ Yielding data with length 25991
291
+ Yielding data with length 29387
292
+ Yielding data with length 21979
293
+ skip a sample with length 43202
294
+ skip a sample with length 41106
295
+ Yielding data with length 23402
296
+ Yielding data with length 22465
297
+ Yielding data with length 21998
298
+ Yielding data with length 25679
299
+ block_dataset repeat in rank-3 worker-0
300
+ Yielding data with length 17957
301
+ Yielding data with length 22013
302
+ Yielding data with length 20711
303
+ Yielding data with length 23461
304
+ Yielding data with length 24469
305
+ Yielding data with length 24915
306
+ Yielding data with length 27691
307
+ Yielding data with length 37262
308
+ skip a sample with length 43202
309
+ block_dataset repeat in rank-7 worker-0
310
+ block_dataset repeat in rank-1 worker-0
311
+ Yielding data with length 17288
312
+ Yielding data with length 20687
313
+ Yielding data with length 20361
314
+ Yielding data with length 28560
315
+ Yielding data with length 31247
316
+ Yielding data with length 17983
317
+ block_dataset repeat in rank-2 worker-0
318
+ Yielding data with length 27946
319
+ Yielding data with length 27631
320
+ skip a sample with length 41094
321
+ skip a sample with length 42480
322
+ Yielding data with length 10650
323
+ Yielding data with length 14641
324
+ Yielding data with length 23037
325
+ skip a sample with length 43245
326
+ Yielding data with length 16219
327
+ Yielding data with length 35530
328
+ Yielding data with length 16208
329
+ Yielding data with length 26188
330
+ Yielding data with length 27937
331
+ block_dataset repeat in rank-4 worker-0
332
+ Yielding data with length 11424
333
+ Yielding data with length 12453
334
+ Yielding data with length 16146
335
+ Yielding data with length 18287
336
+ Yielding data with length 20791
337
+ Yielding data with length 24236
338
+ Yielding data with length 25579
339
+ Yielding data with length 28956
340
+ Yielding data with length 14121
341
+ Yielding data with length 14781
342
+ Yielding data with length 15221
343
+ Yielding data with length 15921
344
+ skip a sample with length 41160
345
+ Yielding data with length 28466
346
+ skip a sample with length 48060
347
+ Yielding data with length 17646
348
+ Yielding data with length 31256
349
+ Yielding data with length 26792
350
+ Yielding data with length 17122
351
+ Yielding data with length 20057
352
+ skip a sample with length 48060
353
+ Yielding data with length 31691
354
+ Yielding data with length 32761
355
+ Yielding data with length 23701
356
+ Yielding data with length 23722
357
+ Yielding data with length 27340
358
+ Yielding data with length 33869
359
+ skip a sample with length 44611
360
+ skip a sample with length 57756
361
+ skip a sample with length 41094
362
+ Yielding data with length 15091
363
+ Yielding data with length 16206
364
+ Yielding data with length 13157
365
+ Yielding data with length 26843
366
+ Yielding data with length 21094
367
+ Yielding data with length 24549
368
+ Yielding data with length 20404
369
+ Yielding data with length 25400
370
+ skip a sample with length 41106
371
+ Yielding data with length 18332
372
+ Yielding data with length 20708
373
+ Yielding data with length 21310
374
+ skip a sample with length 50787
375
+ Yielding data with length 27881
376
+ Yielding data with length 25557
377
+ skip a sample with length 57756
378
+ Yielding data with length 24894
379
+ Yielding data with length 28219
380
+ Yielding data with length 24140
381
+ skip a sample with length 43245
382
+ skip a sample with length 44611
383
+ skip a sample with length 41160
384
+ Yielding data with length 27592
385
+ Yielding data with length 26168
386
+ Yielding data with length 20709
387
+ Yielding data with length 23581
388
+ skip a sample with length 42486
389
+ Yielding data with length 29274
390
+ Yielding data with length 24805
391
+ Yielding data with length 31112
392
+ Yielding data with length 36407
393
+ skip a sample with length 50787
394
+ Yielding data with length 18262
395
+ Yielding data with length 26439
396
+ Yielding data with length 18322
397
+ Yielding data with length 33505
398
+ Yielding data with length 29023
399
+ Yielding data with length 25487
400
+ Yielding data with length 31643
401
+ Yielding data with length 27712
402
+ skip a sample with length 42486
403
+ Yielding data with length 15735
404
+ Yielding data with length 17616
405
+ Yielding data with length 13811
406
+ Yielding data with length 19365
407
+ Yielding data with length 19566
408
+ Yielding data with length 24227
409
+ Yielding data with length 28214
410
+ Yielding data with length 30026
411
+ Yielding data with length 18195
412
+ Yielding data with length 18206
413
+ Yielding data with length 19699
414
+ Yielding data with length 23103
415
+ Yielding data with length 33474
416
+ Yielding data with length 29109
417
+ Yielding data with length 36518
418
+ Yielding data with length 27659
419
+ Yielding data with length 21031
420
+ Yielding data with length 27532
421
+ Yielding data with length 21080
422
+ Yielding data with length 20740
423
+ Yielding data with length 24066
424
+ Yielding data with length 26959
425
+ Yielding data with length 32162
426
+ skip a sample with length 42480
427
+ Yielding data with length 31373
428
+ block_dataset repeat in rank-0 worker-0
429
+ Yielding data with length 9629
430
+ block_dataset repeat in rank-6 worker-0
431
+ Yielding data with length 12734
432
+ Yielding data with length 20622
433
+ Yielding data with length 31650
434
+ Yielding data with length 23291
435
+ Yielding data with length 25245
436
+ Yielding data with length 27515
437
+ Yielding data with length 28296
438
+ Yielding data with length 20698
439
+ Yielding data with length 21726
440
+ skip a sample with length 43202
441
+ Yielding data with length 21768
442
+ Yielding data with length 18011
443
+ Yielding data with length 23070
444
+ Yielding data with length 19691
445
+ Yielding data with length 25171
446
+ Yielding data with length 33860
447
+ block_dataset repeat in rank-5 worker-0
448
+ Yielding data with length 8964
449
+ skip a sample with length 43202
450
+ block_dataset repeat in rank-3 worker-0
451
+ Yielding data with length 19248
452
+ Yielding data with length 16262
453
+ Yielding data with length 29186
454
+ skip a sample with length 41106
455
+ Yielding data with length 19245
456
+ Yielding data with length 24191
457
+ Yielding data with length 23133
458
+ Yielding data with length 35614
459
+ block_dataset repeat in rank-2 worker-0
460
+ Yielding data with length 13769
461
+ Yielding data with length 24400
462
+ Yielding data with length 31113
463
+ Yielding data with length 25652
464
+ Yielding data with length 25500
465
+ Yielding data with length 26979
466
+ block_dataset repeat in rank-7 worker-0
467
+ Yielding data with length 24263
468
+ Yielding data with length 27393
469
+ skip a sample with length 41094
470
+ Yielding data with length 23188
471
+ Yielding data with length 19658
472
+ block_dataset repeat in rank-1 worker-0
473
+ Yielding data with length 24787
474
+ Yielding data with length 26221
475
+ Yielding data with length 21409
476
+ Yielding data with length 32059
477
+ skip a sample with length 43245
478
+ Yielding data with length 26058
479
+ Yielding data with length 24507
480
+ Yielding data with length 8292
481
+ skip a sample with length 42480
482
+ Yielding data with length 12746
483
+ Yielding data with length 17288
484
+ Yielding data with length 20793
485
+ Yielding data with length 17252
486
+ Yielding data with length 25240
487
+ Yielding data with length 25304
488
+ Yielding data with length 30376
489
+ block_dataset repeat in rank-4 worker-0
490
+ Yielding data with length 14138
491
+ skip a sample with length 48060
492
+ skip a sample with length 41160
493
+ skip a sample with length 48060
494
+ Yielding data with length 19684
495
+ Yielding data with length 14748
496
+ Yielding data with length 21158
497
+ Yielding data with length 21425
498
+ Yielding data with length 30781
499
+ Yielding data with length 33027
500
+ Yielding data with length 33537
501
+ Yielding data with length 14837
502
+ Yielding data with length 12766
503
+ Yielding data with length 14115
504
+ Yielding data with length 15474
505
+ Yielding data with length 21749
506
+ Yielding data with length 33147
507
+ Yielding data with length 25621
508
+ skip a sample with length 57756
509
+ Yielding data with length 22466
510
+ skip a sample with length 41094
511
+ Yielding data with length 22413
512
+ skip a sample with length 50787
513
+ Yielding data with length 27913
514
+ Yielding data with length 25090
515
+ Yielding data with length 25551
516
+ Yielding data with length 25335
517
+ skip a sample with length 57756
518
+ skip a sample with length 43245
519
+ Yielding data with length 25947
520
+ skip a sample with length 41160
521
+ Yielding data with length 31872
522
+ Yielding data with length 36109
523
+ skip a sample with length 44611
524
+ Yielding data with length 16234
525
+ Yielding data with length 19945
526
+ Yielding data with length 19685
527
+ Yielding data with length 34186
528
+ Yielding data with length 36943
529
+ Yielding data with length 23090
530
+ Yielding data with length 29034
531
+ Yielding data with length 30067
532
+ Yielding data with length 8489
533
+ skip a sample with length 41106
534
+ skip a sample with length 44611
535
+ skip a sample with length 50787
536
+ Yielding data with length 10008
537
+ Yielding data with length 32829
538
+ Yielding data with length 23593
539
+ Yielding data with length 29907
540
+ skip a sample with length 42486
541
+ Yielding data with length 25500
542
+ Yielding data with length 34717
543
+ Yielding data with length 29714
544
+ Yielding data with length 16266
545
+ Yielding data with length 17271
546
+ Yielding data with length 20547
547
+ Yielding data with length 22351
548
+ Yielding data with length 26637
549
+ Yielding data with length 32390
550
+ Yielding data with length 30503
551
+ Yielding data with length 29728
552
+ skip a sample with length 42486
553
+ Yielding data with length 21561
554
+ Yielding data with length 16923
555
+ Yielding data with length 19642
556
+ Yielding data with length 20198
557
+ Yielding data with length 22735
558
+ Yielding data with length 32930
559
+ Yielding data with length 24262
560
+ Yielding data with length 34823
561
+ Yielding data with length 28608
562
+ Yielding data with length 28122
563
+ Yielding data with length 24532
564
+ Yielding data with length 26210
565
+ Yielding data with length 36308
566
+ Yielding data with length 27414
567
+ Yielding data with length 30425
568
+ Yielding data with length 30774
569
+ block_dataset repeat in rank-6 worker-0
570
+ block_dataset repeat in rank-0 worker-0
571
+ skip a sample with length 42480
572
+ Yielding data with length 15870
573
+ Yielding data with length 15590
574
+ Yielding data with length 18509
575
+ Yielding data with length 23812
576
+ Yielding data with length 18170
577
+ Yielding data with length 32514
578
+ Yielding data with length 24814
579
+ Yielding data with length 28298
580
+ Yielding data with length 9988
581
+ Yielding data with length 18332
582
+ Yielding data with length 21420
583
+ Yielding data with length 23903
584
+ Yielding data with length 25120
585
+ Yielding data with length 28991
586
+ Yielding data with length 30114
587
+ Yielding data with length 30128
588
+ skip a sample with length 43202
589
+ skip a sample with length 43202
590
+ Yielding data with length 17850
591
+ Yielding data with length 18166
592
+ Yielding data with length 22663
593
+ Yielding data with length 20751
594
+ Yielding data with length 19273
595
+ Yielding data with length 17552
596
+ Yielding data with length 26616
597
+ Yielding data with length 28527
598
+ block_dataset repeat in rank-3 worker-0
599
+ block_dataset repeat in rank-5 worker-0
600
+ Yielding data with length 13466
601
+ Yielding data with length 14852
602
+ block_dataset repeat in rank-7 worker-0
603
+ Yielding data with length 20760
604
+ Yielding data with length 22448
605
+ Yielding data with length 20269
606
+ Yielding data with length 27307
607
+ Yielding data with length 31128
608
+ Yielding data with length 23848
609
+ block_dataset repeat in rank-2 worker-0
610
+ Yielding data with length 8948
611
+ skip a sample with length 41106
612
+ Yielding data with length 10367
613
+ Yielding data with length 12612
614
+ Yielding data with length 18632
615
+ Yielding data with length 32428
616
+ Yielding data with length 25651
617
+ Yielding data with length 22117
618
+ Yielding data with length 30468
619
+ Yielding data with length 12051
620
+ Yielding data with length 13346
621
+ Yielding data with length 15726
622
+ Yielding data with length 11383
623
+ skip a sample with length 41094
624
+ Yielding data with length 19358
625
+ Yielding data with length 31964
626
+ skip a sample with length 43245
627
+ Yielding data with length 34359
628
+ Yielding data with length 25146
629
+ block_dataset repeat in rank-1 worker-0
630
+ Yielding data with length 12528
631
+ Yielding data with length 14445
632
+ Yielding data with length 21808
633
+ skip a sample with length 48060
634
+ Yielding data with length 24973
635
+ Yielding data with length 24141
636
+ Yielding data with length 35965
637
+ Yielding data with length 29665
638
+ Yielding data with length 28975
639
+ skip a sample with length 42480
640
+ skip a sample with length 48060
641
+ Yielding data with length 15182
642
+ Yielding data with length 19712
643
+ skip a sample with length 41160
644
+ Yielding data with length 19698
645
+ Yielding data with length 18255
646
+ Yielding data with length 30749
647
+ Yielding data with length 34841
648
+ Yielding data with length 22848
649
+ Yielding data with length 28618
650
+ block_dataset repeat in rank-4 worker-0
651
+ Yielding data with length 12071
652
+ Yielding data with length 15527
653
+ Yielding data with length 19227
654
+ Yielding data with length 19199
655
+ skip a sample with length 50787
656
+ Yielding data with length 25207
657
+ Yielding data with length 26500
658
+ skip a sample with length 57756
659
+ Yielding data with length 25915
660
+ skip a sample with length 57756
661
+ Yielding data with length 29886
662
+ skip a sample with length 43245
663
+ skip a sample with length 41160
664
+ skip a sample with length 41094
665
+ Yielding data with length 18659
666
+ Yielding data with length 23460
667
+ Yielding data with length 29942
668
+ Yielding data with length 30289
669
+ Yielding data with length 27297
670
+ Yielding data with length 28034
671
+ Yielding data with length 29025
672
+ Yielding data with length 36590
673
+ skip a sample with length 44611
674
+ skip a sample with length 50787
675
+ skip a sample with length 44611
676
+ Yielding data with length 24792
677
+ Yielding data with length 20748
678
+ Yielding data with length 23187
679
+ Yielding data with length 19037
680
+ Yielding data with length 31561
681
+ Yielding data with length 34200
682
+ Yielding data with length 26330
683
+ Yielding data with length 30027
684
+ skip a sample with length 41106
685
+ Yielding data with length 12095
686
+ Yielding data with length 15214
687
+ Yielding data with length 17243
688
+ Yielding data with length 23097
689
+ Yielding data with length 24142
690
+ Yielding data with length 28934
691
+ Yielding data with length 29052
692
+ skip a sample with length 42486
693
+ Yielding data with length 34556
694
+ Yielding data with length 14895
695
+ Yielding data with length 19552
696
+ Yielding data with length 22053
697
+ Yielding data with length 29467
698
+ Yielding data with length 23444
699
+ Yielding data with length 26636
700
+ Yielding data with length 33801
701
+ Yielding data with length 34191
702
+ skip a sample with length 42486
703
+ Yielding data with length 20414
704
+ Yielding data with length 21739
705
+ Yielding data with length 23877
706
+ Yielding data with length 26520
707
+ Yielding data with length 24877
708
+ Yielding data with length 27696
709
+ Yielding data with length 27597
710
+ Yielding data with length 32703
711
+ block_dataset repeat in rank-0 worker-0
712
+ block_dataset repeat in rank-6 worker-0
713
+ Yielding data with length 18005
714
+ Yielding data with length 26527
715
+ Yielding data with length 20791
716
+ Yielding data with length 20719
717
+ Yielding data with length 22114
718
+ Yielding data with length 22512
719
+ Yielding data with length 29336
720
+ Yielding data with length 31527
721
+ Yielding data with length 9284
722
+ skip a sample with length 42480
723
+ Yielding data with length 17316
724
+ Yielding data with length 19314
725
+ Yielding data with length 25239
726
+ Yielding data with length 19703
727
+ Yielding data with length 21232
728
+ Yielding data with length 17268
729
+ Yielding data with length 26931
730
+ skip a sample with length 43202
731
+ Yielding data with length 16230
732
+ Yielding data with length 19692
733
+ Yielding data with length 23196
734
+ Yielding data with length 22444
735
+ Yielding data with length 29708
736
+ Yielding data with length 20680
737
+ Yielding data with length 30765
738
+ Yielding data with length 27917
739
+ skip a sample with length 43202
740
+ Yielding data with length 17207
741
+ Yielding data with length 17853
742
+ Yielding data with length 23427
743
+ block_dataset repeat in rank-5 worker-0
744
+ Yielding data with length 27646
745
+ Yielding data with length 25169
746
+ Yielding data with length 26475
747
+ Yielding data with length 25127
748
+ Yielding data with length 27339
749
+ block_dataset repeat in rank-3 worker-0
750
+ block_dataset repeat in rank-7 worker-0
751
+ Yielding data with length 16546
752
+ Yielding data with length 16256
753
+ Yielding data with length 22339
754
+ Yielding data with length 17919
755
+ Yielding data with length 23138
756
+ Yielding data with length 19676
757
+ Yielding data with length 24070
758
+ Yielding data with length 25924
759
+ block_dataset repeat in rank-2 worker-0
760
+ Yielding data with length 14569
761
+ Yielding data with length 31705
762
+ Yielding data with length 24120
763
+ Yielding data with length 33709
764
+ Yielding data with length 26245
765
+ Yielding data with length 39397
766
+ Yielding data with length 31035
767
+ Yielding data with length 15921
768
+ skip a sample with length 41094
769
+ skip a sample with length 41106
770
+ Yielding data with length 11323
771
+ Yielding data with length 20758
772
+ Yielding data with length 24109
773
+ skip a sample with length 43245
774
+ skip a sample with length 48060
775
+ Yielding data with length 21739
776
+ Yielding data with length 22062
777
+ Yielding data with length 11069
778
+ Yielding data with length 33774
779
+ Yielding data with length 24783
780
+ Yielding data with length 13348
781
+ Yielding data with length 13218
782
+ Yielding data with length 17288
783
+ Yielding data with length 26493
784
+ Yielding data with length 24246
785
+ Yielding data with length 26920
786
+ Yielding data with length 28599
787
+ Yielding data with length 31042
788
+ block_dataset repeat in rank-1 worker-0
789
+ skip a sample with length 48060
790
+ skip a sample with length 41160
791
+ Yielding data with length 25722
792
+ Yielding data with length 33186
793
+ skip a sample with length 50787
794
+ Yielding data with length 19367
795
+ Yielding data with length 26598
796
+ Yielding data with length 18672
797
+ Yielding data with length 27291
798
+ Yielding data with length 33105
799
+ skip a sample with length 57756
800
+ Yielding data with length 31380
801
+ skip a sample with length 43245
802
+ skip a sample with length 42480
803
+ skip a sample with length 41160
804
+ Yielding data with length 22996
805
+ Yielding data with length 18896
806
+ Yielding data with length 19621
807
+ Yielding data with length 24453
808
+ Yielding data with length 37227
809
+ Yielding data with length 28758
810
+ skip a sample with length 57756
811
+ Yielding data with length 31736
812
+ Yielding data with length 26241
813
+ block_dataset repeat in rank-4 worker-0
814
+ skip a sample with length 44611
815
+ skip a sample with length 50787
816
+ Yielding data with length 8502
817
+ skip a sample with length 41094
818
+ Yielding data with length 23339
819
+ Yielding data with length 26828
820
+ Yielding data with length 22141
821
+ Yielding data with length 27917
822
+ Yielding data with length 30731
823
+ Yielding data with length 35152
824
+ Yielding data with length 32504
825
+ Yielding data with length 14524
826
+ Yielding data with length 21770
827
+ Yielding data with length 23021
828
+ Yielding data with length 31645
829
+ Yielding data with length 34056
830
+ skip a sample with length 44611
831
+ Yielding data with length 24506
832
+ Yielding data with length 27457
833
+ Yielding data with length 28513
834
+ Yielding data with length 15147
835
+ skip a sample with length 41106
836
+ Yielding data with length 16968
837
+ Yielding data with length 13491
838
+ Yielding data with length 22125
839
+ Yielding data with length 21138
840
+ Yielding data with length 24903
841
+ Yielding data with length 28043
842
+ skip a sample with length 42486
843
+ Yielding data with length 31782
844
+ Yielding data with length 6701
845
+ Yielding data with length 13494
846
+ Yielding data with length 15875
847
+ Yielding data with length 17545
848
+ Yielding data with length 21060
849
+ Yielding data with length 22115
850
+ Yielding data with length 29729
851
+ Yielding data with length 31752
852
+ skip a sample with length 42486
853
+ Yielding data with length 17316
854
+ block_dataset repeat in rank-0 worker-0
855
+ Yielding data with length 24478
856
+ Yielding data with length 24714
857
+ skip a sample with length 42480
858
+ Yielding data with length 24145
859
+ Yielding data with length 25188
860
+ Yielding data with length 21724
861
+ block_dataset repeat in rank-6 worker-0
862
+ Yielding data with length 28652
863
+ Yielding data with length 31606
864
+ Yielding data with length 8619
865
+ Yielding data with length 16608
866
+ Yielding data with length 21134
867
+ Yielding data with length 28671
868
+ Yielding data with length 24139
869
+ Yielding data with length 34737
870
+ Yielding data with length 28959
871
+ Yielding data with length 30967
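The skip/yield messages in the log above reflect length-based packing in the data loader: every sample whose packed token count exceeds the per-sample budget is dropped (all skipped lengths are above 40,000, matching `--max_num_tokens_per_sample 40000` in `scripts/train_smm.sh` below), and shorter samples are yielded. A minimal sketch of that behaviour, assuming a simple generator and the flag name from the training scripts (not the repository's actual loader):

```python
# Minimal sketch of the skip/yield pattern in run.out above (not the
# repository's actual code). Assumption: samples whose token count exceeds
# max_num_tokens_per_sample are dropped, the rest are yielded as-is.
def yield_packed_samples(samples, max_num_tokens_per_sample=40000):
    for sample in samples:
        length = sample["num_tokens"]  # hypothetical field holding the packed length
        if length > max_num_tokens_per_sample:
            print(f"skip a sample with length {length}")
            continue
        print(f"Yielding data with length {length}")
        yield sample
```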
scripts/eval/eval_vlm.sh ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Check if enough arguments are provided
5
+ if [ $# -lt 2 ]; then
6
+ echo "Error: PREFIX_DIR and MODEL_PATH are required as the first and second arguments respectively."
7
+ exit 1
8
+ fi
9
+
10
+ LOG_PATH=$1
11
+ if [ ! -d "$LOG_PATH" ]; then
12
+ mkdir -p "$LOG_PATH"
13
+ fi
14
+ shift 1
15
+ ARGS=("$@")
16
+ export MASTER_PORT=10042
17
+
18
+ # FULL_MODEL_PATH="$PREFIX_DIR/$MODEL_PATH"  # unused: PREFIX_DIR and MODEL_PATH are never set in this script
19
+
20
+ IFS=' ' read -r -a DATASETS <<< "$DATASETS_STR"
21
+
22
+ for DATASET in "${DATASETS[@]}"; do
23
+ bash eval/vlm/evaluate.sh \
24
+ "$DATASET" \
25
+ --out-dir "$LOG_PATH/$DATASET" \
26
+ "${ARGS[@]}"
27
+ done
scripts/eval/run_eval_vlm.sh ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ # Set proxy and API key
7
+ export OPENAI_API_KEY=$openai_api_key
8
+
9
+ export GPUS=1
10
+
11
+ DATASETS=("mme" "mmbench-dev-en" "mmvet" "mmmu-val" "mathvista-testmini" "mmvp")
12
+ # DATASETS=("mmmu-val_cot")
13
+
14
+ DATASETS_STR="${DATASETS[*]}"
15
+ export DATASETS_STR
16
+
17
+ bash scripts/eval/eval_vlm.sh \
18
+ $output_path \
19
+ --model-path $model_path
scripts/eval/run_gedit.sh ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # run this script at the root of the project folder
5
+ pip install httpx==0.23.0
6
+ pip install openai==1.87.0
7
+ pip install datasets
8
+ pip install megfile
9
+
10
+
11
+ N_GPU=8 # Number of GPU used in for the evaluation
12
+ MODEL_PATH="/Path/to/BAGEL-7B-MoT"
13
+ OUTPUT_DIR="/Path/to/save/results"
14
+ GEN_DIR="$OUTPUT_DIR/gen_image"
15
+ LOG_DIR="$OUTPUT_DIR/logs"
16
+
17
+ AZURE_ENDPOINT="https://azure_endpoint_url_you_use" # set up the azure openai endpoint url
18
+ AZURE_OPENAI_KEY="" # set up the azure openai key
19
+ N_GPT_PARALLEL=10
20
+
21
+
22
+ mkdir -p "$OUTPUT_DIR"
23
+ mkdir -p "$GEN_DIR"
24
+ mkdir -p "$LOG_DIR"
25
+
26
+
27
+ # # ----------------------------
28
+ # # Download GEdit Dataset
29
+ # # ----------------------------
30
+ python -c "from datasets import load_dataset; dataset = load_dataset('stepfun-ai/GEdit-Bench')"
31
+ echo "Dataset Downloaded"
32
+
33
+
34
+ # # ---------------------
35
+ # # Generate Images
36
+ # # ---------------------
37
+ for ((i=0; i<$N_GPU; i++)); do
38
+ nohup python3 eval/gen/gedit/gen_images_gedit.py --model_path "$MODEL_PATH" --output_dir "$GEN_DIR" --shard_id $i --total_shards "$N_GPU" --device $i 2>&1 | tee "$LOG_DIR"/request_$(($N_GPU + i)).log &
39
+ done
40
+
41
+ wait
42
+ echo "Image Generation Done"
43
+
44
+
45
+ # # ---------------------
46
+ # # GPT Evaluation
47
+ # # ---------------------
48
+ cd eval/gen/gedit
49
+ python test_gedit_score.py --save_path "$OUTPUT_DIR" --azure_endpoint "$AZURE_ENDPOINT" --gpt_keys "$AZURE_OPENAI_KEY" --max_workers "$N_GPT_PARALLEL"
50
+ echo "Evaluation Done"
51
+
52
+
53
+ # # --------------------
54
+ # # Print Results
55
+ # # --------------------
56
+ python calculate_statistics.py --save_path "$OUTPUT_DIR" --language en
57
+
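gen_images_gedit.py is launched once per GPU with `--shard_id`/`--total_shards`, so each process handles a disjoint slice of the benchmark. A round-robin split is the usual way to do this; the sketch below is an assumption about that splitting, not the script's actual code:

```python
# Hypothetical round-robin sharding matching the --shard_id / --total_shards
# flags above; the real gen_images_gedit.py may slice its workload differently.
def shard(items, shard_id, total_shards):
    return items[shard_id::total_shards]

# Example: with total_shards=8, shard_id=3 handles items 3, 11, 19, ...
print(shard(list(range(20)), shard_id=3, total_shards=8))  # [3, 11, 19]
```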
scripts/eval/run_geneval.sh ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ GPUS=8
7
+
8
+
9
+ # generate images
10
+ torchrun \
11
+ --nnodes=1 \
12
+ --node_rank=0 \
13
+ --nproc_per_node=$GPUS \
14
+ --master_addr=127.0.0.1 \
15
+ --master_port=12345 \
16
+ ./eval/gen/gen_images_mp.py \
17
+ --output_dir $output_path/images \
18
+ --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata_long.jsonl \
19
+ --batch_size 1 \
20
+ --num_images 4 \
21
+ --resolution 1024 \
22
+ --max_latent_size 64 \
23
+ --model-path $model_path \
24
+ # --metadata_file ./eval/gen/geneval/prompts/evaluation_metadata.jsonl \
25
+
26
+
27
+ # calculate score
28
+ torchrun \
29
+ --nnodes=1 \
30
+ --node_rank=0 \
31
+ --nproc_per_node=$GPUS \
32
+ --master_addr=127.0.0.1 \
33
+ --master_port=12345 \
34
+ ./eval/gen/geneval/evaluation/evaluate_images_mp.py \
35
+ $output_path/images \
36
+ --outfile $output_path/results.jsonl \
37
+ --model-path ./eval/gen/geneval/model
38
+
39
+
40
+ # summarize score
41
+ python ./eval/gen/geneval/evaluation/summary_scores.py $output_path/results.jsonl
scripts/eval/run_imgedit.sh ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ export OPENAI_API_KEY=$openai_api_key
7
+
8
+ GPUS=8
9
+
10
+
11
+ # generate images
12
+ torchrun \
13
+ --nnodes=1 \
14
+ --node_rank=0 \
15
+ --nproc_per_node=$GPUS \
16
+ --master_addr=127.0.0.1 \
17
+ --master_port=12345 \
18
+ ./eval/gen/gen_images_mp_imgedit.py \
19
+ --output_dir $output_path/bagel \
20
+ --metadata_file ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \
21
+ --max_latent_size 64 \
22
+ --model-path $model_path
23
+
24
+
25
+ # calculate score
26
+ python ./eval/gen/imgedit/basic_bench.py \
27
+ --result_img_folder $output_path/bagel \
28
+ --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \
29
+ --origin_img_root ./eval/gen/imgedit/Benchmark/singleturn \
30
+ --num_processes 4 \
31
+ --prompts_json ./eval/gen/imgedit/Benchmark/singleturn/judge_prompt.json
32
+
33
+
34
+ # summarize score
35
+ python ./eval/gen/imgedit/step1_get_avgscore.py \
36
+ --result_json $output_path/bagel/result.json \
37
+ --average_score_json $output_path/bagel/average_score.json
38
+
39
+ python ./eval/gen/imgedit/step2_typescore.py \
40
+ --average_score_json $output_path/bagel/average_score.json \
41
+ --edit_json ./eval/gen/imgedit/Benchmark/singleturn/singleturn.json \
42
+ --typescore_json $output_path/bagel/typescore.json
scripts/eval/run_kris.sh ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ export OPENAI_API_KEY=$openai_api_key
7
+
8
+ GPUS=8
9
+
10
+
11
+ # generate images
12
+ torchrun \
13
+ --nnodes=1 \
14
+ --node_rank=0 \
15
+ --nproc_per_node=$GPUS \
16
+ --master_addr=127.0.0.1 \
17
+ --master_port=12345 \
18
+ ./eval/gen/gen_images_mp_kris.py \
19
+ --output_dir $output_path/bagel \
20
+ --metadata_file ./eval/gen/kris/final_data.json \
21
+ --max_latent_size 64 \
22
+ --model-path $model_path \
23
+ --think
24
+
25
+
26
+ # calculate score
27
+ python ./eval/gen/kris/metrics_common.py \
28
+ --results_dir $output_path \
29
+ --max_workers 8
30
+
31
+ python ./eval/gen/kris/metrics_knowledge.py \
32
+ --results_dir $output_path \
33
+ --max_workers 8
34
+
35
+ python ./eval/gen/kris/metrics_multi_element.py \
36
+ --results_dir $output_path \
37
+ --max_workers 8
38
+
39
+ python ./eval/gen/kris/metrics_temporal_prediction.py \
40
+ --results_dir $output_path \
41
+ --max_workers 8
42
+
43
+ python ./eval/gen/kris/metrics_view_change.py \
44
+ --results_dir $output_path \
45
+ --max_workers 8
46
+
47
+
48
+ # summarize score
49
+ python ./eval/gen/kris/summarize.py \
50
+ --results_dir $output_path/bagel
scripts/eval/run_rise.sh ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ export OPENAI_API_KEY=$openai_api_key
7
+
8
+ GPUS=8
9
+
10
+
11
+ # generate images
12
+ torchrun \
13
+ --nnodes=1 \
14
+ --node_rank=0 \
15
+ --nproc_per_node=$GPUS \
16
+ --master_addr=127.0.0.1 \
17
+ --master_port=12345 \
18
+ ./eval/gen/gen_images_mp_rise.py \
19
+ --output_dir $output_path/bagel \
20
+ --metadata_file ./eval/gen/rise/data/datav2_total_w_subtask.json \
21
+ --max_latent_size 64 \
22
+ --model-path $model_path \
23
+ --think
24
+
25
+
26
+ # calculate score
27
+ python ./eval/gen/rise/gpt_eval.py \
28
+ --data ./eval/gen/rise/data/datav2_total_w_subtask.json \
29
+ --input ./eval/gen/rise/data \
30
+ --output $output_path/bagel
scripts/eval/run_wise.sh ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ set -x
5
+
6
+ export OPENAI_API_KEY=$openai_api_key
7
+
8
+ GPUS=8
9
+
10
+
11
+ # generate images
12
+ torchrun \
13
+ --nnodes=1 \
14
+ --node_rank=0 \
15
+ --nproc_per_node=$GPUS \
16
+ --master_addr=127.0.0.1 \
17
+ --master_port=12345 \
18
+ ./eval/gen/gen_images_mp_wise.py \
19
+ --output_dir $output_path/images \
20
+ --metadata-file ./eval/gen/wise/final_data.json \
21
+ --resolution 1024 \
22
+ --max-latent_size 64 \
23
+ --model-path $model_path \
24
+ --think
25
+
26
+
27
+ # calculate score
28
+ python3 eval/gen/wise/gpt_eval_mp.py \
29
+ --json_path eval/gen/wise/data/cultural_common_sense.json \
30
+ --image_dir $output_path/images \
31
+ --output_dir $output_path
32
+
33
+ python3 eval/gen/wise/gpt_eval_mp.py \
34
+ --json_path eval/gen/wise/data/spatio-temporal_reasoning.json \
35
+ --image_dir $output_path/images \
36
+ --output_dir $output_path
37
+
38
+ python3 eval/gen/wise/gpt_eval_mp.py \
39
+ --json_path eval/gen/wise/data/natural_science.json \
40
+ --image_dir $output_path/images \
41
+ --output_dir $output_path
42
+
43
+ python3 eval/gen/wise/cal_score.py \
44
+ --output_dir $output_path
scripts/train.sh ADDED
@@ -0,0 +1,48 @@
1
+ #!/bin/bash
2
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ export HF_HOME=/dev/shm/
5
+ NUM_NODES=1
6
+ NODE_RANK=0
7
+ MASTER_ADDR=localhost
8
+ MASTER_PORT=29500
9
+ NPROC_PER_NODE=8
10
+ MODEL_PATH=/dev/shm/models/BAGEL-7B-MoT
11
+
12
+ # replace the variables with your own
13
+ torchrun \
14
+ --nnodes=$NUM_NODES \
15
+ --node_rank=$NODE_RANK \
16
+ --nproc_per_node=$NPROC_PER_NODE \
17
+ --master_addr=$MASTER_ADDR \
18
+ --master_port=$MASTER_PORT \
19
+ train/pretrain_unified_navit.py \
20
+ --dataset_config_file ./data/configs/example.yaml \
21
+ --model_path $MODEL_PATH \
22
+ --layer_module Qwen2MoTDecoderLayer \
23
+ --max_latent_size 64 \
24
+ --resume-from $MODEL_PATH \
25
+ --finetune_from_hf True \
26
+ --auto_resume True \
27
+ --resume-model-only True \
28
+ --finetune-from-ema True \
29
+ --log_every 1 \
30
+ --lr 2e-5 \
31
+ --lr_scheduler cosine \
32
+ --min_lr 1e-6 \
33
+ --num_worker 1 \
34
+ --expected_num_tokens 60000 \
35
+ --max_num_tokens 60000 \
36
+ --max_num_tokens_per_sample 60000 \
37
+ --prefer_buffer_before 30000 \
38
+ --num_shard=$NPROC_PER_NODE \
39
+ --sharding_strategy="HYBRID_SHARD" \
40
+ --wandb_project "zebra-cot" \
41
+ --wandb_name "h200-zebra-cot-$(date +%Y%m%d_%H%M%S)" \
42
+ --save_every 50 \
43
+ --warmup_steps 50 \
44
+ --total_steps 5000 \
45
+ --results_dir results/ \
46
+ --checkpoint_dir results/checkpoints/ > run.out 2> run.err
47
+
48
+ # --cpu_offload True \
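scripts/train.sh expects the base checkpoint at `/dev/shm/models/BAGEL-7B-MoT`. One way to stage it there, assuming the publicly released BAGEL-7B-MoT weights on the Hugging Face Hub (the repo id below is an assumption; point it at your own copy if it differs):

```python
# Stage the base weights where MODEL_PATH in scripts/train.sh expects them.
# Assumption: the public ByteDance-Seed/BAGEL-7B-MoT repo; adjust if needed.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="ByteDance-Seed/BAGEL-7B-MoT",
    local_dir="/dev/shm/models/BAGEL-7B-MoT",
)
```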
scripts/train_smm.sh ADDED
@@ -0,0 +1,57 @@
1
+ #!/bin/bash
2
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Change to the project directory
6
+ cd /scratch/by2593/Bagel-Zebra-CoT-origin
7
+
8
+ export HF_HOME=/dev/shm/
9
+ export PYTHONPATH=/scratch/by2593/Bagel-Zebra-CoT-origin:$PYTHONPATH
10
+ export WANDB_MODE=offline
11
+ export WANDB_ANONYMOUS=must
12
+ NUM_NODES=1
13
+ NODE_RANK=0
14
+ MASTER_ADDR=localhost
15
+ MASTER_PORT=29500
16
+ NPROC_PER_NODE=8
17
+ MODEL_PATH=/scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8
18
+
19
+ # replace the variables with your own
20
+ torchrun \
21
+ --nnodes=$NUM_NODES \
22
+ --node_rank=$NODE_RANK \
23
+ --nproc_per_node=$NPROC_PER_NODE \
24
+ --master_addr=$MASTER_ADDR \
25
+ --master_port=$MASTER_PORT \
26
+ train/pretrain_unified_navit.py \
27
+ --dataset_config_file ./data/configs/example_smm_semantic.yaml \
28
+ --model_path $MODEL_PATH \
29
+ --layer_module Qwen2MoTDecoderLayer \
30
+ --max_latent_size 64 \
31
+ --resume-from $MODEL_PATH \
32
+ --finetune_from_hf True \
33
+ --auto_resume True \
34
+ --resume-model-only True \
35
+ --finetune-from-ema False \
36
+ --log_every 1 \
37
+ --lr 2e-5 \
38
+ --lr_scheduler cosine \
39
+ --min_lr 1e-6 \
40
+ --num_worker 1 \
41
+ --expected_num_tokens 40000 \
42
+ --max_num_tokens 40000 \
43
+ --max_num_tokens_per_sample 40000 \
44
+ --prefer_buffer_before 10000 \
45
+ --num_shard=$NPROC_PER_NODE \
46
+ --sharding_strategy="HYBRID_SHARD" \
47
+ --wandb_project "zebra-cot" \
48
+ --wandb_name "h200-zebra-cot-$(date +%Y%m%d_%H%M%S)" \
49
+ --save_every 100 \
50
+ --warmup_steps 50 \
51
+ --total_steps 5000 \
52
+ --results_dir results/ \
53
+ --checkpoint_dir results/checkpoints_smm_semantic_part1_v1_origin/ \
54
+ --cpu_offload True > run.out 2> run.err
55
+
56
+
57
+ # bash scripts/train_smm.sh
scripts/train_smm_sbatch.sh ADDED
@@ -0,0 +1,85 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=bagel-zebra-cot-smm
3
+ #SBATCH --partition=h200_tandon
4
+ #SBATCH --nodes=1
5
+ #SBATCH --ntasks-per-node=1
6
+ #SBATCH --cpus-per-task=32
7
+ #SBATCH --gres=gpu:h200:8
8
+ #SBATCH --mem=1600G
9
+ #SBATCH --time=48:00:00
10
+ #SBATCH --output=slurm_logs/train_smm_%j.out
11
+ #SBATCH --error=slurm_logs/train_smm_%j.err
12
+
13
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates.
14
+ # SPDX-License-Identifier: Apache-2.0
15
+
16
+ # Load any necessary modules (adjust as needed for your cluster)
17
+ # module load cuda/12.1
18
+ # module load conda
19
+
20
+ # Activate conda environment
21
+ source /scratch/by2593/miniconda3/etc/profile.d/conda.sh
22
+ conda activate bagel
23
+
24
+ # Change to the project directory
25
+ cd /scratch/by2593/Bagel-Zebra-CoT-origin
26
+
27
+ # Set environment variables
28
+ export HF_HOME=/dev/shm/
29
+ export PYTHONPATH=/scratch/by2593/Bagel-Zebra-CoT-origin:$PYTHONPATH
30
+ export WANDB_MODE=offline
31
+ export WANDB_ANONYMOUS=must
32
+
33
+ # SLURM variables
34
+ NUM_NODES=1
35
+ NODE_RANK=0
36
+ MASTER_ADDR=$(hostname)
37
+ MASTER_PORT=29500
38
+ NPROC_PER_NODE=8
39
+ MODEL_PATH=/scratch/by2593/hf_cache/hub/models--multimodal-reasoning-lab--Bagel-Zebra-CoT/snapshots/ebce32410ee2062d073feae484ea2c6c1515fba8
40
+
41
+ echo "Starting SMM training on node: $SLURM_JOB_NODELIST"
42
+ echo "Job ID: $SLURM_JOB_ID"
43
+ echo "Number of GPUs: $NPROC_PER_NODE"
44
+
45
+ # Run training
46
+ torchrun \
47
+ --nnodes=$NUM_NODES \
48
+ --node_rank=$NODE_RANK \
49
+ --nproc_per_node=$NPROC_PER_NODE \
50
+ --master_addr=$MASTER_ADDR \
51
+ --master_port=$MASTER_PORT \
52
+ train/pretrain_unified_navit.py \
53
+ --dataset_config_file ./data/configs/example_smm_random.yaml \
54
+ --model_path $MODEL_PATH \
55
+ --layer_module Qwen2MoTDecoderLayer \
56
+ --max_latent_size 64 \
57
+ --visual_und True \
58
+ --finetune_from_hf True \
59
+ --auto_resume True \
60
+ --resume-model-only False \
61
+ --finetune-from-ema False \
62
+ --log_every 1 \
63
+ --lr 2e-5 \
64
+ --lr_scheduler cosine \
65
+ --min_lr 1e-6 \
66
+ --num_worker 1 \
67
+ --expected_num_tokens 50000 \
68
+ --max_num_tokens 50000 \
69
+ --max_num_tokens_per_sample 50000 \
70
+ --prefer_buffer_before 10000 \
71
+ --num_shard=$NPROC_PER_NODE \
72
+ --sharding_strategy="HYBRID_SHARD" \
73
+ --wandb_project "smm" \
74
+ --wandb_name "h200-zebra-cot-smm-sbatch-$(date +%Y%m%d_%H%M%S)" \
75
+ --save_every 100 \
76
+ --warmup_steps 50 \
77
+ --total_steps 5000 \
78
+ --results_dir results/ \
79
+ --checkpoint_dir /scratch/by2593/Bagel-Zebra-CoT-origin/results/checkpoints_smm_random_20251026_033448/ \
80
+ --cpu_offload True \
81
+ --max_checkpoints 2
82
+
83
+ echo "SMM training completed on $(date)"
84
+
85
+ # sbatch scripts/train_smm_sbatch.sh
test_images/image.png ADDED

Git LFS Details

  • SHA256: 8e402e7927312911bc35200f70ef5ce98d8efb4f715b10c768a5018b330d12d4
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
test_images/meme.jpg ADDED
test_images/octupusy.jpg ADDED
test_images/women.jpg ADDED