update

- config.json +1 -6
- image_processing_magma.py +0 -76
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Magma-8B",
   "architectures": [
     "MagmaForConditionalGeneration"
   ],
@@ -118,10 +118,6 @@
   "transformers_version": "4.44.1",
   "use_cache": false,
   "vision_config": {
-    "_name_or_path": "/mnt/model/llms/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45",
-    "architectures": [
-      "LlavaLlamaForCausalLM"
-    ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": 128000,
@@ -145,7 +141,6 @@
     "mm_vision_select_feature": "patch",
     "mm_vision_select_layer": -2,
     "mm_vision_tower": "segtokv9_xxlarge",
-    "model_type": "llava_llama",
     "num_attention_heads": 32,
     "num_hidden_layers": 32,
     "num_key_value_heads": 8,
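Net effect of the config.json change: the top-level "_name_or_path" now reads "Magma-8B", and the nested vision_config no longer carries the stale LLaVA-Llama bookkeeping ("_name_or_path", "architectures", "model_type") inherited from an intermediate checkpoint. A minimal sanity check of the cleaned file, as a sketch (standard library only; the local file path is an assumption):

import json

# Parse the updated config shipped with the model repo (assumed local path).
with open("config.json") as f:
    cfg = json.load(f)

# Only Magma's own architecture is declared at the top level...
assert cfg["architectures"] == ["MagmaForConditionalGeneration"]
assert cfg["_name_or_path"] == "Magma-8B"

# ...and the leftover LLaVA-Llama keys are gone from the vision settings.
for stale_key in ("_name_or_path", "architectures", "model_type"):
    assert stale_key not in cfg["vision_config"]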
image_processing_magma.py CHANGED
@@ -43,82 +43,6 @@ if is_vision_available():
 import torch
 import torchvision
 
-def padding_336(b):
-    width, height = b.size
-    tar = int(np.ceil(height / 336) * 336)
-    top_padding = int((tar - height)/2)
-    bottom_padding = tar - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
-
-    return b
-
-def calc_padded_size(width, height, padding_unit=336):
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)
-    top_padding = int((target_height - height) / 2)
-    bottom_padding = target_height - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    padded_width = width + left_padding + right_padding
-    padded_height = height + top_padding + bottom_padding
-    return padded_width, padded_height
-
-def HD_transform(img, hd_num=4, base_img_size=768):
-    width, height = img.size
-    trans = False
-    if width < height:
-        img = img.transpose(Image.TRANSPOSE)
-        trans = True
-        width, height = img.size
-    ratio = (width / height)
-    scale = 1
-    while scale*np.ceil(scale/ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-    new_w = int(scale * base_img_size)
-    new_h = int(new_w / ratio)
-
-    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
-    img = padding_336(img)
-    width, height = img.size
-    if trans:
-        img = img.transpose(Image.TRANSPOSE)
-
-    return img
-
-def calc_hd_transform_size(width, height, hd_num=16):
-    transposed = False
-    if width < height:
-        width, height = height, width
-        transposed = True
-
-    ratio = width / height
-    scale = 1
-    while scale * np.ceil(scale / ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-
-    new_width = int(scale * 336)
-    new_height = int(new_width / ratio)
-
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
-
-    if transposed:
-        padded_width, padded_height = padded_height, padded_width
-
-    return padded_width, padded_height
-
-def pad_to_max_num_crops_tensor(images, max_crops=5):
-    """
-    images: B x 3 x H x W, B<=max_crops
-    """
-    B, _, H, W = images.shape
-    if B < max_crops:
-        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
-        images = torch.cat([images, pad], dim=0)
-    return images
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.