update

- config.json +1 -6
- image_processing_magma.py +0 -76
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Magma-8B",
   "architectures": [
     "MagmaForConditionalGeneration"
   ],
@@ -118,10 +118,6 @@
   "transformers_version": "4.44.1",
   "use_cache": false,
   "vision_config": {
-    "_name_or_path": "/mnt/model/llms/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45",
-    "architectures": [
-      "LlavaLlamaForCausalLM"
-    ],
     "attention_bias": false,
     "attention_dropout": 0.0,
     "bos_token_id": 128000,
@@ -145,7 +141,6 @@
     "mm_vision_select_feature": "patch",
     "mm_vision_select_layer": -2,
     "mm_vision_tower": "segtokv9_xxlarge",
-    "model_type": "llava_llama",
     "num_attention_heads": 32,
     "num_hidden_layers": 32,
     "num_key_value_heads": 8,
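Net effect of the config.json change: the top-level "_name_or_path" now reads "Magma-8B", and the nested vision_config no longer carries the stale LLaVA-Llama bookkeeping ("_name_or_path", "architectures", "model_type") inherited from an intermediate checkpoint. A minimal sanity check of the cleaned file, as a sketch (standard library only; the local file path is an assumption):

import json

# Parse the updated config shipped with the model repo (assumed local path).
with open("config.json") as f:
    cfg = json.load(f)

# Only Magma's own architecture is declared at the top level...
assert cfg["architectures"] == ["MagmaForConditionalGeneration"]
assert cfg["_name_or_path"] == "Magma-8B"

# ...and the leftover LLaVA-Llama keys are gone from the vision settings.
for stale_key in ("_name_or_path", "architectures", "model_type"):
    assert stale_key not in cfg["vision_config"]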
image_processing_magma.py CHANGED
@@ -43,82 +43,6 @@ if is_vision_available():
 import torch
 import torchvision
 
-def padding_336(b):
-    width, height = b.size
-    tar = int(np.ceil(height / 336) * 336)
-    top_padding = int((tar - height)/2)
-    bottom_padding = tar - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
-
-    return b
-
-def calc_padded_size(width, height, padding_unit=336):
-    target_height = int(np.ceil(height / padding_unit) * padding_unit)
-    top_padding = int((target_height - height) / 2)
-    bottom_padding = target_height - height - top_padding
-    left_padding = 0
-    right_padding = 0
-    padded_width = width + left_padding + right_padding
-    padded_height = height + top_padding + bottom_padding
-    return padded_width, padded_height
-
-def HD_transform(img, hd_num=4, base_img_size=768):
-    width, height = img.size
-    trans = False
-    if width < height:
-        img = img.transpose(Image.TRANSPOSE)
-        trans = True
-        width, height = img.size
-    ratio = (width / height)
-    scale = 1
-    while scale*np.ceil(scale/ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-    new_w = int(scale * base_img_size)
-    new_h = int(new_w / ratio)
-
-    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
-    img = padding_336(img)
-    width, height = img.size
-    if trans:
-        img = img.transpose(Image.TRANSPOSE)
-
-    return img
-
-def calc_hd_transform_size(width, height, hd_num=16):
-    transposed = False
-    if width < height:
-        width, height = height, width
-        transposed = True
-
-    ratio = width / height
-    scale = 1
-    while scale * np.ceil(scale / ratio) <= hd_num:
-        scale += 1
-    scale -= 1
-
-    new_width = int(scale * 336)
-    new_height = int(new_width / ratio)
-
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
-
-    if transposed:
-        padded_width, padded_height = padded_height, padded_width
-
-    return padded_width, padded_height
-
-def pad_to_max_num_crops_tensor(images, max_crops=5):
-    """
-    images: B x 3 x H x W, B<=max_crops
-    """
-    B, _, H, W = images.shape
-    if B < max_crops:
-        pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device)
-        images = torch.cat([images, pad], dim=0)
-    return images
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.