momergul committed · Commit 554adbb · Parent(s): 5f8e458

Update

Files changed:
- app.py (+7 -6)
- joint_inference.py (+7 -0)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -23,7 +23,7 @@ css="""
 def initialize_game() -> List[List[str]]:
     context_dicts = [generate_complete_game() for _ in range(2)]
 
-    roles = ["
+    roles = ["speaker"] * 3 + ["listener"] * 3
     speaker_images = []
     listener_images = []
     targets = []
@@ -40,7 +40,6 @@ def get_model_response(
     model, adapter_name, processor, index_to_token, role: str,
     image_paths: List[str], user_message: str = "", target_image: str = ""
 ) -> str:
-    model.model.set_adapter(adapter_name)
     if role == "speaker":
         img_dir = "tangram_pngs"
         print("Starting processing")
@@ -50,7 +49,7 @@ def get_model_response(
         image_paths = [image_paths]
         print("Starting inference")
         captions = get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths,
-                                        processor, img_dir, index_to_token)
+                                        processor, img_dir, index_to_token, adapter_name)
         print("Done")
         response = captions[0]
     else: # listener
@@ -63,14 +62,15 @@ def get_model_response(
         print("Starting inference")
         response = get_listener_response(
             model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
-            s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths
+            s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name
         )
         print("Done")
 
     return response
 
 @spaces.GPU(duration=20)
-def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token):
+def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token, adapter_name):
+    model.model.set_adapter(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         captions, _, _, _, _ = model.generate(
@@ -83,7 +83,8 @@ def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask
 
 @spaces.GPU(duration=20)
 def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
-                          s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths):
+                          s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name):
+    model.model.set_adapter(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         _, _, joint_log_probs = model.comprehension_side([
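The substantive change in app.py is where the LoRA adapter gets selected: model.model.set_adapter(adapter_name) moves out of the CPU-side dispatcher get_model_response and into the two @spaces.GPU-decorated workers, with adapter_name threaded through as a new argument. A plausible reading is that on ZeroGPU Spaces only the decorated function runs with the GPU attached, so per-call state such as the active adapter is safest set inside it. A minimal sketch of the pattern, assuming a PEFT-wrapped model with named adapters (the function and adapter names below are illustrative, not from the commit):

    import spaces
    import torch

    @spaces.GPU(duration=20)
    def run_with_adapter(model, inputs: dict, adapter_name: str):
        # Select the LoRA adapter inside the GPU-allocated call rather than
        # in the CPU-side caller, so the selection applies where inference
        # actually runs.
        model.model.set_adapter(adapter_name)
        model = model.cuda()
        with torch.no_grad():
            return model.generate(**inputs)

    # Usage (hypothetical): run_with_adapter(model, speaker_inputs, "speaker")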
joint_inference.py CHANGED

@@ -346,6 +346,7 @@ class IdeficsJointInferenceModel(nn.Module):
         speaker = self.get_speaker()
         generation_config = GenerationConfig(
             max_new_tokens=max_steps,
+            min_new_tokens=1,
             do_sample=True,
             temperature=temperature,
             top_k=top_k, top_p=top_p,
@@ -436,6 +437,12 @@ class IdeficsJointInferenceModel(nn.Module):
             output_hidden_states=True,
             return_dict_in_generate=True
         )
+
+        print(torch.any(torch.isnan(s_input_tokens)))
+        print(torch.any(torch.isnan(s_attn_mask)))
+        print(torch.any(torch.isnan(images)))
+        print(torch.any(torch.isnan(s_image_attn_mask)))
+
         outputs = speaker.generate(
             input_ids=s_input_tokens,
             attention_mask=s_attn_mask,
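Two changes here: min_new_tokens=1 in the GenerationConfig forces the speaker to emit at least one token before EOS can be sampled, so generation can never return an empty caption, and the torch.isnan prints are temporary debug instrumentation checking the speaker inputs for NaNs before speaker.generate runs. A hedged sketch of the generation settings (the sampling values below are illustrative placeholders for the commit's max_steps, temperature, top_k, and top_p variables):

    from transformers import GenerationConfig

    generation_config = GenerationConfig(
        max_new_tokens=30,   # stands in for max_steps: cap on generated length
        min_new_tokens=1,    # EOS is blocked until one real token is sampled,
                             # so a caption cannot come back empty
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
    )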
requirements.txt CHANGED

@@ -1,4 +1,4 @@
-torch==2.2.
+torch==2.2.0
 datasets==2.18.0
 transformers==4.40.0
 sentencepiece==0.2.0
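The old pin torch==2.2. (trailing dot, no patch digit) is not a valid PEP 440 version specifier, so pip cannot resolve it and the Space build fails; torch==2.2.0 fixes the pin. A quick way to validate requirement lines before committing, sketched with the packaging library:

    from packaging.requirements import Requirement, InvalidRequirement

    for line in ("torch==2.2.", "torch==2.2.0"):
        try:
            Requirement(line)  # parses the name and PEP 440 specifier
            print(f"{line!r}: valid")
        except InvalidRequirement as err:
            print(f"{line!r}: invalid ({err})")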