momergul committed · Commit 554adbb · Parent(s): 5f8e458

Update

Files changed:
- app.py (+7 -6)
- joint_inference.py (+7 -0)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -23,7 +23,7 @@ css="""
 def initialize_game() -> List[List[str]]:
     context_dicts = [generate_complete_game() for _ in range(2)]
 
-    roles = ["
+    roles = ["speaker"] * 3 + ["listener"] * 3
     speaker_images = []
     listener_images = []
     targets = []
@@ -40,7 +40,6 @@ def get_model_response(
     model, adapter_name, processor, index_to_token, role: str,
     image_paths: List[str], user_message: str = "", target_image: str = ""
 ) -> str:
-    model.model.set_adapter(adapter_name)
     if role == "speaker":
         img_dir = "tangram_pngs"
         print("Starting processing")
@@ -50,7 +49,7 @@ def get_model_response(
         image_paths = [image_paths]
         print("Starting inference")
         captions = get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths,
-                                        processor, img_dir, index_to_token)
+                                        processor, img_dir, index_to_token, adapter_name)
         print("Done")
         response = captions[0]
     else: # listener
@@ -63,14 +62,15 @@ def get_model_response(
         print("Starting inference")
         response = get_listener_response(
             model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
-            s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths
+            s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name
         )
         print("Done")
 
     return response
 
 @spaces.GPU(duration=20)
-def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token):
+def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token, adapter_name):
+    model.model.set_adapter(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         captions, _, _, _, _ = model.generate(
@@ -83,7 +83,8 @@ def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask
 
 @spaces.GPU(duration=20)
 def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
-                          s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths):
+                          s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name):
+    model.model.set_adapter(adapter_name)
     model = model.cuda()
     with torch.no_grad():
         _, _, joint_log_probs = model.comprehension_side([
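The substantive change in app.py is where the LoRA adapter gets selected: model.model.set_adapter(adapter_name) moves out of the CPU-side dispatcher get_model_response and into the two @spaces.GPU-decorated workers, with adapter_name threaded through as a new argument. A plausible reading is that on ZeroGPU Spaces only the decorated function runs with the GPU attached, so per-call state such as the active adapter is safest set inside it. A minimal sketch of the pattern, assuming a PEFT-wrapped model with named adapters (the function and adapter names below are illustrative, not from the commit):

    import spaces
    import torch

    @spaces.GPU(duration=20)
    def run_with_adapter(model, inputs: dict, adapter_name: str):
        # Select the LoRA adapter inside the GPU-allocated call rather than
        # in the CPU-side caller, so the selection applies where inference
        # actually runs.
        model.model.set_adapter(adapter_name)
        model = model.cuda()
        with torch.no_grad():
            return model.generate(**inputs)

    # Usage (hypothetical): run_with_adapter(model, speaker_inputs, "speaker")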
joint_inference.py CHANGED

@@ -346,6 +346,7 @@ class IdeficsJointInferenceModel(nn.Module):
         speaker = self.get_speaker()
         generation_config = GenerationConfig(
             max_new_tokens=max_steps,
+            min_new_tokens=1,
             do_sample=True,
             temperature=temperature,
             top_k=top_k, top_p=top_p,
@@ -436,6 +437,12 @@ class IdeficsJointInferenceModel(nn.Module):
             output_hidden_states=True,
             return_dict_in_generate=True
         )
+
+        print(torch.any(torch.isnan(s_input_tokens)))
+        print(torch.any(torch.isnan(s_attn_mask)))
+        print(torch.any(torch.isnan(images)))
+        print(torch.any(torch.isnan(s_image_attn_mask)))
+
         outputs = speaker.generate(
             input_ids=s_input_tokens,
             attention_mask=s_attn_mask,
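Two changes here: min_new_tokens=1 in the GenerationConfig forces the speaker to emit at least one token before EOS can be sampled, so generation can never return an empty caption, and the torch.isnan prints are temporary debug instrumentation checking the speaker inputs for NaNs before speaker.generate runs. A hedged sketch of the generation settings (the sampling values below are illustrative placeholders for the commit's max_steps, temperature, top_k, and top_p variables):

    from transformers import GenerationConfig

    generation_config = GenerationConfig(
        max_new_tokens=30,   # stands in for max_steps: cap on generated length
        min_new_tokens=1,    # EOS is blocked until one real token is sampled,
                             # so a caption cannot come back empty
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
    )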
requirements.txt CHANGED

@@ -1,4 +1,4 @@
-torch==2.2.
+torch==2.2.0
 datasets==2.18.0
 transformers==4.40.0
 sentencepiece==0.2.0
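The old pin torch==2.2. (trailing dot, no patch digit) is not a valid PEP 440 version specifier, so pip cannot resolve it and the Space build fails; torch==2.2.0 fixes the pin. A quick way to validate requirement lines before committing, sketched with the packaging library:

    from packaging.requirements import Requirement, InvalidRequirement

    for line in ("torch==2.2.", "torch==2.2.0"):
        try:
            Requirement(line)  # parses the name and PEP 440 specifier
            print(f"{line!r}: valid")
        except InvalidRequirement as err:
            print(f"{line!r}: invalid ({err})")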