first mvp

- app.py +14 -8
- gen_utils.py +3 -1
- geometry_utils.py +14 -14
- model_utils.py +4 -4
- text_encoder.py +1 -1
- tmed_denoiser.py +1 -1
app.py
CHANGED
@@ -7,6 +7,7 @@ import random
 zero = torch.Tensor([0]).cuda()
 print(zero.device) # <-- 'cpu' 🤔
 # Gül Varol
+DEFAULT_TEXT = "A person is "

 WEBSITE = """
 <div class="embed_hidden">
@@ -61,7 +62,8 @@ def download_models():
 with gr.Blocks() as demo:
     gr.Markdown(WEBSITE)

-    input_text = gr.Textbox(
+    input_text = gr.Textbox(placeholder="Type the edit text you want:",
+                            show_label=True, label="Input Text", value=DEFAULT_TEXT)
     # output_text = gr.Textbox(label="Output Text")

     with gr.Row():
@@ -76,16 +78,18 @@ with gr.Blocks() as demo:
     from tmed_denoiser import TMED_denoiser
     model_ckpt = download_models()
     checkpoint = torch.load(model_ckpt)
-
+
     checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
-    tmed_denoiser = TMED_denoiser().
+    tmed_denoiser = TMED_denoiser().to('cuda')
+    tmed_denoiser.load_state_dict(checkpoint, strict=False)
+    tmed_denoiser.eval()
     text_encoder = ClipTextEncoder()
-    texts_cond = [input_text]
+    texts_cond = [input_text.value]
+
     diffusion_process = create_diffusion(timestep_respacing=None,
                                          learn_sigma=False, sigma_small=True,
                                          diffusion_steps=300,
                                          noise_schedule='squaredcos_cap_v2',
-                                         predict_type='sample',
                                          predict_xstart=True) # noise vs sample
     # uncond_tokens = [""] * len(texts_cond)
     # if self.condition == 'text':
@@ -97,6 +101,7 @@ with gr.Blocks() as demo:
     no_of_texts = len(texts_cond)
     texts_cond = ['']*no_of_texts + texts_cond
     texts_cond = ['']*no_of_texts + texts_cond
+    print(texts_cond)
     text_emb, text_mask = text_encoder(texts_cond)

     cond_emb_motion = torch.zeros(1, bsz,
@@ -107,8 +112,9 @@ with gr.Blocks() as demo:
     mask_target = torch.ones((1, bsz),
                              dtype=bool, device='cuda')
     # complete noise
-
-
+    # import ipdb;ipdb.set_trace()
+    diff_out = tmed_denoiser._diffusion_reverse(text_emb.to(cond_emb_motion.device),
+                                                text_mask.to(cond_emb_motion.device),
                                                 cond_emb_motion,
                                                 cond_motion_mask,
                                                 mask_target,
@@ -118,7 +124,7 @@ with gr.Blocks() as demo:
                                                 gd_text=4.0,
                                                 gd_motion=2.0,
                                                 steps_num=300)
-    edited_motion = diffout2motion(diff_out)
+    edited_motion = diffout2motion(diff_out, normalizer)
     clear_button.click(clear, outputs=input_text)
     random_button.click(random_number, outputs=input_text)

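Taken together, the app.py changes assemble the full inference path: load the TMED denoiser weights, encode the prompt with CLIP, run the reverse diffusion with classifier-free guidance, and decode the output features into motion. A minimal sketch of that path under stated assumptions — the module and function names come from the diff, while nfeats, the (seq, batch, feat) layout, and any _diffusion_reverse arguments hidden between the hunks are illustrative guesses:

# Sketch only: TMED_denoiser, ClipTextEncoder, diffout2motion and
# _diffusion_reverse appear in the diff; nfeats and the tensor layout are
# assumptions, and arguments not visible between the hunks are omitted.
import torch

from geometry_utils import diffout2motion
from text_encoder import ClipTextEncoder
from tmed_denoiser import TMED_denoiser

@torch.no_grad()
def generate_motion(prompt, normalizer, ckpt_path, bsz=1, nfeats=135):
    checkpoint = torch.load(ckpt_path)
    checkpoint = {k.replace('denoiser.', ''): v for k, v in checkpoint.items()}
    denoiser = TMED_denoiser().to('cuda')
    denoiser.load_state_dict(checkpoint, strict=False)
    denoiser.eval()

    # Classifier-free guidance with two scales (gd_text, gd_motion): the diff
    # prepends empty prompts twice, giving uncond / text / text+motion rows.
    texts_cond = [prompt]
    no_of_texts = len(texts_cond)
    texts_cond = [''] * no_of_texts + texts_cond
    texts_cond = [''] * no_of_texts + texts_cond
    text_emb, text_mask = ClipTextEncoder()(texts_cond)

    # Zero motion condition: plain text-to-motion, no source motion to edit.
    cond_emb_motion = torch.zeros(1, bsz, nfeats, device='cuda')
    cond_motion_mask = torch.ones((1, bsz), dtype=bool, device='cuda')
    mask_target = torch.ones((1, bsz), dtype=bool, device='cuda')

    diff_out = denoiser._diffusion_reverse(text_emb.to('cuda'),
                                           text_mask.to('cuda'),
                                           cond_emb_motion,
                                           cond_motion_mask,
                                           mask_target,
                                           gd_text=4.0, gd_motion=2.0,
                                           steps_num=300)
    return diffout2motion(diff_out, normalizer)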
gen_utils.py
CHANGED
@@ -8,4 +8,6 @@ def cast_dict_to_tensors(d, device="cpu"):
     elif isinstance(d, torch.Tensor):
         return d.to(device)
     else:
-        return d
+        return d
+
+
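The hunk shows only the tail of cast_dict_to_tensors. For context, a plausible full version of the helper — the dict branch is an assumption about the unseen lines, following the usual recursive pattern:

# Only lines 8-11 of this helper appear in the diff; the dict branch below
# is assumed, not taken from the source.
import torch

def cast_dict_to_tensors(d, device="cpu"):
    if isinstance(d, dict):
        # Recurse into nested dicts, moving every tensor to the target device.
        return {k: cast_dict_to_tensors(v, device) for k, v in d.items()}
    elif isinstance(d, torch.Tensor):
        return d.to(device)
    else:
        return d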
geometry_utils.py
CHANGED
@@ -1,6 +1,7 @@
 import torch
+from transform3d import transform_body_pose, apply_rot_delta, remove_z_rot, get_z_rot, change_for

-def diffout2motion(diffout):
+def diffout2motion(diffout, normalizer):

     # - "body_transl_delta_pelv_xy_wo_z"
     # - "body_transl_z"
@@ -8,19 +9,19 @@ def diffout2motion(diffout):
     # - "body_orient_xy"
     # - "body_pose"
     # - "body_joints_local_wo_z_rot"
-    feats_unnorm =
-
-
-
+    feats_unnorm = normalizer.cat_inputs(normalizer.unnorm_inputs(
+                                         normalizer.uncat_inputs(diffout,
+                                                                 normalizer.input_feats_dims),
+                                         normalizer.input_feats))[0]
     # FIRST POSE FOR GENERATION & DELTAS FOR INTEGRATION
-    if "body_joints_local_wo_z_rot" in
-        idx =
-        feats_unnorm = feats_unnorm[..., :-
+    if "body_joints_local_wo_z_rot" in normalizer.input_feats:
+        idx = normalizer.input_feats.index("body_joints_local_wo_z_rot")
+        feats_unnorm = feats_unnorm[..., :-normalizer.input_feats_dims[idx]]

     first_trans = torch.zeros(*diffout.shape[:-1], 3,
-                              device=
-    if 'z_orient_delta' in
-        first_orient_z = torch.eye(3, device=
+                              device='cuda')[:, [0]]
+    if 'z_orient_delta' in normalizer.input_feats:
+        first_orient_z = torch.eye(3, device='cuda').unsqueeze(0) # Now the shape is (1, 1, 3, 3)
     first_orient_z = first_orient_z.repeat(feats_unnorm.shape[0], 1, 1) # Now the shape is (B, 1, 3, 3)
     first_orient_z = transform_body_pose(first_orient_z, 'rot->6d')

@@ -28,7 +29,6 @@ def diffout2motion(diffout):
     # integrate z orient delta --> z component tof orientation
     z_orient_delta = feats_unnorm[..., 9:15]

-    from src.tools.transforms3d import apply_rot_delta, remove_z_rot, get_z_rot, change_for
     prev_z = first_orient_z
     full_z_angle = [first_orient_z[:, None]]
     for i in range(1, z_orient_delta.shape[1]):
@@ -52,14 +52,14 @@ def diffout2motion(diffout):
     full_global_orient = transform_body_pose(full_global_orient_rotmat,
                                              'rot->6d')

-    first_trans =
+    first_trans = normalizer.cat_inputs(normalizer.unnorm_inputs(
                                         [first_trans],
                                         ['body_transl'])
                                         )[0]

     # apply deltas
     # get velocity in global c.f. and add it to the state position
-    assert 'body_transl_delta_pelv' in
+    assert 'body_transl_delta_pelv' in normalizer.input_feats
     pelvis_delta = feats_unnorm[..., :3]
     trans_vel_pelv = change_for(pelvis_delta[:, 1:],
                                 full_global_orient_rotmat[:, :-1],
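diffout2motion now leans on a normalizer object exposing uncat_inputs / unnorm_inputs / cat_inputs plus input_feats and input_feats_dims. The diff never shows that class, so here is a minimal sketch of the interface it implies; the z-score un-normalization is an assumed semantics, and the (tensor, dims) return of cat_inputs is inferred only from the [0] indexing in the diff:

# Hypothetical Normalizer matching the calls made in diffout2motion.
import torch

class Normalizer:
    def __init__(self, stats, input_feats, input_feats_dims):
        self.stats = stats                        # {feat_name: (mean, std)}
        self.input_feats = input_feats            # e.g. ['body_transl_delta_pelv', ...]
        self.input_feats_dims = input_feats_dims  # per-feature widths, e.g. [3, 6, ...]

    def uncat_inputs(self, x, dims):
        # Split the concatenated feature axis back into per-feature chunks.
        return list(torch.split(x, dims, dim=-1))

    def unnorm_inputs(self, chunks, names):
        # Undo z-score normalization per feature (assumed semantics).
        return [c * self.stats[n][1] + self.stats[n][0]
                for c, n in zip(chunks, names)]

    def cat_inputs(self, chunks):
        # Re-concatenate; the diff indexes the result with [0], so return a
        # tuple whose first element is the tensor.
        return (torch.cat(chunks, dim=-1), [c.shape[-1] for c in chunks])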
model_utils.py
CHANGED
@@ -16,7 +16,7 @@ class TimestepEmbedderMDM(nn.Module):
             nn.Linear(self.latent_dim, time_embed_dim),
             nn.SiLU(),
             nn.Linear(time_embed_dim, time_embed_dim),
-        )
+        ).to('cuda')

     def forward(self, timesteps):
         return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)
@@ -34,11 +34,11 @@ class PositionalEncoding(nn.Module):
         self.negative = negative

         if negative:
-            pe = torch.zeros(2*max_len, d_model)
+            pe = torch.zeros(2*max_len, d_model,device='cuda')
             position = torch.arange(-max_len, max_len, dtype=torch.float).unsqueeze(1)
         else:
-            pe = torch.zeros(max_len, d_model)
-            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+            pe = torch.zeros(max_len, d_model,device='cuda')
+            position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
         pe[:, 0::2] = torch.sin(position * div_term)
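Pinning the positional-encoding table and the time-embedding MLP to 'cuda' at construction works on a GPU Space but hard-codes the device. A sketch of an alternative (not what this commit does): register the table as a buffer so it follows the module through .to(device) and is saved with the state_dict. The real PositionalEncoding has more options (negative mode, dropout) than shown:

# Device-agnostic variant: pe moves with model.to('cuda') automatically.
import numpy as np
import torch
from torch import nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Standard sinusoidal frequencies, as in the original module.
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)  # not a parameter, but tracked by .to()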
text_encoder.py
CHANGED
@@ -7,7 +7,7 @@ from torch import Tensor, nn
 class ClipTextEncoder(nn.Module):
     def __init__(
         self,
-        modelpath: str='
+        modelpath: str='openai/clip-vit-large-patch14', # clip-vit-base-patch32
         finetune: bool = False,
         **kwargs
     ) -> None:
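The change points the default checkpoint at openai/clip-vit-large-patch14 (with clip-vit-base-patch32 noted as the alternative). A hedged sketch of how an encoder along these lines typically loads that checkpoint with transformers; the real class may differ in pooling, freezing, and the mask it returns:

# Assumed internals, not the repo's actual implementation.
import torch
from torch import nn
from transformers import AutoTokenizer, CLIPTextModel

class ClipTextEncoder(nn.Module):
    def __init__(self, modelpath: str = 'openai/clip-vit-large-patch14',
                 finetune: bool = False, **kwargs) -> None:
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
        self.text_model = CLIPTextModel.from_pretrained(modelpath)
        if not finetune:
            # Freeze CLIP unless finetuning is requested.
            self.text_model.requires_grad_(False)

    def forward(self, texts):
        batch = self.tokenizer(texts, padding=True, truncation=True,
                               return_tensors='pt')
        out = self.text_model(**batch)
        # Per-token embeddings plus the attention mask as booleans.
        return out.last_hidden_state, batch['attention_mask'].bool()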
tmed_denoiser.py
CHANGED
@@ -83,7 +83,7 @@ class TMED_denoiser(nn.Module):

         # 1. time_embeddingno
         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timestep.expand(noised_motion.shape[1]).clone()
+        timesteps = timestep.expand(noised_motion.shape[1]).clone().to(noised_motion.device)
         time_emb = self.embed_timestep(timesteps).to(dtype=noised_motion.dtype)
         # make it S first
         # time_emb = self.time_embedding(time_emb).unsqueeze(0)
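The fix is needed because Tensor.expand and .clone() preserve the source device, so a timestep created on the CPU stays there and later mixes with CUDA tensors in the denoiser. A toy reproduction, assuming a CUDA device is available (shapes are illustrative):

import torch

if torch.cuda.is_available():
    noised_motion = torch.randn(16, 2, 135, device='cuda')  # (S, B, F), sizes illustrative
    timestep = torch.tensor(299)                            # created on the CPU
    timesteps = timestep.expand(noised_motion.shape[1]).clone()
    print(timesteps.device)  # cpu -- expand/clone kept the original device
    timesteps = timesteps.to(noised_motion.device)          # the commit's fix
    print(timesteps.device)  # cuda:0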