Denoising UNet structure: UNet3DConditionModel( (conv_in): InflatedConv3d(9, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_proj): Timesteps() (time_embedding): TimestepEmbedding( (linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True) (act): SiLU() (linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) ) (down_blocks): ModuleList( (0): CrossAttnDownBlock3D( (attentions): ModuleList( (0-1): 2 x Transformer3DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (conv1): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (motion_modules): ModuleList( (0-1): 2 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): Linear(in_features=320, out_features=320, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) (ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=320, out_features=320, bias=True) ) ) ) (downsamplers): ModuleList( (0): Downsample3D( (conv): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (1): CrossAttnDownBlock3D( (attentions): ModuleList( (0-1): 2 x Transformer3DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (conv1): InflatedConv3d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(320, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (conv1): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (motion_modules): ModuleList( (0-1): 2 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) (ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) ) (downsamplers): ModuleList( (0): Downsample3D( (conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (2): CrossAttnDownBlock3D( (attentions): ModuleList( (0-1): 2 x Transformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (conv1): InflatedConv3d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (motion_modules): ModuleList( (0-1): 2 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) (downsamplers): ModuleList( (0): Downsample3D( (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (3): DownBlock3D( (resnets): ModuleList( (0-1): 2 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (motion_modules): ModuleList( (0-1): 2 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) ) ) (up_blocks): ModuleList( (0): UpBlock3D( (resnets): ModuleList( (0-2): 3 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True) (conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (motion_modules): ModuleList( (0-2): 3 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) (upsamplers): ModuleList( (0): Upsample3D( (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (1): CrossAttnUpBlock3D( (attentions): ModuleList( (0-2): 3 x Transformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True) (conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (2): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True) (conv1): InflatedConv3d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (motion_modules): ModuleList( (0-2): 3 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) (upsamplers): ModuleList( (0): Upsample3D( (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (2): CrossAttnUpBlock3D( (attentions): ModuleList( (0-2): 3 x Transformer3DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True) (conv1): InflatedConv3d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): InflatedConv3d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) ) (2): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True) (conv1): InflatedConv3d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(960, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (motion_modules): ModuleList( (0-2): 3 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): Linear(in_features=640, out_features=640, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) (ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=640, out_features=640, bias=True) ) ) ) (upsamplers): ModuleList( (0): Upsample3D( (conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (3): CrossAttnUpBlock3D( (attentions): ModuleList( (0-2): 3 x Transformer3DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock3D( (norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True) (conv1): InflatedConv3d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(960, 320, kernel_size=(1, 1), stride=(1, 1)) ) (1-2): 2 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) (conv1): InflatedConv3d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): InflatedConv3d(640, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (motion_modules): ModuleList( (0-2): 3 x VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): Linear(in_features=320, out_features=320, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) (ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=320, out_features=320, bias=True) ) ) ) ) ) (mid_block): UNetMidBlock3DCrossAttn( (attentions): ModuleList( (0): Transformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): TemporalBasicTransformerBlock( (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock3D( (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (motion_modules): ModuleList( (0): VanillaTemporalModule( (temporal_transformer): TemporalTransformer3DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): Linear(in_features=1280, out_features=1280, bias=True) (transformer_blocks): ModuleList( (0): TemporalTransformerBlock( (attention_blocks): ModuleList( (0-1): 2 x VersatileAttention( (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.0, inplace=False) ) ) ) (norms): ModuleList( (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) ) ) (proj_out): Linear(in_features=1280, out_features=1280, bias=True) ) ) ) ) (conv_norm_out): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) (conv_act): SiLU() (conv_out): InflatedConv3d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) Reference UNet structure: UNet2DConditionModel( (conv_in): Conv2d(5, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_proj): Timesteps() (time_embedding): TimestepEmbedding( (linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True) (act): SiLU() (linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) ) (down_blocks): ModuleList( (0): CrossAttnDownBlock2D( (attentions): ModuleList( (0-1): 2 x Transformer2DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock2D( (norm1): GroupNorm(32, 320, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): Downsample2D( (conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (1): CrossAttnDownBlock2D( (attentions): ModuleList( (0-1): 2 x Transformer2DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock2D( (norm1): GroupNorm(32, 320, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock2D( (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): Downsample2D( (conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (2): CrossAttnDownBlock2D( (attentions): ModuleList( (0-1): 2 x Transformer2DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock2D( (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock2D( (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): Downsample2D( (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) ) ) (3): DownBlock2D( (resnets): ModuleList( (0-1): 2 x ResnetBlock2D( (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) ) ) (up_blocks): ModuleList( (0): UpBlock2D( (resnets): ModuleList( (0-2): 3 x ResnetBlock2D( (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (upsamplers): ModuleList( (0): Upsample2D( (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (1): CrossAttnUpBlock2D( (attentions): ModuleList( (0-2): 3 x Transformer2DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock2D( (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) ) (2): ResnetBlock2D( (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (upsamplers): ModuleList( (0): Upsample2D( (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (2): CrossAttnUpBlock2D( (attentions): ModuleList( (0-2): 3 x Transformer2DModel( (norm): GroupNorm(32, 640, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock2D( (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1)) ) (1): ResnetBlock2D( (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1)) ) (2): ResnetBlock2D( (norm1): GroupNorm(32, 960, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1)) ) ) (upsamplers): ModuleList( (0): Upsample2D( (conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) ) ) (3): CrossAttnUpBlock2D( (attentions): ModuleList( (0-2): 3 x Transformer2DModel( (norm): GroupNorm(32, 320, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0): ResnetBlock2D( (norm1): GroupNorm(32, 960, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(960, 320, kernel_size=(1, 1), stride=(1, 1)) ) (1-2): 2 x ResnetBlock2D( (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() (conv_shortcut): LoRACompatibleConv(640, 320, kernel_size=(1, 1), stride=(1, 1)) ) ) ) ) (mid_block): UNetMidBlock2DCrossAttn( (attentions): ModuleList( (0): Transformer2DModel( (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) (transformer_blocks): ModuleList( (0): BasicTransformerBlock( (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn1): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (attn2): Attention( (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) (to_out): ModuleList( (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) (ff): FeedForward( (net): ModuleList( (0): GEGLU( (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) ) (1): Dropout(p=0.0, inplace=False) (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) ) ) ) ) (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlock2D( (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (nonlinearity): SiLU() ) ) ) (conv_norm_out): None (conv_act): SiLU() ) Pose Guider structure: PoseGuider( (conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (blocks): ModuleList( (0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) (2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) (4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) (conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) image_enc: CLIPVisionModelWithProjection( (vision_model): CLIPVisionTransformer( (embeddings): CLIPVisionEmbeddings( (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False) (position_embedding): Embedding(257, 1024) ) (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (encoder): CLIPEncoder( (layers): ModuleList( (0-23): 24 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=1024, out_features=4096, bias=True) (fc2): Linear(in_features=4096, out_features=1024, bias=True) ) (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) ) ) (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) (visual_projection): Linear(in_features=1024, out_features=768, bias=False) ) Pose Guider structure: PoseGuider( (conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (blocks): ModuleList( (0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) (2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) (4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) ) (conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) ) pipe: Pose2VideoPipeline { "_class_name": "Pose2VideoPipeline", "_diffusers_version": "0.24.0", "denoising_unet": [ "src.models.unet_3d", "UNet3DConditionModel" ], "image_encoder": [ "transformers", "CLIPVisionModelWithProjection" ], "image_proj_model": [ null, null ], "pose_guider": [ "src.models.pose_guider", "PoseGuider" ], "reference_unet": [ "src.models.unet_2d_condition", "UNet2DConditionModel" ], "scheduler": [ "diffusers", "DDIMScheduler" ], "text_encoder": [ null, null ], "tokenizer": [ null, null ], "vae": [ "diffusers", "AutoencoderKL" ] }