VIVID / model_structures.log
Teatime666's picture
Add files using upload-large-folder tool
823e49a verified
Denoising UNet structure:
UNet3DConditionModel(
(conv_in): InflatedConv3d(9, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_proj): Timesteps()
(time_embedding): TimestepEmbedding(
(linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)
(act): SiLU()
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
(down_blocks): ModuleList(
(0): CrossAttnDownBlock3D(
(attentions): ModuleList(
(0-1): 2 x Transformer3DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
(norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(motion_modules): ModuleList(
(0-1): 2 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Linear(in_features=320, out_features=320, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=320, out_features=320, bias=True)
)
)
)
(downsamplers): ModuleList(
(0): Downsample3D(
(conv): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(1): CrossAttnDownBlock3D(
(attentions): ModuleList(
(0-1): 2 x Transformer3DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): InflatedConv3d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
(norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
(norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(motion_modules): ModuleList(
(0-1): 2 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
)
(downsamplers): ModuleList(
(0): Downsample3D(
(conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(2): CrossAttnDownBlock3D(
(attentions): ModuleList(
(0-1): 2 x Transformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): InflatedConv3d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(motion_modules): ModuleList(
(0-1): 2 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
(downsamplers): ModuleList(
(0): Downsample3D(
(conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(3): DownBlock3D(
(resnets): ModuleList(
(0-1): 2 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(motion_modules): ModuleList(
(0-1): 2 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
)
(up_blocks): ModuleList(
(0): UpBlock3D(
(resnets): ModuleList(
(0-2): 3 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True)
(conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(motion_modules): ModuleList(
(0-2): 3 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
(upsamplers): ModuleList(
(0): Upsample3D(
(conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(1): CrossAttnUpBlock3D(
(attentions): ModuleList(
(0-2): 3 x Transformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True)
(conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(2): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(motion_modules): ModuleList(
(0-2): 3 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
(upsamplers): ModuleList(
(0): Upsample3D(
(conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(2): CrossAttnUpBlock3D(
(attentions): ModuleList(
(0-2): 3 x Transformer3DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
(norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
(norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
(2): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True)
(conv1): InflatedConv3d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
(norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(motion_modules): ModuleList(
(0-2): 3 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
(ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
)
(upsamplers): ModuleList(
(0): Upsample3D(
(conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(3): CrossAttnUpBlock3D(
(attentions): ModuleList(
(0-2): 3 x Transformer3DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True)
(conv1): InflatedConv3d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
(norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(960, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1-2): 2 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): InflatedConv3d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
(norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): InflatedConv3d(640, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(motion_modules): ModuleList(
(0-2): 3 x VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Linear(in_features=320, out_features=320, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
(ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=320, out_features=320, bias=True)
)
)
)
)
)
(mid_block): UNetMidBlock3DCrossAttn(
(attentions): ModuleList(
(0): Transformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): TemporalBasicTransformerBlock(
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock3D(
(norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
(norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(motion_modules): ModuleList(
(0): VanillaTemporalModule(
(temporal_transformer): TemporalTransformer3DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0): TemporalTransformerBlock(
(attention_blocks): ModuleList(
(0-1): 2 x VersatileAttention(
(Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(pos_encoder): PositionalEncoding(
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(norms): ModuleList(
(0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
(ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
)
(conv_norm_out): InflatedGroupNorm(32, 320, eps=1e-05, affine=True)
(conv_act): SiLU()
(conv_out): InflatedConv3d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
Reference UNet structure:
UNet2DConditionModel(
(conv_in): Conv2d(5, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_proj): Timesteps()
(time_embedding): TimestepEmbedding(
(linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)
(act): SiLU()
(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
)
(down_blocks): ModuleList(
(0): CrossAttnDownBlock2D(
(attentions): ModuleList(
(0-1): 2 x Transformer2DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(downsamplers): ModuleList(
(0): Downsample2D(
(conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(1): CrossAttnDownBlock2D(
(attentions): ModuleList(
(0-1): 2 x Transformer2DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(downsamplers): ModuleList(
(0): Downsample2D(
(conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(2): CrossAttnDownBlock2D(
(attentions): ModuleList(
(0-1): 2 x Transformer2DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
(downsamplers): ModuleList(
(0): Downsample2D(
(conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
)
(3): DownBlock2D(
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
)
)
(up_blocks): ModuleList(
(0): UpBlock2D(
(resnets): ModuleList(
(0-2): 3 x ResnetBlock2D(
(norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(upsamplers): ModuleList(
(0): Upsample2D(
(conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(1): CrossAttnUpBlock2D(
(attentions): ModuleList(
(0-2): 3 x Transformer2DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(2): ResnetBlock2D(
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(upsamplers): ModuleList(
(0): Upsample2D(
(conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(2): CrossAttnUpBlock2D(
(attentions): ModuleList(
(0-2): 3 x Transformer2DModel(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
(2): ResnetBlock2D(
(norm1): GroupNorm(32, 960, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)
(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(upsamplers): ModuleList(
(0): Upsample2D(
(conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(3): CrossAttnUpBlock2D(
(attentions): ModuleList(
(0-2): 3 x Transformer2DModel(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0): ResnetBlock2D(
(norm1): GroupNorm(32, 960, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(960, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1-2): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
(conv_shortcut): LoRACompatibleConv(640, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
)
)
(mid_block): UNetMidBlock2DCrossAttn(
(attentions): ModuleList(
(0): Transformer2DModel(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn1): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn2): Attention(
(to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)
(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)
(to_out): ModuleList(
(0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(ff): FeedForward(
(net): ModuleList(
(0): GEGLU(
(proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)
)
)
)
)
(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(resnets): ModuleList(
(0-1): 2 x ResnetBlock2D(
(norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)
(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nonlinearity): SiLU()
)
)
)
(conv_norm_out): None
(conv_act): SiLU()
)
Pose Guider structure:
PoseGuider(
(conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(blocks): ModuleList(
(0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
(conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
image_enc:
CLIPVisionModelWithProjection(
(vision_model): CLIPVisionTransformer(
(embeddings): CLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
(position_embedding): Embedding(257, 1024)
)
(pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(encoder): CLIPEncoder(
(layers): ModuleList(
(0-23): 24 x CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=1024, out_features=1024, bias=True)
(v_proj): Linear(in_features=1024, out_features=1024, bias=True)
(q_proj): Linear(in_features=1024, out_features=1024, bias=True)
(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=1024, out_features=4096, bias=True)
(fc2): Linear(in_features=4096, out_features=1024, bias=True)
)
(layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(visual_projection): Linear(in_features=1024, out_features=768, bias=False)
)
Pose Guider structure:
PoseGuider(
(conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(blocks): ModuleList(
(0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
(conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
pipe:
Pose2VideoPipeline {
"_class_name": "Pose2VideoPipeline",
"_diffusers_version": "0.24.0",
"denoising_unet": [
"src.models.unet_3d",
"UNet3DConditionModel"
],
"image_encoder": [
"transformers",
"CLIPVisionModelWithProjection"
],
"image_proj_model": [
null,
null
],
"pose_guider": [
"src.models.pose_guider",
"PoseGuider"
],
"reference_unet": [
"src.models.unet_2d_condition",
"UNet2DConditionModel"
],
"scheduler": [
"diffusers",
"DDIMScheduler"
],
"text_encoder": [
null,
null
],
"tokenizer": [
null,
null
],
"vae": [
"diffusers",
"AutoencoderKL"
]
}