| Denoising UNet structure: | |
| UNet3DConditionModel( | |
| (conv_in): InflatedConv3d(9, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_proj): Timesteps() | |
| (time_embedding): TimestepEmbedding( | |
| (linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True) | |
| (act): SiLU() | |
| (linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (down_blocks): ModuleList( | |
| (0): CrossAttnDownBlock3D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer3DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) | |
| (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-1): 2 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=320, out_features=320, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=320, out_features=320, bias=True) | |
| ) | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample3D( | |
| (conv): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (1): CrossAttnDownBlock3D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer3DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) | |
| (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(320, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) | |
| (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-1): 2 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=640, out_features=640, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=640, out_features=640, bias=True) | |
| ) | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample3D( | |
| (conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (2): CrossAttnDownBlock3D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-1): 2 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=1280, out_features=1280, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample3D( | |
| (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (3): DownBlock3D( | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-1): 2 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=1280, out_features=1280, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (up_blocks): ModuleList( | |
| (0): UpBlock3D( | |
| (resnets): ModuleList( | |
| (0-2): 3 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-2): 3 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=1280, out_features=1280, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample3D( | |
| (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (1): CrossAttnUpBlock3D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 2560, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-2): 3 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=1280, out_features=1280, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample3D( | |
| (conv): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (2): CrossAttnUpBlock3D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer3DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1920, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) | |
| (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) | |
| (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True) | |
| (norm2): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(960, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-2): 3 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=640, out_features=640, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=640, out_features=640, bias=True) | |
| ) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample3D( | |
| (conv): InflatedConv3d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (3): CrossAttnUpBlock3D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer3DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 960, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) | |
| (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(960, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1-2): 2 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True) | |
| (norm2): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): InflatedConv3d(640, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0-2): 3 x VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=320, out_features=320, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=320, out_features=320, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (mid_block): UNetMidBlock3DCrossAttn( | |
| (attentions): ModuleList( | |
| (0): Transformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalBasicTransformerBlock( | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock3D( | |
| (norm1): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): InflatedGroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): InflatedConv3d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (motion_modules): ModuleList( | |
| (0): VanillaTemporalModule( | |
| (temporal_transformer): TemporalTransformer3DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Linear(in_features=1280, out_features=1280, bias=True) | |
| (transformer_blocks): ModuleList( | |
| (0): TemporalTransformerBlock( | |
| (attention_blocks): ModuleList( | |
| (0-1): 2 x VersatileAttention( | |
| (Module Info) Attention_Mode: Temporal, Is_Cross_Attention: False | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| (pos_encoder): PositionalEncoding( | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| ) | |
| (norms): ModuleList( | |
| (0-1): 2 x LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (ff_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (conv_norm_out): InflatedGroupNorm(32, 320, eps=1e-05, affine=True) | |
| (conv_act): SiLU() | |
| (conv_out): InflatedConv3d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| Reference UNet structure: | |
| UNet2DConditionModel( | |
| (conv_in): Conv2d(5, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_proj): Timesteps() | |
| (time_embedding): TimestepEmbedding( | |
| (linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True) | |
| (act): SiLU() | |
| (linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (down_blocks): ModuleList( | |
| (0): CrossAttnDownBlock2D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer2DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 320, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample2D( | |
| (conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (1): CrossAttnDownBlock2D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer2DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 320, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) | |
| (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) | |
| (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample2D( | |
| (conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (2): CrossAttnDownBlock2D( | |
| (attentions): ModuleList( | |
| (0-1): 2 x Transformer2DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| (downsamplers): ModuleList( | |
| (0): Downsample2D( | |
| (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (3): DownBlock2D( | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| ) | |
| ) | |
| (up_blocks): ModuleList( | |
| (0): UpBlock2D( | |
| (resnets): ModuleList( | |
| (0-2): 3 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample2D( | |
| (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (1): CrossAttnUpBlock2D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer2DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample2D( | |
| (conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (2): CrossAttnUpBlock2D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer2DModel( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) | |
| (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) | |
| (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 960, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True) | |
| (norm2): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (upsamplers): ModuleList( | |
| (0): Upsample2D( | |
| (conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| (3): CrossAttnUpBlock2D( | |
| (attentions): ModuleList( | |
| (0-2): 3 x Transformer2DModel( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0): ResnetBlock2D( | |
| (norm1): GroupNorm(32, 960, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(960, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1-2): 2 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 640, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True) | |
| (norm2): GroupNorm(32, 320, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| (conv_shortcut): LoRACompatibleConv(640, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (mid_block): UNetMidBlock2DCrossAttn( | |
| (attentions): ModuleList( | |
| (0): Transformer2DModel( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn1): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (attn2): Attention( | |
| (to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False) | |
| (to_out): ModuleList( | |
| (0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (ff): FeedForward( | |
| (net): ModuleList( | |
| (0): GEGLU( | |
| (proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (resnets): ModuleList( | |
| (0-1): 2 x ResnetBlock2D( | |
| (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True) | |
| (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True) | |
| (dropout): Dropout(p=0.0, inplace=False) | |
| (conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (nonlinearity): SiLU() | |
| ) | |
| ) | |
| ) | |
| (conv_norm_out): None | |
| (conv_act): SiLU() | |
| ) | |
| Pose Guider structure: | |
| PoseGuider( | |
| (conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (blocks): ModuleList( | |
| (0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| (2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| (4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| (conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| image_enc: | |
| CLIPVisionModelWithProjection( | |
| (vision_model): CLIPVisionTransformer( | |
| (embeddings): CLIPVisionEmbeddings( | |
| (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False) | |
| (position_embedding): Embedding(257, 1024) | |
| ) | |
| (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
| (encoder): CLIPEncoder( | |
| (layers): ModuleList( | |
| (0-23): 24 x CLIPEncoderLayer( | |
| (self_attn): CLIPAttention( | |
| (k_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
| (v_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
| (q_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
| (out_proj): Linear(in_features=1024, out_features=1024, bias=True) | |
| ) | |
| (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
| (mlp): CLIPMLP( | |
| (activation_fn): QuickGELUActivation() | |
| (fc1): Linear(in_features=1024, out_features=4096, bias=True) | |
| (fc2): Linear(in_features=4096, out_features=1024, bias=True) | |
| ) | |
| (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| ) | |
| (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| (visual_projection): Linear(in_features=1024, out_features=768, bias=False) | |
| ) | |
| Pose Guider structure: | |
| PoseGuider( | |
| (conv_in): InflatedConv3d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (blocks): ModuleList( | |
| (0): InflatedConv3d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (1): InflatedConv3d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| (2): InflatedConv3d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (3): InflatedConv3d(32, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| (4): InflatedConv3d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| (5): InflatedConv3d(96, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| (conv_out): InflatedConv3d(256, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| pipe: | |
| Pose2VideoPipeline { | |
| "_class_name": "Pose2VideoPipeline", | |
| "_diffusers_version": "0.24.0", | |
| "denoising_unet": [ | |
| "src.models.unet_3d", | |
| "UNet3DConditionModel" | |
| ], | |
| "image_encoder": [ | |
| "transformers", | |
| "CLIPVisionModelWithProjection" | |
| ], | |
| "image_proj_model": [ | |
| null, | |
| null | |
| ], | |
| "pose_guider": [ | |
| "src.models.pose_guider", | |
| "PoseGuider" | |
| ], | |
| "reference_unet": [ | |
| "src.models.unet_2d_condition", | |
| "UNet2DConditionModel" | |
| ], | |
| "scheduler": [ | |
| "diffusers", | |
| "DDIMScheduler" | |
| ], | |
| "text_encoder": [ | |
| null, | |
| null | |
| ], | |
| "tokenizer": [ | |
| null, | |
| null | |
| ], | |
| "vae": [ | |
| "diffusers", | |
| "AutoencoderKL" | |
| ] | |
| } | |