|
| 1 | +model: |
| 2 | + target: cldm.cldm.ControlLDM |
| 3 | + params: |
| 4 | + linear_start: 0.00085 |
| 5 | + linear_end: 0.0120 |
| 6 | + num_timesteps_cond: 1 |
| 7 | + log_every_t: 200 |
| 8 | + timesteps: 1000 |
| 9 | + first_stage_key: "jpg" |
| 10 | + cond_stage_key: "txt" |
| 11 | + control_key: "hint" |
| 12 | + image_size: 64 |
| 13 | + channels: 4 |
| 14 | + cond_stage_trainable: false |
| 15 | + conditioning_key: crossattn |
| 16 | + monitor: val/loss_simple_ema |
| 17 | + scale_factor: 0.18215 |
| 18 | + use_ema: False |
| 19 | + only_mid_control: False |
| 20 | + |
| 21 | + control_stage_config: |
| 22 | + target: cldm.cldm.ControlNet |
| 23 | + params: |
| 24 | + use_checkpoint: True |
| 25 | + image_size: 32 # unused |
| 26 | + in_channels: 4 |
| 27 | + hint_channels: 3 |
| 28 | + model_channels: 320 |
| 29 | + attention_resolutions: [ 4, 2, 1 ] |
| 30 | + num_res_blocks: 2 |
| 31 | + channel_mult: [ 1, 2, 4, 4 ] |
| 32 | + num_head_channels: 64 # needs to be fixed for flash-attn |
| 33 | + use_spatial_transformer: True |
| 34 | + use_linear_in_transformer: True |
| 35 | + transformer_depth: 1 |
| 36 | + context_dim: 1024 |
| 37 | + legacy: False |
| 38 | + |
| 39 | + unet_config: |
| 40 | + target: cldm.cldm.ControlledUnetModel |
| 41 | + params: |
| 42 | + use_checkpoint: True |
| 43 | + image_size: 32 # unused |
| 44 | + in_channels: 4 |
| 45 | + out_channels: 4 |
| 46 | + model_channels: 320 |
| 47 | + attention_resolutions: [ 4, 2, 1 ] |
| 48 | + num_res_blocks: 2 |
| 49 | + channel_mult: [ 1, 2, 4, 4 ] |
| 50 | + num_head_channels: 64 # needs to be fixed for flash-attn |
| 51 | + use_spatial_transformer: True |
| 52 | + use_linear_in_transformer: True |
| 53 | + transformer_depth: 1 |
| 54 | + context_dim: 1024 |
| 55 | + legacy: False |
| 56 | + |
| 57 | + first_stage_config: |
| 58 | + target: ldm.models.autoencoder.AutoencoderKL |
| 59 | + params: |
| 60 | + embed_dim: 4 |
| 61 | + monitor: val/rec_loss |
| 62 | + ddconfig: |
| 63 | + #attn_type: "vanilla-xformers" |
| 64 | + double_z: true |
| 65 | + z_channels: 4 |
| 66 | + resolution: 256 |
| 67 | + in_channels: 3 |
| 68 | + out_ch: 3 |
| 69 | + ch: 128 |
| 70 | + ch_mult: |
| 71 | + - 1 |
| 72 | + - 2 |
| 73 | + - 4 |
| 74 | + - 4 |
| 75 | + num_res_blocks: 2 |
| 76 | + attn_resolutions: [] |
| 77 | + dropout: 0.0 |
| 78 | + lossconfig: |
| 79 | + target: torch.nn.Identity |
| 80 | + |
| 81 | + cond_stage_config: |
| 82 | + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder |
| 83 | + params: |
| 84 | + freeze: True |
| 85 | + layer: "penultimate" |
0 commit comments