-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathtrain.sh
More file actions
115 lines (102 loc) · 3.43 KB
/
Copy pathtrain.sh
File metadata and controls
115 lines (102 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
### Stage 1
# Recommended hardware: a 32x A100 cluster (training takes roughly 1.5 days);
# 8x A100 also works, it just takes longer.

# Basic configuration
BATCH_SIZE=4   # forwarded as --train_batch_size
EPOCHS=50      # forwarded as --num_train_epochs
GAC=1          # forwarded as --gradient_accumulation_steps
LR=1e-4        # forwarded as --learning_rate

# Training parameters
# TRAIN_ARGS is one space-separated string that is expanded UNQUOTED on the
# launch line so the shell splits it into one argument per flag. Values must
# therefore not contain whitespace or glob characters.

# Output location and run name
TRAIN_ARGS="--output_dir=/tmp --name=postermaker_debug_stage1"
# Precision, optimizer and learning-rate schedule
TRAIN_ARGS="${TRAIN_ARGS} --mixed_precision=fp16 --learning_rate=${LR}"
TRAIN_ARGS="${TRAIN_ARGS} --num_train_epochs=${EPOCHS} --train_batch_size=${BATCH_SIZE}"
TRAIN_ARGS="${TRAIN_ARGS} --gradient_accumulation_steps=${GAC} --lr_warmup_steps=500"
TRAIN_ARGS="${TRAIN_ARGS} --lr_scheduler=constant_with_warmup --adam_epsilon=1e-15"
# Data loading, checkpoint and validation intervals
TRAIN_ARGS="${TRAIN_ARGS} --dataloader_num_workers=10 --checkpointing_steps=10 --validation_steps=10"
# Training resolution
TRAIN_ARGS="${TRAIN_ARGS} --resolution_h=1024 --resolution_w=1024"
# Pretrained weights (paths are relative to the directory the script is run from)
TRAIN_ARGS="${TRAIN_ARGS} --pretrained_model_name_or_path=./checkpoints/stable-diffusion-3-medium-diffusers/"
TRAIN_ARGS="${TRAIN_ARGS} --ctrl_layers=12"
TRAIN_ARGS="${TRAIN_ARGS} --controlnet_model_name_or_path=./checkpoints/SD3-Controlnet-Inpainting"
# Text-related and remaining options
TRAIN_ARGS="${TRAIN_ARGS} --max_num_texts=7 --char_padding_to_len=16 --text_feature_drop=0.1"
TRAIN_ARGS="${TRAIN_ARGS} --p_drop_caption=0 --cfg_scale=5.0"

# Start training
# HF_HUB_OFFLINE, WORLD_SIZE and RANK are set for this one command only
# (WORLD_SIZE/RANK are presumably read by the training script -- confirm there).
# The command has to end on this line: a trailing backslash would splice the
# next line of the file into the python3 argument list.
# NOTE(review): nothing checks this command's exit status, so the script carries
# on with whatever follows even if stage 1 fails -- append `|| exit 1` if
# fail-fast behaviour is wanted.
# shellcheck disable=SC2086 -- word splitting of TRAIN_ARGS is intentional
HF_HUB_OFFLINE=1 WORLD_SIZE=1 RANK=-1 python3 train_sd3_stage1.py $TRAIN_ARGS
### Stage 2 (without reward)
# Recommended hardware: a 32x A100 cluster (training takes roughly 1.5 days);
# 8x A100 also works, it just takes longer.

# Basic configuration
BATCH_SIZE=2   # forwarded as --train_batch_size
EPOCHS=50      # forwarded as --num_train_epochs
GAC=1          # forwarded as --gradient_accumulation_steps
LR=1e-4        # forwarded as --learning_rate

# Training parameters
# TRAIN_ARGS is one space-separated string that is expanded UNQUOTED on the
# launch line so the shell splits it into one argument per flag. Values must
# therefore not contain whitespace or glob characters.

# Output location and run name
TRAIN_ARGS="--output_dir=/tmp --name=postermaker_debug_stage2_wo_reward"
# Precision, optimizer and learning-rate schedule
TRAIN_ARGS="${TRAIN_ARGS} --mixed_precision=fp16 --learning_rate=${LR}"
TRAIN_ARGS="${TRAIN_ARGS} --num_train_epochs=${EPOCHS} --train_batch_size=${BATCH_SIZE}"
TRAIN_ARGS="${TRAIN_ARGS} --gradient_accumulation_steps=${GAC} --lr_warmup_steps=500"
TRAIN_ARGS="${TRAIN_ARGS} --lr_scheduler=constant_with_warmup --adam_epsilon=1e-15"
# Data loading, checkpoint and validation intervals
TRAIN_ARGS="${TRAIN_ARGS} --dataloader_num_workers=10 --checkpointing_steps=20 --validation_steps=1000"
# Training resolution
TRAIN_ARGS="${TRAIN_ARGS} --resolution_h=1024 --resolution_w=1024"
# Pretrained weights (relative paths are resolved from the directory the script is run from)
TRAIN_ARGS="${TRAIN_ARGS} --pretrained_model_name_or_path=./checkpoints/stable-diffusion-3-medium-diffusers/"
TRAIN_ARGS="${TRAIN_ARGS} --ctrl_layers=12"
TRAIN_ARGS="${TRAIN_ARGS} --controlnet_model_name_or_path=./checkpoints/SD3-Controlnet-Inpainting"
# Second checkpoint: lives under /tmp/postermaker_debug_stage1, i.e. presumably
# a checkpoint written by the stage 1 run -- verify it exists before launching.
TRAIN_ARGS="${TRAIN_ARGS} --controlnet_model_name_or_path2=/tmp/postermaker_debug_stage1/0_net_postermaker_debug_stage1.pth"
# Text-related and remaining options
TRAIN_ARGS="${TRAIN_ARGS} --max_num_texts=7 --char_padding_to_len=16 --text_feature_drop=0 --cfg_scale=5.0"
# NOTE: --bg_inpaint must be present to enable stage 2 training
TRAIN_ARGS="${TRAIN_ARGS} --bg_inpaint"

# Start training
# HF_HUB_OFFLINE, WORLD_SIZE and RANK are set for this one command only
# (WORLD_SIZE/RANK are presumably read by the training script -- confirm there).
# shellcheck disable=SC2086 -- word splitting of TRAIN_ARGS is intentional
HF_HUB_OFFLINE=1 WORLD_SIZE=1 RANK=-1 python3 train_sd3_stage2.py $TRAIN_ARGS
### Stage 2 with reward
# Must be run with DeepSpeed on >= 8 GPUs (a single GPU will run out of memory);
# a 32x A100 cluster is recommended.

# Basic configuration
BATCH_SIZE=1   # forwarded as --train_batch_size
EPOCHS=50      # forwarded as --num_train_epochs
GAC=1          # forwarded as --gradient_accumulation_steps
LR=1e-4        # forwarded as --learning_rate

# Training parameters
# TRAIN_ARGS is one space-separated string that is expanded UNQUOTED on the
# launch line so the shell splits it into one argument per flag. Values must
# therefore not contain whitespace or glob characters.

# Output location and run name
TRAIN_ARGS="--output_dir=/tmp --name=postermaker_debug_stage2_w_reward"
# Precision, optimizer and learning-rate schedule
TRAIN_ARGS="${TRAIN_ARGS} --mixed_precision=fp16 --learning_rate=${LR}"
TRAIN_ARGS="${TRAIN_ARGS} --num_train_epochs=${EPOCHS} --train_batch_size=${BATCH_SIZE}"
TRAIN_ARGS="${TRAIN_ARGS} --gradient_accumulation_steps=${GAC} --lr_warmup_steps=500"
TRAIN_ARGS="${TRAIN_ARGS} --lr_scheduler=constant_with_warmup --adam_epsilon=1e-15"
# Data loading, checkpoint and validation intervals
TRAIN_ARGS="${TRAIN_ARGS} --dataloader_num_workers=10 --checkpointing_steps=20 --validation_steps=1000"
# Training resolution
TRAIN_ARGS="${TRAIN_ARGS} --resolution_h=1024 --resolution_w=1024"
# Pretrained weights (relative paths are resolved from the directory the script is run from)
TRAIN_ARGS="${TRAIN_ARGS} --pretrained_model_name_or_path=./checkpoints/stable-diffusion-3-medium-diffusers/"
TRAIN_ARGS="${TRAIN_ARGS} --ctrl_layers=12"
TRAIN_ARGS="${TRAIN_ARGS} --controlnet_model_name_or_path=./checkpoints/SD3-Controlnet-Inpainting"
# Second checkpoint: lives under /tmp/postermaker_debug_stage2_wo_reward, i.e.
# presumably a checkpoint written by the reward-free stage 2 run -- verify it
# exists before launching.
TRAIN_ARGS="${TRAIN_ARGS} --controlnet_model_name_or_path2=/tmp/postermaker_debug_stage2_wo_reward/0_net_postermaker_debug_stage2_wo_reward.pth"
# Text-related and remaining options
TRAIN_ARGS="${TRAIN_ARGS} --max_num_texts=7 --char_padding_to_len=16 --text_feature_drop=0 --cfg_scale=5.0"
TRAIN_ARGS="${TRAIN_ARGS} --bg_inpaint"
# SAM checkpoints used by this stage (vit_h variant)
TRAIN_ARGS="${TRAIN_ARGS} --sam_net_type=vit_h"
TRAIN_ARGS="${TRAIN_ARGS} --sam_dir=./checkpoints/outgrowth_det/sam_vit_h_4b8939.pth"
TRAIN_ARGS="${TRAIN_ARGS} --sam_head_dir=./checkpoints/outgrowth_det/vit_h_epoch10.pth"
# Memory-related switches
TRAIN_ARGS="${TRAIN_ARGS} --gradient_checkpointing --deepspeed"

# Start training
# HF_HUB_OFFLINE, WORLD_SIZE and RANK are set for this one command only.
# NOTE(review): the header above calls for a multi-GPU DeepSpeed run, but this
# line starts a single python3 process with WORLD_SIZE=1 RANK=-1 -- confirm how
# the multi-GPU launch is meant to be done.
# shellcheck disable=SC2086 -- word splitting of TRAIN_ARGS is intentional
HF_HUB_OFFLINE=1 WORLD_SIZE=1 RANK=-1 python3 train_sd3_stage2_with_reward.py $TRAIN_ARGS