Commit c866f64: reformat

1 parent 13ad050

7 files changed: +65 -8 lines changed

mftcoder_accelerate/README.md

Lines changed: 5 additions & 1 deletion
@@ -176,10 +176,14 @@ DeepSpeed config in accelerate_ds_config.yaml.
 accelerate launch --config_file accelerate_ds_config.yaml pefts/mft_accelerate.py --train_config configs/xxx_train_config.json --distributed_type "DeepSpeed"
 ```
 or
-DeepSpeed config in command line arguments
+DeepSpeed Zero2 config in command line arguments
 ```bash
 sh ds_single_launch.sh
 ```
+DeepSpeed Zero3 config in command line arguments
+```bash
+sh ds_zero3_single_launch.sh
+```

 #### Launch via FSDP
 FSDP config in accelerate_fsdp_config.yaml.
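
The two DeepSpeed launch modes documented here (config in accelerate_ds_config.yaml, or Zero2/Zero3 flags passed to `accelerate launch`) can also be expressed programmatically. Below is a minimal sketch, assuming Hugging Face Accelerate's `DeepSpeedPlugin`, of roughly how the Zero3 command-line flags map onto plugin fields; it is an illustration, not code from this commit.

```python
# Rough programmatic equivalent of the Zero3 flags in ds_zero3_single_launch.sh
# (illustration only, not part of this commit).
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

ds_plugin = DeepSpeedPlugin(
    zero_stage=3,                    # --zero_stage 3 (Zero2 script would use 2)
    offload_optimizer_device="cpu",  # --offload_optimizer_device 'cpu'
    offload_param_device="cpu",      # --offload_param_device 'cpu'
    gradient_accumulation_steps=1,   # --gradient_accumulation_steps 1
    gradient_clipping=1.0,           # --gradient_clipping 1.0
    zero3_init_flag=True,            # --zero3_init_flag true
    zero3_save_16bit_model=True,     # --zero3_save_16bit_model true
)
accelerator = Accelerator(mixed_precision="bf16", deepspeed_plugin=ds_plugin)
```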

mftcoder_accelerate/README_cn.md

Lines changed: 6 additions & 1 deletion
@@ -153,11 +153,16 @@ accelerate launch --config_file accelerate_ds_config.yaml pefts/mft_accelerate.p
 ```
 or

-DeepSpeed config passed via command-line arguments in the script
+DeepSpeed Zero2 config passed via command-line arguments in the script
 ```bash
 sh ds_single_launch.sh
 ```

+DeepSpeed Zero3 config passed via command-line arguments in the script
+```bash
+sh ds_zero3_single_launch.sh
+```
+
 #### Launch via FSDP
 FSDP config is in accelerate_fsdp_config.yaml.
 ```bash

mftcoder_accelerate/src/ds_single_launch.sh

Lines changed: 4 additions & 1 deletion
@@ -6,6 +6,9 @@
 # Launch script on single node
 N_GPU_PER_NODE=8

+# config path
+CONFIG="configs/xxx_train_config.json"
+
 # envs used inside training
 export OMP_NUM_THREADS=4
 export TOKENIZERS_PARALLELISM=False
@@ -30,6 +33,6 @@ accelerate launch \
 --same_network \
 --machine_rank 0 \
 --rdzv_backend 'static' \
-pefts/mft_accelerate.py --train_config configs/"xxx_train_config.json" \
+pefts/mft_accelerate.py --train_config "$CONFIG" \
 --distributed_type "deepspeed" \
 > MFTCoder-training-"$TODAY".log 2>&1 &

mftcoder_accelerate/src/ds_zero3_single_launch.sh

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+#!/bin/sh
+# Author: Chaoyu Chen
+# Last Modified: 2024/12/11
+# Description: An alternative (command line) way to launch DeepSpeed training
+
+# Launch script on single node
+N_GPU_PER_NODE=8
+
+# config path
+CONFIG="configs/xxx_train_config.json"
+
+# envs used inside training
+export OMP_NUM_THREADS=4
+export TOKENIZERS_PARALLELISM=False
+
+TODAY=$(date +%Y-%m%d-%H%M)
+
+# accelerate launch --config_file accelerate_ds_config.yaml \
+accelerate launch \
+--num_machines 1 \
+--num_processes $N_GPU_PER_NODE \
+--use_deepspeed \
+--zero_stage 3 \
+--offload_optimizer_device 'cpu' \
+--offload_param_device 'cpu' \
+--gradient_accumulation_steps 1 \
+--gradient_clipping 1.0 \
+--zero3_init_flag true \
+--zero3_save_16bit_model true \
+--main_training_function 'main' \
+--mixed_precision 'bf16' \
+--dynamo_backend 'no' \
+--same_network \
+--machine_rank 0 \
+--rdzv_backend 'static' \
+pefts/mft_accelerate.py --train_config "$CONFIG" \
+--distributed_type "deepspeed" \
+> MFTCoder-training-"$TODAY".log 2>&1 &
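
For reference, a sketch of the DeepSpeed JSON config these Zero3 flags roughly translate to; the key names are standard DeepSpeed config keys, but the mapping is an approximation and this file is not part of the commit.

```python
# Approximate DeepSpeed config implied by the Zero3 launch flags above
# (illustration only; "auto" fields and the output filename are assumptions).
import json

ds_zero3_config = {
    "bf16": {"enabled": True},                 # --mixed_precision 'bf16'
    "gradient_accumulation_steps": 1,          # --gradient_accumulation_steps 1
    "gradient_clipping": 1.0,                  # --gradient_clipping 1.0
    "train_micro_batch_size_per_gpu": "auto",
    "zero_optimization": {
        "stage": 3,                            # --zero_stage 3
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"},
        "stage3_gather_16bit_weights_on_model_save": True,  # --zero3_save_16bit_model
    },
}

with open("ds_zero3_config.json", "w") as f:  # hypothetical filename
    json.dump(ds_zero3_config, f, indent=2)
```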

mftcoder_accelerate/src/fsdp_single_launch.sh

Lines changed: 10 additions & 2 deletions
@@ -6,6 +6,14 @@
 # Launch script on single node
 N_GPU_PER_NODE=8

+# config path
+CONFIG="configs/xxx_train_config.json"
+
+# fsdp_transformer_layer_cls_to_wrap, choose the DecoderLayer
+WRAP_MODULE="LlamaDecoderLayer"
+
+
+
 # envs used inside training
 export OMP_NUM_THREADS=4
 export TOKENIZERS_PARALLELISM=False
@@ -21,15 +29,15 @@ accelerate launch \
 --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
 --fsdp_state_dict_type=FULL_STATE_DICT \
 --fsdp_backward_prefetch_policy=BACKWARD_PRE \
---fsdp_transformer_layer_cls_to_wrap=LlamaDecoderLayer \
+--fsdp_transformer_layer_cls_to_wrap=$WRAP_MODULE \
 --fsdp_offload_params=false \
 --main_training_function=main \
 --mixed_precision=bf16 \
 --dynamo_backend=no \
 --same_network \
 --machine_rank=0 \
 --rdzv_backend=static \
-pefts/mft_accelerate.py --train_config configs/"xxx_train_config.json" \
+pefts/mft_accelerate.py --train_config "$CONFIG" \
 --distributed_type "fsdp" \
 > MFTCoder-training-"$TODAY".log 2>&1 &

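
WRAP_MODULE must name the transformer block class of the model being trained; LlamaDecoderLayer only matches Llama-family models. A small hypothetical helper (not in the repo) for listing candidate class names on an already-loaded Hugging Face model:

```python
# Hypothetical helper (not part of this commit): list submodule class names that
# look like transformer blocks, to choose a value for WRAP_MODULE /
# --fsdp_transformer_layer_cls_to_wrap when training a non-Llama model.
from torch import nn

def wrap_module_candidates(model: nn.Module) -> set[str]:
    return {
        type(module).__name__
        for module in model.modules()
        if type(module).__name__.endswith(("DecoderLayer", "Block"))
    }

# e.g. wrap_module_candidates(model) -> {"LlamaDecoderLayer"} for a Llama model
```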

mftcoder_accelerate/src/pefts/mft_accelerate.py

Lines changed: 1 addition & 2 deletions
@@ -418,7 +418,7 @@ def main():
 bias="lora_only",
 )

-# # whether to add new special tokens
+# new special tokens
 # num_added_toks = tokenizer.tokenizer.add_special_tokens(["<role_start>", "<role_end>"])
 # accelerator.print("We have added", num_added_toks, "tokens")
 # accelerator.print(f"role marker tokens {tokenizer.convert_tokens_to_ids('<role_start>')} {tokenizer.convert_tokens_to_ids('<role_end>')}, resized tokenizer_size: {len(tokenizer)}")
@@ -465,7 +465,6 @@ def main():
 tokenizer = build_tokenizer(args)
 # Note: resize_token_embeddings expects to receive the full size of the new vocabulary,
 # i.e. the length of the tokenizer.
-# if new special tokens are added, the input and output embeddings need to be resized
 # model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)

 accelerator.print("Model load_in_4bit: ", args.quantization == "4bit")

mftcoder_accelerate/src/pefts/trainer.py

Lines changed: 1 addition & 1 deletion
@@ -221,7 +221,7 @@ def accelerate_saving_checkpoint(self, output_dir: str, completed_steps: int):
 "latest_ckpt": output_dir,
 "lr": self.optimizer.param_groups[0]["lr"],
 # 1 step back because checkpointing happens after scheduler.step()
-"scheduler_last_ep": self.lr_scheduler.state_dict().get("last_epoch", 0) - 1,
+# "scheduler_last_ep": self.lr_scheduler.state_dict().get("last_epoch", 0) - 1,
 }
 with open(os.path.join(self.args.output_dir, "latest"), "w") as f:
 json.dump(latest, f, indent=2)
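
Background on the `- 1` that this hunk comments out: PyTorch LR schedulers expose `last_epoch` in `state_dict()`, and it has already been advanced by the time the checkpoint is written, since checkpointing happens after `scheduler.step()`. A standalone illustration in plain PyTorch, not the repo's trainer:

```python
# Standalone PyTorch illustration (not the repo's trainer): last_epoch starts at 0
# after construction and is incremented by every scheduler.step(), so a value read
# right after step() is one ahead, which is why the removed line subtracted 1.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

optimizer.step()
scheduler.step()
print(scheduler.state_dict()["last_epoch"])  # 1
```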
