
Commit b8f2101

clear flag (#10870)
1 parent a205bc3 commit b8f2101

5 files changed: +89 -112 lines

scripts/distribute/ci_case_auto.sh

Lines changed: 89 additions & 98 deletions
@@ -217,113 +217,105 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0
-
export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
-
+
export CUDA_DEVICE_MAX_CONNECTIONS=1

- flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
- for i in "${!flags[@]}"; do
- flag="${flags[$i]}"
+ test_cases=(
+ "default" "" 1
+ "tensor_fusion_overlap1" "enable_tensor_fusion enable_overlap" 1
+ "tensor_fusion_overlap2" "enable_tensor_fusion enable_overlap" 2
+ )

- if [ -n "$flag" ]; then
- for f in $flag; do
- export "$f=true"
- done
- fi
- acc_steps=(1)
- if [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
- acc_steps=(1 2)
- fi
- for acc_step in "${acc_steps[@]}"; do
- task_name="llama_dygraph_auto_bs4_bf16_SD2_$f"
- case_out_dir="output/$task_name"
- case_log_dir="output/$task_name""_log"
- rm -rf $case_out_dir
- rm -rf $case_log_dir
+ for ((i=0; i<${#test_cases[@]}; i+=3)); do
+ case_name=${test_cases[i]}
+ sharding_config=${test_cases[i+1]}
+ acc_step=${test_cases[i+2]}

- python -u -m paddle.distributed.launch \
- --gpus "0,1" \
- --log_dir "output/$task_name""_log" \
- ./run_pretrain_auto.py \
- --model_name_or_path "meta-llama/Llama-2-7b" \
- --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
- --input_dir "./data" \
- --output_dir "./output" \
- --weight_decay 0.01 \
- --warmup_ratio 0.01 \
- --max_grad_norm 1.0 \
- --learning_rate 3e-05 \
- --min_learning_rate 3e-06 \
- --max_steps 10 \
- --logging_steps 10 \
- --eval_steps 1000 \
- --save_steps 50000 \
- --continue_training 0 \
- --do_train true \
- --do_eval false \
- --do_predict false \
- --disable_tqdm true \
- --skip_profile_timer true \
- --device gpu \
- --enable_auto_parallel 1 \
- --per_device_train_batch_size 1 \
- --gradient_accumulation_steps $acc_step \
- --per_device_eval_batch_size 2 \
- --recompute false \
- --recompute_use_reentrant true \
- --recompute_granularity full \
- --pp_recompute_interval 0 \
- --bf16 true \
- --fp16_opt_level "O2" \
- --amp_master_grad true \
- --fuse_attention_ffn true \
- --fuse_attention_qkv true \
- --fused_linear_param_grad_add 1 \
- --use_flash_attention true \
- --use_fused_rope true \
- --use_fused_rms_norm true \
- --max_seq_length 4096 \
- --sequence_parallel false \
- --pipeline_parallel_degree 1 \
- --tensor_parallel_degree 1 \
- --sharding "stage1" \
- --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
- --sharding_parallel_config "enable_tensor_fusion enable_overlap" \
- --to_static 0 \
- --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
- --amp_custom_white_list "lookup_table" "lookup_table_v2" \
- --num_hidden_layers 4 \
- >>${log_path}/$FUNCNAME 2>&1
- loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
- ips=-1
- mem=-1
- echo "result: loss=$loss ips=$ips mem=$mem"
- echo "flag=$flag acc_step=$acc_step"
- if [ -z "$flag" ]; then
- loss_base=9.23504105
- elif [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
- if [ $acc_step -eq 1 ]; then
- loss_base=9.23504868
- else
- loss_base=9.16484451
- fi
- else
- loss_base=-1
- fi
+ task_name="llama_dygraph_auto_bs4_bf16_SD2_${case_name}_acc${acc_step}"

- ips_base=-1
- mem_base=-1
- check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
- done

- if [ -n "$flag" ]; then
- for f in $flag; do
- export "$f=false"
- done
+ case_out_dir="output/$task_name"
+ case_log_dir="output/$task_name""_log"
+ rm -rf $case_out_dir
+ rm -rf $case_log_dir
+
+ python -u -m paddle.distributed.launch \
+ --gpus "0,1" \
+ --log_dir "output/$task_name""_log" \
+ ./run_pretrain_auto.py \
+ --model_name_or_path "meta-llama/Llama-2-7b" \
+ --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
+ --input_dir "./data" \
+ --output_dir "./output" \
+ --weight_decay 0.01 \
+ --warmup_ratio 0.01 \
+ --max_grad_norm 1.0 \
+ --learning_rate 3e-05 \
+ --min_learning_rate 3e-06 \
+ --max_steps 10 \
+ --logging_steps 10 \
+ --eval_steps 1000 \
+ --save_steps 50000 \
+ --continue_training 0 \
+ --do_train true \
+ --do_eval false \
+ --do_predict false \
+ --disable_tqdm true \
+ --skip_profile_timer true \
+ --device gpu \
+ --enable_auto_parallel 1 \
+ --per_device_train_batch_size 1 \
+ --gradient_accumulation_steps $acc_step \
+ --per_device_eval_batch_size 2 \
+ --recompute false \
+ --recompute_use_reentrant true \
+ --recompute_granularity full \
+ --pp_recompute_interval 0 \
+ --bf16 true \
+ --fp16_opt_level "O2" \
+ --amp_master_grad true \
+ --fuse_attention_ffn true \
+ --fuse_attention_qkv true \
+ --fused_linear_param_grad_add 1 \
+ --use_flash_attention true \
+ --use_fused_rope true \
+ --use_fused_rms_norm true \
+ --max_seq_length 4096 \
+ --sequence_parallel false \
+ --pipeline_parallel_degree 1 \
+ --tensor_parallel_degree 1 \
+ --sharding "stage1" \
+ --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
+ --sharding_parallel_config $sharding_config \
+ --to_static 0 \
+ --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+ --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+ --num_hidden_layers 4 \
+ >>${log_path}/$FUNCNAME 2>&1
+ loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+ ips=-1
+ mem=-1
+ echo "result: loss=$loss ips=$ips mem=$mem"
+ echo "case=$case_name sharding_config=$sharding_config acc_step=$acc_step"
+ if [ "$case_name" = "default" ]; then
+ loss_base=9.23504105
+ elif [ "$case_name" = "tensor_fusion_overlap" ]; then
+ if [ $acc_step -eq 1 ]; then
+ loss_base=9.23504868
+ else
+ loss_base=9.16484451
+ fi
+ else
+ loss_base=-1
fi
+
+ ips_base=-1
+ mem_base=-1
+ check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
done
+
echo "=========== $FUNCNAME run end ==========="
fi
}
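
For reference, a minimal standalone sketch (not part of the commit) of the flat-array pattern the new loop above relies on: every test case occupies three consecutive array slots (case name, sharding parallel config, gradient accumulation steps), and the loop walks the array with a stride of 3. The short variable names and the echo are illustrative only.

#!/usr/bin/env bash
# Flat array: name, sharding config, accumulation steps, repeated per case.
test_cases=(
    "default" "" 1
    "tensor_fusion_overlap1" "enable_tensor_fusion enable_overlap" 1
    "tensor_fusion_overlap2" "enable_tensor_fusion enable_overlap" 2
)
# Step through the array three slots at a time.
for ((i=0; i<${#test_cases[@]}; i+=3)); do
    name=${test_cases[i]}
    config=${test_cases[i+1]}
    acc=${test_cases[i+2]}
    echo "case=$name sharding_config='$config' acc_step=$acc"
done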
@@ -3984,7 +3976,6 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
export NVIDIA_TF32_OVERRIDE=0

export CUDA_DEVICE_MAX_CONNECTIONS=1
- export FLAGS_enable_inplace_master_grad=1
export FLAGS_auto_parallel_align_mode=1
export FLAGS_max_inplace_grad_add=65536
export FLAGS_embedding_deterministic=1
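
The loss check in the llama_dygraph_auto_bs4_bf16_SD2 hunk above parses workerlog.0 with a grep/awk pipeline. A minimal sketch (not part of the commit) of what that pipeline extracts, assuming a hypothetical log line format; the real workerlog.0 layout may differ.

#!/usr/bin/env bash
# Hypothetical sample line; only the 'global_step: 10' and 'loss: <value>,' parts matter here.
sample="global_step: 10, loss: 9.23504105, lr: 3e-05"
# Same pipeline as in the script: keep the step-10 line, split on 'loss: ', cut at the next comma.
echo "$sample" | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'
# Prints: 9.23504105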

tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dynamic_auto_bs32_bf16_DP1_MP4_PP1_Sharding8_Stage1.sh

Lines changed: 0 additions & 3 deletions
@@ -20,9 +20,6 @@ param+="nnodes=4 "
param+="model_type=baichuan2_13b "
param+='dynamic_auto=_dynamic_auto '

- export FLAGS_enable_sharding_overlap=1
- export FLAGS_enable_tensor_fusion=1
-
cd ./tests
bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh

tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dynamic_auto_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage1.sh

Lines changed: 0 additions & 4 deletions
@@ -20,10 +20,6 @@ param+="nnodes=4 "
param+="model_type=llama2_13b "
param+='dynamic_auto=_dynamic_auto '

- # Enable tensor fusion and sharding overlap optimization
- export FLAGS_enable_tensor_fusion=1
- export FLAGS_enable_sharding_overlap=1
-
cd ./tests
bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh

tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-7b_pretrain_dynamic_auto_bs32_bf16_Sharding32_Stage2.sh

Lines changed: 0 additions & 4 deletions
@@ -20,10 +20,6 @@ param+="nnodes=4 "
param+="model_type=llama2_7b "
param+='dynamic_auto=_dynamic_auto '

- # Enable tensor fusion and sharding overlap optimization
- export FLAGS_enable_tensor_fusion=1
- export FLAGS_enable_sharding_overlap=1
-
cd ./tests
bash ./test_tipc/static/auto_parallel/llama2/benchmark_common/prepare.sh

tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dynamic_auto_bs32_bf16_DP1_MP2_Sharding16_Stage1.sh

Lines changed: 0 additions & 3 deletions
@@ -20,9 +20,6 @@ param+="nnodes=4 "
param+="model_type=qwen_14b "
param+='dynamic_auto=_dynamic_auto '

- export FLAGS_enable_tensor_fusion=1
- export FLAGS_enable_sharding_overlap=1
-
cd ./tests
bash ./test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh
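
All four benchmark scripts above drop the same FLAGS_enable_tensor_fusion / FLAGS_enable_sharding_overlap exports. A small hedged check (not part of the commit) that a shell about to run these scripts does not still carry the flags from an earlier session:

#!/usr/bin/env bash
# Illustrative only; the flag names come from the exports removed above.
for f in FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap; do
    # ${!f} is bash indirect expansion: read the variable whose name is held in f.
    if [ -z "${!f:-}" ]; then
        echo "$f is unset or empty"
    else
        echo "$f is still set to '${!f}'; consider running: unset $f"
    fi
done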
