
Commit 15d49bb

[AutoParallel] Open dynamic sharding CI test (#10793)
* open sharding CI test and remove flag
* update loss_base
1 parent 44eff1f commit 15d49bb

5 files changed (+5, -18 lines)

scripts/distribute/ci_case_auto.sh

Lines changed: 5 additions & 10 deletions
@@ -166,7 +166,7 @@ function llm_gpt_case_list_auto() {
     llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2
     llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2
-    # llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate
+    llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate
     llm_gpt_pir_auto_bs4_TP2
     llm_gpt_pir_auto_bs4_TP2_PP2
     llm_gpt_pir_auto_bs8_DP2_TP2_PP2
@@ -225,7 +225,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {

     export CUDA_DEVICE_MAX_CONNECTIONS=1

-    flags=("" "FLAGS_fuse_allreduce_in_opt" "FLAGS_fuse_reducescatter_in_opt" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
+    flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
     for i in "${!flags[@]}"; do
         flag="${flags[$i]}"

@@ -292,7 +292,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
         --tensor_parallel_degree 1 \
         --sharding "stage1" \
         --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
-        --sharding_parallel_config "" \
+        --sharding_parallel_config "enable_tensor_fusion enable_overlap" \
         --to_static 0 \
         --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
         --amp_custom_white_list "lookup_table" "lookup_table_v2" \
@@ -304,10 +304,6 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
     echo "result: loss=$loss ips=$ips mem=$mem"
     echo "flag=$flag acc_step=$acc_step"
     if [ -z "$flag" ]; then
-        loss_base=9.23504791
-    elif [ "$flag" = "FLAGS_fuse_allreduce_in_opt" ]; then
-        loss_base=9.23502579
-    elif [ "$flag" = "FLAGS_fuse_reducescatter_in_opt" ]; then
         loss_base=9.23504105
     elif [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
         if [ $acc_step -eq 1 ]; then
@@ -2798,11 +2794,11 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
     mem=-1
     echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
     # loss_base=10.58456802 # note: need to debug
-    loss_base=10.56716251
+    loss_base=10.56668091
     ips_base=-1
     mem_base=-1
     if [ $IS_A100 -ne 0 ];then
-        loss_base=10.56166935 # after add dropout spmd
+        loss_base=10.56199837 # after add dropout spmd
     fi
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
@@ -3989,7 +3985,6 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
     export NVIDIA_TF32_OVERRIDE=0

     export CUDA_DEVICE_MAX_CONNECTIONS=1
-    export FLAGS_fuse_reducescatter_in_opt=1
     export FLAGS_enable_inplace_master_grad=1
     export FLAGS_auto_parallel_align_mode=1
     export FLAGS_max_inplace_grad_add=65536
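
For context, after this change the SD2 case in ci_case_auto.sh sweeps only two flag combinations: the default run and the tensor-fusion plus sharding-overlap run. A minimal sketch of that sweep, assuming each flag named in a combination is exported as 1 before the launch (the launch step itself is a placeholder, not the exact CI code):

flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
for i in "${!flags[@]}"; do
    flag="${flags[$i]}"
    for f in $flag; do
        export "$f"=1      # switch on every flag listed in this combination
    done
    # ... launch the llama dygraph auto-parallel run here ...
    # the resulting loss is then compared against the per-combination
    # loss_base via check_result, as shown in the hunk above
done

With the two fuse_* combinations dropped, only the plain baseline (loss_base=9.23504105) and the tensor-fusion/overlap baselines remain.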

tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dynamic_auto_bs32_bf16_DP1_MP4_PP1_Sharding8_Stage1.sh

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ param+="nnodes=4 "
 param+="model_type=baichuan2_13b "
 param+='dynamic_auto=_dynamic_auto '

-export FLAGS_fuse_reducescatter_in_opt=1
 export FLAGS_enable_sharding_overlap=1
 export FLAGS_enable_tensor_fusion=1

tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-13b_pretrain_dynamic_auto_bs32_bf16_DP1_MP1_PP4_VPP5_Sharding8_Stage1.sh

Lines changed: 0 additions & 3 deletions
@@ -20,9 +20,6 @@ param+="nnodes=4 "
 param+="model_type=llama2_13b "
 param+='dynamic_auto=_dynamic_auto '

-# This optimization currently only runs in the dynamic automatic parallelism of Llama7B.
-export FLAGS_fuse_reducescatter_in_opt=1
-
 # Enable tensor fusion and sharding overlap optimization
 export FLAGS_enable_tensor_fusion=1
 export FLAGS_enable_sharding_overlap=1

tests/test_tipc/static/auto_parallel/llama2/N4C32/meta-llama-Llama-2-7b_pretrain_dynamic_auto_bs32_bf16_Sharding32_Stage2.sh

Lines changed: 0 additions & 3 deletions
@@ -20,9 +20,6 @@ param+="nnodes=4 "
 param+="model_type=llama2_7b "
 param+='dynamic_auto=_dynamic_auto '

-# This optimization currently only runs in the dynamic automatic parallelism of Llama7B.
-export FLAGS_fuse_reducescatter_in_opt=1
-
 # Enable tensor fusion and sharding overlap optimization
 export FLAGS_enable_tensor_fusion=1
 export FLAGS_enable_sharding_overlap=1

tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dynamic_auto_bs32_bf16_DP1_MP2_Sharding16_Stage1.sh

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ param+="nnodes=4 "
 param+="model_type=qwen_14b "
 param+='dynamic_auto=_dynamic_auto '

-export FLAGS_fuse_reducescatter_in_opt=1
 export FLAGS_enable_tensor_fusion=1
 export FLAGS_enable_sharding_overlap=1
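
The four static N4C32 configs above now rely on the same two switches. A representative environment block after this change, reconstructed from the diffs (the comment line follows the llama2 configs), would be:

# Enable tensor fusion and sharding overlap optimization
export FLAGS_enable_tensor_fusion=1
export FLAGS_enable_sharding_overlap=1
# FLAGS_fuse_reducescatter_in_opt is no longer exported by these configs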
