@@ -217,113 +217,105 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
    export PYTHONPATH=$root_path/:$PYTHONPATH
    export FLAGS_call_stack_level=3
    export NVIDIA_TF32_OVERRIDE=0
-
    export FLAGS_cudnn_deterministic=1
    export FLAGS_embedding_deterministic=1
-
+
    export CUDA_DEVICE_MAX_CONNECTIONS=1

-    flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
-    for i in "${!flags[@]}"; do
-        flag="${flags[$i]}"
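+    # test case triples: (case_name, sharding_parallel_config, gradient_accumulation_steps)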
+    test_cases=(
+        "default" "" 1
+        "tensor_fusion_overlap1" "enable_tensor_fusion enable_overlap" 1
+        "tensor_fusion_overlap2" "enable_tensor_fusion enable_overlap" 2
+    )

-        if [ -n "$flag" ]; then
-            for f in $flag; do
-                export "$f=true"
-            done
-        fi
-        acc_steps=(1)
-        if [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
-            acc_steps=(1 2)
-        fi
-        for acc_step in "${acc_steps[@]}"; do
-            task_name="llama_dygraph_auto_bs4_bf16_SD2_$f"
-            case_out_dir="output/$task_name"
-            case_log_dir="output/$task_name""_log"
-            rm -rf $case_out_dir
-            rm -rf $case_log_dir
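+    # walk the flat test_cases array in strides of three fields per case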
+    for ((i=0; i<${#test_cases[@]}; i+=3)); do
+        case_name=${test_cases[i]}
+        sharding_config=${test_cases[i+1]}
+        acc_step=${test_cases[i+2]}

-            python -u -m paddle.distributed.launch \
-                --gpus "0,1" \
-                --log_dir "output/$task_name""_log" \
-                ./run_pretrain_auto.py \
-                --model_name_or_path "meta-llama/Llama-2-7b" \
-                --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
-                --input_dir "./data" \
-                --output_dir "./output" \
-                --weight_decay 0.01 \
-                --warmup_ratio 0.01 \
-                --max_grad_norm 1.0 \
-                --learning_rate 3e-05 \
-                --min_learning_rate 3e-06 \
-                --max_steps 10 \
-                --logging_steps 10 \
-                --eval_steps 1000 \
-                --save_steps 50000 \
-                --continue_training 0 \
-                --do_train true \
-                --do_eval false \
-                --do_predict false \
-                --disable_tqdm true \
-                --skip_profile_timer true \
-                --device gpu \
-                --enable_auto_parallel 1 \
-                --per_device_train_batch_size 1 \
-                --gradient_accumulation_steps $acc_step \
-                --per_device_eval_batch_size 2 \
-                --recompute false \
-                --recompute_use_reentrant true \
-                --recompute_granularity full \
-                --pp_recompute_interval 0 \
-                --bf16 true \
-                --fp16_opt_level "O2" \
-                --amp_master_grad true \
-                --fuse_attention_ffn true \
-                --fuse_attention_qkv true \
-                --fused_linear_param_grad_add 1 \
-                --use_flash_attention true \
-                --use_fused_rope true \
-                --use_fused_rms_norm true \
-                --max_seq_length 4096 \
-                --sequence_parallel false \
-                --pipeline_parallel_degree 1 \
-                --tensor_parallel_degree 1 \
-                --sharding "stage1" \
-                --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
-                --sharding_parallel_config "enable_tensor_fusion enable_overlap" \
-                --to_static 0 \
-                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-                --num_hidden_layers 4 \
-                >> ${log_path}/$FUNCNAME 2>&1
-            loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-            ips=-1
-            mem=-1
-            echo "result: loss=$loss ips=$ips mem=$mem"
-            echo "flag=$flag acc_step=$acc_step"
-            if [ -z "$flag" ]; then
-                loss_base=9.23504105
-            elif [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
-                if [ $acc_step -eq 1 ]; then
-                    loss_base=9.23504868
-                else
-                    loss_base=9.16484451
-                fi
-            else
-                loss_base=-1
-            fi
+        task_name="llama_dygraph_auto_bs4_bf16_SD2_${case_name}_acc${acc_step}"

-            ips_base=-1
-            mem_base=-1
-            check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
-        done

-        if [ -n "$flag" ]; then
-            for f in $flag; do
-                export "$f=false"
-            done
+        case_out_dir="output/$task_name"
+        case_log_dir="output/$task_name""_log"
+        rm -rf $case_out_dir
+        rm -rf $case_log_dir
+
+        python -u -m paddle.distributed.launch \
+            --gpus "0,1" \
+            --log_dir "output/$task_name""_log" \
+            ./run_pretrain_auto.py \
+            --model_name_or_path "meta-llama/Llama-2-7b" \
+            --tokenizer_name_or_path "meta-llama/Llama-2-7b" \
+            --input_dir "./data" \
+            --output_dir "./output" \
+            --weight_decay 0.01 \
+            --warmup_ratio 0.01 \
+            --max_grad_norm 1.0 \
+            --learning_rate 3e-05 \
+            --min_learning_rate 3e-06 \
+            --max_steps 10 \
+            --logging_steps 10 \
+            --eval_steps 1000 \
+            --save_steps 50000 \
+            --continue_training 0 \
+            --do_train true \
+            --do_eval false \
+            --do_predict false \
+            --disable_tqdm true \
+            --skip_profile_timer true \
+            --device gpu \
+            --enable_auto_parallel 1 \
+            --per_device_train_batch_size 1 \
+            --gradient_accumulation_steps $acc_step \
+            --per_device_eval_batch_size 2 \
+            --recompute false \
+            --recompute_use_reentrant true \
+            --recompute_granularity full \
+            --pp_recompute_interval 0 \
+            --bf16 true \
+            --fp16_opt_level "O2" \
+            --amp_master_grad true \
+            --fuse_attention_ffn true \
+            --fuse_attention_qkv true \
+            --fused_linear_param_grad_add 1 \
+            --use_flash_attention true \
+            --use_fused_rope true \
+            --use_fused_rms_norm true \
+            --max_seq_length 4096 \
+            --sequence_parallel false \
+            --pipeline_parallel_degree 1 \
+            --tensor_parallel_degree 1 \
+            --sharding "stage1" \
+            --data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
+            --sharding_parallel_config "$sharding_config" \
+            --to_static 0 \
+            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+            --num_hidden_layers 4 \
+            >> ${log_path}/$FUNCNAME 2>&1
+        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+        ips=-1
+        mem=-1
+        echo "result: loss=$loss ips=$ips mem=$mem"
+        echo "case=$case_name sharding_config=$sharding_config acc_step=$acc_step"
+        if [ "$case_name" = "default" ]; then
+            loss_base=9.23504105
+        elif [[ "$case_name" == tensor_fusion_overlap* ]]; then
+            if [ $acc_step -eq 1 ]; then
+                loss_base=9.23504868
+            else
+                loss_base=9.16484451
+            fi
+        else
+            loss_base=-1
        fi
+
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
    done
+
    echo "=========== $FUNCNAME run end ==========="
    fi
}
@@ -3984,7 +3976,6 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
    export NVIDIA_TF32_OVERRIDE=0

    export CUDA_DEVICE_MAX_CONNECTIONS=1
-    export FLAGS_enable_inplace_master_grad=1
    export FLAGS_auto_parallel_align_mode=1
    export FLAGS_max_inplace_grad_add=65536
    export FLAGS_embedding_deterministic=1