Commit 58bed42

GITD245 and cyy536 authored
[Auto Parallel] change fused_layers.py to support fused_linear_grad_add / mp_async_allreduce / sp_async_reduce_scatter at the same time (#10797)
* add origin linear
* fix
* update mock_layer in run_pretrain_auto
* fix
* fix
* fix to_static
* fix to_static
* support sp_async_reducescatter
* polish
* fix ci bug: no flashattn in v100
* fix loss base
* close mp async when sp is on

Co-authored-by: chenyuyan <2903319587@qq.com>
1 parent 15d49bb commit 58bed42
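
Each run_pretrain_auto.py below applies the same gating before calling mock_layers. The following is a minimal standalone sketch of that logic; TrainingArgsStub and resolve_flags are hypothetical stand-ins introduced for illustration, and only the flag derivation and the mock_layers call are taken from the diff.

# Minimal sketch (not part of the commit): how the three flags passed to
# mock_layers() are derived. TrainingArgsStub is a hypothetical stand-in for
# the training arguments parsed by parse_args_into_dataclasses().
from dataclasses import dataclass


@dataclass
class TrainingArgsStub:
    enable_linear_fused_grad_add: bool = False
    enable_auto_parallel: bool = True
    tensor_parallel_degree: int = 1
    tensor_parallel_config: str = ""
    sequence_parallel: bool = False
    to_static: bool = False


def resolve_flags(args):
    fused_grad_add = args.enable_linear_fused_grad_add
    # MP async allreduce is only requested when sequence parallel is off
    # ("close mp async when sp is on").
    mp_async_allreduce = (
        args.enable_auto_parallel
        and args.tensor_parallel_degree > 1
        and "enable_mp_async_allreduce" in args.tensor_parallel_config
        and not args.sequence_parallel
    )
    # SP async reduce-scatter is only requested when sequence parallel is on.
    sp_async_reduce_scatter = (
        args.enable_auto_parallel
        and args.tensor_parallel_degree > 1
        and args.sequence_parallel
        and "enable_sp_async_reduce_scatter" in args.tensor_parallel_config
    )
    return fused_grad_add, mp_async_allreduce, sp_async_reduce_scatter


args = TrainingArgsStub(tensor_parallel_degree=2, tensor_parallel_config="enable_mp_async_allreduce")
flags = resolve_flags(args)
if any(flags) and not args.to_static:
    # In the scripts this is: mock_layers(*flags), imported from llm.utils.fused_layers.
    print("mock_layers(%s, %s, %s)" % flags)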

7 files changed: +271 additions, -67 deletions

llm/auto_parallel/deepseek-v3/run_pretrain_auto.py

Lines changed: 17 additions & 4 deletions
@@ -460,12 +460,25 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
-    if training_args.enable_linear_fused_grad_add:
+    do_enable_linear_fused_grad_add = training_args.enable_linear_fused_grad_add
+    do_enable_mp_async_allreduce = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and "enable_mp_async_allreduce" in training_args.tensor_parallel_config
+        and not training_args.sequence_parallel
+    )
+    do_enable_sp_async_reduce_scatter = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and training_args.sequence_parallel
+        and "enable_sp_async_reduce_scatter" in training_args.tensor_parallel_config
+    )
+    if (
+        do_enable_linear_fused_grad_add or do_enable_mp_async_allreduce or do_enable_sp_async_reduce_scatter
+    ) and not training_args.to_static:
         from llm.utils.fused_layers import mock_layers
 
-        mock_layers(
-            mp_async_allreduce=True if "enable_mp_async_allreduce" in training_args.tensor_parallel_config else False
-        )
+        mock_layers(do_enable_linear_fused_grad_add, do_enable_mp_async_allreduce, do_enable_sp_async_reduce_scatter)
 
     if model_args.tokenizer_name_or_path is None:
         model_args.tokenizer_name_or_path = model_args.model_name_or_path

llm/auto_parallel/gpt-3/run_pretrain_auto.py

Lines changed: 17 additions & 4 deletions
@@ -444,12 +444,25 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
-    if training_args.enable_linear_fused_grad_add:
+    do_enable_linear_fused_grad_add = training_args.enable_linear_fused_grad_add
+    do_enable_mp_async_allreduce = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and "enable_mp_async_allreduce" in training_args.tensor_parallel_config
+        and not training_args.sequence_parallel
+    )
+    do_enable_sp_async_reduce_scatter = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and training_args.sequence_parallel
+        and "enable_sp_async_reduce_scatter" in training_args.tensor_parallel_config
+    )
+    if (
+        do_enable_linear_fused_grad_add or do_enable_mp_async_allreduce or do_enable_sp_async_reduce_scatter
+    ) and not training_args.to_static:
         from llm.utils.fused_layers import mock_layers
 
-        mock_layers(
-            mp_async_allreduce=True if "enable_mp_async_allreduce" in training_args.tensor_parallel_config else False
-        )
+        mock_layers(do_enable_linear_fused_grad_add, do_enable_mp_async_allreduce, do_enable_sp_async_reduce_scatter)
 
     if model_args.tokenizer_name_or_path is None:
         model_args.tokenizer_name_or_path = model_args.model_name_or_path

llm/auto_parallel/llama/run_pretrain_auto.py

Lines changed: 11 additions & 11 deletions
@@ -464,18 +464,25 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
+    do_enable_linear_fused_grad_add = training_args.enable_linear_fused_grad_add
+    do_enable_mp_async_allreduce = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and "enable_mp_async_allreduce" in training_args.tensor_parallel_config
+        and not training_args.sequence_parallel
+    )
     do_enable_sp_async_reduce_scatter = (
         training_args.enable_auto_parallel
         and training_args.tensor_parallel_degree > 1
         and training_args.sequence_parallel
         and "enable_sp_async_reduce_scatter" in training_args.tensor_parallel_config
     )
-    if training_args.enable_linear_fused_grad_add and not do_enable_sp_async_reduce_scatter:
+    if (
+        do_enable_linear_fused_grad_add or do_enable_mp_async_allreduce or do_enable_sp_async_reduce_scatter
+    ) and not training_args.to_static:
         from llm.utils.fused_layers import mock_layers
 
-        mock_layers(
-            mp_async_allreduce=True if "enable_mp_async_allreduce" in training_args.tensor_parallel_config else False
-        )
+        mock_layers(do_enable_linear_fused_grad_add, do_enable_mp_async_allreduce, do_enable_sp_async_reduce_scatter)
 
     if model_args.tokenizer_name_or_path is None:
         model_args.tokenizer_name_or_path = model_args.model_name_or_path
@@ -611,13 +618,6 @@ def fn(layer):
 
         model.apply(fn)
 
-    if do_enable_sp_async_reduce_scatter:
-        from llm.utils.sp_async_reduce_scatter import (
-            mock_layers_sp_async_reduce_scatter,
-        )
-
-        mock_layers_sp_async_reduce_scatter(model)
-
     # Create the learning_rate scheduler and optimizer
     if training_args.decay_steps is None:
         training_args.decay_steps = training_args.max_steps

llm/auto_parallel/qwen/run_pretrain_auto.py

Lines changed: 17 additions & 4 deletions
@@ -433,12 +433,25 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
-    if training_args.enable_linear_fused_grad_add:
+    do_enable_linear_fused_grad_add = training_args.enable_linear_fused_grad_add
+    do_enable_mp_async_allreduce = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and "enable_mp_async_allreduce" in training_args.tensor_parallel_config
+        and not training_args.sequence_parallel
+    )
+    do_enable_sp_async_reduce_scatter = (
+        training_args.enable_auto_parallel
+        and training_args.tensor_parallel_degree > 1
+        and training_args.sequence_parallel
+        and "enable_sp_async_reduce_scatter" in training_args.tensor_parallel_config
+    )
+    if (
+        do_enable_linear_fused_grad_add or do_enable_mp_async_allreduce or do_enable_sp_async_reduce_scatter
+    ) and not training_args.to_static:
         from llm.utils.fused_layers import mock_layers
 
-        mock_layers(
-            mp_async_allreduce=True if "enable_mp_async_allreduce" in training_args.tensor_parallel_config else False
-        )
+        mock_layers(do_enable_linear_fused_grad_add, do_enable_mp_async_allreduce, do_enable_sp_async_reduce_scatter)
 
     if model_args.tokenizer_name_or_path is None:
         model_args.tokenizer_name_or_path = model_args.model_name_or_path
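
The changed llm/utils/fused_layers.py itself is not part of this excerpt. Below is a hypothetical sketch of a combined entry point that accepts all three optimizations at once, as the new call sites require; everything beyond the mock_layers name and its three positional flags is an assumption, not the commit's actual implementation.

# Hypothetical sketch only -- the real llm/utils/fused_layers.py is not shown
# in this diff. The signature mirrors the positional call made by the scripts:
# mock_layers(do_enable_linear_fused_grad_add, do_enable_mp_async_allreduce,
#             do_enable_sp_async_reduce_scatter).
_MOCK_FLAGS = {}


def mock_layers(enable_linear_fused_grad_add=False, mp_async_allreduce=False, sp_async_reduce_scatter=False):
    # The callers never request both async collectives at once: the MP path
    # requires sequence_parallel to be off, the SP path requires it to be on.
    if mp_async_allreduce and sp_async_reduce_scatter:
        raise ValueError("mp_async_allreduce and sp_async_reduce_scatter are mutually exclusive")
    _MOCK_FLAGS.update(
        enable_linear_fused_grad_add=enable_linear_fused_grad_add,
        mp_async_allreduce=mp_async_allreduce,
        sp_async_reduce_scatter=sp_async_reduce_scatter,
    )
    # The real implementation would patch the fused linear layers here so that
    # gradient-add fusion and the chosen async collective apply together; that
    # code lives in the fused_layers.py change not shown in this excerpt.


# Example matching the new call sites:
mock_layers(True, False, True)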
