
Commit bfba64b

[Auto Parallel] add llama with auto pp (#10751)
* llama_with_auto_pp
* support change n_microbatches in json
* fix init model
* keep layers which are not in curr rank
* formal llama_with_auto_pp
* formal llama_with_auto_pp
* style
* formal
* fix
* fix
* fix
* fix
* add condition for hybrid pp
* fix
* add test case
* add test case
* add test case
* add llama type pp_schedule
* fix
1 parent 1938c9e commit bfba64b

File tree: 4 files changed, +104 −6 lines

paddlenlp/trainer/auto_trainer.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,6 @@
 
 from paddlenlp.trainer import Trainer
 
-from ..transformers import get_pp_schedule
 from ..transformers.model_utils import clean_model_class_name, unwrap_model
 from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
 from ..utils.env import (
@@ -49,6 +48,7 @@
     _exec_mode_guard,
     check_auto_parallel_pipeline_support,
     get_last_checkpoint,
+    get_pp_schedule,
     has_length,
     speed_metrics,
 )
@@ -104,6 +104,7 @@ def loss_func(loss, outputs):
         if self.args.pipeline_parallel_degree > 1 and check_auto_parallel_pipeline_support(self.model_type):
             self.pp_schedule = get_pp_schedule(
                 model,
+                self.model_type,
                 self.args.n_microbatches,
                 self.criterion,
                 self.args.pipeline_schedule_mode,

paddlenlp/trainer/trainer_utils.py

Lines changed: 7 additions & 0 deletions
@@ -43,6 +43,7 @@
 from paddlenlp.ops import Topology
 
 from ..trainer.argparser import strtobool
+from ..transformers import get_llama_pp_schedule
 from ..transformers.tokenizer_utils_base import BatchEncoding
 from ..utils.env import PREFIX_CHECKPOINT_DIR, _re_checkpoint  # noqa for compatibility
 from ..utils.fault_tolerance import PDC_DOWNLOAD_ERROR
@@ -1258,3 +1259,9 @@ def download_recovery_ckpt_from_pdc(recovery_checkpoint_path, timeout):
 def check_auto_parallel_pipeline_support(model_type=None):
     support_types = ["llama_pp"]
     return model_type in support_types
+
+
+def get_pp_schedule(model, model_type, n_microbatches, loss_fn, mode, pp_degree, group):
+    assert check_auto_parallel_pipeline_support(model_type)
+    if model_type == "llama_pp":
+        return get_llama_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group)
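The new get_pp_schedule helper is a thin dispatcher keyed on model_type. Below is a minimal, self-contained sketch of the pattern; the get_llama_pp_schedule body is stubbed here for illustration only (in PaddleNLP it is imported from paddlenlp.transformers, as the added import shows).

# Self-contained sketch of the model-type dispatch added in trainer_utils.py.
SUPPORT_TYPES = ["llama_pp"]  # mirrors support_types in check_auto_parallel_pipeline_support


def check_auto_parallel_pipeline_support(model_type=None):
    return model_type in SUPPORT_TYPES


def get_llama_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group):
    # Placeholder: the real builder splits the model into pipeline stages
    # and returns a schedule object (VPP / 1F1B / FThenB).
    return ("llama_pp schedule", mode, n_microbatches, pp_degree)


def get_pp_schedule(model, model_type, n_microbatches, loss_fn, mode, pp_degree, group):
    # The trainer now passes model_type, so further model families can be
    # supported by extending SUPPORT_TYPES and adding another branch here.
    assert check_auto_parallel_pipeline_support(model_type)
    if model_type == "llama_pp":
        return get_llama_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group)


print(get_pp_schedule(None, "llama_pp", 4, None, "FThenB", 2, None))
# -> ('llama_pp schedule', 'FThenB', 4, 2)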

paddlenlp/transformers/llama/modeling_auto_pp.py

Lines changed: 20 additions & 5 deletions
@@ -59,7 +59,7 @@ def swiglu(x, y=None):
 flash_attention = None
 
 __all__ = [
-    "get_pp_schedule",
+    "get_llama_pp_schedule",
     "LlamaForCausalLM3DAutoPP",
 ]
 
@@ -146,10 +146,12 @@ def return_args(hidden_states, attention_mask=None, position_ids=None, alibi=Non
 
 
 class LlamaChunk(nn.Layer):
-    def __init__(self, layers=None, is_first=False):
+    def __init__(self, layers=None, is_first=False, is_last=False):
         super(LlamaChunk, self).__init__()
+        assert not (is_first and is_last)
         self.layers = layers
         self.is_first = is_first
+        self.is_last = is_last
 
     def forward(self, *args, **kwargs):
         if self.is_first:
@@ -161,6 +163,13 @@
             for idx, (decoder_layer) in enumerate(self.layers):
                 outputs = decoder_layer(outputs)
             return outputs
+        elif self.is_last:
+            outputs = args
+            # decoder layers
+            for idx, (decoder_layer) in enumerate(self.layers):
+                outputs = decoder_layer(outputs)
+            if isinstance(outputs, tuple):
+                outputs = outputs[0]
         else:
             outputs = args
             # decoder layers
@@ -182,9 +191,15 @@ def manual_model_split(model, stage_idx, group, mode, pp_degree):
     def _build_stage(model, stage_idx, group):
         new_model = None
         if stage_idx == 0:  # special input handling for the first model chunk
-            new_model = LlamaChunk(layer_lists[:chunk_size], is_first=True)
+            new_model = LlamaChunk(layer_lists[:chunk_size], is_first=True, is_last=False)
+        elif stage_idx == chunk_num - 1:  # special output handling for the last model chunk
+            new_model = LlamaChunk(
+                layer_lists[stage_idx * chunk_size : (stage_idx + 1) * chunk_size], is_first=False, is_last=True
+            )
         else:
-            new_model = LlamaChunk(layer_lists[stage_idx * chunk_size : (stage_idx + 1) * chunk_size], is_first=False)
+            new_model = LlamaChunk(
+                layer_lists[stage_idx * chunk_size : (stage_idx + 1) * chunk_size], is_first=False, is_last=False
+            )
         stage = PipelineStage(new_model, stage_idx, chunk_num, group=group)
         return stage
 
@@ -195,7 +210,7 @@ def _build_stage(model, stage_idx, group):
     return stages
 
 
-def get_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group):
+def get_llama_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group):
     assert mode in ["VPP", "1F1B", "FThenB"]
     stages = manual_model_split(model, group.rank, group, mode, pp_degree)
     if mode == "VPP":

scripts/distribute/ci_case_auto.sh

Lines changed: 75 additions & 0 deletions
@@ -102,6 +102,7 @@ function llama_case_list_auto() {
     # llama_dygraph_auto_bs8_fp32_DP2-MP2
     llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
+    llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp
     # llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2-VPP3_split_bw
     llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2
@@ -695,6 +696,80 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run end ==========="
 }
+
+function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2_hybrid_pp() {
+    echo IS_A100 is $IS_A100
+    if [ $IS_A100 -ne 0 ]; then
+        echo "=========== $FUNCNAME run begin ==========="
+        export PYTHONPATH=$root_path/:$PYTHONPATH
+        export FLAGS_call_stack_level=3
+        export NVIDIA_TF32_OVERRIDE=0
+
+        task_name="llama_auto_bs8_fp16_dp2mp2pp2_hybrid_pp"
+        case_out_dir="output/$task_name"
+        case_log_dir="output/$task_name""_log"
+        rm -rf $case_out_dir
+        rm -rf $case_log_dir
+
+        python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
+            --model_type "llama_pp" \
+            --model_name_or_path "facebook/llama-7b" \
+            --tokenizer_name_or_path "facebook/llama-7b" \
+            --input_dir "./data" \
+            --output_dir $case_out_dir \
+            --split 949,50,1 \
+            --max_seq_length 2048 \
+            --hidden_size 1024 \
+            --intermediate_size 3072 \
+            --num_hidden_layers 8 \
+            --num_attention_heads 32 \
+            --per_device_train_batch_size 4 \
+            --per_device_eval_batch_size 4 \
+            --n_microbatch 4 \
+            --gradient_accumulation_steps 1 \
+            --use_flash_attention 1 \
+            --use_fused_rms_norm 0 \
+            --fp16 1 \
+            --fp16_opt_level "O2" \
+            --amp_master_grad 1 \
+            --scale_loss 1024 \
+            --pipeline_parallel_degree 2 \
+            --pipeline_schedule_mode "FThenB" \
+            --tensor_parallel_degree 2 \
+            --sharding_parallel_degree 1 \
+            --learning_rate 0.0001 \
+            --min_learning_rate 0.00001 \
+            --max_steps 10 \
+            --save_steps 5000 \
+            --weight_decay 0.01 \
+            --warmup_ratio 0.01 \
+            --logging_steps 1 \
+            --dataloader_num_workers 1 \
+            --sharding "" \
+            --eval_steps 1000000 \
+            --disable_tqdm true \
+            --continue_training 0 \
+            --recompute 0 \
+            --do_train \
+            --do_eval \
+            --device "gpu" \
+            --data_impl "mmap" \
+            --enable_auto_parallel 1 \
+            --to_static 0 \
+            --max_grad_norm 0.0 \
+            >>${log_path}/$FUNCNAME 2>&1
+        loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+        ips=-1
+        mem=-1
+        echo "result: loss=$loss ips=$ips mem=$mem"
+        loss_base=9.57178879
+        ips_base=-1
+        mem_base=-1
+        check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+        echo "=========== $FUNCNAME run end ==========="
+    fi
+}
+
 function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
     # Only A100 support this case.
     echo IS_A100 is $IS_A100
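The new CI case extracts the step-10 loss from workerlog.0 with the grep/awk pipeline above and compares it against loss_base via check_result. For reference, a hedged Python equivalent of that log-parsing step; the log line format (a line containing "global_step: 10" with a "loss: <value>," field) is assumed from what the awk pipeline splits on.

# Sketch: extract the step-10 loss the same way the grep/awk pipeline does.
import re


def extract_loss(log_path, step=10):
    step_marker = "global_step: %d" % step
    with open(log_path) as f:
        for line in f:
            if step_marker in line:
                match = re.search(r"loss: ([0-9eE+.\-]+)", line)
                if match:
                    return float(match.group(1))
    return None


# Example (hypothetical path, built from the case_log_dir used above):
# loss = extract_loss("output/llama_auto_bs8_fp16_dp2mp2pp2_hybrid_pp_log/workerlog.0")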
