
Commit 1938c9e

Run llama dynamic pp perf (#10753)
* llama_with_auto_pp
* support change n_microbatches in json
* fix init model
* keep layers which are not in curr rank
* formal llama_with_auto_pp
* formal llama_with_auto_pp
* style
* formal
* fix
* fix
* fix
* fix
* add condition for hybrid pp
* fix
* add json
* remove old
* update json

---------

Co-authored-by: Waynezee <wangxiangzhe@baidu.com>
1 parent e7420b1 commit 1938c9e

7 files changed: +736 -1 lines changed

llm/auto_parallel/llama/run_pretrain_auto.py

Lines changed: 7 additions & 0 deletions
@@ -41,6 +41,7 @@
     LinearAnnealingWithWarmupDecay,
     LlamaConfig,
     LlamaForCausalLM3DAuto,
+    LlamaForCausalLM3DAutoPP,
     LlamaForCausalLMNet,
     LlamaPretrainingCriterion3DAuto,
     LlamaPretrainingCriterionNet,
@@ -49,6 +50,7 @@
 
 MODEL_CLASSES = {
     "llama": (LlamaConfig, LlamaForCausalLM3DAuto, LlamaPretrainingCriterion3DAuto),
+    "llama_pp": (LlamaConfig, LlamaForCausalLM3DAutoPP, LlamaPretrainingCriterion3DAuto),
     "llama_network": (LlamaConfig, LlamaForCausalLMNet, LlamaPretrainingCriterionNet),
 }
 
@@ -94,6 +96,10 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
+    n_microbatches: int = field(
+        default=1,
+        metadata={"help": "Control the num of microbatches in one pp step."},
+    )
 
     def __post_init__(self):
         super().__post_init__()
@@ -637,6 +643,7 @@ def fn(layer):
     )
     trainer = PretrainingTrainer(
         model=model,
+        model_type=model_args.model_type,
         criterion=criterion,
         args=training_args,
         data_collator=data_collator,
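Note: n_microbatches controls how many microbatches one pipeline-parallel step is split into. A minimal, framework-free sketch of that idea (not PaddleNLP code; the helper below is hypothetical):

def split_into_microbatches(batch, n_microbatches):
    # Split one global batch into n_microbatches equal chunks; a pipeline
    # schedule pushes these chunks through the stages one after another so
    # that different stages can work on different microbatches concurrently.
    size = len(batch) // n_microbatches
    return [batch[i * size:(i + 1) * size] for i in range(n_microbatches)]

print(split_into_microbatches(list(range(8)), n_microbatches=4))
# [[0, 1], [2, 3], [4, 5], [6, 7]]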

paddlenlp/trainer/auto_trainer.py

Lines changed: 61 additions & 1 deletion
@@ -29,6 +29,7 @@
 
 from paddlenlp.trainer import Trainer
 
+from ..transformers import get_pp_schedule
 from ..transformers.model_utils import clean_model_class_name, unwrap_model
 from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
 from ..utils.env import (
@@ -46,6 +47,7 @@
     ShardingOption,
     TrainOutput,
     _exec_mode_guard,
+    check_auto_parallel_pipeline_support,
     get_last_checkpoint,
     has_length,
     speed_metrics,
@@ -77,6 +79,7 @@ def loss_func(loss, outputs):
            kwargs.update({"criterion": loss_func})
        self.auto_dist_config = kwargs.pop("auto_dist_config", None)
        model = kwargs.get("model", None)
+        self.model_type = kwargs.pop("model_type", None)
        assert model is not None
        if kwargs.get("args", None) is not None and kwargs["args"].use_intermediate_api:
            if not parallelize.has_parallelized_model:
@@ -93,12 +96,20 @@ def loss_func(loss, outputs):
                    if not param._is_initialized() and param._init_func is not None:
                        param.initialize()
                kwargs["model"] = model
-
        super().__init__(*args, **kwargs)
        assert self.args.enable_auto_parallel
 
        self.global_mesh = fleet.auto.get_mesh()
        self.comm_group_in_pp = fleet.get_hybrid_communicate_group().get_pipe_parallel_group()
+        if self.args.pipeline_parallel_degree > 1 and check_auto_parallel_pipeline_support(self.model_type):
+            self.pp_schedule = get_pp_schedule(
+                model,
+                self.args.n_microbatches,
+                self.criterion,
+                self.args.pipeline_schedule_mode,
+                self.args.pipeline_parallel_degree,
+                self.comm_group_in_pp,
+            )
        self._in_pir_mode = paddle.base.framework.get_flags("FLAGS_enable_pir_api")["FLAGS_enable_pir_api"]
 
    @classmethod
@@ -703,7 +714,56 @@ def to_list(value):
 
        return (loss, outputs) if return_outputs else loss
 
+    def compute_pipeline_loss(self, model, inputs, return_outputs=False):
+        """
+        How the loss is computed by Trainer. By default, all models return the loss in the first element.
+        Subclass and override for custom behavior.
+        """
+        if self.criterion is not None:
+            if "labels" in inputs:
+                labels = inputs.pop("labels")
+            elif "start_positions" in inputs and "end_positions" in inputs:
+                labels = (inputs.pop("start_positions"), inputs.pop("end_positions"))
+            elif self.args.label_names is not None:
+                labels = []
+                for label in self.label_names:
+                    labels.append(inputs.pop(label))
+                labels = tuple(labels)
+            elif "generator_labels" in inputs:
+                labels = inputs["generator_labels"]
+        else:
+            labels = None
+
+        pp_rank = self.comm_group_in_pp.rank
+        losses = []
+        if pp_rank == 0:  # first pp stage: feed the input data into the pipeline
+            self.pp_schedule.step(**inputs)
+        elif pp_rank == self.args.pipeline_parallel_degree - 1:  # last pp stage: pass in the labels and collect the loss
+            self.pp_schedule.step(target=labels, losses=losses)
+        else:
+            self.pp_schedule.step()
+
+        final_loss = None
+        if len(losses) != 0:
+            final_loss = paddle.stack(losses).mean()
+
+        return final_loss
+
+    def dynamic_auto_parallel_pipeline_training(
+        self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]
+    ) -> paddle.Tensor:
+        assert self.args.pipeline_parallel_degree > 1, "pipeline_parallel_degree must be greater than 1."
+        assert check_auto_parallel_pipeline_support(
+            self.model_type
+        ), "dynamic auto_parallel pipeline only supports special models"
+        with self.autocast_smart_context_manager():
+            loss = self.compute_pipeline_loss(model, inputs)
+
+        return loss
+
    def dynamic_training(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor:
+        if self.args.pipeline_parallel_degree > 1 and check_auto_parallel_pipeline_support(self.model_type):
+            return self.dynamic_auto_parallel_pipeline_training(model, inputs)
        with self.autocast_smart_context_manager():
            loss = self.compute_loss(model, inputs)

paddlenlp/trainer/trainer_utils.py

Lines changed: 5 additions & 0 deletions
@@ -1253,3 +1253,8 @@ def download_recovery_ckpt_from_pdc(recovery_checkpoint_path, timeout):
        raise RuntimeError(
            f"{PDC_DOWNLOAD_ERROR}; Error occurred when trying to download checkpoint from PDC, recovery_checkpoint_path: {recovery_checkpoint_path}, timeout: {timeout}; error details: {PDCErrorMessageMap[result]}"
        )
+
+
+def check_auto_parallel_pipeline_support(model_type=None):
+    support_types = ["llama_pp"]
+    return model_type in support_types
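Note: the helper is a plain membership test against the model types that support the dynamic pipeline path. A small illustration of the expected behaviour (assuming the import path matches where this commit defines the function):

from paddlenlp.trainer.trainer_utils import check_auto_parallel_pipeline_support

print(check_auto_parallel_pipeline_support("llama_pp"))  # True  -> dynamic auto-parallel pp path
print(check_auto_parallel_pipeline_support("llama"))     # False -> regular training path
print(check_auto_parallel_pipeline_support())            # False (model_type defaults to None)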

paddlenlp/transformers/llama/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 from .configuration import *
 from .modeling import *
 from .modeling_auto import *
+from .modeling_auto_pp import *
 from .modeling_network import *
 from .modeling_pp import *
 from .tokenizer import *
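Note: this wildcard re-export is what lets the other files in this commit pull the new symbols from the transformers package root; assuming modeling_auto_pp defines them (that file is part of this commit but not shown in this section), the imports used elsewhere resolve as:

# Illustrative: how run_pretrain_auto.py and auto_trainer.py reach the new symbols.
from paddlenlp.transformers import LlamaForCausalLM3DAutoPP  # used in run_pretrain_auto.py
# auto_trainer.py imports the schedule builder the same way:
# from ..transformers import get_pp_schedule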
