
Commit 06378bb

Authored by waliwali777, with Xing-lil and Starrysea996

Add gpt dynamic auto benchmark (#10759)

* update
* Update pretrain-gpt3_13b_pp.json
* Update modeling_auto_pp.py
* Update modeling_auto_pp.py
* Update modeling_auto_pp.py
* Update run_pretrain_auto.py
* Update run_pretrain_auto.py
* Update run_pretrain_auto.py
* Update modeling_auto_pp.py
* Update pretrain-gpt3_13b_pp.json
* Update trainer_utils.py
* init benchmark
* update json
* update json
* remove sd flag
* format
* fix rebase
* open enable_linear_fused_grad_add
* pre-commit

Co-authored-by: Xing-lil <zhenxingli@hust.edu.cn>
Co-authored-by: ZhenxingLi <lizhenxing02@baidu.com>
Co-authored-by: Starrysea996 <2462405885@qq.com>

1 parent ecf7a0c, commit 06378bb

File tree

7 files changed: +471 additions, -5 deletions

llm/auto_parallel/gpt-3/run_pretrain_auto.py

Lines changed: 7 additions & 0 deletions

@@ -38,6 +38,7 @@
     CosineAnnealingWithWarmupDecay,
     GPTConfig,
     GPTForCausalLMAuto,
+    GPTForCausalLMAutoPP,
     GPTForCausalLMNet,
     GPTPretrainingCriterionAuto,
     GPTPretrainingCriterionNet,
@@ -48,6 +49,7 @@

 MODEL_CLASSES = {
     "gpt": (GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto),
+    "gpt_pp": (GPTConfig, GPTForCausalLMAutoPP, GPTPretrainingCriterionAuto),
     "gpt_network": (GPTConfig, GPTForCausalLMNet, GPTPretrainingCriterionNet),
 }

@@ -99,6 +101,10 @@ class PreTrainingArguments(AutoTrainingArguments):
         default=False,
         metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
     )
+    n_microbatches: int = field(
+        default=1,
+        metadata={"help": "Control the num of microbatches in one pp step."},
+    )
     pre_alloc_memory: float = field(
         default=0.0,
         metadata={
@@ -601,6 +607,7 @@ def fn(layer):

     trainer = PretrainingTrainer(
         model=model,
+        model_type=model_args.model_type,
         criterion=criterion,
         args=training_args,
         data_collator=data_collator,
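
The new "gpt_pp" entry in MODEL_CLASSES and the n_microbatches training argument are what a pipeline-parallel benchmark run would select. The sketch below is illustrative only and not code from this commit: it shows how such a model-type lookup resolves to the pipeline-parallel GPT classes, with the class names taken from the diff but represented as strings so the snippet is self-contained.

# Illustrative sketch, not code from this commit: resolving a --model_type
# value the way run_pretrain_auto.py resolves it via MODEL_CLASSES.
MODEL_CLASSES = {
    "gpt": ("GPTConfig", "GPTForCausalLMAuto", "GPTPretrainingCriterionAuto"),
    "gpt_pp": ("GPTConfig", "GPTForCausalLMAutoPP", "GPTPretrainingCriterionAuto"),
    "gpt_network": ("GPTConfig", "GPTForCausalLMNet", "GPTPretrainingCriterionNet"),
}


def resolve_model_classes(model_type):
    # Pick the (config, model, criterion) triple for the requested model type.
    if model_type not in MODEL_CLASSES:
        raise ValueError(f"Unknown model_type {model_type!r}; expected one of {sorted(MODEL_CLASSES)}")
    return MODEL_CLASSES[model_type]


# Selecting the new pipeline-parallel GPT variant added by this commit:
config_cls, model_cls, criterion_cls = resolve_model_classes("gpt_pp")
print(model_cls)  # GPTForCausalLMAutoPP

Under this reading, a benchmark config would pair model_type "gpt_pp" with the new n_microbatches field (default 1), which is then forwarded to the pipeline schedule; the exact keys of pretrain-gpt3_13b_pp.json are not shown in this diff, so they are not reproduced here.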

paddlenlp/trainer/trainer_utils.py

Lines changed: 4 additions & 2 deletions

@@ -43,7 +43,7 @@
 from paddlenlp.ops import Topology

 from ..trainer.argparser import strtobool
-from ..transformers import get_llama_pp_schedule
+from ..transformers import get_gpt_pp_schedule, get_llama_pp_schedule
 from ..transformers.tokenizer_utils_base import BatchEncoding
 from ..utils.env import PREFIX_CHECKPOINT_DIR, _re_checkpoint  # noqa for compatibility
 from ..utils.fault_tolerance import PDC_DOWNLOAD_ERROR
@@ -1257,11 +1257,13 @@ def download_recovery_ckpt_from_pdc(recovery_checkpoint_path, timeout):


 def check_auto_parallel_pipeline_support(model_type=None):
-    support_types = ["llama_pp"]
+    support_types = ["llama_pp", "gpt_pp"]
     return model_type in support_types


 def get_pp_schedule(model, model_type, n_microbatches, loss_fn, mode, pp_degree, group):
     assert check_auto_parallel_pipeline_support(model_type)
     if model_type == "llama_pp":
         return get_llama_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group)
+    elif model_type == "gpt_pp":
+        return get_gpt_pp_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group)
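
get_pp_schedule now dispatches on model_type through an if/elif chain, and check_auto_parallel_pipeline_support keeps a parallel list of the same strings. As a design note, the same behaviour can be expressed with a table-driven dispatch so the support check and the dispatch cannot drift apart; the sketch below is an illustration of that alternative, with stand-in builders rather than the real PaddleNLP schedule functions.

# Illustrative alternative to the if/elif dispatch above; the builders here
# are stand-ins, not the real get_llama_pp_schedule / get_gpt_pp_schedule.
def _build_llama_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group):
    return ("llama_pp", n_microbatches)  # placeholder result


def _build_gpt_schedule(model, n_microbatches, loss_fn, mode, pp_degree, group):
    return ("gpt_pp", n_microbatches)  # placeholder result


# Single registry: supported types and their schedule builders stay in sync.
PP_SCHEDULE_BUILDERS = {
    "llama_pp": _build_llama_schedule,
    "gpt_pp": _build_gpt_schedule,
}


def check_auto_parallel_pipeline_support(model_type=None):
    return model_type in PP_SCHEDULE_BUILDERS


def get_pp_schedule(model, model_type, n_microbatches, loss_fn, mode, pp_degree, group):
    assert check_auto_parallel_pipeline_support(model_type), f"unsupported model_type: {model_type}"
    return PP_SCHEDULE_BUILDERS[model_type](model, n_microbatches, loss_fn, mode, pp_degree, group)

With a registry like this, adding a third pipeline-parallel model type would be a one-line dictionary entry instead of a new elif branch plus a list edit.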

paddlenlp/transformers/gpt/__init__.py

Lines changed: 6 additions & 0 deletions

@@ -15,6 +15,12 @@
 from .configuration import *
 from .modeling import *
 from .modeling_auto import *
+
+try:
+    from .modeling_auto_pp import *
+except (ImportError, ModuleNotFoundError):
+    # Temporarily adapt to the release version of Paddle, which can be removed later.
+    pass
 from .modeling_network import *
 from .modeling_pp import *
 from .tokenizer import *
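
The try/except around the modeling_auto_pp import lets the gpt package keep importing on Paddle releases that do not yet provide what that module needs. The generic pattern, shown here with a hypothetical module name rather than PaddleNLP code, looks like this:

# Generic guarded-import pattern, illustrated with a hypothetical module name
# ("optional_pp_module" does not exist, so the import is expected to fail here).
try:
    from optional_pp_module import *  # may require a newer framework build
    HAS_OPTIONAL_PP = True
except (ImportError, ModuleNotFoundError):
    # Fall back gracefully instead of breaking every package import on older
    # installs; callers can check the flag before relying on the feature.
    HAS_OPTIONAL_PP = False

Note that ModuleNotFoundError is a subclass of ImportError, so the second entry in the except tuple is redundant but harmless.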
