Skip to content

Commit e042ac1

Browse files
authored
[CI] update workflow (#10846)
* update workflow for CI * fix
1 parent f4846c1 commit e042ac1

File tree

14 files changed

+967
-226
lines changed

14 files changed

+967
-226
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# Composite action that re-runs GitHub Actions jobs for a pull request.
# Every input is exported to rerun.sh as an environment variable of the
# same name; the script performs the GitHub API calls.
name: 'Rerun Workflow'
description: 'Re-run GitHub Actions workflow for a given Pull Request'
inputs:
  GITHUB_TOKEN:
    description: 'GitHub token with repo scope'
    required: true
  OWNER:
    description: 'Repository owner'
    required: true
  REPO:
    description: 'Repository name'
    required: true
  PR_ID:
    description: 'Pull Request ID'
    required: true
  JOB_NAME:
    description: 'Job name to rerun'
    required: true

runs:
  using: 'composite'
  steps:
    # Locate rerun.sh through the action's own directory instead of a path
    # relative to the caller's working directory, so the step does not depend
    # on where (or whether) the repository was checked out.
    # NOTE(review): assumes rerun.sh lives next to this metadata file
    # (.github/actions/rerun-workflow/) - confirm.
    - run: bash "$GITHUB_ACTION_PATH/rerun.sh"
      shell: bash
      env:
        GITHUB_TOKEN: ${{ inputs.GITHUB_TOKEN }}
        OWNER: ${{ inputs.OWNER }}
        REPO: ${{ inputs.REPO }}
        PR_ID: ${{ inputs.PR_ID }}
        JOB_NAME: ${{ inputs.JOB_NAME }}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
# Copyright (c) 2025 PaddleNLP Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Re-runs GitHub Actions jobs for the head commit of a pull request.
#
# Environment variables read by this script:
#   GITHUB_TOKEN  token sent in the Authorization header of every API call
#   OWNER, REPO   repository the pull request belongs to
#   PR_ID         pull request number
#   JOB_NAME      exact job name to re-run, or "all-failed" to request a
#                 re-run of the failed jobs of every workflow run found
#
# Exit status: 1 when the head commit cannot be resolved or no workflow run
# exists for it. A rejected re-run request is logged but is not fatal.

# pipefail: a failing curl in a "curl | jq" pipeline must not be masked by jq.
set -eo pipefail

API_ROOT="https://api.github.com/repos/$OWNER/$REPO"

COMMIT_SHA=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
    "$API_ROOT/pulls/$PR_ID" | jq -r '.head.sha')

echo "Commit SHA: $COMMIT_SHA"

# jq -r prints the literal string "null" when .head.sha is absent (for example
# an error payload from the API); stop here instead of querying runs for "null".
if [ -z "$COMMIT_SHA" ] || [ "$COMMIT_SHA" = "null" ]; then
    echo "Failed to resolve the head commit of PR $PR_ID in $OWNER/$REPO."
    exit 1
fi

response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
    "$API_ROOT/actions/runs?head_sha=$COMMIT_SHA&per_page=100")

echo "Response: $response"

run_ids=$(echo "$response" | jq -r '.workflow_runs[].id')

if [ -n "$run_ids" ]; then
    echo "Found run_ids for commit $COMMIT_SHA: $run_ids"

    for run_id in $run_ids; do
        if [ "$JOB_NAME" = "all-failed" ]; then
            echo "Rerunning all failed jobs for run_id: $run_id"

            # Only the HTTP status is kept; the response body is discarded.
            rerun_response=$(curl -X POST -s -w "%{http_code}" -o /dev/null \
                -H "Accept: application/vnd.github.v3+json" \
                -H "Authorization: Bearer $GITHUB_TOKEN" \
                "$API_ROOT/actions/runs/$run_id/rerun-failed-jobs")
            if [ "$rerun_response" -eq 201 ]; then
                echo "Successfully requested rerun for all blocked jobs in run_id: $run_id"
            else
                echo "Failed to request rerun for run_id: $run_id with status code $rerun_response"
            fi

        else
            # per_page=100: the API returns 30 jobs per page by default, so a
            # run with more jobs could hide the requested one.
            jobs_response=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
                "$API_ROOT/actions/runs/$run_id/jobs?per_page=100")

            echo "Jobs Response for run_id $run_id: $jobs_response"

            # if [[ "$JOB_NAME" == *"bypass"* ]]; then
            block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
                '.jobs[] | select(.name == $job_name) | .id')
            # else
            #     block_jobs=$(echo "$jobs_response" | jq -r --arg job_name "$JOB_NAME" \
            #         '.jobs[] | select(.name == $job_name and .conclusion != "success") | .id')
            # fi

            if [ -n "$block_jobs" ]; then
                echo "Found block jobs for run_id $run_id: $block_jobs"

                for job_id in $block_jobs; do
                    echo "Rerunning job_id: $job_id"
                    # Capture the status so a rejected request is visible in
                    # the log, mirroring the all-failed branch above.
                    job_rerun_status=$(curl -X POST -s -w "%{http_code}" -o /dev/null \
                        -H "Accept: application/vnd.github.v3+json" \
                        -H "Authorization: token $GITHUB_TOKEN" \
                        "$API_ROOT/actions/jobs/$job_id/rerun")
                    if [ "$job_rerun_status" -eq 201 ]; then
                        echo "Successfully requested rerun for job_id: $job_id"
                    else
                        echo "Failed to request rerun for job_id: $job_id with status code $job_rerun_status"
                    fi
                done
            else
                echo "No block jobs found for run_id $run_id with name $JOB_NAME."
            fi
        fi
    done
else
    echo "No matching workflow runs found for commit $COMMIT_SHA."
    exit 1
fi

.github/workflows/distribute.yml

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
# Distribute CI: starts a GPU container on the "Auto-Parallel" runner group,
# fetches the PaddleNLP sources (plus the pull request head when PR_ID is
# set), runs scripts/distribute/run_ci.sh and uploads the case logs.
name: Distribute CI (V100)

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    - cron: "2 0 * * *"
  workflow_call:
    inputs:
      run_downstream:
        required: true
        type: string
      image_name:
        required: true
        type: string

concurrency:
  # The pull request number is empty on non-PR events (e.g. schedule); fall
  # back to the run id so those runs do not all share the group
  # "<workflow>-" and cancel one another.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-V100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  # Steps below skip their work when this equals "false".
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  distribute-v100-ci:
    name: distribute-v100-ci
    runs-on:
      group: Auto-Parallel
    steps:
      # Export IMAGE_NAME for the following steps: the workflow_call input
      # when it is non-empty, otherwise the default Paddle dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi

      # Start a detached container named after TASK plus a timestamp; the
      # name is exported as container_name for the remaining steps.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            # IMAGE_NAME is read from the environment (exported by the
            # previous step) rather than interpolated into the script text.
            nvidia-docker run -d -t --name "${container_name}" --net=host -v /dev/shm:/dev/shm --shm-size=32G \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e python_version \
              -w /workspace --runtime=nvidia "${IMAGE_NAME}"
          fi

      # Wipe /workspace, unpack the PaddleNLP source tarball and, when PR_ID
      # is set, check out the pull request head and merge BRANCH into it.
      # NOTE(review): "git fetch upstream" is followed by "git merge
      # BRANCH", which merges the local branch rather than the fetched
      # upstream ref - confirm this is intended.
      - name: Download Code
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping.."
          else
            docker exec -t "$container_name" /bin/bash -c '
            rm -rf * .[^.]*
            echo "Downloading PaddleNLP.tar"
            wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
            echo "Extracting PaddleNLP.tar"
            tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
            source $work_dir/../../../proxy
            cd PaddleNLP
            git config --global user.name "PaddleCI"
            git config --global user.email "paddle_ci@example.com"
            git pull
            git submodule update --init --recursive --force
            if [ -n "${PR_ID}" ]; then
              git fetch origin pull/${PR_ID}/head
              git checkout -b PR_${PR_ID} FETCH_HEAD
              git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
              git fetch upstream ${BRANCH}
              git merge ${BRANCH} --no-edit
              git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
            else
              echo "Not in a pull_request event. Skipping PR-specific operations."
            fi
            git log --pretty=oneline -10
            '
          fi

      # Run the distribute test suite inside the container, capped at 80
      # minutes by "timeout".
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t "$container_name" /bin/bash -c '
            ldconfig
            ln -sf $(which python${python_version}) /usr/bin/python
            pip config set global.cache-dir "/home/.cache/pip"
            source $work_dir/../../../proxy
            set -e
            cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
            timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi

      # Upload every file in /workspace/case_logs, plus a tarball of the
      # directory, with BosClient.py; the client is downloaded first when
      # it is missing.
      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t "$container_name" /bin/bash -c '
            if [ ! -f "${{ env.bos_file }}" ]; then
              wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
              mkdir -p ${{ env.home_path }}/bos
              tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
            fi

            if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
              bos_prefix="${PR_ID}/${COMMIT_ID}"
            elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
              bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
            else
              bos_prefix="schedule/$(date +%Y%m%d)"
            fi

            cd /workspace/case_logs
            for FILE in /workspace/case_logs/*; do
              file=$(basename "$FILE")
              python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/distribute/${bos_prefix}/logs
              echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute/${bos_prefix}/logs/$file"
            done
            tar -czf products.tar.gz ./
            python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleNLP/distribute/${bos_prefix}/logs
            echo "products: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute/${bos_prefix}/logs/products.tar.gz"
            '
          fi

      # Always remove the container; errors are ignored because it may never
      # have been created.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f "$container_name" 2>/dev/null || true

0 commit comments

Comments
 (0)