Skip to content

Commit 642e193

Browse files
[GitHub Runner] Fix flax runner (huggingface#13357)
* correct

* also comment out multi-gpu test push
1 parent c76de10 commit 642e193

File tree

2 files changed

+93
-96
lines changed

2 files changed

+93
-96
lines changed

.github/workflows/self-push.yml

Lines changed: 54 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,9 @@ jobs:
106106
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
107107
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
108108
109-
# - name: Fetch the tests to run
110-
# run: |
111-
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
109+
- name: Fetch the tests to run
110+
run: |
111+
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
112112
113113
- name: Report fetched tests
114114
uses: actions/upload-artifact@v2
@@ -118,10 +118,9 @@ jobs:
118118

119119
- name: Run all non-slow tests on GPU
120120
run: |
121-
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu
122-
# if [ -f test_list.txt ]; then
123-
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
124-
# fi
121+
if [ -f test_list.txt ]; then
122+
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
123+
fi
125124
126125
- name: Failure short reports
127126
if: ${{ failure() }}
@@ -251,61 +250,60 @@ jobs:
251250
name: run_all_tests_torch_multi_gpu_test_reports
252251
path: reports
253252

254-
run_tests_flax_multi_gpu:
255-
runs-on: [self-hosted, docker-gpu, multi-gpu]
256-
container:
257-
image: tensorflow/tensorflow:2.4.1-gpu
258-
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
259-
steps:
260-
- name: Install dependencies
261-
run: |
262-
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
263-
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
264-
pip install --upgrade pip
265-
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
266-
267-
- name: Launcher docker
268-
uses: actions/checkout@v2
269-
with:
270-
fetch-depth: 2
271-
272-
- name: NVIDIA-SMI
273-
continue-on-error: true
274-
run: |
275-
nvidia-smi
276-
277-
- name: Are GPUs recognized by our DL frameworks
278-
run: |
279-
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
280-
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
281-
253+
# run_tests_flax_multi_gpu:
254+
# runs-on: [self-hosted, docker-gpu, multi-gpu]
255+
# container:
256+
# image: tensorflow/tensorflow:2.4.1-gpu
257+
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
258+
# steps:
259+
# - name: Install dependencies
260+
# run: |
261+
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
262+
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
263+
# pip install --upgrade pip
264+
# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
265+
#
266+
# - name: Launcher docker
267+
# uses: actions/checkout@v2
268+
# with:
269+
# fetch-depth: 2
270+
#
271+
# - name: NVIDIA-SMI
272+
# continue-on-error: true
273+
# run: |
274+
# nvidia-smi
275+
#
276+
# - name: Are GPUs recognized by our DL frameworks
277+
# run: |
278+
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
279+
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
280+
#
282281
# - name: Fetch the tests to run
283282
# run: |
284283
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
285-
286-
- name: Report fetched tests
287-
uses: actions/upload-artifact@v2
288-
with:
289-
name: test_fetched
290-
path: test_preparation.txt
291-
292-
- name: Run all non-slow tests on GPU
293-
run: |
294-
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu
284+
#
285+
# - name: Report fetched tests
286+
# uses: actions/upload-artifact@v2
287+
# with:
288+
# name: test_fetched
289+
# path: test_preparation.txt
290+
#
291+
# - name: Run all non-slow tests on GPU
292+
# run: |
295293
# if [ -f test_list.txt ]; then
296294
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
297295
# fi
298-
299-
- name: Failure short reports
300-
if: ${{ failure() }}
301-
run: cat reports/tests_flax_multi_gpu_failures_short.txt
302-
303-
- name: Test suite reports artifacts
304-
if: ${{ always() }}
305-
uses: actions/upload-artifact@v2
306-
with:
307-
name: run_all_tests_flax_multi_gpu_test_reports
308-
path: reports
296+
#
297+
# - name: Failure short reports
298+
# if: ${{ failure() }}
299+
# run: cat reports/tests_flax_multi_gpu_failures_short.txt
300+
#
301+
# - name: Test suite reports artifacts
302+
# if: ${{ always() }}
303+
# uses: actions/upload-artifact@v2
304+
# with:
305+
# name: run_all_tests_flax_multi_gpu_test_reports
306+
# path: reports
309307

310308
# run_tests_tf_multi_gpu:
311309
# runs-on: [self-hosted, docker-gpu, multi-gpu]

.github/workflows/self-scheduled.yml

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ jobs:
8686
path: reports
8787

8888
run_all_tests_flax_gpu:
89-
runs-on: [self-hosted, docker-gpu, single-gpu]
89+
runs-on: [self-hosted, docker-gpu-test, single-gpu]
9090
container:
9191
image: tensorflow/tensorflow:2.4.1-gpu
9292
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -291,45 +291,44 @@ jobs:
291291
name: run_all_tests_tf_multi_gpu_test_reports
292292
path: reports
293293

294-
run_all_tests_flax_multi_gpu:
295-
runs-on: [self-hosted, docker-gpu, multi-gpu]
296-
container:
297-
image: tensorflow/tensorflow:2.4.1-gpu
298-
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
299-
steps:
300-
- name: Launcher docker
301-
uses: actions/checkout@v2
302-
303-
- name: NVIDIA-SMI
304-
continue-on-error: true
305-
run: |
306-
nvidia-smi
307-
308-
- name: Install dependencies
309-
run: |
310-
pip install --upgrade pip
311-
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
312-
pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
313-
314-
- name: Are GPUs recognized by our DL frameworks
315-
run: |
316-
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
317-
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
318-
319-
- name: Run all tests on GPU
320-
run: |
321-
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
322-
323-
- name: Failure short reports
324-
if: ${{ always() }}
325-
run: cat reports/tests_flax_gpu_failures_short.txt
326-
327-
- name: Test suite reports artifacts
328-
if: ${{ always() }}
329-
uses: actions/upload-artifact@v2
330-
with:
331-
name: run_all_tests_flax_gpu_test_reports
332-
path: reports
294+
# run_all_tests_flax_multi_gpu:
295+
# runs-on: [self-hosted, docker-gpu, multi-gpu]
296+
# container:
297+
# image: tensorflow/tensorflow:2.4.1-gpu
298+
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
299+
# steps:
300+
# - name: Launcher docker
301+
# uses: actions/checkout@v2
302+
#
303+
# - name: NVIDIA-SMI
304+
# run: |
305+
# nvidia-smi
306+
#
307+
# - name: Install dependencies
308+
# run: |
309+
# pip install --upgrade pip
310+
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
311+
# pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
312+
#
313+
# - name: Are GPUs recognized by our DL frameworks
314+
# run: |
315+
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
316+
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
317+
#
318+
# - name: Run all tests on GPU
319+
# run: |
320+
# python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
321+
#
322+
# - name: Failure short reports
323+
# if: ${{ always() }}
324+
# run: cat reports/tests_flax_gpu_failures_short.txt
325+
#
326+
# - name: Test suite reports artifacts
327+
# if: ${{ always() }}
328+
# uses: actions/upload-artifact@v2
329+
# with:
330+
# name: run_all_tests_flax_gpu_test_reports
331+
# path: reports
333332

334333
run_all_tests_torch_cuda_extensions_gpu:
335334
runs-on: [self-hosted, docker-gpu, single-gpu]

0 commit comments

Comments (0)