From 8b9b46efee7f47054929140cef24e572ce7c393a Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:02:12 +0800
Subject: [PATCH 01/36] doc: update link
---
ADVANCED_USAGE.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 428cf28..252abf1 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -71,7 +71,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
- `--samples`: The path to the generated samples file, default to `None`
- `--local_execute`: Whether to execute the samples locally, default to `False`
-- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://github.com/bigcode-project/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
+- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
- `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
- `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
- `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel
@@ -111,7 +111,7 @@ bigcodebench.generate \
```
>
-The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples].jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
+The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
>
```bash
From 991e41c06d54735d8ffd7e0651d1c080769039bb Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:34:57 +0800
Subject: [PATCH 02/36] doc: add warning
---
README.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/README.md b/README.md
index 68b1945..25e61d7 100755
--- a/README.md
+++ b/README.md
@@ -85,6 +85,9 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
## 🚀 Remote Evaluation
We use the greedy decoding as an example to show how to evaluate the generated code samples via remote API.
+> [!Warning]
+>
+> To ease generation, we use batch inference by default. However, batch inference results may vary across *batch sizes* and *versions*, at least for the vLLM backend. If you want more deterministic results for greedy decoding, please set `--bs` to `1`.
> [!Note]
>
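For readers who want the deterministic setup this warning recommends, here is a minimal sketch of invoking the generator with batch size 1. The flags follow the documentation in this series; the model id is an illustrative placeholder.

```python
import subprocess

# Minimal sketch of the deterministic setup the warning recommends:
# greedy decoding with --bs 1. The model id is illustrative only.
subprocess.run(
    [
        "bigcodebench.generate",
        "--model", "bigcode/starcoder2-15b",
        "--split", "complete",
        "--subset", "full",
        "--greedy",
        "--bs", "1",
    ],
    check=True,
)
```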
From 8cb06e4e4e9cf5070a625df246f0fa6a058c21d2 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:37:22 +0800
Subject: [PATCH 03/36] merge cfg
---
setup.cfg | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/setup.cfg b/setup.cfg
index a9b7c74..4897f68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,9 +29,6 @@ install_requires =
wget>=3.2
datasets
gradio-client
-
-[options.extras_require]
-generate =
vllm
numpy
rich
@@ -48,4 +45,4 @@ console_scripts =
bigcodebench.syncheck = bigcodebench.syncheck:main
bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main
bigcodebench.generate = bigcodebench.generate:main
- bigcodebench.inspect = bigcodebench.inspect:main
\ No newline at end of file
+ bigcodebench.inspect = bigcodebench.inspect:main
From 0a5154f1afba214de5a28946f115181ccc3bd9ae Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:38:16 +0800
Subject: [PATCH 04/36] doc: merge installation cfg
---
README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 25e61d7..4a6c70c 100755
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ To get started, please first set up the environment:
```bash
# By default, you will use the remote evaluation API to execute the output samples.
-pip install bigcodebench[generate] --upgrade
+pip install bigcodebench --upgrade
# You are suggested to use `flash-attn` for generating code samples.
pip install packaging ninja
@@ -75,7 +75,7 @@ pip install flash-attn --no-build-isolation
```bash
# Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
+pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
```
From 1cb320fcb3b36945f9fd7ecec7e7f64cf492e5d9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:40:15 +0800
Subject: [PATCH 05/36] Update ADVANCED_USAGE.md
---
ADVANCED_USAGE.md | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 252abf1..ab6e0ae 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -3,25 +3,19 @@
To get started, please first set up the environment:
```bash
-# Install to use bigcodebench.evaluate
-pip install bigcodebench --upgrade
-# If you want to use the evaluate locally, you need to install the requirements
+# If you want to evaluate locally, you need to install the requirements in an isolated environment
pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
-# Install to use bigcodebench.generate
-# You are strongly recommended to install the generate dependencies in a separate environment
-pip install bigcodebench[generate] --upgrade
+# We strongly recommend installing the bigcodebench dependencies in a separate environment
+pip install bigcodebench --upgrade
```
⏬ Install nightly version :: click to expand ::
```bash
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
-
-# Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
```
@@ -34,10 +28,8 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
git clone https://github.com/bigcode-project/bigcodebench.git
cd bigcodebench
export PYTHONPATH=$PYTHONPATH:$(pwd)
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
pip install -e .
-# Install to use bigcodebench.generate
-pip install -e .[generate]
```
From f6de469cb536920ac638345836cbacd11bd503b0 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:54:19 +0000
Subject: [PATCH 06/36] docker: update Gradio.Dockerfile
---
Docker/Gradio.Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Docker/Gradio.Dockerfile b/Docker/Gradio.Dockerfile
index 9de820b..df4018f 100644
--- a/Docker/Gradio.Dockerfile
+++ b/Docker/Gradio.Dockerfile
@@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y git g++ python3-tk zip unzip procps r-b
# upgrade to latest pip
RUN pip install --upgrade pip
-RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth]==4.31.0 gradio_leaderboard==0.0.11 schedule==1.2.2
+RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth] schedule==1.2.2
# Add a new user "bigcodebenchuser"
RUN adduser --disabled-password --gecos "" bigcodebenchuser
From b8c1811623aae0493152f9dff01d65781f5102f7 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:59:59 +0000
Subject: [PATCH 07/36] refactor(eval): update parallel default val
---
bigcodebench/evaluate.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index df8ad85..a082d56 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -119,7 +119,7 @@ def evaluate(
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
pass_k: str = "1,5,10",
save_pass_rate: bool = True,
- parallel: int = None,
+ parallel: int = -1,
min_time_limit: float = 1,
max_as_limit: int = 30*1024,
max_data_limit: int = 30*1024,
@@ -167,7 +167,7 @@ def evaluate(
pass_k = [int(k) for k in pass_k.split(",")]
- if parallel is None:
+ if not parallel:
n_workers = max(1, multiprocessing.cpu_count() // 2)
else:
n_workers = parallel
From 0825835f4b86daf3f25e6bcf1786c71679cdba6e Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 17:00:48 +0000
Subject: [PATCH 08/36] doc: update minimal full script
---
run.sh | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/run.sh b/run.sh
index a84199e..c069e8e 100755
--- a/run.sh
+++ b/run.sh
@@ -9,5 +9,4 @@ bigcodebench.evaluate \
--model $MODEL \
--split $SPLIT \
--subset $SUBSET \
- --backend $BACKEND \
- --tp $NUM_GPU
\ No newline at end of file
+ --backend $BACKEND
\ No newline at end of file
From dac3a0087a832aed84099f35705f93be2cb27119 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 17:01:56 +0000
Subject: [PATCH 09/36] doc: update parallel arg
---
ADVANCED_USAGE.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index ab6e0ae..1b2ca55 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -66,7 +66,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
- `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
- `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
-- `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel
+- `--parallel`: The number of parallel processes, default to `-1`, e.g. `--parallel 10` will evaluate 10 samples in parallel
- `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds
- `--max_as_limit`: The maximum address space limit for the execution, default to `30*1024` (30 GB), e.g. `--max_as_limit 20*1024` will evaluate the samples with at most 20 GB
- `--max_data_limit`: The maximum data segment limit for the execution, default to `30*1024` (30 GB), e.g. `--max_data_limit 20*1024` will evaluate the samples with at most 20 GB
From 58b3f2d01b285e5db9c2493114f4747dfb883dcc Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 18:52:59 +0000
Subject: [PATCH 10/36] fix: change parallel logic
---
bigcodebench/evaluate.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index a082d56..5a9fab8 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -167,7 +167,7 @@ def evaluate(
pass_k = [int(k) for k in pass_k.split(",")]
- if not parallel:
+ if parallel < 1:
n_workers = max(1, multiprocessing.cpu_count() // 2)
else:
n_workers = parallel
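Taken together, patches 07 and 10 make any `--parallel` value below 1 (including the new default of `-1`) fall back to half the CPU count. A standalone sketch of the resolved logic:

```python
import multiprocessing

def resolve_workers(parallel: int = -1) -> int:
    # Any value below 1 (including the new -1 default) falls back to
    # half the available cores, with a floor of one worker.
    if parallel < 1:
        return max(1, multiprocessing.cpu_count() // 2)
    return parallel

print(resolve_workers())    # machine-dependent fallback
print(resolve_workers(10))  # explicit override -> 10
```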
From 2a28c61ccfdcecf0f3466b955b5a0b1028512310 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 04:39:33 +0800
Subject: [PATCH 11/36] doc: update model outputs link
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 4a6c70c..c6ca7b7 100755
--- a/README.md
+++ b/README.md
@@ -139,7 +139,7 @@ export GOOGLE_API_KEY=
## 💻 LLM-generated Code
We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard):
-* See the attachment of our [v0.2.0](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0). We include `sanitized_samples_calibrated.zip` for your convenience.
+* See the attachment of our [v0.2.0.post3](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0.post3). We include `sanitized_samples_calibrated.zip` for your convenience.
## Advanced Usage
From 817e63b25fe38a0f97abbfcc97fc8ed2e08b2474 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 11:39:47 +0800
Subject: [PATCH 12/36] doc: benchmark description
---
README.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/README.md b/README.md
index c6ca7b7..ab3747b 100755
--- a/README.md
+++ b/README.md
@@ -48,6 +48,10 @@
BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls.
+There are two splits in BigCodeBench:
+- `Complete`: This split is designed for code completion based on comprehensive docstrings.
+- `Instruct`: This split works for instruction-tuned and chat models only; the models are asked to generate a code snippet based on natural-language instructions. The instructions contain only the necessary information and thus require more complex reasoning.
+
### Why BigCodeBench?
BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with:
From 112623038c349d4755f85f8b05a8bceeae1886df Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 8 Oct 2024 16:01:42 +0800
Subject: [PATCH 13/36] remove reflection model
---
analysis/utils.py | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/analysis/utils.py b/analysis/utils.py
index ce81bd6..87453fd 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -1133,24 +1133,24 @@
"act_param": 9,
"open-data": "None",
},
- "mattshumer/ref_70_e3_prefill": {
- "name": "Reflection-Llama-3.1-70B",
- "link": "https://huggingface.co/mattshumer/ref_70_e3",
- "prompted": True,
- "moe": False,
- "size": 70,
- "act_param": 70,
- "open-data": "None",
- },
- "mattshumer/ref_70_e3": {
- "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
- "link": "https://huggingface.co/mattshumer/ref_70_e3",
- "prompted": True,
- "moe": False,
- "size": 70,
- "act_param": 70,
- "open-data": "None",
- },
+ # "mattshumer/ref_70_e3_prefill": {
+ # "name": "Reflection-Llama-3.1-70B",
+ # "link": "https://huggingface.co/mattshumer/ref_70_e3",
+ # "prompted": True,
+ # "moe": False,
+ # "size": 70,
+ # "act_param": 70,
+ # "open-data": "None",
+ # },
+ # "mattshumer/ref_70_e3": {
+ # "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
+ # "link": "https://huggingface.co/mattshumer/ref_70_e3",
+ # "prompted": True,
+ # "moe": False,
+ # "size": 70,
+ # "act_param": 70,
+ # "open-data": "None",
+ # },
"o1-preview-2024-09-12": {
"name": "o1-Preview-2024-09-12 (temperature=1)",
"link": "https://o1.ai/o1-preview",
From e35d6257f149981fbc237abbb74493c14018e8ed Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 13 Oct 2024 03:39:01 +0000
Subject: [PATCH 14/36] doc: add impact
---
README.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/README.md b/README.md
index ab3747b..dba0bd6 100755
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
+ 💥 Impact •
📰 News •
🔥 Quick Start •
🚀 Remote Evaluation •
@@ -23,6 +24,17 @@
📜 Citation
+## 💥 Impact
+BigCodeBench has been used by the many LLM teams including:
+- Zhipu AI
+- Alibaba Qwen
+- DeepSeek
+- Amazon AWS
+- Snowflake AI Research
+- ServiceNow Research
+- Meta AI
+- Cohere AI
+
## 📰 News
- **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`!
- **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator).
From 3b4a058d0768ec1b9d737f1a4ad2de52e4b3f7f1 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 14 Oct 2024 17:00:58 +0800
Subject: [PATCH 15/36] fix(doc): typos
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index dba0bd6..edb220e 100755
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@
## 💥 Impact
-BigCodeBench has been used by the many LLM teams including:
+BigCodeBench has been used by many LLM teams including:
- Zhipu AI
- Alibaba Qwen
- DeepSeek
From e10d361a8dde5d59724a5604b1fab4c65ff735c9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 18 Oct 2024 00:03:02 +0800
Subject: [PATCH 16/36] docs: update impact
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index edb220e..0c1958f 100755
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ BigCodeBench has been used by many LLM teams including:
- Zhipu AI
- Alibaba Qwen
- DeepSeek
-- Amazon AWS
+- Amazon AWS AI
- Snowflake AI Research
- ServiceNow Research
- Meta AI
From e25440ea1b8cd20b83c9f2881f492f33b92fc898 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Thu, 31 Oct 2024 02:13:02 +0800
Subject: [PATCH 17/36] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 0c1958f..c37b203 100755
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ BigCodeBench has been used by many LLM teams including:
- ServiceNow Research
- Meta AI
- Cohere AI
+- Sakana AI
## 📰 News
- **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`!
From c5a22bff708b43698630da8d03dfe5063f52216c Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 1 Nov 2024 17:21:31 +0800
Subject: [PATCH 18/36] feat(codegen): support model revision
---
bigcodebench/generate.py | 4 +++-
bigcodebench/provider/__init__.py | 4 ++++
bigcodebench/provider/base.py | 2 ++
bigcodebench/provider/hf.py | 1 +
bigcodebench/provider/vllm.py | 1 +
5 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 6333261..58f1ab7 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -139,6 +139,7 @@ def run_codegen(
backend: str = "vllm",
base_url: str = None,
tp: int = 1,
+ revision: str = "main",
trust_remote_code: bool = False,
tokenizer_name: str = None,
tokenizer_legacy: bool = False,
@@ -173,6 +174,7 @@ def run_codegen(
response_prefix=response_prefix,
base_url=base_url,
tp=tp,
+ revision=revision,
trust_remote_code=trust_remote_code,
direct_completion=direct_completion,
tokenizer_name=tokenizer_name,
@@ -180,7 +182,7 @@ def run_codegen(
)
extra = "-" + subset if subset != "full" else ""
- identifier = model.replace("/", "--") + f"--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
+ identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
target_path = os.path.join(root, identifier)
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 67123f9..d519124 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -12,6 +12,8 @@ def make_model(
# instruction model only
instruction_prefix: str = None,
response_prefix: str = None,
+ # vllm and hf only
+ revision: str = "main",
# vllm only
tp: int = 1,
direct_completion: bool = False,
@@ -32,6 +34,7 @@ def make_model(
split=split,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ revision=revision,
dataset=dataset,
direct_completion=direct_completion,
tp=tp,
@@ -47,6 +50,7 @@ def make_model(
split=split,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ revision=revision,
dataset=dataset,
direct_completion=direct_completion,
instruction_prefix=instruction_prefix,
diff --git a/bigcodebench/provider/base.py b/bigcodebench/provider/base.py
index ebec843..5a24b59 100644
--- a/bigcodebench/provider/base.py
+++ b/bigcodebench/provider/base.py
@@ -12,6 +12,7 @@ def __init__(
split: str,
temperature: float = 0.8,
max_new_tokens: int = 1280,
+ revision: str = "main",
dtype: str = "bfloat16", # default
direct_completion: bool = False,
trust_remote_code: bool = False,
@@ -29,6 +30,7 @@ def __init__(
self.skip_special_tokens = False
self.max_new_tokens = max_new_tokens
self.dtype = dtype
+ self.revision = revision
self.direct_completion = direct_completion
self.trust_remote_code = trust_remote_code
self.tokenizer_name = tokenizer_name
diff --git a/bigcodebench/provider/hf.py b/bigcodebench/provider/hf.py
index c3136c8..a85957d 100644
--- a/bigcodebench/provider/hf.py
+++ b/bigcodebench/provider/hf.py
@@ -27,6 +27,7 @@ def __init__(
"trust_remote_code": self.trust_remote_code,
"torch_dtype": getattr(torch, self.dtype),
"attn_implementation": attn_implementation, # "eager", "flash_attention_2", "sdpa"
+ "revision": self.revision,
}
self.skip_special_tokens = True
diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py
index 3d0aaf4..171a41c 100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -18,6 +18,7 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
"tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)),
"dtype": self.dtype,
"trust_remote_code": self.trust_remote_code,
+ "revision": self.revision,
}
if self.tokenizer_name is None:
self.tokenizer_name = self.name
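The new `revision` argument is threaded into both the HF and vLLM loader kwargs and into the output file name. A small sketch of the resulting identifier, mirroring the `generate.py` hunk above; the model name and settings are illustrative:

```python
model, revision = "bigcode/starcoder2-15b", "main"  # illustrative values
subset, split, backend = "full", "complete", "vllm"
temperature, n_samples = 0.0, 1

extra = "-" + subset if subset != "full" else ""
identifier = (
    model.replace("/", "--")
    + f"--{revision}--bigcodebench{extra}-{split}--{backend}"
    + f"-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
)
print(identifier)
# bigcode--starcoder2-15b--main--bigcodebench-complete--vllm-0.0-1-sanitized_calibrated.jsonl
```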
From a810f315287cc0a7860791db79b9967420dba80e Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 1 Nov 2024 17:24:20 +0800
Subject: [PATCH 19/36] doc: add model revision
---
ADVANCED_USAGE.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 1b2ca55..67fe359 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -57,6 +57,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10,20` will evaluate the tasks from 10 to 20
- `--backend`: The backend to use, default to `vllm`
- `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None`
+- `--revision`: The revision of the model with the vLLM or HF backend, default to `main`
- `--tp`: The tensor parallel size for the vLLM backend, default to `1`
- `--trust_remote_code`: Whether to trust the remote code, default to `False`
- `--tokenizer_name`: The name of the customized tokenizer, default to `None`
From e8798f47a2c43ada4625f9b67c7db8436980d9f9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 2 Nov 2024 17:22:46 +0800
Subject: [PATCH 20/36] fix: change id_range type
---
bigcodebench/generate.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 58f1ab7..6b3fe37 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -135,7 +135,7 @@ def run_codegen(
strip_newlines: bool = False,
direct_completion: bool = False,
resume: bool = True,
- id_range: Tuple[int, int] = None,
+ id_range: str = None,
backend: str = "vllm",
base_url: str = None,
tp: int = 1,
@@ -152,6 +152,7 @@ def run_codegen(
print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0")
if id_range is not None:
+ id_range = [int(i) for i in id_range.split("-")]
assert len(id_range) == 2, "id_range must be a list of length 2"
assert id_range[0] < id_range[1], "id_range must be increasing"
id_range = tuple(id_range)
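With `id_range` now a string, the CLI accepts a dash-separated pair that is parsed and validated as above. A self-contained sketch of the parsing step:

```python
def parse_id_range(id_range: str) -> tuple:
    # "10-20" -> (10, 20); mirrors the new string-based CLI parsing.
    parts = [int(i) for i in id_range.split("-")]
    assert len(parts) == 2, "id_range must be a list of length 2"
    assert parts[0] < parts[1], "id_range must be increasing"
    return tuple(parts)

print(parse_id_range("10-20"))  # (10, 20)
```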
From 216543126cc8fafa9e7171f5546c38ec801b4a6c Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 2 Nov 2024 19:31:26 +0800
Subject: [PATCH 21/36] fix(codegen): stop by upper bound
---
bigcodebench/generate.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 6b3fe37..9f29cad 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -64,7 +64,7 @@ def codegen(
if id_num < low:
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
continue
- if id_num > id_range[1]:
+ if id_num >= id_range[1]:
break
p_name = task_id.replace("/", "_")
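The off-by-one fix makes the upper bound exclusive, so the range is half-open. A one-liner to confirm the new semantics:

```python
# With the exclusive upper bound, --id_range 10-20 selects task ids in the
# half-open interval [10, 20), i.e. tasks 10 through 19.
selected = [i for i in range(30) if 10 <= i < 20]
print(selected[0], selected[-1], len(selected))  # 10 19 10
```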
From 1e243647dfe02b7ce44ad4a3237cfc81fc4d5e69 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 03:09:00 +0800
Subject: [PATCH 22/36] feat: using datasets to load
---
bigcodebench/data/bigcodebench.py | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/bigcodebench/data/bigcodebench.py b/bigcodebench/data/bigcodebench.py
index da2ad5d..9a3ee9d 100644
--- a/bigcodebench/data/bigcodebench.py
+++ b/bigcodebench/data/bigcodebench.py
@@ -26,14 +26,8 @@ def _ready_bigcodebench_path(subset="full", version="default") -> str:
)
extra = "-" + subset if subset != "full" else ""
-
- try:
- dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
- make_cache(url, dataset, path)
- except:
- if os.path.exists(path):
- os.remove(path)
- make_cache(url, None, path, gh=True)
+ dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
+ make_cache(url, dataset, path)
return path
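With the GitHub fallback removed, the loader now always goes through `datasets`. A hedged sketch, assuming the public `bigcode/bigcodebench` dataset id and a versioned split name; the real values come from `BIGCODEBENCH_HF` and `BIGCODEBENCH_VERSION` in the module above:

```python
from datasets import load_dataset

# Repo id, split name, and field name are illustrative assumptions here.
dataset = load_dataset("bigcode/bigcodebench", split="v0.1.2")
print(dataset[0]["task_id"])
```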
From cb283fdd11d0f8abaaf5e7eec59641b52ccf13fc Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 03:12:45 +0800
Subject: [PATCH 23/36] feat: customize instruction and response
---
bigcodebench/generate.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 9f29cad..757b08c 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -139,6 +139,8 @@ def run_codegen(
backend: str = "vllm",
base_url: str = None,
tp: int = 1,
+ instruction_prefix: str = None,
+ response_prefix: str = None,
revision: str = "main",
trust_remote_code: bool = False,
tokenizer_name: str = None,
@@ -160,8 +162,10 @@ def run_codegen(
# Make project dir
os.makedirs(root, exist_ok=True)
- instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
- response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
+ if instruction_prefix is None:
+ instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
+ if response_prefix is None:
+ response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
# Make dir for codes generated by each model
model_runner = make_model(
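The prefixes are now overridable, with the previously hard-coded strings kept as defaults. A standalone sketch of the fallback behavior introduced above:

```python
def resolve_prefixes(instruction_prefix: str = None, response_prefix: str = None):
    # User-supplied prefixes win; otherwise fall back to the defaults that
    # were previously hard-coded in run_codegen.
    if instruction_prefix is None:
        instruction_prefix = ("Please provide a self-contained Python script that "
                              "solves the following problem in a markdown code block:")
    if response_prefix is None:
        response_prefix = ("Below is a Python script with a self-contained function "
                           "that solves the problem and passes corresponding tests:")
    return instruction_prefix, response_prefix

print(resolve_prefixes(instruction_prefix="Solve the task below:")[0])
```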
From 974e67918e36ee923b06808e5a2edfaa8d7c9319 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 03:20:51 +0800
Subject: [PATCH 24/36] fix: make google api do n samples
---
bigcodebench/gen/util/google_request.py | 8 ++++++--
bigcodebench/provider/google.py | 3 +--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 8a88842..7ce935b 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -5,13 +5,17 @@
def make_request(
- client: genai.GenerativeModel, temperature, messages, max_new_tokens=2048
+ client: genai.GenerativeModel,
+ messages: List,
+ temperature: float,
+ n: int,
+ max_new_tokens: int = 2048,
) -> genai.types.GenerateContentResponse:
messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages]
response = client.generate_content(
messages,
generation_config=genai.types.GenerationConfig(
- candidate_count=1,
+ candidate_count=n,
max_output_tokens=max_new_tokens,
temperature=temperature,
),
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index 0cd5416..c9781ca 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -36,10 +36,9 @@ def codegen(
ret = make_auto_request(
self.client,
message,
- self.name,
n=num_samples,
- max_tokens=self.max_new_tokens,
temperature=self.temperature,
+ max_new_tokens=self.max_new_tokens,
)
for candidate in ret.candidates:
parts = candidate.content.parts
From 492080811f468758e69ca279489fc729783715bd Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 03:56:25 +0800
Subject: [PATCH 25/36] feat: change google api request
---
bigcodebench/gen/util/google_request.py | 5 ++---
bigcodebench/provider/google.py | 2 +-
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 7ce935b..6517650 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -6,14 +6,13 @@
def make_request(
client: genai.GenerativeModel,
- messages: List,
+ message: str,
temperature: float,
n: int,
max_new_tokens: int = 2048,
) -> genai.types.GenerateContentResponse:
- messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages]
response = client.generate_content(
- messages,
+ [{"role": "user", "parts": [message]}],
generation_config=genai.types.GenerationConfig(
candidate_count=n,
max_output_tokens=max_new_tokens,
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index c9781ca..2194c47 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -35,7 +35,7 @@ def codegen(
)
ret = make_auto_request(
self.client,
- message,
+ message=message,
n=num_samples,
temperature=self.temperature,
max_new_tokens=self.max_new_tokens,
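After these two patches, the Gemini call takes a single user message and asks for `n` candidates in one request. A hedged sketch, assuming the `google.generativeai` package and that the backend accepts `candidate_count` greater than 1; the model name is illustrative:

```python
import google.generativeai as genai

client = genai.GenerativeModel("gemini-1.5-pro")  # illustrative model name
response = client.generate_content(
    [{"role": "user", "parts": ["Write a self-contained hello-world function."]}],
    generation_config=genai.types.GenerationConfig(
        candidate_count=3,        # n samples from a single request
        max_output_tokens=2048,
        temperature=0.8,
    ),
)
for candidate in response.candidates:
    print(candidate.content.parts[0].text)
```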
From 1d9ea6af233cf8e86ccf279467b0f8e2b4c93122 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 03:56:55 +0800
Subject: [PATCH 26/36] feat: batch o1 and deepseek-chat via concurrency
---
bigcodebench/gen/util/openai_request.py | 30 +++-------
bigcodebench/provider/openai.py | 75 ++++++++++++++++++++-----
bigcodebench/provider/utility.py | 9 ++-
3 files changed, 77 insertions(+), 37 deletions(-)
diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py
index e347ffe..a745d8d 100644
--- a/bigcodebench/gen/util/openai_request.py
+++ b/bigcodebench/gen/util/openai_request.py
@@ -1,4 +1,3 @@
-import signal
import time
import openai
@@ -14,53 +13,38 @@ def make_request(
n: int = 1,
**kwargs
) -> ChatCompletion:
- system_msg = "You are a helpful assistant good at coding."
- if (
- kwargs.get("response_format", None)
- and kwargs["response_format"]["type"] == "json_object"
- ):
- system_msg = "You are a helpful assistant designed to output JSON."
-
+ kwargs["top_p"] = 0.95
+ kwargs["max_completion_tokens"] = max_tokens
+ if model.startswith("o1-"): # pop top-p and max_completion_tokens
+ kwargs.pop("top_p")
+ kwargs.pop("max_completion_tokens")
+
return client.chat.completions.create(
model=model,
messages=[
- {"role": "system", "content": system_msg},
{"role": "user", "content": message},
],
- max_tokens=max_tokens,
temperature=temperature,
n=n,
**kwargs
)
-def handler(signum, frame):
- # swallow signum and frame
- raise Exception("end of time")
-
-
def make_auto_request(*args, **kwargs) -> ChatCompletion:
ret = None
while ret is None:
try:
- signal.signal(signal.SIGALRM, handler)
- signal.alarm(100)
ret = make_request(*args, **kwargs)
- signal.alarm(0)
except openai.RateLimitError:
print("Rate limit exceeded. Waiting...")
- signal.alarm(0)
time.sleep(5)
except openai.APIConnectionError:
print("API connection error. Waiting...")
- signal.alarm(0)
time.sleep(5)
except openai.APIError as e:
print(e)
- signal.alarm(0)
except Exception as e:
print("Unknown error. Waiting...")
print(e)
- signal.alarm(0)
time.sleep(1)
- return ret
+ return ret
\ No newline at end of file
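A standalone sketch of the new kwargs handling above: the sampling extras are set by default, then stripped again for o1-series models:

```python
def adjust_kwargs(model: str, max_tokens: int, **kwargs) -> dict:
    # Mirrors the hunk above: set top_p and max_completion_tokens by
    # default, then drop both for o1-series models.
    kwargs["top_p"] = 0.95
    kwargs["max_completion_tokens"] = max_tokens
    if model.startswith("o1-"):
        kwargs.pop("top_p")
        kwargs.pop("max_completion_tokens")
    return kwargs

print(adjust_kwargs("o1-preview", 2048))  # {}
print(adjust_kwargs("gpt-4o", 2048))      # {'top_p': 0.95, 'max_completion_tokens': 2048}
```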
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 9eba02e..76e315e 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -1,12 +1,12 @@
import os
from typing import List
-from tqdm import tqdm
import openai
-from bigcodebench.provider.base import DecoderBase
from bigcodebench.gen.util.openai_request import make_auto_request
from bigcodebench.provider.utility import make_raw_chat_prompt
+from bigcodebench.provider.base import DecoderBase
+from bigcodebench.provider.utility import concurrent_call
class OpenAIChatDecoder(DecoderBase):
def __init__(self, name: str, base_url=None, **kwargs) -> None:
@@ -15,34 +15,83 @@ def __init__(self, name: str, base_url=None, **kwargs) -> None:
api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
)
+ # def codegen(
+ # self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
+ # ) -> List[str]:
+ # if do_sample:
+ # assert self.temperature > 0, "Temperature must be positive for sampling"
+ # all_outputs = []
+ # for prompt in tqdm(prompts):
+ # outputs = []
+ # message = make_raw_chat_prompt(
+ # task_prompt=prompt,
+ # subset=self.subset,
+ # split=self.split,
+ # instruction_prefix=self.instruction_prefix,
+ # response_prefix=self.response_prefix,
+ # tokenizer=None,
+ # )
+ # ret = make_auto_request(
+ # self.client,
+ # message=message,
+ # model=self.name,
+ # max_tokens=self.max_new_tokens,
+ # temperature=self.temperature,
+ # n=num_samples,
+ # )
+ # for item in ret.choices:
+ # outputs.append(item.message.content)
+ # all_outputs.append(outputs)
+ # return all_outputs
+
+ # def is_direct_completion(self) -> bool:
+ # return False
+
def codegen(
self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
) -> List[str]:
if do_sample:
assert self.temperature > 0, "Temperature must be positive for sampling"
+ messages = [make_raw_chat_prompt(
+ task_prompt=prompt,
+ subset=self.subset,
+ split=self.split,
+ instruction_prefix=self.instruction_prefix,
+ response_prefix=self.response_prefix,
+ tokenizer=None,
+ ) for prompt in prompts]
+ # use concurrency based batching for o1 and deepseek models
+ if self.name.startswith("o1-") or self.name == "deepseek-chat":
+ return self._codegen_batch_via_concurrency(messages, num_samples)
+
+ return self._codegen_api_batch(messages, num_samples)
+
+ def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]:
+ client = openai.OpenAI(
+ api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url
+ )
+
all_outputs = []
- for prompt in tqdm(prompts):
- outputs = []
- message = make_raw_chat_prompt(
- task_prompt=prompt,
- subset=self.subset,
- split=self.split,
- instruction_prefix=self.instruction_prefix,
- response_prefix=self.response_prefix,
- tokenizer=None,
- )
+ for message in messages:
ret = make_auto_request(
- self.client,
+ client,
message=message,
model=self.name,
max_tokens=self.max_new_tokens,
temperature=self.temperature,
n=num_samples,
)
+ outputs = []
for item in ret.choices:
outputs.append(item.message.content)
all_outputs.append(outputs)
return all_outputs
+ def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]:
+ batches = concurrent_call(
+ num_samples, self._codegen_api_batch, messages, num_samples=1
+ )
+ return [b[0] for b in batches]
+
def is_direct_completion(self) -> bool:
return False
\ No newline at end of file
diff --git a/bigcodebench/provider/utility.py b/bigcodebench/provider/utility.py
index 60a00e5..bb27539 100644
--- a/bigcodebench/provider/utility.py
+++ b/bigcodebench/provider/utility.py
@@ -1,5 +1,6 @@
from typing import List
from transformers import AutoTokenizer
+from concurrent.futures import ThreadPoolExecutor
EOS = [
"<|endoftext|>",
@@ -64,4 +65,10 @@ def make_raw_chat_prompt(
],
tokenize=False,
).split(_MAGIC_SPLITTER_)[0]
- return task_prompt
\ No newline at end of file
+ return task_prompt
+
+
+def concurrent_call(n, callback, /, *args, **kwargs):
+ with ThreadPoolExecutor(max_workers=n) as executor:
+ futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)]
+ return [future.result() for future in futures]
\ No newline at end of file
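The new `concurrent_call` helper fans the same callback out over a thread pool, which is how `_codegen_batch_via_concurrency` turns `n` single-sample requests into one concurrent batch. A self-contained toy demonstrating the pattern:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def concurrent_call(n, callback, /, *args, **kwargs):
    # n threads, each invoking the same callback; results kept in
    # submission order, as in the helper added above.
    with ThreadPoolExecutor(max_workers=n) as executor:
        futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)]
        return [future.result() for future in futures]

def slow_single_sample(prompt: str) -> str:
    time.sleep(0.1)  # stand-in for one n=1 API request
    return f"completion for {prompt!r}"

# Five single-sample calls complete in ~0.1s instead of ~0.5s.
print(concurrent_call(5, slow_single_sample, "hello"))
```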
From 813712f9220d0532f36757fee39ed841da2312d9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 5 Nov 2024 05:29:33 +0800
Subject: [PATCH 27/36] feat: add 3.5 haiku and grok beta
---
analysis/utils.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/analysis/utils.py b/analysis/utils.py
index 87453fd..4cd9862 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -1277,4 +1277,58 @@
"act_param": 3,
"open-data": "None",
},
-}
\ No newline at end of file
+ "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": {
+ "name": "Llama-3.1-Nemotron-70B-Instruct",
+ "link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "prompted": True,
+ "moe": False,
+ "size": 70,
+ "act_param": 70,
+ "open-data": "Partial",
+ },
+ "claude-3-5-sonnet-20241022": {
+ "name": "Claude-3.5-Sonnet-20241022",
+ "link": "https://claude.ai/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+ "ibm-granite/granite-3.0-8b-instruct": {
+ "name": "Granite-3.0-8B-Instruct",
+ "link": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct",
+ "prompted": True,
+ "moe": False,
+ "size": 8,
+ "act_param": 8,
+ "open-data": "None",
+ },
+ "ibm-granite/granite-3.0-2b-instruct": {
+ "name": "Granite-3.0-2B-Instruct",
+ "link": "https://huggingface.co/ibm-granite/granite-3.0-2b-instruct",
+ "prompted": True,
+ "moe": False,
+ "size": 2,
+ "act_param": 2,
+ "open-data": "None",
+ },
+ "grok-beta--main": {
+ "name": "Grok-Beta",
+ "link": "https://grok.com/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+ "claude-3-5-haiku-20241022--main": {
+ "name": "Claude-3.5-Haiku-20241022",
+ "link": "https://claude.ai/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+}
From 16ec422e9af5c9f6663bdca737cce4d8460647a5 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 01:16:07 +0800
Subject: [PATCH 28/36] fix(evaluate): update the calibration setup
---
bigcodebench/evaluate.py | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index 5a9fab8..44c7f93 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -233,7 +233,7 @@ def evaluate(
if "solution" in sample
else problems[task_id]["complete_prompt"] + sample["completion"]
)
- if "sanitized-calibrated" in samples:
+ if "sanitized_calibrated" in samples:
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
remainings.add(sample["_identifier"])
args = (
@@ -254,22 +254,22 @@ def evaluate(
assert n_samples == len(remainings), "Missing problems in unfinished"
assert len(completion_id) == len(problems), "Missing problems in samples"
- def stucking_checker():
- while remainings:
- last_size = len(remainings)
- time.sleep(240)
- if last_size != len(remainings) or len(remainings) == 0:
- continue
- # Potential stucking
- warn("No samples had finished testing in the last 240s")
- warn(f"{len(remainings)} samples to be tested: {remainings}")
+ def stucking_checker():
+ while remainings:
+ last_size = len(remainings)
+ time.sleep(240)
+ if last_size != len(remainings) or len(remainings) == 0:
+ continue
+ # Potential stucking
+ warn("No samples had finished testing in the last 240s")
+ warn(f"{len(remainings)} samples to be tested: {remainings}")
- threading.Thread(target=stucking_checker).start()
+ threading.Thread(target=stucking_checker).start()
- for future in tqdm(as_completed(futures), total=n_samples):
- result = future.result()
- remainings.remove(result["_identifier"])
- eval_results[result["task_id"]].append(result)
+ for future in tqdm(as_completed(futures), total=n_samples):
+ result = future.result()
+ remainings.remove(result["_identifier"])
+ eval_results[result["task_id"]].append(result)
# sort the results for each problem by completion_id
for task_id, task_results in eval_results.items():
@@ -307,7 +307,7 @@ def stucking_checker():
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
pass_at_k["split"] = split
pass_at_k["subset"] = subset
- pass_at_k["calibrated"] = "sanitized-calibrated" in samples
+ pass_at_k["calibrated"] = "sanitized_calibrated" in samples
pass_at_k["gt_pass_rate"] = gt_pass_rate
pass_at_k["failed_tasks"] = failed_tasks
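For calibrated sample files (name contains `sanitized_calibrated`, with the underscore now matched consistently), the evaluator prepends the task's code prompt plus a stub body before execution. A hedged sketch with illustrative prompt strings:

```python
samples = "model--main--bigcodebench-complete--vllm-0-1-sanitized_calibrated.jsonl"
code_prompt = "def task_func(x):"   # illustrative task code prompt
solution = "    return x * 2\n"     # illustrative model continuation

if "sanitized_calibrated" in samples:   # the new underscore-style check
    solution = code_prompt + "\n    pass\n" + solution
print(solution)
```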
From 570a4c8f783f1c954e2256bf6d25e89c2e4cd0ea Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 17:12:19 +0800
Subject: [PATCH 29/36] feat(evaluate): add no_execute flag
---
bigcodebench/evaluate.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index 44c7f93..6d02b4b 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -115,6 +115,7 @@ def evaluate(
split: str,
subset: str,
samples: Optional[str] = None,
+ no_execute: bool = False,
local_execute: bool = False,
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
pass_k: str = "1,5,10",
@@ -135,6 +136,10 @@ def evaluate(
subset=subset,
**model_kwargs,
)
+
+ if no_execute:
+ return
+
assert samples is not None, "No samples provided"
if os.path.isdir(samples):
From 9ff42caca16b461b8eb5b5d74a371fe4f38c0ad9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 17:15:52 +0800
Subject: [PATCH 30/36] fix(doc): change id_range input
---
ADVANCED_USAGE.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 67fe359..0cd8007 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -54,7 +54,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2
- `--direct_completion`: Whether to use direct completion, default to `False`
- `--resume`: Whether to resume the evaluation, default to `True`, set to `False` to re-run the evaluation
-- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10,20` will evaluate the tasks from 10 to 20
+- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10-20` will evaluate the tasks from 10 to 20
- `--backend`: The backend to use, default to `vllm`
- `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None`
- `--revision`: The revision of the model with the vLLM or HF backend, default to `main`
From 8ed15f69c38b3f3d2c0b0ddf8bf638170af9aeba Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 17:52:10 +0800
Subject: [PATCH 31/36] fix(codegen): update make_request
---
bigcodebench/gen/util/google_request.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 6517650..8e696b4 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -26,7 +26,7 @@ def make_request(
],
)
- return response.text
+ return response
def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse:
From 0f4df3e764e9fa132374fbaa206d3caa060219d0 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 17:52:44 +0800
Subject: [PATCH 32/36] fix(codegen): remove commented code
---
bigcodebench/provider/openai.py | 36 +--------------------------------
1 file changed, 1 insertion(+), 35 deletions(-)
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 76e315e..91c1882 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -11,41 +11,7 @@
class OpenAIChatDecoder(DecoderBase):
def __init__(self, name: str, base_url=None, **kwargs) -> None:
super().__init__(name, **kwargs)
- self.client = openai.OpenAI(
- api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
- )
-
- # def codegen(
- # self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
- # ) -> List[str]:
- # if do_sample:
- # assert self.temperature > 0, "Temperature must be positive for sampling"
- # all_outputs = []
- # for prompt in tqdm(prompts):
- # outputs = []
- # message = make_raw_chat_prompt(
- # task_prompt=prompt,
- # subset=self.subset,
- # split=self.split,
- # instruction_prefix=self.instruction_prefix,
- # response_prefix=self.response_prefix,
- # tokenizer=None,
- # )
- # ret = make_auto_request(
- # self.client,
- # message=message,
- # model=self.name,
- # max_tokens=self.max_new_tokens,
- # temperature=self.temperature,
- # n=num_samples,
- # )
- # for item in ret.choices:
- # outputs.append(item.message.content)
- # all_outputs.append(outputs)
- # return all_outputs
-
- # def is_direct_completion(self) -> bool:
- # return False
+ self.base_url = base_url
def codegen(
self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
From d40eceb157ce030755d211412fd01f4f08e3df98 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sat, 9 Nov 2024 17:54:19 +0800
Subject: [PATCH 33/36] doc: add params
---
ADVANCED_USAGE.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 0cd8007..0b2bf7b 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -57,12 +57,15 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10-20` will evaluate the tasks from 10 to 20
- `--backend`: The backend to use, default to `vllm`
- `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None`
+- `--instruction_prefix`: The instruction prefix for the Anthropic backend, default to `None`
+- `--response_prefix`: The response prefix for the Anthropic backend, default to `None`
- `--revision`: The revision of the model with the vLLM or HF backend, default to `main`
- `--tp`: The tensor parallel size for the vLLM backend, default to `1`
- `--trust_remote_code`: Whether to trust the remote code, default to `False`
- `--tokenizer_name`: The name of the customized tokenizer, default to `None`
- `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
- `--samples`: The path to the generated samples file, default to `None`
+- `--no_execute`: Whether to not execute the samples, default to `False`
- `--local_execute`: Whether to execute the samples locally, default to `False`
- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
- `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
From e517a9e2e99e262cf3c464332c6ee0afbbe872d0 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 11 Nov 2024 18:10:01 +0800
Subject: [PATCH 34/36] fix(evaluate): update backup pass_k result path
---
bigcodebench/evaluate.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index 6d02b4b..590d1ae 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -370,7 +370,7 @@ def stucking_checker():
print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
decision = input()
if decision.lower() == "y":
- new_path = result_path + ".bak"
+ new_path = pass_at_k_path + ".bak"
while os.path.isfile(new_path):
new_path += ".bak"
os.rename(pass_at_k_path, new_path)
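The fix points the backup rename at the pass@k file itself rather than the stale `result_path`. A small sketch of the resulting collision-avoiding rename target:

```python
import os

def backup_target(pass_at_k_path: str) -> str:
    # Append ".bak" until the name is unused, matching the corrected logic.
    new_path = pass_at_k_path + ".bak"
    while os.path.isfile(new_path):
        new_path += ".bak"
    return new_path

print(backup_target("results/pass_at_k.json"))  # results/pass_at_k.json.bak
```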
From 54794ed1510959df76dcad34fa50689e6ff9c666 Mon Sep 17 00:00:00 2001
From: LRL
Date: Tue, 12 Nov 2024 11:40:05 +0800
Subject: [PATCH 35/36] fix missing trust_remote_code parameter
---
bigcodebench/provider/__init__.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index d519124..ff27a91 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -40,6 +40,7 @@ def make_model(
tp=tp,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
+ trust_remote_code=trust_remote_code,
)
elif backend == "hf":
from bigcodebench.provider.hf import HuggingFaceDecoder
@@ -56,6 +57,7 @@ def make_model(
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
attn_implementation=attn_implementation,
+ trust_remote_code=trust_remote_code,
)
elif backend == "openai":
from bigcodebench.provider.openai import OpenAIChatDecoder
From 864586393ef9e11e0d09d8e9a58f1d7c632e75f4 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 12 Nov 2024 17:19:15 +0800
Subject: [PATCH 36/36] fix: add tokenizer customization back
---
bigcodebench/provider/__init__.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index ff27a91..ef19f4e 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -41,6 +41,8 @@ def make_model(
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
trust_remote_code=trust_remote_code,
+ tokenizer_name=tokenizer_name,
+ tokenizer_legacy=tokenizer_legacy,
)
elif backend == "hf":
from bigcodebench.provider.hf import HuggingFaceDecoder
@@ -58,6 +60,8 @@ def make_model(
response_prefix=response_prefix,
attn_implementation=attn_implementation,
trust_remote_code=trust_remote_code,
+ tokenizer_name=tokenizer_name,
+ tokenizer_legacy=tokenizer_legacy,
)
elif backend == "openai":
from bigcodebench.provider.openai import OpenAIChatDecoder