From 8b9b46efee7f47054929140cef24e572ce7c393a Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:02:12 +0800
Subject: [PATCH 01/36] doc: update link

---
 ADVANCED_USAGE.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 428cf28..252abf1 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -71,7 +71,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
 - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
 - `--samples`: The path to the generated samples file, default to `None`
 - `--local_execute`: Whether to execute the samples locally, default to `False`
-- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://github.com/bigcode-project/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
+- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
 - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
 - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
 - `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel
@@ -111,7 +111,7 @@ bigcodebench.generate \
 ```
 
 >
-The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples].jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
+The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
 >
 
 ```bash

From 991e41c06d54735d8ffd7e0651d1c080769039bb Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 6 Oct 2024 16:34:57 +0800
Subject: [PATCH 02/36] doc: add warning

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 68b1945..25e61d7 100755
--- a/README.md
+++ b/README.md
@@ -85,6 +85,9 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
 ## 🚀 Remote Evaluation
 
 We use the greedy decoding as an example to show how to evaluate the generated code samples via remote API.
+> [!Warning]
+>
+> To speed up generation, we use batch inference by default. However, batch inference results can vary *across batch sizes* and *across versions*, at least for the vLLM backend. If you want to get more deterministic results for greedy decoding, please set `--bs` to `1`.
> [!Note]
>

From 8cb06e4e4e9cf5070a625df246f0fa6a058c21d2 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:37:22 +0800
Subject: [PATCH 03/36] merge cfg

---
 setup.cfg | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index a9b7c74..4897f68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,9 +29,6 @@ install_requires =
     wget>=3.2
     datasets
     gradio-client
-
-[options.extras_require]
-generate =
     vllm
     numpy
     rich
@@ -48,4 +45,4 @@ console_scripts =
     bigcodebench.syncheck = bigcodebench.syncheck:main
     bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main
     bigcodebench.generate = bigcodebench.generate:main
-    bigcodebench.inspect = bigcodebench.inspect:main
\ No newline at end of file
+    bigcodebench.inspect = bigcodebench.inspect:main

From 0a5154f1afba214de5a28946f115181ccc3bd9ae Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:38:16 +0800
Subject: [PATCH 04/36] doc: merge installation cfg

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 25e61d7..4a6c70c 100755
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ To get started, please first set up the environment:
 
 ```bash
 # By default, you will use the remote evaluation API to execute the output samples.
-pip install bigcodebench[generate] --upgrade
+pip install bigcodebench --upgrade
 
 # You are suggested to use `flash-attn` for generating code samples.
 pip install packaging ninja
@@ -75,7 +75,7 @@ pip install flash-attn --no-build-isolation
 
 ```bash
 # Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
+pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
 ```

From 1cb320fcb3b36945f9fd7ecec7e7f64cf492e5d9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Mon, 7 Oct 2024 00:40:15 +0800
Subject: [PATCH 05/36] Update ADVANCED_USAGE.md

---
 ADVANCED_USAGE.md | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 252abf1..ab6e0ae 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -3,25 +3,19 @@ To get started, please first set up the environment:
 
 ```bash
-# Install to use bigcodebench.evaluate
-pip install bigcodebench --upgrade
-# If you want to use the evaluate locally, you need to install the requirements
+# If you want to run the evaluation locally, you need to install the requirements in an isolated environment
 pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
 
-# Install to use bigcodebench.generate
-# You are strongly recommended to install the generate dependencies in a separate environment
-pip install bigcodebench[generate] --upgrade
+# We strongly recommend installing the bigcodebench dependencies in a separate environment
+pip install bigcodebench --upgrade
 ```
⏬ Install nightly version :: click to expand ::
```bash -# Install to use bigcodebench.evaluate +# Install to use bigcodebench pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade - -# Install to use bigcodebench.generate -pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade ```
@@ -34,10 +28,8 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode git clone https://github.com/bigcode-project/bigcodebench.git cd bigcodebench export PYTHONPATH=$PYTHONPATH:$(pwd) -# Install to use bigcodebench.evaluate +# Install to use bigcodebench pip install -e . -# Install to use bigcodebench.generate -pip install -e .[generate] ``` From f6de469cb536920ac638345836cbacd11bd503b0 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 6 Oct 2024 16:54:19 +0000 Subject: [PATCH 06/36] docker: update Gradio.Dockerfile --- Docker/Gradio.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/Gradio.Dockerfile b/Docker/Gradio.Dockerfile index 9de820b..df4018f 100644 --- a/Docker/Gradio.Dockerfile +++ b/Docker/Gradio.Dockerfile @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y git g++ python3-tk zip unzip procps r-b # upgrade to latest pip RUN pip install --upgrade pip -RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth]==4.31.0 gradio_leaderboard==0.0.11 schedule==1.2.2 +RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth] schedule==1.2.2 # Add a new user "bigcodebenchuser" RUN adduser --disabled-password --gecos "" bigcodebenchuser From b8c1811623aae0493152f9dff01d65781f5102f7 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 6 Oct 2024 16:59:59 +0000 Subject: [PATCH 07/36] refactor(eval): update parallel default val --- bigcodebench/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index df8ad85..a082d56 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -119,7 +119,7 @@ def evaluate( remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/", pass_k: str = "1,5,10", save_pass_rate: bool = True, - parallel: int = None, + parallel: int = -1, min_time_limit: float = 1, max_as_limit: int = 30*1024, max_data_limit: int = 30*1024, @@ -167,7 +167,7 @@ def evaluate( pass_k = [int(k) for k in pass_k.split(",")] - if parallel is None: + if not parallel: n_workers = max(1, multiprocessing.cpu_count() // 2) else: n_workers = parallel From 0825835f4b86daf3f25e6bcf1786c71679cdba6e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 6 Oct 2024 17:00:48 +0000 Subject: [PATCH 08/36] doc: update minimal full script --- run.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/run.sh b/run.sh index a84199e..c069e8e 100755 --- a/run.sh +++ b/run.sh @@ -9,5 +9,4 @@ bigcodebench.evaluate \ --model $MODEL \ --split $SPLIT \ --subset $SUBSET \ - --backend $BACKEND \ - --tp $NUM_GPU \ No newline at end of file + --backend $BACKEND \ No newline at end of file From dac3a0087a832aed84099f35705f93be2cb27119 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 6 Oct 2024 17:01:56 +0000 Subject: [PATCH 09/36] doc: update parallel arg --- ADVANCED_USAGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index ab6e0ae..1b2ca55 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -66,7 +66,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--remote_execute_api`: The API endpoint for 
remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page. - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True` -- `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel +- `--parallel`: The number of parallel processes, default to `-1`, e.g. `--parallel 10` will evaluate 10 samples in parallel - `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds - `--max_as_limit`: The maximum address space limit for the execution, default to `30*1024` (30 GB), e.g. `--max_as_limit 20*1024` will evaluate the samples with at most 20 GB - `--max_data_limit`: The maximum data segment limit for the execution, default to `30*1024` (30 GB), e.g. `--max_data_limit 20*1024` will evaluate the samples with at most 20 GB From 58b3f2d01b285e5db9c2493114f4747dfb883dcc Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 6 Oct 2024 18:52:59 +0000 Subject: [PATCH 10/36] fix: change parallel logic --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index a082d56..5a9fab8 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -167,7 +167,7 @@ def evaluate( pass_k = [int(k) for k in pass_k.split(",")] - if not parallel: + if parallel < 1: n_workers = max(1, multiprocessing.cpu_count() // 2) else: n_workers = parallel From 2a28c61ccfdcecf0f3466b955b5a0b1028512310 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 7 Oct 2024 04:39:33 +0800 Subject: [PATCH 11/36] doc: update model outputs link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a6c70c..c6ca7b7 100755 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ export GOOGLE_API_KEY= ## 💻 LLM-generated Code We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard): -* See the attachment of our [v0.2.0](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0). We include `sanitized_samples_calibrated.zip` for your convenience. +* See the attachment of our [v0.2.0.post3](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0.post3). We include `sanitized_samples_calibrated.zip` for your convenience. ## Advanced Usage From 817e63b25fe38a0f97abbfcc97fc8ed2e08b2474 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 7 Oct 2024 11:39:47 +0800 Subject: [PATCH 12/36] doc: benchmark description --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c6ca7b7..ab3747b 100755 --- a/README.md +++ b/README.md @@ -48,6 +48,10 @@ BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls. 
+There are two splits in BigCodeBench:
+- `Complete`: This split is designed for code completion based on the comprehensive docstrings.
+- `Instruct`: This split works for the instruction-tuned and chat models only, where the models are asked to generate a code snippet based on the natural language instructions. The instructions only contain necessary information and require more complex reasoning.
+
 ### Why BigCodeBench?
 
 BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with:
 

From 112623038c349d4755f85f8b05a8bceeae1886df Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 8 Oct 2024 16:01:42 +0800
Subject: [PATCH 13/36] remove reflection model

---
 analysis/utils.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/analysis/utils.py b/analysis/utils.py
index ce81bd6..87453fd 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -1133,24 +1133,24 @@
         "act_param": 9,
         "open-data": "None",
     },
-    "mattshumer/ref_70_e3_prefill": {
-        "name": "Reflection-Llama-3.1-70B",
-        "link": "https://huggingface.co/mattshumer/ref_70_e3",
-        "prompted": True,
-        "moe": False,
-        "size": 70,
-        "act_param": 70,
-        "open-data": "None",
-    },
-    "mattshumer/ref_70_e3": {
-        "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
-        "link": "https://huggingface.co/mattshumer/ref_70_e3",
-        "prompted": True,
-        "moe": False,
-        "size": 70,
-        "act_param": 70,
-        "open-data": "None",
-    },
+    # "mattshumer/ref_70_e3_prefill": {
+    #     "name": "Reflection-Llama-3.1-70B",
+    #     "link": "https://huggingface.co/mattshumer/ref_70_e3",
+    #     "prompted": True,
+    #     "moe": False,
+    #     "size": 70,
+    #     "act_param": 70,
+    #     "open-data": "None",
+    # },
+    # "mattshumer/ref_70_e3": {
+    #     "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
+    #     "link": "https://huggingface.co/mattshumer/ref_70_e3",
+    #     "prompted": True,
+    #     "moe": False,
+    #     "size": 70,
+    #     "act_param": 70,
+    #     "open-data": "None",
+    # },
     "o1-preview-2024-09-12": {
         "name": "o1-Preview-2024-09-12 (temperature=1)",
         "link": "https://o1.ai/o1-preview",

From e35d6257f149981fbc237abbb74493c14018e8ed Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Sun, 13 Oct 2024 03:39:01 +0000
Subject: [PATCH 14/36] doc: add impact

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index ab3747b..dba0bd6 100755
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@

+ 💥 Impact📰 News🔥 Quick Start🚀 Remote Evaluation • @@ -23,6 +24,17 @@ 📜 Citation

+## 💥 Impact +BigCodeBench has been used by the many LLM teams including: +- Zhipu AI +- Alibaba Qwen +- DeepSeek +- Amazon AWS +- Snowflake AI Research +- ServiceNow Research +- Meta AI +- Cohere AI + ## 📰 News - **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`! - **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator). From 3b4a058d0768ec1b9d737f1a4ad2de52e4b3f7f1 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 14 Oct 2024 17:00:58 +0800 Subject: [PATCH 15/36] fix(doc): typos --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dba0bd6..edb220e 100755 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@

## 💥 Impact -BigCodeBench has been used by the many LLM teams including: +BigCodeBench has been used by many LLM teams including: - Zhipu AI - Alibaba Qwen - DeepSeek From e10d361a8dde5d59724a5604b1fab4c65ff735c9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 18 Oct 2024 00:03:02 +0800 Subject: [PATCH 16/36] docs: update impact --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index edb220e..0c1958f 100755 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ BigCodeBench has been used by many LLM teams including: - Zhipu AI - Alibaba Qwen - DeepSeek -- Amazon AWS +- Amazon AWS AI - Snowflake AI Research - ServiceNow Research - Meta AI From e25440ea1b8cd20b83c9f2881f492f33b92fc898 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 31 Oct 2024 02:13:02 +0800 Subject: [PATCH 17/36] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0c1958f..c37b203 100755 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ BigCodeBench has been used by many LLM teams including: - ServiceNow Research - Meta AI - Cohere AI +- Sakana AI ## 📰 News - **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`! From c5a22bff708b43698630da8d03dfe5063f52216c Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 1 Nov 2024 17:21:31 +0800 Subject: [PATCH 18/36] feat(codegen): support model revision --- bigcodebench/generate.py | 4 +++- bigcodebench/provider/__init__.py | 4 ++++ bigcodebench/provider/base.py | 2 ++ bigcodebench/provider/hf.py | 1 + bigcodebench/provider/vllm.py | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 6333261..58f1ab7 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -139,6 +139,7 @@ def run_codegen( backend: str = "vllm", base_url: str = None, tp: int = 1, + revision: str = "main", trust_remote_code: bool = False, tokenizer_name: str = None, tokenizer_legacy: bool = False, @@ -173,6 +174,7 @@ def run_codegen( response_prefix=response_prefix, base_url=base_url, tp=tp, + revision=revision, trust_remote_code=trust_remote_code, direct_completion=direct_completion, tokenizer_name=tokenizer_name, @@ -180,7 +182,7 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - identifier = model.replace("/", "--") + f"--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" + identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" target_path = os.path.join(root, identifier) diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index 67123f9..d519124 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -12,6 +12,8 @@ def make_model( # instruction model only instruction_prefix: str = None, response_prefix: str = None, + # vllm and hf only + revision: str = "main", # vllm only tp: int = 1, direct_completion: bool = False, @@ -32,6 +34,7 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + revision=revision, dataset=dataset, direct_completion=direct_completion, tp=tp, @@ -47,6 +50,7 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + revision=revision, dataset=dataset, direct_completion=direct_completion, instruction_prefix=instruction_prefix, diff --git a/bigcodebench/provider/base.py b/bigcodebench/provider/base.py index 
ebec843..5a24b59 100644 --- a/bigcodebench/provider/base.py +++ b/bigcodebench/provider/base.py @@ -12,6 +12,7 @@ def __init__( split: str, temperature: float = 0.8, max_new_tokens: int = 1280, + revision: str = "main", dtype: str = "bfloat16", # default direct_completion: bool = False, trust_remote_code: bool = False, @@ -29,6 +30,7 @@ def __init__( self.skip_special_tokens = False self.max_new_tokens = max_new_tokens self.dtype = dtype + self.revision = revision self.direct_completion = direct_completion self.trust_remote_code = trust_remote_code self.tokenizer_name = tokenizer_name diff --git a/bigcodebench/provider/hf.py b/bigcodebench/provider/hf.py index c3136c8..a85957d 100644 --- a/bigcodebench/provider/hf.py +++ b/bigcodebench/provider/hf.py @@ -27,6 +27,7 @@ def __init__( "trust_remote_code": self.trust_remote_code, "torch_dtype": getattr(torch, self.dtype), "attn_implementation": attn_implementation, # "eager", "flash_attention_2", "sdpa" + "revision": self.revision, } self.skip_special_tokens = True diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 3d0aaf4..171a41c 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -18,6 +18,7 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: "tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)), "dtype": self.dtype, "trust_remote_code": self.trust_remote_code, + "revision": self.revision, } if self.tokenizer_name is None: self.tokenizer_name = self.name From a810f315287cc0a7860791db79b9967420dba80e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 1 Nov 2024 17:24:20 +0800 Subject: [PATCH 19/36] doc: add model revision --- ADVANCED_USAGE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 1b2ca55..67fe359 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -57,6 +57,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. 
`--id_range 10,20` will evaluate the tasks from 10 to 20 - `--backend`: The backend to use, default to `vllm` - `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None` +- `--revision`: The revision of the model with the vLLM or HF backend, default to `main` - `--tp`: The tensor parallel size for the vLLM backend, default to `1` - `--trust_remote_code`: Whether to trust the remote code, default to `False` - `--tokenizer_name`: The name of the customized tokenizer, default to `None` From e8798f47a2c43ada4625f9b67c7db8436980d9f9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 2 Nov 2024 17:22:46 +0800 Subject: [PATCH 20/36] fix: change id_range type --- bigcodebench/generate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 58f1ab7..6b3fe37 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -135,7 +135,7 @@ def run_codegen( strip_newlines: bool = False, direct_completion: bool = False, resume: bool = True, - id_range: Tuple[int, int] = None, + id_range: str = None, backend: str = "vllm", base_url: str = None, tp: int = 1, @@ -152,6 +152,7 @@ def run_codegen( print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0") if id_range is not None: + id_range = [int(i) for i in id_range.split("-")] assert len(id_range) == 2, "id_range must be a list of length 2" assert id_range[0] < id_range[1], "id_range must be increasing" id_range = tuple(id_range) From 216543126cc8fafa9e7171f5546c38ec801b4a6c Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 2 Nov 2024 19:31:26 +0800 Subject: [PATCH 21/36] fix(codegen): stop by upper bound --- bigcodebench/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 6b3fe37..9f29cad 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -64,7 +64,7 @@ def codegen( if id_num < low: p.console.print(f"Skipping {task_id} as it is not in {id_range}") continue - if id_num > id_range[1]: + if id_num >= id_range[1]: break p_name = task_id.replace("/", "_") From 1e243647dfe02b7ce44ad4a3237cfc81fc4d5e69 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 03:09:00 +0800 Subject: [PATCH 22/36] feat: using datasets to load --- bigcodebench/data/bigcodebench.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/bigcodebench/data/bigcodebench.py b/bigcodebench/data/bigcodebench.py index da2ad5d..9a3ee9d 100644 --- a/bigcodebench/data/bigcodebench.py +++ b/bigcodebench/data/bigcodebench.py @@ -26,14 +26,8 @@ def _ready_bigcodebench_path(subset="full", version="default") -> str: ) extra = "-" + subset if subset != "full" else "" - - try: - dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION) - make_cache(url, dataset, path) - except: - if os.path.exists(path): - os.remove(path) - make_cache(url, None, path, gh=True) + dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION) + make_cache(url, dataset, path) return path From cb283fdd11d0f8abaaf5e7eec59641b52ccf13fc Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 03:12:45 +0800 Subject: [PATCH 23/36] feat: customize instruction and response --- bigcodebench/generate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 9f29cad..757b08c 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -139,6 +139,8 
@@ def run_codegen( backend: str = "vllm", base_url: str = None, tp: int = 1, + instruction_prefix: str = None, + response_prefix: str = None, revision: str = "main", trust_remote_code: bool = False, tokenizer_name: str = None, @@ -160,8 +162,10 @@ def run_codegen( # Make project dir os.makedirs(root, exist_ok=True) - instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:" - response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:" + if instruction_prefix is None: + instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:" + if response_prefix is None: + response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:" # Make dir for codes generated by each model model_runner = make_model( From 974e67918e36ee923b06808e5a2edfaa8d7c9319 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 03:20:51 +0800 Subject: [PATCH 24/36] fix: make google api do n samples --- bigcodebench/gen/util/google_request.py | 8 ++++++-- bigcodebench/provider/google.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 8a88842..7ce935b 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -5,13 +5,17 @@ def make_request( - client: genai.GenerativeModel, temperature, messages, max_new_tokens=2048 + client: genai.GenerativeModel, + messages: List, + temperature: float, + n: int, + max_new_tokens: int = 2048, ) -> genai.types.GenerateContentResponse: messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages] response = client.generate_content( messages, generation_config=genai.types.GenerationConfig( - candidate_count=1, + candidate_count=n, max_output_tokens=max_new_tokens, temperature=temperature, ), diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py index 0cd5416..c9781ca 100644 --- a/bigcodebench/provider/google.py +++ b/bigcodebench/provider/google.py @@ -36,10 +36,9 @@ def codegen( ret = make_auto_request( self.client, message, - self.name, n=num_samples, - max_tokens=self.max_new_tokens, temperature=self.temperature, + max_new_tokens=self.max_new_tokens, ) for candidate in ret.candidates: parts = candidate.content.parts From 492080811f468758e69ca279489fc729783715bd Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 03:56:25 +0800 Subject: [PATCH 25/36] feat: change google api request --- bigcodebench/gen/util/google_request.py | 5 ++--- bigcodebench/provider/google.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 7ce935b..6517650 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -6,14 +6,13 @@ def make_request( client: genai.GenerativeModel, - messages: List, + message: str, temperature: float, n: int, max_new_tokens: int = 2048, ) -> genai.types.GenerateContentResponse: - messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages] response = client.generate_content( - messages, + [{"role": "user", "parts": [message]}], generation_config=genai.types.GenerationConfig( candidate_count=n, max_output_tokens=max_new_tokens, diff --git 
a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py index c9781ca..2194c47 100644 --- a/bigcodebench/provider/google.py +++ b/bigcodebench/provider/google.py @@ -35,7 +35,7 @@ def codegen( ) ret = make_auto_request( self.client, - message, + message=message, n=num_samples, temperature=self.temperature, max_new_tokens=self.max_new_tokens, From 1d9ea6af233cf8e86ccf279467b0f8e2b4c93122 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 03:56:55 +0800 Subject: [PATCH 26/36] feat: batch o1 and deepseek-chat via concurrency --- bigcodebench/gen/util/openai_request.py | 30 +++------- bigcodebench/provider/openai.py | 75 ++++++++++++++++++++----- bigcodebench/provider/utility.py | 9 ++- 3 files changed, 77 insertions(+), 37 deletions(-) diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py index e347ffe..a745d8d 100644 --- a/bigcodebench/gen/util/openai_request.py +++ b/bigcodebench/gen/util/openai_request.py @@ -1,4 +1,3 @@ -import signal import time import openai @@ -14,53 +13,38 @@ def make_request( n: int = 1, **kwargs ) -> ChatCompletion: - system_msg = "You are a helpful assistant good at coding." - if ( - kwargs.get("response_format", None) - and kwargs["response_format"]["type"] == "json_object" - ): - system_msg = "You are a helpful assistant designed to output JSON." - + kwargs["top_p"] = 0.95 + kwargs["max_completion_tokens"] = max_tokens + if model.startswith("o1-"): # pop top-p and max_completion_tokens + kwargs.pop("top_p") + kwargs.pop("max_completion_tokens") + return client.chat.completions.create( model=model, messages=[ - {"role": "system", "content": system_msg}, {"role": "user", "content": message}, ], - max_tokens=max_tokens, temperature=temperature, n=n, **kwargs ) -def handler(signum, frame): - # swallow signum and frame - raise Exception("end of time") - - def make_auto_request(*args, **kwargs) -> ChatCompletion: ret = None while ret is None: try: - signal.signal(signal.SIGALRM, handler) - signal.alarm(100) ret = make_request(*args, **kwargs) - signal.alarm(0) except openai.RateLimitError: print("Rate limit exceeded. Waiting...") - signal.alarm(0) time.sleep(5) except openai.APIConnectionError: print("API connection error. Waiting...") - signal.alarm(0) time.sleep(5) except openai.APIError as e: print(e) - signal.alarm(0) except Exception as e: print("Unknown error. 
Waiting...") print(e) - signal.alarm(0) time.sleep(1) - return ret + return ret \ No newline at end of file diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py index 9eba02e..76e315e 100644 --- a/bigcodebench/provider/openai.py +++ b/bigcodebench/provider/openai.py @@ -1,12 +1,12 @@ import os from typing import List -from tqdm import tqdm import openai -from bigcodebench.provider.base import DecoderBase from bigcodebench.gen.util.openai_request import make_auto_request from bigcodebench.provider.utility import make_raw_chat_prompt +from bigcodebench.provider.base import DecoderBase +from bigcodebench.provider.utility import concurrent_call class OpenAIChatDecoder(DecoderBase): def __init__(self, name: str, base_url=None, **kwargs) -> None: @@ -15,34 +15,83 @@ def __init__(self, name: str, base_url=None, **kwargs) -> None: api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url ) + # def codegen( + # self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 + # ) -> List[str]: + # if do_sample: + # assert self.temperature > 0, "Temperature must be positive for sampling" + # all_outputs = [] + # for prompt in tqdm(prompts): + # outputs = [] + # message = make_raw_chat_prompt( + # task_prompt=prompt, + # subset=self.subset, + # split=self.split, + # instruction_prefix=self.instruction_prefix, + # response_prefix=self.response_prefix, + # tokenizer=None, + # ) + # ret = make_auto_request( + # self.client, + # message=message, + # model=self.name, + # max_tokens=self.max_new_tokens, + # temperature=self.temperature, + # n=num_samples, + # ) + # for item in ret.choices: + # outputs.append(item.message.content) + # all_outputs.append(outputs) + # return all_outputs + + # def is_direct_completion(self) -> bool: + # return False + def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 ) -> List[str]: if do_sample: assert self.temperature > 0, "Temperature must be positive for sampling" + messages = [make_raw_chat_prompt( + task_prompt=prompt, + subset=self.subset, + split=self.split, + instruction_prefix=self.instruction_prefix, + response_prefix=self.response_prefix, + tokenizer=None, + ) for prompt in prompts] + # use concurrency based batching for o1 and deepseek models + if self.name.startswith("o1-") or self.name == "deepseek-chat": + return self._codegen_batch_via_concurrency(messages, num_samples) + + return self._codegen_api_batch(messages, num_samples) + + def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]: + client = openai.OpenAI( + api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url + ) + all_outputs = [] - for prompt in tqdm(prompts): - outputs = [] - message = make_raw_chat_prompt( - task_prompt=prompt, - subset=self.subset, - split=self.split, - instruction_prefix=self.instruction_prefix, - response_prefix=self.response_prefix, - tokenizer=None, - ) + for message in messages: ret = make_auto_request( - self.client, + client, message=message, model=self.name, max_tokens=self.max_new_tokens, temperature=self.temperature, n=num_samples, ) + outputs = [] for item in ret.choices: outputs.append(item.message.content) all_outputs.append(outputs) return all_outputs + def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]: + batches = concurrent_call( + num_samples, self._codegen_api_batch, messages, num_samples=1 + ) + return [b[0] for b in batches] + def is_direct_completion(self) -> bool: return False \ No newline at end of file 
diff --git a/bigcodebench/provider/utility.py b/bigcodebench/provider/utility.py index 60a00e5..bb27539 100644 --- a/bigcodebench/provider/utility.py +++ b/bigcodebench/provider/utility.py @@ -1,5 +1,6 @@ from typing import List from transformers import AutoTokenizer +from concurrent.futures import ThreadPoolExecutor EOS = [ "<|endoftext|>", @@ -64,4 +65,10 @@ def make_raw_chat_prompt( ], tokenize=False, ).split(_MAGIC_SPLITTER_)[0] - return task_prompt \ No newline at end of file + return task_prompt + + +def concurrent_call(n, callback, /, *args, **kwargs): + with ThreadPoolExecutor(max_workers=n) as executor: + futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)] + return [future.result() for future in futures] \ No newline at end of file From 813712f9220d0532f36757fee39ed841da2312d9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 5 Nov 2024 05:29:33 +0800 Subject: [PATCH 27/36] feat: add 3.5 haiku and grok beta --- analysis/utils.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/analysis/utils.py b/analysis/utils.py index 87453fd..4cd9862 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -1277,4 +1277,58 @@ "act_param": 3, "open-data": "None", }, -} \ No newline at end of file + "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { + "name": "Llama-3.1-Nemotron-70B-Instruct", + "link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "prompted": True, + "moe": False, + "size": 70, + "act_param": 70, + "open-data": "Partial", + }, + "claude-3-5-sonnet-20241022": { + "name": "Claude-3.5-Sonnet-20241022", + "link": "https://claude.ai/", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + }, + "ibm-granite/granite-3.0-8b-instruct": { + "name": "Granite-3.0-8B-Instruct", + "link": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct", + "prompted": True, + "moe": False, + "size": 8, + "act_param": 8, + "open-data": "None", + }, + "ibm-granite/granite-3.0-2b-instruct": { + "name": "Granite-3.0-2B-Instruct", + "link": "https://huggingface.co/ibm-granite/granite-3.0-2b-instruct", + "prompted": True, + "moe": False, + "size": 2, + "act_param": 2, + "open-data": "None", + }, + "grok-beta--main": { + "name": "Grok-Beta", + "link": "https://grok.com/", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + }, + "claude-3-5-haiku-20241022--main": { + "name": "Claude-3.5-Haiku-20241022", + "link": "https://claude.ai/", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + }, +} From 16ec422e9af5c9f6663bdca737cce4d8460647a5 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 01:16:07 +0800 Subject: [PATCH 28/36] fix(evaluate): update the calibration setup --- bigcodebench/evaluate.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 5a9fab8..44c7f93 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -233,7 +233,7 @@ def evaluate( if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) - if "sanitized-calibrated" in samples: + if "sanitized_calibrated" in samples: solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution remainings.add(sample["_identifier"]) args = ( @@ -254,22 +254,22 @@ def evaluate( assert n_samples == len(remainings), "Missing problems in 
unfinished" assert len(completion_id) == len(problems), "Missing problems in samples" - def stucking_checker(): - while remainings: - last_size = len(remainings) - time.sleep(240) - if last_size != len(remainings) or len(remainings) == 0: - continue - # Potential stucking - warn("No samples had finished testing in the last 240s") - warn(f"{len(remainings)} samples to be tested: {remainings}") + def stucking_checker(): + while remainings: + last_size = len(remainings) + time.sleep(240) + if last_size != len(remainings) or len(remainings) == 0: + continue + # Potential stucking + warn("No samples had finished testing in the last 240s") + warn(f"{len(remainings)} samples to be tested: {remainings}") - threading.Thread(target=stucking_checker).start() + threading.Thread(target=stucking_checker).start() - for future in tqdm(as_completed(futures), total=n_samples): - result = future.result() - remainings.remove(result["_identifier"]) - eval_results[result["task_id"]].append(result) + for future in tqdm(as_completed(futures), total=n_samples): + result = future.result() + remainings.remove(result["_identifier"]) + eval_results[result["task_id"]].append(result) # sort the results for each problem by completion_id for task_id, task_results in eval_results.items(): @@ -307,7 +307,7 @@ def stucking_checker(): pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0] pass_at_k["split"] = split pass_at_k["subset"] = subset - pass_at_k["calibrated"] = "sanitized-calibrated" in samples + pass_at_k["calibrated"] = "sanitized_calibrated" in samples pass_at_k["gt_pass_rate"] = gt_pass_rate pass_at_k["failed_tasks"] = failed_tasks From 570a4c8f783f1c954e2256bf6d25e89c2e4cd0ea Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 17:12:19 +0800 Subject: [PATCH 29/36] feat(evaluate): add no_execute flag --- bigcodebench/evaluate.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 44c7f93..6d02b4b 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -115,6 +115,7 @@ def evaluate( split: str, subset: str, samples: Optional[str] = None, + no_execute: bool = False, local_execute: bool = False, remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/", pass_k: str = "1,5,10", @@ -135,6 +136,10 @@ def evaluate( subset=subset, **model_kwargs, ) + + if no_execute: + return + assert samples is not None, "No samples provided" if os.path.isdir(samples): From 9ff42caca16b461b8eb5b5d74a371fe4f38c0ad9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 17:15:52 +0800 Subject: [PATCH 30/36] fix(doc): change id_range input --- ADVANCED_USAGE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 67fe359..0cd8007 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -54,7 +54,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2 - `--direct_completion`: Whether to use direct completion, default to `False` - `--resume`: Whether to resume the evaluation, default to `True`, set to `False` to re-run the evaluation -- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10,20` will evaluate the tasks from 10 to 20 +- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. 
`--id_range 10-20` will evaluate the tasks from 10 to 20 - `--backend`: The backend to use, default to `vllm` - `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None` - `--revision`: The revision of the model with the vLLM or HF backend, default to `main` From 8ed15f69c38b3f3d2c0b0ddf8bf638170af9aeba Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 17:52:10 +0800 Subject: [PATCH 31/36] fix(codegen): update make_request --- bigcodebench/gen/util/google_request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 6517650..8e696b4 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -26,7 +26,7 @@ def make_request( ], ) - return response.text + return response def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse: From 0f4df3e764e9fa132374fbaa206d3caa060219d0 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 17:52:44 +0800 Subject: [PATCH 32/36] fix(codegen): remove commented code --- bigcodebench/provider/openai.py | 36 +-------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py index 76e315e..91c1882 100644 --- a/bigcodebench/provider/openai.py +++ b/bigcodebench/provider/openai.py @@ -11,41 +11,7 @@ class OpenAIChatDecoder(DecoderBase): def __init__(self, name: str, base_url=None, **kwargs) -> None: super().__init__(name, **kwargs) - self.client = openai.OpenAI( - api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url - ) - - # def codegen( - # self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 - # ) -> List[str]: - # if do_sample: - # assert self.temperature > 0, "Temperature must be positive for sampling" - # all_outputs = [] - # for prompt in tqdm(prompts): - # outputs = [] - # message = make_raw_chat_prompt( - # task_prompt=prompt, - # subset=self.subset, - # split=self.split, - # instruction_prefix=self.instruction_prefix, - # response_prefix=self.response_prefix, - # tokenizer=None, - # ) - # ret = make_auto_request( - # self.client, - # message=message, - # model=self.name, - # max_tokens=self.max_new_tokens, - # temperature=self.temperature, - # n=num_samples, - # ) - # for item in ret.choices: - # outputs.append(item.message.content) - # all_outputs.append(outputs) - # return all_outputs - - # def is_direct_completion(self) -> bool: - # return False + self.base_url = base_url def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 From d40eceb157ce030755d211412fd01f4f08e3df98 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sat, 9 Nov 2024 17:54:19 +0800 Subject: [PATCH 33/36] doc: add params --- ADVANCED_USAGE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 0cd8007..0b2bf7b 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -57,12 +57,15 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. 
`--id_range 10-20` will evaluate the tasks from 10 to 20 - `--backend`: The backend to use, default to `vllm` - `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None` +- `--instruction_prefix`: The instruction prefix for the Anthropic backend, default to `None` +- `--response_prefix`: The response prefix for the Anthropic backend, default to `None` - `--revision`: The revision of the model with the vLLM or HF backend, default to `main` - `--tp`: The tensor parallel size for the vLLM backend, default to `1` - `--trust_remote_code`: Whether to trust the remote code, default to `False` - `--tokenizer_name`: The name of the customized tokenizer, default to `None` - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False` - `--samples`: The path to the generated samples file, default to `None` +- `--no_execute`: Whether to not execute the samples, default to `False` - `--local_execute`: Whether to execute the samples locally, default to `False` - `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page. - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` From e517a9e2e99e262cf3c464332c6ee0afbbe872d0 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 11 Nov 2024 18:10:01 +0800 Subject: [PATCH 34/36] fix(evaluate): update backup pass_k result path --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 6d02b4b..590d1ae 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -370,7 +370,7 @@ def stucking_checker(): print(f"Save pass@k to {pass_at_k_path}? 
[Y/N]") decision = input() if decision.lower() == "y": - new_path = result_path + ".bak" + new_path = pass_at_k_path + ".bak" while os.path.isfile(new_path): new_path += ".bak" os.rename(pass_at_k_path, new_path) From 54794ed1510959df76dcad34fa50689e6ff9c666 Mon Sep 17 00:00:00 2001 From: LRL Date: Tue, 12 Nov 2024 11:40:05 +0800 Subject: [PATCH 35/36] fix missing trust_remote_code parameter --- bigcodebench/provider/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index d519124..ff27a91 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -40,6 +40,7 @@ def make_model( tp=tp, instruction_prefix=instruction_prefix, response_prefix=response_prefix, + trust_remote_code=trust_remote_code, ) elif backend == "hf": from bigcodebench.provider.hf import HuggingFaceDecoder @@ -56,6 +57,7 @@ def make_model( instruction_prefix=instruction_prefix, response_prefix=response_prefix, attn_implementation=attn_implementation, + trust_remote_code=trust_remote_code, ) elif backend == "openai": from bigcodebench.provider.openai import OpenAIChatDecoder From 864586393ef9e11e0d09d8e9a58f1d7c632e75f4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 12 Nov 2024 17:19:15 +0800 Subject: [PATCH 36/36] fix: add tokenizer customization back --- bigcodebench/provider/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index ff27a91..ef19f4e 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -41,6 +41,8 @@ def make_model( instruction_prefix=instruction_prefix, response_prefix=response_prefix, trust_remote_code=trust_remote_code, + tokenizer_name=tokenizer_name, + tokenizer_legacy=tokenizer_legacy, ) elif backend == "hf": from bigcodebench.provider.hf import HuggingFaceDecoder @@ -58,6 +60,8 @@ def make_model( response_prefix=response_prefix, attn_implementation=attn_implementation, trust_remote_code=trust_remote_code, + tokenizer_name=tokenizer_name, + tokenizer_legacy=tokenizer_legacy, ) elif backend == "openai": from bigcodebench.provider.openai import OpenAIChatDecoder