diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 428cf28..0b2bf7b 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -3,25 +3,19 @@
To get started, please first set up the environment:
```bash
-# Install to use bigcodebench.evaluate
-pip install bigcodebench --upgrade
-# If you want to use the evaluate locally, you need to install the requirements
+# If you want to run the evaluation locally, you need to install the requirements in an isolated environment
pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
-# Install to use bigcodebench.generate
-# You are strongly recommended to install the generate dependencies in a separate environment
-pip install bigcodebench[generate] --upgrade
+# We strongly recommend installing the bigcodebench dependencies in a separate environment
+pip install bigcodebench --upgrade
```
⏬ Install nightly version :: click to expand ::
```bash
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
-
-# Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
```
@@ -34,10 +28,8 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
git clone https://github.com/bigcode-project/bigcodebench.git
cd bigcodebench
export PYTHONPATH=$PYTHONPATH:$(pwd)
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
pip install -e .
-# Install to use bigcodebench.generate
-pip install -e .[generate]
```
@@ -62,19 +54,23 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2
- `--direct_completion`: Whether to use direct completion, default to `False`
- `--resume`: Whether to resume the evaluation, default to `True`, set to `False` to re-run the evaluation
-- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10,20` will evaluate the tasks from 10 to 20
+- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10-20` will evaluate tasks 10 through 19 (the lower bound is inclusive, the upper bound exclusive)
- `--backend`: The backend to use, default to `vllm`
- `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None`
+- `--instruction_prefix`: The instruction prefix prepended to each task prompt, default to `None` (the built-in default prefix is used)
+- `--response_prefix`: The response prefix used to steer the model's reply, default to `None` (the built-in default prefix is used)
+- `--revision`: The model revision to load with the vLLM or HF backend, default to `main`
- `--tp`: The tensor parallel size for the vLLM backend, default to `1`
- `--trust_remote_code`: Whether to trust the remote code, default to `False`
- `--tokenizer_name`: The name of the customized tokenizer, default to `None`
- `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
- `--samples`: The path to the generated samples file, default to `None`
+- `--no_execute`: Whether to skip execution and only generate the samples, default to `False`
- `--local_execute`: Whether to execute the samples locally, default to `False`
-- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://github.com/bigcode-project/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
+- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`; you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) space and checking `Use via API` at the bottom of the HF space page.
- `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
- `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
-- `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel
+- `--parallel`: The number of parallel processes, default to `-1` (i.e. use half of the available CPU cores), e.g. `--parallel 10` will evaluate 10 samples in parallel; see the example invocation after this list
- `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds
- `--max_as_limit`: The maximum address space limit for the execution, default to `30*1024` (30 GB), e.g. `--max_as_limit 20*1024` will evaluate the samples with at most 20 GB
- `--max_data_limit`: The maximum data segment limit for the execution, default to `30*1024` (30 GB), e.g. `--max_data_limit 20*1024` will evaluate the samples with at most 20 GB
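+
+As a minimal illustration (the model name below is only an example; adjust the flags to your setup), several of the arguments above can be combined as follows:
+
+```bash
+bigcodebench.evaluate \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --split instruct \
+  --subset full \
+  --backend vllm \
+  --id_range 10-20 \
+  --pass_k 1,5,10 \
+  --parallel 10
+```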
@@ -111,7 +107,7 @@ bigcodebench.generate \
```
>
-The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples].jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
+The generated code samples will be stored in a file named `[model_name]--[revision]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
>
```bash
diff --git a/Docker/Gradio.Dockerfile b/Docker/Gradio.Dockerfile
index 9de820b..df4018f 100644
--- a/Docker/Gradio.Dockerfile
+++ b/Docker/Gradio.Dockerfile
@@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y git g++ python3-tk zip unzip procps r-b
# upgrade to latest pip
RUN pip install --upgrade pip
-RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth]==4.31.0 gradio_leaderboard==0.0.11 schedule==1.2.2
+RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 "huggingface-hub>=0.18.0" plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff "gradio[oauth]" schedule==1.2.2
# Add a new user "bigcodebenchuser"
RUN adduser --disabled-password --gecos "" bigcodebenchuser
diff --git a/README.md b/README.md
index 68b1945..c37b203 100755
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
+ 💥 Impact •
📰 News •
🔥 Quick Start •
🚀 Remote Evaluation •
@@ -23,6 +24,18 @@
📜 Citation
+## 💥 Impact
+BigCodeBench has been used by many LLM teams including:
+- Zhipu AI
+- Alibaba Qwen
+- DeepSeek
+- Amazon AWS AI
+- Snowflake AI Research
+- ServiceNow Research
+- Meta AI
+- Cohere AI
+- Sakana AI
+
## 📰 News
- **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`!
- **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator).
@@ -48,6 +61,10 @@
BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls.
+There are two splits in BigCodeBench:
+- `Complete`: This split is designed for code completion based on comprehensive docstrings.
+- `Instruct`: This split is intended for instruction-tuned and chat models only: the models are asked to generate a code snippet based on natural-language instructions. The instructions contain only the necessary information and therefore require more complex reasoning. A minimal example of selecting each split at generation time is shown below.
+
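+For illustration (the model name is only an example), the split is selected at generation time via `--split`:
+
+```bash
+# Generate samples for the code-completion split
+bigcodebench.generate --model meta-llama/Llama-3.1-8B-Instruct --split complete --subset full
+# Generate samples for the instruction split (instruction-tuned/chat models only)
+bigcodebench.generate --model meta-llama/Llama-3.1-8B-Instruct --split instruct --subset full
+```
+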
### Why BigCodeBench?
BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with:
@@ -61,7 +78,7 @@ To get started, please first set up the environment:
```bash
# By default, you will use the remote evaluation API to execute the output samples.
-pip install bigcodebench[generate] --upgrade
+pip install bigcodebench --upgrade
# You are suggested to use `flash-attn` for generating code samples.
pip install packaging ninja
@@ -75,7 +92,7 @@ pip install flash-attn --no-build-isolation
```bash
# Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
+pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
```
@@ -85,6 +102,9 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
## 🚀 Remote Evaluation
We use the greedy decoding as an example to show how to evaluate the generated code samples via remote API.
+> [!Warning]
+>
+> To ease the generation, we use batch inference by default. However, batch inference results can vary *across batch sizes* and *across versions*, at least for the vLLM backend. If you want more deterministic results for greedy decoding, set `--bs` to `1`, as shown below.
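+>
+> For example (the model name is only an illustration), a more deterministic greedy run could look like:
+>
+> ```bash
+> bigcodebench.evaluate --model meta-llama/Llama-3.1-8B-Instruct --split complete --subset full --greedy --bs 1
+> ```
+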
> [!Note]
>
@@ -136,7 +156,7 @@ export GOOGLE_API_KEY=
## 💻 LLM-generated Code
We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard):
-* See the attachment of our [v0.2.0](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0). We include `sanitized_samples_calibrated.zip` for your convenience.
+* See the attachments of our [v0.2.0.post3](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0.post3) release. We include `sanitized_samples_calibrated.zip` for your convenience.
## Advanced Usage
diff --git a/analysis/utils.py b/analysis/utils.py
index ce81bd6..4cd9862 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -1133,24 +1133,24 @@
"act_param": 9,
"open-data": "None",
},
- "mattshumer/ref_70_e3_prefill": {
- "name": "Reflection-Llama-3.1-70B",
- "link": "https://huggingface.co/mattshumer/ref_70_e3",
- "prompted": True,
- "moe": False,
- "size": 70,
- "act_param": 70,
- "open-data": "None",
- },
- "mattshumer/ref_70_e3": {
- "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
- "link": "https://huggingface.co/mattshumer/ref_70_e3",
- "prompted": True,
- "moe": False,
- "size": 70,
- "act_param": 70,
- "open-data": "None",
- },
+ # "mattshumer/ref_70_e3_prefill": {
+ # "name": "Reflection-Llama-3.1-70B",
+ # "link": "https://huggingface.co/mattshumer/ref_70_e3",
+ # "prompted": True,
+ # "moe": False,
+ # "size": 70,
+ # "act_param": 70,
+ # "open-data": "None",
+ # },
+ # "mattshumer/ref_70_e3": {
+ # "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
+ # "link": "https://huggingface.co/mattshumer/ref_70_e3",
+ # "prompted": True,
+ # "moe": False,
+ # "size": 70,
+ # "act_param": 70,
+ # "open-data": "None",
+ # },
"o1-preview-2024-09-12": {
"name": "o1-Preview-2024-09-12 (temperature=1)",
"link": "https://o1.ai/o1-preview",
@@ -1277,4 +1277,58 @@
"act_param": 3,
"open-data": "None",
},
-}
\ No newline at end of file
+ "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": {
+ "name": "Llama-3.1-Nemotron-70B-Instruct",
+ "link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "prompted": True,
+ "moe": False,
+ "size": 70,
+ "act_param": 70,
+ "open-data": "Partial",
+ },
+ "claude-3-5-sonnet-20241022": {
+ "name": "Claude-3.5-Sonnet-20241022",
+ "link": "https://claude.ai/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+ "ibm-granite/granite-3.0-8b-instruct": {
+ "name": "Granite-3.0-8B-Instruct",
+ "link": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct",
+ "prompted": True,
+ "moe": False,
+ "size": 8,
+ "act_param": 8,
+ "open-data": "None",
+ },
+ "ibm-granite/granite-3.0-2b-instruct": {
+ "name": "Granite-3.0-2B-Instruct",
+ "link": "https://huggingface.co/ibm-granite/granite-3.0-2b-instruct",
+ "prompted": True,
+ "moe": False,
+ "size": 2,
+ "act_param": 2,
+ "open-data": "None",
+ },
+ "grok-beta--main": {
+ "name": "Grok-Beta",
+ "link": "https://grok.com/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+ "claude-3-5-haiku-20241022--main": {
+ "name": "Claude-3.5-Haiku-20241022",
+ "link": "https://claude.ai/",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ },
+}
diff --git a/bigcodebench/data/bigcodebench.py b/bigcodebench/data/bigcodebench.py
index da2ad5d..9a3ee9d 100644
--- a/bigcodebench/data/bigcodebench.py
+++ b/bigcodebench/data/bigcodebench.py
@@ -26,14 +26,8 @@ def _ready_bigcodebench_path(subset="full", version="default") -> str:
)
extra = "-" + subset if subset != "full" else ""
-
- try:
- dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
- make_cache(url, dataset, path)
- except:
- if os.path.exists(path):
- os.remove(path)
- make_cache(url, None, path, gh=True)
+ dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
+ make_cache(url, dataset, path)
return path
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index df8ad85..590d1ae 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -115,11 +115,12 @@ def evaluate(
split: str,
subset: str,
samples: Optional[str] = None,
+ no_execute: bool = False,
local_execute: bool = False,
remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
pass_k: str = "1,5,10",
save_pass_rate: bool = True,
- parallel: int = None,
+ parallel: int = -1,
min_time_limit: float = 1,
max_as_limit: int = 30*1024,
max_data_limit: int = 30*1024,
@@ -135,6 +136,10 @@ def evaluate(
subset=subset,
**model_kwargs,
)
+
+ if no_execute:
+ return
+
assert samples is not None, "No samples provided"
if os.path.isdir(samples):
@@ -167,7 +172,7 @@ def evaluate(
pass_k = [int(k) for k in pass_k.split(",")]
- if parallel is None:
+ if parallel < 1:
n_workers = max(1, multiprocessing.cpu_count() // 2)
else:
n_workers = parallel
@@ -233,7 +238,7 @@ def evaluate(
if "solution" in sample
else problems[task_id]["complete_prompt"] + sample["completion"]
)
- if "sanitized-calibrated" in samples:
+ if "sanitized_calibrated" in samples:
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
remainings.add(sample["_identifier"])
args = (
@@ -254,22 +259,22 @@ def evaluate(
assert n_samples == len(remainings), "Missing problems in unfinished"
assert len(completion_id) == len(problems), "Missing problems in samples"
- def stucking_checker():
- while remainings:
- last_size = len(remainings)
- time.sleep(240)
- if last_size != len(remainings) or len(remainings) == 0:
- continue
- # Potential stucking
- warn("No samples had finished testing in the last 240s")
- warn(f"{len(remainings)} samples to be tested: {remainings}")
+ def stucking_checker():
+ while remainings:
+ last_size = len(remainings)
+ time.sleep(240)
+ if last_size != len(remainings) or len(remainings) == 0:
+ continue
+ # Potential stucking
+ warn("No samples had finished testing in the last 240s")
+ warn(f"{len(remainings)} samples to be tested: {remainings}")
- threading.Thread(target=stucking_checker).start()
+ threading.Thread(target=stucking_checker).start()
- for future in tqdm(as_completed(futures), total=n_samples):
- result = future.result()
- remainings.remove(result["_identifier"])
- eval_results[result["task_id"]].append(result)
+ for future in tqdm(as_completed(futures), total=n_samples):
+ result = future.result()
+ remainings.remove(result["_identifier"])
+ eval_results[result["task_id"]].append(result)
# sort the results for each problem by completion_id
for task_id, task_results in eval_results.items():
@@ -307,7 +312,7 @@ def stucking_checker():
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
pass_at_k["split"] = split
pass_at_k["subset"] = subset
- pass_at_k["calibrated"] = "sanitized-calibrated" in samples
+ pass_at_k["calibrated"] = "sanitized_calibrated" in samples
pass_at_k["gt_pass_rate"] = gt_pass_rate
pass_at_k["failed_tasks"] = failed_tasks
@@ -365,7 +370,7 @@ def stucking_checker():
print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
decision = input()
if decision.lower() == "y":
- new_path = result_path + ".bak"
+ new_path = pass_at_k_path + ".bak"
while os.path.isfile(new_path):
new_path += ".bak"
os.rename(pass_at_k_path, new_path)
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 8a88842..8e696b4 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -5,13 +5,16 @@
def make_request(
- client: genai.GenerativeModel, temperature, messages, max_new_tokens=2048
+ client: genai.GenerativeModel,
+ message: str,
+ temperature: float,
+ n: int,
+ max_new_tokens: int = 2048,
) -> genai.types.GenerateContentResponse:
- messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages]
response = client.generate_content(
- messages,
+ [{"role": "user", "parts": [message]}],
generation_config=genai.types.GenerationConfig(
- candidate_count=1,
+ candidate_count=n,
max_output_tokens=max_new_tokens,
temperature=temperature,
),
@@ -23,7 +26,7 @@ def make_request(
],
)
- return response.text
+ return response
def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse:
diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py
index e347ffe..a745d8d 100644
--- a/bigcodebench/gen/util/openai_request.py
+++ b/bigcodebench/gen/util/openai_request.py
@@ -1,4 +1,3 @@
-import signal
import time
import openai
@@ -14,53 +13,38 @@ def make_request(
n: int = 1,
**kwargs
) -> ChatCompletion:
- system_msg = "You are a helpful assistant good at coding."
- if (
- kwargs.get("response_format", None)
- and kwargs["response_format"]["type"] == "json_object"
- ):
- system_msg = "You are a helpful assistant designed to output JSON."
-
+ kwargs["top_p"] = 0.95
+ kwargs["max_completion_tokens"] = max_tokens
+ if model.startswith("o1-"): # pop top-p and max_completion_tokens
+ kwargs.pop("top_p")
+ kwargs.pop("max_completion_tokens")
+
return client.chat.completions.create(
model=model,
messages=[
- {"role": "system", "content": system_msg},
{"role": "user", "content": message},
],
- max_tokens=max_tokens,
temperature=temperature,
n=n,
**kwargs
)
-def handler(signum, frame):
- # swallow signum and frame
- raise Exception("end of time")
-
-
def make_auto_request(*args, **kwargs) -> ChatCompletion:
ret = None
while ret is None:
try:
- signal.signal(signal.SIGALRM, handler)
- signal.alarm(100)
ret = make_request(*args, **kwargs)
- signal.alarm(0)
except openai.RateLimitError:
print("Rate limit exceeded. Waiting...")
- signal.alarm(0)
time.sleep(5)
except openai.APIConnectionError:
print("API connection error. Waiting...")
- signal.alarm(0)
time.sleep(5)
except openai.APIError as e:
print(e)
- signal.alarm(0)
except Exception as e:
print("Unknown error. Waiting...")
print(e)
- signal.alarm(0)
time.sleep(1)
- return ret
+ return ret
\ No newline at end of file
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 6333261..757b08c 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -64,7 +64,7 @@ def codegen(
if id_num < low:
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
continue
- if id_num > id_range[1]:
+ if id_num >= id_range[1]:
break
p_name = task_id.replace("/", "_")
@@ -135,10 +135,13 @@ def run_codegen(
strip_newlines: bool = False,
direct_completion: bool = False,
resume: bool = True,
- id_range: Tuple[int, int] = None,
+ id_range: str = None,
backend: str = "vllm",
base_url: str = None,
tp: int = 1,
+ instruction_prefix: str = None,
+ response_prefix: str = None,
+ revision: str = "main",
trust_remote_code: bool = False,
tokenizer_name: str = None,
tokenizer_legacy: bool = False,
@@ -151,6 +154,7 @@ def run_codegen(
print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0")
if id_range is not None:
+ id_range = [int(i) for i in id_range.split("-")]
assert len(id_range) == 2, "id_range must be a list of length 2"
assert id_range[0] < id_range[1], "id_range must be increasing"
id_range = tuple(id_range)
@@ -158,8 +162,10 @@ def run_codegen(
# Make project dir
os.makedirs(root, exist_ok=True)
- instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
- response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
+ if instruction_prefix is None:
+ instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
+ if response_prefix is None:
+ response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
# Make dir for codes generated by each model
model_runner = make_model(
@@ -173,6 +179,7 @@ def run_codegen(
response_prefix=response_prefix,
base_url=base_url,
tp=tp,
+ revision=revision,
trust_remote_code=trust_remote_code,
direct_completion=direct_completion,
tokenizer_name=tokenizer_name,
@@ -180,7 +187,7 @@ def run_codegen(
)
extra = "-" + subset if subset != "full" else ""
- identifier = model.replace("/", "--") + f"--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
+ identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
target_path = os.path.join(root, identifier)
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 67123f9..ef19f4e 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -12,6 +12,8 @@ def make_model(
# instruction model only
instruction_prefix: str = None,
response_prefix: str = None,
+ # vllm and hf only
+ revision: str = "main",
# vllm only
tp: int = 1,
direct_completion: bool = False,
@@ -32,11 +34,15 @@ def make_model(
split=split,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ revision=revision,
dataset=dataset,
direct_completion=direct_completion,
tp=tp,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
+ trust_remote_code=trust_remote_code,
+ tokenizer_name=tokenizer_name,
+ tokenizer_legacy=tokenizer_legacy,
)
elif backend == "hf":
from bigcodebench.provider.hf import HuggingFaceDecoder
@@ -47,11 +53,15 @@ def make_model(
split=split,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ revision=revision,
dataset=dataset,
direct_completion=direct_completion,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
attn_implementation=attn_implementation,
+ trust_remote_code=trust_remote_code,
+ tokenizer_name=tokenizer_name,
+ tokenizer_legacy=tokenizer_legacy,
)
elif backend == "openai":
from bigcodebench.provider.openai import OpenAIChatDecoder
diff --git a/bigcodebench/provider/base.py b/bigcodebench/provider/base.py
index ebec843..5a24b59 100644
--- a/bigcodebench/provider/base.py
+++ b/bigcodebench/provider/base.py
@@ -12,6 +12,7 @@ def __init__(
split: str,
temperature: float = 0.8,
max_new_tokens: int = 1280,
+ revision: str = "main",
dtype: str = "bfloat16", # default
direct_completion: bool = False,
trust_remote_code: bool = False,
@@ -29,6 +30,7 @@ def __init__(
self.skip_special_tokens = False
self.max_new_tokens = max_new_tokens
self.dtype = dtype
+ self.revision = revision
self.direct_completion = direct_completion
self.trust_remote_code = trust_remote_code
self.tokenizer_name = tokenizer_name
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index 0cd5416..2194c47 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -35,11 +35,10 @@ def codegen(
)
ret = make_auto_request(
self.client,
- message,
- self.name,
+ message=message,
n=num_samples,
- max_tokens=self.max_new_tokens,
temperature=self.temperature,
+ max_new_tokens=self.max_new_tokens,
)
for candidate in ret.candidates:
parts = candidate.content.parts
diff --git a/bigcodebench/provider/hf.py b/bigcodebench/provider/hf.py
index c3136c8..a85957d 100644
--- a/bigcodebench/provider/hf.py
+++ b/bigcodebench/provider/hf.py
@@ -27,6 +27,7 @@ def __init__(
"trust_remote_code": self.trust_remote_code,
"torch_dtype": getattr(torch, self.dtype),
"attn_implementation": attn_implementation, # "eager", "flash_attention_2", "sdpa"
+ "revision": self.revision,
}
self.skip_special_tokens = True
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 9eba02e..91c1882 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -1,48 +1,63 @@
import os
from typing import List
-from tqdm import tqdm
import openai
-from bigcodebench.provider.base import DecoderBase
from bigcodebench.gen.util.openai_request import make_auto_request
from bigcodebench.provider.utility import make_raw_chat_prompt
+from bigcodebench.provider.base import DecoderBase
+from bigcodebench.provider.utility import concurrent_call
class OpenAIChatDecoder(DecoderBase):
def __init__(self, name: str, base_url=None, **kwargs) -> None:
super().__init__(name, **kwargs)
- self.client = openai.OpenAI(
- api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
- )
-
+ self.base_url = base_url
+
def codegen(
self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
) -> List[str]:
if do_sample:
assert self.temperature > 0, "Temperature must be positive for sampling"
+ messages = [make_raw_chat_prompt(
+ task_prompt=prompt,
+ subset=self.subset,
+ split=self.split,
+ instruction_prefix=self.instruction_prefix,
+ response_prefix=self.response_prefix,
+ tokenizer=None,
+ ) for prompt in prompts]
+ # use concurrency based batching for o1 and deepseek models
+ if self.name.startswith("o1-") or self.name == "deepseek-chat":
+ return self._codegen_batch_via_concurrency(messages, num_samples)
+
+ return self._codegen_api_batch(messages, num_samples)
+
+ def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]:
+ client = openai.OpenAI(
+ api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url
+ )
+
all_outputs = []
- for prompt in tqdm(prompts):
- outputs = []
- message = make_raw_chat_prompt(
- task_prompt=prompt,
- subset=self.subset,
- split=self.split,
- instruction_prefix=self.instruction_prefix,
- response_prefix=self.response_prefix,
- tokenizer=None,
- )
+ for message in messages:
ret = make_auto_request(
- self.client,
+ client,
message=message,
model=self.name,
max_tokens=self.max_new_tokens,
temperature=self.temperature,
n=num_samples,
)
+ outputs = []
for item in ret.choices:
outputs.append(item.message.content)
all_outputs.append(outputs)
return all_outputs
+ def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]:
+ batches = concurrent_call(
+ num_samples, self._codegen_api_batch, messages, num_samples=1
+ )
+ return [b[0] for b in batches]
+
def is_direct_completion(self) -> bool:
return False
\ No newline at end of file
diff --git a/bigcodebench/provider/utility.py b/bigcodebench/provider/utility.py
index 60a00e5..bb27539 100644
--- a/bigcodebench/provider/utility.py
+++ b/bigcodebench/provider/utility.py
@@ -1,5 +1,6 @@
from typing import List
from transformers import AutoTokenizer
+from concurrent.futures import ThreadPoolExecutor
EOS = [
"<|endoftext|>",
@@ -64,4 +65,10 @@ def make_raw_chat_prompt(
],
tokenize=False,
).split(_MAGIC_SPLITTER_)[0]
- return task_prompt
\ No newline at end of file
+ return task_prompt
+
+
+def concurrent_call(n, callback, /, *args, **kwargs):
+ with ThreadPoolExecutor(max_workers=n) as executor:
+ futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)]
+ return [future.result() for future in futures]
\ No newline at end of file
diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py
index 3d0aaf4..171a41c 100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -18,6 +18,7 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
"tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)),
"dtype": self.dtype,
"trust_remote_code": self.trust_remote_code,
+ "revision": self.revision,
}
if self.tokenizer_name is None:
self.tokenizer_name = self.name
diff --git a/run.sh b/run.sh
index a84199e..c069e8e 100755
--- a/run.sh
+++ b/run.sh
@@ -9,5 +9,4 @@ bigcodebench.evaluate \
--model $MODEL \
--split $SPLIT \
--subset $SUBSET \
- --backend $BACKEND \
- --tp $NUM_GPU
\ No newline at end of file
+ --backend $BACKEND
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index a9b7c74..4897f68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,9 +29,6 @@ install_requires =
wget>=3.2
datasets
gradio-client
-
-[options.extras_require]
-generate =
vllm
numpy
rich
@@ -48,4 +45,4 @@ console_scripts =
bigcodebench.syncheck = bigcodebench.syncheck:main
bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main
bigcodebench.generate = bigcodebench.generate:main
- bigcodebench.inspect = bigcodebench.inspect:main
\ No newline at end of file
+ bigcodebench.inspect = bigcodebench.inspect:main