diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 428cf28..0b2bf7b 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -3,25 +3,19 @@
 To get started, please first set up the environment:
 
 ```bash
-# Install to use bigcodebench.evaluate
-pip install bigcodebench --upgrade
-# If you want to use the evaluate locally, you need to install the requirements
+# If you want to run the evaluation locally, you need to install the requirements in an isolated environment
 pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
-# Install to use bigcodebench.generate
-# You are strongly recommended to install the generate dependencies in a separate environment
-pip install bigcodebench[generate] --upgrade
+# It is strongly recommended to install the bigcodebench dependencies in a separate environment
+pip install bigcodebench --upgrade
 ```
⏬ Install nightly version :: click to expand ::
 ```bash
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
 pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
-
-# Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
 ```
@@ -34,10 +28,8 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
 git clone https://github.com/bigcode-project/bigcodebench.git
 cd bigcodebench
 export PYTHONPATH=$PYTHONPATH:$(pwd)
-# Install to use bigcodebench.evaluate
+# Install to use bigcodebench
 pip install -e .
-# Install to use bigcodebench.generate
-pip install -e .[generate]
 ```
@@ -62,19 +54,23 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
 - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2
 - `--direct_completion`: Whether to use direct completion, default to `False`
 - `--resume`: Whether to resume the evaluation, default to `True`, set to `False` to re-run the evaluation
-- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10,20` will evaluate the tasks from 10 to 20
+- `--id_range`: The range of the tasks to evaluate, default to `None`, e.g. `--id_range 10-20` will evaluate the tasks from 10 (inclusive) to 20 (exclusive)
 - `--backend`: The backend to use, default to `vllm`
 - `--base_url`: The base URL of the backend for OpenAI-compatible APIs, default to `None`
+- `--instruction_prefix`: The instruction prefix for the Anthropic backend, default to `None`
+- `--response_prefix`: The response prefix for the Anthropic backend, default to `None`
+- `--revision`: The revision of the model with the vLLM or HF backend, default to `main`
 - `--tp`: The tensor parallel size for the vLLM backend, default to `1`
 - `--trust_remote_code`: Whether to trust the remote code, default to `False`
 - `--tokenizer_name`: The name of the customized tokenizer, default to `None`
 - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
 - `--samples`: The path to the generated samples file, default to `None`
+- `--no_execute`: Whether to skip executing the samples, default to `False`
 - `--local_execute`: Whether to execute the samples locally, default to `False`
-- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://github.com/bigcode-project/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
+- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and checking `Use via API` at the bottom of the HF space page.
 - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
 - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
-- `--parallel`: The number of parallel processes, default to `None`, e.g. `--parallel 10` will evaluate 10 samples in parallel
+- `--parallel`: The number of parallel processes, default to `-1`, e.g. `--parallel 10` will evaluate 10 samples in parallel
 - `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds
 - `--max_as_limit`: The maximum address space limit for the execution, default to `30*1024` (30 GB), e.g. `--max_as_limit 20*1024` will evaluate the samples with at most 20 GB
 - `--max_data_limit`: The maximum data segment limit for the execution, default to `30*1024` (30 GB), e.g. `--max_data_limit 20*1024` will evaluate the samples with at most 20 GB
@@ -111,7 +107,7 @@ bigcodebench.generate \
 ```
 >
-The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples].jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
+The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
 >
 ```bash
diff --git a/Docker/Gradio.Dockerfile b/Docker/Gradio.Dockerfile
index 9de820b..df4018f 100644
--- a/Docker/Gradio.Dockerfile
+++ b/Docker/Gradio.Dockerfile
@@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y git g++ python3-tk zip unzip procps r-b
 # upgrade to latest pip
 RUN pip install --upgrade pip
-RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth]==4.31.0 gradio_leaderboard==0.0.11 schedule==1.2.2
+RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth] schedule==1.2.2
 
 # Add a new user "bigcodebenchuser"
 RUN adduser --disabled-password --gecos "" bigcodebenchuser
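The `--id_range` and `--parallel` flags documented above changed format in this patch (dash-separated range, `-1` sentinel). Below is a minimal sketch of how those values are interpreted, mirroring the `generate.py` and `evaluate.py` hunks later in this diff; the function names here are illustrative and not part of the patch:

```python
import multiprocessing

def parse_id_range(id_range: str) -> tuple:
    # "--id_range 10-20" is now dash-separated; 10 is inclusive, 20 exclusive
    # (generate.py breaks once id_num >= id_range[1]).
    low, high = (int(i) for i in id_range.split("-"))
    assert low < high, "id_range must be increasing"
    return low, high

def resolve_workers(parallel: int = -1) -> int:
    # evaluate.py now defaults --parallel to -1, meaning "pick for me":
    # any value < 1 falls back to half the available CPUs (at least 1).
    return max(1, multiprocessing.cpu_count() // 2) if parallel < 1 else parallel

print(parse_id_range("10-20"))   # (10, 20)
print(resolve_workers())         # e.g. 8 on a 16-core machine
```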

diff --git a/README.md b/README.md
index 68b1945..c37b203 100755
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
+    💥 Impact •
     📰 News •
     🔥 Quick Start •
     🚀 Remote Evaluation •
@@ -23,6 +24,18 @@
     📜 Citation
 
+## 💥 Impact
+BigCodeBench has been used by many LLM teams, including:
+- Zhipu AI
+- Alibaba Qwen
+- DeepSeek
+- Amazon AWS AI
+- Snowflake AI Research
+- ServiceNow Research
+- Meta AI
+- Cohere AI
+- Sakana AI
+
 ## 📰 News
 - **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`!
 - **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator).
@@ -48,6 +61,10 @@
 BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls.
 
+There are two splits in BigCodeBench:
+- `Complete`: This split is designed for code completion based on the comprehensive docstrings.
+- `Instruct`: This split works for instruction-tuned and chat models only; the models are asked to generate a code snippet based on a natural language instruction that contains only the necessary information and therefore requires more complex reasoning.
+
 ### Why BigCodeBench?
 
 BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with:
@@ -61,7 +78,7 @@ To get started, please first set up the environment:
 ```bash
 # By default, you will use the remote evaluation API to execute the output samples.
-pip install bigcodebench[generate] --upgrade
+pip install bigcodebench --upgrade
 
 # You are suggested to use `flash-attn` for generating code samples.
 pip install packaging ninja
@@ -75,7 +92,7 @@ pip install flash-attn --no-build-isolation
 ```bash
 # Install to use bigcodebench.generate
-pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcodebench[generate]" --upgrade
+pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade
 ```
@@ -85,6 +102,9 @@ pip install "git+https://github.com/bigcode-project/bigcodebench.git#egg=bigcode
 ## 🚀 Remote Evaluation
 We use the greedy decoding as an example to show how to evaluate the generated code samples via remote API.
+> [!Warning]
+>
+> To ease the generation, we use batch inference by default. However, batch inference results can vary *across batch sizes* and *across versions*, at least for the vLLM backend. If you want more deterministic results for greedy decoding, please set `--bs` to `1`.
 
 > [!Note]
 >
@@ -136,7 +156,7 @@ export GOOGLE_API_KEY=
 ## 💻 LLM-generated Code
 We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard):
-* See the attachment of our [v0.2.0](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0). We include `sanitized_samples_calibrated.zip` for your convenience.
+* See the attachment of our [v0.2.0.post3](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.0.post3). We include `sanitized_samples_calibrated.zip` for your convenience.
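As a companion to the `Complete`/`Instruct` split description added to the README above, here is a hedged sketch of inspecting both prompt styles. The dataset id, version split, and field names below are assumptions based on this repository's conventions; the authoritative values are `BIGCODEBENCH_HF` and `BIGCODEBENCH_VERSION` in `bigcodebench/data/bigcodebench.py`.

```python
from datasets import load_dataset

# Assumed dataset id and version split; mirrors load_dataset(BIGCODEBENCH_HF + extra,
# split=BIGCODEBENCH_VERSION) from bigcodebench/data/bigcodebench.py.
ds = load_dataset("bigcode/bigcodebench", split="v0.1.2")

task = ds[0]
# `Complete` feeds the full docstring-style prompt; `Instruct` feeds a shorter
# natural-language instruction (field names assumed).
print(task["complete_prompt"][:300])
print(task["instruct_prompt"][:300])
```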
 
 ## Advanced Usage
 
diff --git a/analysis/utils.py b/analysis/utils.py
index ce81bd6..4cd9862 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -1133,24 +1133,24 @@
         "act_param": 9,
         "open-data": "None",
     },
-    "mattshumer/ref_70_e3_prefill": {
-        "name": "Reflection-Llama-3.1-70B",
-        "link": "https://huggingface.co/mattshumer/ref_70_e3",
-        "prompted": True,
-        "moe": False,
-        "size": 70,
-        "act_param": 70,
-        "open-data": "None",
-    },
-    "mattshumer/ref_70_e3": {
-        "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
-        "link": "https://huggingface.co/mattshumer/ref_70_e3",
-        "prompted": True,
-        "moe": False,
-        "size": 70,
-        "act_param": 70,
-        "open-data": "None",
-    },
+    # "mattshumer/ref_70_e3_prefill": {
+    #     "name": "Reflection-Llama-3.1-70B",
+    #     "link": "https://huggingface.co/mattshumer/ref_70_e3",
+    #     "prompted": True,
+    #     "moe": False,
+    #     "size": 70,
+    #     "act_param": 70,
+    #     "open-data": "None",
+    # },
+    # "mattshumer/ref_70_e3": {
+    #     "name": "Reflection-Llama-3.1-70B (Recommended Settings)",
+    #     "link": "https://huggingface.co/mattshumer/ref_70_e3",
+    #     "prompted": True,
+    #     "moe": False,
+    #     "size": 70,
+    #     "act_param": 70,
+    #     "open-data": "None",
+    # },
     "o1-preview-2024-09-12": {
         "name": "o1-Preview-2024-09-12 (temperature=1)",
         "link": "https://o1.ai/o1-preview",
@@ -1277,4 +1277,58 @@
         "act_param": 3,
         "open-data": "None",
     },
-}
\ No newline at end of file
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": {
+        "name": "Llama-3.1-Nemotron-70B-Instruct",
+        "link": "https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+        "prompted": True,
+        "moe": False,
+        "size": 70,
+        "act_param": 70,
+        "open-data": "Partial",
+    },
+    "claude-3-5-sonnet-20241022": {
+        "name": "Claude-3.5-Sonnet-20241022",
+        "link": "https://claude.ai/",
+        "prompted": True,
+        "moe": False,
+        "size": None,
+        "act_param": None,
+        "open-data": "None",
+    },
+    "ibm-granite/granite-3.0-8b-instruct": {
+        "name": "Granite-3.0-8B-Instruct",
+        "link": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct",
+        "prompted": True,
+        "moe": False,
+        "size": 8,
+        "act_param": 8,
+        "open-data": "None",
+    },
+    "ibm-granite/granite-3.0-2b-instruct": {
+        "name": "Granite-3.0-2B-Instruct",
+        "link": "https://huggingface.co/ibm-granite/granite-3.0-2b-instruct",
+        "prompted": True,
+        "moe": False,
+        "size": 2,
+        "act_param": 2,
+        "open-data": "None",
+    },
+    "grok-beta--main": {
+        "name": "Grok-Beta",
+        "link": "https://grok.com/",
+        "prompted": True,
+        "moe": False,
+        "size": None,
+        "act_param": None,
+        "open-data": "None",
+    },
+    "claude-3-5-haiku-20241022--main": {
+        "name": "Claude-3.5-Haiku-20241022",
+        "link": "https://claude.ai/",
+        "prompted": True,
+        "moe": False,
+        "size": None,
+        "act_param": None,
+        "open-data": "None",
+    },
+}
diff --git a/bigcodebench/data/bigcodebench.py b/bigcodebench/data/bigcodebench.py
index da2ad5d..9a3ee9d 100644
--- a/bigcodebench/data/bigcodebench.py
+++ b/bigcodebench/data/bigcodebench.py
@@ -26,14 +26,8 @@ def _ready_bigcodebench_path(subset="full", version="default") -> str:
     )
 
     extra = "-" + subset if subset != "full" else ""
-
-    try:
-        dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
-        make_cache(url, dataset, path)
-    except:
-        if os.path.exists(path):
-            os.remove(path)
-        make_cache(url, None, path, gh=True)
+    dataset = load_dataset(BIGCODEBENCH_HF+extra, split=BIGCODEBENCH_VERSION)
+    make_cache(url, dataset, path)
 
     return path
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index df8ad85..590d1ae 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -115,11 +115,12 @@ def evaluate(
     split: str,
     subset: str,
     samples: Optional[str] = None,
+    no_execute: bool = False,
     local_execute: bool = False,
     remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
     pass_k: str = "1,5,10",
     save_pass_rate: bool = True,
-    parallel: int = None,
+    parallel: int = -1,
     min_time_limit: float = 1,
     max_as_limit: int = 30*1024,
     max_data_limit: int = 30*1024,
@@ -135,6 +136,10 @@ def evaluate(
             subset=subset,
             **model_kwargs,
         )
+
+    if no_execute:
+        return
+
     assert samples is not None, "No samples provided"
 
     if os.path.isdir(samples):
@@ -167,7 +172,7 @@ def evaluate(
 
     pass_k = [int(k) for k in pass_k.split(",")]
 
-    if parallel is None:
+    if parallel < 1:
         n_workers = max(1, multiprocessing.cpu_count() // 2)
     else:
         n_workers = parallel
@@ -233,7 +238,7 @@ def evaluate(
                     if "solution" in sample
                     else problems[task_id]["complete_prompt"] + sample["completion"]
                 )
-                if "sanitized-calibrated" in samples:
+                if "sanitized_calibrated" in samples:
                     solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
                 remainings.add(sample["_identifier"])
                 args = (
@@ -254,22 +259,22 @@ def evaluate(
         assert n_samples == len(remainings), "Missing problems in unfinished"
         assert len(completion_id) == len(problems), "Missing problems in samples"
 
-    def stucking_checker():
-        while remainings:
-            last_size = len(remainings)
-            time.sleep(240)
-            if last_size != len(remainings) or len(remainings) == 0:
-                continue
-            # Potential stucking
-            warn("No samples had finished testing in the last 240s")
-            warn(f"{len(remainings)} samples to be tested: {remainings}")
+        def stucking_checker():
+            while remainings:
+                last_size = len(remainings)
+                time.sleep(240)
+                if last_size != len(remainings) or len(remainings) == 0:
+                    continue
+                # Potential stucking
+                warn("No samples had finished testing in the last 240s")
+                warn(f"{len(remainings)} samples to be tested: {remainings}")
 
-    threading.Thread(target=stucking_checker).start()
+        threading.Thread(target=stucking_checker).start()
 
-    for future in tqdm(as_completed(futures), total=n_samples):
-        result = future.result()
-        remainings.remove(result["_identifier"])
-        eval_results[result["task_id"]].append(result)
+        for future in tqdm(as_completed(futures), total=n_samples):
+            result = future.result()
+            remainings.remove(result["_identifier"])
+            eval_results[result["task_id"]].append(result)
 
     # sort the results for each problem by completion_id
     for task_id, task_results in eval_results.items():
@@ -307,7 +312,7 @@ def stucking_checker():
     pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
     pass_at_k["split"] = split
     pass_at_k["subset"] = subset
-    pass_at_k["calibrated"] = "sanitized-calibrated" in samples
+    pass_at_k["calibrated"] = "sanitized_calibrated" in samples
     pass_at_k["gt_pass_rate"] = gt_pass_rate
     pass_at_k["failed_tasks"] = failed_tasks
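Since `--pass_k` controls the metrics assembled in the hunks above, here is a reference sketch of the standard unbiased Pass@k estimator (Chen et al., 2021) that HumanEval-style harnesses such as this one typically use; it is shown for orientation only and is not code from this patch:

```python
import math
from typing import List

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased Pass@k for one task: n samples generated, c of them correct."""
    if n - c < k:
        return 1.0
    # 1 - C(n - c, k) / C(n, k), computed as a numerically stable product
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))

def aggregate(results: List[dict], k: int) -> float:
    """Average Pass@k over tasks, given per-task sample counts and correct counts."""
    return sum(pass_at_k(r["n"], r["c"], k) for r in results) / len(results)

print(round(pass_at_k(n=10, c=3, k=1), 4))  # 0.3
```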
@@ -365,7 +370,7 @@ def stucking_checker():
         print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
         decision = input()
         if decision.lower() == "y":
-            new_path = result_path + ".bak"
+            new_path = pass_at_k_path + ".bak"
             while os.path.isfile(new_path):
                 new_path += ".bak"
             os.rename(pass_at_k_path, new_path)
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 8a88842..8e696b4 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -5,13 +5,16 @@
 
 def make_request(
-    client: genai.GenerativeModel, temperature, messages, max_new_tokens=2048
+    client: genai.GenerativeModel,
+    message: str,
+    temperature: float,
+    n: int,
+    max_new_tokens: int = 2048,
 ) -> genai.types.GenerateContentResponse:
-    messages = [{"role": m["role"], "parts": [m["content"]]} for m in messages]
     response = client.generate_content(
-        messages,
+        [{"role": "user", "parts": [message]}],
         generation_config=genai.types.GenerationConfig(
-            candidate_count=1,
+            candidate_count=n,
             max_output_tokens=max_new_tokens,
             temperature=temperature,
         ),
@@ -23,7 +26,7 @@ def make_request(
         ],
     )
 
-    return response.text
+    return response
 
 
 def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse:
diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py
index e347ffe..a745d8d 100644
--- a/bigcodebench/gen/util/openai_request.py
+++ b/bigcodebench/gen/util/openai_request.py
@@ -1,4 +1,3 @@
-import signal
 import time
 
 import openai
@@ -14,53 +13,38 @@ def make_request(
     n: int = 1,
     **kwargs
 ) -> ChatCompletion:
-    system_msg = "You are a helpful assistant good at coding."
-    if (
-        kwargs.get("response_format", None)
-        and kwargs["response_format"]["type"] == "json_object"
-    ):
-        system_msg = "You are a helpful assistant designed to output JSON."
-
+    kwargs["top_p"] = 0.95
+    kwargs["max_completion_tokens"] = max_tokens
+    if model.startswith("o1-"):  # pop top-p and max_completion_tokens
+        kwargs.pop("top_p")
+        kwargs.pop("max_completion_tokens")
+
     return client.chat.completions.create(
         model=model,
         messages=[
-            {"role": "system", "content": system_msg},
             {"role": "user", "content": message},
         ],
-        max_tokens=max_tokens,
         temperature=temperature,
         n=n,
         **kwargs
     )
 
 
-def handler(signum, frame):
-    # swallow signum and frame
-    raise Exception("end of time")
-
-
 def make_auto_request(*args, **kwargs) -> ChatCompletion:
     ret = None
     while ret is None:
         try:
-            signal.signal(signal.SIGALRM, handler)
-            signal.alarm(100)
             ret = make_request(*args, **kwargs)
-            signal.alarm(0)
         except openai.RateLimitError:
             print("Rate limit exceeded. Waiting...")
-            signal.alarm(0)
             time.sleep(5)
         except openai.APIConnectionError:
             print("API connection error. Waiting...")
-            signal.alarm(0)
             time.sleep(5)
         except openai.APIError as e:
             print(e)
-            signal.alarm(0)
         except Exception as e:
             print("Unknown error. Waiting...")
             print(e)
-            signal.alarm(0)
             time.sleep(1)
-    return ret
+    return ret
\ No newline at end of file
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index 6333261..757b08c 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -64,7 +64,7 @@ def codegen(
                 if id_num < low:
                     p.console.print(f"Skipping {task_id} as it is not in {id_range}")
                     continue
-                if id_num > id_range[1]:
+                if id_num >= id_range[1]:
                     break
 
             p_name = task_id.replace("/", "_")
@@ -135,10 +135,13 @@ def run_codegen(
     strip_newlines: bool = False,
     direct_completion: bool = False,
     resume: bool = True,
-    id_range: Tuple[int, int] = None,
+    id_range: str = None,
     backend: str = "vllm",
     base_url: str = None,
     tp: int = 1,
+    instruction_prefix: str = None,
+    response_prefix: str = None,
+    revision: str = "main",
     trust_remote_code: bool = False,
     tokenizer_name: str = None,
     tokenizer_legacy: bool = False,
@@ -151,6 +154,7 @@ def run_codegen(
         print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0")
 
     if id_range is not None:
+        id_range = [int(i) for i in id_range.split("-")]
         assert len(id_range) == 2, "id_range must be a list of length 2"
         assert id_range[0] < id_range[1], "id_range must be increasing"
         id_range = tuple(id_range)
@@ -158,8 +162,10 @@ def run_codegen(
     # Make project dir
     os.makedirs(root, exist_ok=True)
 
-    instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
-    response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
+    if instruction_prefix is None:
+        instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
+    if response_prefix is None:
+        response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
 
     # Make dir for codes generated by each model
     model_runner = make_model(
@@ -173,6 +179,7 @@ def run_codegen(
         response_prefix=response_prefix,
         base_url=base_url,
         tp=tp,
+        revision=revision,
         trust_remote_code=trust_remote_code,
         direct_completion=direct_completion,
         tokenizer_name=tokenizer_name,
@@ -180,7 +187,7 @@ def run_codegen(
     )
 
     extra = "-" + subset if subset != "full" else ""
-    identifier = model.replace("/", "--") + f"--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
+    identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
 
     target_path = os.path.join(root, identifier)
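To make the new `identifier` format above concrete, here is a small sketch that mirrors the f-string in `run_codegen`; the model id, revision, and sampling settings are illustrative values only:

```python
def build_identifier(model: str, revision: str, subset: str, split: str,
                     backend: str, temperature: float, n_samples: int) -> str:
    # Mirrors run_codegen(): the HF model id is flattened and the revision is now embedded.
    extra = "-" + subset if subset != "full" else ""
    return (model.replace("/", "--")
            + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl")

# Example (illustrative values only):
print(build_identifier("bigcode/starcoder2-15b", "main", "full", "complete", "vllm", 0.0, 1))
# bigcode--starcoder2-15b--main--bigcodebench-complete--vllm-0.0-1-sanitized_calibrated.jsonl
```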
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 67123f9..ef19f4e 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -12,6 +12,8 @@ def make_model(
     # instruction model only
     instruction_prefix: str = None,
     response_prefix: str = None,
+    # vllm and hf only
+    revision: str = "main",
     # vllm only
     tp: int = 1,
     direct_completion: bool = False,
@@ -32,11 +34,15 @@ def make_model(
             split=split,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
+            revision=revision,
             dataset=dataset,
             direct_completion=direct_completion,
             tp=tp,
             instruction_prefix=instruction_prefix,
             response_prefix=response_prefix,
+            trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
+            tokenizer_legacy=tokenizer_legacy,
         )
     elif backend == "hf":
         from bigcodebench.provider.hf import HuggingFaceDecoder
@@ -47,11 +53,15 @@ def make_model(
             split=split,
             temperature=temperature,
             max_new_tokens=max_new_tokens,
+            revision=revision,
             dataset=dataset,
             direct_completion=direct_completion,
             instruction_prefix=instruction_prefix,
             response_prefix=response_prefix,
             attn_implementation=attn_implementation,
+            trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
+            tokenizer_legacy=tokenizer_legacy,
         )
     elif backend == "openai":
         from bigcodebench.provider.openai import OpenAIChatDecoder
diff --git a/bigcodebench/provider/base.py b/bigcodebench/provider/base.py
index ebec843..5a24b59 100644
--- a/bigcodebench/provider/base.py
+++ b/bigcodebench/provider/base.py
@@ -12,6 +12,7 @@ def __init__(
         split: str,
         temperature: float = 0.8,
         max_new_tokens: int = 1280,
+        revision: str = "main",
         dtype: str = "bfloat16",  # default
         direct_completion: bool = False,
         trust_remote_code: bool = False,
@@ -29,6 +30,7 @@ def __init__(
         self.skip_special_tokens = False
         self.max_new_tokens = max_new_tokens
         self.dtype = dtype
+        self.revision = revision
         self.direct_completion = direct_completion
         self.trust_remote_code = trust_remote_code
         self.tokenizer_name = tokenizer_name
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index 0cd5416..2194c47 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -35,11 +35,10 @@ def codegen(
             )
             ret = make_auto_request(
                 self.client,
-                message,
-                self.name,
+                message=message,
                 n=num_samples,
-                max_tokens=self.max_new_tokens,
                 temperature=self.temperature,
+                max_new_tokens=self.max_new_tokens,
             )
             for candidate in ret.candidates:
                 parts = candidate.content.parts
diff --git a/bigcodebench/provider/hf.py b/bigcodebench/provider/hf.py
index c3136c8..a85957d 100644
--- a/bigcodebench/provider/hf.py
+++ b/bigcodebench/provider/hf.py
@@ -27,6 +27,7 @@ def __init__(
             "trust_remote_code": self.trust_remote_code,
             "torch_dtype": getattr(torch, self.dtype),
             "attn_implementation": attn_implementation,  # "eager", "flash_attention_2", "sdpa"
+            "revision": self.revision,
         }
 
         self.skip_special_tokens = True
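The `revision` argument threaded through `make_model`, `DecoderBase`, and the HF/vLLM kwargs above ends up pinning a specific model snapshot at load time. A minimal, hedged sketch of that effect using standard `transformers` arguments; the model id and revision below are placeholders, not values from this patch:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model id and revision; any HF repo with tagged revisions works the same way.
name, revision = "bigcode/starcoder2-3b", "main"

kwargs = {
    "revision": revision,            # same key the HF/vLLM decoders now forward
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": False,
}
tokenizer = AutoTokenizer.from_pretrained(name, revision=revision)
model = AutoModelForCausalLM.from_pretrained(name, **kwargs)
```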
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 9eba02e..91c1882 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -1,48 +1,63 @@
 import os
 from typing import List
-from tqdm import tqdm
 
 import openai
 
-from bigcodebench.provider.base import DecoderBase
 from bigcodebench.gen.util.openai_request import make_auto_request
 from bigcodebench.provider.utility import make_raw_chat_prompt
+from bigcodebench.provider.base import DecoderBase
+from bigcodebench.provider.utility import concurrent_call
 
 class OpenAIChatDecoder(DecoderBase):
     def __init__(self, name: str, base_url=None, **kwargs) -> None:
         super().__init__(name, **kwargs)
-        self.client = openai.OpenAI(
-            api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
-        )
-
+        self.base_url = base_url
+
     def codegen(
         self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
     ) -> List[str]:
         if do_sample:
             assert self.temperature > 0, "Temperature must be positive for sampling"
+        messages = [make_raw_chat_prompt(
+            task_prompt=prompt,
+            subset=self.subset,
+            split=self.split,
+            instruction_prefix=self.instruction_prefix,
+            response_prefix=self.response_prefix,
+            tokenizer=None,
+        ) for prompt in prompts]
+        # use concurrency based batching for o1 and deepseek models
+        if self.name.startswith("o1-") or self.name == "deepseek-chat":
+            return self._codegen_batch_via_concurrency(messages, num_samples)
+
+        return self._codegen_api_batch(messages, num_samples)
+
+    def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]:
+        client = openai.OpenAI(
+            api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url
+        )
+
         all_outputs = []
-        for prompt in tqdm(prompts):
-            outputs = []
-            message = make_raw_chat_prompt(
-                task_prompt=prompt,
-                subset=self.subset,
-                split=self.split,
-                instruction_prefix=self.instruction_prefix,
-                response_prefix=self.response_prefix,
-                tokenizer=None,
-            )
+        for message in messages:
             ret = make_auto_request(
-                self.client,
+                client,
                 message=message,
                 model=self.name,
                 max_tokens=self.max_new_tokens,
                 temperature=self.temperature,
                 n=num_samples,
             )
+            outputs = []
             for item in ret.choices:
                 outputs.append(item.message.content)
             all_outputs.append(outputs)
         return all_outputs
 
+    def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]:
+        batches = concurrent_call(
+            num_samples, self._codegen_api_batch, messages, num_samples=1
+        )
+        return [b[0] for b in batches]
+
     def is_direct_completion(self) -> bool:
         return False
\ No newline at end of file
diff --git a/bigcodebench/provider/utility.py b/bigcodebench/provider/utility.py
index 60a00e5..bb27539 100644
--- a/bigcodebench/provider/utility.py
+++ b/bigcodebench/provider/utility.py
@@ -1,5 +1,6 @@
 from typing import List
 from transformers import AutoTokenizer
+from concurrent.futures import ThreadPoolExecutor
 
 EOS = [
     "<|endoftext|>",
@@ -64,4 +65,10 @@ def make_raw_chat_prompt(
         ],
         tokenize=False,
     ).split(_MAGIC_SPLITTER_)[0]
-    return task_prompt
\ No newline at end of file
+    return task_prompt
+
+
+def concurrent_call(n, callback, /, *args, **kwargs):
+    with ThreadPoolExecutor(max_workers=n) as executor:
+        futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)]
+        return [future.result() for future in futures]
\ No newline at end of file
diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py
index 3d0aaf4..171a41c 100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -18,6 +18,7 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
             "tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)),
             "dtype": self.dtype,
             "trust_remote_code": self.trust_remote_code,
+            "revision": self.revision,
         }
         if self.tokenizer_name is None:
             self.tokenizer_name = self.name
diff --git a/run.sh b/run.sh
index a84199e..c069e8e 100755
--- a/run.sh
+++ b/run.sh
@@ -9,5 +9,4 @@ bigcodebench.evaluate \
   --model $MODEL \
   --split $SPLIT \
   --subset $SUBSET \
-  --backend $BACKEND \
-  --tp $NUM_GPU
\ No newline at end of file
+  --backend $BACKEND
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index a9b7c74..4897f68 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,9 +29,6 @@ install_requires =
     wget>=3.2
    datasets
     gradio-client
-
-[options.extras_require]
-generate =
     vllm
     numpy
     rich
@@ -48,4 +45,4 @@ console_scripts =
     bigcodebench.syncheck = bigcodebench.syncheck:main
     bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main
     bigcodebench.generate = bigcodebench.generate:main
-    bigcodebench.inspect = bigcodebench.inspect:main
\ No newline at end of file
+    bigcodebench.inspect = bigcodebench.inspect:main
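Finally, a small usage sketch of the new `concurrent_call` helper, mirroring how `_codegen_batch_via_concurrency` fans a request for several samples out into single-sample calls; `fake_api_batch` is an illustrative stand-in, not part of the patch:

```python
from concurrent.futures import ThreadPoolExecutor

# Same helper added to bigcodebench/provider/utility.py: run `callback` n times in
# parallel threads and collect the results in submission order.
def concurrent_call(n, callback, /, *args, **kwargs):
    with ThreadPoolExecutor(max_workers=n) as executor:
        futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)]
        return [future.result() for future in futures]

# Illustrative stand-in for a single-sample API call: one output list per message.
def fake_api_batch(messages, num_samples=1):
    return [[f"sample for: {m}"] for m in messages]

batches = concurrent_call(5, fake_api_batch, ["task A", "task B"], num_samples=1)
print(len(batches), batches[0])  # 5 [['sample for: task A'], ['sample for: task B']]
```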