From 89309066c6e4e590c8a20c1392d504cb9e68917a Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Feb 2025 21:31:42 +0800 Subject: [PATCH 01/17] feat: support anthropic extended thinking --- bigcodebench/gen/util/anthropic_request.py | 13 ++++++++++++- bigcodebench/generate.py | 11 ++++++++++- bigcodebench/provider/__init__.py | 7 ++++++- bigcodebench/provider/anthropic.py | 6 +++++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e53feab..e240dee 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -16,7 +16,18 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: try: signal.signal(signal.SIGALRM, handler) signal.alarm(100) - ret = client.messages.create(*args, **kwargs) + if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: + ret = client.beta.messages.create( + *args, + **kwargs, + thinking = { + "type": "enabled", + "budget": kwargs["reasoning_budget"], + }, + betas=[kwargs["reasoning_beta"]] + ) + else: + ret = client.messages.create(*args, **kwargs) signal.alarm(0) except anthropic.RateLimitError: print("Rate limit exceeded. Waiting...") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index bcf1463..9823d0c 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,7 +132,11 @@ def run_codegen( temperature: float = 0.0, max_new_tokens: int = 1280, greedy: bool = False, + # openai reasoning_effort: str = "medium", + # anthropic + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", strip_newlines: bool = False, direct_completion: bool = False, resume: bool = True, @@ -173,6 +177,8 @@ def run_codegen( temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, prefill=not skip_prefill, @@ -186,8 +192,11 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): + if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + + if backend == "anthropic" and reasoning_budget and reasoning_beta: + model = model + f"--{reasoning_budget}-{reasoning_beta}" if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index c78d870..f76ec29 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -9,8 +9,11 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, - # o1 and o3 only + # openai only reasoning_effort: str = "medium", + # anthropic only + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", # instruction model only instruction_prefix: str = None, response_prefix: str = None, @@ -118,6 +121,8 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, ) 
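Note: the Anthropic request path introduced in this patch is refined by patches 02-04 below. A minimal sketch of the call shape the series converges on, assuming the `anthropic` SDK's beta messages API; the model name and token budgets here are illustrative:

    import os
    import anthropic

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
    # The beta API expects "budget_tokens" (not "budget"); temperature is
    # dropped because extended thinking rejects a custom temperature, and the
    # response is streamed.
    stream = client.beta.messages.create(
        model="claude-3-7-sonnet-20250219",  # illustrative
        max_tokens=12800,
        messages=[{"role": "user", "content": "..."}],
        thinking={"type": "enabled", "budget_tokens": 3200},
        betas=["output-128k-2025-02-19"],
        stream=True,
    )
    text = ""
    for chunk in stream:
        # patch 04 keeps only the text deltas, discarding thinking deltas
        if chunk.type == "content_block_delta" and chunk.delta.type == "text_delta":
            text += chunk.delta.text
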
diff --git a/bigcodebench/provider/anthropic.py index 1969e0c..1612456 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -9,9 +9,11 @@ from bigcodebench.provider.utility import make_raw_chat_prompt class AnthropicDecoder(DecoderBase): - def __init__(self, name: str, **kwargs) -> None: + def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None: super().__init__(name, **kwargs) self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) + self.reasoning_budget = reasoning_budget + self.reasoning_beta = reasoning_beta def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -43,6 +45,8 @@ def codegen( max_tokens=self.max_new_tokens, temperature=self.temperature, stop_sequences=self.eos, + reasoning_budget=self.reasoning_budget, + reasoning_beta=self.reasoning_beta, ) outputs.append(ret.content[0].text) all_outputs.append(outputs) From c05694cde596c9728664dbab2c8bed5e5ea9c036 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Feb 2025 21:41:23 +0800 Subject: [PATCH 02/17] fix: remove unused args --- bigcodebench/gen/util/anthropic_request.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e240dee..20ce444 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -17,15 +17,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: signal.signal(signal.SIGALRM, handler) signal.alarm(100) if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: - ret = client.beta.messages.create( - *args, - **kwargs, - thinking = { - "type": "enabled", - "budget": kwargs["reasoning_budget"], - }, - betas=[kwargs["reasoning_beta"]] - ) + kwargs["thinking"] = { + "type": "enabled", + "budget": kwargs["reasoning_budget"], + } + kwargs["betas"] = [kwargs["reasoning_beta"]] + kwargs.pop("reasoning_budget") + kwargs.pop("reasoning_beta") + ret = client.beta.messages.create(*args, **kwargs) else: ret = client.messages.create(*args, **kwargs) signal.alarm(0) From 57eb973f34666067287cbb05e1845e16b87b5e26 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 26 Feb 2025 00:57:31 +0800 Subject: [PATCH 03/17] fix: correctly process anthropic streaming --- bigcodebench/gen/util/anthropic_request.py | 6 ++++-- bigcodebench/provider/anthropic.py | 12 +++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index 20ce444..f6d18fd 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -19,12 +19,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: kwargs["thinking"] = { "type": "enabled", - "budget": kwargs["reasoning_budget"], + "budget_tokens": kwargs["reasoning_budget"], } kwargs["betas"] = [kwargs["reasoning_beta"]] kwargs.pop("reasoning_budget") kwargs.pop("reasoning_beta") - ret = client.beta.messages.create(*args, **kwargs) + kwargs.pop("temperature") + if "thinking" in kwargs: + ret = client.beta.messages.create(*args, **kwargs, stream=True) else: ret = client.messages.create(*args, **kwargs) signal.alarm(0) diff --git a/bigcodebench/provider/anthropic.py 
b/bigcodebench/provider/anthropic.py index 1612456..59aec09 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -48,7 +48,17 @@ def codegen( reasoning_budget=self.reasoning_budget, reasoning_beta=self.reasoning_beta, ) - outputs.append(ret.content[0].text) + if isinstance(ret, anthropic.Stream): + output = "" + for chunk in ret: + if chunk.type == "content_block_delta": + if chunk.delta.type == "thinking_delta": + output += chunk.delta.thinking + elif chunk.delta.type == "text_delta": + output += chunk.delta.text + outputs.append(output) + else: + outputs.append(ret.content[0].text) all_outputs.append(outputs) return all_outputs From 78dceb21430359efa05c235324e10523453d7d2f Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 26 Feb 2025 01:02:05 +0800 Subject: [PATCH 04/17] fix: only append text output --- bigcodebench/provider/anthropic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py index 59aec09..b4a7e43 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -52,9 +52,9 @@ def codegen( output = "" for chunk in ret: if chunk.type == "content_block_delta": - if chunk.delta.type == "thinking_delta": - output += chunk.delta.thinking - elif chunk.delta.type == "text_delta": + # if chunk.delta.type == "thinking_delta": + # output += chunk.delta.thinking + if chunk.delta.type == "text_delta": output += chunk.delta.text outputs.append(output) else: From 05b7f1f93355f2e64cc3576c4dd1f6c2dbdeab67 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 19:27:56 +0800 Subject: [PATCH 05/17] doc: fix endpoints --- ADVANCED_USAGE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 4f48eca..9bb81b8 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -69,7 +69,8 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False` - `--samples`: The path to the generated samples file, default to `None` - `--no_execute`: Whether to not execute the samples, default to `False` -- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--e2b_endpoint`: The API endpoint for remote execution, default to `bigcodebench_evaluator`, you can also use your own E2B API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--gradio_endpoint`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. 
`--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` - `--calibrated`: Whether to use the calibrated samples, default to `True` - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True` From 0ecd667f74cd5f789b36e22dc8564f0fc1c09884 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:30:39 +0800 Subject: [PATCH 06/17] update the results analysis script --- analysis/get_results.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/analysis/get_results.py b/analysis/get_results.py index fc5aa17..607615a 100755 --- a/analysis/get_results.py +++ b/analysis/get_results.py @@ -118,12 +118,12 @@ def check_valid(results): def split_gen(): - shutil.rmtree("sanitized_samples", ignore_errors=True) shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) - os.makedirs("sanitized_samples/complete", exist_ok=True) - os.makedirs("sanitized_samples/instruct", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) + for model, info in model_info.items(): model = model.replace("/", "--") files = glob(f"results/{model}--bigcodebench-*.jsonl") @@ -131,27 +131,21 @@ def split_gen(): model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") for file in files: + if "-sanitized" not in file or "calibrated" not in file: + continue + _, suffix = os.path.basename(file).split("--bigcodebench-") with open(file, "r") as f: data = f.readlines() - if "-sanitized" in file: - if "calibrated" in file: - if info["prompted"]: - if suffix.startswith("complete"): - with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) + split_type = "hard" if "-hard-" in file else "full" + if info["prompted"]: + if suffix.startswith("complete") or suffix.startswith("hard-complete"): + with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) else: - if suffix.startswith("complete"): - with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - + with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) def read_task_perf(tids, task="complete"): model_results = dict() @@ -302,7 +296,7 @@ def get_perf_df(data_dict): if __name__ == "__main__": - # split_gen() + split_gen() bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") bcb_config = { From f087e3b03ce1df72cf889b201b421bd90346d445 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:31:18 +0800 Subject: [PATCH 07/17] doc: add new model outputs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94ad2ef..d3913d9 100755 --- a/README.md +++ 
b/README.md @@ -187,7 +187,7 @@ Please make sure your HF access token has the `Make calls to inference providers ## 💻 LLM-generated Code We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: -* See the attachment of our [v0.2.1.post7](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.1.post7). We include `sanitized_samples_calibrated.zip` for your convenience. +* See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. ## 🧑 Advanced Usage From 6d967338737d4fa02cb2a8d19207528278282321 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:42:32 +0800 Subject: [PATCH 08/17] feat: support vllm lora --- bigcodebench/generate.py | 2 ++ bigcodebench/provider/__init__.py | 3 +++ bigcodebench/provider/vllm.py | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 9823d0c..c5fa368 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -127,6 +127,7 @@ def run_codegen( split: str, subset: str, root: str = "bcb_results", + lora_path: str = None, bs: Optional[int] = None, n_samples: int = 1, temperature: float = 0.0, @@ -174,6 +175,7 @@ def run_codegen( backend=backend, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index f76ec29..202d049 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -6,6 +6,7 @@ def make_model( backend: str, subset: str, split: str, + lora_path: str = None, dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, @@ -38,6 +39,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, @@ -58,6 +60,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index cc928e4..570d4c5 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -3,6 +3,8 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest +from huggingface_hub import snapshot_download from bigcodebench.provider.base import DecoderBase from bigcodebench.provider.utility import ( @@ -11,7 +13,7 @@ ) class VllmDecoder(DecoderBase): - def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: + def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None: super().__init__(name, **kwargs) kwargs = { @@ -29,7 +31,17 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: else: if self.prefill and "```" in self.response_prefix: self.eos += ["\n```\n"] - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, **kwargs) + + self.lora_request = None + if lora_path: + local_lora_path = snapshot_download(lora_path) + self.lora_request = LoRARequest( + "lora", + 1, + local_lora_path, + ) + + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) 
self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: @@ -64,6 +76,7 @@ def codegen( stop=self.eos, skip_special_tokens=self.skip_special_tokens, ), + lora_request=self.lora_request, use_tqdm=True, ) From 82fc40dfe33381b8bdbe5c695414afa5a543ba16 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:50:48 +0800 Subject: [PATCH 09/17] fix: vllm lora attribute --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 570d4c5..25f00b4 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From d37847db62972decb626645699e403ed237b0d73 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:57:21 +0800 Subject: [PATCH 10/17] fix: customize lora output file --- bigcodebench/generate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index c5fa368..87b67ea 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -197,9 +197,12 @@ def run_codegen( if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + if lora_path: + model = model + f"--lora-{lora_path}" + if backend == "anthropic" and reasoning_budget and reasoning_beta: model = model + f"--{reasoning_budget}-{reasoning_beta}" - + if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" else: From fa21527b1fdd727fd6f629408e16a65813231823 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:02:05 +0800 Subject: [PATCH 11/17] feat: add model release date --- analysis/utils.py | 252 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 250 insertions(+), 2 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index 430e113..ec774c7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -8,6 +8,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-12-04", }, "bigcode/starcoder2-15b-instruct-v0.1": { "name": "StarCoder2-15B-Instruct-v0.1", @@ -18,6 +19,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-04-30" }, "bigcode/starcoder2-3b": { "name": "StarCoder2-3B", @@ -28,6 +30,7 @@ "act_param": 3, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-7b": { "name": "StarCoder2-7B", @@ -38,6 +41,7 @@ "act_param": 7, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-15b": { "name": "StarCoder2-15B", @@ -48,6 +52,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "Qwen/CodeQwen1.5-7B": { "name": "CodeQwen1.5-7B", @@ -58,6 +63,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "google/codegemma-2b": { "name": "CodeGemma-2B", @@ -68,6 +74,7 @@ "act_param": 2, 
"open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b": { "name": "CodeGemma-7B", @@ -78,6 +85,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b-it": { "name": "CodeGemma-7B-Instruct", @@ -88,6 +96,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "gpt-3.5-turbo-0125": { "name": "GPT-3.5-Turbo-0125", @@ -98,6 +107,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-01-25" }, "gpt-4o": { "name": "GPT-4o-2024-05-13", @@ -108,6 +118,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "gpt-4-turbo-2024-04-09": { "name": "GPT-4-Turbo-2024-04-09", @@ -118,6 +129,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-04-09" }, "gpt-4-0613": { "name": "GPT-4-0613", @@ -128,6 +140,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-06-13" }, "codellama/CodeLlama-7b-hf": { "name": "CodeLlama-7B-Base", @@ -138,6 +151,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-hf": { "name": "CodeLlama-13B-Base", @@ -148,6 +162,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-7b-Instruct-hf": { "name": "CodeLlama-7B-Instruct", @@ -158,6 +173,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-Instruct-hf": { "name": "CodeLlama-13B-Instruct", @@ -168,6 +184,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "mistral-large-2402": { "name": "Mistral-Large-2402", @@ -178,6 +195,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistral-small-2402": { "name": "Mistral-Small-2402", @@ -188,6 +206,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistralai/Mixtral-8x22B-v0.1": { "name": "Mixtral-8x22B-Base", @@ -198,6 +217,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "mistralai/Mixtral-8x22B-Instruct-v0.1": { "name": "Mixtral-8x22B-Instruct", @@ -208,6 +228,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "codellama/CodeLlama-34b-hf": { "name": "CodeLlama-34B-Base", @@ -218,6 +239,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-34b-Instruct-hf": { "name": "CodeLlama-34B-Instruct", @@ -228,6 +250,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-hf": { "name": "CodeLlama-70B-Base", @@ -238,6 +261,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-Instruct-hf": { "name": "CodeLlama-70B-Instruct", @@ -248,6 +272,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "Qwen/CodeQwen1.5-7B-Chat": { "name": "CodeQwen1.5-7B-Chat", @@ -258,6 +283,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "Qwen/Qwen1.5-110B-Chat": { "name": "Qwen1.5-110B-Chat", @@ -268,6 +294,7 @@ "act_param": 110, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-72B-Chat": { "name": "Qwen1.5-72B-Chat", @@ -278,6 +305,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-32B-Chat": { 
"name": "Qwen1.5-32B-Chat", @@ -288,6 +316,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "deepseek-ai/DeepSeek-V2-Chat": { "name": "DeepSeek-V2-Chat", @@ -298,6 +327,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-04-06" }, "deepseek-ai/deepseek-coder-1.3b-base": { "name": "DeepSeek-Coder-1.3B-Base", @@ -308,6 +338,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-1.3b-instruct": { "name": "DeepSeek-Coder-1.3B-Instruct", @@ -318,6 +349,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-base": { "name": "DeepSeek-Coder-33B-Base", @@ -328,6 +360,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-instruct": { "name": "DeepSeek-Coder-33B-Instruct", @@ -338,6 +371,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-base": { "name": "DeepSeek-Coder-6.7B-Base", @@ -348,6 +382,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { "name": "DeepSeek-Coder-6.7B-Instruct", @@ -358,6 +393,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "meta-llama/Meta-Llama-3-70B": { "name": "Llama-3-70B-Base", @@ -368,6 +404,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-70B-Instruct": { "name": "Llama-3-70B-Instruct", @@ -378,6 +415,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B": { "name": "Llama-3-8B-Base", @@ -388,6 +426,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B-Instruct": { "name": "Llama-3-8B-Instruct", @@ -398,6 +437,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "ibm-granite/granite-3b-code-instruct": { "name": "Granite-Code-3B-Instruct", @@ -408,6 +448,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-instruct": { "name": "Granite-Code-8B-Instruct", @@ -418,6 +459,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-instruct": { "name": "Granite-Code-20B-Instruct", @@ -428,6 +470,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-instruct": { "name": "Granite-Code-34B-Instruct", @@ -438,6 +481,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-3b-code-base": { "name": "Granite-Code-3B-Base", @@ -448,6 +492,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-base": { "name": "Granite-Code-8B-Base", @@ -458,6 +503,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-base": { "name": "Granite-Code-20B-Base", @@ -468,6 +514,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-base": { "name": "Granite-Code-34B-Base", @@ -478,6 +525,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "claude-3-haiku-20240307": { 
"name": "Claude-3-Haiku-20240307", @@ -488,6 +536,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-07" }, "claude-3-sonnet-20240229": { "name": "Claude-3-Sonnet-20240229", @@ -498,6 +547,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "claude-3-opus-20240229": { "name": "Claude-3-Opus-20240229", @@ -508,6 +558,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "01-ai/Yi-1.5-34B-Chat": { "name": "Yi-1.5-34B-Chat", @@ -518,6 +569,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-34B": { "name": "Yi-1.5-34B", @@ -528,6 +580,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B-Chat": { "name": "Yi-1.5-9B-Chat", @@ -538,6 +591,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B": { "name": "Yi-1.5-9B", @@ -548,6 +602,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B-Chat": { "name": "Yi-1.5-6B-Chat", @@ -558,6 +613,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B": { "name": "Yi-1.5-6B", @@ -568,6 +624,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "Qwen/Qwen2-57B-A14B": { "name": "Qwen2-57B-A14B", @@ -578,6 +635,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-7B-Instruct": { "name": "Qwen2-7B-Instruct", @@ -588,6 +646,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-72B-Chat": { "name": "Qwen2-72B-Chat", @@ -598,6 +657,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "gemini-1.5-pro": { "name": "Gemini-1.5-Pro-API-0514", @@ -608,6 +668,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "gemini-1.5-flash": { "name": "Gemini-1.5-Flash-API-0514", @@ -618,6 +679,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "m-a-p/OpenCodeInterpreter-DS-33B": { "name": "OpenCodeInterpreter-DS-33B", @@ -628,6 +690,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-6.7B": { "name": "OpenCodeInterpreter-DS-6.7B", @@ -638,6 +701,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-1.3B": { "name": "OpenCodeInterpreter-DS-1.3B", @@ -648,6 +712,7 @@ "act_param": 1.3, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "microsoft/Phi-3-medium-128k-instruct": { "name": "Phi-3-Medium-128K-Instruct", @@ -658,6 +723,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "microsoft/Phi-3-small-128k-instruct": { "name": "Phi-3-Small-128K-Instruct", @@ -668,6 +734,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "codestral-2405": { "name": "Codestral-22B-v0.1", @@ -678,6 +745,7 @@ "act_param": 22, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "codestral-mamba-2407": { "name": "Codestral-Mamba", @@ -688,6 +756,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-16" }, "mistralai/Mistral-7B-Instruct-v0.3": { "name": "Mistral-7B-Instruct-v0.3", @@ -698,6 +767,7 @@ "act_param": 7, "open-data": "None", 
"reasoning": False, + "date": "2024-05-22" }, "mistralai/Mistral-7B-v0.3": { "name": "Mistral-7B-v0.3", @@ -708,6 +778,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "CohereForAI/c4ai-command-r-plus": { "name": "Command R+", @@ -718,6 +789,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-04-04" }, "deepseek-coder": { "name": "DeepSeek-Coder-V2-Instruct", @@ -728,6 +800,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": { "name": "DeepSeek-Coder-V2-Lite-Instruct", @@ -738,6 +811,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Base": { "name": "DeepSeek-Coder-V2-Lite-Base", @@ -748,6 +822,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "claude-3-5-sonnet-20240620": { "name": "Claude-3.5-Sonnet-20240620", @@ -758,6 +833,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-06-20" }, "NousResearch/Hermes-2-Theta-Llama-3-70B": { "name": "Hermes-2-Theta-Llama-3-70B", @@ -768,6 +844,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-06-24" }, "microsoft/wavecoder-ultra-6.7b": { "name": "WaveCoder-Ultra-6.7B", @@ -778,6 +855,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-12-26" }, "google/gemma-2-9b-it": { "name": "Gemma-2-9B-Instruct", @@ -788,6 +866,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Bin12345/AutoCoder": { "name": "AutoCoder", @@ -798,6 +877,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_S_6.7B": { "name": "AutoCoder-S-6.7B", @@ -808,6 +888,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_QW_7B": { "name": "AutoCoder-QW-7B", @@ -818,6 +899,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "SenseLLM/ReflectionCoder-DS-33B": { "name": "ReflectionCoder-DS-33B", @@ -828,6 +910,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-DS-6.7B": { "name": "ReflectionCoder-DS-6.7B", @@ -838,6 +921,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-34B": { "name": "ReflectionCoder-CL-34B", @@ -848,6 +932,7 @@ "act_param": 34, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-7B": { "name": "ReflectionCoder-CL-7B", @@ -858,6 +943,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "new-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3.1-Mini-128K-Instruct", @@ -868,6 +954,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "old-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3-Mini-128K-Instruct", @@ -878,6 +965,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "internlm/internlm2_5-7b-chat": { "name": "InternLM2.5-7B-Chat", @@ -888,6 +976,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-03" }, "NousResearch/Hermes-2-Pro-Llama-3-70B": { "name": "Hermes-2-Pro-Llama-3-70B", @@ -898,6 +987,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-06-27" }, "new-deepseek-chat": { "name": 
"DeepSeek-V2-Chat (2024-06-28)", @@ -908,6 +998,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-28" }, "vllm-google/gemma-2-27b-it": { "name": "Gemma-2-27B-Instruct", @@ -918,6 +1009,7 @@ "act_param": 27, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Artigenz/Artigenz-Coder-DS-6.7B": { "name": "Artigenz-Coder-DS-6.7B", @@ -928,6 +1020,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "openchat/openchat-3.6-8b-20240522": { "name": "OpenChat-3.6-8B-20240522", @@ -938,6 +1031,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "Phind/Phind-CodeLlama-34B-v2": { "name": "Phind-CodeLlama-34B-v2", @@ -948,6 +1042,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "yi-large": { "name": "Yi-Large", @@ -958,6 +1053,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "THUDM/codegeex4-all-9b": { "name": "CodeGeex4-All-9B", @@ -968,6 +1064,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-07-05" }, "gpt-4o-mini-2024-07-18": { "name": "GPT-4o-mini-2024-07-18", @@ -978,6 +1075,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "Nexusflow/Athene-70B": { "name": "Athene-70B", @@ -988,6 +1086,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-20" }, "NTQAI/Nxcode-CQ-7B-orpo": { "name": "Nxcode-CQ-7B-Orpo", @@ -998,6 +1097,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-25" }, "migtissera/Llama-3-70B-Synthia-v3.5": { "name": "Llama-3-70B-Synthia-v3.5", @@ -1008,6 +1108,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-05-27" }, "migtissera/Tess-v2.5.2-Qwen2-72B": { "name": "Tess-v2.5.2-Qwen2-72B", @@ -1018,6 +1119,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": { "name": "WhiteRabbitNeo-33B-v1.5", @@ -1028,6 +1130,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-02-10" }, "mistral-large-2407": { "name": "Mistral-Large-Instruct-2407", @@ -1038,6 +1141,7 @@ "act_param": 123, "open-data": "None", "reasoning": True, + "date": "2024-07-24" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { "name": "Llama-3.1-8B-Instruct", @@ -1048,6 +1152,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "name": "Llama-3.1-70B-Instruct", @@ -1058,6 +1163,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta--llama-3.1-405b-instruct": { "name": "Llama-3.1-405B-Instruct", @@ -1068,6 +1174,7 @@ "act_param": 405, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "deepseek-coder-20240724": { "name": "DeepSeek-Coder-V2-Instruct (2024-07-24)", @@ -1078,6 +1185,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-07-24" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1088,6 +1196,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "nv-mistralai--mistral-nemo-12b-instruct": { "name": "Mistral-Nemo-12B-Instruct", @@ -1098,6 +1207,7 @@ "act_param": 12, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "wyt2000/InverseCoder-CL-13B": { "name": "InverseCoder-CL-13B", @@ -1108,6 +1218,7 @@ "act_param": 13, "open-data": "Partial", 
"reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-CL-7B": { "name": "InverseCoder-CL-7B", @@ -1118,6 +1229,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-DS-6.7B": { "name": "InverseCoder-DS-6.7B", @@ -1128,6 +1240,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "gemini-1.5-pro-exp-0801": { "name": "Gemini-1.5-Pro-Exp-0801", @@ -1138,6 +1251,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-08-01" }, "gpt-4o-2024-08-06": { "name": "GPT-4o-2024-08-06", @@ -1148,6 +1262,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-06" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1158,6 +1273,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1168,6 +1284,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "gemini-1.5-pro-exp-0827": { "name": "Gemini-1.5-Pro-Exp-0827", @@ -1178,6 +1295,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "gemini-1.5-flash-exp-0827": { "name": "Gemini-1.5-Flash-Exp-0827", @@ -1188,6 +1306,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1198,6 +1317,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1208,6 +1328,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1218,6 +1339,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "deepseek-coder-v2.5": { "name": "DeepSeek-V2.5", @@ -1228,6 +1350,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "CohereForAI/c4ai-command-r-08-2024": { "name": "C4AI-Command-R-08-2024", @@ -1238,6 +1361,7 @@ "act_param": 32.3, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "CohereForAI/c4ai-command-r-plus-08-2024": { "name": "C4AI-Command-R-Plus-08-2024", @@ -1248,6 +1372,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "ayueei--yue-coder-9b-preview": { "name": "Yi-Coder-9B-Chat", @@ -1258,6 +1383,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-09-04" }, # "mattshumer/ref_70_e3_prefill": { # "name": "Reflection-Llama-3.1-70B", @@ -1286,6 +1412,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "o1-mini-2024-09-12": { "name": "o1-Mini-2024-09-12 (temperature=1)", @@ -1296,6 +1423,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "Qwen/Qwen2.5-Coder-1.5B-Instruct": { "name": "Qwen2.5-Coder-1.5B-Instruct", @@ -1306,6 +1434,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "Qwen/Qwen2.5-Coder-7B-Instruct": { "name": "Qwen2.5-Coder-7B-Instruct", @@ -1316,6 +1445,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "gemini-1.5-pro-002": { "name": "Gemini-1.5-Pro-002", @@ -1326,6 +1456,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": 
"2024-09-25" }, "mistralai/Mistral-Small-Instruct-2409": { "name": "Mistral-Small-Instruct-2409", @@ -1336,6 +1467,7 @@ "act_param": 22.2, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "Qwen/Qwen2.5-0.5B-Instruct": { "name": "Qwen2.5-0.5B-Instruct", @@ -1346,6 +1478,7 @@ "act_param": 0.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-1.5B-Instruct": { "name": "Qwen2.5-1.5B-Instruct", @@ -1356,6 +1489,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-7B-Instruct": { "name": "Qwen2.5-7B-Instruct", @@ -1366,6 +1500,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-14B-Instruct": { "name": "Qwen2.5-14B-Instruct", @@ -1376,6 +1511,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-32B-Instruct": { "name": "Qwen2.5-32B-Instruct", @@ -1386,6 +1522,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-72B-Instruct": { "name": "Qwen2.5-72B-Instruct", @@ -1396,6 +1533,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "meta-llama/Llama-3.2-1B-Instruct": { "name": "Llama-3.2-1B-Instruct", @@ -1406,6 +1544,7 @@ "act_param": 1, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "meta-llama/Llama-3.2-3B-Instruct": { "name": "Llama-3.2-3B-Instruct", @@ -1416,6 +1555,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { "name": "Llama-3.1-Nemotron-70B-Instruct", @@ -1426,6 +1566,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-09-25" }, "claude-3-5-sonnet-20241022": { "name": "Claude-3.5-Sonnet-20241022", @@ -1436,6 +1577,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-10-22" }, "ibm-granite/granite-3.0-8b-instruct": { "name": "Granite-3.0-8B-Instruct", @@ -1446,6 +1588,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "ibm-granite/granite-3.0-2b-instruct": { "name": "Granite-3.0-2B-Instruct", @@ -1456,6 +1599,7 @@ "act_param": 2, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "grok-beta--main": { "name": "Grok-Beta", @@ -1466,6 +1610,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-17" }, "claude-3-5-haiku-20241022--main": { "name": "Claude-3.5-Haiku-20241022", @@ -1476,6 +1621,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-10-22" }, "Qwen/Qwen2.5-Coder-14B-Instruct--main": { "name": "Qwen2.5-Coder-14B-Instruct", @@ -1486,6 +1632,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-Coder-32B-Instruct--main": { "name": "Qwen2.5-Coder-32B-Instruct", @@ -1496,6 +1643,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "infly/OpenCoder-1.5B-Instruct--main": { "name": "OpenCoder-1.5B-Instruct", @@ -1506,6 +1654,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "infly/OpenCoder-8B-Instruct--main": { "name": "OpenCoder-8B-Instruct", @@ -1516,6 +1665,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "microsoft/Phi-3.5-mini-instruct--main": { "name": "Phi-3.5-Mini-Instruct", @@ -1526,6 +1676,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": 
"2024-08-21" }, "Nexusflow/Athene-V2-Agent--main": { "name": "Athene-V2-Agent", @@ -1536,6 +1687,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "Nexusflow/Athene-V2-Chat--main": { "name": "Athene-V2-Chat", @@ -1546,6 +1698,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "gemini-exp-1114--main": { "name": "Gemini-Exp-1114", @@ -1556,6 +1709,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-14" }, "gpt-4o-2024-11-20--main": { "name": "GPT-4o-2024-11-20", @@ -1566,6 +1720,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-20" }, "gemini-exp-1121--main": { "name": "Gemini-Exp-1121", @@ -1576,6 +1731,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-11-21" }, "gemini-exp-1206--main": { "name": "Gemini-Exp-1206", @@ -1586,6 +1742,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-06" }, "meta-llama--Llama-3.3-70B-Instruct--main": { "name": "Llama-3.3-70B-Instruct", @@ -1596,6 +1753,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "deepseek-ai--DeepSeek-V2.5-1210--main": { "name": "DeepSeek-V2.5-1210", @@ -1606,6 +1764,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-12-10" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1616,6 +1775,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-11" }, "gemini-2.0-flash-thinking-exp-1219--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-1219", @@ -1626,6 +1786,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "gemini-2.0-flash-thinking-exp-01-21--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-01-21", @@ -1636,6 +1797,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-01-21" }, "o1-2024-12-17--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=medium)", @@ -1646,6 +1808,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--low--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=low)", @@ -1656,6 +1819,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--high--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=high)", @@ -1666,16 +1830,18 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "deepseek-v3-chat--main": { - "name": "DeepSeek-V3-Chat", - "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-Chat", + "name": "DeepSeek-V3", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3", "prompted": True, "moe": True, "size": 671, "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2024-12-26" }, "microsoft--phi-4--main": { "name": "Phi-4", @@ -1686,6 +1852,7 @@ "act_param": 14.7, "open-data": "None", "reasoning": False, + "date": "2024-12-13" }, "deepseek-reasoner--main": { "name": "DeepSeek-R1", @@ -1696,6 +1863,7 @@ "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": { "name": "DeepSeek-R1-Distill-Llama-70B", @@ -1706,6 +1874,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": { "name": "DeepSeek-R1-Distill-Qwen-32B", @@ -1716,6 +1885,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": 
"2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": { "name": "DeepSeek-R1-Distill-Qwen-14B", @@ -1726,6 +1896,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": { "name": "DeepSeek-R1-Distill-Llama-8B", @@ -1736,6 +1907,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": { "name": "DeepSeek-R1-Distill-Qwen-7B", @@ -1746,6 +1918,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": { "name": "DeepSeek-R1-Distill-Qwen-1.5B", @@ -1756,6 +1929,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "mistralai/Mistral-Small-24B-Instruct-2501--main": { "name": "Mistral-Small-24B-Instruct-2501", @@ -1766,6 +1940,7 @@ "act_param": 24, "open-data": "None", "reasoning": False, + "date": "2025-01-31" }, "o3-mini-2025-01-31--medium--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=medium)", @@ -1776,6 +1951,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--low--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=low)", @@ -1786,6 +1962,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--high--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=high)", @@ -1796,6 +1973,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "gemini-2.0-flash-001--main": { "name": "Gemini-2.0-Flash-001", @@ -1806,6 +1984,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1816,6 +1995,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-lite-preview-02-05--main": { "name": "Gemini-2.0-Flash-Lite-Preview-02-05", @@ -1826,6 +2006,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-pro-exp-02-05--main": { "name": "Gemini-2.0-Pro-Exp-02-05", @@ -1836,6 +2017,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "NovaSky-AI--Sky-T1-32B-Flash--main": { "name": "Sky-T1-32B-Flash", @@ -1846,6 +2028,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "NovaSky-AI--Sky-T1-32B-Preview--main": { "name": "Sky-T1-32B-Preview", @@ -1856,6 +2039,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "Qwen--QwQ-32B-Preview--main": { "name": "QwQ-32B-Preview", @@ -1866,5 +2050,69 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-11-28" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3-Haiku-20240307", + "link": "https://www.anthropic.com/news/claude-3-family", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "chatgpt-4o-latest--main": { + "name": "ChatGPT-4o-latest-20250129", + "link": "https://chat.openai.com/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "date": "2025-01-29" + }, + "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { + "name": "KwaiCoder-23B-A4B-v1", + "link": "https://huggingface.co/Kwaipilot/KwaiCoder-23B-A4B-v1", + "open-data": 
"None", + "prompted": False, + "moe": True, + "size": 23, + "act_param": 4, + "date": "2025-01-25" + }, + "qwen-max-latest--main": { + "name": "Qwen2.5-Max", + "link": "https://qwenlm.github.io/blog/qwen2.5-max/", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "date": "2025-01-28" + }, + "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { + "name": "Claude-3.7-Sonnet-20250219 (temperature=1, length=12800, reasoning=3200)", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3.7-Sonnet-20250219", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" }, } From 5f0743d0a6874fd6fdfe6ab616fe7f65145fb038 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:03:20 +0800 Subject: [PATCH 12/17] fix: remove vllm max length --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 25f00b4..60b2285 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) + self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 3513d997f55c383dec3436d7b43704a4affbc8d9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:13:39 +0800 Subject: [PATCH 13/17] fix: hardcode the model max length for vllm --- bigcodebench/provider/vllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 60b2285..229e4c9 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,8 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) + # max_model_len is set to max_new_tokens * 10 + self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 00fc9bb98c932424c2e9bf82ab417142aaca5e1d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:20:37 +0800 Subject: [PATCH 14/17] fix model metadata --- analysis/utils.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index ec774c7..29a1cb7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -1903,8 +1903,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 8, + "act_param": 8, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -1914,8 +1914,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 
"prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 7, + "act_param": 7, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -2115,4 +2115,36 @@ "reasoning": True, "date": "2025-02-19" }, + "WarriorCoder-6.7B--main": { + "name": "WarriorCoder-6.7B (Reproduced)", + "link": "https://arxiv.org/abs/2412.17395", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-02-18" + }, + "google--gemma-3-27b-it--main": { + "name": "Gemma-3-27B-Instruct", + "link": "https://huggingface.co/google/gemma-3-27b-it", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-12" + }, + "Qwen--QwQ-32B--skip_prefill--main": { + "name": "QwQ-32B (w/ Reasoning)", + "link": "https://huggingface.co/Qwen/QwQ-32B", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-06" + }, + "deepseek-chat-0324--main": { + "name": "DeepSeek-V3-0324", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-24" + } } From 720681b8ecbcabbfafa6f4c1aae1ca8365d726c4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:22:11 +0800 Subject: [PATCH 15/17] feat: add max_model_len for vllm --- ADVANCED_USAGE.md | 1 + bigcodebench/generate.py | 3 +++ bigcodebench/provider/__init__.py | 2 ++ bigcodebench/provider/vllm.py | 5 ++--- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 9bb81b8..c0905ba 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -50,6 +50,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--n_samples`: The number of samples, default to `1` - `--temperature`: The temperature, default to `0.0` - `--max_new_tokens`: The length of max new tokens, default to `1280` +- `--max_model_len`: The length of max tokens in VLLM, default to `12800` - `--greedy`: Whether to use greedy decoding, default to `False` - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2 - `--direct_completion`: Whether to use direct completion, default to `False` diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 87b67ea..912abcd 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,6 +132,8 @@ def run_codegen( n_samples: int = 1, temperature: float = 0.0, max_new_tokens: int = 1280, + # vllm + max_model_len: int = 12800, greedy: bool = False, # openai reasoning_effort: str = "medium", @@ -178,6 +180,7 @@ def run_codegen( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, reasoning_effort=reasoning_effort, reasoning_budget=reasoning_budget, reasoning_beta=reasoning_beta, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index 202d049..4cb3410 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -10,6 +10,7 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, + max_model_len: int = 12800, # openai only reasoning_effort: str = "medium", # anthropic only @@ -42,6 +43,7 @@ def make_model( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, revision=revision, dataset=dataset, direct_completion=direct_completion, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 229e4c9..41cd251 
100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -13,7 +13,7 @@
 )

 class VllmDecoder(DecoderBase):
-    def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None:
+    def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None:
         super().__init__(name, **kwargs)

         kwargs = {
@@ -41,8 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -
             local_lora_path,
         )

-        # max_model_len is set to max_new_tokens * 10
-        self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs)
+        self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs)
         self.llm.set_tokenizer(tokenizer=self.tokenizer)

     def is_direct_completion(self) -> bool:

From c9e2cbba6618bec6ced0aa08892e4a7446d128ee Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 1 Apr 2025 01:10:09 +0800
Subject: [PATCH 16/17] update model metadata

---
 analysis/utils.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/analysis/utils.py b/analysis/utils.py
index 29a1cb7..798499b 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -2071,6 +2071,7 @@
     "moe": False,
     "size": None,
     "act_param": None,
+    "reasoning": True,
     "date": "2025-01-29"
 },
 "Kwaipilot--KwaiCoder-23B-A4B-v1--main": {
@@ -2081,6 +2082,7 @@
     "moe": True,
     "size": 23,
     "act_param": 4,
+    "reasoning": False,
     "date": "2025-01-25"
 },
 "qwen-max-latest--main": {
@@ -2091,6 +2093,7 @@
     "moe": True,
     "size": None,
     "act_param": None,
+    "reasoning": True,
     "date": "2025-01-28"
 },
 "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": {
@@ -2121,6 +2124,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 6.7,
+    "act_param": 6.7,
+    "reasoning": False,
     "date": "2025-02-18"
 },
 "google--gemma-3-27b-it--main": {
@@ -2129,6 +2135,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 27,
+    "act_param": 27,
+    "reasoning": False,
     "date": "2025-03-12"
 },
 "Qwen--QwQ-32B--skip_prefill--main": {
@@ -2137,6 +2146,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 32,
+    "act_param": 32,
+    "reasoning": True,
     "date": "2025-03-06"
 },
 "deepseek-chat-0324--main": {
@@ -2144,7 +2156,21 @@
     "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
     "open-data": "None",
     "prompted": True,
-    "moe": False,
+    "moe": True,
+    "size": 671,
+    "act_param": 37,
+    "reasoning": True,
     "date": "2025-03-24"
+ },
+ "gemini-2.5-pro-exp-03-25--main": {
+    "name": "Gemini-2.5-Pro-Exp-03-25",
+    "link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
+    "open-data": "None",
+    "prompted": True,
+    "moe": False,
+    "size": None,
+    "act_param": None,
+    "reasoning": True,
+    "date": "2025-03-25"
+ }
-}
+}
\ No newline at end of file

From 9bd90fedee89d7dc3676838c75d9642cb0cd0702 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 1 Apr 2025 01:11:27 +0800
Subject: [PATCH 17/17] feat: use google genai

---
 Docker/Evaluate.Dockerfile              |  2 +-
 bigcodebench/gen/util/google_request.py | 42 ++++++++++++++++---------
 bigcodebench/provider/google.py         |  9 +++---
 setup.cfg                               |  2 +-
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile
index 90e7f40..8b2cdcd 100755
---
a/Docker/Evaluate.Dockerfile +++ b/Docker/Evaluate.Dockerfile @@ -54,7 +54,7 @@ RUN pip install \ rich \ accelerate \ anthropic \ - google-generativeai \ + google-genai \ mistralai \ openai \ e2b diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 9e13607..5a76362 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -1,11 +1,12 @@ import time -import google.generativeai as genai +from google import genai from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted def make_request( - client: genai.GenerativeModel, + model: str, + client: genai.Client, message: str, temperature: float, n: int, @@ -13,21 +14,34 @@ def make_request( ) -> genai.types.GenerateContentResponse: kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} - if "-thinking-" in client.model_name: + if "-thinking-" in model: kwargs.pop("max_output_tokens") - - response = client.generate_content( - [{"role": "user", "parts": [message]}], - generation_config=genai.types.GenerationConfig( + + response = client.models.generate_content( + model=model, + contents=message, + config=genai.types.GenerateContentConfig( candidate_count=n, + safety_settings=[ + genai.types.SafetySetting( + category='HARM_CATEGORY_DANGEROUS_CONTENT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_SEXUALLY_EXPLICIT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HATE_SPEECH', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HARASSMENT', + threshold='BLOCK_NONE' + ), + ], **kwargs - ), - safety_settings=[ - {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, - ], + ), ) return response diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py index 2194c47..e3b18ff 100644 --- a/bigcodebench/provider/google.py +++ b/bigcodebench/provider/google.py @@ -2,7 +2,7 @@ from typing import List from tqdm import tqdm -import google.generativeai as genai +from google import genai from bigcodebench.provider.base import DecoderBase from bigcodebench.gen.util.google_request import make_auto_request @@ -12,8 +12,8 @@ class GoogleDecoder(DecoderBase): def __init__(self, name: str, **kwargs): super().__init__(name, **kwargs) - genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) - self.client = genai.GenerativeModel(name) + self.model = name + self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -34,7 +34,8 @@ def codegen( tokenizer=None, ) ret = make_auto_request( - self.client, + model=self.model, + client=self.client, message=message, n=num_samples, temperature=self.temperature, diff --git a/setup.cfg b/setup.cfg index cc20139..5907add 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,7 @@ install_requires = rich accelerate>=0.30.1 anthropic>=0.26.1 - google-generativeai>=0.5.4 + google-genai mistralai>=0.2.0,<1.0.0 openai>=1.11.1 e2b
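
For reference, a minimal sketch of how the new --max_model_len flag (PATCH 15) is consumed by vLLM; the model name, prompt, and sampling values below are illustrative placeholders, not part of the patches:

    # Sketch: the context window is now capped by an explicit max_model_len
    # instead of being derived from max_new_tokens. Assumes vllm is installed.
    from vllm import LLM, SamplingParams

    llm = LLM(model="Qwen/QwQ-32B", max_model_len=12800)
    params = SamplingParams(temperature=0.0, max_tokens=1280)
    outputs = llm.generate(["def is_prime(n):"], params)
    print(outputs[0].outputs[0].text)

Likewise, a minimal sketch of the google-genai call path that PATCH 17 migrates to, assuming GOOGLE_API_KEY is set in the environment; the model name, prompt, and the single safety setting shown are illustrative:

    # Sketch: the new SDK exposes a Client whose models.generate_content
    # takes the model name per call, instead of binding it to a
    # GenerativeModel instance at construction time.
    import os

    from google import genai

    client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        contents="Write a Python function that reverses a string.",
        config=genai.types.GenerateContentConfig(
            candidate_count=1,
            temperature=0.0,
            max_output_tokens=1280,
            safety_settings=[
                genai.types.SafetySetting(
                    category="HARM_CATEGORY_DANGEROUS_CONTENT",
                    threshold="BLOCK_NONE",
                ),
            ],
        ),
    )
    print(response.text)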