From 89309066c6e4e590c8a20c1392d504cb9e68917a Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Feb 2025 21:31:42 +0800 Subject: [PATCH 01/24] feat: support anthropic extended thinking --- bigcodebench/gen/util/anthropic_request.py | 13 ++++++++++++- bigcodebench/generate.py | 11 ++++++++++- bigcodebench/provider/__init__.py | 7 ++++++- bigcodebench/provider/anthropic.py | 6 +++++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e53feab..e240dee 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -16,7 +16,18 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: try: signal.signal(signal.SIGALRM, handler) signal.alarm(100) - ret = client.messages.create(*args, **kwargs) + if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: + ret = client.beta.messages.create( + *args, + **kwargs, + thinking = { + "type": "enabled", + "budget": kwargs["reasoning_budget"], + }, + betas=[kwargs["reasoning_beta"]] + ) + else: + ret = client.messages.create(*args, **kwargs) signal.alarm(0) except anthropic.RateLimitError: print("Rate limit exceeded. Waiting...") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index bcf1463..9823d0c 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,7 +132,11 @@ def run_codegen( temperature: float = 0.0, max_new_tokens: int = 1280, greedy: bool = False, + # openai reasoning_effort: str = "medium", + # anthropic + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", strip_newlines: bool = False, direct_completion: bool = False, resume: bool = True, @@ -173,6 +177,8 @@ def run_codegen( temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, prefill=not skip_prefill, @@ -186,8 +192,11 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): + if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + + if backend == "anthropic" and reasoning_budget and reasoning_beta: + model = model + f"--{reasoning_budget}-{reasoning_beta}" if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index c78d870..f76ec29 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -9,8 +9,11 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, - # o1 and o3 only + # openai only reasoning_effort: str = "medium", + # anthropic only + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", # instruction model only instruction_prefix: str = None, response_prefix: str = None, @@ -118,6 +121,8 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, ) 
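For reference, a minimal sketch of the request shape the Anthropic SDK expects for extended thinking (the official field is `budget_tokens`, the beta client takes a `betas` list, and a temperature other than 1 is rejected while thinking is enabled; the model name and budgets below are illustrative):

```python
# Sketch, not part of the patch: a direct extended-thinking call against the
# Anthropic beta Messages API. Assumes ANTHROPIC_API_KEY is set in the
# environment; values are illustrative, and budget_tokens must stay below
# max_tokens.
import anthropic

client = anthropic.Anthropic()

message = client.beta.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=12800,
    thinking={"type": "enabled", "budget_tokens": 3200},
    betas=["output-128k-2025-02-19"],
    messages=[{"role": "user", "content": "Write a Python function that ..."}],
)

# The response interleaves thinking and text blocks; only the text blocks
# carry the final answer.
text = "".join(block.text for block in message.content if block.type == "text")
```

Note that the hunk above still forwards `reasoning_budget` and `reasoning_beta` inside `kwargs`, which the API would reject as unknown parameters; the follow-up patches below strip them before the call.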
diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py
index 1969e0c..1612456 100644
--- a/bigcodebench/provider/anthropic.py
+++ b/bigcodebench/provider/anthropic.py
@@ -9,9 +9,11 @@ from bigcodebench.provider.utility import make_raw_chat_prompt

 class AnthropicDecoder(DecoderBase):
-    def __init__(self, name: str, **kwargs) -> None:
+    def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None:
         super().__init__(name, **kwargs)
         self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
+        self.reasoning_budget = reasoning_budget
+        self.reasoning_beta = reasoning_beta

     def codegen(
         self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
@@ -43,6 +45,8 @@ def codegen(
                 max_tokens=self.max_new_tokens,
                 temperature=self.temperature,
                 stop_sequences=self.eos,
+                reasoning_budget=self.reasoning_budget,
+                reasoning_beta=self.reasoning_beta,
             )
             outputs.append(ret.content[0].text)
         all_outputs.append(outputs)

From c05694cde596c9728664dbab2c8bed5e5ea9c036 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 25 Feb 2025 21:41:23 +0800
Subject: [PATCH 02/24] fix: remove unused args

---
 bigcodebench/gen/util/anthropic_request.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py
index e240dee..20ce444 100644
--- a/bigcodebench/gen/util/anthropic_request.py
+++ b/bigcodebench/gen/util/anthropic_request.py
@@ -17,15 +17,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
         signal.signal(signal.SIGALRM, handler)
         signal.alarm(100)
         if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs:
-            ret = client.beta.messages.create(
-                *args,
-                **kwargs,
-                thinking = {
-                    "type": "enabled",
-                    "budget": kwargs["reasoning_budget"],
-                },
-                betas=[kwargs["reasoning_beta"]]
-            )
+            kwargs["thinking"] = {
+                "type": "enabled",
+                "budget": kwargs["reasoning_budget"],
+            }
+            kwargs["betas"] = [kwargs["reasoning_beta"]]
+            kwargs.pop("reasoning_budget")
+            kwargs.pop("reasoning_beta")
+            ret = client.beta.messages.create(*args, **kwargs)
         else:
             ret = client.messages.create(*args, **kwargs)
         signal.alarm(0)

From 57eb973f34666067287cbb05e1845e16b87b5e26 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Wed, 26 Feb 2025 00:57:31 +0800
Subject: [PATCH 03/24] fix: correctly process anthropic streaming

---
 bigcodebench/gen/util/anthropic_request.py |  6 ++++--
 bigcodebench/provider/anthropic.py         | 12 +++++++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py
index 20ce444..f6d18fd 100644
--- a/bigcodebench/gen/util/anthropic_request.py
+++ b/bigcodebench/gen/util/anthropic_request.py
@@ -19,12 +19,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
         if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs:
             kwargs["thinking"] = {
                 "type": "enabled",
-                "budget": kwargs["reasoning_budget"],
+                "budget_tokens": kwargs["reasoning_budget"],
             }
             kwargs["betas"] = [kwargs["reasoning_beta"]]
             kwargs.pop("reasoning_budget")
             kwargs.pop("reasoning_beta")
-            ret = client.beta.messages.create(*args, **kwargs)
+            kwargs.pop("temperature")
+        if "thinking" in kwargs:
+            ret = client.beta.messages.create(*args, **kwargs, stream=True)
         else:
             ret = client.messages.create(*args, **kwargs)
         signal.alarm(0)

diff --git a/bigcodebench/provider/anthropic.py
b/bigcodebench/provider/anthropic.py index 1612456..59aec09 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -48,7 +48,17 @@ def codegen( reasoning_budget=self.reasoning_budget, reasoning_beta=self.reasoning_beta, ) - outputs.append(ret.content[0].text) + if isinstance(ret, anthropic.Stream): + output = "" + for chunk in ret: + if chunk.type == "content_block_delta": + if chunk.delta.type == "thinking_delta": + output += chunk.delta.thinking + elif chunk.delta.type == "text_delta": + output += chunk.delta.text + outputs.append(output) + else: + outputs.append(ret.content[0].text) all_outputs.append(outputs) return all_outputs From 78dceb21430359efa05c235324e10523453d7d2f Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 26 Feb 2025 01:02:05 +0800 Subject: [PATCH 04/24] fix: only append text output --- bigcodebench/provider/anthropic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py index 59aec09..b4a7e43 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -52,9 +52,9 @@ def codegen( output = "" for chunk in ret: if chunk.type == "content_block_delta": - if chunk.delta.type == "thinking_delta": - output += chunk.delta.thinking - elif chunk.delta.type == "text_delta": + # if chunk.delta.type == "thinking_delta": + # output += chunk.delta.thinking + if chunk.delta.type == "text_delta": output += chunk.delta.text outputs.append(output) else: From 05b7f1f93355f2e64cc3576c4dd1f6c2dbdeab67 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 19:27:56 +0800 Subject: [PATCH 05/24] doc: fix endpoints --- ADVANCED_USAGE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 4f48eca..9bb81b8 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -69,7 +69,8 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False` - `--samples`: The path to the generated samples file, default to `None` - `--no_execute`: Whether to not execute the samples, default to `False` -- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--e2b_endpoint`: The API endpoint for remote execution, default to `bigcodebench_evaluator`, you can also use your own E2B API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--gradio_endpoint`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. 
`--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` - `--calibrated`: Whether to use the calibrated samples, default to `True` - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True` From 0ecd667f74cd5f789b36e22dc8564f0fc1c09884 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:30:39 +0800 Subject: [PATCH 06/24] update the results analysis script --- analysis/get_results.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/analysis/get_results.py b/analysis/get_results.py index fc5aa17..607615a 100755 --- a/analysis/get_results.py +++ b/analysis/get_results.py @@ -118,12 +118,12 @@ def check_valid(results): def split_gen(): - shutil.rmtree("sanitized_samples", ignore_errors=True) shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) - os.makedirs("sanitized_samples/complete", exist_ok=True) - os.makedirs("sanitized_samples/instruct", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) + for model, info in model_info.items(): model = model.replace("/", "--") files = glob(f"results/{model}--bigcodebench-*.jsonl") @@ -131,27 +131,21 @@ def split_gen(): model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") for file in files: + if "-sanitized" not in file or "calibrated" not in file: + continue + _, suffix = os.path.basename(file).split("--bigcodebench-") with open(file, "r") as f: data = f.readlines() - if "-sanitized" in file: - if "calibrated" in file: - if info["prompted"]: - if suffix.startswith("complete"): - with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) + split_type = "hard" if "-hard-" in file else "full" + if info["prompted"]: + if suffix.startswith("complete") or suffix.startswith("hard-complete"): + with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) else: - if suffix.startswith("complete"): - with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - + with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) def read_task_perf(tids, task="complete"): model_results = dict() @@ -302,7 +296,7 @@ def get_perf_df(data_dict): if __name__ == "__main__": - # split_gen() + split_gen() bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") bcb_config = { From f087e3b03ce1df72cf889b201b421bd90346d445 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:31:18 +0800 Subject: [PATCH 07/24] doc: add new model outputs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94ad2ef..d3913d9 100755 --- a/README.md +++ 
b/README.md @@ -187,7 +187,7 @@ Please make sure your HF access token has the `Make calls to inference providers ## 💻 LLM-generated Code We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: -* See the attachment of our [v0.2.1.post7](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.1.post7). We include `sanitized_samples_calibrated.zip` for your convenience. +* See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. ## 🧑 Advanced Usage From 6d967338737d4fa02cb2a8d19207528278282321 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:42:32 +0800 Subject: [PATCH 08/24] feat: support vllm lora --- bigcodebench/generate.py | 2 ++ bigcodebench/provider/__init__.py | 3 +++ bigcodebench/provider/vllm.py | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 9823d0c..c5fa368 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -127,6 +127,7 @@ def run_codegen( split: str, subset: str, root: str = "bcb_results", + lora_path: str = None, bs: Optional[int] = None, n_samples: int = 1, temperature: float = 0.0, @@ -174,6 +175,7 @@ def run_codegen( backend=backend, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index f76ec29..202d049 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -6,6 +6,7 @@ def make_model( backend: str, subset: str, split: str, + lora_path: str = None, dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, @@ -38,6 +39,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, @@ -58,6 +60,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index cc928e4..570d4c5 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -3,6 +3,8 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest +from huggingface_hub import snapshot_download from bigcodebench.provider.base import DecoderBase from bigcodebench.provider.utility import ( @@ -11,7 +13,7 @@ ) class VllmDecoder(DecoderBase): - def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: + def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None: super().__init__(name, **kwargs) kwargs = { @@ -29,7 +31,17 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: else: if self.prefill and "```" in self.response_prefix: self.eos += ["\n```\n"] - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, **kwargs) + + self.lora_request = None + if lora_path: + local_lora_path = snapshot_download(lora_path) + self.lora_request = LoRARequest( + "lora", + 1, + local_lora_path, + ) + + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) 
self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: @@ -64,6 +76,7 @@ def codegen( stop=self.eos, skip_special_tokens=self.skip_special_tokens, ), + lora_request=self.lora_request, use_tqdm=True, ) From 82fc40dfe33381b8bdbe5c695414afa5a543ba16 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:50:48 +0800 Subject: [PATCH 09/24] fix: vllm lora attribute --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 570d4c5..25f00b4 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From d37847db62972decb626645699e403ed237b0d73 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:57:21 +0800 Subject: [PATCH 10/24] fix: customize lora output file --- bigcodebench/generate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index c5fa368..87b67ea 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -197,9 +197,12 @@ def run_codegen( if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + if lora_path: + model = model + f"--lora-{lora_path}" + if backend == "anthropic" and reasoning_budget and reasoning_beta: model = model + f"--{reasoning_budget}-{reasoning_beta}" - + if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" else: From fa21527b1fdd727fd6f629408e16a65813231823 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:02:05 +0800 Subject: [PATCH 11/24] feat: add model release date --- analysis/utils.py | 252 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 250 insertions(+), 2 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index 430e113..ec774c7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -8,6 +8,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-12-04", }, "bigcode/starcoder2-15b-instruct-v0.1": { "name": "StarCoder2-15B-Instruct-v0.1", @@ -18,6 +19,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-04-30" }, "bigcode/starcoder2-3b": { "name": "StarCoder2-3B", @@ -28,6 +30,7 @@ "act_param": 3, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-7b": { "name": "StarCoder2-7B", @@ -38,6 +41,7 @@ "act_param": 7, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-15b": { "name": "StarCoder2-15B", @@ -48,6 +52,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "Qwen/CodeQwen1.5-7B": { "name": "CodeQwen1.5-7B", @@ -58,6 +63,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "google/codegemma-2b": { "name": "CodeGemma-2B", @@ -68,6 +74,7 @@ "act_param": 2, 
"open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b": { "name": "CodeGemma-7B", @@ -78,6 +85,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b-it": { "name": "CodeGemma-7B-Instruct", @@ -88,6 +96,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "gpt-3.5-turbo-0125": { "name": "GPT-3.5-Turbo-0125", @@ -98,6 +107,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-01-25" }, "gpt-4o": { "name": "GPT-4o-2024-05-13", @@ -108,6 +118,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "gpt-4-turbo-2024-04-09": { "name": "GPT-4-Turbo-2024-04-09", @@ -118,6 +129,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-04-09" }, "gpt-4-0613": { "name": "GPT-4-0613", @@ -128,6 +140,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-06-13" }, "codellama/CodeLlama-7b-hf": { "name": "CodeLlama-7B-Base", @@ -138,6 +151,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-hf": { "name": "CodeLlama-13B-Base", @@ -148,6 +162,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-7b-Instruct-hf": { "name": "CodeLlama-7B-Instruct", @@ -158,6 +173,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-Instruct-hf": { "name": "CodeLlama-13B-Instruct", @@ -168,6 +184,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "mistral-large-2402": { "name": "Mistral-Large-2402", @@ -178,6 +195,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistral-small-2402": { "name": "Mistral-Small-2402", @@ -188,6 +206,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistralai/Mixtral-8x22B-v0.1": { "name": "Mixtral-8x22B-Base", @@ -198,6 +217,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "mistralai/Mixtral-8x22B-Instruct-v0.1": { "name": "Mixtral-8x22B-Instruct", @@ -208,6 +228,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "codellama/CodeLlama-34b-hf": { "name": "CodeLlama-34B-Base", @@ -218,6 +239,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-34b-Instruct-hf": { "name": "CodeLlama-34B-Instruct", @@ -228,6 +250,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-hf": { "name": "CodeLlama-70B-Base", @@ -238,6 +261,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-Instruct-hf": { "name": "CodeLlama-70B-Instruct", @@ -248,6 +272,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "Qwen/CodeQwen1.5-7B-Chat": { "name": "CodeQwen1.5-7B-Chat", @@ -258,6 +283,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "Qwen/Qwen1.5-110B-Chat": { "name": "Qwen1.5-110B-Chat", @@ -268,6 +294,7 @@ "act_param": 110, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-72B-Chat": { "name": "Qwen1.5-72B-Chat", @@ -278,6 +305,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-32B-Chat": { 
"name": "Qwen1.5-32B-Chat", @@ -288,6 +316,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "deepseek-ai/DeepSeek-V2-Chat": { "name": "DeepSeek-V2-Chat", @@ -298,6 +327,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-04-06" }, "deepseek-ai/deepseek-coder-1.3b-base": { "name": "DeepSeek-Coder-1.3B-Base", @@ -308,6 +338,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-1.3b-instruct": { "name": "DeepSeek-Coder-1.3B-Instruct", @@ -318,6 +349,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-base": { "name": "DeepSeek-Coder-33B-Base", @@ -328,6 +360,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-instruct": { "name": "DeepSeek-Coder-33B-Instruct", @@ -338,6 +371,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-base": { "name": "DeepSeek-Coder-6.7B-Base", @@ -348,6 +382,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { "name": "DeepSeek-Coder-6.7B-Instruct", @@ -358,6 +393,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "meta-llama/Meta-Llama-3-70B": { "name": "Llama-3-70B-Base", @@ -368,6 +404,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-70B-Instruct": { "name": "Llama-3-70B-Instruct", @@ -378,6 +415,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B": { "name": "Llama-3-8B-Base", @@ -388,6 +426,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B-Instruct": { "name": "Llama-3-8B-Instruct", @@ -398,6 +437,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "ibm-granite/granite-3b-code-instruct": { "name": "Granite-Code-3B-Instruct", @@ -408,6 +448,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-instruct": { "name": "Granite-Code-8B-Instruct", @@ -418,6 +459,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-instruct": { "name": "Granite-Code-20B-Instruct", @@ -428,6 +470,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-instruct": { "name": "Granite-Code-34B-Instruct", @@ -438,6 +481,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-3b-code-base": { "name": "Granite-Code-3B-Base", @@ -448,6 +492,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-base": { "name": "Granite-Code-8B-Base", @@ -458,6 +503,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-base": { "name": "Granite-Code-20B-Base", @@ -468,6 +514,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-base": { "name": "Granite-Code-34B-Base", @@ -478,6 +525,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "claude-3-haiku-20240307": { 
"name": "Claude-3-Haiku-20240307", @@ -488,6 +536,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-07" }, "claude-3-sonnet-20240229": { "name": "Claude-3-Sonnet-20240229", @@ -498,6 +547,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "claude-3-opus-20240229": { "name": "Claude-3-Opus-20240229", @@ -508,6 +558,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "01-ai/Yi-1.5-34B-Chat": { "name": "Yi-1.5-34B-Chat", @@ -518,6 +569,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-34B": { "name": "Yi-1.5-34B", @@ -528,6 +580,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B-Chat": { "name": "Yi-1.5-9B-Chat", @@ -538,6 +591,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B": { "name": "Yi-1.5-9B", @@ -548,6 +602,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B-Chat": { "name": "Yi-1.5-6B-Chat", @@ -558,6 +613,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B": { "name": "Yi-1.5-6B", @@ -568,6 +624,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "Qwen/Qwen2-57B-A14B": { "name": "Qwen2-57B-A14B", @@ -578,6 +635,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-7B-Instruct": { "name": "Qwen2-7B-Instruct", @@ -588,6 +646,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-72B-Chat": { "name": "Qwen2-72B-Chat", @@ -598,6 +657,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "gemini-1.5-pro": { "name": "Gemini-1.5-Pro-API-0514", @@ -608,6 +668,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "gemini-1.5-flash": { "name": "Gemini-1.5-Flash-API-0514", @@ -618,6 +679,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "m-a-p/OpenCodeInterpreter-DS-33B": { "name": "OpenCodeInterpreter-DS-33B", @@ -628,6 +690,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-6.7B": { "name": "OpenCodeInterpreter-DS-6.7B", @@ -638,6 +701,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-1.3B": { "name": "OpenCodeInterpreter-DS-1.3B", @@ -648,6 +712,7 @@ "act_param": 1.3, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "microsoft/Phi-3-medium-128k-instruct": { "name": "Phi-3-Medium-128K-Instruct", @@ -658,6 +723,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "microsoft/Phi-3-small-128k-instruct": { "name": "Phi-3-Small-128K-Instruct", @@ -668,6 +734,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "codestral-2405": { "name": "Codestral-22B-v0.1", @@ -678,6 +745,7 @@ "act_param": 22, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "codestral-mamba-2407": { "name": "Codestral-Mamba", @@ -688,6 +756,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-16" }, "mistralai/Mistral-7B-Instruct-v0.3": { "name": "Mistral-7B-Instruct-v0.3", @@ -698,6 +767,7 @@ "act_param": 7, "open-data": "None", 
"reasoning": False, + "date": "2024-05-22" }, "mistralai/Mistral-7B-v0.3": { "name": "Mistral-7B-v0.3", @@ -708,6 +778,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "CohereForAI/c4ai-command-r-plus": { "name": "Command R+", @@ -718,6 +789,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-04-04" }, "deepseek-coder": { "name": "DeepSeek-Coder-V2-Instruct", @@ -728,6 +800,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": { "name": "DeepSeek-Coder-V2-Lite-Instruct", @@ -738,6 +811,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Base": { "name": "DeepSeek-Coder-V2-Lite-Base", @@ -748,6 +822,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "claude-3-5-sonnet-20240620": { "name": "Claude-3.5-Sonnet-20240620", @@ -758,6 +833,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-06-20" }, "NousResearch/Hermes-2-Theta-Llama-3-70B": { "name": "Hermes-2-Theta-Llama-3-70B", @@ -768,6 +844,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-06-24" }, "microsoft/wavecoder-ultra-6.7b": { "name": "WaveCoder-Ultra-6.7B", @@ -778,6 +855,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-12-26" }, "google/gemma-2-9b-it": { "name": "Gemma-2-9B-Instruct", @@ -788,6 +866,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Bin12345/AutoCoder": { "name": "AutoCoder", @@ -798,6 +877,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_S_6.7B": { "name": "AutoCoder-S-6.7B", @@ -808,6 +888,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_QW_7B": { "name": "AutoCoder-QW-7B", @@ -818,6 +899,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "SenseLLM/ReflectionCoder-DS-33B": { "name": "ReflectionCoder-DS-33B", @@ -828,6 +910,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-DS-6.7B": { "name": "ReflectionCoder-DS-6.7B", @@ -838,6 +921,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-34B": { "name": "ReflectionCoder-CL-34B", @@ -848,6 +932,7 @@ "act_param": 34, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-7B": { "name": "ReflectionCoder-CL-7B", @@ -858,6 +943,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "new-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3.1-Mini-128K-Instruct", @@ -868,6 +954,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "old-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3-Mini-128K-Instruct", @@ -878,6 +965,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "internlm/internlm2_5-7b-chat": { "name": "InternLM2.5-7B-Chat", @@ -888,6 +976,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-03" }, "NousResearch/Hermes-2-Pro-Llama-3-70B": { "name": "Hermes-2-Pro-Llama-3-70B", @@ -898,6 +987,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-06-27" }, "new-deepseek-chat": { "name": 
"DeepSeek-V2-Chat (2024-06-28)", @@ -908,6 +998,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-28" }, "vllm-google/gemma-2-27b-it": { "name": "Gemma-2-27B-Instruct", @@ -918,6 +1009,7 @@ "act_param": 27, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Artigenz/Artigenz-Coder-DS-6.7B": { "name": "Artigenz-Coder-DS-6.7B", @@ -928,6 +1020,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "openchat/openchat-3.6-8b-20240522": { "name": "OpenChat-3.6-8B-20240522", @@ -938,6 +1031,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "Phind/Phind-CodeLlama-34B-v2": { "name": "Phind-CodeLlama-34B-v2", @@ -948,6 +1042,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "yi-large": { "name": "Yi-Large", @@ -958,6 +1053,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "THUDM/codegeex4-all-9b": { "name": "CodeGeex4-All-9B", @@ -968,6 +1064,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-07-05" }, "gpt-4o-mini-2024-07-18": { "name": "GPT-4o-mini-2024-07-18", @@ -978,6 +1075,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "Nexusflow/Athene-70B": { "name": "Athene-70B", @@ -988,6 +1086,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-20" }, "NTQAI/Nxcode-CQ-7B-orpo": { "name": "Nxcode-CQ-7B-Orpo", @@ -998,6 +1097,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-25" }, "migtissera/Llama-3-70B-Synthia-v3.5": { "name": "Llama-3-70B-Synthia-v3.5", @@ -1008,6 +1108,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-05-27" }, "migtissera/Tess-v2.5.2-Qwen2-72B": { "name": "Tess-v2.5.2-Qwen2-72B", @@ -1018,6 +1119,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": { "name": "WhiteRabbitNeo-33B-v1.5", @@ -1028,6 +1130,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-02-10" }, "mistral-large-2407": { "name": "Mistral-Large-Instruct-2407", @@ -1038,6 +1141,7 @@ "act_param": 123, "open-data": "None", "reasoning": True, + "date": "2024-07-24" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { "name": "Llama-3.1-8B-Instruct", @@ -1048,6 +1152,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "name": "Llama-3.1-70B-Instruct", @@ -1058,6 +1163,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta--llama-3.1-405b-instruct": { "name": "Llama-3.1-405B-Instruct", @@ -1068,6 +1174,7 @@ "act_param": 405, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "deepseek-coder-20240724": { "name": "DeepSeek-Coder-V2-Instruct (2024-07-24)", @@ -1078,6 +1185,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-07-24" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1088,6 +1196,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "nv-mistralai--mistral-nemo-12b-instruct": { "name": "Mistral-Nemo-12B-Instruct", @@ -1098,6 +1207,7 @@ "act_param": 12, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "wyt2000/InverseCoder-CL-13B": { "name": "InverseCoder-CL-13B", @@ -1108,6 +1218,7 @@ "act_param": 13, "open-data": "Partial", 
"reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-CL-7B": { "name": "InverseCoder-CL-7B", @@ -1118,6 +1229,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-DS-6.7B": { "name": "InverseCoder-DS-6.7B", @@ -1128,6 +1240,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "gemini-1.5-pro-exp-0801": { "name": "Gemini-1.5-Pro-Exp-0801", @@ -1138,6 +1251,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-08-01" }, "gpt-4o-2024-08-06": { "name": "GPT-4o-2024-08-06", @@ -1148,6 +1262,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-06" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1158,6 +1273,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1168,6 +1284,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "gemini-1.5-pro-exp-0827": { "name": "Gemini-1.5-Pro-Exp-0827", @@ -1178,6 +1295,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "gemini-1.5-flash-exp-0827": { "name": "Gemini-1.5-Flash-Exp-0827", @@ -1188,6 +1306,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1198,6 +1317,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1208,6 +1328,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1218,6 +1339,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "deepseek-coder-v2.5": { "name": "DeepSeek-V2.5", @@ -1228,6 +1350,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "CohereForAI/c4ai-command-r-08-2024": { "name": "C4AI-Command-R-08-2024", @@ -1238,6 +1361,7 @@ "act_param": 32.3, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "CohereForAI/c4ai-command-r-plus-08-2024": { "name": "C4AI-Command-R-Plus-08-2024", @@ -1248,6 +1372,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "ayueei--yue-coder-9b-preview": { "name": "Yi-Coder-9B-Chat", @@ -1258,6 +1383,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-09-04" }, # "mattshumer/ref_70_e3_prefill": { # "name": "Reflection-Llama-3.1-70B", @@ -1286,6 +1412,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "o1-mini-2024-09-12": { "name": "o1-Mini-2024-09-12 (temperature=1)", @@ -1296,6 +1423,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "Qwen/Qwen2.5-Coder-1.5B-Instruct": { "name": "Qwen2.5-Coder-1.5B-Instruct", @@ -1306,6 +1434,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "Qwen/Qwen2.5-Coder-7B-Instruct": { "name": "Qwen2.5-Coder-7B-Instruct", @@ -1316,6 +1445,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "gemini-1.5-pro-002": { "name": "Gemini-1.5-Pro-002", @@ -1326,6 +1456,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": 
"2024-09-25" }, "mistralai/Mistral-Small-Instruct-2409": { "name": "Mistral-Small-Instruct-2409", @@ -1336,6 +1467,7 @@ "act_param": 22.2, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "Qwen/Qwen2.5-0.5B-Instruct": { "name": "Qwen2.5-0.5B-Instruct", @@ -1346,6 +1478,7 @@ "act_param": 0.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-1.5B-Instruct": { "name": "Qwen2.5-1.5B-Instruct", @@ -1356,6 +1489,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-7B-Instruct": { "name": "Qwen2.5-7B-Instruct", @@ -1366,6 +1500,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-14B-Instruct": { "name": "Qwen2.5-14B-Instruct", @@ -1376,6 +1511,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-32B-Instruct": { "name": "Qwen2.5-32B-Instruct", @@ -1386,6 +1522,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-72B-Instruct": { "name": "Qwen2.5-72B-Instruct", @@ -1396,6 +1533,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "meta-llama/Llama-3.2-1B-Instruct": { "name": "Llama-3.2-1B-Instruct", @@ -1406,6 +1544,7 @@ "act_param": 1, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "meta-llama/Llama-3.2-3B-Instruct": { "name": "Llama-3.2-3B-Instruct", @@ -1416,6 +1555,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { "name": "Llama-3.1-Nemotron-70B-Instruct", @@ -1426,6 +1566,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-09-25" }, "claude-3-5-sonnet-20241022": { "name": "Claude-3.5-Sonnet-20241022", @@ -1436,6 +1577,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-10-22" }, "ibm-granite/granite-3.0-8b-instruct": { "name": "Granite-3.0-8B-Instruct", @@ -1446,6 +1588,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "ibm-granite/granite-3.0-2b-instruct": { "name": "Granite-3.0-2B-Instruct", @@ -1456,6 +1599,7 @@ "act_param": 2, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "grok-beta--main": { "name": "Grok-Beta", @@ -1466,6 +1610,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-17" }, "claude-3-5-haiku-20241022--main": { "name": "Claude-3.5-Haiku-20241022", @@ -1476,6 +1621,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-10-22" }, "Qwen/Qwen2.5-Coder-14B-Instruct--main": { "name": "Qwen2.5-Coder-14B-Instruct", @@ -1486,6 +1632,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-Coder-32B-Instruct--main": { "name": "Qwen2.5-Coder-32B-Instruct", @@ -1496,6 +1643,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "infly/OpenCoder-1.5B-Instruct--main": { "name": "OpenCoder-1.5B-Instruct", @@ -1506,6 +1654,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "infly/OpenCoder-8B-Instruct--main": { "name": "OpenCoder-8B-Instruct", @@ -1516,6 +1665,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "microsoft/Phi-3.5-mini-instruct--main": { "name": "Phi-3.5-Mini-Instruct", @@ -1526,6 +1676,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": 
"2024-08-21" }, "Nexusflow/Athene-V2-Agent--main": { "name": "Athene-V2-Agent", @@ -1536,6 +1687,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "Nexusflow/Athene-V2-Chat--main": { "name": "Athene-V2-Chat", @@ -1546,6 +1698,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "gemini-exp-1114--main": { "name": "Gemini-Exp-1114", @@ -1556,6 +1709,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-14" }, "gpt-4o-2024-11-20--main": { "name": "GPT-4o-2024-11-20", @@ -1566,6 +1720,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-20" }, "gemini-exp-1121--main": { "name": "Gemini-Exp-1121", @@ -1576,6 +1731,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-11-21" }, "gemini-exp-1206--main": { "name": "Gemini-Exp-1206", @@ -1586,6 +1742,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-06" }, "meta-llama--Llama-3.3-70B-Instruct--main": { "name": "Llama-3.3-70B-Instruct", @@ -1596,6 +1753,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "deepseek-ai--DeepSeek-V2.5-1210--main": { "name": "DeepSeek-V2.5-1210", @@ -1606,6 +1764,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-12-10" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1616,6 +1775,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-11" }, "gemini-2.0-flash-thinking-exp-1219--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-1219", @@ -1626,6 +1786,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "gemini-2.0-flash-thinking-exp-01-21--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-01-21", @@ -1636,6 +1797,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-01-21" }, "o1-2024-12-17--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=medium)", @@ -1646,6 +1808,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--low--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=low)", @@ -1656,6 +1819,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--high--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=high)", @@ -1666,16 +1830,18 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "deepseek-v3-chat--main": { - "name": "DeepSeek-V3-Chat", - "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-Chat", + "name": "DeepSeek-V3", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3", "prompted": True, "moe": True, "size": 671, "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2024-12-26" }, "microsoft--phi-4--main": { "name": "Phi-4", @@ -1686,6 +1852,7 @@ "act_param": 14.7, "open-data": "None", "reasoning": False, + "date": "2024-12-13" }, "deepseek-reasoner--main": { "name": "DeepSeek-R1", @@ -1696,6 +1863,7 @@ "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": { "name": "DeepSeek-R1-Distill-Llama-70B", @@ -1706,6 +1874,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": { "name": "DeepSeek-R1-Distill-Qwen-32B", @@ -1716,6 +1885,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": 
"2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": { "name": "DeepSeek-R1-Distill-Qwen-14B", @@ -1726,6 +1896,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": { "name": "DeepSeek-R1-Distill-Llama-8B", @@ -1736,6 +1907,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": { "name": "DeepSeek-R1-Distill-Qwen-7B", @@ -1746,6 +1918,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": { "name": "DeepSeek-R1-Distill-Qwen-1.5B", @@ -1756,6 +1929,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "mistralai/Mistral-Small-24B-Instruct-2501--main": { "name": "Mistral-Small-24B-Instruct-2501", @@ -1766,6 +1940,7 @@ "act_param": 24, "open-data": "None", "reasoning": False, + "date": "2025-01-31" }, "o3-mini-2025-01-31--medium--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=medium)", @@ -1776,6 +1951,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--low--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=low)", @@ -1786,6 +1962,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--high--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=high)", @@ -1796,6 +1973,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "gemini-2.0-flash-001--main": { "name": "Gemini-2.0-Flash-001", @@ -1806,6 +1984,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1816,6 +1995,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-lite-preview-02-05--main": { "name": "Gemini-2.0-Flash-Lite-Preview-02-05", @@ -1826,6 +2006,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-pro-exp-02-05--main": { "name": "Gemini-2.0-Pro-Exp-02-05", @@ -1836,6 +2017,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "NovaSky-AI--Sky-T1-32B-Flash--main": { "name": "Sky-T1-32B-Flash", @@ -1846,6 +2028,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "NovaSky-AI--Sky-T1-32B-Preview--main": { "name": "Sky-T1-32B-Preview", @@ -1856,6 +2039,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "Qwen--QwQ-32B-Preview--main": { "name": "QwQ-32B-Preview", @@ -1866,5 +2050,69 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-11-28" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3-Haiku-20240307", + "link": "https://www.anthropic.com/news/claude-3-family", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "chatgpt-4o-latest--main": { + "name": "ChatGPT-4o-latest-20250129", + "link": "https://chat.openai.com/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "date": "2025-01-29" + }, + "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { + "name": "KwaiCoder-23B-A4B-v1", + "link": "https://huggingface.co/Kwaipilot/KwaiCoder-23B-A4B-v1", + "open-data": 
"None", + "prompted": False, + "moe": True, + "size": 23, + "act_param": 4, + "date": "2025-01-25" + }, + "qwen-max-latest--main": { + "name": "Qwen2.5-Max", + "link": "https://qwenlm.github.io/blog/qwen2.5-max/", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "date": "2025-01-28" + }, + "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { + "name": "Claude-3.7-Sonnet-20250219 (temperature=1, length=12800, reasoning=3200)", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3.7-Sonnet-20250219", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" }, } From 5f0743d0a6874fd6fdfe6ab616fe7f65145fb038 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:03:20 +0800 Subject: [PATCH 12/24] fix: remove vllm max length --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 25f00b4..60b2285 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) + self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 3513d997f55c383dec3436d7b43704a4affbc8d9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:13:39 +0800 Subject: [PATCH 13/24] fix: hardcode the model max length for vllm --- bigcodebench/provider/vllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 60b2285..229e4c9 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,8 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) + # max_model_len is set to max_new_tokens * 10 + self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 00fc9bb98c932424c2e9bf82ab417142aaca5e1d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:20:37 +0800 Subject: [PATCH 14/24] fix model metadata --- analysis/utils.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index ec774c7..29a1cb7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -1903,8 +1903,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 8, + "act_param": 8, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -1914,8 +1914,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 
"prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 7, + "act_param": 7, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -2115,4 +2115,36 @@ "reasoning": True, "date": "2025-02-19" }, + "WarriorCoder-6.7B--main": { + "name": "WarriorCoder-6.7B (Reproduced)", + "link": "https://arxiv.org/abs/2412.17395", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-02-18" + }, + "google--gemma-3-27b-it--main": { + "name": "Gemma-3-27B-Instruct", + "link": "https://huggingface.co/google/gemma-3-27b-it", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-12" + }, + "Qwen--QwQ-32B--skip_prefill--main": { + "name": "QwQ-32B (w/ Reasoning)", + "link": "https://huggingface.co/Qwen/QwQ-32B", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-06" + }, + "deepseek-chat-0324--main": { + "name": "DeepSeek-V3-0324", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-24" + } } From 720681b8ecbcabbfafa6f4c1aae1ca8365d726c4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:22:11 +0800 Subject: [PATCH 15/24] feat: add max_model_len for vllm --- ADVANCED_USAGE.md | 1 + bigcodebench/generate.py | 3 +++ bigcodebench/provider/__init__.py | 2 ++ bigcodebench/provider/vllm.py | 5 ++--- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 9bb81b8..c0905ba 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -50,6 +50,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--n_samples`: The number of samples, default to `1` - `--temperature`: The temperature, default to `0.0` - `--max_new_tokens`: The length of max new tokens, default to `1280` +- `--max_model_len`: The length of max tokens in VLLM, default to `12800` - `--greedy`: Whether to use greedy decoding, default to `False` - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2 - `--direct_completion`: Whether to use direct completion, default to `False` diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 87b67ea..912abcd 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,6 +132,8 @@ def run_codegen( n_samples: int = 1, temperature: float = 0.0, max_new_tokens: int = 1280, + # vllm + max_model_len: int = 12800, greedy: bool = False, # openai reasoning_effort: str = "medium", @@ -178,6 +180,7 @@ def run_codegen( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, reasoning_effort=reasoning_effort, reasoning_budget=reasoning_budget, reasoning_beta=reasoning_beta, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index 202d049..4cb3410 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -10,6 +10,7 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, + max_model_len: int = 12800, # openai only reasoning_effort: str = "medium", # anthropic only @@ -42,6 +43,7 @@ def make_model( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, revision=revision, dataset=dataset, direct_completion=direct_completion, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 229e4c9..41cd251 
100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -13,7 +13,7 @@ ) class VllmDecoder(DecoderBase): - def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None: + def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None: super().__init__(name, **kwargs) kwargs = { @@ -41,8 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - # max_model_len is set to max_new_tokens * 10 - self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs) + self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From c9e2cbba6618bec6ced0aa08892e4a7446d128ee Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 1 Apr 2025 01:10:09 +0800 Subject: [PATCH 16/24] update model metadata --- analysis/utils.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index 29a1cb7..798499b 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -2071,6 +2071,7 @@ "moe": False, "size": None, "act_param": None, + "reasoning": True, "date": "2025-01-29" }, "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { @@ -2081,6 +2082,7 @@ "moe": True, "size": 23, "act_param": 4, + "reasoning": False, "date": "2025-01-25" }, "qwen-max-latest--main": { @@ -2091,6 +2093,7 @@ "moe": True, "size": None, "act_param": None, + "reasoning": True, "date": "2025-01-28" }, "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { @@ -2121,6 +2124,10 @@ "open-data": "None", "prompted": True, "moe": False, + "size": 6.7, + "act_param": 6.7, + "open-data": "None", + "reasoning": False, "date": "2025-02-18" }, "google--gemma-3-27b-it--main": { @@ -2129,6 +2136,10 @@ "open-data": "None", "prompted": True, "moe": False, + "size": 27, + "act_param": 27, + "open-data": "None", + "reasoning": False, "date": "2025-03-12" }, "Qwen--QwQ-32B--skip_prefill--main": { @@ -2137,6 +2148,10 @@ "open-data": "None", "prompted": True, "moe": False, + "size": 32, + "act_param": 32, + "open-data": "None", + "reasoning": True, "date": "2025-03-06" }, "deepseek-chat-0324--main": { @@ -2144,7 +2159,23 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", "open-data": "None", "prompted": True, - "moe": False, + "moe": True, + "size": 671, + "act_param": 37, + "open-data": "None", + "reasoning": True, "date": "2025-03-24" + }, + "gemini-2.5-pro-exp-03-25--main": { + "name": "Gemini-2.5-Pro-Exp-03-25", + "link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": 37, + "open-data": "None", + "reasoning": True, + "date": "2025-03-25" } -} +} \ No newline at end of file From 9bd90fedee89d7dc3676838c75d9642cb0cd0702 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 1 Apr 2025 01:11:27 +0800 Subject: [PATCH 17/24] feat: use google genai --- Docker/Evaluate.Dockerfile | 2 +- bigcodebench/gen/util/google_request.py | 42 ++++++++++++++++--------- bigcodebench/provider/google.py | 9 +++--- setup.cfg | 2 +- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile index 90e7f40..8b2cdcd 100755 --- 
a/Docker/Evaluate.Dockerfile +++ b/Docker/Evaluate.Dockerfile @@ -54,7 +54,7 @@ RUN pip install \ rich \ accelerate \ anthropic \ - google-generativeai \ + google-genai \ mistralai \ openai \ e2b diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 9e13607..5a76362 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -1,11 +1,12 @@ import time -import google.generativeai as genai +from google import genai from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted def make_request( - client: genai.GenerativeModel, + model: str, + client: genai.Client, message: str, temperature: float, n: int, @@ -13,21 +14,34 @@ def make_request( ) -> genai.types.GenerateContentResponse: kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} - if "-thinking-" in client.model_name: + if "-thinking-" in model: kwargs.pop("max_output_tokens") - - response = client.generate_content( - [{"role": "user", "parts": [message]}], - generation_config=genai.types.GenerationConfig( + + response = client.models.generate_content( + model=model, + contents=message, + config=genai.types.GenerateContentConfig( candidate_count=n, + safety_settings=[ + genai.types.SafetySetting( + category='HARM_CATEGORY_DANGEROUS_CONTENT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_SEXUALLY_EXPLICIT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HATE_SPEECH', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HARASSMENT', + threshold='BLOCK_NONE' + ), + ], **kwargs - ), - safety_settings=[ - {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, - ], + ), ) return response diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py index 2194c47..e3b18ff 100644 --- a/bigcodebench/provider/google.py +++ b/bigcodebench/provider/google.py @@ -2,7 +2,7 @@ from typing import List from tqdm import tqdm -import google.generativeai as genai +from google import genai from bigcodebench.provider.base import DecoderBase from bigcodebench.gen.util.google_request import make_auto_request @@ -12,8 +12,8 @@ class GoogleDecoder(DecoderBase): def __init__(self, name: str, **kwargs): super().__init__(name, **kwargs) - genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) - self.client = genai.GenerativeModel(name) + self.model = name + self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -34,7 +34,8 @@ def codegen( tokenizer=None, ) ret = make_auto_request( - self.client, + model=self.model, + client=self.client, message=message, n=num_samples, temperature=self.temperature, diff --git a/setup.cfg b/setup.cfg index cc20139..5907add 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,7 @@ install_requires = rich accelerate>=0.30.1 anthropic>=0.26.1 - google-generativeai>=0.5.4 + google-genai mistralai>=0.2.0,<1.0.0 openai>=1.11.1 e2b From 8fb8e2399822ebc2a998f00e2a28cbeeeff40c7f Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 7 Apr 2025 20:26:22 +0800 Subject: [PATCH 18/24] update model meta info and processing script --- analysis/get_results.py | 8 +- 
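
Patch 17 above swaps the model-centric google-generativeai client for the unified google-genai one. A hedged sketch of the migrated call path, using only names that appear in the diff, with a placeholder model id and the same GOOGLE_API_KEY environment variable:

import os
from google import genai

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
response = client.models.generate_content(
    model="gemini-2.0-flash-001",  # placeholder model id
    contents="Write a self-contained Python function.",
    config=genai.types.GenerateContentConfig(
        candidate_count=1,
        temperature=0.0,
        max_output_tokens=1280,
    ),
)
print(response.text)
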
analysis/utils.py | 418 +++++++++++++++++++++------------------- 2 files changed, 227 insertions(+), 199 deletions(-) diff --git a/analysis/get_results.py b/analysis/get_results.py index 607615a..641c43b 100755 --- a/analysis/get_results.py +++ b/analysis/get_results.py @@ -4,7 +4,7 @@ import numpy as np from numpy import mean from glob import glob -from utils import * +from utils import model_info from tqdm import tqdm import pandas as pd import itertools @@ -48,6 +48,8 @@ def get_results(tids): "moe": info["moe"], "size": info["size"], "act_param": info["act_param"], + "date": info.get("date", None), + "prefill": info.get("prefill", False), # "direct_complete": info["direct_complete"], } @@ -249,7 +251,7 @@ def get_solve_rate(data_dict, task="complete"): def get_hf_ds(results): hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [], - "complete": [], "instruct": []} + "complete": [], "instruct": [], "date": [], "prefill": []} for model, result in results.items(): hf_dataset["model"].append(model) @@ -261,6 +263,8 @@ def get_hf_ds(results): # hf_dataset["lazy"].append(result["lazy"]) hf_dataset["complete"].append(result["pass@1"]["complete"]) hf_dataset["instruct"].append(result["pass@1"]["instruct"]) + hf_dataset["date"].append(result["date"]) + hf_dataset["prefill"].append(result["prefill"]) # hf_dataset["direct_complete"].append(result["direct_complete"]) return Dataset.from_dict(hf_dataset) diff --git a/analysis/utils.py b/analysis/utils.py index 798499b..9aa7203 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -7,7 +7,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-12-04", }, "bigcode/starcoder2-15b-instruct-v0.1": { @@ -18,7 +18,7 @@ "size": 15, "act_param": 15, "open-data": "Full", - "reasoning": False, + "prefill": True, "date": "2024-04-30" }, "bigcode/starcoder2-3b": { @@ -29,7 +29,7 @@ "size": 3, "act_param": 3, "open-data": "Full", - "reasoning": False, + "prefill": True, "date": "2024-02-29" }, "bigcode/starcoder2-7b": { @@ -40,7 +40,7 @@ "size": 7, "act_param": 7, "open-data": "Full", - "reasoning": False, + "prefill": True, "date": "2024-02-29" }, "bigcode/starcoder2-15b": { @@ -51,7 +51,7 @@ "size": 15, "act_param": 15, "open-data": "Full", - "reasoning": False, + "prefill": True, "date": "2024-02-29" }, "Qwen/CodeQwen1.5-7B": { @@ -62,7 +62,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-16" }, "google/codegemma-2b": { @@ -73,7 +73,7 @@ "size": 2, "act_param": 2, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-10" }, "google/codegemma-7b": { @@ -84,7 +84,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-10" }, "google/codegemma-7b-it": { @@ -95,7 +95,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-10" }, "gpt-3.5-turbo-0125": { @@ -106,7 +106,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-01-25" }, "gpt-4o": { @@ -117,7 +117,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-13" }, "gpt-4-turbo-2024-04-09": { @@ -128,7 +128,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-09" }, "gpt-4-0613": { @@ -139,7 +139,7 @@ 
"size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-13" }, "codellama/CodeLlama-7b-hf": { @@ -150,7 +150,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-13b-hf": { @@ -161,7 +161,7 @@ "size": 13, "act_param": 13, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-7b-Instruct-hf": { @@ -172,7 +172,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-13b-Instruct-hf": { @@ -183,7 +183,7 @@ "size": 13, "act_param": 13, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "mistral-large-2402": { @@ -194,7 +194,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-02-26" }, "mistral-small-2402": { @@ -205,7 +205,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-02-26" }, "mistralai/Mixtral-8x22B-v0.1": { @@ -216,7 +216,7 @@ "size": 176, "act_param": 44, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-17" }, "mistralai/Mixtral-8x22B-Instruct-v0.1": { @@ -227,7 +227,7 @@ "size": 176, "act_param": 44, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-17" }, "codellama/CodeLlama-34b-hf": { @@ -238,7 +238,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-34b-Instruct-hf": { @@ -249,7 +249,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-70b-hf": { @@ -260,7 +260,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "codellama/CodeLlama-70b-Instruct-hf": { @@ -271,7 +271,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "Qwen/CodeQwen1.5-7B-Chat": { @@ -282,7 +282,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-16" }, "Qwen/Qwen1.5-110B-Chat": { @@ -293,7 +293,7 @@ "size": 110, "act_param": 110, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-26" }, "Qwen/Qwen1.5-72B-Chat": { @@ -304,7 +304,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-26" }, "Qwen/Qwen1.5-32B-Chat": { @@ -315,7 +315,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-26" }, "deepseek-ai/DeepSeek-V2-Chat": { @@ -326,7 +326,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-06" }, "deepseek-ai/deepseek-coder-1.3b-base": { @@ -337,7 +337,7 @@ "size": 1.3, "act_param": 1.3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-1.3b-instruct": { @@ -348,7 +348,7 @@ "size": 1.3, "act_param": 1.3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-base": { @@ -359,7 +359,7 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, 
"deepseek-ai/deepseek-coder-33b-instruct": { @@ -370,7 +370,7 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-base": { @@ -381,7 +381,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { @@ -392,7 +392,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-10-28" }, "meta-llama/Meta-Llama-3-70B": { @@ -403,7 +403,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-70B-Instruct": { @@ -414,7 +414,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B": { @@ -425,7 +425,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B-Instruct": { @@ -436,7 +436,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-18" }, "ibm-granite/granite-3b-code-instruct": { @@ -447,7 +447,7 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-8b-code-instruct": { @@ -458,7 +458,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-20b-code-instruct": { @@ -469,7 +469,7 @@ "size": 20, "act_param": 20, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-34b-code-instruct": { @@ -480,7 +480,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-3b-code-base": { @@ -491,7 +491,7 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-8b-code-base": { @@ -502,7 +502,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-20b-code-base": { @@ -513,7 +513,7 @@ "size": 20, "act_param": 20, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "ibm-granite/granite-34b-code-base": { @@ -524,7 +524,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-06" }, "claude-3-haiku-20240307": { @@ -535,7 +535,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-03-07" }, "claude-3-sonnet-20240229": { @@ -546,7 +546,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-02-29" }, "claude-3-opus-20240229": { @@ -557,7 +557,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-02-29" }, "01-ai/Yi-1.5-34B-Chat": { @@ -568,7 +568,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-20" }, "01-ai/Yi-1.5-34B": { @@ -579,7 +579,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-20" }, "01-ai/Yi-1.5-9B-Chat": { @@ -590,7 +590,7 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": 
False, + "prefill": True, "date": "2024-05-20" }, "01-ai/Yi-1.5-9B": { @@ -601,7 +601,7 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-20" }, "01-ai/Yi-1.5-6B-Chat": { @@ -612,7 +612,7 @@ "size": 6, "act_param": 6, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-20" }, "01-ai/Yi-1.5-6B": { @@ -623,7 +623,7 @@ "size": 6, "act_param": 6, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-20" }, "Qwen/Qwen2-57B-A14B": { @@ -634,7 +634,7 @@ "size": 57, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-07" }, "Qwen/Qwen2-7B-Instruct": { @@ -645,7 +645,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-07" }, "Qwen/Qwen2-72B-Chat": { @@ -656,7 +656,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-07" }, "gemini-1.5-pro": { @@ -667,7 +667,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-14" }, "gemini-1.5-flash": { @@ -678,7 +678,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-14" }, "m-a-p/OpenCodeInterpreter-DS-33B": { @@ -689,7 +689,7 @@ "size": 33, "act_param": 33, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-6.7B": { @@ -700,7 +700,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-1.3B": { @@ -711,7 +711,7 @@ "size": 1.3, "act_param": 1.3, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-02-22" }, "microsoft/Phi-3-medium-128k-instruct": { @@ -722,7 +722,7 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-21" }, "microsoft/Phi-3-small-128k-instruct": { @@ -733,7 +733,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-21" }, "codestral-2405": { @@ -744,7 +744,7 @@ "size": 22, "act_param": 22, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-23" }, "codestral-mamba-2407": { @@ -755,7 +755,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-16" }, "mistralai/Mistral-7B-Instruct-v0.3": { @@ -766,7 +766,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-22" }, "mistralai/Mistral-7B-v0.3": { @@ -777,7 +777,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-22" }, "CohereForAI/c4ai-command-r-plus": { @@ -788,7 +788,7 @@ "size": 104, "act_param": 104, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-04" }, "deepseek-coder": { @@ -799,7 +799,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": { @@ -810,7 +810,7 @@ "size": 16, "act_param": 2.4, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Base": { @@ -821,7 +821,7 @@ "size": 16, "act_param": 2.4, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-17" }, 
"claude-3-5-sonnet-20240620": { @@ -832,7 +832,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-06-20" }, "NousResearch/Hermes-2-Theta-Llama-3-70B": { @@ -843,7 +843,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-24" }, "microsoft/wavecoder-ultra-6.7b": { @@ -854,7 +854,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-12-26" }, "google/gemma-2-9b-it": { @@ -865,7 +865,7 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-19" }, "Bin12345/AutoCoder": { @@ -876,7 +876,7 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-23" }, "Bin12345/AutoCoder_S_6.7B": { @@ -887,7 +887,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-23" }, "Bin12345/AutoCoder_QW_7B": { @@ -898,7 +898,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-23" }, "SenseLLM/ReflectionCoder-DS-33B": { @@ -909,7 +909,7 @@ "size": 33, "act_param": 33, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-DS-6.7B": { @@ -920,7 +920,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-34B": { @@ -931,7 +931,7 @@ "size": 34, "act_param": 34, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-7B": { @@ -942,7 +942,7 @@ "size": 7, "act_param": 7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-05-27" }, "new-microsoft/Phi-3-mini-128k-instruct": { @@ -953,7 +953,7 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-21" }, "old-microsoft/Phi-3-mini-128k-instruct": { @@ -964,7 +964,7 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-21" }, "internlm/internlm2_5-7b-chat": { @@ -975,7 +975,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-03" }, "NousResearch/Hermes-2-Pro-Llama-3-70B": { @@ -986,7 +986,7 @@ "size": 70, "act_param": 70, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-06-27" }, "new-deepseek-chat": { @@ -997,7 +997,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-06-28" }, "vllm-google/gemma-2-27b-it": { @@ -1008,7 +1008,7 @@ "size": 27, "act_param": 27, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-06-19" }, "Artigenz/Artigenz-Coder-DS-6.7B": { @@ -1019,7 +1019,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-16" }, "openchat/openchat-3.6-8b-20240522": { @@ -1030,7 +1030,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-22" }, "Phind/Phind-CodeLlama-34B-v2": { @@ -1041,7 +1041,7 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2023-08-25" }, "yi-large": { @@ -1052,7 +1052,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + 
"prefill": True, "date": "2024-05-13" }, "THUDM/codegeex4-all-9b": { @@ -1063,7 +1063,7 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-05" }, "gpt-4o-mini-2024-07-18": { @@ -1074,7 +1074,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-18" }, "Nexusflow/Athene-70B": { @@ -1085,7 +1085,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-20" }, "NTQAI/Nxcode-CQ-7B-orpo": { @@ -1096,7 +1096,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-25" }, "migtissera/Llama-3-70B-Synthia-v3.5": { @@ -1107,7 +1107,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-05-27" }, "migtissera/Tess-v2.5.2-Qwen2-72B": { @@ -1118,7 +1118,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-18" }, "WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": { @@ -1129,7 +1129,7 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-02-10" }, "mistral-large-2407": { @@ -1140,7 +1140,7 @@ "size": 123, "act_param": 123, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-07-24" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { @@ -1151,7 +1151,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-23" }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { @@ -1162,7 +1162,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-23" }, "meta--llama-3.1-405b-instruct": { @@ -1173,7 +1173,7 @@ "size": 405, "act_param": 405, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-23" }, "deepseek-coder-20240724": { @@ -1184,7 +1184,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-24" }, "microsoft/Phi-3.5-mini-instruct": { @@ -1195,7 +1195,7 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-23" }, "nv-mistralai--mistral-nemo-12b-instruct": { @@ -1206,7 +1206,7 @@ "size": 12, "act_param": 12, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-07-18" }, "wyt2000/InverseCoder-CL-13B": { @@ -1217,7 +1217,7 @@ "size": 13, "act_param": 13, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-07-08" }, "wyt2000/InverseCoder-CL-7B": { @@ -1228,7 +1228,7 @@ "size": 7, "act_param": 7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-07-08" }, "wyt2000/InverseCoder-DS-6.7B": { @@ -1239,7 +1239,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-07-08" }, "gemini-1.5-pro-exp-0801": { @@ -1250,7 +1250,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-08-01" }, "gpt-4o-2024-08-06": { @@ -1261,7 +1261,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-06" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { @@ -1272,7 +1272,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-23" }, "abacusai/Dracarys-72B-Instruct": { @@ -1283,7 +1283,7 @@ 
"size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-23" }, "gemini-1.5-pro-exp-0827": { @@ -1294,7 +1294,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-27" }, "gemini-1.5-flash-exp-0827": { @@ -1305,7 +1305,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-27" }, "microsoft/Phi-3.5-mini-instruct": { @@ -1316,7 +1316,7 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-23" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { @@ -1327,7 +1327,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-23" }, "abacusai/Dracarys-72B-Instruct": { @@ -1338,7 +1338,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-04-23" }, "deepseek-coder-v2.5": { @@ -1349,7 +1349,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-18" }, "CohereForAI/c4ai-command-r-08-2024": { @@ -1360,7 +1360,7 @@ "size": 32.3, "act_param": 32.3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-30" }, "CohereForAI/c4ai-command-r-plus-08-2024": { @@ -1371,7 +1371,7 @@ "size": 104, "act_param": 104, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-30" }, "ayueei--yue-coder-9b-preview": { @@ -1382,7 +1382,7 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-04" }, # "mattshumer/ref_70_e3_prefill": { @@ -1411,7 +1411,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-09-12" }, "o1-mini-2024-09-12": { @@ -1422,7 +1422,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-09-12" }, "Qwen/Qwen2.5-Coder-1.5B-Instruct": { @@ -1433,7 +1433,7 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-12" }, "Qwen/Qwen2.5-Coder-7B-Instruct": { @@ -1444,7 +1444,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-12" }, "gemini-1.5-pro-002": { @@ -1455,7 +1455,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-09-25" }, "mistralai/Mistral-Small-Instruct-2409": { @@ -1466,7 +1466,7 @@ "size": 22.2, "act_param": 22.2, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-18" }, "Qwen/Qwen2.5-0.5B-Instruct": { @@ -1477,7 +1477,7 @@ "size": 0.5, "act_param": 0.5, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "Qwen/Qwen2.5-1.5B-Instruct": { @@ -1488,7 +1488,7 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "Qwen/Qwen2.5-7B-Instruct": { @@ -1499,7 +1499,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "Qwen/Qwen2.5-14B-Instruct": { @@ -1510,7 +1510,7 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "Qwen/Qwen2.5-32B-Instruct": { @@ -1521,7 +1521,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": 
"2024-09-19" }, "Qwen/Qwen2.5-72B-Instruct": { @@ -1532,7 +1532,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "meta-llama/Llama-3.2-1B-Instruct": { @@ -1543,7 +1543,7 @@ "size": 1, "act_param": 1, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-25" }, "meta-llama/Llama-3.2-3B-Instruct": { @@ -1554,7 +1554,7 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-25" }, "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { @@ -1565,7 +1565,7 @@ "size": 70, "act_param": 70, "open-data": "Partial", - "reasoning": False, + "prefill": True, "date": "2024-09-25" }, "claude-3-5-sonnet-20241022": { @@ -1576,7 +1576,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-10-22" }, "ibm-granite/granite-3.0-8b-instruct": { @@ -1587,7 +1587,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-10-21" }, "ibm-granite/granite-3.0-2b-instruct": { @@ -1598,7 +1598,7 @@ "size": 2, "act_param": 2, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-10-21" }, "grok-beta--main": { @@ -1609,7 +1609,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-03-17" }, "claude-3-5-haiku-20241022--main": { @@ -1620,7 +1620,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-10-22" }, "Qwen/Qwen2.5-Coder-14B-Instruct--main": { @@ -1631,7 +1631,7 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "Qwen/Qwen2.5-Coder-32B-Instruct--main": { @@ -1642,7 +1642,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-09-19" }, "infly/OpenCoder-1.5B-Instruct--main": { @@ -1653,7 +1653,7 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-09" }, "infly/OpenCoder-8B-Instruct--main": { @@ -1664,7 +1664,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-09" }, "microsoft/Phi-3.5-mini-instruct--main": { @@ -1675,7 +1675,7 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-08-21" }, "Nexusflow/Athene-V2-Agent--main": { @@ -1686,7 +1686,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-14" }, "Nexusflow/Athene-V2-Chat--main": { @@ -1697,7 +1697,7 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-14" }, "gemini-exp-1114--main": { @@ -1708,7 +1708,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-11-14" }, "gpt-4o-2024-11-20--main": { @@ -1719,7 +1719,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-11-20" }, "gemini-exp-1121--main": { @@ -1730,7 +1730,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-21" }, "gemini-exp-1206--main": { @@ -1741,7 +1741,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-12-06" }, "meta-llama--Llama-3.3-70B-Instruct--main": { @@ 
-1752,7 +1752,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-12-19" }, "deepseek-ai--DeepSeek-V2.5-1210--main": { @@ -1763,7 +1763,7 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-12-10" }, "gemini-2.0-flash-exp--main": { @@ -1774,7 +1774,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-12-11" }, "gemini-2.0-flash-thinking-exp-1219--main": { @@ -1785,7 +1785,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-12-19" }, "gemini-2.0-flash-thinking-exp-01-21--main": { @@ -1796,7 +1796,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-21" }, "o1-2024-12-17--main": { @@ -1807,7 +1807,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-12-17" }, "o1-2024-12-17--low--main": { @@ -1818,7 +1818,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-12-17" }, "o1-2024-12-17--high--main": { @@ -1829,7 +1829,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-12-17" }, "deepseek-v3-chat--main": { @@ -1840,7 +1840,7 @@ "size": 671, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2024-12-26" }, "microsoft--phi-4--main": { @@ -1851,7 +1851,7 @@ "size": 14.7, "act_param": 14.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-12-13" }, "deepseek-reasoner--main": { @@ -1862,7 +1862,7 @@ "size": 671, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": { @@ -1873,7 +1873,7 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": { @@ -1884,7 +1884,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": { @@ -1895,7 +1895,7 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": { @@ -1906,7 +1906,7 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": { @@ -1917,7 +1917,7 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": { @@ -1928,7 +1928,7 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-20" }, "mistralai/Mistral-Small-24B-Instruct-2501--main": { @@ -1939,7 +1939,7 @@ "size": 24, "act_param": 24, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-31" }, "o3-mini-2025-01-31--medium--main": { @@ -1950,7 +1950,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-01-31" }, "o3-mini-2025-01-31--low--main": { @@ -1961,7 +1961,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-01-31" }, 
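
Downstream, get_results.py (diffed near the top of this patch) reads the renamed flag and the new date field defensively and forwards both to the leaderboard dataset. A minimal sketch of that consumption, with a placeholder model key:

info = model_info["claude-3-7-sonnet-20250219--main"]  # placeholder key
results_row = {
    "date": info.get("date", None),
    "prefill": info.get("prefill", False),  # missing key defaults to False
}
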
"o3-mini-2025-01-31--high--main": { @@ -1972,7 +1972,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-01-31" }, "gemini-2.0-flash-001--main": { @@ -1983,7 +1983,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-02-05" }, "gemini-2.0-flash-exp--main": { @@ -1994,7 +1994,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-02-05" }, "gemini-2.0-flash-lite-preview-02-05--main": { @@ -2005,7 +2005,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-02-05" }, "gemini-2.0-pro-exp-02-05--main": { @@ -2016,7 +2016,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-02-05" }, "NovaSky-AI--Sky-T1-32B-Flash--main": { @@ -2027,7 +2027,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-12" }, "NovaSky-AI--Sky-T1-32B-Preview--main": { @@ -2038,7 +2038,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-01-12" }, "Qwen--QwQ-32B-Preview--main": { @@ -2049,7 +2049,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2024-11-28" }, "claude-3-7-sonnet-20250219--main": { @@ -2060,7 +2060,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-02-19" }, "chatgpt-4o-latest--main": { @@ -2071,7 +2071,7 @@ "moe": False, "size": None, "act_param": None, - "reasoning": True, + "prefill": False, "date": "2025-01-29" }, "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { @@ -2082,7 +2082,7 @@ "moe": True, "size": 23, "act_param": 4, - "reasoning": False, + "prefill": True, "date": "2025-01-25" }, "qwen-max-latest--main": { @@ -2093,7 +2093,7 @@ "moe": True, "size": None, "act_param": None, - "reasoning": True, + "prefill": False, "date": "2025-01-28" }, "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { @@ -2104,7 +2104,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-02-19" }, "claude-3-7-sonnet-20250219--main": { @@ -2115,7 +2115,7 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-02-19" }, "WarriorCoder-6.7B--main": { @@ -2127,7 +2127,7 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-02-18" }, "google--gemma-3-27b-it--main": { @@ -2139,7 +2139,7 @@ "size": 27, "act_param": 27, "open-data": "None", - "reasoning": False, + "prefill": True, "date": "2025-03-12" }, "Qwen--QwQ-32B--skip_prefill--main": { @@ -2151,7 +2151,7 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-03-06" }, "deepseek-chat-0324--main": { @@ -2163,7 +2163,7 @@ "size": 671, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-03-24" }, "gemini-2.5-pro-exp-03-25--main": { @@ -2175,7 +2175,31 @@ "size": None, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, "date": "2025-03-25" - } + }, + "meta/llama-4-scout-17b-16e-instruct--main": { + "name": "Llama-4-Scout", + "link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", + "open-data": "None", + "prompted": True, + "moe": True, + 
"size": 109, + "act_param": 17, + "open-data": "None", + "prefill": False, + "date": "2025-04-05" + }, + "meta/llama-4-maverick-17b-128e-instruct--main": { + "name": "Llama-4-Maverick", + "link": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 109, + "act_param": 17, + "open-data": "None", + "prefill": False, + "date": "2025-04-05" + }, } \ No newline at end of file From 1bf199d4a88d73b0940c973a4b6d2c1d86503179 Mon Sep 17 00:00:00 2001 From: Alessandro Giagnorio Date: Thu, 10 Apr 2025 21:28:53 +0200 Subject: [PATCH 19/24] Fix nltk resource download --- tools/fix_v025.py | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 tools/fix_v025.py diff --git a/tools/fix_v025.py b/tools/fix_v025.py new file mode 100644 index 0000000..902fad1 --- /dev/null +++ b/tools/fix_v025.py @@ -0,0 +1,135 @@ +from datasets import load_dataset +from huggingface_hub import HfApi + +BIGCODEBENCH_HF = "bigcode/bigcodebench" +BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" +BIGCODEBENCH_VERSION = "v0.1.4" +BIGCODEBENCH_UPDATE = "bigcode/bcb_update" +BIGCODEBENCH_NEW_VERSION = "v0.1.5" + +def map_ds(sample): + if sample["task_id"] in ["BigCodeBench/332"]: + sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" + ) + + if sample["task_id"] in ["BigCodeBench/334"]: + sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" + ) + + if sample["task_id"] in ["BigCodeBench/376"]: + sample['code_prompt'] = sample['code_prompt'].replace( + "import nltk\n", + "import nltk\nnltk.download('stopwords')\n", + 1 + ) + sample['complete_prompt'] = sample['complete_prompt'].replace( + "import nltk\n", + "import nltk\nnltk.download('stopwords')\n", + 1 + ) + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\nimport nltk\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" + ) + + if sample["task_id"] in ["BigCodeBench/383"]: + sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" + ) + + if sample["task_id"] in ["BigCodeBench/633"]: + sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = 
sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\n" + ) + + if sample["task_id"] in ["BigCodeBench/635"]: + sample['code_prompt'] = sample['code_prompt'].replace( + "# Importing the required libraries", + "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" + ) + + sample['complete_prompt'] = sample['complete_prompt'].replace( + "# Importing the required libraries", + "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" + ) + + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "# Importing the required libraries", + "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" + ) + + if sample["task_id"] in ["BigCodeBench/849"]: + sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\n" + ) + + if sample["task_id"] in ["BigCodeBench/940"]: + sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" + ) + + if sample["task_id"] in ["BigCodeBench/1109"]: + sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] + sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] + sample['instruct_prompt'] = sample['instruct_prompt'].replace( + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" + ) + + return sample + +if __name__ == "__main__": + api = HfApi() + ds_dict = load_dataset(BIGCODEBENCH_HF) + hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) + ds = ds_dict[BIGCODEBENCH_VERSION] + hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] + function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109] + + new_ds = ds.map(map_ds) + new_ds.to_json("BigCodeBench.jsonl") + ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds + ds_dict.push_to_hub(BIGCODEBENCH_HF) + + new_hard_ds = hard_ds.map(map_ds) + new_hard_ds.to_json("BigCodeBench-Hard.jsonl") + hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds + hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) + + for i in function_id: + old_sample = ds.select([i]) + new_sample = new_ds.select([i]) + old_sample.to_json("old.jsonl") + new_sample.to_json("new.jsonl") + api.upload_file( + path_or_fileobj="old.jsonl", + path_in_repo=f"{i}/old.jsonl", + repo_id=BIGCODEBENCH_UPDATE, + # repo_type="dataset" + ) + api.upload_file( + path_or_fileobj="new.jsonl", + path_in_repo=f"{i}/new.jsonl", + repo_id=BIGCODEBENCH_UPDATE, + # repo_type="dataset" + ) \ No newline at end of file
From 821f3a54e5b3fd285cfe8b32956dbff56b432a86 Mon Sep 17 00:00:00 2001 From: Alessandro Giagnorio Date: Thu, 10 Apr 2025 21:49:30 +0200 Subject: [PATCH 20/24] Update instruction prompts with nltk fix --- tools/fix_v025.py | 8 ++++---- 1 file changed,
4 insertions(+), 4 deletions(-) diff --git a/tools/fix_v025.py b/tools/fix_v025.py index 902fad1..edbeb71 100644 --- a/tools/fix_v025.py +++ b/tools/fix_v025.py @@ -53,7 +53,7 @@ def map_ds(sample): sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] sample['instruct_prompt'] = sample['instruct_prompt'].replace( "\nYou should write self-contained code starting with:\n```\n", - "\nYou should write self-contained code starting with:\n```\n" + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" ) if sample["task_id"] in ["BigCodeBench/635"]: @@ -68,8 +68,8 @@ def map_ds(sample): ) sample['instruct_prompt'] = sample['instruct_prompt'].replace( - "# Importing the required libraries", - "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" + "\nYou should write self-contained code starting with:\n```\n", + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" ) if sample["task_id"] in ["BigCodeBench/849"]: @@ -77,7 +77,7 @@ def map_ds(sample): sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] sample['instruct_prompt'] = sample['instruct_prompt'].replace( "\nYou should write self-contained code starting with:\n```\n", - "\nYou should write self-contained code starting with:\n```\n" + "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" ) if sample["task_id"] in ["BigCodeBench/940"]: From bb082968da15445403a1a33b3cb238b5c9531b47 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 11 Apr 2025 18:04:45 +0800 Subject: [PATCH 21/24] feat: support reasoning for grok-3-mini --- bigcodebench/gen/util/openai_request.py | 2 +- bigcodebench/generate.py | 2 +- bigcodebench/provider/openai.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py index f8db3f5..f14f6d1 100644 --- a/bigcodebench/gen/util/openai_request.py +++ b/bigcodebench/gen/util/openai_request.py @@ -17,7 +17,7 @@ def make_request( kwargs["top_p"] = 0.95 kwargs["max_completion_tokens"] = max_tokens kwargs["temperature"] = temperature - if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top-p and max_completion_tokens + if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini"]): # pop top-p and max_completion_tokens kwargs.pop("top_p") kwargs.pop("max_completion_tokens") kwargs.pop("temperature") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 912abcd..7eeecfc 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -197,7 +197,7 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): + if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini"]): model = model + f"--{reasoning_effort}" if lora_path: diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py index 12790f6..8b187d1 100644 --- a/bigcodebench/provider/openai.py +++ b/bigcodebench/provider/openai.py @@ -28,7 +28,7 @@ def codegen( tokenizer=None, ) for prompt in prompts] # use concurrency based batching for o1 and deepseek models - if 
self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"): + if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini"]): return self._codegen_batch_via_concurrency(messages, num_samples) return self._codegen_api_batch(messages, num_samples) @@ -49,6 +49,7 @@ def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str] reasoning_effort=self.reasoning_effort, n=num_samples, ) + print(ret) outputs = [] for item in ret.choices: outputs.append(item.message.content) From 33ed54d44343b1e61c1f2a777833130a0b57b2b7 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 11 Apr 2025 23:03:57 +0800 Subject: [PATCH 22/24] fix: update grok3 name for reasoning --- bigcodebench/gen/util/openai_request.py | 2 +- bigcodebench/generate.py | 2 +- bigcodebench/provider/openai.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py index f14f6d1..3c8b741 100644 --- a/bigcodebench/gen/util/openai_request.py +++ b/bigcodebench/gen/util/openai_request.py @@ -17,7 +17,7 @@ def make_request( kwargs["top_p"] = 0.95 kwargs["max_completion_tokens"] = max_tokens kwargs["temperature"] = temperature - if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini"]): # pop top-p and max_completion_tokens + if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): # pop top-p and max_completion_tokens kwargs.pop("top_p") kwargs.pop("max_completion_tokens") kwargs.pop("temperature") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 7eeecfc..adbf892 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -197,7 +197,7 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini"]): + if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): model = model + f"--{reasoning_effort}" if lora_path: diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py index 8b187d1..046e13e 100644 --- a/bigcodebench/provider/openai.py +++ b/bigcodebench/provider/openai.py @@ -28,7 +28,7 @@ def codegen( tokenizer=None, ) for prompt in prompts] # use concurrency based batching for o1 and deepseek models - if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini"]): + if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): return self._codegen_batch_via_concurrency(messages, num_samples) return self._codegen_api_batch(messages, num_samples) From 10c8327f3ef5d94644e84233b7400e3fb3178e4d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 11 Apr 2025 23:05:02 +0800 Subject: [PATCH 23/24] add models --- analysis/utils.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/analysis/utils.py b/analysis/utils.py index 9aa7203..20ecbf5 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -2202,4 +2202,52 @@ "prefill": False, "date": "2025-04-05" }, + "agentica-org/DeepCoder-14B-Preview--main": { + "name": "DeepCoder-14B-Preview", + "link": 
"https://huggingface.co/agentica-org/DeepCoder-14B-Preview", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 14, + "act_param": 14, + "open-data": "None", + "prefill": True, + "date": "2025-04-09" + }, + "openrouter/quasar-alpha--main": { + "name": "Quasar-Alpha", + "link": "https://openrouter.ai/openrouter/quasar-alpha", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-04-02" + }, + "agentica-org/DeepCoder-14B-Preview--skip_prefill--main": { + "name": "DeepCoder-14B-Preview (w/ Reasoning, 64k tokens, temperature=0.6)", + "link": "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", + "open-data": "None", + "prompted": True, + "moe": False, + "size": 14, + "act_param": 14, + "open-data": "None", + "prefill": False, + "date": "2025-04-09" + }, + "openrouter/optimus-alpha--main": { + "name": "Optimus-Alpha", + "link": "https://openrouter.ai/openrouter/optimus-alpha", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-04-10" + } } \ No newline at end of file From 77b286f79e43a898b52ab96f48e25fa96fab843d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 11 Apr 2025 23:09:06 +0800 Subject: [PATCH 24/24] fix: rm printout --- bigcodebench/provider/openai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py index 046e13e..ff1459f 100644 --- a/bigcodebench/provider/openai.py +++ b/bigcodebench/provider/openai.py @@ -49,7 +49,6 @@ def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str] reasoning_effort=self.reasoning_effort, n=num_samples, ) - print(ret) outputs = [] for item in ret.choices: outputs.append(item.message.content)