diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 4f48eca..c0905ba 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -50,6 +50,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluation - `--n_samples`: The number of samples, default to `1` - `--temperature`: The temperature, default to `0.0` - `--max_new_tokens`: The length of max new tokens, default to `1280` +- `--max_model_len`: The maximum model context length for the vLLM backend, default to `12800` - `--greedy`: Whether to use greedy decoding, default to `False` - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2 - `--direct_completion`: Whether to use direct completion, default to `False` @@ -69,7 +70,8 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluation - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False` - `--samples`: The path to the generated samples file, default to `None` - `--no_execute`: Whether to not execute the samples, default to `False` -- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--e2b_endpoint`: The E2B endpoint for remote execution, default to `bigcodebench_evaluator`; you can also deploy your own E2B evaluator based on the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo +- `--gradio_endpoint`: The Gradio API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` - `--calibrated`: Whether to use the calibrated samples, default to `True` - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True` diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile index 90e7f40..8b2cdcd 100755 --- a/Docker/Evaluate.Dockerfile +++ b/Docker/Evaluate.Dockerfile @@ -54,7 +54,7 @@ RUN pip install \ rich \ accelerate \ anthropic \ - google-generativeai \ + google-genai \ mistralai \ openai \ e2b diff --git a/README.md b/README.md index 94ad2ef..d3913d9 100755 --- a/README.md +++ b/README.md @@ -187,7 +187,7 @@ Please make sure your HF access token has the `Make calls to inference providers` ## 💻 LLM-generated Code We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: -* See the attachment of our [v0.2.1.post7](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.1.post7). We include `sanitized_samples_calibrated.zip` for your convenience. +* See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. 
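For reference, a minimal sketch of how the `--gradio_endpoint` route is typically exercised from Python, assuming the `gradio_client` package; the exact parameter names and route are listed under `Use via API` on the Space page, so the call below is illustrative only:

```python
from gradio_client import Client

# Point at the hosted evaluator, or at your own clone of the Space.
client = Client("https://bigcode-bigcodebench-evaluator.hf.space/")

# Illustrative arguments; check "Use via API" on the Space page for the
# exact signature exposed by your evaluator revision.
result = client.predict(
    "complete",                             # split
    "full",                                 # subset
    "samples-sanitized_calibrated.jsonl",   # generated samples file
    api_name="/predict",
)
print(result)
```

`bigcodebench.evaluate` wraps this call for you; going through `gradio_client` directly is mainly useful when debugging a self-hosted evaluator.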
## 🧑 Advanced Usage diff --git a/analysis/get_results.py b/analysis/get_results.py index fc5aa17..641c43b 100755 --- a/analysis/get_results.py +++ b/analysis/get_results.py @@ -4,7 +4,7 @@ import numpy as np from numpy import mean from glob import glob -from utils import * +from utils import model_info from tqdm import tqdm import pandas as pd import itertools @@ -48,6 +48,8 @@ def get_results(tids): "moe": info["moe"], "size": info["size"], "act_param": info["act_param"], + "date": info.get("date", None), + "prefill": info.get("prefill", False), # "direct_complete": info["direct_complete"], } @@ -118,12 +120,12 @@ def check_valid(results): def split_gen(): - shutil.rmtree("sanitized_samples", ignore_errors=True) shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) - os.makedirs("sanitized_samples/complete", exist_ok=True) - os.makedirs("sanitized_samples/instruct", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) + for model, info in model_info.items(): model = model.replace("/", "--") files = glob(f"results/{model}--bigcodebench-*.jsonl") @@ -131,27 +133,21 @@ def split_gen(): model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") for file in files: + if "-sanitized" not in file or "calibrated" not in file: + continue + _, suffix = os.path.basename(file).split("--bigcodebench-") with open(file, "r") as f: data = f.readlines() - if "-sanitized" in file: - if "calibrated" in file: - if info["prompted"]: - if suffix.startswith("complete"): - with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) + split_type = "hard" if "-hard-" in file else "full" + if info["prompted"]: + if suffix.startswith("complete") or suffix.startswith("hard-complete"): + with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) else: - if suffix.startswith("complete"): - with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - + with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) def read_task_perf(tids, task="complete"): model_results = dict() @@ -255,7 +251,7 @@ def get_solve_rate(data_dict, task="complete"): def get_hf_ds(results): hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [], - "complete": [], "instruct": []} + "complete": [], "instruct": [], "date": [], "prefill": []} for model, result in results.items(): hf_dataset["model"].append(model) @@ -267,6 +263,8 @@ def get_hf_ds(results): # hf_dataset["lazy"].append(result["lazy"]) hf_dataset["complete"].append(result["pass@1"]["complete"]) hf_dataset["instruct"].append(result["pass@1"]["instruct"]) + hf_dataset["date"].append(result["date"]) + 
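The reworked `split_gen` above keeps only sanitized, calibrated generations and fans them out over a `{full,hard} × {complete,instruct}` directory tree. A condensed restatement of that routing rule, as a sketch (the example filename is illustrative, following the identifier format produced by `generate.py`):

```python
import os
from typing import Optional

def route(path: str, prompted: bool) -> Optional[str]:
    # Only sanitized, calibrated generations from prompted (instruction-tuned)
    # models are kept; everything else is skipped.
    if "-sanitized" not in path or "calibrated" not in path or not prompted:
        return None
    _, suffix = os.path.basename(path).split("--bigcodebench-")
    split_type = "hard" if "-hard-" in path else "full"
    task = "complete" if suffix.startswith(("complete", "hard-complete")) else "instruct"
    return f"sanitized_calibrated_samples/{split_type}/{task}"

# e.g. route("results/m--main--bigcodebench-hard-complete--vllm-0-1-sanitized_calibrated.jsonl", True)
# -> "sanitized_calibrated_samples/hard/complete"
```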
hf_dataset["prefill"].append(result["prefill"]) # hf_dataset["direct_complete"].append(result["direct_complete"]) return Dataset.from_dict(hf_dataset) @@ -302,7 +300,7 @@ def get_perf_df(data_dict): if __name__ == "__main__": - # split_gen() + split_gen() bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") bcb_config = { diff --git a/analysis/utils.py b/analysis/utils.py index 430e113..20ecbf5 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -7,7 +7,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-12-04", }, "bigcode/starcoder2-15b-instruct-v0.1": { "name": "StarCoder2-15B-Instruct-v0.1", @@ -17,7 +18,8 @@ "size": 15, "act_param": 15, "open-data": "Full", - "reasoning": False, + "prefill": True, + "date": "2024-04-30" }, "bigcode/starcoder2-3b": { "name": "StarCoder2-3B", @@ -27,7 +29,8 @@ "size": 3, "act_param": 3, "open-data": "Full", - "reasoning": False, + "prefill": True, + "date": "2024-02-29" }, "bigcode/starcoder2-7b": { "name": "StarCoder2-7B", @@ -37,7 +40,8 @@ "size": 7, "act_param": 7, "open-data": "Full", - "reasoning": False, + "prefill": True, + "date": "2024-02-29" }, "bigcode/starcoder2-15b": { "name": "StarCoder2-15B", @@ -47,7 +51,8 @@ "size": 15, "act_param": 15, "open-data": "Full", - "reasoning": False, + "prefill": True, + "date": "2024-02-29" }, "Qwen/CodeQwen1.5-7B": { "name": "CodeQwen1.5-7B", @@ -57,7 +62,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-16" }, "google/codegemma-2b": { "name": "CodeGemma-2B", @@ -67,7 +73,8 @@ "size": 2, "act_param": 2, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-10" }, "google/codegemma-7b": { "name": "CodeGemma-7B", @@ -77,7 +84,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-10" }, "google/codegemma-7b-it": { "name": "CodeGemma-7B-Instruct", @@ -87,7 +95,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-10" }, "gpt-3.5-turbo-0125": { "name": "GPT-3.5-Turbo-0125", @@ -97,7 +106,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-01-25" }, "gpt-4o": { "name": "GPT-4o-2024-05-13", @@ -107,7 +117,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-13" }, "gpt-4-turbo-2024-04-09": { "name": "GPT-4-Turbo-2024-04-09", @@ -117,7 +128,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-09" }, "gpt-4-0613": { "name": "GPT-4-0613", @@ -127,7 +139,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-06-13" }, "codellama/CodeLlama-7b-hf": { "name": "CodeLlama-7B-Base", @@ -137,7 +150,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-hf": { "name": "CodeLlama-13B-Base", @@ -147,7 +161,8 @@ "size": 13, "act_param": 13, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "codellama/CodeLlama-7b-Instruct-hf": { "name": "CodeLlama-7B-Instruct", @@ -157,7 +172,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, 
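The `reasoning` → `prefill`/`date` migration repeated through the rest of this file follows one fixed pattern; a quick invariant check over the finished `model_info` dict might look like this (a sketch, not part of the PR):

```python
from analysis.utils import model_info

# Every entry should now carry a boolean "prefill" flag (can the assistant
# turn be prefilled?) and a "YYYY-MM-DD" release date, with the retired
# "reasoning" key gone.
for model, info in model_info.items():
    assert isinstance(info.get("prefill"), bool), f"{model}: missing prefill"
    assert len(info.get("date", "")) == 10, f"{model}: missing/odd date"
    assert "reasoning" not in info, f"{model}: stale reasoning key"
```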
"codellama/CodeLlama-13b-Instruct-hf": { "name": "CodeLlama-13B-Instruct", @@ -167,7 +183,8 @@ "size": 13, "act_param": 13, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "mistral-large-2402": { "name": "Mistral-Large-2402", @@ -177,7 +194,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-02-26" }, "mistral-small-2402": { "name": "Mistral-Small-2402", @@ -187,7 +205,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-02-26" }, "mistralai/Mixtral-8x22B-v0.1": { "name": "Mixtral-8x22B-Base", @@ -197,7 +216,8 @@ "size": 176, "act_param": 44, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-17" }, "mistralai/Mixtral-8x22B-Instruct-v0.1": { "name": "Mixtral-8x22B-Instruct", @@ -207,7 +227,8 @@ "size": 176, "act_param": 44, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-17" }, "codellama/CodeLlama-34b-hf": { "name": "CodeLlama-34B-Base", @@ -217,7 +238,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "codellama/CodeLlama-34b-Instruct-hf": { "name": "CodeLlama-34B-Instruct", @@ -227,7 +249,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-hf": { "name": "CodeLlama-70B-Base", @@ -237,7 +260,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-Instruct-hf": { "name": "CodeLlama-70B-Instruct", @@ -247,7 +271,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "Qwen/CodeQwen1.5-7B-Chat": { "name": "CodeQwen1.5-7B-Chat", @@ -257,7 +282,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-16" }, "Qwen/Qwen1.5-110B-Chat": { "name": "Qwen1.5-110B-Chat", @@ -267,7 +293,8 @@ "size": 110, "act_param": 110, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-26" }, "Qwen/Qwen1.5-72B-Chat": { "name": "Qwen1.5-72B-Chat", @@ -277,7 +304,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-26" }, "Qwen/Qwen1.5-32B-Chat": { "name": "Qwen1.5-32B-Chat", @@ -287,7 +315,8 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-26" }, "deepseek-ai/DeepSeek-V2-Chat": { "name": "DeepSeek-V2-Chat", @@ -297,7 +326,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-06" }, "deepseek-ai/deepseek-coder-1.3b-base": { "name": "DeepSeek-Coder-1.3B-Base", @@ -307,7 +337,8 @@ "size": 1.3, "act_param": 1.3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-1.3b-instruct": { "name": "DeepSeek-Coder-1.3B-Instruct", @@ -317,7 +348,8 @@ "size": 1.3, "act_param": 1.3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-base": { "name": "DeepSeek-Coder-33B-Base", @@ -327,7 +359,8 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-instruct": { "name": 
"DeepSeek-Coder-33B-Instruct", @@ -337,7 +370,8 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-base": { "name": "DeepSeek-Coder-6.7B-Base", @@ -347,7 +381,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { "name": "DeepSeek-Coder-6.7B-Instruct", @@ -357,7 +392,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-10-28" }, "meta-llama/Meta-Llama-3-70B": { "name": "Llama-3-70B-Base", @@ -367,7 +403,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-70B-Instruct": { "name": "Llama-3-70B-Instruct", @@ -377,7 +414,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B": { "name": "Llama-3-8B-Base", @@ -387,7 +425,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B-Instruct": { "name": "Llama-3-8B-Instruct", @@ -397,7 +436,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-18" }, "ibm-granite/granite-3b-code-instruct": { "name": "Granite-Code-3B-Instruct", @@ -407,7 +447,8 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-instruct": { "name": "Granite-Code-8B-Instruct", @@ -417,7 +458,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-instruct": { "name": "Granite-Code-20B-Instruct", @@ -427,7 +469,8 @@ "size": 20, "act_param": 20, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-instruct": { "name": "Granite-Code-34B-Instruct", @@ -437,7 +480,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-3b-code-base": { "name": "Granite-Code-3B-Base", @@ -447,7 +491,8 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-base": { "name": "Granite-Code-8B-Base", @@ -457,7 +502,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-base": { "name": "Granite-Code-20B-Base", @@ -467,7 +513,8 @@ "size": 20, "act_param": 20, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-base": { "name": "Granite-Code-34B-Base", @@ -477,7 +524,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-06" }, "claude-3-haiku-20240307": { "name": "Claude-3-Haiku-20240307", @@ -487,7 +535,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-03-07" }, "claude-3-sonnet-20240229": { "name": "Claude-3-Sonnet-20240229", @@ -497,7 +546,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-02-29" }, "claude-3-opus-20240229": { 
"name": "Claude-3-Opus-20240229", @@ -507,7 +557,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-02-29" }, "01-ai/Yi-1.5-34B-Chat": { "name": "Yi-1.5-34B-Chat", @@ -517,7 +568,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "01-ai/Yi-1.5-34B": { "name": "Yi-1.5-34B", @@ -527,7 +579,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B-Chat": { "name": "Yi-1.5-9B-Chat", @@ -537,7 +590,8 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B": { "name": "Yi-1.5-9B", @@ -547,7 +601,8 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B-Chat": { "name": "Yi-1.5-6B-Chat", @@ -557,7 +612,8 @@ "size": 6, "act_param": 6, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B": { "name": "Yi-1.5-6B", @@ -567,7 +623,8 @@ "size": 6, "act_param": 6, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-20" }, "Qwen/Qwen2-57B-A14B": { "name": "Qwen2-57B-A14B", @@ -577,7 +634,8 @@ "size": 57, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-07" }, "Qwen/Qwen2-7B-Instruct": { "name": "Qwen2-7B-Instruct", @@ -587,7 +645,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-07" }, "Qwen/Qwen2-72B-Chat": { "name": "Qwen2-72B-Chat", @@ -597,7 +656,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-07" }, "gemini-1.5-pro": { "name": "Gemini-1.5-Pro-API-0514", @@ -607,7 +667,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-14" }, "gemini-1.5-flash": { "name": "Gemini-1.5-Flash-API-0514", @@ -617,7 +678,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-14" }, "m-a-p/OpenCodeInterpreter-DS-33B": { "name": "OpenCodeInterpreter-DS-33B", @@ -627,7 +689,8 @@ "size": 33, "act_param": 33, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-6.7B": { "name": "OpenCodeInterpreter-DS-6.7B", @@ -637,7 +700,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-1.3B": { "name": "OpenCodeInterpreter-DS-1.3B", @@ -647,7 +711,8 @@ "size": 1.3, "act_param": 1.3, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-02-22" }, "microsoft/Phi-3-medium-128k-instruct": { "name": "Phi-3-Medium-128K-Instruct", @@ -657,7 +722,8 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-21" }, "microsoft/Phi-3-small-128k-instruct": { "name": "Phi-3-Small-128K-Instruct", @@ -667,7 +733,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-21" }, "codestral-2405": { "name": "Codestral-22B-v0.1", @@ -677,7 +744,8 @@ "size": 22, "act_param": 22, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-23" }, "codestral-mamba-2407": { "name": 
"Codestral-Mamba", @@ -687,7 +755,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-16" }, "mistralai/Mistral-7B-Instruct-v0.3": { "name": "Mistral-7B-Instruct-v0.3", @@ -697,7 +766,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-22" }, "mistralai/Mistral-7B-v0.3": { "name": "Mistral-7B-v0.3", @@ -707,7 +777,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-22" }, "CohereForAI/c4ai-command-r-plus": { "name": "Command R+", @@ -717,7 +788,8 @@ "size": 104, "act_param": 104, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-04" }, "deepseek-coder": { "name": "DeepSeek-Coder-V2-Instruct", @@ -727,7 +799,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": { "name": "DeepSeek-Coder-V2-Lite-Instruct", @@ -737,7 +810,8 @@ "size": 16, "act_param": 2.4, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Base": { "name": "DeepSeek-Coder-V2-Lite-Base", @@ -747,7 +821,8 @@ "size": 16, "act_param": 2.4, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-17" }, "claude-3-5-sonnet-20240620": { "name": "Claude-3.5-Sonnet-20240620", @@ -757,7 +832,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-06-20" }, "NousResearch/Hermes-2-Theta-Llama-3-70B": { "name": "Hermes-2-Theta-Llama-3-70B", @@ -767,7 +843,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-24" }, "microsoft/wavecoder-ultra-6.7b": { "name": "WaveCoder-Ultra-6.7B", @@ -777,7 +854,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-12-26" }, "google/gemma-2-9b-it": { "name": "Gemma-2-9B-Instruct", @@ -787,7 +865,8 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-19" }, "Bin12345/AutoCoder": { "name": "AutoCoder", @@ -797,7 +876,8 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-23" }, "Bin12345/AutoCoder_S_6.7B": { "name": "AutoCoder-S-6.7B", @@ -807,7 +887,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-23" }, "Bin12345/AutoCoder_QW_7B": { "name": "AutoCoder-QW-7B", @@ -817,7 +898,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-23" }, "SenseLLM/ReflectionCoder-DS-33B": { "name": "ReflectionCoder-DS-33B", @@ -827,7 +909,8 @@ "size": 33, "act_param": 33, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-DS-6.7B": { "name": "ReflectionCoder-DS-6.7B", @@ -837,7 +920,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-34B": { "name": "ReflectionCoder-CL-34B", @@ -847,7 +931,8 @@ "size": 34, "act_param": 34, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-7B": { "name": "ReflectionCoder-CL-7B", @@ -857,7 +942,8 @@ "size": 7, 
"act_param": 7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-05-27" }, "new-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3.1-Mini-128K-Instruct", @@ -867,7 +953,8 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-21" }, "old-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3-Mini-128K-Instruct", @@ -877,7 +964,8 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-21" }, "internlm/internlm2_5-7b-chat": { "name": "InternLM2.5-7B-Chat", @@ -887,7 +975,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-03" }, "NousResearch/Hermes-2-Pro-Llama-3-70B": { "name": "Hermes-2-Pro-Llama-3-70B", @@ -897,7 +986,8 @@ "size": 70, "act_param": 70, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-06-27" }, "new-deepseek-chat": { "name": "DeepSeek-V2-Chat (2024-06-28)", @@ -907,7 +997,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-06-28" }, "vllm-google/gemma-2-27b-it": { "name": "Gemma-2-27B-Instruct", @@ -917,7 +1008,8 @@ "size": 27, "act_param": 27, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-06-19" }, "Artigenz/Artigenz-Coder-DS-6.7B": { "name": "Artigenz-Coder-DS-6.7B", @@ -927,7 +1019,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-16" }, "openchat/openchat-3.6-8b-20240522": { "name": "OpenChat-3.6-8B-20240522", @@ -937,7 +1030,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-22" }, "Phind/Phind-CodeLlama-34B-v2": { "name": "Phind-CodeLlama-34B-v2", @@ -947,7 +1041,8 @@ "size": 34, "act_param": 34, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2023-08-25" }, "yi-large": { "name": "Yi-Large", @@ -957,7 +1052,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-13" }, "THUDM/codegeex4-all-9b": { "name": "CodeGeex4-All-9B", @@ -967,7 +1063,8 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-05" }, "gpt-4o-mini-2024-07-18": { "name": "GPT-4o-mini-2024-07-18", @@ -977,7 +1074,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-18" }, "Nexusflow/Athene-70B": { "name": "Athene-70B", @@ -987,7 +1085,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-20" }, "NTQAI/Nxcode-CQ-7B-orpo": { "name": "Nxcode-CQ-7B-Orpo", @@ -997,7 +1096,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-25" }, "migtissera/Llama-3-70B-Synthia-v3.5": { "name": "Llama-3-70B-Synthia-v3.5", @@ -1007,7 +1107,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-05-27" }, "migtissera/Tess-v2.5.2-Qwen2-72B": { "name": "Tess-v2.5.2-Qwen2-72B", @@ -1017,7 +1118,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-18" }, "WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": { "name": "WhiteRabbitNeo-33B-v1.5", @@ -1027,7 +1129,8 @@ "size": 33, "act_param": 33, "open-data": "None", - "reasoning": False, + 
"prefill": True, + "date": "2024-02-10" }, "mistral-large-2407": { "name": "Mistral-Large-Instruct-2407", @@ -1037,7 +1140,8 @@ "size": 123, "act_param": 123, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-07-24" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { "name": "Llama-3.1-8B-Instruct", @@ -1047,7 +1151,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-23" }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "name": "Llama-3.1-70B-Instruct", @@ -1057,7 +1162,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-23" }, "meta--llama-3.1-405b-instruct": { "name": "Llama-3.1-405B-Instruct", @@ -1067,7 +1173,8 @@ "size": 405, "act_param": 405, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-23" }, "deepseek-coder-20240724": { "name": "DeepSeek-Coder-V2-Instruct (2024-07-24)", @@ -1077,7 +1184,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-24" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1087,7 +1195,8 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-23" }, "nv-mistralai--mistral-nemo-12b-instruct": { "name": "Mistral-Nemo-12B-Instruct", @@ -1097,7 +1206,8 @@ "size": 12, "act_param": 12, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-07-18" }, "wyt2000/InverseCoder-CL-13B": { "name": "InverseCoder-CL-13B", @@ -1107,7 +1217,8 @@ "size": 13, "act_param": 13, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-07-08" }, "wyt2000/InverseCoder-CL-7B": { "name": "InverseCoder-CL-7B", @@ -1117,7 +1228,8 @@ "size": 7, "act_param": 7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-07-08" }, "wyt2000/InverseCoder-DS-6.7B": { "name": "InverseCoder-DS-6.7B", @@ -1127,7 +1239,8 @@ "size": 6.7, "act_param": 6.7, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-07-08" }, "gemini-1.5-pro-exp-0801": { "name": "Gemini-1.5-Pro-Exp-0801", @@ -1137,7 +1250,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-08-01" }, "gpt-4o-2024-08-06": { "name": "GPT-4o-2024-08-06", @@ -1147,7 +1261,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-06" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1157,7 +1272,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1167,7 +1283,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-23" }, "gemini-1.5-pro-exp-0827": { "name": "Gemini-1.5-Pro-Exp-0827", @@ -1177,7 +1294,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-27" }, "gemini-1.5-flash-exp-0827": { "name": "Gemini-1.5-Flash-Exp-0827", @@ -1187,7 +1305,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-27" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1197,7 +1316,8 @@ "size": 3.8, "act_param": 3.8, "open-data": 
"None", - "reasoning": False, + "prefill": True, + "date": "2024-04-23" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1207,7 +1327,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1217,7 +1338,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-04-23" }, "deepseek-coder-v2.5": { "name": "DeepSeek-V2.5", @@ -1227,7 +1349,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-18" }, "CohereForAI/c4ai-command-r-08-2024": { "name": "C4AI-Command-R-08-2024", @@ -1237,7 +1360,8 @@ "size": 32.3, "act_param": 32.3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-30" }, "CohereForAI/c4ai-command-r-plus-08-2024": { "name": "C4AI-Command-R-Plus-08-2024", @@ -1247,7 +1371,8 @@ "size": 104, "act_param": 104, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-30" }, "ayueei--yue-coder-9b-preview": { "name": "Yi-Coder-9B-Chat", @@ -1257,7 +1382,8 @@ "size": 9, "act_param": 9, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-04" }, # "mattshumer/ref_70_e3_prefill": { # "name": "Reflection-Llama-3.1-70B", @@ -1285,7 +1411,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-09-12" }, "o1-mini-2024-09-12": { "name": "o1-Mini-2024-09-12 (temperature=1)", @@ -1295,7 +1422,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-09-12" }, "Qwen/Qwen2.5-Coder-1.5B-Instruct": { "name": "Qwen2.5-Coder-1.5B-Instruct", @@ -1305,7 +1433,8 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-12" }, "Qwen/Qwen2.5-Coder-7B-Instruct": { "name": "Qwen2.5-Coder-7B-Instruct", @@ -1315,7 +1444,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-12" }, "gemini-1.5-pro-002": { "name": "Gemini-1.5-Pro-002", @@ -1325,7 +1455,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-09-25" }, "mistralai/Mistral-Small-Instruct-2409": { "name": "Mistral-Small-Instruct-2409", @@ -1335,7 +1466,8 @@ "size": 22.2, "act_param": 22.2, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-18" }, "Qwen/Qwen2.5-0.5B-Instruct": { "name": "Qwen2.5-0.5B-Instruct", @@ -1345,7 +1477,8 @@ "size": 0.5, "act_param": 0.5, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-1.5B-Instruct": { "name": "Qwen2.5-1.5B-Instruct", @@ -1355,7 +1488,8 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-7B-Instruct": { "name": "Qwen2.5-7B-Instruct", @@ -1365,7 +1499,8 @@ "size": 7, "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-14B-Instruct": { "name": "Qwen2.5-14B-Instruct", @@ -1375,7 +1510,8 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-32B-Instruct": { "name": "Qwen2.5-32B-Instruct", @@ -1385,7 +1521,8 @@ "size": 32, "act_param": 32, 
"open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-72B-Instruct": { "name": "Qwen2.5-72B-Instruct", @@ -1395,7 +1532,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "meta-llama/Llama-3.2-1B-Instruct": { "name": "Llama-3.2-1B-Instruct", @@ -1405,7 +1543,8 @@ "size": 1, "act_param": 1, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-25" }, "meta-llama/Llama-3.2-3B-Instruct": { "name": "Llama-3.2-3B-Instruct", @@ -1415,7 +1554,8 @@ "size": 3, "act_param": 3, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-25" }, "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { "name": "Llama-3.1-Nemotron-70B-Instruct", @@ -1425,7 +1565,8 @@ "size": 70, "act_param": 70, "open-data": "Partial", - "reasoning": False, + "prefill": True, + "date": "2024-09-25" }, "claude-3-5-sonnet-20241022": { "name": "Claude-3.5-Sonnet-20241022", @@ -1435,7 +1576,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-10-22" }, "ibm-granite/granite-3.0-8b-instruct": { "name": "Granite-3.0-8B-Instruct", @@ -1445,7 +1587,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-10-21" }, "ibm-granite/granite-3.0-2b-instruct": { "name": "Granite-3.0-2B-Instruct", @@ -1455,7 +1598,8 @@ "size": 2, "act_param": 2, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-10-21" }, "grok-beta--main": { "name": "Grok-Beta", @@ -1465,7 +1609,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-03-17" }, "claude-3-5-haiku-20241022--main": { "name": "Claude-3.5-Haiku-20241022", @@ -1475,7 +1620,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-10-22" }, "Qwen/Qwen2.5-Coder-14B-Instruct--main": { "name": "Qwen2.5-Coder-14B-Instruct", @@ -1485,7 +1631,8 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "Qwen/Qwen2.5-Coder-32B-Instruct--main": { "name": "Qwen2.5-Coder-32B-Instruct", @@ -1495,7 +1642,8 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-09-19" }, "infly/OpenCoder-1.5B-Instruct--main": { "name": "OpenCoder-1.5B-Instruct", @@ -1505,7 +1653,8 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-09" }, "infly/OpenCoder-8B-Instruct--main": { "name": "OpenCoder-8B-Instruct", @@ -1515,7 +1664,8 @@ "size": 8, "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-09" }, "microsoft/Phi-3.5-mini-instruct--main": { "name": "Phi-3.5-Mini-Instruct", @@ -1525,7 +1675,8 @@ "size": 3.8, "act_param": 3.8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-08-21" }, "Nexusflow/Athene-V2-Agent--main": { "name": "Athene-V2-Agent", @@ -1535,7 +1686,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-14" }, "Nexusflow/Athene-V2-Chat--main": { "name": "Athene-V2-Chat", @@ -1545,7 +1697,8 @@ "size": 72, "act_param": 72, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-14" }, "gemini-exp-1114--main": { "name": "Gemini-Exp-1114", @@ -1555,7 +1708,8 @@ "size": 
None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-11-14" }, "gpt-4o-2024-11-20--main": { "name": "GPT-4o-2024-11-20", @@ -1565,7 +1719,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-11-20" }, "gemini-exp-1121--main": { "name": "Gemini-Exp-1121", @@ -1575,7 +1730,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-21" }, "gemini-exp-1206--main": { "name": "Gemini-Exp-1206", @@ -1585,7 +1741,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-12-06" }, "meta-llama--Llama-3.3-70B-Instruct--main": { "name": "Llama-3.3-70B-Instruct", @@ -1595,7 +1752,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-12-19" }, "deepseek-ai--DeepSeek-V2.5-1210--main": { "name": "DeepSeek-V2.5-1210", @@ -1605,7 +1763,8 @@ "size": 236, "act_param": 21, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-12-10" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1615,7 +1774,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-12-11" }, "gemini-2.0-flash-thinking-exp-1219--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-1219", @@ -1625,7 +1785,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-12-19" }, "gemini-2.0-flash-thinking-exp-01-21--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-01-21", @@ -1635,7 +1796,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-21" }, "o1-2024-12-17--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=medium)", @@ -1645,7 +1807,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-12-17" }, "o1-2024-12-17--low--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=low)", @@ -1655,7 +1818,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-12-17" }, "o1-2024-12-17--high--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=high)", @@ -1665,17 +1829,19 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-12-17" }, "deepseek-v3-chat--main": { - "name": "DeepSeek-V3-Chat", - "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-Chat", + "name": "DeepSeek-V3", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3", "prompted": True, "moe": True, "size": 671, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2024-12-26" }, "microsoft--phi-4--main": { "name": "Phi-4", @@ -1685,7 +1851,8 @@ "size": 14.7, "act_param": 14.7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-12-13" }, "deepseek-reasoner--main": { "name": "DeepSeek-R1", @@ -1695,7 +1862,8 @@ "size": 671, "act_param": 37, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": { "name": "DeepSeek-R1-Distill-Llama-70B", @@ -1705,7 +1873,8 @@ "size": 70, "act_param": 70, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": { "name": 
"DeepSeek-R1-Distill-Qwen-32B", @@ -1715,7 +1884,8 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": { "name": "DeepSeek-R1-Distill-Qwen-14B", @@ -1725,27 +1895,30 @@ "size": 14, "act_param": 14, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": { "name": "DeepSeek-R1-Distill-Llama-8B", "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 8, + "act_param": 8, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": { "name": "DeepSeek-R1-Distill-Qwen-7B", "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 7, + "act_param": 7, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": { "name": "DeepSeek-R1-Distill-Qwen-1.5B", @@ -1755,7 +1928,8 @@ "size": 1.5, "act_param": 1.5, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-20" }, "mistralai/Mistral-Small-24B-Instruct-2501--main": { "name": "Mistral-Small-24B-Instruct-2501", @@ -1765,7 +1939,8 @@ "size": 24, "act_param": 24, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--medium--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=medium)", @@ -1775,7 +1950,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2025-01-31" }, "o3-mini-2025-01-31--low--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=low)", @@ -1785,7 +1961,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2025-01-31" }, "o3-mini-2025-01-31--high--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=high)", @@ -1795,7 +1972,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": True, + "prefill": False, + "date": "2025-01-31" }, "gemini-2.0-flash-001--main": { "name": "Gemini-2.0-Flash-001", @@ -1805,7 +1983,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-02-05" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1815,7 +1994,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-02-05" }, "gemini-2.0-flash-lite-preview-02-05--main": { "name": "Gemini-2.0-Flash-Lite-Preview-02-05", @@ -1825,7 +2005,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-02-05" }, "gemini-2.0-pro-exp-02-05--main": { "name": "Gemini-2.0-Pro-Exp-02-05", @@ -1835,7 +2016,8 @@ "size": None, "act_param": None, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-02-05" }, "NovaSky-AI--Sky-T1-32B-Flash--main": { "name": "Sky-T1-32B-Flash", @@ -1845,7 +2027,8 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2025-01-12" }, "NovaSky-AI--Sky-T1-32B-Preview--main": { "name": "Sky-T1-32B-Preview", @@ -1855,7 +2038,8 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": 
True, + "date": "2025-01-12" }, "Qwen--QwQ-32B-Preview--main": { "name": "QwQ-32B-Preview", @@ -1865,6 +2049,205 @@ "size": 32, "act_param": 32, "open-data": "None", - "reasoning": False, + "prefill": True, + "date": "2024-11-28" + }, + "chatgpt-4o-latest--main": { + "name": "ChatGPT-4o-latest-20250129", + "link": "https://chat.openai.com/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "prefill": False, + "date": "2025-01-29" + }, + "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { + "name": "KwaiCoder-23B-A4B-v1", + "link": "https://huggingface.co/Kwaipilot/KwaiCoder-23B-A4B-v1", + "open-data": "None", + "prompted": False, + "moe": True, + "size": 23, + "act_param": 4, + "prefill": True, + "date": "2025-01-25" + }, + "qwen-max-latest--main": { + "name": "Qwen2.5-Max", + "link": "https://qwenlm.github.io/blog/qwen2.5-max/", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "prefill": False, + "date": "2025-01-28" + }, + "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { + "name": "Claude-3.7-Sonnet-20250219 (temperature=1, length=12800, reasoning=3200)", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-02-19" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3.7-Sonnet-20250219", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-02-19" + }, + "WarriorCoder-6.7B--main": { + "name": "WarriorCoder-6.7B (Reproduced)", + "link": "https://arxiv.org/abs/2412.17395", + "open-data": "None", + "prompted": True, + "moe": False, + "size": 6.7, + "act_param": 6.7, + "prefill": True, + "date": "2025-02-18" + }, + "google--gemma-3-27b-it--main": { + "name": "Gemma-3-27B-Instruct", + "link": "https://huggingface.co/google/gemma-3-27b-it", + "open-data": "None", + "prompted": True, + "moe": False, + "size": 27, + "act_param": 27, + "prefill": True, + "date": "2025-03-12" + }, + "Qwen--QwQ-32B--skip_prefill--main": { + "name": "QwQ-32B (w/ Reasoning)", + "link": "https://huggingface.co/Qwen/QwQ-32B", + "open-data": "None", + "prompted": True, + "moe": False, + "size": 32, + "act_param": 32, + "prefill": False, + "date": "2025-03-06" + }, + "deepseek-chat-0324--main": { + "name": "DeepSeek-V3-0324", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 671, + "act_param": 37, + "prefill": False, + "date": "2025-03-24" + }, + "gemini-2.5-pro-exp-03-25--main": { + "name": "Gemini-2.5-Pro-Exp-03-25", + "link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "prefill": False, + "date": "2025-03-25" + }, + "meta/llama-4-scout-17b-16e-instruct--main": { + "name": "Llama-4-Scout", + "link":
"https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 109, + "act_param": 17, + "open-data": "None", + "prefill": False, + "date": "2025-04-05" + }, + "meta/llama-4-maverick-17b-128e-instruct--main": { + "name": "Llama-4-Maverick", + "link": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 109, + "act_param": 17, + "open-data": "None", + "prefill": False, + "date": "2025-04-05" }, -} + "agentica-org/DeepCoder-14B-Preview--main": { + "name": "DeepCoder-14B-Preview", + "link": "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", + "open-data": "None", + "prompted": True, + "moe": True, + "size": 14, + "act_param": 14, + "open-data": "None", + "prefill": True, + "date": "2025-04-09" + }, + "openrouter/quasar-alpha--main": { + "name": "Quasar-Alpha", + "link": "https://openrouter.ai/openrouter/quasar-alpha", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-04-02" + }, + "agentica-org/DeepCoder-14B-Preview--skip_prefill--main": { + "name": "DeepCoder-14B-Preview (w/ Reasoning, 64k tokens, temperature=0.6)", + "link": "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", + "open-data": "None", + "prompted": True, + "moe": False, + "size": 14, + "act_param": 14, + "open-data": "None", + "prefill": False, + "date": "2025-04-09" + }, + "openrouter/optimus-alpha--main": { + "name": "Optimus-Alpha", + "link": "https://openrouter.ai/openrouter/optimus-alpha", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "open-data": "None", + "prefill": False, + "date": "2025-04-10" + } +} \ No newline at end of file diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e53feab..f6d18fd 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -16,7 +16,19 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: try: signal.signal(signal.SIGALRM, handler) signal.alarm(100) - ret = client.messages.create(*args, **kwargs) + if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: + kwargs["thinking"] = { + "type": "enabled", + "budget_tokens": kwargs["reasoning_budget"], + } + kwargs["betas"] = [kwargs["reasoning_beta"]] + kwargs.pop("reasoning_budget") + kwargs.pop("reasoning_beta") + kwargs.pop("temperature") + if "thinking" in kwargs: + ret = client.beta.messages.create(*args, **kwargs, stream=True) + else: + ret = client.messages.create(*args, **kwargs) signal.alarm(0) except anthropic.RateLimitError: print("Rate limit exceeded. 
Waiting...") diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 9e13607..5a76362 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -1,11 +1,12 @@ import time -import google.generativeai as genai +from google import genai from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted def make_request( - client: genai.GenerativeModel, + model: str, + client: genai.Client, message: str, temperature: float, n: int, @@ -13,21 +14,34 @@ def make_request( ) -> genai.types.GenerateContentResponse: kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} - if "-thinking-" in client.model_name: + if "-thinking-" in model: kwargs.pop("max_output_tokens") - - response = client.generate_content( - [{"role": "user", "parts": [message]}], - generation_config=genai.types.GenerationConfig( + + response = client.models.generate_content( + model=model, + contents=message, + config=genai.types.GenerateContentConfig( candidate_count=n, + safety_settings=[ + genai.types.SafetySetting( + category='HARM_CATEGORY_DANGEROUS_CONTENT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_SEXUALLY_EXPLICIT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HATE_SPEECH', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HARASSMENT', + threshold='BLOCK_NONE' + ), + ], **kwargs - ), - safety_settings=[ - {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, - ], + ), ) return response diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py index f8db3f5..3c8b741 100644 --- a/bigcodebench/gen/util/openai_request.py +++ b/bigcodebench/gen/util/openai_request.py @@ -17,7 +17,7 @@ def make_request( kwargs["top_p"] = 0.95 kwargs["max_completion_tokens"] = max_tokens kwargs["temperature"] = temperature - if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top-p and max_completion_tokens + if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): # pop top-p and max_completion_tokens kwargs.pop("top_p") kwargs.pop("max_completion_tokens") kwargs.pop("temperature") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index bcf1463..adbf892 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -127,12 +127,19 @@ def run_codegen( split: str, subset: str, root: str = "bcb_results", + lora_path: str = None, bs: Optional[int] = None, n_samples: int = 1, temperature: float = 0.0, max_new_tokens: int = 1280, + # vllm + max_model_len: int = 12800, greedy: bool = False, + # openai reasoning_effort: str = "medium", + # anthropic + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", strip_newlines: bool = False, direct_completion: bool = False, resume: bool = True, @@ -170,9 +177,13 @@ def run_codegen( backend=backend, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, reasoning_effort=reasoning_effort, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, 
instruction_prefix=instruction_prefix, response_prefix=response_prefix, prefill=not skip_prefill, @@ -186,9 +197,15 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): + if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): model = model + f"--{reasoning_effort}" - + + if lora_path: + model = model + f"--lora-{lora_path}" + + if backend == "anthropic" and reasoning_budget and reasoning_beta: + model = model + f"--{reasoning_budget}-{reasoning_beta}" + if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" else: diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index c78d870..4cb3410 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -6,11 +6,16 @@ def make_model( backend: str, subset: str, split: str, + lora_path: str = None, dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, - # o1 and o3 only + max_model_len: int = 12800, + # openai only reasoning_effort: str = "medium", + # anthropic only + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", # instruction model only instruction_prefix: str = None, response_prefix: str = None, @@ -35,8 +40,10 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, revision=revision, dataset=dataset, direct_completion=direct_completion, @@ -55,6 +62,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, @@ -118,6 +126,8 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, ) diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py index 1969e0c..b4a7e43 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -9,9 +9,11 @@ from bigcodebench.provider.utility import make_raw_chat_prompt class AnthropicDecoder(DecoderBase): - def __init__(self, name: str, **kwargs) -> None: + def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None: super().__init__(name, **kwargs) self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) + self.reasoning_budget = reasoning_budget + self.reasoning_beta = reasoning_beta def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -43,8 +45,20 @@ def codegen( max_tokens=self.max_new_tokens, temperature=self.temperature, stop_sequences=self.eos, + reasoning_budget=self.reasoning_budget, + reasoning_beta=self.reasoning_beta, ) - outputs.append(ret.content[0].text) + if isinstance(ret, anthropic.Stream): + output = "" + for chunk in ret: + if chunk.type == "content_block_delta": + # if chunk.delta.type == "thinking_delta": + # output += chunk.delta.thinking + if chunk.delta.type == "text_delta": + output += chunk.delta.text + outputs.append(output) + else: + outputs.append(ret.content[0].text) 
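Putting the pieces together (the kwargs rewrite in `make_auto_request` plus the stream handling above), the request the provider ends up issuing looks roughly like this; a sketch assuming an Anthropic SDK recent enough to expose the beta extended-thinking API, with values mirroring the defaults wired through `generate.py` in this PR:

```python
import anthropic

client = anthropic.Anthropic()
# Extended thinking drops temperature/top_p and streams the response.
stream = client.beta.messages.create(
    model="claude-3-7-sonnet-20250219",
    max_tokens=12800,
    thinking={"type": "enabled", "budget_tokens": 3200},
    betas=["output-128k-2025-02-19"],
    messages=[{"role": "user", "content": "..."}],
    stream=True,
)
text = ""
for chunk in stream:
    # Thinking deltas are discarded; only the final text blocks are kept.
    if chunk.type == "content_block_delta" and chunk.delta.type == "text_delta":
        text += chunk.delta.text
```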
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index 2194c47..e3b18ff 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -2,7 +2,7 @@ from typing import List
 
 from tqdm import tqdm
 
-import google.generativeai as genai
+from google import genai
 
 from bigcodebench.provider.base import DecoderBase
 from bigcodebench.gen.util.google_request import make_auto_request
@@ -12,8 +12,8 @@ class GoogleDecoder(DecoderBase):
     def __init__(self, name: str, **kwargs):
         super().__init__(name, **kwargs)
-        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
-        self.client = genai.GenerativeModel(name)
+        self.model = name
+        self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
 
     def codegen(
         self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
@@ -34,7 +34,8 @@ def codegen(
                 tokenizer=None,
             )
             ret = make_auto_request(
-                self.client,
+                model=self.model,
+                client=self.client,
                 message=message,
                 n=num_samples,
                 temperature=self.temperature,
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 12790f6..ff1459f 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -28,7 +28,7 @@ def codegen(
             tokenizer=None,
         ) for prompt in prompts]
         # use concurrency based batching for o1 and deepseek models
-        if self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"):
+        if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]):
             return self._codegen_batch_via_concurrency(messages, num_samples)
 
         return self._codegen_api_batch(messages, num_samples)
diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py
index cc928e4..41cd251 100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -3,6 +3,8 @@ from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+from huggingface_hub import snapshot_download
 
 from bigcodebench.provider.base import DecoderBase
 from bigcodebench.provider.utility import (
@@ -11,7 +13,7 @@
 )
 
 class VllmDecoder(DecoderBase):
-    def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
+    def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None:
         super().__init__(name, **kwargs)
 
         kwargs = {
@@ -29,7 +31,17 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
         else:
             if self.prefill and "```" in self.response_prefix:
                 self.eos += ["\n```\n"]
-        self.llm = LLM(model=name, max_model_len=self.max_new_tokens, **kwargs)
+
+        self.lora_request = None
+        if lora_path:
+            local_lora_path = snapshot_download(lora_path)
+            self.lora_request = LoRARequest(
+                "lora",
+                1,
+                local_lora_path,
+            )
+
+        self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs)
         self.llm.set_tokenizer(tokenizer=self.tokenizer)
 
     def is_direct_completion(self) -> bool:
@@ -64,6 +76,7 @@ def codegen(
                 stop=self.eos,
                 skip_special_tokens=self.skip_special_tokens,
             ),
+            lora_request=self.lora_request,
             use_tqdm=True,
        )
diff --git a/setup.cfg b/setup.cfg
index cc20139..5907add 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,7 +35,7 @@ install_requires =
     rich
     accelerate>=0.30.1
     anthropic>=0.26.1
-    google-generativeai>=0.5.4
+    google-genai
     mistralai>=0.2.0,<1.0.0
     openai>=1.11.1
     e2b
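The LoRA plumbing above follows vLLM's standard adapter flow: download the adapter, wrap it in a `LoRARequest`, enable LoRA on the engine, and pass the request at generation time. A condensed, self-contained sketch of the same pattern; the adapter repo id and base model here are hypothetical placeholders:

```python
# Sketch of the vLLM LoRA wiring added above; only the call pattern is
# taken from this patch, the model and adapter ids are placeholders.
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

local_path = snapshot_download("some-org/some-lora-adapter")  # hypothetical id
lora_request = LoRARequest("lora", 1, local_path)

llm = LLM(model="bigcode/starcoder2-7b", max_model_len=12800, enable_lora=True)
outputs = llm.generate(
    ["def task_func():"],
    SamplingParams(temperature=0.0, max_tokens=128),
    lora_request=lora_request,  # applied per generate() call
)
```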
diff --git a/tools/fix_v025.py b/tools/fix_v025.py
new file mode 100644
index 0000000..edbeb71
--- /dev/null
+++ b/tools/fix_v025.py
@@ -0,0 +1,135 @@
+from datasets import load_dataset
+from huggingface_hub import HfApi
+
+BIGCODEBENCH_HF = "bigcode/bigcodebench"
+BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard"
+BIGCODEBENCH_VERSION = "v0.1.4"
+BIGCODEBENCH_UPDATE = "bigcode/bcb_update"
+BIGCODEBENCH_NEW_VERSION = "v0.1.5"
+
+def map_ds(sample):
+    if sample["task_id"] in ["BigCodeBench/332"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/334"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/376"]:
+        sample['code_prompt'] = sample['code_prompt'].replace(
+            "import nltk\n",
+            "import nltk\nnltk.download('stopwords')\n",
+            1
+        )
+        sample['complete_prompt'] = sample['complete_prompt'].replace(
+            "import nltk\n",
+            "import nltk\nnltk.download('stopwords')\n",
+            1
+        )
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/383"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/633"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/635"]:
+        sample['code_prompt'] = sample['code_prompt'].replace(
+            "# Importing the required libraries",
+            "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n"
+        )
+        sample['complete_prompt'] = sample['complete_prompt'].replace(
+            "# Importing the required libraries",
+            "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n"
+        )
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/849"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/940"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+        )
+
+    if sample["task_id"] in ["BigCodeBench/1109"]:
+        sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+        sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+            "\nYou should write self-contained code starting with:\n```\n",
+            "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+        )
+
+    return sample
+
+if __name__ == "__main__":
+    api = HfApi()
+    ds_dict = load_dataset(BIGCODEBENCH_HF)
+    hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF)
+    ds = ds_dict[BIGCODEBENCH_VERSION]
+    hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION]
+    function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109]
+
+    new_ds = ds.map(map_ds)
+    new_ds.to_json("BigCodeBench.jsonl")
+    ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds
+    ds_dict.push_to_hub(BIGCODEBENCH_HF)
+
+    new_hard_ds = hard_ds.map(map_ds)
+    new_hard_ds.to_json("BigCodeBench-Hard.jsonl")
+    hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds
+    hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF)
+
+    for i in function_id:
+        old_sample = ds.select([i])
+        new_sample = new_ds.select([i])
+        old_sample.to_json("old.jsonl")
+        new_sample.to_json("new.jsonl")
+        api.upload_file(
+            path_or_fileobj="old.jsonl",
+            path_in_repo=f"{i}/old.jsonl",
+            repo_id=BIGCODEBENCH_UPDATE,
+            # repo_type="dataset"
+        )
+        api.upload_file(
+            path_or_fileobj="new.jsonl",
+            path_in_repo=f"{i}/new.jsonl",
+            repo_id=BIGCODEBENCH_UPDATE,
+            # repo_type="dataset"
+        )
\ No newline at end of file
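One quick way to sanity-check the pushed update is to reload the new version tag and inspect a patched task. This sketch assumes, as the script above implies by indexing `ds_dict` with version names and selecting rows by task number, that version tags double as dataset splits and that the row index equals the numeric task id in the full set:

```python
# Sketch: spot-check one patched task after the push above. Assumes the
# dataset exposes version tags (e.g. "v0.1.5") as splits, matching how
# the script indexes ds_dict by BIGCODEBENCH_VERSION.
from datasets import load_dataset

ds = load_dataset("bigcode/bigcodebench", split="v0.1.5")
sample = ds[332]  # row index equals the numeric task id in the full set
assert sample["task_id"] == "BigCodeBench/332"
assert "nltk.download('stopwords')" in sample["complete_prompt"]
```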