From 89309066c6e4e590c8a20c1392d504cb9e68917a Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Feb 2025 21:31:42 +0800 Subject: [PATCH 01/17] feat: support anthropic extended thinking --- bigcodebench/gen/util/anthropic_request.py | 13 ++++++++++++- bigcodebench/generate.py | 11 ++++++++++- bigcodebench/provider/__init__.py | 7 ++++++- bigcodebench/provider/anthropic.py | 6 +++++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e53feab..e240dee 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -16,7 +16,18 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: try: signal.signal(signal.SIGALRM, handler) signal.alarm(100) - ret = client.messages.create(*args, **kwargs) + if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: + ret = client.beta.messages.create( + *args, + **kwargs, + thinking = { + "type": "enabled", + "budget": kwargs["reasoning_budget"], + }, + betas=[kwargs["reasoning_beta"]] + ) + else: + ret = client.messages.create(*args, **kwargs) signal.alarm(0) except anthropic.RateLimitError: print("Rate limit exceeded. Waiting...") diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index bcf1463..9823d0c 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,7 +132,11 @@ def run_codegen( temperature: float = 0.0, max_new_tokens: int = 1280, greedy: bool = False, + # openai reasoning_effort: str = "medium", + # anthropic + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", strip_newlines: bool = False, direct_completion: bool = False, resume: bool = True, @@ -173,6 +177,8 @@ def run_codegen( temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, prefill=not skip_prefill, @@ -186,8 +192,11 @@ def run_codegen( ) extra = "-" + subset if subset != "full" else "" - if reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): + if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + + if backend == "anthropic" and reasoning_budget and reasoning_beta: + model = model + f"--{reasoning_budget}-{reasoning_beta}" if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index c78d870..f76ec29 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -9,8 +9,11 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, - # o1 and o3 only + # openai only reasoning_effort: str = "medium", + # anthropic only + reasoning_budget: int = 0, + reasoning_beta: str = "output-128k-2025-02-19", # instruction model only instruction_prefix: str = None, response_prefix: str = None, @@ -118,6 +121,8 @@ def make_model( split=split, temperature=temperature, max_new_tokens=max_new_tokens, + reasoning_budget=reasoning_budget, + reasoning_beta=reasoning_beta, instruction_prefix=instruction_prefix, response_prefix=response_prefix, ) 
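Note: the Anthropic request path introduced in this patch is refined by patches 02-04 below. A minimal sketch of the call shape the series converges on, assuming the `anthropic` SDK's beta messages API; the model name and token budgets here are illustrative:

    import os
    import anthropic

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
    # The beta API expects "budget_tokens" (not "budget"); temperature is
    # dropped because extended thinking rejects a custom temperature, and the
    # response is streamed.
    stream = client.beta.messages.create(
        model="claude-3-7-sonnet-20250219",  # illustrative
        max_tokens=12800,
        messages=[{"role": "user", "content": "..."}],
        thinking={"type": "enabled", "budget_tokens": 3200},
        betas=["output-128k-2025-02-19"],
        stream=True,
    )
    text = ""
    for chunk in stream:
        # patch 04 keeps only the text deltas, discarding thinking deltas
        if chunk.type == "content_block_delta" and chunk.delta.type == "text_delta":
            text += chunk.delta.text
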
diff --git a/bigcodebench/provider/anthropic.py index 1969e0c..1612456 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -9,9 +9,11 @@ from bigcodebench.provider.utility import make_raw_chat_prompt class AnthropicDecoder(DecoderBase): - def __init__(self, name: str, **kwargs) -> None: + def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None: super().__init__(name, **kwargs) self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) + self.reasoning_budget = reasoning_budget + self.reasoning_beta = reasoning_beta def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -43,6 +45,8 @@ def codegen( max_tokens=self.max_new_tokens, temperature=self.temperature, stop_sequences=self.eos, + reasoning_budget=self.reasoning_budget, + reasoning_beta=self.reasoning_beta, ) outputs.append(ret.content[0].text) all_outputs.append(outputs) From c05694cde596c9728664dbab2c8bed5e5ea9c036 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Feb 2025 21:41:23 +0800 Subject: [PATCH 02/17] fix: remove unused args --- bigcodebench/gen/util/anthropic_request.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index e240dee..20ce444 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -17,15 +17,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: signal.signal(signal.SIGALRM, handler) signal.alarm(100) if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: - ret = client.beta.messages.create( - *args, - **kwargs, - thinking = { - "type": "enabled", - "budget": kwargs["reasoning_budget"], - }, - betas=[kwargs["reasoning_beta"]] - ) + kwargs["thinking"] = { + "type": "enabled", + "budget": kwargs["reasoning_budget"], + } + kwargs["betas"] = [kwargs["reasoning_beta"]] + kwargs.pop("reasoning_budget") + kwargs.pop("reasoning_beta") + ret = client.beta.messages.create(*args, **kwargs) else: ret = client.messages.create(*args, **kwargs) signal.alarm(0) From 57eb973f34666067287cbb05e1845e16b87b5e26 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 26 Feb 2025 00:57:31 +0800 Subject: [PATCH 03/17] fix: correctly process anthropic streaming --- bigcodebench/gen/util/anthropic_request.py | 6 ++++-- bigcodebench/provider/anthropic.py | 12 +++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py index 20ce444..f6d18fd 100644 --- a/bigcodebench/gen/util/anthropic_request.py +++ b/bigcodebench/gen/util/anthropic_request.py @@ -19,12 +19,14 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: kwargs["thinking"] = { "type": "enabled", - "budget": kwargs["reasoning_budget"], + "budget_tokens": kwargs["reasoning_budget"], } kwargs["betas"] = [kwargs["reasoning_beta"]] kwargs.pop("reasoning_budget") kwargs.pop("reasoning_beta") - ret = client.beta.messages.create(*args, **kwargs) + kwargs.pop("temperature") + if "thinking" in kwargs: + ret = client.beta.messages.create(*args, **kwargs, stream=True) else: ret = client.messages.create(*args, **kwargs) signal.alarm(0) diff --git a/bigcodebench/provider/anthropic.py 
b/bigcodebench/provider/anthropic.py index 1612456..59aec09 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -48,7 +48,17 @@ def codegen( reasoning_budget=self.reasoning_budget, reasoning_beta=self.reasoning_beta, ) - outputs.append(ret.content[0].text) + if isinstance(ret, anthropic.Stream): + output = "" + for chunk in ret: + if chunk.type == "content_block_delta": + if chunk.delta.type == "thinking_delta": + output += chunk.delta.thinking + elif chunk.delta.type == "text_delta": + output += chunk.delta.text + outputs.append(output) + else: + outputs.append(ret.content[0].text) all_outputs.append(outputs) return all_outputs From 78dceb21430359efa05c235324e10523453d7d2f Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 26 Feb 2025 01:02:05 +0800 Subject: [PATCH 04/17] fix: only append text output --- bigcodebench/provider/anthropic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py index 59aec09..b4a7e43 100644 --- a/bigcodebench/provider/anthropic.py +++ b/bigcodebench/provider/anthropic.py @@ -52,9 +52,9 @@ def codegen( output = "" for chunk in ret: if chunk.type == "content_block_delta": - if chunk.delta.type == "thinking_delta": - output += chunk.delta.thinking - elif chunk.delta.type == "text_delta": + # if chunk.delta.type == "thinking_delta": + # output += chunk.delta.thinking + if chunk.delta.type == "text_delta": output += chunk.delta.text outputs.append(output) else: From 05b7f1f93355f2e64cc3576c4dd1f6c2dbdeab67 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 19:27:56 +0800 Subject: [PATCH 05/17] doc: fix endpoints --- ADVANCED_USAGE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 4f48eca..9bb81b8 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -69,7 +69,8 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False` - `--samples`: The path to the generated samples file, default to `None` - `--no_execute`: Whether to not execute the samples, default to `False` -- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--e2b_endpoint`: The API endpoint for remote execution, default to `bigcodebench_evaluator`, you can also use your own E2B API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page +- `--gradio_endpoint`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. 
`--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10` - `--calibrated`: Whether to use the calibrated samples, default to `True` - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True` From 0ecd667f74cd5f789b36e22dc8564f0fc1c09884 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:30:39 +0800 Subject: [PATCH 06/17] update the results analysis script --- analysis/get_results.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/analysis/get_results.py b/analysis/get_results.py index fc5aa17..607615a 100755 --- a/analysis/get_results.py +++ b/analysis/get_results.py @@ -118,12 +118,12 @@ def check_valid(results): def split_gen(): - shutil.rmtree("sanitized_samples", ignore_errors=True) shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) - os.makedirs("sanitized_samples/complete", exist_ok=True) - os.makedirs("sanitized_samples/instruct", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True) - os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) + os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) + for model, info in model_info.items(): model = model.replace("/", "--") files = glob(f"results/{model}--bigcodebench-*.jsonl") @@ -131,27 +131,21 @@ def split_gen(): model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") for file in files: + if "-sanitized" not in file or "calibrated" not in file: + continue + _, suffix = os.path.basename(file).split("--bigcodebench-") with open(file, "r") as f: data = f.readlines() - if "-sanitized" in file: - if "calibrated" in file: - if info["prompted"]: - if suffix.startswith("complete"): - with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) + split_type = "hard" if "-hard-" in file else "full" + if info["prompted"]: + if suffix.startswith("complete") or suffix.startswith("hard-complete"): + with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) else: - if suffix.startswith("complete"): - with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - else: - with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f: - f.writelines(data) - + with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: + f.writelines(data) def read_task_perf(tids, task="complete"): model_results = dict() @@ -302,7 +296,7 @@ def get_perf_df(data_dict): if __name__ == "__main__": - # split_gen() + split_gen() bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") bcb_config = { From f087e3b03ce1df72cf889b201b421bd90346d445 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 20:31:18 +0800 Subject: [PATCH 07/17] doc: add new model outputs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94ad2ef..d3913d9 100755 --- a/README.md +++ 
b/README.md @@ -187,7 +187,7 @@ Please make sure your HF access token has the `Make calls to inference providers ## 💻 LLM-generated Code We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: -* See the attachment of our [v0.2.1.post7](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.1.post7). We include `sanitized_samples_calibrated.zip` for your convenience. +* See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. ## 🧑 Advanced Usage From 6d967338737d4fa02cb2a8d19207528278282321 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:42:32 +0800 Subject: [PATCH 08/17] feat: support vllm lora --- bigcodebench/generate.py | 2 ++ bigcodebench/provider/__init__.py | 3 +++ bigcodebench/provider/vllm.py | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 9823d0c..c5fa368 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -127,6 +127,7 @@ def run_codegen( split: str, subset: str, root: str = "bcb_results", + lora_path: str = None, bs: Optional[int] = None, n_samples: int = 1, temperature: float = 0.0, @@ -174,6 +175,7 @@ def run_codegen( backend=backend, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, reasoning_effort=reasoning_effort, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index f76ec29..202d049 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -6,6 +6,7 @@ def make_model( backend: str, subset: str, split: str, + lora_path: str = None, dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, @@ -38,6 +39,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, @@ -58,6 +60,7 @@ def make_model( name=model, subset=subset, split=split, + lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, revision=revision, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index cc928e4..570d4c5 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -3,6 +3,8 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest +from huggingface_hub import snapshot_download from bigcodebench.provider.base import DecoderBase from bigcodebench.provider.utility import ( @@ -11,7 +13,7 @@ ) class VllmDecoder(DecoderBase): - def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: + def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None: super().__init__(name, **kwargs) kwargs = { @@ -29,7 +31,17 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None: else: if self.prefill and "```" in self.response_prefix: self.eos += ["\n```\n"] - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, **kwargs) + + self.lora_request = None + if lora_path: + local_lora_path = snapshot_download(lora_path) + self.lora_request = LoRARequest( + "lora", + 1, + local_lora_path, + ) + + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) 
self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: @@ -64,6 +76,7 @@ def codegen( stop=self.eos, skip_special_tokens=self.skip_special_tokens, ), + lora_request=self.lora_request, use_tqdm=True, ) From 82fc40dfe33381b8bdbe5c695414afa5a543ba16 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:50:48 +0800 Subject: [PATCH 09/17] fix: vllm lora attribute --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 570d4c5..25f00b4 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_path else False, **kwargs) + self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From d37847db62972decb626645699e403ed237b0d73 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 2 Mar 2025 21:57:21 +0800 Subject: [PATCH 10/17] fix: customize lora output file --- bigcodebench/generate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index c5fa368..87b67ea 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -197,9 +197,12 @@ def run_codegen( if backend == "openai" and reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): model = model + f"--{reasoning_effort}" + if lora_path: + model = model + f"--lora-{lora_path}" + if backend == "anthropic" and reasoning_budget and reasoning_beta: model = model + f"--{reasoning_budget}-{reasoning_beta}" - + if skip_prefill: identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" else: From fa21527b1fdd727fd6f629408e16a65813231823 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:02:05 +0800 Subject: [PATCH 11/17] feat: add model release date --- analysis/utils.py | 252 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 250 insertions(+), 2 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index 430e113..ec774c7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -8,6 +8,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-12-04", }, "bigcode/starcoder2-15b-instruct-v0.1": { "name": "StarCoder2-15B-Instruct-v0.1", @@ -18,6 +19,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-04-30" }, "bigcode/starcoder2-3b": { "name": "StarCoder2-3B", @@ -28,6 +30,7 @@ "act_param": 3, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-7b": { "name": "StarCoder2-7B", @@ -38,6 +41,7 @@ "act_param": 7, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "bigcode/starcoder2-15b": { "name": "StarCoder2-15B", @@ -48,6 +52,7 @@ "act_param": 15, "open-data": "Full", "reasoning": False, + "date": "2024-02-29" }, "Qwen/CodeQwen1.5-7B": { "name": "CodeQwen1.5-7B", @@ -58,6 +63,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "google/codegemma-2b": { "name": "CodeGemma-2B", @@ -68,6 +74,7 @@ "act_param": 2, 
"open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b": { "name": "CodeGemma-7B", @@ -78,6 +85,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "google/codegemma-7b-it": { "name": "CodeGemma-7B-Instruct", @@ -88,6 +96,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-10" }, "gpt-3.5-turbo-0125": { "name": "GPT-3.5-Turbo-0125", @@ -98,6 +107,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-01-25" }, "gpt-4o": { "name": "GPT-4o-2024-05-13", @@ -108,6 +118,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "gpt-4-turbo-2024-04-09": { "name": "GPT-4-Turbo-2024-04-09", @@ -118,6 +129,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-04-09" }, "gpt-4-0613": { "name": "GPT-4-0613", @@ -128,6 +140,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-06-13" }, "codellama/CodeLlama-7b-hf": { "name": "CodeLlama-7B-Base", @@ -138,6 +151,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-hf": { "name": "CodeLlama-13B-Base", @@ -148,6 +162,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-7b-Instruct-hf": { "name": "CodeLlama-7B-Instruct", @@ -158,6 +173,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-13b-Instruct-hf": { "name": "CodeLlama-13B-Instruct", @@ -168,6 +184,7 @@ "act_param": 13, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "mistral-large-2402": { "name": "Mistral-Large-2402", @@ -178,6 +195,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistral-small-2402": { "name": "Mistral-Small-2402", @@ -188,6 +206,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-02-26" }, "mistralai/Mixtral-8x22B-v0.1": { "name": "Mixtral-8x22B-Base", @@ -198,6 +217,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "mistralai/Mixtral-8x22B-Instruct-v0.1": { "name": "Mixtral-8x22B-Instruct", @@ -208,6 +228,7 @@ "act_param": 44, "open-data": "None", "reasoning": False, + "date": "2024-04-17" }, "codellama/CodeLlama-34b-hf": { "name": "CodeLlama-34B-Base", @@ -218,6 +239,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-34b-Instruct-hf": { "name": "CodeLlama-34B-Instruct", @@ -228,6 +250,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-hf": { "name": "CodeLlama-70B-Base", @@ -238,6 +261,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "codellama/CodeLlama-70b-Instruct-hf": { "name": "CodeLlama-70B-Instruct", @@ -248,6 +272,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "Qwen/CodeQwen1.5-7B-Chat": { "name": "CodeQwen1.5-7B-Chat", @@ -258,6 +283,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "Qwen/Qwen1.5-110B-Chat": { "name": "Qwen1.5-110B-Chat", @@ -268,6 +294,7 @@ "act_param": 110, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-72B-Chat": { "name": "Qwen1.5-72B-Chat", @@ -278,6 +305,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "Qwen/Qwen1.5-32B-Chat": { 
"name": "Qwen1.5-32B-Chat", @@ -288,6 +316,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-04-26" }, "deepseek-ai/DeepSeek-V2-Chat": { "name": "DeepSeek-V2-Chat", @@ -298,6 +327,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-04-06" }, "deepseek-ai/deepseek-coder-1.3b-base": { "name": "DeepSeek-Coder-1.3B-Base", @@ -308,6 +338,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-1.3b-instruct": { "name": "DeepSeek-Coder-1.3B-Instruct", @@ -318,6 +349,7 @@ "act_param": 1.3, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-base": { "name": "DeepSeek-Coder-33B-Base", @@ -328,6 +360,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-33b-instruct": { "name": "DeepSeek-Coder-33B-Instruct", @@ -338,6 +371,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-base": { "name": "DeepSeek-Coder-6.7B-Base", @@ -348,6 +382,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { "name": "DeepSeek-Coder-6.7B-Instruct", @@ -358,6 +393,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-10-28" }, "meta-llama/Meta-Llama-3-70B": { "name": "Llama-3-70B-Base", @@ -368,6 +404,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-70B-Instruct": { "name": "Llama-3-70B-Instruct", @@ -378,6 +415,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B": { "name": "Llama-3-8B-Base", @@ -388,6 +426,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "meta-llama/Meta-Llama-3-8B-Instruct": { "name": "Llama-3-8B-Instruct", @@ -398,6 +437,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-04-18" }, "ibm-granite/granite-3b-code-instruct": { "name": "Granite-Code-3B-Instruct", @@ -408,6 +448,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-instruct": { "name": "Granite-Code-8B-Instruct", @@ -418,6 +459,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-instruct": { "name": "Granite-Code-20B-Instruct", @@ -428,6 +470,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-instruct": { "name": "Granite-Code-34B-Instruct", @@ -438,6 +481,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-3b-code-base": { "name": "Granite-Code-3B-Base", @@ -448,6 +492,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-8b-code-base": { "name": "Granite-Code-8B-Base", @@ -458,6 +503,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-20b-code-base": { "name": "Granite-Code-20B-Base", @@ -468,6 +514,7 @@ "act_param": 20, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "ibm-granite/granite-34b-code-base": { "name": "Granite-Code-34B-Base", @@ -478,6 +525,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-06" }, "claude-3-haiku-20240307": { 
"name": "Claude-3-Haiku-20240307", @@ -488,6 +536,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-07" }, "claude-3-sonnet-20240229": { "name": "Claude-3-Sonnet-20240229", @@ -498,6 +547,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "claude-3-opus-20240229": { "name": "Claude-3-Opus-20240229", @@ -508,6 +558,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-02-29" }, "01-ai/Yi-1.5-34B-Chat": { "name": "Yi-1.5-34B-Chat", @@ -518,6 +569,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-34B": { "name": "Yi-1.5-34B", @@ -528,6 +580,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B-Chat": { "name": "Yi-1.5-9B-Chat", @@ -538,6 +591,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-9B": { "name": "Yi-1.5-9B", @@ -548,6 +602,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B-Chat": { "name": "Yi-1.5-6B-Chat", @@ -558,6 +613,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "01-ai/Yi-1.5-6B": { "name": "Yi-1.5-6B", @@ -568,6 +624,7 @@ "act_param": 6, "open-data": "None", "reasoning": False, + "date": "2024-05-20" }, "Qwen/Qwen2-57B-A14B": { "name": "Qwen2-57B-A14B", @@ -578,6 +635,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-7B-Instruct": { "name": "Qwen2-7B-Instruct", @@ -588,6 +646,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "Qwen/Qwen2-72B-Chat": { "name": "Qwen2-72B-Chat", @@ -598,6 +657,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-06-07" }, "gemini-1.5-pro": { "name": "Gemini-1.5-Pro-API-0514", @@ -608,6 +668,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "gemini-1.5-flash": { "name": "Gemini-1.5-Flash-API-0514", @@ -618,6 +679,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-14" }, "m-a-p/OpenCodeInterpreter-DS-33B": { "name": "OpenCodeInterpreter-DS-33B", @@ -628,6 +690,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-6.7B": { "name": "OpenCodeInterpreter-DS-6.7B", @@ -638,6 +701,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "m-a-p/OpenCodeInterpreter-DS-1.3B": { "name": "OpenCodeInterpreter-DS-1.3B", @@ -648,6 +712,7 @@ "act_param": 1.3, "open-data": "Partial", "reasoning": False, + "date": "2024-02-22" }, "microsoft/Phi-3-medium-128k-instruct": { "name": "Phi-3-Medium-128K-Instruct", @@ -658,6 +723,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "microsoft/Phi-3-small-128k-instruct": { "name": "Phi-3-Small-128K-Instruct", @@ -668,6 +734,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "codestral-2405": { "name": "Codestral-22B-v0.1", @@ -678,6 +745,7 @@ "act_param": 22, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "codestral-mamba-2407": { "name": "Codestral-Mamba", @@ -688,6 +756,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-16" }, "mistralai/Mistral-7B-Instruct-v0.3": { "name": "Mistral-7B-Instruct-v0.3", @@ -698,6 +767,7 @@ "act_param": 7, "open-data": "None", 
"reasoning": False, + "date": "2024-05-22" }, "mistralai/Mistral-7B-v0.3": { "name": "Mistral-7B-v0.3", @@ -708,6 +778,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "CohereForAI/c4ai-command-r-plus": { "name": "Command R+", @@ -718,6 +789,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-04-04" }, "deepseek-coder": { "name": "DeepSeek-Coder-V2-Instruct", @@ -728,6 +800,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": { "name": "DeepSeek-Coder-V2-Lite-Instruct", @@ -738,6 +811,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "deepseek-ai/DeepSeek-Coder-V2-Lite-Base": { "name": "DeepSeek-Coder-V2-Lite-Base", @@ -748,6 +822,7 @@ "act_param": 2.4, "open-data": "None", "reasoning": False, + "date": "2024-06-17" }, "claude-3-5-sonnet-20240620": { "name": "Claude-3.5-Sonnet-20240620", @@ -758,6 +833,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-06-20" }, "NousResearch/Hermes-2-Theta-Llama-3-70B": { "name": "Hermes-2-Theta-Llama-3-70B", @@ -768,6 +844,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-06-24" }, "microsoft/wavecoder-ultra-6.7b": { "name": "WaveCoder-Ultra-6.7B", @@ -778,6 +855,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2023-12-26" }, "google/gemma-2-9b-it": { "name": "Gemma-2-9B-Instruct", @@ -788,6 +866,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Bin12345/AutoCoder": { "name": "AutoCoder", @@ -798,6 +877,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_S_6.7B": { "name": "AutoCoder-S-6.7B", @@ -808,6 +888,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "Bin12345/AutoCoder_QW_7B": { "name": "AutoCoder-QW-7B", @@ -818,6 +899,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-05-23" }, "SenseLLM/ReflectionCoder-DS-33B": { "name": "ReflectionCoder-DS-33B", @@ -828,6 +910,7 @@ "act_param": 33, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-DS-6.7B": { "name": "ReflectionCoder-DS-6.7B", @@ -838,6 +921,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-34B": { "name": "ReflectionCoder-CL-34B", @@ -848,6 +932,7 @@ "act_param": 34, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "SenseLLM/ReflectionCoder-CL-7B": { "name": "ReflectionCoder-CL-7B", @@ -858,6 +943,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-05-27" }, "new-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3.1-Mini-128K-Instruct", @@ -868,6 +954,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "old-microsoft/Phi-3-mini-128k-instruct": { "name": "Phi-3-Mini-128K-Instruct", @@ -878,6 +965,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-05-21" }, "internlm/internlm2_5-7b-chat": { "name": "InternLM2.5-7B-Chat", @@ -888,6 +976,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-07-03" }, "NousResearch/Hermes-2-Pro-Llama-3-70B": { "name": "Hermes-2-Pro-Llama-3-70B", @@ -898,6 +987,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-06-27" }, "new-deepseek-chat": { "name": 
"DeepSeek-V2-Chat (2024-06-28)", @@ -908,6 +998,7 @@ "act_param": 21, "open-data": "None", "reasoning": True, + "date": "2024-06-28" }, "vllm-google/gemma-2-27b-it": { "name": "Gemma-2-27B-Instruct", @@ -918,6 +1009,7 @@ "act_param": 27, "open-data": "None", "reasoning": False, + "date": "2024-06-19" }, "Artigenz/Artigenz-Coder-DS-6.7B": { "name": "Artigenz-Coder-DS-6.7B", @@ -928,6 +1020,7 @@ "act_param": 6.7, "open-data": "None", "reasoning": False, + "date": "2024-04-16" }, "openchat/openchat-3.6-8b-20240522": { "name": "OpenChat-3.6-8B-20240522", @@ -938,6 +1031,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-05-22" }, "Phind/Phind-CodeLlama-34B-v2": { "name": "Phind-CodeLlama-34B-v2", @@ -948,6 +1042,7 @@ "act_param": 34, "open-data": "None", "reasoning": False, + "date": "2023-08-25" }, "yi-large": { "name": "Yi-Large", @@ -958,6 +1053,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-05-13" }, "THUDM/codegeex4-all-9b": { "name": "CodeGeex4-All-9B", @@ -968,6 +1064,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-07-05" }, "gpt-4o-mini-2024-07-18": { "name": "GPT-4o-mini-2024-07-18", @@ -978,6 +1075,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "Nexusflow/Athene-70B": { "name": "Athene-70B", @@ -988,6 +1086,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-20" }, "NTQAI/Nxcode-CQ-7B-orpo": { "name": "Nxcode-CQ-7B-Orpo", @@ -998,6 +1097,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-04-25" }, "migtissera/Llama-3-70B-Synthia-v3.5": { "name": "Llama-3-70B-Synthia-v3.5", @@ -1008,6 +1108,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-05-27" }, "migtissera/Tess-v2.5.2-Qwen2-72B": { "name": "Tess-v2.5.2-Qwen2-72B", @@ -1018,6 +1119,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": { "name": "WhiteRabbitNeo-33B-v1.5", @@ -1028,6 +1130,7 @@ "act_param": 33, "open-data": "None", "reasoning": False, + "date": "2024-02-10" }, "mistral-large-2407": { "name": "Mistral-Large-Instruct-2407", @@ -1038,6 +1141,7 @@ "act_param": 123, "open-data": "None", "reasoning": True, + "date": "2024-07-24" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { "name": "Llama-3.1-8B-Instruct", @@ -1048,6 +1152,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "name": "Llama-3.1-70B-Instruct", @@ -1058,6 +1163,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "meta--llama-3.1-405b-instruct": { "name": "Llama-3.1-405B-Instruct", @@ -1068,6 +1174,7 @@ "act_param": 405, "open-data": "None", "reasoning": False, + "date": "2024-07-23" }, "deepseek-coder-20240724": { "name": "DeepSeek-Coder-V2-Instruct (2024-07-24)", @@ -1078,6 +1185,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-07-24" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1088,6 +1196,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "nv-mistralai--mistral-nemo-12b-instruct": { "name": "Mistral-Nemo-12B-Instruct", @@ -1098,6 +1207,7 @@ "act_param": 12, "open-data": "None", "reasoning": False, + "date": "2024-07-18" }, "wyt2000/InverseCoder-CL-13B": { "name": "InverseCoder-CL-13B", @@ -1108,6 +1218,7 @@ "act_param": 13, "open-data": "Partial", 
"reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-CL-7B": { "name": "InverseCoder-CL-7B", @@ -1118,6 +1229,7 @@ "act_param": 7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "wyt2000/InverseCoder-DS-6.7B": { "name": "InverseCoder-DS-6.7B", @@ -1128,6 +1240,7 @@ "act_param": 6.7, "open-data": "Partial", "reasoning": False, + "date": "2024-07-08" }, "gemini-1.5-pro-exp-0801": { "name": "Gemini-1.5-Pro-Exp-0801", @@ -1138,6 +1251,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-08-01" }, "gpt-4o-2024-08-06": { "name": "GPT-4o-2024-08-06", @@ -1148,6 +1262,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-06" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1158,6 +1273,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1168,6 +1284,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-08-23" }, "gemini-1.5-pro-exp-0827": { "name": "Gemini-1.5-Pro-Exp-0827", @@ -1178,6 +1295,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "gemini-1.5-flash-exp-0827": { "name": "Gemini-1.5-Flash-Exp-0827", @@ -1188,6 +1306,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-08-27" }, "microsoft/Phi-3.5-mini-instruct": { "name": "Phi-3.5-Mini-Instruct", @@ -1198,6 +1317,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-Llama-3.1-70B-Instruct": { "name": "Dracarys-Llama-3.1-70B-Instruct", @@ -1208,6 +1328,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "abacusai/Dracarys-72B-Instruct": { "name": "Dracarys-72B-Instruct", @@ -1218,6 +1339,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-04-23" }, "deepseek-coder-v2.5": { "name": "DeepSeek-V2.5", @@ -1228,6 +1350,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "CohereForAI/c4ai-command-r-08-2024": { "name": "C4AI-Command-R-08-2024", @@ -1238,6 +1361,7 @@ "act_param": 32.3, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "CohereForAI/c4ai-command-r-plus-08-2024": { "name": "C4AI-Command-R-Plus-08-2024", @@ -1248,6 +1372,7 @@ "act_param": 104, "open-data": "None", "reasoning": False, + "date": "2024-08-30" }, "ayueei--yue-coder-9b-preview": { "name": "Yi-Coder-9B-Chat", @@ -1258,6 +1383,7 @@ "act_param": 9, "open-data": "None", "reasoning": False, + "date": "2024-09-04" }, # "mattshumer/ref_70_e3_prefill": { # "name": "Reflection-Llama-3.1-70B", @@ -1286,6 +1412,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "o1-mini-2024-09-12": { "name": "o1-Mini-2024-09-12 (temperature=1)", @@ -1296,6 +1423,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-09-12" }, "Qwen/Qwen2.5-Coder-1.5B-Instruct": { "name": "Qwen2.5-Coder-1.5B-Instruct", @@ -1306,6 +1434,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "Qwen/Qwen2.5-Coder-7B-Instruct": { "name": "Qwen2.5-Coder-7B-Instruct", @@ -1316,6 +1445,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-11-12" }, "gemini-1.5-pro-002": { "name": "Gemini-1.5-Pro-002", @@ -1326,6 +1456,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": 
"2024-09-25" }, "mistralai/Mistral-Small-Instruct-2409": { "name": "Mistral-Small-Instruct-2409", @@ -1336,6 +1467,7 @@ "act_param": 22.2, "open-data": "None", "reasoning": False, + "date": "2024-09-18" }, "Qwen/Qwen2.5-0.5B-Instruct": { "name": "Qwen2.5-0.5B-Instruct", @@ -1346,6 +1478,7 @@ "act_param": 0.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-1.5B-Instruct": { "name": "Qwen2.5-1.5B-Instruct", @@ -1356,6 +1489,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-7B-Instruct": { "name": "Qwen2.5-7B-Instruct", @@ -1366,6 +1500,7 @@ "act_param": 7, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-14B-Instruct": { "name": "Qwen2.5-14B-Instruct", @@ -1376,6 +1511,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-32B-Instruct": { "name": "Qwen2.5-32B-Instruct", @@ -1386,6 +1522,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-72B-Instruct": { "name": "Qwen2.5-72B-Instruct", @@ -1396,6 +1533,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "meta-llama/Llama-3.2-1B-Instruct": { "name": "Llama-3.2-1B-Instruct", @@ -1406,6 +1544,7 @@ "act_param": 1, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "meta-llama/Llama-3.2-3B-Instruct": { "name": "Llama-3.2-3B-Instruct", @@ -1416,6 +1555,7 @@ "act_param": 3, "open-data": "None", "reasoning": False, + "date": "2024-09-25" }, "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": { "name": "Llama-3.1-Nemotron-70B-Instruct", @@ -1426,6 +1566,7 @@ "act_param": 70, "open-data": "Partial", "reasoning": False, + "date": "2024-09-25" }, "claude-3-5-sonnet-20241022": { "name": "Claude-3.5-Sonnet-20241022", @@ -1436,6 +1577,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-10-22" }, "ibm-granite/granite-3.0-8b-instruct": { "name": "Granite-3.0-8B-Instruct", @@ -1446,6 +1588,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "ibm-granite/granite-3.0-2b-instruct": { "name": "Granite-3.0-2B-Instruct", @@ -1456,6 +1599,7 @@ "act_param": 2, "open-data": "None", "reasoning": False, + "date": "2024-10-21" }, "grok-beta--main": { "name": "Grok-Beta", @@ -1466,6 +1610,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-03-17" }, "claude-3-5-haiku-20241022--main": { "name": "Claude-3.5-Haiku-20241022", @@ -1476,6 +1621,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-10-22" }, "Qwen/Qwen2.5-Coder-14B-Instruct--main": { "name": "Qwen2.5-Coder-14B-Instruct", @@ -1486,6 +1632,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "Qwen/Qwen2.5-Coder-32B-Instruct--main": { "name": "Qwen2.5-Coder-32B-Instruct", @@ -1496,6 +1643,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-09-19" }, "infly/OpenCoder-1.5B-Instruct--main": { "name": "OpenCoder-1.5B-Instruct", @@ -1506,6 +1654,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "infly/OpenCoder-8B-Instruct--main": { "name": "OpenCoder-8B-Instruct", @@ -1516,6 +1665,7 @@ "act_param": 8, "open-data": "None", "reasoning": False, + "date": "2024-11-09" }, "microsoft/Phi-3.5-mini-instruct--main": { "name": "Phi-3.5-Mini-Instruct", @@ -1526,6 +1676,7 @@ "act_param": 3.8, "open-data": "None", "reasoning": False, + "date": 
"2024-08-21" }, "Nexusflow/Athene-V2-Agent--main": { "name": "Athene-V2-Agent", @@ -1536,6 +1687,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "Nexusflow/Athene-V2-Chat--main": { "name": "Athene-V2-Chat", @@ -1546,6 +1698,7 @@ "act_param": 72, "open-data": "None", "reasoning": False, + "date": "2024-11-14" }, "gemini-exp-1114--main": { "name": "Gemini-Exp-1114", @@ -1556,6 +1709,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-14" }, "gpt-4o-2024-11-20--main": { "name": "GPT-4o-2024-11-20", @@ -1566,6 +1720,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-11-20" }, "gemini-exp-1121--main": { "name": "Gemini-Exp-1121", @@ -1576,6 +1731,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-11-21" }, "gemini-exp-1206--main": { "name": "Gemini-Exp-1206", @@ -1586,6 +1742,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-06" }, "meta-llama--Llama-3.3-70B-Instruct--main": { "name": "Llama-3.3-70B-Instruct", @@ -1596,6 +1753,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "deepseek-ai--DeepSeek-V2.5-1210--main": { "name": "DeepSeek-V2.5-1210", @@ -1606,6 +1764,7 @@ "act_param": 21, "open-data": "None", "reasoning": False, + "date": "2024-12-10" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1616,6 +1775,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-11" }, "gemini-2.0-flash-thinking-exp-1219--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-1219", @@ -1626,6 +1786,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2024-12-19" }, "gemini-2.0-flash-thinking-exp-01-21--main": { "name": "Gemini-2.0-Flash-Thinking-Exp-01-21", @@ -1636,6 +1797,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-01-21" }, "o1-2024-12-17--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=medium)", @@ -1646,6 +1808,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--low--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=low)", @@ -1656,6 +1819,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "o1-2024-12-17--high--main": { "name": "o1-2024-12-17 (temperature=1, reasoning=high)", @@ -1666,16 +1830,18 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2024-12-17" }, "deepseek-v3-chat--main": { - "name": "DeepSeek-V3-Chat", - "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-Chat", + "name": "DeepSeek-V3", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3", "prompted": True, "moe": True, "size": 671, "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2024-12-26" }, "microsoft--phi-4--main": { "name": "Phi-4", @@ -1686,6 +1852,7 @@ "act_param": 14.7, "open-data": "None", "reasoning": False, + "date": "2024-12-13" }, "deepseek-reasoner--main": { "name": "DeepSeek-R1", @@ -1696,6 +1863,7 @@ "act_param": 37, "open-data": "None", "reasoning": True, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": { "name": "DeepSeek-R1-Distill-Llama-70B", @@ -1706,6 +1874,7 @@ "act_param": 70, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": { "name": "DeepSeek-R1-Distill-Qwen-32B", @@ -1716,6 +1885,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": 
"2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": { "name": "DeepSeek-R1-Distill-Qwen-14B", @@ -1726,6 +1896,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": { "name": "DeepSeek-R1-Distill-Llama-8B", @@ -1736,6 +1907,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": { "name": "DeepSeek-R1-Distill-Qwen-7B", @@ -1746,6 +1918,7 @@ "act_param": 14, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": { "name": "DeepSeek-R1-Distill-Qwen-1.5B", @@ -1756,6 +1929,7 @@ "act_param": 1.5, "open-data": "None", "reasoning": False, + "date": "2025-01-20" }, "mistralai/Mistral-Small-24B-Instruct-2501--main": { "name": "Mistral-Small-24B-Instruct-2501", @@ -1766,6 +1940,7 @@ "act_param": 24, "open-data": "None", "reasoning": False, + "date": "2025-01-31" }, "o3-mini-2025-01-31--medium--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=medium)", @@ -1776,6 +1951,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--low--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=low)", @@ -1786,6 +1962,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "o3-mini-2025-01-31--high--main": { "name": "o3-mini-2025-01-31 (temperature=1, reasoning=high)", @@ -1796,6 +1973,7 @@ "act_param": None, "open-data": "None", "reasoning": True, + "date": "2025-01-31" }, "gemini-2.0-flash-001--main": { "name": "Gemini-2.0-Flash-001", @@ -1806,6 +1984,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-exp--main": { "name": "Gemini-2.0-Flash-Exp", @@ -1816,6 +1995,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-flash-lite-preview-02-05--main": { "name": "Gemini-2.0-Flash-Lite-Preview-02-05", @@ -1826,6 +2006,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "gemini-2.0-pro-exp-02-05--main": { "name": "Gemini-2.0-Pro-Exp-02-05", @@ -1836,6 +2017,7 @@ "act_param": None, "open-data": "None", "reasoning": False, + "date": "2025-02-05" }, "NovaSky-AI--Sky-T1-32B-Flash--main": { "name": "Sky-T1-32B-Flash", @@ -1846,6 +2028,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "NovaSky-AI--Sky-T1-32B-Preview--main": { "name": "Sky-T1-32B-Preview", @@ -1856,6 +2039,7 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2025-01-12" }, "Qwen--QwQ-32B-Preview--main": { "name": "QwQ-32B-Preview", @@ -1866,5 +2050,69 @@ "act_param": 32, "open-data": "None", "reasoning": False, + "date": "2024-11-28" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3-Haiku-20240307", + "link": "https://www.anthropic.com/news/claude-3-family", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "chatgpt-4o-latest--main": { + "name": "ChatGPT-4o-latest-20250129", + "link": "https://chat.openai.com/", + "open-data": "None", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "date": "2025-01-29" + }, + "Kwaipilot--KwaiCoder-23B-A4B-v1--main": { + "name": "KwaiCoder-23B-A4B-v1", + "link": "https://huggingface.co/Kwaipilot/KwaiCoder-23B-A4B-v1", + "open-data": 
"None", + "prompted": False, + "moe": True, + "size": 23, + "act_param": 4, + "date": "2025-01-25" + }, + "qwen-max-latest--main": { + "name": "Qwen2.5-Max", + "link": "https://qwenlm.github.io/blog/qwen2.5-max/", + "open-data": "None", + "prompted": True, + "moe": True, + "size": None, + "act_param": None, + "date": "2025-01-28" + }, + "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": { + "name": "Claude-3.7-Sonnet-20250219 (temperature=1, length=12800, reasoning=3200)", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" + }, + "claude-3-7-sonnet-20250219--main": { + "name": "Claude-3.7-Sonnet-20250219", + "link": "https://www.anthropic.com/news/claude-3-7-sonnet", + "prompted": True, + "moe": False, + "size": None, + "act_param": None, + "open-data": "None", + "reasoning": True, + "date": "2025-02-19" }, } From 5f0743d0a6874fd6fdfe6ab616fe7f65145fb038 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:03:20 +0800 Subject: [PATCH 12/17] fix: remove vllm max length --- bigcodebench/provider/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 25f00b4..60b2285 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, max_model_len=self.max_new_tokens, enable_lora=True if self.lora_request else False, **kwargs) + self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 3513d997f55c383dec3436d7b43704a4affbc8d9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 00:13:39 +0800 Subject: [PATCH 13/17] fix: hardcode the model max length for vllm --- bigcodebench/provider/vllm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 60b2285..229e4c9 100644 --- a/bigcodebench/provider/vllm.py +++ b/bigcodebench/provider/vllm.py @@ -41,7 +41,8 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) - local_lora_path, ) - self.llm = LLM(model=name, enable_lora=True if self.lora_request else False, **kwargs) + # max_model_len is set to max_new_tokens * 10 + self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs) self.llm.set_tokenizer(tokenizer=self.tokenizer) def is_direct_completion(self) -> bool: From 00fc9bb98c932424c2e9bf82ab417142aaca5e1d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:20:37 +0800 Subject: [PATCH 14/17] fix model metadata --- analysis/utils.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/analysis/utils.py b/analysis/utils.py index ec774c7..29a1cb7 100755 --- a/analysis/utils.py +++ b/analysis/utils.py @@ -1903,8 +1903,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 8, + "act_param": 8, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -1914,8 +1914,8 @@ "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 
"prompted": True, "moe": False, - "size": 14, - "act_param": 14, + "size": 7, + "act_param": 7, "open-data": "None", "reasoning": False, "date": "2025-01-20" @@ -2115,4 +2115,36 @@ "reasoning": True, "date": "2025-02-19" }, + "WarriorCoder-6.7B--main": { + "name": "WarriorCoder-6.7B (Reproduced)", + "link": "https://arxiv.org/abs/2412.17395", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-02-18" + }, + "google--gemma-3-27b-it--main": { + "name": "Gemma-3-27B-Instruct", + "link": "https://huggingface.co/google/gemma-3-27b-it", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-12" + }, + "Qwen--QwQ-32B--skip_prefill--main": { + "name": "QwQ-32B (w/ Reasoning)", + "link": "https://huggingface.co/Qwen/QwQ-32B", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-06" + }, + "deepseek-chat-0324--main": { + "name": "DeepSeek-V3-0324", + "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324", + "open-data": "None", + "prompted": True, + "moe": False, + "date": "2025-03-24" + } } From 720681b8ecbcabbfafa6f4c1aae1ca8365d726c4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 25 Mar 2025 02:22:11 +0800 Subject: [PATCH 15/17] feat: add max_model_len for vllm --- ADVANCED_USAGE.md | 1 + bigcodebench/generate.py | 3 +++ bigcodebench/provider/__init__.py | 2 ++ bigcodebench/provider/vllm.py | 5 ++--- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md index 9bb81b8..c0905ba 100755 --- a/ADVANCED_USAGE.md +++ b/ADVANCED_USAGE.md @@ -50,6 +50,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio - `--n_samples`: The number of samples, default to `1` - `--temperature`: The temperature, default to `0.0` - `--max_new_tokens`: The length of max new tokens, default to `1280` +- `--max_model_len`: The length of max tokens in VLLM, default to `12800` - `--greedy`: Whether to use greedy decoding, default to `False` - `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2 - `--direct_completion`: Whether to use direct completion, default to `False` diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 87b67ea..912abcd 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -132,6 +132,8 @@ def run_codegen( n_samples: int = 1, temperature: float = 0.0, max_new_tokens: int = 1280, + # vllm + max_model_len: int = 12800, greedy: bool = False, # openai reasoning_effort: str = "medium", @@ -178,6 +180,7 @@ def run_codegen( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, reasoning_effort=reasoning_effort, reasoning_budget=reasoning_budget, reasoning_beta=reasoning_beta, diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py index 202d049..4cb3410 100644 --- a/bigcodebench/provider/__init__.py +++ b/bigcodebench/provider/__init__.py @@ -10,6 +10,7 @@ def make_model( dataset: str = "bigcodebench", temperature: float = 0.0, max_new_tokens: int = 1280, + max_model_len: int = 12800, # openai only reasoning_effort: str = "medium", # anthropic only @@ -42,6 +43,7 @@ def make_model( lora_path=lora_path, temperature=temperature, max_new_tokens=max_new_tokens, + max_model_len=max_model_len, revision=revision, dataset=dataset, direct_completion=direct_completion, diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py index 229e4c9..41cd251 
100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -13,7 +13,7 @@
 )

 class VllmDecoder(DecoderBase):
-    def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -> None:
+    def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None:
         super().__init__(name, **kwargs)

         kwargs = {
@@ -41,8 +41,7 @@ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, **kwargs) -
             local_lora_path,
         )

-        # max_model_len is set to max_new_tokens * 10
-        self.llm = LLM(model=name, max_model_len=self.max_new_tokens * 10, enable_lora=True if self.lora_request else False, **kwargs)
+        self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs)
         self.llm.set_tokenizer(tokenizer=self.tokenizer)

     def is_direct_completion(self) -> bool:

From c9e2cbba6618bec6ced0aa08892e4a7446d128ee Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 1 Apr 2025 01:10:09 +0800
Subject: [PATCH 16/17] update model metadata

---
 analysis/utils.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/analysis/utils.py b/analysis/utils.py
index 29a1cb7..798499b 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -2071,6 +2071,7 @@
     "moe": False,
     "size": None,
     "act_param": None,
+    "reasoning": True,
     "date": "2025-01-29"
 },
 "Kwaipilot--KwaiCoder-23B-A4B-v1--main": {
@@ -2081,6 +2082,7 @@
     "moe": True,
     "size": 23,
     "act_param": 4,
+    "reasoning": False,
     "date": "2025-01-25"
 },
 "qwen-max-latest--main": {
@@ -2091,6 +2093,7 @@
     "moe": True,
     "size": None,
     "act_param": None,
+    "reasoning": True,
     "date": "2025-01-28"
 },
 "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": {
@@ -2121,6 +2124,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 6.7,
+    "act_param": 6.7,
+    "reasoning": False,
     "date": "2025-02-18"
 },
 "google--gemma-3-27b-it--main": {
@@ -2129,6 +2135,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 27,
+    "act_param": 27,
+    "reasoning": False,
     "date": "2025-03-12"
 },
 "Qwen--QwQ-32B--skip_prefill--main": {
@@ -2137,6 +2146,9 @@
     "open-data": "None",
     "prompted": True,
     "moe": False,
+    "size": 32,
+    "act_param": 32,
+    "reasoning": True,
     "date": "2025-03-06"
 },
 "deepseek-chat-0324--main": {
@@ -2144,7 +2156,21 @@
     "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
     "open-data": "None",
     "prompted": True,
-    "moe": False,
+    "moe": True,
+    "size": 671,
+    "act_param": 37,
+    "reasoning": True,
     "date": "2025-03-24"
+ },
+ "gemini-2.5-pro-exp-03-25--main": {
+    "name": "Gemini-2.5-Pro-Exp-03-25",
+    "link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
+    "open-data": "None",
+    "prompted": True,
+    "moe": False,
+    "size": None,
+    "act_param": None,
+    "reasoning": True,
+    "date": "2025-03-25"
+ }
-}
+}
\ No newline at end of file

From 9bd90fedee89d7dc3676838c75d9642cb0cd0702 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Tue, 1 Apr 2025 01:11:27 +0800
Subject: [PATCH 17/17] feat: use google genai

---
 Docker/Evaluate.Dockerfile              |  2 +-
 bigcodebench/gen/util/google_request.py | 42 ++++++++++++++++---------
 bigcodebench/provider/google.py         |  9 +++---
 setup.cfg                               |  2 +-
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile
index 90e7f40..8b2cdcd 100755
---
a/Docker/Evaluate.Dockerfile +++ b/Docker/Evaluate.Dockerfile @@ -54,7 +54,7 @@ RUN pip install \ rich \ accelerate \ anthropic \ - google-generativeai \ + google-genai \ mistralai \ openai \ e2b diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py index 9e13607..5a76362 100644 --- a/bigcodebench/gen/util/google_request.py +++ b/bigcodebench/gen/util/google_request.py @@ -1,11 +1,12 @@ import time -import google.generativeai as genai +from google import genai from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted def make_request( - client: genai.GenerativeModel, + model: str, + client: genai.Client, message: str, temperature: float, n: int, @@ -13,21 +14,34 @@ def make_request( ) -> genai.types.GenerateContentResponse: kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} - if "-thinking-" in client.model_name: + if "-thinking-" in model: kwargs.pop("max_output_tokens") - - response = client.generate_content( - [{"role": "user", "parts": [message]}], - generation_config=genai.types.GenerationConfig( + + response = client.models.generate_content( + model=model, + contents=message, + config=genai.types.GenerateContentConfig( candidate_count=n, + safety_settings=[ + genai.types.SafetySetting( + category='HARM_CATEGORY_DANGEROUS_CONTENT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_SEXUALLY_EXPLICIT', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HATE_SPEECH', + threshold='BLOCK_NONE' + ), + genai.types.SafetySetting( + category='HARM_CATEGORY_HARASSMENT', + threshold='BLOCK_NONE' + ), + ], **kwargs - ), - safety_settings=[ - {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, - {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, - ], + ), ) return response diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py index 2194c47..e3b18ff 100644 --- a/bigcodebench/provider/google.py +++ b/bigcodebench/provider/google.py @@ -2,7 +2,7 @@ from typing import List from tqdm import tqdm -import google.generativeai as genai +from google import genai from bigcodebench.provider.base import DecoderBase from bigcodebench.gen.util.google_request import make_auto_request @@ -12,8 +12,8 @@ class GoogleDecoder(DecoderBase): def __init__(self, name: str, **kwargs): super().__init__(name, **kwargs) - genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) - self.client = genai.GenerativeModel(name) + self.model = name + self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) def codegen( self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 @@ -34,7 +34,8 @@ def codegen( tokenizer=None, ) ret = make_auto_request( - self.client, + model=self.model, + client=self.client, message=message, n=num_samples, temperature=self.temperature, diff --git a/setup.cfg b/setup.cfg index cc20139..5907add 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,7 @@ install_requires = rich accelerate>=0.30.1 anthropic>=0.26.1 - google-generativeai>=0.5.4 + google-genai mistralai>=0.2.0,<1.0.0 openai>=1.11.1 e2b
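
For reference, a minimal sketch of how the new --max_model_len flag (PATCH 15) is consumed by vLLM; the model name, prompt, and sampling values below are illustrative placeholders, not part of the patches:

    # Sketch: the context window is now capped by an explicit max_model_len
    # instead of being derived from max_new_tokens. Assumes vllm is installed.
    from vllm import LLM, SamplingParams

    llm = LLM(model="Qwen/QwQ-32B", max_model_len=12800)
    params = SamplingParams(temperature=0.0, max_tokens=1280)
    outputs = llm.generate(["def is_prime(n):"], params)
    print(outputs[0].outputs[0].text)

Likewise, a minimal sketch of the google-genai call path that PATCH 17 migrates to, assuming GOOGLE_API_KEY is set in the environment; the model name, prompt, and the single safety setting shown are illustrative:

    # Sketch: the new SDK exposes a Client whose models.generate_content
    # takes the model name per call, instead of binding it to a
    # GenerativeModel instance at construction time.
    import os

    from google import genai

    client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
    response = client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        contents="Write a Python function that reverses a string.",
        config=genai.types.GenerateContentConfig(
            candidate_count=1,
            temperature=0.0,
            max_output_tokens=1280,
            safety_settings=[
                genai.types.SafetySetting(
                    category="HARM_CATEGORY_DANGEROUS_CONTENT",
                    threshold="BLOCK_NONE",
                ),
            ],
        ),
    )
    print(response.text)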