Skip to content

Commit e29919f

Browse files
authored
[GPT Big Code] Fix attention scaling (#40041)
* fix
* update integration tests
* fmt
* add regression test
1 parent eca7030 commit e29919f

File tree

2 files changed

+21
-4
lines changed

2 files changed

+21
-4
lines changed

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None):
144144
)
145145

146146
self.scale_attn_weights = config.scale_attn_weights
147-
self.scaling = self.head_dim**0.5 if config.scale_attn_weights else 1.0
147+
self.scaling = self.head_dim**-0.5 if config.scale_attn_weights else 1.0
148148
self.is_cross_attention = is_cross_attention
149149

150150
self.layer_idx = layer_idx

tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import torch
3030

3131
from transformers import (
32+
AutoTokenizer,
3233
GPT2TokenizerFast,
3334
GPTBigCodeForCausalLM,
3435
GPTBigCodeForSequenceClassification,
@@ -510,7 +511,7 @@ def test_generate_simple(self):
510511
output_sequence = model.generate(input_ids)
511512
output_sentence = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
512513

513-
expected_output = """def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_"""
514+
expected_output = 'def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_world_with_args(name' # fmt: skip
514515
self.assertEqual(output_sentence, expected_output)
515516

516517
def test_generate_batched(self):
@@ -527,11 +528,27 @@ def test_generate_batched(self):
527528
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
528529

529530
expected_output = [
530-
'def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_',
531-
'def say_hello():\n print("Hello, World!")\n\n\nsay_hello()',
531+
'def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_world_with_args(name',
532+
'def say_hello():\n print("Hello, World!")\n\n\nsay_hello()\n',
532533
]
533534
self.assertListEqual(outputs, expected_output)
534535

536+
def test_newline_regression(self):
537+
"""Added to prevent regressions regarding attention (scaling) indicated by excessive newlines"""
538+
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
539+
model = GPTBigCodeForCausalLM.from_pretrained("bigcode/tiny_starcoder_py").to(torch_device)
540+
541+
input_ids = tokenizer(
542+
"Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n",
543+
return_tensors="pt",
544+
).input_ids.to(torch_device)
545+
546+
output_sequence = model.generate(input_ids, max_new_tokens=20, do_sample=False)
547+
output_sentence = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
548+
549+
expected_output = 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n\nThe impact of the COVID-19 pandemic on global economic structures and future business' # fmt: skip
550+
self.assertEqual(output_sentence, expected_output)
551+
535552

536553
@require_torch
537554
class GPTBigCodeMQATest(unittest.TestCase):

0 commit comments

Comments (0)