Add missing arguments to class constructors #40068

Status: Open · wants to merge 1 commit into main
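Most of the hunks below fix the same pattern: a parent method is invoked through the class (`nn.Module.__init__()`, `SomeParent._init_weights(module)`) rather than through a bound call, so Python does not supply `self` and either the call fails outright or the first positional argument is silently consumed as `self`. A minimal sketch of the failure mode, using hypothetical classes rather than code from this PR:

import torch.nn as nn


class Parent(nn.Module):
    def _init_weights(self, module):
        print(f"initialized {module.__class__.__name__}")


class Child(Parent):
    def __init__(self):
        # nn.Module.__init__() with no arguments raises a TypeError because the
        # unbound call has nothing to use as `self`; passing it explicitly works.
        nn.Module.__init__(self)

    def _init_weights(self, module):
        # Parent._init_weights(module) would bind `module` to `self` and then fail
        # with a TypeError about the missing `module` argument, so `self` has to be
        # passed explicitly when calling through the class.
        Parent._init_weights(self, module)


Child()._init_weights(nn.Linear(2, 2))  # prints "initialized Linear"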
7 changes: 3 additions & 4 deletions src/transformers/models/aria/modular_aria.py
@@ -1255,8 +1255,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 class AriaTextAttention(LlamaAttention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

-    def __init__(self, config: AriaTextConfig, layer_idx: int):
-        super().__init__()
+    pass


 class AriaTextDecoderLayer(LlamaDecoderLayer):
@@ -1273,7 +1272,7 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
     """

     def __init__(self, config: AriaTextConfig, layer_idx: int):
-        super().__init__(self)
+        super().__init__(self, layer_idx)
         self.mlp = AriaTextMoELayer(config)


@@ -1306,7 +1305,7 @@ class AriaPreTrainedModel(LlamaPreTrainedModel):
     _supports_attention_backend = True

     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, AriaProjector):
             nn.init.trunc_normal_(module.query, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/cohere2/modular_cohere2.py
@@ -275,7 +275,7 @@ class Cohere2Attention(CohereAttention, nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
2 changes: 1 addition & 1 deletion src/transformers/models/data2vec/modular_data2vec_audio.py
@@ -114,7 +114,7 @@ def forward(self, hidden_states):

 class Data2VecAudioFeatureEncoder(Wav2Vec2FeatureEncoder, nn.Module):
     def __init__(self, config):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.conv_layers = nn.ModuleList(
             [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
         )
2 changes: 1 addition & 1 deletion src/transformers/models/deepseek_v2/modular_deepseek_v2.py
@@ -505,7 +505,7 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int):

 class DeepseekV2PreTrainedModel(LlamaPreTrainedModel):
     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DeepseekV2MoEGate):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/deepseek_v3/modular_deepseek_v3.py
@@ -341,7 +341,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int):

 class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DeepseekV3TopkRouter):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/dia/modular_dia.py
@@ -111,7 +111,7 @@ class DiaSelfAttention(LlamaAttention, nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.hidden_size = config.hidden_size
2 changes: 1 addition & 1 deletion src/transformers/models/diffllama/modular_diffllama.py
@@ -408,7 +408,7 @@ class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
     _supports_attention_backend = False

     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DiffLlamaAttention):
             module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
             module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
2 changes: 1 addition & 1 deletion src/transformers/models/doge/modular_doge.py
@@ -576,7 +576,7 @@ class DogePreTrainedModel(LlamaPreTrainedModel):

     def _init_weights(self, module):
         """Initialize the weights"""
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DogeAttention):
             if hasattr(module, "A"):
                 module.A.data.zero_()
src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
@@ -224,7 +224,7 @@ class Ernie4_5_MoePreTrainedModel(MixtralPreTrainedModel):
     }

     def _init_weights(self, module):
-        MixtralPreTrainedModel._init_weights(module)
+        MixtralPreTrainedModel._init_weights(self, module)
         if isinstance(module, Ernie4_5_MoeStatics):
             module.e_score_correction_bias.data.zero_()

4 changes: 2 additions & 2 deletions src/transformers/models/evolla/modular_evolla.py
@@ -129,7 +129,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch

 class EvollaSaProtSelfAttention(EsmSelfAttention, nn.Module):
     def __init__(self, config, position_embedding_type=None, layer_idx=None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config

         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
@@ -781,7 +781,7 @@ class EvollaPreTrainedModel(LlamaPreTrainedModel):

     def _init_weights(self, module):
         std = self.config.initializer_range
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, EvollaSequenceAlignerCrossAttention):
             module.gate_attention.zero_()
             module.gate_ffw.zero_()
2 changes: 1 addition & 1 deletion src/transformers/models/gemma3/modular_gemma3.py
@@ -530,7 +530,7 @@ class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
     ]

     def _init_weights(self, module):
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         if isinstance(module, Gemma3MultiModalProjector):
             module.mm_input_projection_weight.data.zero_()

2 changes: 1 addition & 1 deletion src/transformers/models/gemma3n/modular_gemma3n.py
@@ -1918,7 +1918,7 @@ class Gemma3nPreTrainedModel(Gemma2PreTrainedModel):
     _no_split_modules = ["Gemma3nTextDecoderLayer"]

     def _init_weights(self, module):
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         if isinstance(module, Gemma3nAudioCumulativeGroupNorm):
             module.weight.data.fill_(1.0)
         elif isinstance(module, Gemma3nAudioAttention):
4 changes: 2 additions & 2 deletions src/transformers/models/glm4_moe/modular_glm4_moe.py
@@ -257,7 +257,7 @@ def __init__(

 class Glm4MoeAttention(CohereAttention, nn.Module):
     def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -289,7 +289,7 @@ class Glm4MoeMLP(DeepseekV3MLP):

 class Glm4MoeTopkRouter(DeepseekV3TopkRouter, nn.Module):
     def __init__(self, config: Glm4MoeConfig):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.top_k = config.num_experts_per_tok
         self.n_routed_experts = config.n_routed_experts
11 changes: 3 additions & 8 deletions src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
@@ -1730,14 +1730,9 @@ def _repeat_interleave_samples(x, lengths, repeat_times):
                         dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                     )
                 elif key == "second_per_grid_ts":
-                    if not isinstance(dict_to_expand[key], list):
-                        raise TypeError(
-                            f"Expected value for key '{key}' to be a list, but got {type(dict_to_expand[key])} instead."
-                        )
-                    tensor = torch.tensor(dict_to_expand[key])
-                    lengths = list(video_nums)
-                    tensor = _repeat_interleave_samples(tensor, lengths=lengths, repeat_times=expand_size)
-                    dict_to_expand[key] = tensor.tolist()
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
             return dict_to_expand

         def _expand_dict_for_generation(dict_to_expand):
2 changes: 1 addition & 1 deletion src/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -291,7 +291,7 @@ class GotOcr2PreTrainedModel(LlavaPreTrainedModel):
     _supports_flex_attn = False

     def _init_weights(self, module):
-        LlavaPreTrainedModel._init_weights(module)
+        LlavaPreTrainedModel._init_weights(self, module)
         if isinstance(module, GotOcr2VisionAttention):
             if module.use_rel_pos:
                 module.rel_pos_h.data.zero_()
src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -584,7 +584,7 @@ def _init_weights(self, module):

 class Phi4MultimodalVisionEmbeddings(SiglipVisionEmbeddings, nn.Module):
     def __init__(self, config: Phi4MultimodalVisionConfig):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.patch_size = config.patch_size
         self.num_patches_per_side = config.image_size // self.patch_size
@@ -1449,7 +1449,7 @@ class Phi4MultimodalRotaryEmbedding(Phi3RotaryEmbedding):

 class Phi4MultimodalPreTrainedModel(Phi3PreTrainedModel):
     def _init_weights(self, module):
-        Phi3PreTrainedModel._init_weights(module)
+        Phi3PreTrainedModel._init_weights(self, module)
         if isinstance(module, Phi4MultimodalImageEmbedding):
             module.global_img_feature_extensor.data.zero_()
             module.sub_img_feature_extensor.data.zero_()
2 changes: 1 addition & 1 deletion src/transformers/models/qwen2/modular_qwen2.py
@@ -99,7 +99,7 @@ def forward(

 class Qwen2DecoderLayer(LlamaDecoderLayer):
     def __init__(self, config: Qwen2Config, layer_idx: int):
-        super().__init__()
+        super().__init__(config=config, layer_idx=layer_idx)
         self.attention_type = config.layer_types[layer_idx]

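The `super().__init__()` fixes here (and in the Starcoder2 hunks further down) address a second variant of the bug: the parent constructor requires `config` and `layer_idx`, so calling it with no arguments raises a TypeError before the layer is ever built. A rough sketch with hypothetical classes, not the actual Transformers classes:

import torch.nn as nn


class BaseDecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.layer_idx = layer_idx


class MyDecoderLayer(BaseDecoderLayer):
    def __init__(self, config, layer_idx):
        # A bare super().__init__() would raise a TypeError, since the parent
        # requires `config` and `layer_idx`; forwarding them is the fix.
        super().__init__(config=config, layer_idx=layer_idx)
        self.attention_type = "full_attention"


layer = MyDecoderLayer({"hidden_size": 16}, layer_idx=0)
print(layer.hidden_size, layer.layer_idx, layer.attention_type)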
src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -2071,7 +2071,7 @@ def __init__(self, config: Qwen2_5OmniThinkerConfig, device=None):
 # Removes the value error as a workaround.
 class Qwen2_5OmniAttention(Qwen2_5_VLAttention, nn.Module):
     def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         if layer_idx is None:
2 changes: 1 addition & 1 deletion src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -142,7 +142,7 @@ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> N

 class Qwen2_5_VLVisionAttention(VisionAttention):
     def __init__(self, config: Qwen2_5_VLVisionConfig) -> None:
-        super().__init__()
+        super().__init__(config)
         self.dim = config.hidden_size

4 changes: 2 additions & 2 deletions src/transformers/models/starcoder2/modular_starcoder2.py
@@ -72,7 +72,7 @@ def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.Fl

 class Starcoder2Attention(MistralAttention):
     def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
-        super().__init__()
+        super().__init__(config=config, layer_idx=layer_idx)
         self.residual_dropout = config.residual_dropout
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
@@ -131,7 +131,7 @@ def forward(

 class Starcoder2DecoderLayer(MistralDecoderLayer):
     def __init__(self, config: Starcoder2Config, layer_idx: int):
-        super().__init__(self)
+        super().__init__(self, layer_idx)
         self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx)
         self.mlp = Starcoder2MLP(config)
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
2 changes: 1 addition & 1 deletion src/transformers/models/t5gemma/modular_t5gemma.py
@@ -488,7 +488,7 @@ class T5GemmaPreTrainedModel(Gemma2PreTrainedModel):

     def _init_weights(self, module):
         # TODO: support intialization for encoders and decoders separately(?)
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         std = self.config.initializer_range
         if isinstance(module, T5GemmaClassificationHead):
             scale = module.out_proj.weight.shape[0] ** -0.5
src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
@@ -66,7 +66,7 @@ def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Ten

 class Wav2Vec2BertRotaryPositionalEmbedding(Wav2Vec2ConformerRotaryPositionalEmbedding, nn.Module):
     def __init__(self, config):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         dim = config.hidden_size // config.num_attention_heads
         base = config.rotary_embedding_base

@@ -98,7 +98,7 @@ def forward(self, hidden_states):

 class Wav2Vec2BertFeedForward(Wav2Vec2FeedForward, nn.Module):
     def __init__(self, config, act_fn=None, hidden_size=None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         act_fn = act_fn if act_fn is not None else config.hidden_act
         hidden_size = hidden_size if hidden_size is not None else config.hidden_size
         self.intermediate_dropout = nn.Dropout(config.activation_dropout)
@@ -188,7 +188,7 @@ class Wav2Vec2BertSelfAttention(Wav2Vec2ConformerSelfAttention, nn.Module):
     """

     def __init__(self, config, is_adapter_attention=False):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         hidden_size = config.hidden_size if not is_adapter_attention else config.output_hidden_size

         self.head_size = hidden_size // config.num_attention_heads
2 changes: 1 addition & 1 deletion src/transformers/pipelines/image_text_to_text.py
@@ -14,7 +14,7 @@
 # limitations under the License.

 import enum
-from collections.abc import Iterable  # pylint: disable=g-importing-member
+from collections.abc import Iterable
 from typing import Any, Optional, Union, overload

 from ..generation import GenerationConfig
2 changes: 1 addition & 1 deletion src/transformers/pipelines/table_question_answering.py
@@ -451,7 +451,7 @@ def postprocess(self, model_outputs):

                 answers.append(answer)
             if len(answer) == 0:
-                raise PipelineException("Empty answer")
+                raise PipelineException("Table question answering", self.model.name_or_path, "Empty answer")
         else:
             answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)]

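On the pipeline change above: `PipelineException` is constructed from the task name, the model identifier, and the reason, which is why the old single-string call could not even be raised cleanly. A small usage sketch, assuming the three-argument constructor that the new call relies on (attribute names inferred, not verified here):

from transformers.pipelines.base import PipelineException

try:
    raise PipelineException("Table question answering", "my-org/my-model", "Empty answer")
except PipelineException as exc:
    # task/model are assumed to be stored on the exception; the reason is the message.
    print(exc.task, exc.model, exc)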