Add missing arguments to class constructors #40068

Status: Open · wants to merge 1 commit into main
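Most of the hunks below fix the same pattern: a parent method is invoked through the class (`nn.Module.__init__()`, `SomeParent._init_weights(module)`) rather than through a bound call, so Python does not supply `self` and either the call fails outright or the first positional argument is silently consumed as `self`. A minimal sketch of the failure mode, using hypothetical classes rather than code from this PR:

import torch.nn as nn


class Parent(nn.Module):
    def _init_weights(self, module):
        print(f"initialized {module.__class__.__name__}")


class Child(Parent):
    def __init__(self):
        # nn.Module.__init__() with no arguments raises a TypeError because the
        # unbound call has nothing to use as `self`; passing it explicitly works.
        nn.Module.__init__(self)

    def _init_weights(self, module):
        # Parent._init_weights(module) would bind `module` to `self` and then fail
        # with a TypeError about the missing `module` argument, so `self` has to be
        # passed explicitly when calling through the class.
        Parent._init_weights(self, module)


Child()._init_weights(nn.Linear(2, 2))  # prints "initialized Linear"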
7 changes: 3 additions & 4 deletions src/transformers/models/aria/modular_aria.py
@@ -1255,8 +1255,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 class AriaTextAttention(LlamaAttention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

-    def __init__(self, config: AriaTextConfig, layer_idx: int):
-        super().__init__()
+    pass


 class AriaTextDecoderLayer(LlamaDecoderLayer):
@@ -1273,7 +1272,7 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
     """

     def __init__(self, config: AriaTextConfig, layer_idx: int):
-        super().__init__(self)
+        super().__init__(self, layer_idx)
         self.mlp = AriaTextMoELayer(config)


@@ -1306,7 +1305,7 @@ class AriaPreTrainedModel(LlamaPreTrainedModel):
     _supports_attention_backend = True

     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, AriaProjector):
             nn.init.trunc_normal_(module.query, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/cohere2/modular_cohere2.py
@@ -275,7 +275,7 @@ class Cohere2Attention(CohereAttention, nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
2 changes: 1 addition & 1 deletion src/transformers/models/data2vec/modular_data2vec_audio.py
@@ -114,7 +114,7 @@ def forward(self, hidden_states):

 class Data2VecAudioFeatureEncoder(Wav2Vec2FeatureEncoder, nn.Module):
     def __init__(self, config):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.conv_layers = nn.ModuleList(
             [Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
         )
2 changes: 1 addition & 1 deletion src/transformers/models/deepseek_v2/modular_deepseek_v2.py
@@ -505,7 +505,7 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int):

 class DeepseekV2PreTrainedModel(LlamaPreTrainedModel):
     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DeepseekV2MoEGate):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/deepseek_v3/modular_deepseek_v3.py
@@ -341,7 +341,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int):

 class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DeepseekV3TopkRouter):
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

2 changes: 1 addition & 1 deletion src/transformers/models/dia/modular_dia.py
@@ -111,7 +111,7 @@ class DiaSelfAttention(LlamaAttention, nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.hidden_size = config.hidden_size
2 changes: 1 addition & 1 deletion src/transformers/models/diffllama/modular_diffllama.py
@@ -408,7 +408,7 @@ class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
     _supports_attention_backend = False

     def _init_weights(self, module):
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DiffLlamaAttention):
             module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
             module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
2 changes: 1 addition & 1 deletion src/transformers/models/doge/modular_doge.py
@@ -576,7 +576,7 @@ class DogePreTrainedModel(LlamaPreTrainedModel):

     def _init_weights(self, module):
         """Initialize the weights"""
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, DogeAttention):
             if hasattr(module, "A"):
                 module.A.data.zero_()
src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py
@@ -224,7 +224,7 @@ class Ernie4_5_MoePreTrainedModel(MixtralPreTrainedModel):
     }

     def _init_weights(self, module):
-        MixtralPreTrainedModel._init_weights(module)
+        MixtralPreTrainedModel._init_weights(self, module)
         if isinstance(module, Ernie4_5_MoeStatics):
             module.e_score_correction_bias.data.zero_()

4 changes: 2 additions & 2 deletions src/transformers/models/evolla/modular_evolla.py
@@ -129,7 +129,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch

 class EvollaSaProtSelfAttention(EsmSelfAttention, nn.Module):
     def __init__(self, config, position_embedding_type=None, layer_idx=None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config

         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
@@ -781,7 +781,7 @@ class EvollaPreTrainedModel(LlamaPreTrainedModel):

     def _init_weights(self, module):
         std = self.config.initializer_range
-        LlamaPreTrainedModel._init_weights(module)
+        LlamaPreTrainedModel._init_weights(self, module)
         if isinstance(module, EvollaSequenceAlignerCrossAttention):
             module.gate_attention.zero_()
             module.gate_ffw.zero_()
2 changes: 1 addition & 1 deletion src/transformers/models/gemma3/modular_gemma3.py
@@ -530,7 +530,7 @@ class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
     ]

     def _init_weights(self, module):
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         if isinstance(module, Gemma3MultiModalProjector):
             module.mm_input_projection_weight.data.zero_()

2 changes: 1 addition & 1 deletion src/transformers/models/gemma3n/modular_gemma3n.py
@@ -1918,7 +1918,7 @@ class Gemma3nPreTrainedModel(Gemma2PreTrainedModel):
     _no_split_modules = ["Gemma3nTextDecoderLayer"]

     def _init_weights(self, module):
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         if isinstance(module, Gemma3nAudioCumulativeGroupNorm):
             module.weight.data.fill_(1.0)
         elif isinstance(module, Gemma3nAudioAttention):
4 changes: 2 additions & 2 deletions src/transformers/models/glm4_moe/modular_glm4_moe.py
@@ -257,7 +257,7 @@ def __init__(

 class Glm4MoeAttention(CohereAttention, nn.Module):
     def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
@@ -289,7 +289,7 @@ class Glm4MoeMLP(DeepseekV3MLP):

 class Glm4MoeTopkRouter(DeepseekV3TopkRouter, nn.Module):
     def __init__(self, config: Glm4MoeConfig):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.top_k = config.num_experts_per_tok
         self.n_routed_experts = config.n_routed_experts
11 changes: 3 additions & 8 deletions src/transformers/models/glm4v_moe/modeling_glm4v_moe.py
@@ -1730,14 +1730,9 @@ def _repeat_interleave_samples(x, lengths, repeat_times):
                         dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                     )
                 elif key == "second_per_grid_ts":
-                    if not isinstance(dict_to_expand[key], list):
-                        raise TypeError(
-                            f"Expected value for key '{key}' to be a list, but got {type(dict_to_expand[key])} instead."
-                        )
-                    tensor = torch.tensor(dict_to_expand[key])
-                    lengths = list(video_nums)
-                    tensor = _repeat_interleave_samples(tensor, lengths=lengths, repeat_times=expand_size)
-                    dict_to_expand[key] = tensor.tolist()
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
             return dict_to_expand

         def _expand_dict_for_generation(dict_to_expand):
2 changes: 1 addition & 1 deletion src/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -291,7 +291,7 @@ class GotOcr2PreTrainedModel(LlavaPreTrainedModel):
     _supports_flex_attn = False

     def _init_weights(self, module):
-        LlavaPreTrainedModel._init_weights(module)
+        LlavaPreTrainedModel._init_weights(self, module)
         if isinstance(module, GotOcr2VisionAttention):
             if module.use_rel_pos:
                 module.rel_pos_h.data.zero_()
src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -584,7 +584,7 @@ def _init_weights(self, module):

 class Phi4MultimodalVisionEmbeddings(SiglipVisionEmbeddings, nn.Module):
     def __init__(self, config: Phi4MultimodalVisionConfig):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.patch_size = config.patch_size
         self.num_patches_per_side = config.image_size // self.patch_size
@@ -1449,7 +1449,7 @@ class Phi4MultimodalRotaryEmbedding(Phi3RotaryEmbedding):

 class Phi4MultimodalPreTrainedModel(Phi3PreTrainedModel):
     def _init_weights(self, module):
-        Phi3PreTrainedModel._init_weights(module)
+        Phi3PreTrainedModel._init_weights(self, module)
         if isinstance(module, Phi4MultimodalImageEmbedding):
             module.global_img_feature_extensor.data.zero_()
             module.sub_img_feature_extensor.data.zero_()
2 changes: 1 addition & 1 deletion src/transformers/models/qwen2/modular_qwen2.py
@@ -99,7 +99,7 @@ def forward(

 class Qwen2DecoderLayer(LlamaDecoderLayer):
     def __init__(self, config: Qwen2Config, layer_idx: int):
-        super().__init__()
+        super().__init__(config=config, layer_idx=layer_idx)
         self.attention_type = config.layer_types[layer_idx]

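The `super().__init__()` fixes here (and in the Starcoder2 hunks further down) address a second variant of the bug: the parent constructor requires `config` and `layer_idx`, so calling it with no arguments raises a TypeError before the layer is ever built. A rough sketch with hypothetical classes, not the actual Transformers classes:

import torch.nn as nn


class BaseDecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.layer_idx = layer_idx


class MyDecoderLayer(BaseDecoderLayer):
    def __init__(self, config, layer_idx):
        # A bare super().__init__() would raise a TypeError, since the parent
        # requires `config` and `layer_idx`; forwarding them is the fix.
        super().__init__(config=config, layer_idx=layer_idx)
        self.attention_type = "full_attention"


layer = MyDecoderLayer({"hidden_size": 16}, layer_idx=0)
print(layer.hidden_size, layer.layer_idx, layer.attention_type)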
src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -2071,7 +2071,7 @@ def __init__(self, config: Qwen2_5OmniThinkerConfig, device=None):
 # Removes the value error as a workaround.
 class Qwen2_5OmniAttention(Qwen2_5_VLAttention, nn.Module):
     def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         self.config = config
         self.layer_idx = layer_idx
         if layer_idx is None:
2 changes: 1 addition & 1 deletion src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -142,7 +142,7 @@ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> N

 class Qwen2_5_VLVisionAttention(VisionAttention):
     def __init__(self, config: Qwen2_5_VLVisionConfig) -> None:
-        super().__init__()
+        super().__init__(config)
         self.dim = config.hidden_size

4 changes: 2 additions & 2 deletions src/transformers/models/starcoder2/modular_starcoder2.py
@@ -72,7 +72,7 @@ def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.Fl

 class Starcoder2Attention(MistralAttention):
     def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
-        super().__init__()
+        super().__init__(config=config, layer_idx=layer_idx)
         self.residual_dropout = config.residual_dropout
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
@@ -131,7 +131,7 @@ def forward(

 class Starcoder2DecoderLayer(MistralDecoderLayer):
     def __init__(self, config: Starcoder2Config, layer_idx: int):
-        super().__init__(self)
+        super().__init__(self, layer_idx)
         self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx)
         self.mlp = Starcoder2MLP(config)
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
2 changes: 1 addition & 1 deletion src/transformers/models/t5gemma/modular_t5gemma.py
@@ -488,7 +488,7 @@ class T5GemmaPreTrainedModel(Gemma2PreTrainedModel):

     def _init_weights(self, module):
         # TODO: support intialization for encoders and decoders separately(?)
-        Gemma2PreTrainedModel._init_weights(module)
+        Gemma2PreTrainedModel._init_weights(self, module)
         std = self.config.initializer_range
         if isinstance(module, T5GemmaClassificationHead):
             scale = module.out_proj.weight.shape[0] ** -0.5
src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
@@ -66,7 +66,7 @@ def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Ten

 class Wav2Vec2BertRotaryPositionalEmbedding(Wav2Vec2ConformerRotaryPositionalEmbedding, nn.Module):
     def __init__(self, config):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         dim = config.hidden_size // config.num_attention_heads
         base = config.rotary_embedding_base

@@ -98,7 +98,7 @@ def forward(self, hidden_states):

 class Wav2Vec2BertFeedForward(Wav2Vec2FeedForward, nn.Module):
     def __init__(self, config, act_fn=None, hidden_size=None):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         act_fn = act_fn if act_fn is not None else config.hidden_act
         hidden_size = hidden_size if hidden_size is not None else config.hidden_size
         self.intermediate_dropout = nn.Dropout(config.activation_dropout)
@@ -188,7 +188,7 @@ class Wav2Vec2BertSelfAttention(Wav2Vec2ConformerSelfAttention, nn.Module):
     """

     def __init__(self, config, is_adapter_attention=False):
-        nn.Module.__init__()
+        nn.Module.__init__(self)
         hidden_size = config.hidden_size if not is_adapter_attention else config.output_hidden_size

         self.head_size = hidden_size // config.num_attention_heads
2 changes: 1 addition & 1 deletion src/transformers/pipelines/image_text_to_text.py
@@ -14,7 +14,7 @@
 # limitations under the License.

 import enum
-from collections.abc import Iterable  # pylint: disable=g-importing-member
+from collections.abc import Iterable
 from typing import Any, Optional, Union, overload

 from ..generation import GenerationConfig
2 changes: 1 addition & 1 deletion src/transformers/pipelines/table_question_answering.py
@@ -451,7 +451,7 @@ def postprocess(self, model_outputs):

                 answers.append(answer)
             if len(answer) == 0:
-                raise PipelineException("Empty answer")
+                raise PipelineException("Table question answering", self.model.name_or_path, "Empty answer")
         else:
             answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)]

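On the pipeline change above: `PipelineException` is constructed from the task name, the model identifier, and the reason, which is why the old single-string call could not even be raised cleanly. A small usage sketch, assuming the three-argument constructor that the new call relies on (attribute names inferred, not verified here):

from transformers.pipelines.base import PipelineException

try:
    raise PipelineException("Table question answering", "my-org/my-model", "Empty answer")
except PipelineException as exc:
    # task/model are assumed to be stored on the exception; the reason is the message.
    print(exc.task, exc.model, exc)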