@@ -2853,7 +2853,7 @@ def prepare_for_model(
2853
2853
encoded_inputs ["special_tokens_mask" ] = [0 ] * len (sequence )
2854
2854
2855
2855
# Check lengths
2856
- self .eventual_warn_about_too_long_sequence (encoded_inputs ["input_ids" ], max_length , verbose )
2856
+ self ._eventual_warn_about_too_long_sequence (encoded_inputs ["input_ids" ], max_length , verbose )
2857
2857
2858
2858
# Padding
2859
2859
if padding_strategy != PaddingStrategy .DO_NOT_PAD or return_attention_mask :
@@ -3163,13 +3163,11 @@ def get_special_tokens_mask(
3163
3163
@staticmethod
3164
3164
def clean_up_tokenization (out_string : str ) -> str :
3165
3165
"""
3166
- Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
3166
+ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
3167
3167
3168
- Args:
3169
- out_string (:obj:`str`): The text to clean up.
3168
+ Args: out_string (:obj:`str`): The text to clean up.
3170
3169
3171
- Returns:
3172
- :obj:`str`: The cleaned-up string.
3170
+ Returns: :obj:`str`: The cleaned-up string.
3173
3171
"""
3174
3172
out_string = (
3175
3173
out_string .replace (" ." , "." )
@@ -3185,7 +3183,17 @@ def clean_up_tokenization(out_string: str) -> str:
3185
3183
)
3186
3184
return out_string
3187
3185
3188
- def eventual_warn_about_too_long_sequence (self , ids : List [int ], max_length : Optional [int ], verbose : bool ):
3186
+ def _eventual_warn_about_too_long_sequence (self , ids : List [int ], max_length : Optional [int ], verbose : bool ):
3187
+ """
3188
+ Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
3189
+ corresponding model
3190
+
3191
+ Args:
3192
+ ids (:obj:`List[int]`): The ids produced by the tokenization
3193
+ max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set)
3194
+ verbose (:obj:`bool`): Whether or not to print more information and warnings.
3195
+
3196
+ """
3189
3197
if max_length is None and len (ids ) > self .model_max_length and verbose :
3190
3198
if not self .deprecation_warnings .get ("sequence-length-is-longer-than-the-specified-maximum" , False ):
3191
3199
logger .warning (
0 commit comments