
Commit b8458ad (1 parent: 806d033)

Making the function private and adding its doc.

2 files changed (+17, -9)


src/transformers/tokenization_utils_base.py (15 additions, 7 deletions)
@@ -2853,7 +2853,7 @@ def prepare_for_model(
             encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
 
         # Check lengths
-        self.eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
 
         # Padding
         if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
@@ -3163,13 +3163,11 @@ def get_special_tokens_mask(
     @staticmethod
     def clean_up_tokenization(out_string: str) -> str:
         """
-        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
+        clean up a list of simple english tokenization artifacts like spaces before punctuations and abbreviated forms.
 
-        Args:
-            out_string (:obj:`str`): The text to clean up.
+        args: out_string (:obj:`str`): the text to clean up.
 
-        Returns:
-            :obj:`str`: The cleaned-up string.
+        returns: :obj:`str`: the cleaned-up string.
         """
         out_string = (
             out_string.replace(" .", ".")
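As its docstring says, clean_up_tokenization strips simple detokenization artifacts such as spaces before punctuation and split contractions. A quick illustrative call (the input string is made up, and the expected output in the comment assumes the usual replacement rules such as " ." -> "." and " n't" -> "n't"):

from transformers import PreTrainedTokenizerBase

# clean_up_tokenization is a staticmethod, so it can be called on the class itself.
cleaned = PreTrainedTokenizerBase.clean_up_tokenization("hello , world ! do n't stop .")
print(cleaned)  # hello, world! don't stop.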
@@ -3185,7 +3183,17 @@ def clean_up_tokenization(out_string: str) -> str:
         )
         return out_string
 
-    def eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
+    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
+        """
+        Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
+        corresponding model.
+
+        Args:
+            ids (:obj:`List[int]`): The ids produced by the tokenization
+            max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set)
+            verbose (:obj:`bool`): Whether or not to print more information and warnings.
+
+        """
         if max_length is None and len(ids) > self.model_max_length and verbose:
             if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                 logger.warning(
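The new docstring matches the gating visible in the context lines: the warning fires only when no explicit max_length is passed, the sequence exceeds model_max_length, and verbose is set, and the deprecation_warnings dict ensures it fires at most once. A minimal self-contained sketch of that behavior (the TokenizerSketch class and warning text are stand-ins, not the upstream implementation):

import logging
from typing import List, Optional

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


class TokenizerSketch:
    # Hypothetical stand-in for PreTrainedTokenizerBase, only to show the gating.
    def __init__(self, model_max_length: int = 512):
        self.model_max_length = model_max_length
        self.deprecation_warnings = {}  # tracks warnings already emitted

    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        # Warn only if: no explicit max_length, sequence longer than the model
        # limit, verbosity on, and this warning not already emitted.
        if max_length is None and len(ids) > self.model_max_length and verbose:
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length (%d) exceeds the maximum (%d); "
                    "running it through the model may cause indexing errors.",
                    len(ids),
                    self.model_max_length,
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True


tok = TokenizerSketch(model_max_length=4)
tok._eventual_warn_about_too_long_sequence([1] * 10, max_length=None, verbose=True)  # warns once
tok._eventual_warn_about_too_long_sequence([1] * 10, max_length=None, verbose=True)  # silent: already warned
tok._eventual_warn_about_too_long_sequence([1] * 10, max_length=8, verbose=True)     # silent: max_length was given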

src/transformers/tokenization_utils_fast.py (2 additions, 2 deletions)
@@ -419,7 +419,7 @@ def _batch_encode_plus(
         sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
 
         for input_ids in sanitized_tokens["input_ids"]:
-            self.eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
         return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
 
     def _encode_plus(
@@ -476,7 +476,7 @@ def _encode_plus(
             batched_output.encodings,
         )
 
-        self.eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
 
         return batched_output
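Both call sites here are internal, so the rename does not change the public API: callers of a fast tokenizer trigger the warning the same way as before. A hedged usage sketch (the checkpoint name is just an example, assumed to have model_max_length=512):

from transformers import AutoTokenizer

# Any fast-tokenizer checkpoint works; "bert-base-uncased" is only an example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# Without max_length, ~2000 tokens exceed model_max_length (512), so the
# private helper logs the too-long-sequence warning (once).
long_text = "hello " * 2000
encoding = tokenizer(long_text)

# With an explicit max_length and truncation, the helper stays silent.
encoding = tokenizer(long_text, max_length=512, truncation=True)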
