@@ -260,19 +260,28 @@ def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
class RobertaTokenizerFast(GPT2TokenizerFast):
"""
- Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library).
+ Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
+ tokenizer, using byte-level Byte-Pair-Encoding.

- Peculiarities:
-
- - Byte-level Byte-Pair-Encoding
- - Requires a space to start the input string => the encoding methods should be called with the
- ``add_prefix_space`` flag set to ``True``.
- Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
- the absence of a space at the beginning of a string:
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
+ be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:

::

- tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+ >>> from transformers import RobertaTokenizerFast
+ >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+ >>> tokenizer("Hello world")['input_ids']
+ [0, 31414, 232, 2]
+ >>> tokenizer(" Hello world")['input_ids']
+ [0, 20920, 232, 2]
+
+ You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+ call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+ .. note::
+
+     When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+     ``add_prefix_space=True``.

This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
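
For readers reviewing the new paragraph about the ``add_prefix_space`` workaround, here is a minimal sketch of what it looks like in practice (assuming the ``roberta-base`` checkpoint is available; the expected ids simply mirror the " Hello world" example in the diff, so treat them as an expectation rather than verified output):

>>> from transformers import RobertaTokenizerFast
>>> # instantiating with add_prefix_space=True makes a leading word behave as if it were preceded by a space
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
>>> tokenizer("Hello world")['input_ids']  # expected to match the " Hello world" ids above
[0, 20920, 232, 2]

As the docstring warns, the model was not pretrained with this prefix space, so downstream performance may degrade slightly.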
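
Similarly, a sketch for the ``.. note::`` about pre-tokenized input: with ``is_pretokenized=True`` the fast tokenizer has to be instantiated with ``add_prefix_space=True``; the resulting ids are omitted here rather than guessed:

>>> from transformers import RobertaTokenizerFast
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
>>> # the input is already split into words; the tokenizer only applies byte-level BPE to each word,
>>> # adding a prefix space to every word because of add_prefix_space=True
>>> tokenizer(["Hello", "world"], is_pretokenized=True)['input_ids']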