Skip to content

Commit 3d66146

Browse files
LysandreJik and Narsil authored
Fixing tests for Perceiver (huggingface#14745)
- Do not run image-classification pipeline (_CHECKPOINT_FOR_DOC uses the checkpoint for language, which cannot load a FeatureExtractor so current logic fails). - Add a safeguard to not run tests when `tokenizer_class` or `feature_extractor_class` **are** defined, but cannot be loaded. This happens for Perceiver for the "FastTokenizer" (which doesn't exist, so None) and FeatureExtractor (which does exist but cannot be loaded because the checkpoint doesn't define one, which is reasonable for the said checkpoint). - Added `get_vocab` function to `PerceiverTokenizer` since it is used by the `fill-mask` pipeline when the argument `targets` is used to narrow a subset of possible values. Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
1 parent 4c99e55 commit 3d66146

File tree

4 files changed

+27
-3
lines changed

4 files changed

+27
-3
lines changed

src/transformers/models/auto/feature_extraction_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
("detr", "DetrFeatureExtractor"),
4444
("layoutlmv2", "LayoutLMv2FeatureExtractor"),
4545
("clip", "CLIPFeatureExtractor"),
46+
("perceiver", "PerceiverFeatureExtractor"),
4647
]
4748
)
4849

src/transformers/models/perceiver/tokenization_perceiver.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def __init__(
8787
self._utf_vocab_size = 2 ** 8 # utf is 8 bits
8888

8989
# define special tokens dict
90-
self.special_tokens_encoder: Dict[int, str] = {
90+
self.special_tokens_encoder: Dict[str, int] = {
9191
self.pad_token: 0,
9292
self.bos_token: 1,
9393
self.eos_token: 2,
@@ -96,7 +96,15 @@ def __init__(
9696
self.sep_token: 5,
9797
}
9898
self._num_special_tokens = len(self.special_tokens_encoder)
99-
self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()}
99+
self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}
100+
101+
def get_vocab(self) -> Dict[str, int]:
102+
vocab = self.special_tokens_encoder.copy()
103+
vocab.update(self.added_tokens_encoder)
104+
for i in range(self._utf_vocab_size):
105+
token = chr(i)
106+
vocab[token] = i + len(self.special_tokens_encoder)
107+
return vocab
100108

101109
@property
102110
def vocab_size(self):

tests/test_pipelines_common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,11 @@ def test(self):
169169
else:
170170
tokenizer = None
171171
feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
172+
173+
if tokenizer is None and feature_extractor is None:
174+
self.skipTest(
175+
f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor (PerceiverConfig with no FastTokenizer ?)"
176+
)
172177
pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
173178
if pipeline is None:
174179
# The test can disable itself, but it should be very marginal
@@ -213,6 +218,7 @@ def data(n):
213218
if not tokenizer_classes:
214219
# We need to test even if there are no tokenizers.
215220
tokenizer_classes = [None]
221+
216222
for tokenizer_class in tokenizer_classes:
217223
if tokenizer_class is not None:
218224
tokenizer_name = tokenizer_class.__name__

tests/test_pipelines_image_classification.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@
1414

1515
import unittest
1616

17-
from transformers import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, PreTrainedTokenizer, is_vision_available
17+
from transformers import (
18+
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
19+
PerceiverConfig,
20+
PreTrainedTokenizer,
21+
is_vision_available,
22+
)
1823
from transformers.pipelines import ImageClassificationPipeline, pipeline
1924
from transformers.testing_utils import (
2025
is_pipeline_test,
@@ -45,6 +50,10 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
4550
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
4651

4752
def get_test_pipeline(self, model, tokenizer, feature_extractor):
53+
if isinstance(model.config, PerceiverConfig):
54+
self.skipTest(
55+
"Perceiver model tester is defined with a language one, which has no feature_extractor, so the automated test cannot work here"
56+
)
4857

4958
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
5059
examples = [

0 commit comments

Comments (0)