@@ -757,12 +757,7 @@ class PreTrainedModel
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=PerceiverModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=PerceiverModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         inputs,
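
The swap above drops the auto-generated code sample (which presumes one canonical checkpoint and tokenizer) and keeps only an auto-filled ``Returns:`` section, making room for the hand-written examples added in the next hunk. As a rough illustration of the idea, not the actual ``transformers`` implementation, a ``Returns:``-filling decorator can be sketched like this::

    def replace_returns(output_type_name):
        # Toy stand-in for replace_return_docstrings, for illustration only;
        # the real decorator renders the Returns: section from the output class.
        def decorator(fn):
            fn.__doc__ = (fn.__doc__ or "").replace(
                "Returns:", f"Returns:\n    :class:`{output_type_name}`"
            )
            return fn
        return decorator
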
@@ -773,6 +768,85 @@ def forward(
         output_hidden_states=None,
         return_dict=None,
     ):
+        r"""
+        Returns:
+
+        Examples::
+
+            >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
+            >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
+            >>> import torch
+            >>> import requests
+            >>> from PIL import Image
+
+            >>> # EXAMPLE 1: using the Perceiver to classify texts
+            >>> # - we define a TextPreprocessor, which can be used to embed tokens
+            >>> # - we define a ClassificationDecoder, which can be used to decode the
+            >>> # final hidden states of the latents to classification logits
+            >>> # using trainable position embeddings
+            >>> config = PerceiverConfig()
+            >>> preprocessor = PerceiverTextPreprocessor(config)
+            >>> decoder = PerceiverClassificationDecoder(config,
+            ...     num_channels=config.d_latents,
+            ...     trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+            ...     use_query_residual=True)
+            >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
+
+            >>> # you can then do a forward pass as follows:
+            >>> tokenizer = PerceiverTokenizer()
+            >>> text = "hello world"
+            >>> inputs = tokenizer(text, return_tensors="pt").input_ids
+
+            >>> with torch.no_grad():
+            ...     outputs = model(inputs=inputs)
+            >>> logits = outputs.logits
+
+            >>> # to train, one can use standard cross-entropy:
+            >>> criterion = torch.nn.CrossEntropyLoss()
+
+            >>> labels = torch.tensor([1])
+            >>> loss = criterion(logits, labels)
+
+            >>> # EXAMPLE 2: using the Perceiver to classify images
+            >>> # - we define an ImagePreprocessor, which can be used to embed images
+            >>> preprocessor = PerceiverImagePreprocessor(
+            ...     config,
+            ...     prep_type="conv1x1",
+            ...     spatial_downsample=1,
+            ...     out_channels=256,
+            ...     position_encoding_type="trainable",
+            ...     concat_or_add_pos="concat",
+            ...     project_pos_dim=256,
+            ...     trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2),
+            ... )
+
+            >>> model = PerceiverModel(
+            ...     config,
+            ...     input_preprocessor=preprocessor,
+            ...     decoder=PerceiverClassificationDecoder(
+            ...         config,
+            ...         num_channels=config.d_latents,
+            ...         trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+            ...         use_query_residual=True,
+            ...     ),
+            ... )
+
+            >>> # you can then do a forward pass as follows:
+            >>> feature_extractor = PerceiverFeatureExtractor()
+            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+            >>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
+
+            >>> with torch.no_grad():
+            ...     outputs = model(inputs=inputs)
+            >>> logits = outputs.logits
+
+            >>> # to train, one can use standard cross-entropy:
+            >>> criterion = torch.nn.CrossEntropyLoss()
+
+            >>> labels = torch.tensor([1])
+            >>> loss = criterion(logits, labels)
+        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
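
Both examples in the new docstring stop at computing a loss on a randomly initialized model. A minimal sketch of the optimization step that would follow, reusing the ``model``, ``inputs`` and ``labels`` from the text-classification example above (the optimizer choice and learning rate are illustrative, not part of the diff)::

    import torch

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    model.train()
    optimizer.zero_grad()
    outputs = model(inputs=inputs)  # PerceiverModelOutput; logits have shape (batch_size, num_labels)
    loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
    loss.backward()  # backpropagates through the decoder, the latents and the preprocessor
    optimizer.step()
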
@@ -901,12 +975,7 @@ def __init__(self, config):
         self.post_init()
 
     @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=PerceiverMaskedLMOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=PerceiverMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         inputs=None,
@@ -923,6 +992,43 @@ def forward(
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
             (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
+            >>> import torch
+
+            >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
+            >>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
+
+            >>> # training
+            >>> text = "This is an incomplete sentence where some words are missing."
+            >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
+            >>> # mask " missing."
+            >>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
+            >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids
+
+            >>> outputs = model(**inputs, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+
+            >>> # inference
+            >>> text = "This is an incomplete sentence where some words are missing."
+            >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")
+
+            >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
+            >>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id
+
+            >>> # forward pass
+            >>> with torch.no_grad():
+            ...     outputs = model(**encoding)
+            >>> logits = outputs.logits
+
+            >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
+            >>> tokenizer.decode(masked_tokens_predictions)
+            ' missing.'
         """
         if inputs is not None and input_ids is not None:
             raise ValueError("You cannot use both `inputs` and `input_ids`")
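
A note on the hardcoded ``52:61`` span in the example above: ``PerceiverTokenizer`` operates on UTF-8 bytes, so every ASCII character of the sentence occupies exactly one position, shifted by whatever special token(s) the tokenizer prepends. A small sketch of deriving the span instead of hardcoding it, assuming a single prepended special token (which is what the ``52:61`` indices imply for this sentence)::

    text = "This is an incomplete sentence where some words are missing."
    span = " missing."

    start_char = text.index(span)    # 51: character (= byte) offset of the span
    offset = 1                       # assumed: one special token prepended by the tokenizer
    start_tok = start_char + offset  # 52
    end_tok = start_tok + len(span)  # 61

    print(start_tok, end_tok)        # matches the hardcoded 52:61 in the docstring
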