@@ -741,27 +741,27 @@ def forward(

Example::

- >>> # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image.
- >>> from transformers import BertTokenizer, VisualBertModel
- >>> import torch
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image.
+ from transformers import BertTokenizer, VisualBertModel
+ import torch

- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

- >>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
- >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+ inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
+ visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

- >>> inputs.update({
- ...     "visual_embeds": visual_embeds,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "visual_attention_mask": visual_attention_mask
- ... })
+ inputs.update({
+     "visual_embeds": visual_embeds,
+     "visual_token_type_ids": visual_token_type_ids,
+     "visual_attention_mask": visual_attention_mask
+ })

- >>> outputs = model(**inputs)
+ outputs = model(**inputs)

- >>> last_hidden_states = outputs.last_hidden_state
+ last_hidden_states = outputs.last_hidden_state

"""

output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
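
Note: every example in this diff assumes a `get_visual_embeddings(image)` helper that is not part of the library. A minimal placeholder sketch, with illustrative sizes (36 regions, 2048-dimensional region features; real features would come from an object detector such as a Faster R-CNN region-feature extractor), could look like this:

import torch

def get_visual_embeddings(image, num_regions=36, visual_embedding_dim=2048):
    # Placeholder only: returns random region features of shape
    # (num_regions, visual_embedding_dim) so the examples run end to end.
    # Swap in real detector features for meaningful model outputs.
    return torch.randn(num_regions, visual_embedding_dim)

With this stub, `get_visual_embeddings(image).unsqueeze(0)` produces the `(1, num_regions, visual_embedding_dim)` batch that the visual inputs above expect.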
@@ -923,31 +923,31 @@ def forward(

Example::

- >>> # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
- >>> from transformers import BertTokenizer, VisualBertForPreTraining
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
+ from transformers import BertTokenizer, VisualBertForPreTraining

- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

- >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
- >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+ inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
+ visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

- >>> inputs.update({
- ...     "visual_embeds": visual_embeds,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "visual_attention_mask": visual_attention_mask
- ... })
- >>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
- >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
- >>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
+ inputs.update({
+     "visual_embeds": visual_embeds,
+     "visual_token_type_ids": visual_token_type_ids,
+     "visual_attention_mask": visual_attention_mask
+ })
+ max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
+ labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
+ sentence_image_labels = torch.tensor(1).unsqueeze(0)  # Batch size 1

- >>> outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
- >>> loss = outputs.loss
- >>> prediction_logits = outputs.prediction_logits
- >>> seq_relationship_logits = outputs.seq_relationship_logits
+ outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
+ loss = outputs.loss
+ prediction_logits = outputs.prediction_logits
+ seq_relationship_logits = outputs.seq_relationship_logits

"""

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
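
Note on the pretraining example above: the string "{mask}" is tokenized by `bert-base-uncased` as literal text, not as a mask token. If the intent is to have the model fill in the answer position, a hedged alternative is to use the tokenizer's own mask token:

# Assumes `tokenizer` is the BertTokenizer loaded above; its mask token is "[MASK]".
inputs = tokenizer(f"The capital of France is {tokenizer.mask_token}.", return_tensors="pt")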
@@ -1057,37 +1057,38 @@ def forward(

Example::

- >>> from transformers import BertTokenizer, VisualBertForMultipleChoice
- >>> import torch
-
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertForMultipleChoice.from_pretrained('uclanlp/visualbert-vcr')
-
- >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
- >>> choice0 = "It is eaten with a fork and a knife."
- >>> choice1 = "It is eaten while held in the hand."
-
- >>> visual_embeds = get_visual_embeddings(image)
- >>> # (batch_size, num_choices, visual_seq_length, visual_embedding_dim)
- >>> visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-
- >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
-
- >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
- >>> # batch size is 1
- >>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
- >>> inputs_dict.update({
- ...     "visual_embeds": visual_embeds,
- ...     "visual_attention_mask": visual_attention_mask,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "labels": labels
- ... })
- >>> outputs = model(**inputs_dict)
-
- >>> loss = outputs.loss
- >>> logits = outputs.logits
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
+ from transformers import BertTokenizer, VisualBertForMultipleChoice
+ import torch
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertForMultipleChoice.from_pretrained('uclanlp/visualbert-vcr')
+
+ prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ choice0 = "It is eaten with a fork and a knife."
+ choice1 = "It is eaten while held in the hand."
+
+ visual_embeds = get_visual_embeddings(image)
+ # (batch_size, num_choices, visual_seq_length, visual_embedding_dim)
+ visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+
+ labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+ encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
+ # batch size is 1
+ inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
+ inputs_dict.update({
+     "visual_embeds": visual_embeds,
+     "visual_attention_mask": visual_attention_mask,
+     "visual_token_type_ids": visual_token_type_ids,
+     "labels": labels
+ })
+ outputs = model(**inputs_dict)
+
+ loss = outputs.loss
+ logits = outputs.logits

"""

return_dict = return_dict if return_dict is not None else self.config.use_return_dict

num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
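
A shape note on the multiple-choice example: `get_visual_embeddings(image)` is assumed there to return a 2-D tensor of region features, and `expand` then broadcasts it across the batch and choice dimensions without copying data. A minimal sketch with illustrative sizes (36 regions, 2048-dimensional features):

import torch

visual_embeds = torch.randn(36, 2048)                             # (visual_seq_length, visual_embedding_dim)
visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)  # -> (1, 2, 36, 2048): batch of 1, 2 choices
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)  # (1, 2, 36)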
@@ -1204,30 +1205,30 @@ def forward(

Example::

- >>> # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
- >>> from transformers import BertTokenizer, VisualBertForQuestionAnswering
- >>> import torch
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
+ from transformers import BertTokenizer, VisualBertForQuestionAnswering
+ import torch

- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')

- >>> text = "Who is eating the apple?"
- >>> inputs = tokenizer(text, return_tensors='pt')
- >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+ text = "Who is eating the apple?"
+ inputs = tokenizer(text, return_tensors='pt')
+ visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

- >>> inputs.update({
- ...     "visual_embeds": visual_embeds,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "visual_attention_mask": visual_attention_mask
- ... })
+ inputs.update({
+     "visual_embeds": visual_embeds,
+     "visual_token_type_ids": visual_token_type_ids,
+     "visual_attention_mask": visual_attention_mask
+ })

- >>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
+ labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num labels 2

- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> scores = outputs.logits
+ outputs = model(**inputs, labels=labels)
+ loss = outputs.loss
+ scores = outputs.logits

"""

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1327,30 +1328,30 @@ def forward(

Example::

- >>> # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
- >>> from transformers import BertTokenizer, VisualBertForVisualReasoning
- >>> import torch
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
+ from transformers import BertTokenizer, VisualBertForVisualReasoning
+ import torch

- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2')

- >>> text = "Who is eating the apple?"
- >>> inputs = tokenizer(text, return_tensors='pt')
- >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+ text = "Who is eating the apple?"
+ inputs = tokenizer(text, return_tensors='pt')
+ visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

- >>> inputs.update({
- ...     "visual_embeds": visual_embeds,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "visual_attention_mask": visual_attention_mask
- ... })
+ inputs.update({
+     "visual_embeds": visual_embeds,
+     "visual_token_type_ids": visual_token_type_ids,
+     "visual_attention_mask": visual_attention_mask
+ })

- >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2
+ labels = torch.tensor(1).unsqueeze(0)  # Batch size 1, Num choices 2

- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> scores = outputs.logits
+ outputs = model(**inputs, labels=labels)
+ loss = outputs.loss
+ scores = outputs.logits

"""

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1488,32 +1489,32 @@ def forward(

Example::

- >>> # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
- >>> from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment
- >>> import torch
+ # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image in the batch.
+ from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment
+ import torch

- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = VisualBertForRegionToPhraseAlignment.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = VisualBertForRegionToPhraseAlignment.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

- >>> text = "Who is eating the apple?"
- >>> inputs = tokenizer(text, return_tensors='pt')
- >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
- >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
- >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
- >>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
+ text = "Who is eating the apple?"
+ inputs = tokenizer(text, return_tensors='pt')
+ visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+ visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+ visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+ region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]))

- >>> inputs.update({
- ...     "region_to_phrase_position": region_to_phrase_position,
- ...     "visual_embeds": visual_embeds,
- ...     "visual_token_type_ids": visual_token_type_ids,
- ...     "visual_attention_mask": visual_attention_mask
- ... })
+ inputs.update({
+     "region_to_phrase_position": region_to_phrase_position,
+     "visual_embeds": visual_embeds,
+     "visual_token_type_ids": visual_token_type_ids,
+     "visual_attention_mask": visual_attention_mask
+ })

- >>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
+ labels = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2]))  # Batch size 1

- >>> outputs = model(**inputs, labels=labels)
- >>> loss = outputs.loss
- >>> scores = outputs.logits
+ outputs = model(**inputs, labels=labels)
+ loss = outputs.loss
+ scores = outputs.logits

"""

if region_to_phrase_position is None:
    raise ValueError("`region_to_phrase_position` should not be None when using Flickr Model.")