@@ -1435,11 +1435,11 @@ def _cfg(url='', **kwargs):
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
 
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 256, 256), num_classes=512),
@@ -1994,6 +1994,17 @@ def vit_base_patch32_clip_224(pretrained=False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_base_patch32_clip_256(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-B/32 CLIP image tower @ 256x256
+    """
+    model_args = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch32_clip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_384(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 384x384
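For context, a minimal usage sketch of the entrypoint and pretrained tag added above, assuming the DataComp weights resolve from the `hf_hub_id`/`hf_hub_filename` set in the first hunk; `timm.create_model` is the standard factory and the tag name comes from the `'vit_base_patch32_clip_256.datacompxl'` cfg key:

```python
import torch
import timm

# Sketch: build the new 256x256 ViT-B/32 CLIP image tower registered above.
# pretrained=True assumes laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K downloads cleanly.
model = timm.create_model('vit_base_patch32_clip_256.datacompxl', pretrained=True)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 256, 256))  # input_size=(3, 256, 256) per the cfg
print(out.shape)  # expect (1, 512), matching num_classes=512 in the cfg
```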