@@ -1473,18 +1473,22 @@ def _cfg(url='', **kwargs):
1473
1473
'vit_base_patch32_clip_224.metaclip_2pt5b' : _cfg (
1474
1474
hf_hub_id = 'facebook/metaclip-b32-fullcc2.5b' ,
1475
1475
hf_hub_filename = 'metaclip_b32_fullcc2.5b.bin' ,
1476
+ license = 'cc-by-nc-4.0' ,
1476
1477
mean = OPENAI_CLIP_MEAN , std = OPENAI_CLIP_STD , crop_pct = 1.0 , num_classes = 512 ),
1477
1478
'vit_base_patch16_clip_224.metaclip_2pt5b' : _cfg (
1478
1479
hf_hub_id = 'facebook/metaclip-b16-fullcc2.5b' ,
1479
1480
hf_hub_filename = 'metaclip_b16_fullcc2.5b.bin' ,
1481
+ license = 'cc-by-nc-4.0' ,
1480
1482
mean = OPENAI_CLIP_MEAN , std = OPENAI_CLIP_STD , crop_pct = 1.0 , num_classes = 512 ),
1481
1483
'vit_large_patch14_clip_224.metaclip_2pt5b' : _cfg (
1482
1484
hf_hub_id = 'facebook/metaclip-l14-fullcc2.5b' ,
1483
1485
hf_hub_filename = 'metaclip_l14_fullcc2.5b.bin' ,
1486
+ license = 'cc-by-nc-4.0' ,
1484
1487
mean = OPENAI_CLIP_MEAN , std = OPENAI_CLIP_STD , crop_pct = 1.0 , num_classes = 768 ),
1485
1488
'vit_huge_patch14_clip_224.metaclip_2pt5b' : _cfg (
1486
1489
hf_hub_id = 'facebook/metaclip-h14-fullcc2.5b' ,
1487
1490
hf_hub_filename = 'metaclip_h14_fullcc2.5b.bin' ,
1491
+ license = 'cc-by-nc-4.0' ,
1488
1492
mean = OPENAI_CLIP_MEAN , std = OPENAI_CLIP_STD , crop_pct = 1.0 , num_classes = 1024 ),
1489
1493
1490
1494
'vit_base_patch32_clip_224.openai' : _cfg (
@@ -2129,7 +2133,8 @@ def vit_base_patch32_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTra
2129
2133
patch_size = 32 , embed_dim = 768 , depth = 12 , num_heads = 12 , pre_norm = True ,
2130
2134
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2131
2135
model = _create_vision_transformer (
2132
- 'vit_base_patch32_clip_224' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2136
+ 'vit_base_patch32_clip_224' , # map to non quickgelu pretrained_cfg intentionally
2137
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2133
2138
return model
2134
2139
2135
2140
@@ -2141,7 +2146,8 @@ def vit_base_patch16_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTra
2141
2146
patch_size = 16 , embed_dim = 768 , depth = 12 , num_heads = 12 , pre_norm = True ,
2142
2147
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2143
2148
model = _create_vision_transformer (
2144
- 'vit_base_patch16_clip_224' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2149
+ 'vit_base_patch16_clip_224' , # map to non quickgelu pretrained_cfg intentionally
2150
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2145
2151
return model
2146
2152
2147
2153
@@ -2154,7 +2160,8 @@ def vit_large_patch14_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTr
2154
2160
patch_size = 14 , embed_dim = 1024 , depth = 24 , num_heads = 16 , pre_norm = True ,
2155
2161
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2156
2162
model = _create_vision_transformer (
2157
- 'vit_large_patch14_clip_224' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2163
+ 'vit_large_patch14_clip_224' , # map to non quickgelu pretrained_cfg intentionally
2164
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2158
2165
return model
2159
2166
2160
2167
@@ -2166,7 +2173,8 @@ def vit_large_patch14_clip_quickgelu_336(pretrained=False, **kwargs) -> VisionTr
2166
2173
patch_size = 14 , embed_dim = 1024 , depth = 24 , num_heads = 16 , pre_norm = True ,
2167
2174
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2168
2175
model = _create_vision_transformer (
2169
- 'vit_large_patch14_clip_336' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2176
+ 'vit_large_patch14_clip_336' , # map to non quickgelu pretrained_cfg intentionally
2177
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2170
2178
return model
2171
2179
2172
2180
@@ -2178,7 +2186,8 @@ def vit_huge_patch14_clip_quickgelu_224(pretrained=False, **kwargs) -> VisionTra
2178
2186
patch_size = 14 , embed_dim = 1280 , depth = 32 , num_heads = 16 , pre_norm = True ,
2179
2187
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2180
2188
model = _create_vision_transformer (
2181
- 'vit_huge_patch14_clip_224' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2189
+ 'vit_huge_patch14_clip_224' , # map to non quickgelu pretrained_cfg intentionally
2190
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2182
2191
return model
2183
2192
2184
2193
@@ -2190,7 +2199,8 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained=False, **kwargs) -> VisionTra
2190
2199
patch_size = 14 , embed_dim = 1280 , depth = 32 , num_heads = 16 , pre_norm = True ,
2191
2200
norm_layer = nn .LayerNorm , act_layer = 'quick_gelu' )
2192
2201
model = _create_vision_transformer (
2193
- 'vit_huge_patch14_clip_378' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
2202
+ 'vit_huge_patch14_clip_378' , # map to non quickgelu pretrained_cfg intentionally
2203
+ pretrained = pretrained , ** dict (model_args , ** kwargs ))
2194
2204
return model
2195
2205
2196
2206
0 commit comments