
Commit eba07b0

Add eva models to beit.py
1 parent da6644b commit eba07b0

File tree

2 files changed: +88 −24 lines


README.md

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@ And a big thanks to all GitHub sponsors who helped with some of my costs before
 
 ## What's New
 
+### Dec 6, 2022
+* Add 'EVA g', BEiT style ViT-g/14 model weights w/ both MIM pretrain and CLIP pretrain from https://github.com/baaivision/EVA
+
 ### Dec 5, 2022
 
 * Pre-release (`0.8.0dev0`) of multi-weight support (`model_arch.pretrained_tag`). Install with `pip install --pre timm`
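
Taken together with the multi-weight pre-release above, the new EVA weights are selected by appending a pretrained tag to the architecture name. A minimal sketch, assuming `timm >= 0.8.0dev0` is installed (`pip install --pre timm`):

```python
import timm

# The `arch.tag` form picks one specific pretrained weight set;
# here, the CLIP-pretrained EVA-g fine-tuned on ImageNet-1k at 336px.
model = timm.create_model('eva_giant_patch14_336.clip_ft_in1k', pretrained=True)
model = model.eval()
```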

timm/models/beit.py

Lines changed: 85 additions & 24 deletions
@@ -1,8 +1,6 @@
 """ BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
 
 Model from official source: https://github.com/microsoft/unilm/tree/master/beit
-and
-https://github.com/microsoft/unilm/tree/master/beit2
 
 @inproceedings{beit,
 title={{BEiT}: {BERT} Pre-Training of Image Transformers},
@@ -12,6 +10,8 @@
 url={https://openreview.net/forum?id=p-BhZSz59o4}
 }
 
+BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2
+
 @article{beitv2,
 title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
 author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
@@ -21,6 +21,17 @@
 primaryClass={cs.CV}
 }
 
+EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636
+
+@article{EVA,
+title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
+author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang,
+Tiejun and Wang, Xinlong and Cao, Yue},
+journal={arXiv preprint arXiv:2211.07636},
+year={2022}
+}
+
+
 At this point only the 1k fine-tuned classification weights and model configs have been added,
 see original source above for pre-training models and procedure.
 
@@ -37,6 +48,9 @@
 # https://github.com/facebookresearch/deit/
 # https://github.com/facebookresearch/dino
 # --------------------------------------------------------'
+
+# EVA models Copyright (c) 2022 BAAI-Vision
+
 import math
 from functools import partial
 from typing import Optional, Tuple
@@ -46,9 +60,10 @@
 import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint
 
-from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from .helpers import build_model_with_cfg
 from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_
+from .pretrained import generate_default_cfgs
 from .registry import register_model
 from .vision_transformer import checkpoint_filter_fn
 
@@ -64,52 +79,72 @@ def _cfg(url='', **kwargs):
     }
 
 
-default_cfgs = {
-    'beit_base_patch16_224': _cfg(
+default_cfgs = generate_default_cfgs({
+    'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'),
-    'beit_base_patch16_384': _cfg(
+    'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth',
         input_size=(3, 384, 384), crop_pct=1.0,
     ),
-    'beit_base_patch16_224_in22k': _cfg(
+    'beit_base_patch16_224.in22k_ft_in22k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth',
         num_classes=21841,
     ),
-    'beit_large_patch16_224': _cfg(
+    'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'),
-    'beit_large_patch16_384': _cfg(
+    'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth',
         input_size=(3, 384, 384), crop_pct=1.0,
     ),
-    'beit_large_patch16_512': _cfg(
+    'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth',
         input_size=(3, 512, 512), crop_pct=1.0,
     ),
-    'beit_large_patch16_224_in22k': _cfg(
+    'beit_large_patch16_224.in22k_ft_in22k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth',
         num_classes=21841,
     ),
 
-    'beitv2_base_patch16_224': _cfg(
+    'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth',
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
     ),
-    'beitv2_base_patch16_224_in22k': _cfg(
+    'beitv2_base_patch16_224.in1k_ft_in22k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth',
         num_classes=21841,
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
     ),
-    'beitv2_large_patch16_224': _cfg(
+    'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
-    'beitv2_large_patch16_224_in22k': _cfg(
+    'beitv2_large_patch16_224.in1k_ft_in22k': _cfg(
         url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth',
         num_classes=21841,
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
     ),
-}
+
+    'eva_giant_patch14_224.clip_ft_in1k': _cfg(
+        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz224_ftcls_89p1.pt',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+    ),
+    'eva_giant_patch14_336.clip_ft_in1k': _cfg(
+        hf_hub_id='BAAI/EVA',
+        hf_hub_filename='eva_clip_vis_enc_sz336_ftcls_89p4.pt',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 336, 336)),
+    'eva_giant_patch14_336.m30m_ft_in22k_in1k': _cfg(
+        hf_hub_id='BAAI/EVA',
+        hf_hub_filename='eva_21k_1k_336px_psz14_ema_89p6.pt',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        input_size=(3, 336, 336)),
+    'eva_giant_patch14_560.m30m_ft_in22k_in1k': _cfg(
+        hf_hub_id='BAAI/EVA',
+        hf_hub_filename='eva_21k_1k_560px_psz14_ema_89p7.pt',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        input_size=(3, 560, 560)),
+})
 
 
 def gen_relative_position_index(window_size: Tuple[int, int]) -> torch.Tensor:
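
Unlike the BEiT entries, the EVA configs above point at the Hugging Face Hub (`hf_hub_id` plus `hf_hub_filename`) rather than a direct download URL. A hedged sketch of inspecting what this registers, assuming the pre-release multi-weight behavior where `list_models(pretrained=True)` reports one name per pretrained tag:

```python
import timm

# Architectures matching 'eva_giant*'; with pretrained=True the listing
# should include the tagged entries defined in default_cfgs above, e.g.
# 'eva_giant_patch14_336.clip_ft_in1k'.
print(timm.list_models('eva_giant*', pretrained=True))
```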
@@ -415,7 +450,7 @@ def beit_base_patch16_224(pretrained=False, **kwargs):
 @register_model
 def beit_base_patch16_384(pretrained=False, **kwargs):
     model_kwargs = dict(
-        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
     model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **model_kwargs)
     return model
@@ -424,7 +459,7 @@ def beit_base_patch16_384(pretrained=False, **kwargs):
 @register_model
 def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
     model_kwargs = dict(
-        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
+        patch_size=16, embed_dim=768, depth=12, num_heads=12,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
     model = _create_beit('beit_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
     return model
@@ -433,7 +468,7 @@ def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
 @register_model
 def beit_large_patch16_224(pretrained=False, **kwargs):
     model_kwargs = dict(
-        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **model_kwargs)
     return model
@@ -442,7 +477,7 @@ def beit_large_patch16_224(pretrained=False, **kwargs):
 @register_model
 def beit_large_patch16_384(pretrained=False, **kwargs):
     model_kwargs = dict(
-        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **model_kwargs)
     return model
@@ -451,7 +486,7 @@ def beit_large_patch16_384(pretrained=False, **kwargs):
 @register_model
 def beit_large_patch16_512(pretrained=False, **kwargs):
     model_kwargs = dict(
-        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **model_kwargs)
     return model
@@ -460,7 +495,7 @@ def beit_large_patch16_512(pretrained=False, **kwargs):
 @register_model
 def beit_large_patch16_224_in22k(pretrained=False, **kwargs):
     model_kwargs = dict(
-        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beit_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
     return model
@@ -487,7 +522,7 @@ def beitv2_base_patch16_224_in22k(pretrained=False, **kwargs):
 @register_model
 def beitv2_large_patch16_224(pretrained=False, **kwargs):
     model_kwargs = dict(
-        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **model_kwargs)
     return model
@@ -496,7 +531,34 @@ def beitv2_large_patch16_224(pretrained=False, **kwargs):
 @register_model
 def beitv2_large_patch16_224_in22k(pretrained=False, **kwargs):
     model_kwargs = dict(
-        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
         use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
     model = _create_beit('beitv2_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
     return model
+
+
+@register_model
+def eva_giant_patch14_224(pretrained=False, **kwargs):
+    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
+    model = _create_beit('eva_giant_patch14_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def eva_giant_patch14_336(pretrained=False, **kwargs):
+    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
+    model = _create_beit('eva_giant_patch14_336', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def eva_giant_patch14_560(pretrained=False, **kwargs):
+    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
+    model = _create_beit('eva_giant_patch14_560', pretrained=pretrained, **model_kwargs)
+    return model
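
The three `eva_giant_*` entry points differ only in default input resolution; `mlp_ratio=6144 / 1408` (≈ 4.36) reproduces EVA-g's 6144-wide MLP hidden dimension over its 1408-wide embedding, rather than the BEiT default of 4. A minimal smoke test with random weights (no download needed, though the ~1B parameter model wants several GB of RAM):

```python
import torch
import timm

# Build the 560px EVA-g without pretrained weights and run a dummy forward pass
model = timm.create_model('eva_giant_patch14_560', pretrained=False).eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 560, 560))
print(logits.shape)  # expected: torch.Size([1, 1000])
```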
