File size: 3,174 Bytes
60c8e7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Absolute locations of the pretrained backbone checkpoints on shared
# cluster storage. Keyed by an UPPER_SNAKE alias; update the values when
# the checkpoints are moved.
pretrained_paths = {
    "BEATs_PATH": "/mnt/petrelfs/lixinhao/lxh_exp/pretrained_models/beats/BEATs_iter3+.pt",
    "UMT_S1_B_PATH": "/mnt/lustre/share/videointern/annotations/pretained_model/clipmae_vit_b16_k710_e200.pth",
    "UMT_S1_L_PATH": "/mnt/lustre/share/videointern/annotations/pretained_model/clipmae_vit_l16_k710_e200.pth",
    "UMT_S1_g_PATH": "/mnt/petrelfs/share_data/likunchang/model/um_teacher/umt2/vit_g14_1.1M_CLIP+MAE_300e_pt_k710_ft.pth",
    "InternVL_6B_PATH": "/mnt/petrelfs/share_data/wangwenhai/internvl/6b_vit_exp126_clip_alpaca_7b_laion5b_peak_1e-5_256gpu_all_trainable_degradation.sh/1499/mp_rank_00_model_states.pt",
}


# Registry of vision-encoder configurations, keyed by a short alias.
# Each entry records the internal model name, the HuggingFace checkpoint
# to initialize from, and the encoder hidden size (d_model).
VisionEncoders = {
    "beit": {
        "name": "beit_base",
        "pretrained": "microsoft/beit-base-patch16-224-pt22k-ft22k",
        "d_model": 768,
    },
    "beit_large": {
        "name": "beit_large",
        "pretrained": "microsoft/beit-large-patch16-224-pt22k-ft22k",
        "d_model": 1024,
    },
}

# Registry of text-encoder configurations, keyed by a short alias.
# Each entry records the internal model name, the HuggingFace checkpoint
# to initialize from, the JSON config path, and the hidden size; entries
# that do cross-modal fusion additionally record the layer index where
# fusion begins ("fusion_layer").
TextEncoders = {
    "bert": {
        "name": "bert_base",
        "pretrained": "bert-base-uncased",
        "config": "configs/config_bert.json",
        "d_model": 768,
        "fusion_layer": 9,
    },
    "bert_fusion6": {
        "name": "bert_base_fusion6",
        "pretrained": "bert-base-uncased",
        "config": "configs/config_bert_fusion6.json",
        "d_model": 768,
        "fusion_layer": 6,
    },
    "bert_large": {
        "name": "bert_large",
        "pretrained": "bert-large-uncased",
        "config": "configs/config_bert_large.json",
        "d_model": 1024,
        "fusion_layer": 19,
    },
    "med_bert": {
        "name": "med_bert_base",
        "pretrained": "bert-base-uncased",
        "config": "configs/med_config.json",
        "d_model": 768,
    },
    "med_bert_freq2": {
        "name": "med_bert_base_freq2",
        "pretrained": "bert-base-uncased",
        "config": "configs/med_config_freq2.json",
        "d_model": 768,
    },
    "med_bert_freq2_must": {
        "name": "med_bert_base_freq2_must",
        "pretrained": "bert-base-uncased",
        "config": "configs/med_config_freq2_must.json",
        "d_model": 768,
    },
}

# The "med_bert_fusion{L}" variants are identical except for where fusion
# starts; generate them in the original registration order (10, 9, 6, 0, 3)
# so dict insertion order is preserved.
for _fusion_layer in (10, 9, 6, 0, 3):
    TextEncoders[f"med_bert_fusion{_fusion_layer}"] = {
        "name": "med_bert_base_fusion",
        "pretrained": "bert-base-uncased",
        "config": "configs/med_config_fusion.json",
        "d_model": 768,
        "fusion_layer": _fusion_layer,
    }

TextEncoders["med_bert_large"] = {
    "name": "med_bert_large",
    # Deliberately the *base* checkpoint: not a bug, it just follows BLIP.
    "pretrained": "bert-base-uncased",
    "config": "configs/med_large_config.json",
    "d_model": 768,
}