anonymous-upload-neurips-2025 committed on
Commit
88c922f
·
verified ·
1 Parent(s): cb3baf9

Upload 221 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +28 -0
  2. Finetuning/.gitattributes +2 -0
  3. Finetuning/.gitignore +153 -0
  4. Finetuning/CITATION.cff +33 -0
  5. Finetuning/HISTORY.md +223 -0
  6. Finetuning/LICENSE +23 -0
  7. Finetuning/MANIFEST.in +3 -0
  8. Finetuning/Makefile +12 -0
  9. Finetuning/README.md +47 -0
  10. Finetuning/annotations/annotations_real_100percent.txt +3 -0
  11. Finetuning/annotations/annotations_real_50percent.txt +3 -0
  12. Finetuning/annotations/annotations_real_all.csv +3 -0
  13. Finetuning/annotations/annotations_real_men.csv +3 -0
  14. Finetuning/annotations/annotations_real_men_synthetic_women.csv +3 -0
  15. Finetuning/annotations/annotations_real_men_synthetic_women_from_women.csv +3 -0
  16. Finetuning/annotations/annotations_real_men_synthetic_women_paths.csv +3 -0
  17. Finetuning/annotations/annotations_real_women.csv +3 -0
  18. Finetuning/annotations/annotations_real_women_synthetic_men.csv +3 -0
  19. Finetuning/annotations/annotations_real_women_synthetic_men_from_men.csv +3 -0
  20. Finetuning/annotations/annotations_real_women_synthetic_men_paths.csv +3 -0
  21. Finetuning/annotations/annotations_synthetic_100percent.txt +3 -0
  22. Finetuning/annotations/annotations_synthetic_50percent.txt +3 -0
  23. Finetuning/annotations/annotations_synthetic_all.csv +3 -0
  24. Finetuning/annotations/annotations_synthetic_men_from_men_synthetic_women_from_women.csv +3 -0
  25. Finetuning/docs/CLIP.png +3 -0
  26. Finetuning/docs/Interacting_with_open_clip.ipynb +0 -0
  27. Finetuning/docs/Interacting_with_open_coca.ipynb +118 -0
  28. Finetuning/docs/LOW_ACC.md +38 -0
  29. Finetuning/docs/PRETRAINED.md +288 -0
  30. Finetuning/docs/clip_conceptual_captions.md +13 -0
  31. Finetuning/docs/clip_loss.png +0 -0
  32. Finetuning/docs/clip_recall.png +0 -0
  33. Finetuning/docs/clip_val_loss.png +0 -0
  34. Finetuning/docs/clip_zeroshot.png +0 -0
  35. Finetuning/docs/clipa.md +103 -0
  36. Finetuning/docs/clipa_acc_compute.png +3 -0
  37. Finetuning/docs/clipa_reduce_image_token.png +3 -0
  38. Finetuning/docs/clipa_reduce_text_token.png +0 -0
  39. Finetuning/docs/datacomp_models.md +69 -0
  40. Finetuning/docs/effective_robustness.png +3 -0
  41. Finetuning/docs/inverse_scaling_law.png +3 -0
  42. Finetuning/docs/laion2b_clip_zeroshot_b32.png +3 -0
  43. Finetuning/docs/laion_clip_zeroshot.png +3 -0
  44. Finetuning/docs/laion_clip_zeroshot_b16.png +3 -0
  45. Finetuning/docs/laion_clip_zeroshot_b16_plus_240.png +3 -0
  46. Finetuning/docs/laion_clip_zeroshot_l14.png +3 -0
  47. Finetuning/docs/laion_openai_compare_b32.jpg +0 -0
  48. Finetuning/docs/model_profile.csv +86 -0
  49. Finetuning/docs/openclip_classification_results.csv +122 -0
  50. Finetuning/docs/openclip_multilingual_retrieval_results.csv +0 -0
.gitattributes CHANGED
@@ -35,3 +35,31 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  BrushNet/docs/source/en/imgs/access_request.png filter=lfs diff=lfs merge=lfs -text
37
  Color-Invariant-Skin-Segmentation/color[[:space:]]augmentation/color_augmentation.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  BrushNet/docs/source/en/imgs/access_request.png filter=lfs diff=lfs merge=lfs -text
37
  Color-Invariant-Skin-Segmentation/color[[:space:]]augmentation/color_augmentation.png filter=lfs diff=lfs merge=lfs -text
38
+ Finetuning/annotations/annotations_real_100percent.txt filter=lfs diff=lfs merge=lfs -text
39
+ Finetuning/annotations/annotations_real_50percent.txt filter=lfs diff=lfs merge=lfs -text
40
+ Finetuning/annotations/annotations_real_all.csv filter=lfs diff=lfs merge=lfs -text
41
+ Finetuning/annotations/annotations_real_men_synthetic_women_from_women.csv filter=lfs diff=lfs merge=lfs -text
42
+ Finetuning/annotations/annotations_real_men_synthetic_women_paths.csv filter=lfs diff=lfs merge=lfs -text
43
+ Finetuning/annotations/annotations_real_men_synthetic_women.csv filter=lfs diff=lfs merge=lfs -text
44
+ Finetuning/annotations/annotations_real_men.csv filter=lfs diff=lfs merge=lfs -text
45
+ Finetuning/annotations/annotations_real_women_synthetic_men_from_men.csv filter=lfs diff=lfs merge=lfs -text
46
+ Finetuning/annotations/annotations_real_women_synthetic_men_paths.csv filter=lfs diff=lfs merge=lfs -text
47
+ Finetuning/annotations/annotations_real_women_synthetic_men.csv filter=lfs diff=lfs merge=lfs -text
48
+ Finetuning/annotations/annotations_real_women.csv filter=lfs diff=lfs merge=lfs -text
49
+ Finetuning/annotations/annotations_synthetic_100percent.txt filter=lfs diff=lfs merge=lfs -text
50
+ Finetuning/annotations/annotations_synthetic_50percent.txt filter=lfs diff=lfs merge=lfs -text
51
+ Finetuning/annotations/annotations_synthetic_all.csv filter=lfs diff=lfs merge=lfs -text
52
+ Finetuning/annotations/annotations_synthetic_men_from_men_synthetic_women_from_women.csv filter=lfs diff=lfs merge=lfs -text
53
+ Finetuning/docs/CLIP.png filter=lfs diff=lfs merge=lfs -text
54
+ Finetuning/docs/clipa_acc_compute.png filter=lfs diff=lfs merge=lfs -text
55
+ Finetuning/docs/clipa_reduce_image_token.png filter=lfs diff=lfs merge=lfs -text
56
+ Finetuning/docs/effective_robustness.png filter=lfs diff=lfs merge=lfs -text
57
+ Finetuning/docs/inverse_scaling_law.png filter=lfs diff=lfs merge=lfs -text
58
+ Finetuning/docs/laion_clip_zeroshot_b16_plus_240.png filter=lfs diff=lfs merge=lfs -text
59
+ Finetuning/docs/laion_clip_zeroshot_b16.png filter=lfs diff=lfs merge=lfs -text
60
+ Finetuning/docs/laion_clip_zeroshot_l14.png filter=lfs diff=lfs merge=lfs -text
61
+ Finetuning/docs/laion_clip_zeroshot.png filter=lfs diff=lfs merge=lfs -text
62
+ Finetuning/docs/laion2b_clip_zeroshot_b32.png filter=lfs diff=lfs merge=lfs -text
63
+ Finetuning/has_synthetic_versions filter=lfs diff=lfs merge=lfs -text
64
+ Finetuning/no_synthetic_versions filter=lfs diff=lfs merge=lfs -text
65
+ Finetuning/src/open_clip_train/profile.pstat filter=lfs diff=lfs merge=lfs -text
Finetuning/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.py linguist-language=python
2
+ *.ipynb linguist-documentation
Finetuning/.gitignore ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/logs/
2
+ **/wandb/
3
+ models/
4
+ features/
5
+ results/
6
+
7
+ tests/data/
8
+ *.pt
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104
+ __pypackages__/
105
+
106
+ # Celery stuff
107
+ celerybeat-schedule
108
+ celerybeat.pid
109
+
110
+ # SageMath parsed files
111
+ *.sage.py
112
+
113
+ # Environments
114
+ .env
115
+ .venv
116
+ env/
117
+ venv/
118
+ ENV/
119
+ env.bak/
120
+ venv.bak/
121
+
122
+ # Spyder project settings
123
+ .spyderproject
124
+ .spyproject
125
+
126
+ # Rope project settings
127
+ .ropeproject
128
+
129
+ # mkdocs documentation
130
+ /site
131
+
132
+ # mypy
133
+ .mypy_cache/
134
+ .dmypy.json
135
+ dmypy.json
136
+
137
+ # Pyre type checker
138
+ .pyre/
139
+ sync.sh
140
+ gpu1sync.sh
141
+ .idea
142
+ *.pdf
143
+ **/._*
144
+ **/*DS_*
145
+ **.jsonl
146
+ src/sbatch
147
+ src/misc
148
+ .vscode
149
+ src/debug
150
+ core.*
151
+
152
+ # Allow
153
+ !src/evaluation/misc/results_dbs/*
Finetuning/CITATION.cff ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.1.0
2
+ message: If you use this software, please cite it as below.
3
+ authors:
4
+ - family-names: Ilharco
5
+ given-names: Gabriel
6
+ - family-names: Wortsman
7
+ given-names: Mitchell
8
+ - family-names: Wightman
9
+ given-names: Ross
10
+ - family-names: Gordon
11
+ given-names: Cade
12
+ - family-names: Carlini
13
+ given-names: Nicholas
14
+ - family-names: Taori
15
+ given-names: Rohan
16
+ - family-names: Dave
17
+ given-names: Achal
18
+ - family-names: Shankar
19
+ given-names: Vaishaal
20
+ - family-names: Namkoong
21
+ given-names: Hongseok
22
+ - family-names: Miller
23
+ given-names: John
24
+ - family-names: Hajishirzi
25
+ given-names: Hannaneh
26
+ - family-names: Farhadi
27
+ given-names: Ali
28
+ - family-names: Schmidt
29
+ given-names: Ludwig
30
+ title: OpenCLIP
31
+ version: v0.1
32
+ doi: 10.5281/zenodo.5143773
33
+ date-released: 2021-07-28
Finetuning/HISTORY.md ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 2.24.0
2
+
3
+ * Fix missing space in error message
4
+ * use model flag for normalizing embeddings
5
+ * init logit_bias for non siglip pretrained models
6
+ * Fix logit_bias load_checkpoint addition
7
+ * Make CoCa model match CLIP models for logit scale/bias init
8
+ * Fix missing return of "logit_bias" in CoCa.forward
9
+ * Add NLLB-CLIP with SigLIP models
10
+ * Add get_logits method and NLLB tokenizer
11
+ * Remove the empty file src/open_clip/generation_utils.py
12
+ * Update params.py: "BatchNorm" -> "LayerNorm" in the description string for "--lock-text-freeze-layer-norm"
13
+
14
+ ## 2.23.0
15
+
16
+ * Add CLIPA-v2 models
17
+ * Add SigLIP models
18
+ * Add MetaCLIP models
19
+ * Add NLLB-CLIP models
20
+ * CLIPA train code
21
+ * Minor changes/fixes
22
+ * Remove protobuf version limit
23
+ * Stop checking model name when loading CoCa models
24
+ * Log native wandb step
25
+ * Use bool instead of long masks
26
+
27
+ ## 2.21.0
28
+
29
+ * Add SigLIP loss + training support
30
+ * Add more DataComp models (B/16, B/32 and B/32@256)
31
+ * Update default num workers
32
+ * Update CoCa generation for `transformers>=4.31`
33
+ * PyTorch 2.0 `state_dict()` compatibility fix for compiled models
34
+ * Fix padding in `ResizeMaxSize`
35
+ * Convert JIT model on state dict load for `pretrained='filename…'`
36
+ * Other minor changes and fixes (typos, README, dependencies, CI)
37
+
38
+ ## 2.20.0
39
+
40
+ * Add EVA models
41
+ * Support serial worker training
42
+ * Fix Python 3.7 compatibility
43
+
44
+ ## 2.19.0
45
+
46
+ * Add DataComp models
47
+
48
+ ## 2.18.0
49
+
50
+ * Enable int8 inference without `.weight` attribute
51
+
52
+ ## 2.17.2
53
+
54
+ * Update push_to_hf_hub
55
+
56
+ ## 2.17.0
57
+
58
+ * Add int8 support
59
+ * Update notebook demo
60
+ * Refactor zero-shot classification code
61
+
62
+ ## 2.16.2
63
+
64
+ * Fixes for context_length and vocab_size attributes
65
+
66
+ ## 2.16.1
67
+
68
+ * Fixes for context_length and vocab_size attributes
69
+ * Fix --train-num-samples logic
70
+ * Add HF BERT configs for PubMed CLIP model
71
+
72
+ ## 2.16.0
73
+
74
+ * Add improved g-14 weights
75
+ * Update protobuf version
76
+
77
+ ## 2.15.0
78
+
79
+ * Add convnext_xxlarge weights
80
+ * Fixed import in readme
81
+ * Add samples per second per gpu logging
82
+ * Fix slurm example
83
+
84
+ ## 2.14.0
85
+
86
+ * Move dataset mixtures logic to shard level
87
+ * Fix CoCa accum-grad training
88
+ * Safer transformers import guard
89
+ * get_labels refactoring
90
+
91
+ ## 2.13.0
92
+
93
+ * Add support for dataset mixtures with different sampling weights
94
+ * Make transformers optional again
95
+
96
+ ## 2.12.0
97
+
98
+ * Updated convnext configs for consistency
99
+ * Added input_patchnorm option
100
+ * Clean and improve CoCa generation
101
+ * Support model distillation
102
+ * Add ConvNeXt-Large 320x320 fine-tune weights
103
+
104
+ ## 2.11.1
105
+
106
+ * Make transformers optional
107
+ * Add MSCOCO CoCa finetunes to pretrained models
108
+
109
+ ## 2.11.0
110
+
111
+ * coca support and weights
112
+ * ConvNeXt-Large weights
113
+
114
+ ## 2.10.1
115
+
116
+ * `hf-hub:org/model_id` support for loading models w/ config and weights in Hugging Face Hub
117
+
118
+ ## 2.10.0
119
+
120
+ * Added a ViT-bigG-14 model.
121
+ * Added an up-to-date example slurm script for large training jobs.
122
+ * Added an option to sync logs and checkpoints to S3 during training.
123
+ * New options for LR schedulers, constant and constant with cooldown
124
+ * Fix wandb autoresuming when resume is not set
125
+ * ConvNeXt `base` & `base_w` pretrained models added
126
+ * `timm-` model prefix removed from configs
127
+ * `timm` augmentation + regularization (dropout / drop-path) supported
128
+
129
+ ## 2.9.3
130
+
131
+ * Fix wandb collapsing multiple parallel runs into a single one
132
+
133
+ ## 2.9.2
134
+
135
+ * Fix braceexpand memory explosion for complex webdataset urls
136
+
137
+ ## 2.9.1
138
+
139
+ * Fix release
140
+
141
+ ## 2.9.0
142
+
143
+ * Add training feature to auto-resume from the latest checkpoint on restart via `--resume latest`
144
+ * Allow webp in webdataset
145
+ * Fix logging for number of samples when using gradient accumulation
146
+ * Add model configs for convnext xxlarge
147
+
148
+ ## 2.8.2
149
+
150
+ * wrapped patchdropout in a torch.nn.Module
151
+
152
+ ## 2.8.1
153
+
154
+ * relax protobuf dependency
155
+ * override the default patch dropout value in 'vision_cfg'
156
+
157
+ ## 2.8.0
158
+
159
+ * better support for HF models
160
+ * add support for gradient accumulation
161
+ * CI fixes
162
+ * add support for patch dropout
163
+ * add convnext configs
164
+
165
+
166
+ ## 2.7.0
167
+
168
+ * add multilingual H/14 xlm roberta large
169
+
170
+ ## 2.6.1
171
+
172
+ * fix setup.py _read_reqs
173
+
174
+ ## 2.6.0
175
+
176
+ * Make openclip training usable from pypi.
177
+ * Add xlm roberta large vit h 14 config.
178
+
179
+ ## 2.5.0
180
+
181
+ * pretrained B/32 xlm roberta base: first multilingual clip trained on laion5B
182
+ * pretrained B/32 roberta base: first clip trained using an HF text encoder
183
+
184
+ ## 2.4.1
185
+
186
+ * Add missing hf_tokenizer_name in CLIPTextCfg.
187
+
188
+ ## 2.4.0
189
+
190
+ * Fix #211, missing RN50x64 config. Fix type of dropout param for ResNet models
191
+ * Bring back LayerNorm impl that casts to input for non bf16/fp16
192
+ * zero_shot.py: set correct tokenizer based on args
193
+ * training/params.py: remove hf params and get them from model config
194
+
195
+ ## 2.3.1
196
+
197
+ * Implement grad checkpointing for hf model.
198
+ * custom_text: True if hf_model_name is set
199
+ * Disable hf tokenizer parallelism
200
+
201
+ ## 2.3.0
202
+
203
+ * Generalizable Text Transformer with HuggingFace Models (@iejMac)
204
+
205
+ ## 2.2.0
206
+
207
+ * Support for custom text tower
208
+ * Add checksum verification for pretrained model weights
209
+
210
+ ## 2.1.0
211
+
212
+ * lot including sota models, bfloat16 option, better loading, better metrics
213
+
214
+ ## 1.2.0
215
+
216
+ * ViT-B/32 trained on Laion2B-en
217
+ * add missing openai RN50x64 model
218
+
219
+ ## 1.1.1
220
+
221
+ * ViT-B/16+
222
+ * Add grad checkpointing support
223
+ * more robust data loader
Finetuning/LICENSE ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
2
+ Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
3
+ John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
4
+ Ludwig Schmidt
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining
7
+ a copy of this software and associated documentation files (the
8
+ "Software"), to deal in the Software without restriction, including
9
+ without limitation the rights to use, copy, modify, merge, publish,
10
+ distribute, sublicense, and/or sell copies of the Software, and to
11
+ permit persons to whom the Software is furnished to do so, subject to
12
+ the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Finetuning/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include src/open_clip/bpe_simple_vocab_16e6.txt.gz
2
+ include src/open_clip/model_configs/*.json
3
+
Finetuning/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ install: ## [Local development] Upgrade pip, install requirements, install package.
2
+ python -m pip install -U pip
3
+ python -m pip install -e .
4
+
5
+ install-training:
6
+ python -m pip install -r requirements-training.txt
7
+
8
+ install-test: ## [Local development] Install test requirements
9
+ python -m pip install -r requirements-test.txt
10
+
11
+ test: ## [Local development] Run unit tests
12
+ python -m pytest -x -s -v tests
Finetuning/README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenCLIP
2
+
3
+ This is a fork of <a href="https://github.com/mlfoundations/open_clip">OpenCLIP</a> used to fine-tune CLIP models with PinPoint counterfactuals. Refer to the original repository for more details on open_clip.
4
+
5
+
6
+ ### Installation
7
+
8
+ ```
9
+ pip install open_clip_torch
10
+ ```
11
+
12
+
13
+ ### Pretrained models
14
+
15
+ For LAION-pretrained models, download them and place them in the `./pretrained_models` directory (this can be done with the open_clip CLI interface).
16
+
17
+ ### Sample single-process running code:
18
+
19
+ To finetune CLIP models on CC3M:
20
+
21
+ ```bash
22
+ python -m open_clip_train.main \
23
+ --save-frequency 1 \
24
+ --zeroshot-frequency 1 \
25
+ --report-to tensorboard \
26
+ --train-data="..path_to_image_list.csv" \
27
+ --csv-img-key="Image_ID" \
28
+ --csv-caption-key="Caption" \
29
+ --val-data="/path/to/validation_data.csv" \
30
+ --imagenet-val="/path/to/imagenet/root/val/" \
31
+ --warmup 10000 \
32
+ --batch-size=128 \
33
+ --accum_freq=10 \
34
+ --lr=5e-06 \
35
+ --wd=0.1 \
36
+ --epochs=410 \
37
+ --workers=8 \
38
+ --pretrained_model="pretrained_models/vit_b16_laion2b.pth" \
39
+ --model ViT-B-16
40
+ ```
41
+
42
+ Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set!
43
+ You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
44
+
45
+ Note: the `train_data` should point to a *.csv file that contains the filelist with generated images in the following format:
46
+ `IMAGE_ID IMAGE_CAPTION`, separated by a tab character (`\t`). You can find the lists for our in-painted data under `./annotations`.
47
+
Finetuning/annotations/annotations_real_100percent.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98638b902336c244e5bd86ea5e6a60138f5b5794adcc948a14920c33265fa789
3
+ size 41566227
Finetuning/annotations/annotations_real_50percent.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:674fe0c0dc21ab88a74cdabb55aa02f2ddea7f30b9b7b53ae2650fce1dfe322d
3
+ size 22229313
Finetuning/annotations/annotations_real_all.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:468f674896ffe6a4ee3a2651b1e1c271b14a87ffd0c20a83c98f62f2616039f6
3
+ size 26600575
Finetuning/annotations/annotations_real_men.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:620fb29bfc66bb33e21e3e198aeb2a4832cff6c6dba03b9ab08708e9950b60fb
3
+ size 14198335
Finetuning/annotations/annotations_real_men_synthetic_women.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c58a16ff99edd0dc1822ec0fee2e0b249491a5464f7394f39b2e082f27a4625
3
+ size 19894682
Finetuning/annotations/annotations_real_men_synthetic_women_from_women.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fbd9dc20937285cf630dffbfe876ae403e8f7dd26fea574e102a2fb4a1bd1f4
3
+ size 32623606
Finetuning/annotations/annotations_real_men_synthetic_women_paths.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba0c694055130be125b8d8edb68a4543c4ef38c46c2b5df14f28bbca1d8d8ce
3
+ size 35839721
Finetuning/annotations/annotations_real_women.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3708c1a46bc8aec62f9589aeb4e0cf686e4645e1eafb6e9b70283f1cbef9b3a6
3
+ size 12402257
Finetuning/annotations/annotations_real_women_synthetic_men.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28369e7cab12fec141af67b076190ede8c3676a32a974b659a506c70772cfa0b
3
+ size 17371559
Finetuning/annotations/annotations_real_women_synthetic_men_from_men.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd72421e35fc4ca03a4b73b2980dd7f9524b229c15c53bf4e1389d302e81b1c3
3
+ size 30036906
Finetuning/annotations/annotations_real_women_synthetic_men_paths.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7cfa0641e5b15778b5c80e86469b5fbd253d74385a63048c993189addb29865
3
+ size 30459998
Finetuning/annotations/annotations_synthetic_100percent.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fba3407e2ab50815d3e52d9b0a6e85467394c363ae38f8b7bf37ec15d5ea47a
3
+ size 92116871
Finetuning/annotations/annotations_synthetic_50percent.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb68c45114caa94c90bf2e737a0fd4983fa1eb5f1e35e8b9b061ee2b9a05736
3
+ size 43097186
Finetuning/annotations/annotations_synthetic_all.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66a910ffa9d7cd24b837a04b229f15c2ba70a6055ed4a318ca1f2aa8fd61a19d
3
+ size 39699144
Finetuning/annotations/annotations_synthetic_men_from_men_synthetic_women_from_women.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f16ac4b764930607639f762ad6271e800fe83df2de90cb798ca85bef483cd9a
3
+ size 36059937
Finetuning/docs/CLIP.png ADDED

Git LFS Details

  • SHA256: 308a3ca4503f1c7a07803916c369d78c4ef501e5ab7fc727da9b5e1d2f9ec85b
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
Finetuning/docs/Interacting_with_open_clip.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Finetuning/docs/Interacting_with_open_coca.ipynb ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ },
15
+ "accelerator": "GPU",
16
+ "gpuClass": "standard"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "code",
21
+ "source": [
22
+ "!pip install open_clip_torch transformers"
23
+ ],
24
+ "metadata": {
25
+ "id": "JvaEkx8Cyvhg"
26
+ },
27
+ "execution_count": null,
28
+ "outputs": []
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 18,
33
+ "metadata": {
34
+ "id": "vE4lFFkKyotX"
35
+ },
36
+ "outputs": [],
37
+ "source": [
38
+ "import open_clip\n",
39
+ "import torch\n",
40
+ "\n",
41
+ "model, _, transform = open_clip.create_model_and_transforms(\n",
42
+ " model_name=\"coca_ViT-L-14\",\n",
43
+ " pretrained=\"mscoco_finetuned_laion2B-s13B-b90k\"\n",
44
+ ")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "source": [
50
+ "!wget https://i.imgur.com/8H7XCH0.jpg -O cat.jpg"
51
+ ],
52
+ "metadata": {
53
+ "id": "oOaE1AmDyth_"
54
+ },
55
+ "execution_count": null,
56
+ "outputs": []
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "from IPython.display import Image\n",
62
+ "Image('cat.jpg')"
63
+ ],
64
+ "metadata": {
65
+ "colab": {
66
+ "base_uri": "https://localhost:8080/",
67
+ "height": 407
68
+ },
69
+ "id": "Y9Q6bhVA2L01",
70
+ "outputId": "1b920080-e8cd-4d2f-fb23-30e6f1b612f9"
71
+ },
72
+ "execution_count": 19,
73
+ "outputs": [
74
+ {
75
+ "output_type": "execute_result",
76
+ "data": {
77
+ "image/jpeg": "\n",
78
+ "text/plain": [
79
+ "<IPython.core.display.Image object>"
80
+ ]
81
+ },
82
+ "metadata": {},
83
+ "execution_count": 19
84
+ }
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "source": [
90
+ "from PIL import Image\n",
91
+ "im = Image.open(\"cat.jpg\").convert(\"RGB\")\n",
92
+ "im = transform(im).unsqueeze(0)\n",
93
+ "\n",
94
+ "with torch.no_grad(), torch.cuda.amp.autocast():\n",
95
+ " generated = model.generate(im)\n",
96
+ "\n",
97
+ "print(open_clip.decode(generated[0]).split(\"<end_of_text>\")[0].replace(\"<start_of_text>\", \"\"))"
98
+ ],
99
+ "metadata": {
100
+ "colab": {
101
+ "base_uri": "https://localhost:8080/"
102
+ },
103
+ "id": "byZKXMGzyr5Y",
104
+ "outputId": "122eb099-6704-4e3c-fa7c-a05dd87ce64f"
105
+ },
106
+ "execution_count": 22,
107
+ "outputs": [
108
+ {
109
+ "output_type": "stream",
110
+ "name": "stdout",
111
+ "text": [
112
+ "an orange and white cat on top of a turtle . \n"
113
+ ]
114
+ }
115
+ ]
116
+ }
117
+ ]
118
+ }
Finetuning/docs/LOW_ACC.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ As we describe in more detail below, CLIP models in a medium accuracy regime already allow us to draw conclusions about the robustness of larger CLIP models since the models follow reliable scaling laws.
2
+
3
+ [Cherti et al., 2022](https://arxiv.org/abs/2212.07143) and [Gadre et al., 2023](https://arxiv.org/abs/2304.14108) provide additional discussion of the scaling behavior of CLIP models.
4
+
5
+ ## Scaling trends
6
+
7
+ The plot below shows how zero-shot performance of CLIP models varies as we scale the number of samples used for training. Zero-shot performance increases steadily for both ImageNet and [ImageNetV2](https://arxiv.org/abs/1902.10811), and is far from saturated at ~15M samples.
8
+
9
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/scaling.png" width="700">
10
+
11
+ ## Why are low-accuracy CLIP models interesting?
12
+
13
+ **TL;DR:** CLIP models have high effective robustness, even at small scales.
14
+
15
+ CLIP models are particularly intriguing because they are more robust to natural distribution shifts (see Section 3.3 in the [CLIP paper](https://arxiv.org/abs/2103.00020)).
16
+ This phenomenon is illustrated by the figure below, with ImageNet accuracy on the x-axis
17
+ and [ImageNetV2](https://arxiv.org/abs/1902.10811) (a reproduction of the ImageNet validation set with distribution shift) accuracy on the y-axis.
18
+ Standard training denotes training on the ImageNet train set and the CLIP zero-shot models
19
+ are shown as stars.
20
+
21
+ ![CLIP scatter plot](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/effective_robustness.png)
22
+
23
+ As observed by [Taori et al., 2020](https://arxiv.org/abs/2007.00644) and [Miller et al., 2021](https://arxiv.org/abs/2107.04649), the in-distribution
24
+ and out-of-distribution accuracies of models trained on ImageNet follow a predictable linear trend (the red line in the above plot). *Effective robustness*
25
+ quantifies robustness as accuracy beyond this baseline, i.e., how far a model lies above the red line. Ideally a model would not suffer from distribution shift and fall on the y = x line ([trained human labelers are within a percentage point of the y = x line](http://proceedings.mlr.press/v119/shankar20c.html)).
26
+
27
+ Even though the CLIP models trained with
28
+ this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same
29
+ trend of improved effective robustness (the purple line). Therefore, we can study what makes
30
+ CLIP robust without requiring industrial-scale compute.
31
+
32
+ For more information on effective robustness, please see:
33
+
34
+ - [Recht et al., 2019](https://arxiv.org/abs/1902.10811).
35
+ - [Taori et al., 2020](https://arxiv.org/abs/2007.00644).
36
+ - [Miller et al., 2021](https://arxiv.org/abs/2107.04649).
37
+
38
+ To learn more about the factors that contribute to CLIP's robustness, refer to [Fang et al., 2022](https://arxiv.org/abs/2205.01397).
Finetuning/docs/PRETRAINED.md ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Pretrained model results
2
+
3
+ We evaluate the full collection of available models on a suite of 38 datasets in a zero-shot setting (i.e., without fine-tuning), following [Gadre et al., 2023](https://arxiv.org/abs/2304.14108).
4
+ Click below to see the full results.
5
+
6
+ - [Full results (English)](openclip_results.csv)
7
+ - [Classification-only results](openclip_classification_results.csv)
8
+ - [Retrieval results](openclip_retrieval_results.csv)
9
+ - [Multilingual retrieval results](openclip_multilingual_retrieval_results.csv)
10
+
11
+ ## Pretrained model details
12
+
13
+ Below are details for several of our pretrained models.
14
+
15
+ ### LAION-400M - https://laion.ai/laion-400-open-dataset
16
+
17
+ We ran experiments in an attempt to reproduce OpenAI's ViT results with the comparably sized (and open) LAION-400M dataset. Trained
18
+ weights can be found in release [v0.2](https://github.com/mlfoundations/open_clip/releases/tag/v0.2-weights).
19
+
20
+ The LAION400M weights have been trained on the JUWELS supercomputer (see acknowledgements section below).
21
+
22
+ #### ViT-B/32 224x224
23
+
24
+ We replicate OpenAI's results on ViT-B/32, reaching a top-1 ImageNet-1k zero-shot accuracy of 62.96%.
25
+
26
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot.png" width="700">
27
+
28
+ **Zero-shot comparison (courtesy of Andreas Fürst)**
29
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_openai_compare_b32.jpg" width="700">
30
+
31
+ ViT-B/32 was trained with 128 A100 (40 GB) GPUs for ~36 hours, 4600 GPU-hours. The per-GPU batch size was 256 for a global batch size of 32768. 256 is much lower than it could have been (~320-384) due to being sized initially before moving to 'local' contrastive loss.
32
+
33
+ #### ViT-B/16 224x224
34
+
35
+ The B/16 LAION400M training reached a top-1 ImageNet-1k zero-shot validation score of 67.07.
36
+
37
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_b16.png" width="700">
38
+
39
+ This was the first major train session using the updated webdataset 0.2.x code. A bug was found that prevented shards from being shuffled properly between nodes/workers each epoch. This was fixed part way through training (epoch 26) but likely had an impact.
40
+
41
+ ViT-B/16 was trained with 176 A100 (40 GB) GPUs for ~61 hours, 10700 GPU-hours. Batch size per GPU was 192 for a global batch size of 33792.
42
+
43
+ #### ViT-B/16+ 240x240
44
+
45
+ The B/16+ 240x240 LAION400M training reached a top-1 ImageNet-1k zero-shot validation score of 69.21.
46
+
47
+ This model is the same depth as the B/16, but increases the
48
+
49
+ - vision width from 768 -> 896
50
+ - text width from 512 -> 640
51
+ - the resolution 224x224 -> 240x240 (196 -> 225 tokens)
52
+
53
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_b16_plus_240.png" width="700">
54
+
55
+ Unlike the B/16 run above, this model was a clean run with no dataset shuffling issues.
56
+
57
+ ViT-B/16+ was trained with 224 A100 (40 GB) GPUs for ~61 hours, 13620 GPU-hours. Batch size per GPU was 160 for a global batch size of 35840.
58
+
59
+ #### ViT-L/14 224x224
60
+
61
+ The L/14 LAION-400M training reached a top-1 ImageNet-1k zero-shot validation score of 72.77.
62
+
63
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_l14.png" width="700">
64
+
65
+ ViT-L/14 was trained with 400 A100 (40 GB) GPUs for ~127 hours, 50800 GPU-hours. Batch size per GPU was 96 for a global batch size of 38400. Grad checkpointing was enabled.
66
+
67
+ ### LAION-2B (en) - https://laion.ai/laion-5b-a-new-era-of-open-large-scale-multi-modal-datasets/
68
+
69
+ A ~2B sample subset of LAION-5B with english captions (https://huggingface.co/datasets/laion/laion2B-en)
70
+
71
+ #### ViT-B/32 224x224
72
+
73
+ A ViT-B/32 trained on LAION-2B, reaching a top-1 ImageNet-1k zero-shot accuracy of 65.62%.
74
+
75
+ <img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion2b_clip_zeroshot_b32.png" width="700">
76
+
77
+ ViT-B/32 was trained with 112 A100 (40 GB) GPUs. The per-GPU batch size was 416 for a global batch size of 46592. Compute generously provided by [stability.ai](https://stability.ai/).
78
+
79
+ A second iteration of B/32 was trained on stability.ai cluster with a larger global batch size and learning rate, hitting 66.6% top-1. See https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
80
+
81
+ #### ViT-L/14 224x224
82
+
83
+ A ViT-L/14 with a 75.3% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K
84
+
85
+ These weights use a different dataset mean and std than others. Instead of using the OpenAI mean & std, inception style normalization `[-1, 1]` is used via a mean and std of `[0.5, 0.5, 0.5]`. This is handled automatically if using `open_clip.create_model_and_transforms` from pretrained weights.
86
+
87
+ #### ViT-H/14 224x224
88
+
89
+ A ViT-H/14 with a 78.0% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
90
+
91
+ #### ViT-g/14 224x224
92
+
93
+ A ViT-g/14 with a 76.6% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-g-14-laion2B-s12B-b42K
94
+
95
+ This model was trained with a shorter schedule than other LAION-2B models with 12B samples seen instead of 32+B. It matches LAION-400M training in samples seen. Many zero-shot results are lower as a result, but despite this it performs very well in some OOD zero-shot and retrieval tasks.
96
+
97
+ #### ViT-B/32 roberta base
98
+
99
+ A ViT-B/32 with roberta base encoder with a 61.7% top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k
100
+ This is the first openclip model using a HF text tower. It has better performance on a range of tasks compared to the standard text encoder, see [metrics](https://huggingface.co/laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k/blob/main/unknown.png)
101
+
102
+ #### ViT-B/32 xlm roberta base
103
+
104
+ A ViT-B/32 with xlm roberta base encoder with a 62.33% top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k
105
+ This is the first openclip model trained on the full laion5B dataset; hence the first multilingual clip trained with openclip. It has better performance on a range of tasks compared to the standard text encoder, see [metrics](https://huggingface.co/laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k/blob/main/metrics.png)
106
+ A preliminary multilingual evaluation was run: 43% on imagenet1k italian (vs 21% for english B/32), 37% for imagenet1k japanese (vs 1% for english B/32 and 50% for B/16 clip japanese). It shows the multilingual property is indeed there as expected. Larger models will get even better performance.
107
+
108
+ #### ViT-H/14 xlm roberta large
109
+
110
+ A ViT-H/14 with xlm roberta large encoder with a 77.0% (vs 78% for the english equivalent) top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k
111
+
112
+ This model was trained following the [LiT](https://arxiv.org/abs/2111.07991) methodology: the image tower was frozen (initialized from english openclip ViT-H/14), the text tower was initialized from [xlm roberta large](https://huggingface.co/xlm-roberta-large) and unfrozen. This reduced training cost by a 3x factor.
113
+
114
+ See full english [metrics](https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k/resolve/main/results_xlm_roberta_large.png)
115
+
116
+ On zero shot classification on imagenet with translated prompts this model reaches:
117
+
118
+ - 56% in italian (vs 21% for https://github.com/clip-italian/clip-italian)
119
+ - 53% in japanese (vs 54.6% for https://github.com/rinnakk/japanese-clip)
120
+ - 55.7% in chinese (to be compared with https://github.com/OFA-Sys/Chinese-CLIP)
121
+
122
+ #### YFCC-15M
123
+
124
+ Below are checkpoints of models trained on YFCC-15M, along with their zero-shot top-1 accuracies on ImageNet and ImageNetV2. These models were trained using 8 GPUs and the same hyperparameters described in the "Sample running code" section, with the exception of `lr=5e-4` and `epochs=32`.
125
+
126
+ - [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt) (32.7% / 27.9%)
127
+ - [ResNet-101](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt) (34.8% / 30.0%)
128
+
129
+ #### CC12M - https://github.com/google-research-datasets/conceptual-12m
130
+
131
+ - [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt) (36.45%)
132
+
133
+ ### CommonPool and DataComp models
134
+
135
+ As part of [DataComp](https://github.com/mlfoundations/datacomp), we trained models on CommonPool using various data filtering strategies.
136
+
137
+ The best performing models are specified below for the xlarge scale, see our paper [DataComp: In search of the next generation of multimodal datasets](https://arxiv.org/abs/2304.14108) for more details.
138
+
139
+ Additional models and more information can be found at [/docs/datacomp_models.md](/docs/datacomp_models.md).
140
+
141
+ - `datacomp_xl_s13b_b90k`: A ViT-L/14 trained on DataComp-1B for 12.8B steps and batch size 90k. Achieves 79.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K.
142
+
143
+ - `commonpool_xl_clip_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using CLIP scores, for 12.8B steps and batch size 90k. Achieves 76.4% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K.
144
+
145
+ - `commonpool_xl_laion_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using the LAION-2B filtering scheme, for 12.8B steps and batch size 90k. Achieves 75.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K.
146
+
147
+ - `commonpool_xl_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL without any filtering, for 12.8B steps and batch size 90k. Achieves 72.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K.
148
+
149
+ If you use models trained on DataComp-1B or CommonPool variations, please consider citing the following:
150
+
151
+ ```bibtex
152
+ @article{datacomp,
153
+ title={DataComp: In search of the next generation of multimodal datasets},
154
+ author={Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, Hannaneh Hajishirzi, Ali Farhadi, Romain Beaumont, Sewoong Oh, Alex Dimakis, Jenia Jitsev, Yair Carmon, Vaishaal Shankar, Ludwig Schmidt},
155
+ journal={arXiv preprint arXiv:2304.14108},
156
+ year={2023}
157
+ }
158
+ ```
159
+
160
+ ### MetaCLIP
161
+
162
+ MetaCLIP models are described in the paper [Demystifying CLIP Data](https://arxiv.org/abs/2309.16671).
163
+ These models were developed by Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer and Christoph Feichtenhofer from Meta, New York University and the University of Washington.
164
+
165
+ Models are licensed under CC-BY-NC.
166
+ More details are available at https://github.com/facebookresearch/MetaCLIP.
167
+
168
+ If you use MetaCLIP models, please cite the following:
169
+
170
+ ```bibtex
171
+ @inproceedings{xu2023metaclip,
172
+ title={Demystifying CLIP Data},
173
+ author={Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer and Christoph Feichtenhofer},
174
+ journal={arXiv preprint arXiv:2309.16671},
175
+ year={2023}
176
+ }
177
+ ```
178
+
179
+ ### EVA-CLIP
180
+
181
+ EVA-CLIP models are described in the paper [EVA-CLIP: Improved Training Techniques for CLIP at Scale](https://arxiv.org/abs/2303.15389).
182
+ These models were developed by Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang and Yue Cao from BAAI and HUST.
183
+
184
+ Models are licensed under the MIT License.
185
+ More details are available at https://github.com/baaivision/EVA/tree/master/EVA-CLIP.
186
+
187
+ If you use EVA models, please cite the following:
188
+
189
+ ```bibtex
190
+ @article{EVA-CLIP,
191
+ title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
192
+ author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
193
+ journal={arXiv preprint arXiv:2303.15389},
194
+ year={2023}
195
+ }
196
+ ```
197
+
198
+ ### NLLB-CLIP
199
+
200
+ NLLB-CLIP models are described in the paper [NLLB-CLIP - train performant multilingual image retrieval model on a budget](https://arxiv.org/abs/2309.01859) by Alexander Visheratin.
201
+
202
+ The model was trained following the [LiT](https://arxiv.org/abs/2111.07991) methodology: the image tower was frozen, the text tower was initialized from the [NLLB](https://arxiv.org/abs/2207.04672) encoder and unfrozen.
203
+
204
+ The model was trained on the [LAION-COCO-NLLB](https://huggingface.co/datasets/visheratin/laion-coco-nllb) dataset.
205
+
206
+ The first version of the model (`nllb-clip`) described in the paper was trained using the OpenAI CLIP image encoder.
207
+
208
+ The second version of the model (`nllb-clip-siglip`) was trained using the [SigLIP](https://arxiv.org/abs/2303.15343) image encoder.
209
+
210
+ Models are licensed under CC-BY-NC.
211
+
212
+ If you use NLLB-CLIP models, please cite the following:
213
+
214
+ ```bibtex
215
+ @article{visheratin2023nllb,
216
+ title={NLLB-CLIP--train performant multilingual image retrieval model on a budget},
217
+ author={Visheratin, Alexander},
218
+ journal={arXiv preprint arXiv:2309.01859},
219
+ year={2023}
220
+ }
221
+ ```
222
+
223
+ ### CLIPA
224
+
225
+ CLIPA models are described in the following papers by Xianhang Li, Zeyu Wang, Cihang Xie from UC Santa Cruz:
226
+
227
+ - [An Inverse Scaling Law for CLIP Training](https://arxiv.org/abs/2305.07017)
228
+ - [CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy](https://arxiv.org/abs/2306.15658)
229
+
230
+ Models are licensed under Apache 2.0.
231
+ More details are available at https://github.com/UCSC-VLAA/CLIPA and [here](clipa.md).
232
+
233
+ If you use CLIPA models, please cite the following:
234
+
235
+ ```bibtex
236
+ @inproceedings{li2023clipa,
237
+ title={An Inverse Scaling Law for CLIP Training},
238
+ author={Xianhang Li and Zeyu Wang and Cihang Xie},
239
+ booktitle={NeurIPS},
240
+ year={2023},
241
+ }
242
+ ```
243
+
244
+ ```bibtex
245
+ @article{li2023clipav2,
246
+ title={CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy},
247
+ author={Xianhang Li and Zeyu Wang and Cihang Xie},
248
+ journal={arXiv preprint arXiv:2306.15658},
249
+ year={2023},
250
+ }
251
+ ```
252
+
253
+ ### SigLIP
254
+
255
+ SigLIP models are described in the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343).
256
+ These models were developed by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer from Google DeepMind.
257
+
258
+ Models are licensed under the Apache 2 license.
259
+ More details are available at https://github.com/google-research/big_vision.
260
+
261
+ If you use SigLIP models, please cite the following:
262
+
263
+ ```bibtex
264
+ @article{zhai2023sigmoid,
265
+ title={Sigmoid loss for language image pre-training},
266
+ author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
267
+ journal={arXiv preprint arXiv:2303.15343},
268
+ year={2023}
269
+ }
270
+ ```
271
+
272
+ ### DFN
273
+
274
+ Data Filtering Network models are described in https://arxiv.org/abs/2309.17425.
275
+ These models were developed by Alex Fang, Albin Madappally Jose, Amit Jain, Ludwig Schmidt, Alexander Toshev and Vaishaal Shankar from Apple and the University of Washington.
276
+
277
+ Models are licensed under the following: https://huggingface.co/apple/DFN5B-CLIP-ViT-H-14-384/blob/main/LICENSE.
278
+
279
+ If you use DFN models, please cite the following:
280
+
281
+ ```bibtex
282
+ @article{fang2023data,
283
+ title={Data Filtering Networks},
284
+ author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
285
+ journal={arXiv preprint arXiv:2309.17425},
286
+ year={2023}
287
+ }
288
+ ```
Finetuning/docs/clip_conceptual_captions.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Additional training curves for CLIP on Conceptual Captions
2
+
3
+ # Zero shot accuracy
4
+ ![](/docs/clip_zeroshot.png)
5
+
6
+ # Training loss curve
7
+ ![](/docs/clip_loss.png)
8
+
9
+ # Validation loss curve
10
+ ![](/docs/clip_val_loss.png)
11
+
12
+ # Validation recall
13
+ ![](/docs/clip_recall.png)
Finetuning/docs/clip_loss.png ADDED
Finetuning/docs/clip_recall.png ADDED
Finetuning/docs/clip_val_loss.png ADDED
Finetuning/docs/clip_zeroshot.png ADDED
Finetuning/docs/clipa.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## CLIPA
2
+
3
+ In this work, we present a surprising finding that there exists an _inverse_ scaling law for CLIP training,
4
+ whereby the larger the image/text encoders used, the shorter the sequence length of image/text tokens that can be applied in training.
5
+ Moreover, we showcase that the strategy for reducing image/text token length plays a crucial role in determining the quality of this scaling law.
6
+
7
+ ![](/docs/inverse_scaling_law.png)
8
+
9
+ As a result of this finding, we are able to successfully train CLIP even by using academic resources.
10
+ For example, on an A100 eight-GPU server, our CLIP models achieve zero-shot top-1 ImageNet accuracies of **63.2%** in about **2 days**,
11
+ **67.8%** in about **3 days**, and **69.3%** in about **4 days**.
12
+
13
+ Moreover, we find that CLIPA at scale leads to state-of-the-art performance. For example, our CLIPA-v2 H/14 achieves a zero-shot top-1 ImageNet accuracy of **81.8%**,
14
+ with a budget less than **$15000**.
15
+
16
+ ![](/docs/clipa_acc_compute.png)
17
+
18
+ For more details, please see our paper [An Inverse Scaling Law for CLIP Training](https://arxiv.org/abs/2305.07017) and
19
+ [CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy](https://arxiv.org/abs/2306.15658).
20
+
21
+
22
+ Eight token length reduction strategies are investigated in this work, detailed as follows.
23
+
24
+
25
+ ## Image token length reduction
26
+
27
+ ![](/docs/clipa_reduce_image_token.png)
28
+
29
+ * `resize`: use `--force-image-size` to specify the image size you want to adopt. We find this strategy generally works the best as it retains full image information.
30
+
31
+ * `random mask`: Randomly mask out image patches. Use `--force-patch-dropout` to specify the mask ratio you want to adopt.
32
+
33
+ * `grid mask`: Preserve one patch in each 2 × 2 grid window. We do not provide implementation for grid masking, as it is only experimental and we generally find resizing works better.
34
+
35
+ * `block mask`: Keep a single block and remove other patches. We do not provide implementation for block masking, as it is only experimental and we generally find resizing works better.
36
+
37
+
38
+ ## Text token length reduction
39
+
40
+ * `syntax mask`: Assign different masking priorities to parts of speech. Specify `"text_mask": syntax` in `"tokenizer_kwargs"` in `"text_cfg"` of model config `json` file to use.
41
+ Specifically, we prioritize retaining nouns, followed by adjectives, and then other words.
42
+ We find this strategy generally works the best as it retains critical information for contrastive learning.
43
+
44
+ * `truncate`: Truncation selects the first N text tokens and discards the rest. This is the default setting of `open_clip`.
45
+
46
+ * `random mask`: Randomly drops a portion of the text tokens. Specify `"text_mask": random` in `"tokenizer_kwargs"` in `"text_cfg"` of model config `json` file to use.
47
+
48
+ * `block mask`: Randomly preserves consecutive text sequences. Specify `"text_mask": block` in `"tokenizer_kwargs"` in `"text_cfg"` of model config `json` file to use.
49
+
50
+
51
+ ## Installation
52
+
53
+ The installation is really the same as `open_clip`, except for the usage of Natural Language Toolkit (NLTK) in `syntax mask` of text token length reduction.
54
+ Please follow the [official doc](https://www.nltk.org/) to install NLTK.
55
+
56
+ Note that the usage of NLTK brings two constraints:
57
+ * Because certain functions like `nltk.pos_tag` from NLTK only support English and Russian for now, the `syntax mask` only works for English.
58
+ We have not tested it on Russian or any other language. Theoretically, it should work the same, given a proper language processing toolkit for other languages.
59
+ If you still want to apply `syntax mask` on other languages, try finding the right toolkit. Otherwise, use other text token length reduction strategies.
60
+ * Some modules of NLTK like `punkt` or `averaged_perceptron_tagger` need to be downloaded first before using NLTK.
61
+ We have included the downloading code in `tokenizer.py`, but this might cause trouble in certain cases.
62
+ You may want to manually download those modules first, by `nltk.download('punkt')` and `nltk.download('averaged_perceptron_tagger')`,
63
+ and then set up the environment variable before running the script `export NLTK_DATA=cache`.
64
+ Note that this is a one-time effort. Remember to comment out those `nltk.download` lines in `tokenizer.py` afterwards.
65
+
66
+ ## Training
67
+ We provide example scripts to reproduce our CLIPA results on an A100 eight-GPU machine under path `docs/script_examples/clipa`.
68
+
69
+ For instance, to reproduce the CLIPA-L/16(I37,T8) results, first run the pre-training script
70
+ ```
71
+ bash docs/script_examples/clipa/vit_l16/i37_t8_pretrain.sh
72
+ ```
73
+ and fine-tune the pre-trained checkpoint with
74
+ ```
75
+ bash docs/script_examples/clipa/vit_l16/i37_t8_finetune.sh
76
+ ```
77
+ - Remember to change the path to dataset to your own path.
78
+ - This is a two-stage training pipeline. Remember to change the path to pre-trained checkpoint to your own when fine-tuning.
79
+ - The training time is ~3 days for pre-training and ~1 day for fine-tuning on an A100 eight-GPU machine.
80
+
81
+ ## Model Weights
82
+ Below are CLIPA trained weights on LAION-400M with an A100 eight-GPU machine.
83
+ All models are pre-trained for 6 epochs with reduced input token lengths and subsequently fine-tuned for 0.36 epoch with full input token lengths.
84
+
85
+
86
+ | | Pre-trained Weights | zero-shot IN-1K |
87
+ |---------------------|:----------------------------------------------------------------------------------------------:|:---------------:|
88
+ | CLIPA-B/16(I50,T16) | [download](https://drive.google.com/file/d/1MDpz8gV2Vjaazk16rBhLxU8811U7_cGL/view?usp=sharing) | 59.7 |
89
+ | CLIPA-L/16(I17,T16) | [download](https://drive.google.com/file/d/1Tr2GYiKAaMH6EGIn5l7eX_1K20eaA3WA/view?usp=sharing) | 60.3 |
90
+ | CLIPA-L/16(I37,T8) | [download](https://drive.google.com/file/d/1EM1ChRNARpLckkJjf6m7njCY3xyvpGBu/view?usp=sharing) | 57.9 |
91
+
92
+ | | Fine-tuned Weights | zero-shot IN-1K |
93
+ |---------------------|:----------------------------------------------------------------------------------------------:|:-----:|
94
+ | CLIPA-B/16(I50,T16) | [download](https://drive.google.com/file/d/1fURK0K_a3-83jVEI4PVEbnEJb_V6UbGv/view?usp=sharing) | 63.2 |
95
+ | CLIPA-L/16(I17,T16) | [download](https://drive.google.com/file/d/18qqZGOTGOgb3I3JWONuat6qObsgLq7sR/view?usp=sharing) | 67.8 |
96
+ | CLIPA-L/16(I37,T8) | [download](https://drive.google.com/file/d/1lV7pLORUK04T9QKKx9TpYtMws-AZrib0/view?usp=sharing) | 69.3 |
97
+
98
+
99
+ ## CLIPA-v2
100
+ We also provide example scripts to reproduce our CLIPA-v2 H/14 results under path `docs/script_examples/clipav2`.
101
+ Note that the original results are obtained with [our JAX implementation](https://github.com/UCSC-VLAA/CLIPA/tree/master/clipa_jax).
102
+ These scripts are written after manually scanning the JAX config files.
103
+ As it is infeasible for us to retrain those models again with pytorch, their correctness cannot be verified with 100% confidence. Use them at your own discretion.
Finetuning/docs/clipa_acc_compute.png ADDED

Git LFS Details

  • SHA256: ca5ab83e5185c16424851801af0bfbe972e27e91dcb819605a8bd930109d07d0
  • Pointer size: 131 Bytes
  • Size of remote file: 262 kB
Finetuning/docs/clipa_reduce_image_token.png ADDED

Git LFS Details

  • SHA256: cd90f60c0acf49feb0731999c0c70234bc0b13b2d347e69174b5c532400f41d0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
Finetuning/docs/clipa_reduce_text_token.png ADDED
Finetuning/docs/datacomp_models.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## CommonPool and DataComp models
2
+
3
+ As part of [DataComp](https://github.com/mlfoundations/datacomp), we trained models on CommonPool using various data filtering strategies.
4
+ We release models for all four scales of the competition, small, medium, large and xlarge, corresponding to a pool size and number of samples seen of 12.8M, 128M, 1.28B and 12.8B, respectively.
5
+
6
+ The models are specified below, see our paper [DataComp: In search of the next generation of multimodal datasets](https://arxiv.org/abs/2304.14108) for more details.
7
+
8
+
9
+ ## xlarge scale models
10
+
11
+ * `datacomp_xl_s13b_b90k`: A ViT-L/14 trained on DataComp-1B for 12.8B steps and batch size 90k. Achieves 79.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K.
12
+
13
+ * `commonpool_xl_clip_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using CLIP scores, for 12.8B steps and batch size 90k. Achieves 76.4% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K.
14
+
15
+ * `commonpool_xl_laion_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using the LAION-2B filtering scheme, for 12.8B steps and batch size 90k. Achieves 75.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K.
16
+
17
+ * `commonpool_xl_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL without any filtering, for 12.8B steps and batch size 90k. Achieves 72.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K.
18
+
19
+
20
+ ## large scale models
21
+
22
+ * `datacomp_l_s1b_b8k`: A ViT-B/16 trained on a 140M subset of DataComp-1B, for 1.28B steps and batch size 8k. Achieves 63.1% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-DataComp.L-s1B-b8K.
23
+
24
+ * `commonpool_l_clip_s1b_b8k`: A ViT-B/16 trained on CommonPool-L filtered using CLIP scores, for 1.28B steps and batch size 8k. Achieves 57.8% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L.clip-s1B-b8K.
25
+
26
+ * `commonpool_l_laion_s1b_b8k`: A ViT-B/16 trained on CommonPool-L filtered using the LAION-2B filtering scheme, for 1.28B steps and batch size 8k. Achieves 55.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L.laion-s1B-b8K.
27
+
28
+ * `commonpool_l_image_s1b_b8k`: A ViT-B/16 trained on CommonPool-L filtered using image-based filtering, for 1.28B steps and batch size 8k. Achieves 57.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L.image-s1B-b8K.
29
+
30
+ * `commonpool_l_text_s1b_b8k`: A ViT-B/16 trained on CommonPool-L filtered using text-based filtering, for 1.28B steps and batch size 8k. Achieves 56.1% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L.text-s1B-b8K.
31
+
32
+ * `commonpool_l_basic_s1b_b8k`: A ViT-B/16 trained on CommonPool-L filtered using basic filtering (English filtering + caption length and image size filtering), for 1.28B steps and batch size 8k. Achieves 51.6% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L.basic-s1B-b8K.
33
+
34
+ * `commonpool_l_s1b_b8k`: A ViT-B/16 trained on CommonPool-L without any filtering, for 1.28B steps and batch size 8k. Achieves 45.9% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-16-CommonPool.L-s1B-b8K.
35
+
36
+
37
+ ## medium scale models
38
+
39
+ * `datacomp_m_s128m_b4k`: A ViT-B/32 trained on a 14M subset of DataComp-1B, for 128M steps and batch size 4k. Achieves 29.7% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-DataComp.M-s128M-b4K.
40
+
41
+ * `commonpool_m_clip_s128m_b4k`: A ViT-B/32 trained on CommonPool-M filtered using CLIP scores, for 128M steps and batch size 4k. Achieves 27.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M.clip-s128M-b4K.
42
+
43
+ * `commonpool_m_laion_s128m_b4k`: A ViT-B/32 trained on CommonPool-M filtered using the LAION-2B filtering scheme, for 128M steps and batch size 4k. Achieves 23.0% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M.laion-s128M-b4K.
44
+
45
+ * `commonpool_m_image_s128m_b4k`: A ViT-B/32 trained on CommonPool-M filtered using image-based filtering, for 128M steps and batch size 4k. Achieves 26.8% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M.image-s128M-b4K.
46
+
47
+ * `commonpool_m_text_s128m_b4k`: A ViT-B/32 trained on CommonPool-M filtered using text-based filtering, for 128M steps and batch size 4k. Achieves 25.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M.text-s128M-b4K.
48
+
49
+ * `commonpool_m_basic_s128m_b4k`: A ViT-B/32 trained on CommonPool-M filtered using basic filtering (English filtering + caption length and image size filtering), for 128M steps and batch size 4k. Achieves 22.6% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M.basic-s128M-b4K.
50
+
51
+ * `commonpool_m_s128m_b4k`: A ViT-B/32 trained on CommonPool-M without any filtering, for 128M steps and batch size 4k. Achieves 17.6% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.M-s128M-b4K.
52
+
53
+
54
+ ## small scale models
55
+
56
+ * `datacomp_s_s13m_b4k`: A ViT-B/32 trained on a 1.4M subset of DataComp-1B, for 12.8M steps and batch size 4k. Achieves 3.9% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-DataComp.S-s13M-b4K.
57
+
58
+ * `commonpool_s_clip_s13m_b4k`: A ViT-B/32 trained on CommonPool-S filtered using CLIP scores, for 12.8M steps and batch size 4k. Achieves 5.1% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S.clip-s13M-b4K.
59
+
60
+ * `commonpool_s_laion_s13m_b4k`: A ViT-B/32 trained on CommonPool-S filtered using the LAION-2B filtering scheme, for 12.8M steps and batch size 4k. Achieves 3.1% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S.laion-s13M-b4K.
61
+
62
+ * `commonpool_s_image_s13m_b4k`: A ViT-B/32 trained on CommonPool-S filtered using image-based filtering, for 12.8M steps and batch size 4k. Achieves 4.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S.image-s13M-b4K.
63
+
64
+ * `commonpool_s_text_s13m_b4k`: A ViT-B/32 trained on CommonPool-S filtered using text-based filtering, for 12.8M steps and batch size 4k. Achieves 4.6% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S.text-s13M-b4K.
65
+
66
+ * `commonpool_s_basic_s13m_b4k`: A ViT-B/32 trained on CommonPool-S filtered using basic filtering (English filtering + caption length and image size filtering), for 12.8M steps and batch size 4k. Achieves 3.0% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S.basic-s13M-b4K.
67
+
68
+ * `commonpool_s_s13m_b4k`: A ViT-B/32 trained on CommonPool-S without any filtering, for 12.8M steps and batch size 4k. Achieves 2.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-B-32-CommonPool.S-s13M-b4K.
69
+
Finetuning/docs/effective_robustness.png ADDED

Git LFS Details

  • SHA256: 51603ddf1b9407f3fa4e9186580d96acaa27b97aec0333b394a176fc435cd349
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
Finetuning/docs/inverse_scaling_law.png ADDED

Git LFS Details

  • SHA256: 9745415b7b257588ca627d6747ebd2c723b479674ba33412c0f7ac973a441826
  • Pointer size: 131 Bytes
  • Size of remote file: 597 kB
Finetuning/docs/laion2b_clip_zeroshot_b32.png ADDED

Git LFS Details

  • SHA256: b0c64dcee9996123102bb1449d6972b810fdaaabfd825f05ea781c22fc9fca3f
  • Pointer size: 131 Bytes
  • Size of remote file: 246 kB
Finetuning/docs/laion_clip_zeroshot.png ADDED

Git LFS Details

  • SHA256: 6e3df77ab8bcde02d0a2c21bb47524d2f62eee8deb28cf44624891dab7711cb9
  • Pointer size: 131 Bytes
  • Size of remote file: 195 kB
Finetuning/docs/laion_clip_zeroshot_b16.png ADDED

Git LFS Details

  • SHA256: d726159933713c3a27c727aec132824ec92ca3606d3dca6e6fa76ef9abbf2e17
  • Pointer size: 131 Bytes
  • Size of remote file: 196 kB
Finetuning/docs/laion_clip_zeroshot_b16_plus_240.png ADDED

Git LFS Details

  • SHA256: 5e93fd1cf851ffd2a1ae5e4080a6d107ec470152eb201e20c63f2ba59d2f391e
  • Pointer size: 131 Bytes
  • Size of remote file: 255 kB
Finetuning/docs/laion_clip_zeroshot_l14.png ADDED

Git LFS Details

  • SHA256: b4f73b022debf23c31f969859fe9a63c8b360342afadb52db0400e0b9a1d293d
  • Pointer size: 131 Bytes
  • Size of remote file: 204 kB
Finetuning/docs/laion_openai_compare_b32.jpg ADDED
Finetuning/docs/model_profile.csv ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,image_size,image_width,text_width,embed_dim,mparams,image_mparams,text_mparams,gflops,image_gflops,text_gflops
2
+ ViT-S-32-alt,224,384,256,256,43.22,22.59,20.63,3.56,2.29,1.27
3
+ ViT-S-32,224,384,384,384,63.09,22.64,40.44,5.66,2.29,3.38
4
+ ViT-M-32-alt,224,512,384,384,80.07,39.63,40.44,7.37,3.99,3.38
5
+ ViT-M-32,224,512,512,512,103.12,39.69,63.43,9.95,3.99,5.96
6
+ ViT-S-16-alt,224,384,256,256,42.4,21.76,20.63,10.47,9.2,1.27
7
+ ViT-S-16,224,384,384,384,62.26,21.81,40.44,12.58,9.2,3.38
8
+ ViT-B-32,224,768,512,512,151.28,87.85,63.43,14.78,8.82,5.96
9
+ ViT-B-32-quickgelu,224,768,512,512,151.28,87.85,63.43,14.78,8.82,5.96
10
+ convnext_tiny,224,768,512,1024,92.3,28.61,63.69,14.87,8.91,5.96
11
+ ViT-B-32-256,256,768,512,512,151.29,87.86,63.43,17.46,11.5,5.96
12
+ RN50,224,64,512,1024,102.01,38.32,63.69,18.18,12.22,5.96
13
+ RN50-quickgelu,224,64,512,1024,102.01,38.32,63.69,18.18,12.22,5.96
14
+ ViT-M-16-alt,224,512,384,384,78.98,38.53,40.44,19.36,15.98,3.38
15
+ ViT-M-16,224,512,512,512,102.02,38.59,63.43,21.94,15.98,5.96
16
+ vit_relpos_medium_patch16_cls_224,224,768,512,512,101.94,38.51,63.43,21.99,16.03,5.96
17
+ mt5-base-ViT-B-32,224,768,512,512,365.71,87.85,277.86,22.12,8.82,13.3
18
+ convnext_small,224,768,512,512,113.28,49.85,63.43,23.33,17.37,5.96
19
+ ViT-B-32-plus-256,256,896,640,640,210.3,119.13,91.16,24.83,15.56,9.27
20
+ RN101,224,64,512,512,119.69,56.26,63.43,25.5,19.54,5.96
21
+ RN101-quickgelu,224,64,512,512,119.69,56.26,63.43,25.5,19.54,5.96
22
+ vit_medium_patch16_gap_256,256,768,512,512,102.04,38.61,63.43,27.1,21.14,5.96
23
+ coca_ViT-B-32,224,768,512,512,253.56,89.16,63.43,33.34,9.19,5.96
24
+ convnext_base,224,768,512,512,151.52,88.09,63.43,36.67,30.71,5.96
25
+ swin_base_patch4_window7_224,224,768,640,640,178.56,87.4,91.16,40.13,30.86,9.27
26
+ ViT-B-16,224,768,512,512,149.62,86.19,63.43,41.09,35.13,5.96
27
+ ViT-B-16-quickgelu,224,768,512,512,149.62,86.19,63.43,41.09,35.13,5.96
28
+ EVA02-B-16,224,768,512,512,149.69,86.26,63.43,41.09,35.13,5.96
29
+ ViT-B-16-SigLIP,224,768,768,768,203.16,92.88,110.27,46.44,35.42,11.02
30
+ convnext_base_w,256,768,640,640,179.39,88.22,91.16,49.38,40.11,9.27
31
+ RN50x4,288,80,640,640,178.3,87.14,91.16,51.82,42.56,9.27
32
+ coca_roberta-ViT-B-32,224,768,768,512,420.37,87.85,124.45,53.12,8.82,13.12
33
+ ViT-B-16-plus,224,896,640,640,208.35,117.19,91.16,56.75,47.49,9.27
34
+ ViT-B-16-SigLIP-256,256,768,768,768,203.2,92.93,110.27,57.84,46.82,11.02
35
+ ViT-B-16-SigLIP-i18n-256,256,768,768,768,370.63,92.93,277.7,57.84,46.82,11.02
36
+ ViT-B-16-plus-240,240,896,640,640,208.38,117.21,91.16,64.03,54.76,9.27
37
+ convnext_base_w_320,320,768,640,640,179.39,88.22,91.16,71.94,62.67,9.27
38
+ convnext_large,224,768,768,768,321.06,197.41,123.65,82.02,68.72,13.3
39
+ coca_base,288,768,768,512,440.34,86.4,134.66,99.09,46.47,13.3
40
+ roberta-ViT-B-32,224,768,512,512,212.72,87.85,124.87,105.87,8.82,97.05
41
+ xlm-roberta-base-ViT-B-32,224,768,512,512,366.12,87.85,278.27,105.87,8.82,97.05
42
+ convnext_large_d,256,768,768,768,351.77,199.77,152.0,107.5,89.76,17.73
43
+ ViT-B-16-SigLIP-384,384,768,768,768,203.45,93.18,110.27,123.15,112.13,11.02
44
+ ViT-L-16,224,1024,768,768,427.74,304.09,123.65,136.41,123.11,13.3
45
+ convnext_large_d_320,320,768,768,768,351.77,199.77,152.0,157.98,140.25,17.73
46
+ RN50x16,384,96,768,768,290.98,167.33,123.65,162.69,149.39,13.3
47
+ ViT-L-14-CLIPA,224,1024,768,768,414.21,303.96,110.25,167.5,162.03,5.47
48
+ EVA02-L-14,224,768,768,768,427.76,304.11,123.65,175.3,162.0,13.3
49
+ ViT-L-14,224,1024,768,768,427.62,303.97,123.65,175.33,162.03,13.3
50
+ ViT-L-14-quickgelu,224,1024,768,768,427.62,303.97,123.65,175.33,162.03,13.3
51
+ convnext_xlarge,256,768,1024,1024,653.89,350.25,303.65,198.38,159.14,39.24
52
+ ViT-L-16-SigLIP-256,256,768,1024,1024,652.15,315.96,336.19,201.62,162.56,39.06
53
+ coca_ViT-L-14,224,1024,768,768,638.45,306.72,123.65,214.52,163.64,13.3
54
+ ViT-B-16-SigLIP-512,512,768,768,768,203.79,93.52,110.27,227.26,216.24,11.02
55
+ ViT-SO400M-14-SigLIP,224,768,1152,1152,877.36,427.68,449.68,233.54,220.35,13.19
56
+ ViT-L-14-280,280,1024,768,768,427.76,304.11,123.65,271.79,258.49,13.3
57
+ ViT-L-16-320,320,1024,768,768,427.95,304.3,123.65,271.93,258.63,13.3
58
+ ViT-H-16,224,1280,1024,1024,986.26,632.23,354.03,301.72,254.63,47.09
59
+ ViT-H-14-CLIPA,224,1280,1024,1024,968.24,632.07,336.16,354.02,334.59,19.43
60
+ nllb-clip-base,224,768,512,512,501.89,87.85,414.04,369.6,8.82,360.78
61
+ ViT-H-14,224,1280,1024,1024,986.11,632.08,354.03,381.68,334.59,47.09
62
+ ViT-H-14-quickgelu,224,1280,1024,1024,986.11,632.08,354.03,381.68,334.59,47.09
63
+ ViT-L-14-CLIPA-336,336,1024,768,768,414.54,304.29,110.25,387.39,381.92,5.47
64
+ EVA02-L-14-336,336,768,768,768,428.08,304.43,123.65,395.16,381.86,13.3
65
+ ViT-L-14-336,336,1024,768,768,427.94,304.29,123.65,395.22,381.92,13.3
66
+ ViT-L-16-SigLIP-384,384,768,1024,1024,652.48,316.28,336.19,422.91,383.85,39.06
67
+ convnext_xxlarge,256,768,1024,1024,1200.58,846.54,354.03,443.03,395.94,47.09
68
+ nllb-clip-base-siglip,384,768,512,768,507.47,93.18,414.3,472.91,112.13,360.78
69
+ mt5-xl-ViT-H-14,224,1280,512,1024,2306.75,632.08,1674.68,514.04,334.59,179.45
70
+ EVA01-g-14,224,768,768,1024,1136.44,1012.59,123.85,547.36,534.06,13.3
71
+ RN50x64,448,128,1024,1024,623.26,420.38,202.88,552.65,529.11,23.55
72
+ EVA01-g-14-plus,224,768,1024,1024,1366.62,1012.59,354.03,581.15,534.06,47.09
73
+ ViT-g-14,224,1408,1024,1024,1366.68,1012.65,354.03,581.15,534.06,47.09
74
+ convnext_xxlarge_320,320,768,1024,1024,1200.58,846.54,354.03,665.74,618.65,47.09
75
+ xlm-roberta-large-ViT-H-14,224,1280,512,1024,1193.01,632.08,560.94,671.01,334.59,336.42
76
+ ViT-SO400M-14-SigLIP-384,384,768,1152,1152,877.96,428.23,449.73,723.48,670.35,53.13
77
+ ViT-H-14-CLIPA-336,336,1280,1024,1024,968.64,632.48,336.16,800.88,781.45,19.43
78
+ ViT-bigG-14-CLIPA,224,1664,1280,1280,2517.22,1844.9,672.32,1007.93,967.5,40.44
79
+ ViT-H-14-378-quickgelu,378,1280,1024,1024,986.71,632.68,354.03,1054.05,1006.96,47.09
80
+ ViT-bigG-14,224,1664,1280,1280,2539.57,1844.91,694.66,1065.36,967.5,97.86
81
+ nllb-clip-large,224,1280,512,1024,1399.22,632.08,767.14,1468.46,334.59,1133.87
82
+ nllb-clip-large-siglip,384,768,512,1152,1195.5,428.23,767.27,1804.22,670.35,1133.87
83
+ ViT-e-14,224,1792,1280,1280,4581.09,3807.72,773.37,2091.45,1981.35,110.1
84
+ ViT-bigG-14-CLIPA-336,336,1664,1280,1280,2517.76,1845.44,672.32,2271.58,2231.15,40.44
85
+ EVA02-E-14,224,768,1024,1024,4704.59,4350.56,354.03,2311.42,2264.33,47.09
86
+ EVA02-E-14-plus,224,768,1280,1024,5044.89,4350.56,694.33,2362.19,2264.33,97.86
Finetuning/docs/openclip_classification_results.csv ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,pretrained,params (M),FLOPs (B),Average perf. on 35 datasets,ImageNet 1k,Caltech-101,CIFAR-10,CIFAR-100,CLEVR Counts,CLEVR Distance,Country211,Describable Textures,EuroSAT,FGVC Aircraft,Food-101,GTSRB,ImageNet Sketch,ImageNet v2,ImageNet-A,ImageNet-O,ImageNet-R,KITTI Vehicle Distance,MNIST,ObjectNet,Oxford Flowers-102,Oxford-IIIT Pet,Pascal VOC 2007,PatchCamelyon,Rendered SST2,RESISC45,Stanford Cars,STL-10,SUN397,SVHN,iWildCam,Camelyon17,FMoW,Dollar Street,GeoDE
2
+ ViT-H-14-378-quickgelu,dfn5b,986.71,1054.05,0.7090,0.8437,0.9517,0.9880,0.9043,0.3596,0.2085,0.3787,0.7106,0.6133,0.7219,0.9623,0.6782,0.7324,0.7833,0.7964,0.3810,0.9376,0.3966,0.8364,0.7340,0.8935,0.9696,0.8241,0.6964,0.5546,0.7589,0.9598,0.9906,0.7733,0.6739,0.2205,0.7211,0.2075,0.7173,0.9349
3
+ EVA02-E-14-plus,laion2b_s9b_b144k,5044.89,2362.19,0.6980,0.8201,0.9535,0.9934,0.9316,0.2991,0.1998,0.3564,0.6777,0.7574,0.5360,0.9496,0.6740,0.7162,0.7564,0.8223,0.3540,0.9456,0.1842,0.7463,0.7937,0.8433,0.9567,0.8569,0.6442,0.6271,0.7490,0.9457,0.9926,0.7510,0.7560,0.2591,0.6948,0.2668,0.6951,0.9244
4
+ ViT-H-14-quickgelu,dfn5b,986.11,381.68,0.6972,0.8344,0.9552,0.9878,0.9051,0.2967,0.2117,0.3442,0.7064,0.6546,0.7147,0.9568,0.6772,0.7274,0.7736,0.6987,0.3810,0.9296,0.3347,0.8579,0.6813,0.8995,0.9658,0.8184,0.6539,0.5464,0.7508,0.9580,0.9890,0.7691,0.6764,0.2025,0.7050,0.2079,0.7009,0.9286
5
+ ViT-SO400M-14-SigLIP-384,webli,877.96,723.48,0.6916,0.8308,0.9599,0.9672,0.8357,0.4071,0.2246,0.3645,0.7303,0.6354,0.6069,0.9635,0.6429,0.7454,0.7717,0.8247,0.2775,0.9575,0.2082,0.8862,0.7695,0.9114,0.9680,0.7171,0.5268,0.7002,0.7211,0.9521,0.9930,0.7541,0.5151,0.2294,0.6149,0.3309,0.7301,0.9328
6
+ ViT-bigG-14-CLIPA-336,datacomp1b,2517.76,2271.58,0.6888,0.8309,0.9529,0.9904,0.9123,0.1399,0.2161,0.4094,0.7293,0.6457,0.5561,0.9623,0.6407,0.7454,0.7726,0.8599,0.3130,0.9535,0.2630,0.8533,0.7966,0.8694,0.9562,0.8162,0.5411,0.6420,0.7257,0.9542,0.9956,0.7645,0.6691,0.2383,0.5874,0.1766,0.6869,0.9407
7
+ ViT-bigG-14-CLIPA,datacomp1b,2517.22,1007.93,0.6871,0.8270,0.9513,0.9912,0.9135,0.1357,0.2113,0.3921,0.7207,0.6861,0.5576,0.9583,0.6460,0.7431,0.7699,0.8179,0.3075,0.9512,0.2743,0.8544,0.7694,0.8693,0.9576,0.8188,0.5345,0.6332,0.7137,0.9560,0.9965,0.7642,0.6811,0.2269,0.5955,0.1959,0.6869,0.9382
8
+ ViT-SO400M-14-SigLIP,webli,877.36,233.54,0.6819,0.8203,0.9600,0.9679,0.8417,0.4210,0.2213,0.3243,0.7106,0.6274,0.6029,0.9556,0.6382,0.7402,0.7607,0.7185,0.2960,0.9506,0.2489,0.8929,0.7060,0.8982,0.9522,0.7034,0.5057,0.6936,0.7257,0.9032,0.9939,0.7436,0.5670,0.1915,0.6215,0.3163,0.7173,0.9278
9
+ EVA02-E-14,laion2b_s4b_b115k,4704.59,2311.42,0.6725,0.8196,0.9541,0.9925,0.9258,0.1632,0.2499,0.3482,0.6878,0.7446,0.4892,0.9523,0.6729,0.7151,0.7566,0.8044,0.3340,0.9407,0.1294,0.7581,0.7674,0.8210,0.9569,0.8136,0.4972,0.5859,0.7324,0.9438,0.9926,0.7658,0.6381,0.2289,0.4894,0.2801,0.6682,0.9182
10
+ ViT-H-14-CLIPA-336,datacomp1b,968.64,800.88,0.6713,0.8180,0.9467,0.9890,0.8968,0.1326,0.2254,0.3551,0.7197,0.6604,0.4718,0.9572,0.5816,0.7282,0.7562,0.8275,0.3115,0.9438,0.2574,0.8245,0.7742,0.8463,0.9573,0.8134,0.4979,0.6052,0.7114,0.9483,0.9955,0.7635,0.6599,0.2239,0.4357,0.2500,0.6822,0.9278
11
+ ViT-L-14-quickgelu,dfn2b,427.62,175.33,0.6703,0.8141,0.9532,0.9836,0.8837,0.3325,0.2481,0.2823,0.6606,0.6493,0.3936,0.9457,0.6168,0.6832,0.7461,0.6677,0.3930,0.9000,0.2011,0.8470,0.7397,0.8654,0.9555,0.8162,0.6318,0.5502,0.7327,0.9470,0.9768,0.7546,0.6525,0.1883,0.6237,0.2237,0.6916,0.9111
12
+ ViT-bigG-14,laion2b_s39b_b160k,2539.57,1065.36,0.6694,0.8009,0.9484,0.9824,0.8752,0.2989,0.2002,0.3379,0.6867,0.6919,0.4953,0.9309,0.6244,0.6894,0.7359,0.6933,0.3785,0.9213,0.1308,0.7157,0.7284,0.8163,0.9529,0.8077,0.6364,0.6535,0.7235,0.9460,0.9850,0.7450,0.6961,0.1760,0.5905,0.2352,0.6857,0.9127
13
+ ViT-H-14-CLIPA,datacomp1b,968.24,354.02,0.6688,0.8152,0.9458,0.9888,0.8991,0.1513,0.2255,0.3401,0.7090,0.7146,0.4751,0.9554,0.5538,0.7272,0.7498,0.7701,0.3135,0.9426,0.2461,0.8189,0.7423,0.8437,0.9559,0.8170,0.4958,0.6189,0.7098,0.9458,0.9948,0.7608,0.6622,0.2160,0.4415,0.2684,0.6694,0.9236
14
+ ViT-H-14-quickgelu,metaclip_fullcc,986.11,381.68,0.6684,0.8051,0.9536,0.9804,0.8634,0.2115,0.1881,0.3716,0.7271,0.6450,0.5114,0.9423,0.6257,0.7052,0.7417,0.7533,0.3040,0.9342,0.2771,0.7266,0.7642,0.8448,0.9561,0.7495,0.6222,0.6925,0.7024,0.8990,0.9944,0.7440,0.5910,0.1680,0.5782,0.2314,0.6811,0.9077
15
+ ViT-L-14,datacomp_xl_s13b_b90k,427.62,175.33,0.6674,0.7921,0.9465,0.9824,0.8736,0.3555,0.2443,0.3157,0.6649,0.7124,0.4750,0.9452,0.5853,0.6795,0.7205,0.6959,0.3255,0.9083,0.2785,0.8661,0.7425,0.8262,0.9506,0.8247,0.5118,0.6101,0.6941,0.9305,0.9925,0.7427,0.6769,0.1614,0.5089,0.2403,0.6624,0.9152
16
+ ViT-L-16-SigLIP-384,webli,652.48,422.91,0.6668,0.8207,0.9611,0.9605,0.8188,0.3275,0.2077,0.2470,0.7080,0.5817,0.5312,0.9564,0.6385,0.7360,0.7593,0.7663,0.3130,0.9507,0.2222,0.8525,0.7284,0.8934,0.9681,0.7172,0.5466,0.5634,0.6789,0.9493,0.9924,0.7250,0.5672,0.2236,0.6637,0.1489,0.6916,0.9207
17
+ EVA01-g-14-plus,merged2b_s11b_b114k,1366.62,581.15,0.6645,0.7933,0.9506,0.9910,0.9008,0.2302,0.2293,0.3087,0.6734,0.7280,0.3947,0.9366,0.6644,0.6814,0.7214,0.7416,0.3415,0.9246,0.1491,0.7176,0.7491,0.7959,0.9490,0.8285,0.6244,0.5854,0.7079,0.9073,0.9949,0.7426,0.5951,0.1882,0.7100,0.2283,0.6589,0.9148
18
+ ViT-L-14-CLIPA-336,datacomp1b,414.54,387.39,0.6609,0.8026,0.9439,0.9864,0.8826,0.1566,0.2439,0.3066,0.6856,0.5811,0.4281,0.9456,0.5695,0.7087,0.7346,0.7771,0.3290,0.9329,0.1997,0.7667,0.7317,0.8100,0.9495,0.7979,0.6028,0.5316,0.6884,0.9407,0.9929,0.7560,0.6290,0.1937,0.6783,0.2500,0.6752,0.9240
19
+ ViT-L-14-quickgelu,metaclip_fullcc,427.62,175.33,0.6607,0.7917,0.9527,0.9759,0.8410,0.3107,0.2260,0.3394,0.6862,0.5894,0.4537,0.9352,0.5623,0.6896,0.7256,0.7231,0.3010,0.9205,0.2785,0.6444,0.7457,0.8143,0.9461,0.8030,0.6197,0.6678,0.7360,0.8868,0.9933,0.7355,0.4681,0.1581,0.7551,0.2592,0.6752,0.9140
20
+ EVA02-L-14-336,merged2b_s6b_b61k,428.08,395.16,0.6603,0.8039,0.9525,0.9892,0.8980,0.3635,0.2485,0.3354,0.6473,0.7139,0.3758,0.9421,0.5759,0.6891,0.7380,0.8289,0.2850,0.9324,0.2377,0.6421,0.7789,0.7645,0.9424,0.8267,0.5487,0.6463,0.6910,0.9158,0.9966,0.7480,0.4575,0.2105,0.5691,0.2198,0.6811,0.9136
21
+ ViT-L-14-CLIPA,datacomp1b,414.21,167.5,0.6577,0.7957,0.9453,0.9866,0.8850,0.1857,0.2449,0.2941,0.6963,0.6044,0.4299,0.9415,0.5906,0.7061,0.7305,0.7125,0.3370,0.9288,0.1927,0.7374,0.6988,0.8101,0.9497,0.8067,0.5915,0.5387,0.6843,0.9366,0.9919,0.7528,0.6390,0.1724,0.6760,0.2457,0.6647,0.9152
22
+ ViT-L-14,commonpool_xl_clip_s13b_b90k,427.62,175.33,0.6553,0.7637,0.9502,0.9797,0.8615,0.2547,0.2451,0.2984,0.6521,0.6681,0.3860,0.9355,0.5980,0.6538,0.6953,0.6197,0.3525,0.8924,0.2982,0.9040,0.7165,0.8006,0.9424,0.8336,0.5688,0.6178,0.6978,0.9352,0.9875,0.7351,0.6853,0.1439,0.5100,0.1705,0.6776,0.9056
23
+ convnext_xxlarge,laion2b_s34b_b82k_augreg_soup,1200.58,443.03,0.6545,0.7947,0.9448,0.9822,0.8687,0.1454,0.2365,0.3170,0.7053,0.6128,0.4434,0.9321,0.5508,0.6840,0.7260,0.6719,0.4060,0.9160,0.2363,0.8277,0.7273,0.8241,0.9445,0.8090,0.5142,0.6952,0.7190,0.9409,0.9810,0.7458,0.6254,0.1730,0.6071,0.0000,0.6764,0.9215
24
+ ViT-L-16-SigLIP-256,webli,652.15,201.62,0.6534,0.8045,0.9593,0.9619,0.8191,0.4065,0.2150,0.2141,0.7027,0.5598,0.5259,0.9463,0.6115,0.7209,0.7376,0.6213,0.3265,0.9396,0.1983,0.8499,0.6526,0.8827,0.9604,0.7409,0.5458,0.6172,0.6817,0.9386,0.9911,0.7253,0.5211,0.1796,0.5757,0.1296,0.6904,0.9173
25
+ convnext_xxlarge,laion2b_s34b_b82k_augreg_rewind,1200.58,443.03,0.6534,0.7931,0.9452,0.9823,0.8686,0.1651,0.2534,0.3155,0.7016,0.6331,0.4398,0.9308,0.5491,0.6825,0.7228,0.6657,0.3975,0.9139,0.2419,0.7930,0.7252,0.8241,0.9438,0.8100,0.5014,0.6897,0.7168,0.9406,0.9801,0.7459,0.6137,0.1735,0.6071,0.0000,0.6799,0.9228
26
+ xlm-roberta-large-ViT-H-14,frozen_laion5b_s13b_b90k,1193.01,671.01,0.6519,0.7695,0.9422,0.9718,0.8430,0.3358,0.2050,0.3172,0.6926,0.6793,0.4673,0.9236,0.6239,0.6581,0.6944,0.5935,0.3390,0.8940,0.1364,0.7804,0.6911,0.7532,0.9431,0.7995,0.5792,0.6436,0.6825,0.9362,0.9889,0.7551,0.5950,0.1392,0.6749,0.2098,0.6460,0.9111
27
+ EVA02-L-14,merged2b_s4b_b131k,427.76,175.3,0.6502,0.7977,0.9512,0.9908,0.9071,0.3176,0.2462,0.3091,0.6319,0.6994,0.3638,0.9340,0.5718,0.6813,0.7295,0.7619,0.2880,0.9272,0.2518,0.6729,0.7489,0.7631,0.9398,0.8220,0.5431,0.6150,0.6968,0.9055,0.9961,0.7410,0.4793,0.1886,0.5124,0.2017,0.6624,0.9073
28
+ convnext_xxlarge,laion2b_s34b_b82k_augreg,1200.58,443.03,0.6494,0.7907,0.9429,0.9816,0.8677,0.1399,0.1195,0.3127,0.7096,0.6030,0.4250,0.9295,0.5454,0.6806,0.7223,0.6692,0.4025,0.9131,0.2616,0.8687,0.7235,0.8091,0.9455,0.8116,0.5340,0.6782,0.7100,0.9399,0.9824,0.7436,0.6379,0.1616,0.5719,0.0000,0.6729,0.9228
29
+ ViT-g-14,laion2b_s34b_b88k,1366.68,581.15,0.6454,0.7847,0.9452,0.9815,0.8465,0.3768,0.1870,0.3091,0.6856,0.6530,0.4441,0.9241,0.4964,0.6754,0.7158,0.6092,0.3705,0.9020,0.2700,0.7191,0.6908,0.8010,0.9379,0.8166,0.5384,0.5678,0.6960,0.9394,0.9893,0.7411,0.5611,0.1524,0.4771,0.2090,0.6671,0.9090
30
+ ViT-H-14,laion2b_s32b_b79k,986.11,381.68,0.6442,0.7796,0.9421,0.9745,0.8473,0.2676,0.2358,0.2986,0.6782,0.7278,0.4265,0.9273,0.5832,0.6657,0.7090,0.5935,0.3825,0.8934,0.1097,0.7284,0.6941,0.7982,0.9438,0.7768,0.5430,0.6392,0.6995,0.9338,0.9848,0.7521,0.5252,0.1528,0.5638,0.2264,0.6343,0.9086
31
+ ViT-H-14-CLIPA-336,laion2b,968.64,800.88,0.6440,0.7910,0.9438,0.9826,0.8643,0.1835,0.2158,0.3111,0.7160,0.6393,0.3437,0.9303,0.5007,0.6994,0.7241,0.7213,0.3655,0.9269,0.1561,0.6365,0.7022,0.8009,0.9444,0.7723,0.5787,0.6178,0.7029,0.9476,0.9894,0.7567,0.6255,0.1853,0.5001,0.1666,0.6706,0.9257
32
+ ViT-B-16-SigLIP-512,webli,203.79,227.26,0.6434,0.7914,0.9516,0.9265,0.7146,0.2411,0.2226,0.1927,0.6793,0.4007,0.4521,0.9394,0.5171,0.6990,0.7283,0.6769,0.3615,0.9264,0.3924,0.8288,0.6764,0.8677,0.9499,0.7139,0.6615,0.5722,0.6538,0.9249,0.9853,0.7152,0.5444,0.1925,0.6606,0.1411,0.6928,0.9244
33
+ convnext_large_d_320,laion2b_s29b_b131k_ft_soup,351.77,157.98,0.6401,0.7685,0.9348,0.9659,0.8304,0.4293,0.2010,0.2654,0.6830,0.7161,0.3621,0.9162,0.5822,0.6504,0.6944,0.6044,0.4410,0.8862,0.1027,0.7434,0.6898,0.7755,0.9358,0.8129,0.4814,0.5585,0.7078,0.9369,0.9856,0.7376,0.6712,0.1786,0.4088,0.1901,0.6449,0.9094
34
+ EVA01-g-14,laion400m_s11b_b41k,1136.44,547.36,0.6390,0.7852,0.9477,0.9829,0.8865,0.1966,0.2467,0.2862,0.6144,0.7237,0.3226,0.9345,0.4913,0.6730,0.7152,0.7359,0.3285,0.9250,0.2405,0.6218,0.7200,0.7427,0.9414,0.8325,0.4987,0.5832,0.6976,0.9171,0.9889,0.7416,0.5889,0.1975,0.4999,0.1859,0.6741,0.8969
35
+ ViT-L-14,commonpool_xl_laion_s13b_b90k,427.62,175.33,0.6374,0.7545,0.9352,0.9796,0.8585,0.3819,0.2489,0.2503,0.6191,0.7378,0.2869,0.9200,0.6018,0.6352,0.6851,0.5747,0.3730,0.8708,0.1378,0.7740,0.6846,0.7435,0.9308,0.8107,0.5069,0.5986,0.7065,0.8912,0.9903,0.7327,0.5730,0.1421,0.5671,0.2337,0.6600,0.9115
36
+ convnext_large_d_320,laion2b_s29b_b131k_ft,351.77,157.98,0.6358,0.7660,0.9341,0.9647,0.8313,0.3688,0.1999,0.2673,0.6846,0.7131,0.3770,0.9160,0.5688,0.6472,0.6929,0.5933,0.4400,0.8823,0.1027,0.7695,0.6813,0.7696,0.9346,0.8002,0.4576,0.5623,0.6989,0.9348,0.9854,0.7355,0.6496,0.1664,0.4342,0.1782,0.6355,0.9090
37
+ ViT-B-16-SigLIP-384,webli,203.45,123.15,0.6349,0.7849,0.9507,0.9276,0.7147,0.2195,0.2239,0.1858,0.6718,0.4307,0.4522,0.9362,0.5196,0.6955,0.7211,0.6233,0.3640,0.9214,0.3333,0.8088,0.6343,0.8624,0.9515,0.7162,0.7010,0.5607,0.6579,0.9245,0.9863,0.7096,0.5285,0.1719,0.5931,0.1365,0.6846,0.9194
38
+ ViT-L-14-336,openai,427.94,395.22,0.6348,0.7656,0.9225,0.9493,0.7436,0.2003,0.1895,0.3445,0.5559,0.6144,0.3346,0.9386,0.5239,0.6100,0.7089,0.7748,0.3265,0.8905,0.2616,0.7916,0.7183,0.7852,0.9369,0.7815,0.6073,0.7057,0.6379,0.7932,0.9943,0.6865,0.5560,0.1490,0.6456,0.2325,0.6390,0.9015
39
+ coca_ViT-L-14,laion2b_s13b_b90k,638.45,214.52,0.6346,0.7561,0.9430,0.9722,0.8318,0.3781,0.2446,0.2551,0.6239,0.6752,0.3590,0.9038,0.5624,0.6453,0.6798,0.5336,0.3540,0.8812,0.1899,0.7790,0.6405,0.7643,0.9402,0.8096,0.5500,0.6634,0.6878,0.9276,0.9894,0.7406,0.6237,0.1375,0.4268,0.1932,0.6542,0.8960
40
+ ViT-g-14,laion2b_s12b_b42k,1366.68,581.15,0.6312,0.7663,0.9415,0.9706,0.8392,0.3317,0.2225,0.2878,0.6824,0.6469,0.3768,0.9155,0.4985,0.6516,0.6956,0.5716,0.3785,0.8869,0.1350,0.6840,0.6761,0.7800,0.9431,0.8108,0.5624,0.6425,0.7176,0.9292,0.9865,0.7541,0.3930,0.1486,0.4948,0.2040,0.6542,0.9132
41
+ convnext_large_d,laion2b_s26b_b102k_augreg,351.77,107.5,0.6303,0.7591,0.9365,0.9655,0.8309,0.3461,0.1997,0.2525,0.6739,0.6959,0.3610,0.9055,0.5299,0.6430,0.6826,0.5352,0.4425,0.8767,0.1027,0.8063,0.6618,0.7667,0.9282,0.7891,0.5309,0.5612,0.6768,0.9316,0.9829,0.7307,0.6812,0.1549,0.3964,0.1793,0.6402,0.9019
42
+ ViT-L-14-quickgelu,metaclip_400m,427.62,175.33,0.6264,0.7620,0.9464,0.9544,0.7727,0.2271,0.2514,0.3085,0.6245,0.6033,0.3983,0.9073,0.4755,0.6505,0.6977,0.6640,0.2895,0.8889,0.2419,0.6186,0.6923,0.7648,0.9381,0.7440,0.7039,0.6551,0.6848,0.8477,0.9928,0.7073,0.3239,0.1408,0.6916,0.1874,0.6741,0.8931
43
+ ViT-L-14,commonpool_xl_s13b_b90k,427.62,175.33,0.6251,0.7229,0.9327,0.9801,0.8410,0.1985,0.2461,0.2962,0.6202,0.6889,0.1957,0.9107,0.5467,0.6118,0.6511,0.5625,0.2855,0.8594,0.3390,0.9084,0.7022,0.6966,0.9060,0.8076,0.5248,0.5953,0.5756,0.8939,0.9890,0.7103,0.6589,0.1229,0.5246,0.1948,0.6811,0.8990
44
+ ViT-L-14,openai,427.62,175.33,0.6237,0.7554,0.9249,0.9559,0.7582,0.1943,0.2021,0.3187,0.5537,0.6263,0.3181,0.9305,0.5055,0.5959,0.6983,0.7075,0.3235,0.8784,0.2180,0.7634,0.6889,0.7923,0.9323,0.7828,0.5204,0.6881,0.6337,0.7788,0.9936,0.6756,0.5840,0.1211,0.6741,0.2229,0.6297,0.8839
45
+ ViT-L-14,laion2b_s32b_b82k,427.62,175.33,0.6219,0.7525,0.9388,0.9662,0.8332,0.3123,0.2234,0.2631,0.6293,0.6459,0.3652,0.9100,0.5618,0.6328,0.6780,0.5385,0.3870,0.8742,0.2293,0.5410,0.6529,0.7479,0.9309,0.8053,0.5641,0.5925,0.6687,0.9263,0.9885,0.7434,0.4087,0.1257,0.5972,0.2007,0.6402,0.8919
46
+ ViT-B-16-SigLIP,webli,203.16,46.44,0.6206,0.7604,0.9518,0.9234,0.7223,0.2373,0.2409,0.1594,0.6468,0.4428,0.4377,0.9162,0.5164,0.6792,0.6893,0.4541,0.3815,0.9030,0.4093,0.8354,0.5509,0.8549,0.9420,0.7212,0.5953,0.5244,0.6454,0.9081,0.9821,0.7001,0.5586,0.1309,0.6045,0.1265,0.6589,0.9106
47
+ ViT-B-16-SigLIP-256,webli,203.2,57.84,0.6196,0.7653,0.9496,0.9334,0.7327,0.2276,0.2340,0.1581,0.6574,0.4606,0.4473,0.9200,0.4940,0.6810,0.6920,0.4877,0.3785,0.9076,0.3685,0.8457,0.5723,0.8521,0.9424,0.7254,0.5657,0.5739,0.6440,0.9106,0.9818,0.7026,0.5399,0.1493,0.4966,0.1253,0.6589,0.9061
48
+ ViT-B-16,datacomp_xl_s13b_b90k,149.62,41.09,0.6178,0.7349,0.9380,0.9624,0.8212,0.3267,0.2461,0.2215,0.5793,0.5883,0.2970,0.9047,0.5523,0.6044,0.6598,0.4840,0.4285,0.8362,0.2883,0.7649,0.6350,0.7701,0.9254,0.8178,0.6002,0.5162,0.6535,0.8883,0.9811,0.7051,0.6272,0.1181,0.4799,0.1504,0.6168,0.8990
49
+ ViT-B-32-256,datacomp_s34b_b86k,151.29,17.46,0.6133,0.7281,0.9348,0.9653,0.8287,0.2489,0.2271,0.1968,0.6064,0.6469,0.3645,0.8909,0.5152,0.6065,0.6481,0.3757,0.4635,0.8344,0.2658,0.7939,0.5960,0.7822,0.9115,0.7880,0.5880,0.5294,0.6505,0.8990,0.9731,0.7021,0.6708,0.0910,0.6252,0.0000,0.6238,0.8923
50
+ coca_ViT-L-14,mscoco_finetuned_laion2b_s13b_b90k,638.45,214.52,0.6128,0.7204,0.9420,0.9630,0.7965,0.3765,0.2501,0.1800,0.6213,0.5867,0.2329,0.8436,0.5453,0.6114,0.6475,0.4548,0.3865,0.8574,0.3797,0.8292,0.6253,0.7074,0.9115,0.8106,0.4943,0.6107,0.6267,0.8865,0.9861,0.7398,0.5564,0.1303,0.4294,0.1678,0.6636,0.8772
51
+ RN50x64,openai,623.26,552.65,0.6111,0.7391,0.9026,0.8510,0.5985,0.2254,0.1994,0.2981,0.5314,0.5765,0.3103,0.9205,0.4792,0.5593,0.6706,0.7077,0.3830,0.8441,0.3094,0.8583,0.6820,0.7745,0.9360,0.7398,0.5387,0.7106,0.6265,0.7581,0.9829,0.6661,0.6044,0.1469,0.5280,0.1939,0.6472,0.8898
52
+ ViT-B-16,dfn2b,149.62,41.09,0.6090,0.7624,0.9429,0.9672,0.8349,0.2327,0.2453,0.1955,0.5755,0.5402,0.2473,0.9130,0.4701,0.6204,0.6818,0.4820,0.4925,0.8310,0.1927,0.7814,0.6319,0.8201,0.9372,0.7884,0.5214,0.4876,0.6137,0.9073,0.9753,0.7143,0.5985,0.1554,0.4993,0.1415,0.6250,0.8910
53
+ ViT-B-16-quickgelu,metaclip_fullcc,149.62,41.09,0.6042,0.7212,0.9328,0.9572,0.7891,0.2935,0.2260,0.2271,0.6223,0.5265,0.3059,0.8882,0.4659,0.6016,0.6505,0.4953,0.4150,0.8423,0.1871,0.6610,0.6138,0.7358,0.9175,0.7818,0.5915,0.5898,0.6744,0.8302,0.9841,0.6879,0.3909,0.1227,0.6993,0.1932,0.6402,0.8868
54
+ ViT-B-16-SigLIP-i18n-256,webli,370.63,57.84,0.6037,0.7513,0.9475,0.9118,0.7216,0.2552,0.1976,0.1593,0.6426,0.3826,0.3325,0.9171,0.5276,0.6588,0.6814,0.4585,0.3685,0.8920,0.3826,0.8301,0.5977,0.8387,0.9387,0.7536,0.5381,0.5700,0.5737,0.8926,0.9764,0.6978,0.4272,0.1451,0.4899,0.1064,0.6472,0.9186
55
+ ViT-L-14,laion400m_e32,427.62,175.33,0.5998,0.7277,0.9266,0.9464,0.7741,0.2421,0.2452,0.2302,0.6053,0.6233,0.2490,0.9007,0.4989,0.5964,0.6545,0.4647,0.4190,0.8467,0.1997,0.7612,0.5969,0.7306,0.9170,0.7561,0.4968,0.5601,0.6741,0.8962,0.9808,0.7258,0.4955,0.1254,0.4555,0.1708,0.6168,0.8839
56
+ ViT-L-14,laion400m_e31,427.62,175.33,0.5991,0.7271,0.9259,0.9465,0.7738,0.2420,0.2452,0.2290,0.5973,0.6322,0.2462,0.9002,0.4965,0.5944,0.6547,0.4596,0.4225,0.8466,0.1997,0.7668,0.5962,0.7323,0.9154,0.7585,0.4877,0.5651,0.6710,0.8964,0.9804,0.7247,0.4956,0.1239,0.4595,0.1651,0.6075,0.8831
57
+ EVA02-B-16,merged2b_s8b_b131k,149.69,41.09,0.5891,0.7472,0.9302,0.9846,0.8773,0.2125,0.2254,0.2136,0.5282,0.6635,0.2506,0.8943,0.4630,0.5771,0.6701,0.5396,0.3410,0.8244,0.2208,0.4729,0.6214,0.7245,0.9211,0.8019,0.5091,0.5415,0.6037,0.7855,0.9949,0.7064,0.2497,0.1515,0.7095,0.1724,0.6086,0.8810
58
+ convnext_base_w_320,laion_aesthetic_s13b_b82k_augreg,179.39,71.94,0.5874,0.7128,0.9255,0.8823,0.6515,0.2825,0.2225,0.2243,0.6074,0.5124,0.2632,0.8947,0.4365,0.5646,0.6362,0.4157,0.5075,0.8136,0.2180,0.7219,0.5237,0.7524,0.9239,0.7530,0.5696,0.5508,0.6421,0.8918,0.9755,0.7037,0.4443,0.1392,0.5502,0.1215,0.6297,0.8935
59
+ ViT-B-16,laion2b_s34b_b88k,149.62,41.09,0.5869,0.7023,0.9287,0.9494,0.7684,0.2149,0.2455,0.2029,0.5633,0.5346,0.2695,0.8663,0.4826,0.5608,0.6228,0.3823,0.4625,0.8061,0.1730,0.6577,0.5598,0.7084,0.9048,0.7886,0.5639,0.5969,0.6275,0.8848,0.9786,0.7085,0.5002,0.1217,0.6249,0.1211,0.5841,0.8735
60
+ convnext_base_w,laion2b_s13b_b82k_augreg,179.39,49.38,0.5833,0.7147,0.9258,0.9561,0.8021,0.3307,0.2450,0.2016,0.6144,0.4828,0.2235,0.8675,0.4654,0.5890,0.6329,0.3817,0.5110,0.8253,0.2068,0.6441,0.5732,0.7017,0.9191,0.7979,0.4823,0.5925,0.6056,0.9126,0.9705,0.7113,0.5376,0.1285,0.3801,0.0000,0.5935,0.8881
61
+ ViT-B-32,datacomp_xl_s13b_b90k,151.28,14.78,0.5831,0.6917,0.9230,0.9561,0.8031,0.1294,0.2423,0.1756,0.5713,0.5746,0.2463,0.8632,0.5185,0.5676,0.6075,0.3035,0.4975,0.7818,0.1632,0.8124,0.5510,0.7353,0.9002,0.8151,0.5284,0.4849,0.6343,0.8728,0.9654,0.6780,0.6240,0.0863,0.6656,0.0000,0.5643,0.8731
62
+ ViT-B-16-quickgelu,metaclip_400m,149.62,41.09,0.5778,0.7080,0.9341,0.9014,0.6657,0.3010,0.2245,0.2260,0.5590,0.5572,0.2839,0.8725,0.4375,0.5789,0.6261,0.4700,0.3920,0.8177,0.2419,0.4794,0.5916,0.7229,0.9035,0.7217,0.6203,0.6046,0.6619,0.7421,0.9724,0.6678,0.2523,0.1122,0.6769,0.1991,0.6063,0.8894
63
+ convnext_base_w,laion2b_s13b_b82k,179.39,49.38,0.5744,0.7078,0.9222,0.9383,0.7519,0.2385,0.1866,0.2018,0.5957,0.5678,0.2825,0.8711,0.4930,0.5712,0.6234,0.3993,0.4815,0.8070,0.1505,0.5435,0.5795,0.6955,0.9189,0.8038,0.4154,0.6041,0.6284,0.8957,0.9775,0.7128,0.3459,0.1181,0.4812,0.1072,0.6075,0.8802
64
+ convnext_base_w,laion_aesthetic_s13b_b82k,179.39,49.38,0.5744,0.7099,0.9061,0.8305,0.6116,0.2960,0.1956,0.2228,0.6229,0.4519,0.2938,0.8847,0.4016,0.5546,0.6342,0.4123,0.4750,0.7986,0.2630,0.6739,0.5559,0.7170,0.9199,0.7548,0.5517,0.5579,0.6162,0.8661,0.9709,0.7143,0.2802,0.1378,0.5859,0.1284,0.6343,0.8722
65
+ ViT-B-16-plus-240,laion400m_e32,208.38,64.03,0.5735,0.6919,0.9239,0.9273,0.7377,0.2387,0.2348,0.1894,0.5548,0.5820,0.1852,0.8734,0.4944,0.5442,0.6148,0.3689,0.4980,0.8049,0.2813,0.5709,0.5384,0.6886,0.9015,0.7636,0.5524,0.5799,0.6137,0.8448,0.9698,0.6985,0.3777,0.1163,0.4876,0.1616,0.5923,0.8697
66
+ ViT-B-16-plus-240,laion400m_e31,208.38,64.03,0.5725,0.6904,0.9219,0.9247,0.7329,0.2413,0.2346,0.1884,0.5548,0.5702,0.1861,0.8735,0.4897,0.5443,0.6138,0.3676,0.5030,0.8038,0.2799,0.5722,0.5374,0.6825,0.9035,0.7634,0.5512,0.5859,0.6144,0.8450,0.9689,0.6991,0.3767,0.1164,0.4837,0.1618,0.5841,0.8689
67
+ ViT-B-32,laion2b_s34b_b79k,151.28,14.78,0.5701,0.6656,0.9105,0.9358,0.7555,0.1535,0.2451,0.1667,0.5569,0.4806,0.2453,0.8269,0.4933,0.5366,0.5814,0.2627,0.4995,0.7643,0.2630,0.6996,0.4883,0.7024,0.9076,0.7910,0.5993,0.5728,0.6106,0.8607,0.9656,0.6872,0.4257,0.0930,0.6392,0.1479,0.5666,0.8543
68
+ RN50x16,openai,290.98,162.69,0.5697,0.7072,0.8856,0.8134,0.5209,0.1953,0.2095,0.2437,0.5266,0.4328,0.2783,0.9051,0.3984,0.5063,0.6420,0.5724,0.4495,0.7933,0.2307,0.6798,0.6071,0.7188,0.8956,0.6800,0.6249,0.6771,0.5883,0.7286,0.9775,0.6391,0.4548,0.1079,0.6248,0.1593,0.6121,0.8539
69
+ ViT-B-16,openai,149.62,41.09,0.5657,0.6834,0.8901,0.9077,0.6695,0.2123,0.2231,0.2282,0.4495,0.5594,0.2421,0.8872,0.4339,0.4824,0.6188,0.4995,0.4230,0.7770,0.2644,0.5135,0.5531,0.6907,0.8886,0.7831,0.5072,0.6068,0.5822,0.6477,0.9825,0.6435,0.5190,0.1099,0.6808,0.1888,0.5876,0.8614
70
+ xlm-roberta-base-ViT-B-32,laion5b_s13b_b90k,366.12,105.87,0.5648,0.6236,0.9079,0.9366,0.7654,0.1675,0.2025,0.1896,0.6037,0.6006,0.2692,0.8010,0.4561,0.5071,0.5425,0.2355,0.4825,0.7410,0.1814,0.7407,0.4607,0.6235,0.8690,0.7856,0.6423,0.5354,0.6137,0.8556,0.9668,0.6785,0.5532,0.0801,0.5770,0.1292,0.5771,0.8647
71
+ convnext_base_w_320,laion_aesthetic_s13b_b82k,179.39,71.94,0.5634,0.7167,0.9136,0.8613,0.5900,0.2283,0.2255,0.2237,0.5931,0.3519,0.2834,0.8930,0.4459,0.5639,0.6398,0.4225,0.4745,0.8054,0.0928,0.6647,0.5616,0.7165,0.9244,0.7240,0.4899,0.5541,0.6176,0.8821,0.9664,0.7161,0.2606,0.1473,0.4729,0.1813,0.6273,0.8856
72
+ ViT-B-16,laion400m_e32,149.62,41.09,0.5633,0.6705,0.9131,0.9172,0.7116,0.2869,0.2451,0.1810,0.5133,0.5019,0.1765,0.8613,0.4346,0.5238,0.5963,0.3324,0.5075,0.7793,0.1814,0.6624,0.5152,0.6691,0.8917,0.7684,0.5960,0.5437,0.5852,0.8373,0.9698,0.6961,0.3413,0.1028,0.5999,0.1546,0.5935,0.8534
73
+ ViT-B-16,laion400m_e31,149.62,41.09,0.5632,0.6698,0.9159,0.9169,0.7130,0.2889,0.2451,0.1804,0.5138,0.5033,0.1742,0.8587,0.4353,0.5233,0.5943,0.3327,0.5035,0.7777,0.1997,0.6531,0.5128,0.6693,0.8911,0.7678,0.5925,0.5459,0.5849,0.8365,0.9703,0.6958,0.3388,0.1056,0.5976,0.1546,0.5946,0.8534
74
+ convnext_base,laion400m_s13b_b51k,151.52,36.67,0.5594,0.6627,0.9151,0.8899,0.6462,0.2386,0.2209,0.1700,0.5404,0.4850,0.1556,0.8515,0.4551,0.5196,0.5859,0.3092,0.4925,0.7575,0.2925,0.6114,0.5058,0.6900,0.8853,0.7528,0.6116,0.5376,0.5683,0.8409,0.9656,0.6845,0.4038,0.1095,0.6565,0.1589,0.5537,0.8530
75
+ ViT-B-32-quickgelu,metaclip_fullcc,151.28,14.78,0.5564,0.6766,0.9290,0.9518,0.7767,0.1871,0.2307,0.1764,0.5883,0.4991,0.2705,0.8309,0.3922,0.5599,0.5957,0.2993,0.4825,0.7805,0.1871,0.4272,0.5286,0.6935,0.9087,0.7652,0.5596,0.5310,0.6124,0.7738,0.9630,0.6689,0.3447,0.0915,0.5656,0.1588,0.6051,0.8610
76
+ coca_ViT-B-32,laion2b_s13b_b90k,253.56,33.34,0.5537,0.6331,0.9078,0.9387,0.7378,0.1831,0.2175,0.1450,0.5367,0.4602,0.1783,0.7893,0.4532,0.5121,0.5522,0.2149,0.4920,0.7376,0.2644,0.7097,0.4470,0.6226,0.8875,0.7832,0.5938,0.5766,0.5994,0.8397,0.9626,0.6736,0.5503,0.0876,0.5749,0.1010,0.5724,0.8430
77
+ ViT-B-32,laion2b_e16,151.28,14.78,0.5470,0.6565,0.9104,0.9403,0.7544,0.1923,0.2310,0.1652,0.5383,0.5030,0.2298,0.8166,0.3655,0.5287,0.5739,0.2615,0.5030,0.7588,0.1758,0.6347,0.4877,0.6732,0.8903,0.7877,0.5072,0.5437,0.6190,0.8437,0.9653,0.6851,0.4164,0.0971,0.4648,0.0000,0.5724,0.8526
78
+ ViT-B-16,datacomp_l_s1b_b8k,149.62,41.09,0.5406,0.6310,0.8969,0.9381,0.7540,0.2314,0.2513,0.1434,0.4691,0.5011,0.1001,0.8311,0.4343,0.4976,0.5521,0.2545,0.4955,0.7177,0.4008,0.5400,0.5298,0.6261,0.8352,0.8089,0.4973,0.5294,0.5273,0.7718,0.9576,0.6431,0.4595,0.0729,0.5000,0.0976,0.5748,0.8493
79
+ roberta-ViT-B-32,laion2b_s12b_b32k,212.72,105.87,0.5405,0.6171,0.9039,0.9325,0.7505,0.1472,0.2007,0.1472,0.5920,0.5215,0.1725,0.7812,0.4082,0.4912,0.5331,0.2120,0.5075,0.7224,0.3854,0.6636,0.4499,0.5893,0.8670,0.7804,0.4985,0.5420,0.6117,0.8315,0.9564,0.6627,0.4526,0.0606,0.4098,0.1161,0.5549,0.8426
80
+ ViT-B-32-quickgelu,metaclip_400m,151.28,14.78,0.5377,0.6558,0.9171,0.9125,0.7006,0.2175,0.2448,0.1716,0.5255,0.5239,0.2680,0.8106,0.3576,0.5330,0.5760,0.2863,0.4680,0.7477,0.2588,0.4144,0.5046,0.6811,0.8877,0.7081,0.6426,0.5338,0.5954,0.7060,0.9543,0.6345,0.2056,0.0819,0.6443,0.0000,0.5970,0.8539
81
+ ViT-B-16,commonpool_l_clip_s1b_b8k,149.62,41.09,0.5348,0.5777,0.8853,0.9349,0.7313,0.2691,0.2313,0.1417,0.4500,0.4728,0.0822,0.7995,0.4657,0.4589,0.4995,0.2165,0.4950,0.6843,0.3755,0.7032,0.4914,0.5667,0.7561,0.7821,0.4962,0.5036,0.5295,0.8171,0.9496,0.6295,0.5985,0.0741,0.4920,0.1257,0.5818,0.8501
82
+ ViT-B-32-quickgelu,laion400m_e32,151.28,14.78,0.5282,0.6293,0.9118,0.9074,0.7029,0.1624,0.2391,0.1475,0.5457,0.5143,0.1658,0.8086,0.4197,0.4939,0.5506,0.2172,0.5345,0.7342,0.2897,0.3733,0.4389,0.6620,0.8671,0.7582,0.5592,0.5228,0.5454,0.7926,0.9560,0.6700,0.3039,0.0745,0.4709,0.1296,0.5491,0.8380
83
+ ViT-B-32-quickgelu,laion400m_e31,151.28,14.78,0.5273,0.6294,0.9121,0.9060,0.7021,0.1659,0.2397,0.1476,0.5447,0.5085,0.1675,0.8080,0.4230,0.4937,0.5487,0.2161,0.5335,0.7349,0.2911,0.3656,0.4374,0.6638,0.8629,0.7539,0.5543,0.5217,0.5446,0.7914,0.9553,0.6702,0.3144,0.0788,0.4554,0.1310,0.5467,0.8363
84
+ ViT-B-32,openai,151.28,14.78,0.5265,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.0732,0.6061,0.1676,0.5386,0.8217
85
+ ViT-B-32-quickgelu,openai,151.28,14.78,0.5265,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.0732,0.6061,0.1676,0.5386,0.8217
86
+ RN50x4,openai,178.3,51.82,0.5191,0.6627,0.8661,0.7943,0.4514,0.2045,0.0905,0.2039,0.4862,0.3354,0.2102,0.8640,0.3622,0.4468,0.5944,0.4145,0.4955,0.7274,0.2335,0.4903,0.5141,0.6766,0.8829,0.6814,0.5675,0.6716,0.5338,0.6673,0.9658,0.6089,0.3190,0.0870,0.5435,0.1130,0.5654,0.8376
87
+ nllb-clip-large-siglip,v1,1195.5,1804.22,0.5148,0.5175,0.8392,0.9651,0.7626,0.1737,0.2211,0.1549,0.4394,0.4941,0.0451,0.6312,0.4700,0.5050,0.4631,0.5611,0.1825,0.8325,0.4290,0.6203,0.6492,0.2846,0.4082,0.7823,0.5004,0.5601,0.5656,0.6451,0.9939,0.6355,0.4258,0.0950,0.5000,0.1415,0.6390,0.8855
88
+ ViT-B-32,laion400m_e31,151.28,14.78,0.5070,0.6022,0.8916,0.8825,0.6781,0.1549,0.2261,0.1356,0.5218,0.4694,0.1437,0.7814,0.4082,0.4648,0.5234,0.1957,0.5085,0.7079,0.1224,0.4108,0.4281,0.6319,0.8541,0.7312,0.5495,0.5162,0.5108,0.7436,0.9494,0.6508,0.2891,0.0745,0.4975,0.1076,0.5491,0.8328
89
+ ViT-B-32,laion400m_e32,151.28,14.78,0.5067,0.6024,0.8918,0.8840,0.6773,0.1536,0.2261,0.1349,0.5229,0.4754,0.1467,0.7817,0.4070,0.4646,0.5237,0.1953,0.5080,0.7084,0.1181,0.4000,0.4292,0.6323,0.8513,0.7328,0.5490,0.5206,0.5094,0.7454,0.9498,0.6509,0.2759,0.0741,0.5084,0.1068,0.5444,0.8326
90
+ RN101,openai,119.69,25.5,0.5036,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.0888,0.4723,0.1615,0.5631,0.8164
91
+ RN101-quickgelu,openai,119.69,25.5,0.5036,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.0888,0.4723,0.1615,0.5631,0.8164
92
+ ViT-B-16,commonpool_l_laion_s1b_b8k,149.62,41.09,0.5017,0.5526,0.8766,0.9296,0.7184,0.2681,0.2173,0.1119,0.4144,0.4115,0.0714,0.7661,0.3296,0.4315,0.4790,0.2004,0.4930,0.6501,0.3432,0.4753,0.4638,0.5023,0.7769,0.7686,0.5158,0.5228,0.5314,0.6760,0.9409,0.6278,0.4301,0.0490,0.5127,0.1026,0.5514,0.8463
93
+ RN50,openai,102.01,18.18,0.4812,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.0623,0.5624,0.0000,0.5222,0.8129
94
+ RN50-quickgelu,openai,102.01,18.18,0.4812,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.0623,0.5624,0.0000,0.5222,0.8129
95
+ ViT-B-16,commonpool_l_image_s1b_b8k,149.62,41.09,0.4812,0.5719,0.8856,0.9321,0.6955,0.2143,0.2453,0.1308,0.4170,0.3193,0.0735,0.7797,0.2514,0.4343,0.4872,0.2143,0.4725,0.6356,0.3826,0.2219,0.4793,0.4817,0.7784,0.7841,0.5002,0.4986,0.4622,0.6627,0.9489,0.6335,0.2673,0.0424,0.5000,0.0000,0.5946,0.8422
96
+ ViT-B-16,commonpool_l_text_s1b_b8k,149.62,41.09,0.4758,0.5605,0.8720,0.9391,0.7054,0.1843,0.2373,0.0995,0.3941,0.3830,0.0451,0.7724,0.2317,0.4437,0.4835,0.2220,0.4770,0.6708,0.2686,0.2593,0.4911,0.5164,0.7049,0.7669,0.4857,0.4931,0.4663,0.6525,0.9523,0.6088,0.2122,0.0623,0.5697,0.0000,0.5643,0.8564
97
+ ViT-B-16,commonpool_l_basic_s1b_b8k,149.62,41.09,0.4566,0.5155,0.8444,0.8289,0.5251,0.2061,0.2277,0.1173,0.4133,0.3820,0.0481,0.7461,0.2021,0.3932,0.4325,0.1913,0.4600,0.6087,0.3333,0.2809,0.4493,0.4357,0.6956,0.7151,0.5899,0.5387,0.4313,0.7216,0.9373,0.5974,0.1173,0.0436,0.5712,0.0000,0.5421,0.8384
98
+ ViT-B-16,commonpool_l_s1b_b8k,149.62,41.09,0.4386,0.4593,0.8089,0.9133,0.6421,0.1594,0.2203,0.1177,0.3383,0.3348,0.0316,0.6735,0.2766,0.3448,0.3914,0.1592,0.4335,0.5265,0.2686,0.3603,0.4126,0.3681,0.5587,0.7093,0.5516,0.5118,0.4154,0.6060,0.9339,0.5713,0.3047,0.0399,0.5102,0.0000,0.5654,0.8305
99
+ nllb-clip-base-siglip,v1,507.47,472.91,0.4377,0.3909,0.7507,0.9043,0.5939,0.1453,0.2254,0.0583,0.3617,0.3744,0.0090,0.4961,0.3429,0.3886,0.3439,0.3165,0.1695,0.6846,0.1927,0.5007,0.5001,0.1567,0.1868,0.7599,0.6692,0.5859,0.5049,0.4703,0.9818,0.5640,0.4033,0.0694,0.6500,0.0956,0.6320,0.8392
100
+ nllb-clip-large,v1,1399.22,1468.46,0.4163,0.3672,0.7234,0.9634,0.6797,0.2389,0.2254,0.0691,0.3447,0.5454,0.0216,0.4447,0.2462,0.3316,0.3233,0.2632,0.1725,0.5624,0.3727,0.2716,0.5268,0.0978,0.1283,0.7551,0.5417,0.5585,0.4983,0.3865,0.9811,0.5512,0.1725,0.0403,0.5181,0.1419,0.6752,0.8305
101
+ ViT-B-32,datacomp_m_s128m_b4k,151.28,14.78,0.3364,0.2972,0.7159,0.8252,0.5476,0.1365,0.2249,0.0453,0.2133,0.3393,0.0304,0.4168,0.1366,0.1930,0.2440,0.0493,0.4085,0.3402,0.2110,0.1147,0.1971,0.2965,0.4311,0.5459,0.5862,0.5316,0.2778,0.2803,0.8365,0.3637,0.1500,0.0142,0.6669,0.0000,0.4498,0.6559
102
+ ViT-B-32,commonpool_m_clip_s128m_b4k,151.28,14.78,0.3344,0.2725,0.6678,0.8405,0.5549,0.1402,0.2238,0.0458,0.2176,0.2589,0.0215,0.3999,0.1586,0.1844,0.2247,0.0420,0.3925,0.3297,0.3235,0.1778,0.2093,0.2551,0.3828,0.6074,0.5210,0.5014,0.2641,0.4123,0.8370,0.3875,0.1931,0.0154,0.5369,0.0000,0.4451,0.6610
103
+ nllb-clip-base,v1,501.89,369.6,0.3290,0.2432,0.5914,0.8435,0.4839,0.1531,0.2254,0.0312,0.2782,0.4104,0.0185,0.2962,0.1852,0.1838,0.2029,0.0921,0.2195,0.3656,0.3741,0.1821,0.2874,0.0850,0.0784,0.6802,0.5509,0.5420,0.3603,0.1921,0.9514,0.4708,0.1441,0.0463,0.4873,0.0000,0.5456,0.7136
104
+ RN50-quickgelu,cc12m,102.01,18.18,0.3193,0.3647,0.6581,0.5404,0.2079,0.2063,0.1574,0.0431,0.1910,0.2146,0.0226,0.4392,0.1284,0.2412,0.3098,0.0759,0.4160,0.4468,0.3713,0.1261,0.2320,0.2383,0.5651,0.4394,0.5033,0.4789,0.2137,0.1837,0.8751,0.4442,0.0918,0.0476,0.5000,0.0000,0.4883,0.7119
105
+ RN50,cc12m,102.01,18.18,0.3180,0.3591,0.6432,0.5241,0.2093,0.2076,0.1576,0.0422,0.2074,0.2202,0.0178,0.4241,0.1155,0.2354,0.3065,0.0763,0.4165,0.4466,0.3713,0.0919,0.2326,0.2465,0.5504,0.4700,0.5035,0.4871,0.2351,0.1818,0.8696,0.4440,0.0923,0.0464,0.5000,0.0000,0.4907,0.7086
106
+ ViT-B-32,commonpool_m_image_s128m_b4k,151.28,14.78,0.3166,0.2678,0.6650,0.7815,0.5203,0.1298,0.2248,0.0466,0.1910,0.2261,0.0219,0.3553,0.1513,0.1623,0.2183,0.0385,0.3795,0.2959,0.2996,0.1079,0.1837,0.2383,0.3482,0.6147,0.5742,0.5266,0.2275,0.1593,0.8171,0.3706,0.1294,0.0149,0.6905,0.0000,0.4638,0.6397
107
+ ViT-B-32,commonpool_m_text_s128m_b4k,151.28,14.78,0.3116,0.2548,0.6632,0.8164,0.5133,0.1891,0.2449,0.0355,0.1995,0.3587,0.0212,0.3568,0.1048,0.1655,0.2142,0.0431,0.3705,0.3107,0.2897,0.1034,0.1889,0.2184,0.2991,0.5355,0.5495,0.5008,0.2627,0.1935,0.7966,0.3535,0.1265,0.0063,0.5336,0.0000,0.4544,0.6317
108
+ ViT-B-32,commonpool_m_laion_s128m_b4k,151.28,14.78,0.2970,0.2304,0.6312,0.7744,0.5009,0.1623,0.2261,0.0345,0.2043,0.1880,0.0169,0.3131,0.0906,0.1515,0.1895,0.0424,0.3480,0.2801,0.2827,0.1520,0.1763,0.2090,0.2973,0.5302,0.6225,0.4964,0.2470,0.2189,0.7774,0.3327,0.0881,0.0167,0.5054,0.0000,0.4357,0.6234
109
+ RN101-quickgelu,yfcc15m,119.69,25.5,0.2967,0.3487,0.5437,0.5298,0.2262,0.1609,0.2504,0.0683,0.1851,0.2030,0.0420,0.4686,0.0940,0.0888,0.3003,0.1568,0.3370,0.2643,0.2068,0.1239,0.1988,0.4942,0.2970,0.4603,0.5004,0.4992,0.2138,0.0373,0.8661,0.4085,0.0781,0.0357,0.5000,0.0546,0.4930,0.6483
110
+ RN101,yfcc15m,119.69,25.5,0.2964,0.3407,0.5538,0.5048,0.2197,0.1369,0.2257,0.0699,0.1899,0.2076,0.0443,0.4729,0.1092,0.0888,0.2933,0.1611,0.3240,0.2629,0.2138,0.1086,0.1991,0.4886,0.3068,0.4886,0.5013,0.4920,0.2011,0.0381,0.8803,0.4235,0.1348,0.0371,0.5000,0.0000,0.5035,0.6509
111
+ ViT-B-32,commonpool_m_basic_s128m_b4k,151.28,14.78,0.2878,0.2255,0.6118,0.6321,0.3531,0.1417,0.2217,0.0423,0.1973,0.2191,0.0155,0.3165,0.1225,0.1434,0.1820,0.0383,0.3505,0.2684,0.2982,0.1229,0.1754,0.1853,0.2752,0.5323,0.5402,0.5014,0.2305,0.2900,0.7793,0.3490,0.0638,0.0133,0.5137,0.0285,0.4591,0.6322
112
+ RN50,yfcc15m,102.01,18.18,0.2784,0.3238,0.5095,0.4943,0.1862,0.1315,0.2003,0.0642,0.1745,0.1811,0.0373,0.4304,0.0844,0.0729,0.2806,0.1371,0.3265,0.2231,0.2602,0.1004,0.1824,0.4680,0.2777,0.3888,0.5331,0.4992,0.1494,0.0429,0.8161,0.3999,0.0640,0.0324,0.5256,0.0501,0.4673,0.6289
113
+ RN50-quickgelu,yfcc15m,102.01,18.18,0.2747,0.3275,0.5089,0.4919,0.2033,0.1305,0.1990,0.0637,0.1729,0.1596,0.0371,0.4493,0.0956,0.0715,0.2793,0.1373,0.3315,0.2220,0.2560,0.0924,0.1772,0.4718,0.2771,0.3845,0.5131,0.4992,0.1424,0.0407,0.7914,0.3919,0.0642,0.0261,0.5058,0.0000,0.4638,0.6343
114
+ ViT-B-32,commonpool_m_s128m_b4k,151.28,14.78,0.2614,0.1755,0.5231,0.7459,0.4391,0.1263,0.2265,0.0362,0.1606,0.2537,0.0115,0.2342,0.0869,0.0952,0.1440,0.0388,0.2780,0.1983,0.2743,0.0933,0.1574,0.1128,0.1676,0.5448,0.5048,0.5003,0.1810,0.1332,0.7690,0.3066,0.0933,0.0127,0.5015,0.0000,0.4276,0.5942
115
+ ViT-B-32,commonpool_s_clip_s13m_b4k,151.28,14.78,0.1778,0.0505,0.2483,0.4768,0.1937,0.1529,0.2313,0.0119,0.0782,0.2067,0.0083,0.0801,0.0732,0.0200,0.0380,0.0181,0.1380,0.0655,0.2785,0.0874,0.0506,0.0539,0.0796,0.3379,0.6367,0.5014,0.0806,0.0276,0.5353,0.1126,0.1166,0.0004,0.6874,0.0000,0.2605,0.2827
116
+ ViT-B-32,commonpool_s_text_s13m_b4k,151.28,14.78,0.1601,0.0460,0.2231,0.4679,0.1844,0.1350,0.1899,0.0121,0.0670,0.0896,0.0139,0.0618,0.0411,0.0175,0.0398,0.0187,0.1270,0.0606,0.3980,0.0771,0.0494,0.0428,0.0581,0.2942,0.5027,0.5008,0.1029,0.0204,0.5019,0.1051,0.0933,0.0015,0.5000,0.0000,0.2745,0.2843
117
+ ViT-B-32,commonpool_s_image_s13m_b4k,151.28,14.78,0.1492,0.0392,0.2238,0.3176,0.1329,0.1121,0.2217,0.0109,0.0521,0.1593,0.0120,0.0604,0.0579,0.0186,0.0308,0.0155,0.1055,0.0578,0.2883,0.0991,0.0436,0.0528,0.0474,0.2666,0.5273,0.4646,0.0794,0.0173,0.4601,0.0725,0.1305,0.0033,0.5425,0.0085,0.2150,0.2752
118
+ ViT-B-32,datacomp_s_s13m_b4k,151.28,14.78,0.1492,0.0392,0.2238,0.3176,0.1329,0.1121,0.2217,0.0109,0.0521,0.1593,0.0120,0.0604,0.0579,0.0186,0.0308,0.0155,0.1055,0.0578,0.2883,0.0991,0.0436,0.0528,0.0474,0.2666,0.5273,0.4646,0.0794,0.0173,0.4601,0.0725,0.1305,0.0033,0.5425,0.0085,0.2150,0.2752
119
+ ViT-B-32,commonpool_s_basic_s13m_b4k,151.28,14.78,0.1445,0.0377,0.1806,0.2664,0.1154,0.1245,0.2335,0.0120,0.0553,0.0587,0.0103,0.0588,0.0638,0.0151,0.0319,0.0203,0.0985,0.0499,0.3390,0.1085,0.0440,0.0351,0.0488,0.3081,0.5096,0.4986,0.0795,0.0200,0.4659,0.0879,0.0810,0.0003,0.5001,0.0000,0.2325,0.2643
120
+ ViT-B-32,commonpool_s_s13m_b4k,151.28,14.78,0.1441,0.0270,0.1564,0.4079,0.1296,0.1305,0.2233,0.0126,0.0574,0.1487,0.0081,0.0473,0.0654,0.0108,0.0234,0.0141,0.1000,0.0404,0.3460,0.0708,0.0360,0.0338,0.0443,0.2235,0.5268,0.5008,0.0698,0.0143,0.4266,0.0766,0.1121,0.0002,0.5124,0.0000,0.2290,0.2167
121
+ ViT-B-32,commonpool_s_laion_s13m_b4k,151.28,14.78,0.1367,0.0305,0.1549,0.3364,0.1347,0.1309,0.1299,0.0098,0.0553,0.1578,0.0134,0.0501,0.0538,0.0125,0.0271,0.0147,0.1015,0.0443,0.2518,0.1387,0.0369,0.0244,0.0399,0.3030,0.4216,0.4992,0.0583,0.0155,0.4874,0.0659,0.1473,0.0017,0.3703,0.0000,0.2079,0.2580
122
+ coca_ViT-B-32,mscoco_finetuned_laion2b_s13b_b90k,253.56,33.34,0.1133,0.0079,0.0320,0.2564,0.0193,0.1245,0.2027,0.0044,0.0303,0.1157,0.0064,0.0159,0.0146,0.0028,0.0067,0.0121,0.0220,0.0199,0.3010,0.1506,0.0144,0.0054,0.0416,0.2023,0.5713,0.4992,0.0478,0.0056,0.2579,0.0204,0.1529,0.0004,0.5681,0.0000,0.1729,0.0589
Finetuning/docs/openclip_multilingual_retrieval_results.csv ADDED
The diff for this file is too large to render. See raw diff