frozenc committed on
Commit
d135479
·
verified ·
1 Parent(s): b9383b1

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,109 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Ops-MM-embedding-v1-7B
2
+
3
+ **Ops-MM-embedding-v1-7B** is a dense, large-scale multimodal embedding model developed and open-sourced by the Alibaba Cloud OpenSearch-AI team, fine-tuned from Qwen2-VL.
4
+
5
+ ---
6
+
7
+ ### **Key Features**
8
+
9
+ #### Unified Multimodal Embeddings
10
+ - Encodes text, images, text-image pairs, visual documents, and videos (by treating video frames as multiple image inputs) into a unified embedding space for cross-modal retrieval.
11
+
12
+ #### High Performance on MMEB
13
+ - Achieves **SOTA results** among models of similar scale on the **MMEB-V2** and **MMEB-Image** benchmarks (as of 2025-07-03).
14
+
15
+ #### Multilingual Capabilities
16
+ - **Ops-MM-embedding-v1-7B** achieves SOTA performance among dense models on the ViDoRe-v2 benchmark, demonstrating strong cross-lingual generalization.
17
+
18
+
19
+
20
+ ### Training data
21
+
22
+ MMEB-train, CC-3M, and the ColPali training set.
23
+
24
+
25
+ ### Performance
26
+
27
+ #### MMEB-V2
28
+
29
+ | Model | Model Size (B) | Overall | Image-Overall | Video-Overall | Visdoc-Overall |
30
+ | ------------------------ | -------------- | ------- | ------------- | ------------- | -------------- |
31
+ | seed-1.6-embedding | unknown | 71.57 | 77.78 | 55.34 | 74.41 |
32
+ | Ops-MM-embedding-v1-7B | 8.29 | 67.79 | 72.72 | 53.76 | 70.91 |
33
+ | Ops-MM-embedding-v1-2B | 2.21 | 63.62 | 69.03 | 47.56 | 67.55 |
34
+ | VLM2Vec-V2.0-Qwen2VL-2B | 2.21 | 58.39 | 64.85 | 34.85 | 66.34 |
35
+ | gme-Qwen2-VL-2B-Instruct | 2.21 | 54.37 | 51.89 | 33.86 | 73.47 |
36
+
37
+ ---
38
+
39
+ #### MMEB-Image
40
+
41
+ The table below compares performance on the MMEB-Image benchmark among models of similar size.
42
+
43
+ | Models | Model Size(B) | Image-Overall | I-CLS | I-QA | I-RET | I-VG |
44
+ | ------------------------------------- | ------------- | ------------- | ----- | ----- | ------ | ------ |
45
+ | Ops-MM-embedding-v1-7B | 8.29 | **72.72** | 69.65 | 69.58 | 73.09 | 87.15 |
46
+ | QQMM-embed | 8.297 | 72.175 | 70.07 | 69.52 | 71.175 | 87.075 |
47
+ | B3_Qwen2_7B | 8.29 | 72 | 70 | 66.5 | 74.1 | 84.6 |
48
+ | UniME(LLaVA-OneVision-7B-LoRA-Res336) | 8.03 | 70.7 | 66.8 | 66.6 | 70.5 | 90.9 |
49
+ | LLaVE-7B | 8.03 | 70.3 | 65.7 | 65.4 | 70.9 | 91.9 |
50
+ | UNITE-Instruct-7B | 8.29 | 70.3 | 68.3 | 65.1 | 71.6 | 84.8 |
51
+
52
+ ---
53
+
54
+ #### ViDoRe-v2
55
+
56
+ | Model | Avg | ESG Restaurant Human | MIT Bio | Econ. Macro | ESG Restaurant Synth. | MIT Bio Multi. | Econ Macro Multi. | ESG Restaurant Synth. Multi. |
57
+ | ---------------------- | -------- | -------------------- | ------- | ----------- | --------------------- | -------------- | ----------------- | ---------------------------- |
58
+ | gme-7B | 59.3 | 65.8 | 64 | 62.9 | 54.3 | 55.1 | 56.2 | 56.7 |
59
+ | seed 1.6 embedding | 58.9 | 63.3 | 63.9 | 64.0 | 58.4 | 57.1 | 53.8 | 52.0 |
60
+ | Ops-MM-embedding-v1-7B | **60.6** | 66.3 | 58.4 | 67.4 | 60.0 | 54.3 | 60.9 | 56.8 |
61
+ | Ops-MM-embedding-v1-2B | 54.4 | 58.6 | 56.0 | 56.4 | 55.8 | 52.9 | 47.9 | 53.4 |
62
+
63
+
64
+
65
+
66
+ ## Usage
67
+
68
+ ```python
69
+ from ops_mm_embedding_v1 import OpsMMEmbeddingV1, fetch_image
70
+
71
+
72
+ model = OpsMMEmbeddingV1(
73
+ "OpenSearch-AI/Ops-MM-embedding-v1-7B",
74
+ device="cuda",
75
+ attn_implementation="flash_attention_2"
76
+ )
77
+
78
+ t2i_prompt = "Find an image that matches the given text."
79
+ texts = [
80
+ "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
81
+ "Alibaba office.",
82
+ "Alibaba office.",
83
+ ]
84
+ images = [
85
+ "https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg",
86
+ "https://upload.wikimedia.org/wikipedia/commons/e/e0/TaobaoCity_Alibaba_Xixi_Park.jpg",
87
+ "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Alibaba_Binjiang_Park.jpg/1024px-Alibaba_Binjiang_Park.jpg"
88
+ ]
89
+
90
+ images = [fetch_image(image) for image in images]
91
+
92
+ # Text and image embedding
93
+ text_embeddings = model.get_text_embeddings(texts)
94
+ image_embeddings = model.get_image_embeddings(images)
95
+ print('Text and image embeddings', (text_embeddings @ image_embeddings.T).tolist())
96
+
97
+ # Fused Embedding
98
+ text_with_image_embeddings = model.get_fused_embeddings(texts=texts, images=images, instruction=t2i_prompt)
99
+ print('Fused embeddings', (text_with_image_embeddings @ text_with_image_embeddings.T).tolist())
100
+
101
+ # Multi-image embeddings
102
+ multi_images = [
103
+ [images[0]],
104
+ [images[1], images[2]],
105
+ ]
106
+ multi_image_embeddings = model.get_image_embeddings(multi_images)
107
+ print('Multi-image embeddings', (multi_image_embeddings @ multi_image_embeddings.T).tolist())
108
+
109
+ ```
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "image_token_id": 151655,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 28,
15
+ "model_type": "qwen2_vl",
16
+ "num_attention_heads": 28,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 4,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": {
21
+ "mrope_section": [
22
+ 16,
23
+ 24,
24
+ 24
25
+ ],
26
+ "rope_type": "default",
27
+ "type": "default"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "bfloat16",
33
+ "transformers_version": "4.51.1",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "video_token_id": 151656,
37
+ "vision_config": {
38
+ "depth": 32,
39
+ "embed_dim": 1280,
40
+ "hidden_act": "quick_gelu",
41
+ "hidden_size": 3584,
42
+ "in_channels": 3,
43
+ "in_chans": 3,
44
+ "mlp_ratio": 4,
45
+ "model_type": "qwen2_vl",
46
+ "num_heads": 16,
47
+ "patch_size": 14,
48
+ "spatial_merge_size": 2,
49
+ "spatial_patch_size": 14,
50
+ "temporal_patch_size": 2
51
+ },
52
+ "vision_end_token_id": 151653,
53
+ "vision_start_token_id": 151652,
54
+ "vision_token_id": 151654,
55
+ "vocab_size": 152064
56
+ }
demo.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ops_mm_embedding_v1 import OpsMMEmbeddingV1, fetch_image
2
+
3
+
4
+ model = OpsMMEmbeddingV1(
5
+ "OpenSearch-AI/Ops-MM-embedding-v1-7B",
6
+ device="cuda",
7
+ attn_implementation="flash_attention_2"
8
+ )
9
+
10
+ t2i_prompt = "Find an image that matches the given text."
11
+ texts = [
12
+ "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
13
+ "Alibaba office.",
14
+ "Alibaba office.",
15
+ ]
16
+ images = [
17
+ "https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg",
18
+ "https://upload.wikimedia.org/wikipedia/commons/e/e0/TaobaoCity_Alibaba_Xixi_Park.jpg",
19
+ "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Alibaba_Binjiang_Park.jpg/1024px-Alibaba_Binjiang_Park.jpg"
20
+ ]
21
+
22
+ images = [fetch_image(image) for image in images]
23
+
24
+ # Text and image embedding
25
+ text_embeddings = model.get_text_embeddings(texts)
26
+ image_embeddings = model.get_image_embeddings(images)
27
+ print('Text and image embeddings', (text_embeddings @ image_embeddings.T).tolist())
28
+
29
+ # Fused Embedding
30
+ text_with_image_embeddings = model.get_fused_embeddings(texts=texts, images=images, instruction=t2i_prompt)
31
+ print('Fused embeddings', (text_with_image_embeddings @ text_with_image_embeddings.T).tolist())
32
+
33
+ # Multi-image embeddings
34
+ multi_images = [
35
+ [images[0]],
36
+ [images[1], images[2]],
37
+ ]
38
+ multi_image_embeddings = model.get_image_embeddings(multi_images)
39
+ print('Multi-image embeddings', (multi_image_embeddings @ multi_image_embeddings.T).tolist())
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.01,
10
+ "top_k": 1,
11
+ "top_p": 0.001,
12
+ "transformers_version": "4.51.1"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4185606d530d99b1bec0f01060de12fe659e47f2f85fa2a391e3119f7141283a
3
+ size 4966659944
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239f228a09c8812e92b21d663830f471127dc6718e70ee66a9b3c6199c07fb37
3
+ size 4991495816
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8d8f54412ec8ab7d2674226038a15166891a44672612af950cf184c9f3ee3dd
3
+ size 4932751040
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4b6498a60113a3998a8a7f8e86a9df5501dd8b0d203d0b557f8a6a6851cff17
3
+ size 1691924384
model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16582751232
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00004-of-00004.safetensors",
345
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
346
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
347
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
348
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
349
+ "visual.blocks.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
350
+ "visual.blocks.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
351
+ "visual.blocks.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
352
+ "visual.blocks.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
353
+ "visual.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
354
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
355
+ "visual.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
356
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
357
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
358
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
359
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
360
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
361
+ "visual.blocks.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
362
+ "visual.blocks.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
363
+ "visual.blocks.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
364
+ "visual.blocks.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
365
+ "visual.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
366
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
367
+ "visual.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
368
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
369
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
370
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
371
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
372
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
373
+ "visual.blocks.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
374
+ "visual.blocks.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
375
+ "visual.blocks.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
376
+ "visual.blocks.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
377
+ "visual.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
378
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
379
+ "visual.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
380
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
381
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
382
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
383
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
384
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
385
+ "visual.blocks.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
386
+ "visual.blocks.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
387
+ "visual.blocks.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
388
+ "visual.blocks.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
389
+ "visual.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
390
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
391
+ "visual.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
392
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
393
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
394
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
395
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
396
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
397
+ "visual.blocks.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
398
+ "visual.blocks.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
399
+ "visual.blocks.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
400
+ "visual.blocks.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
401
+ "visual.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
402
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
403
+ "visual.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
404
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
405
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
406
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
407
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
408
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
409
+ "visual.blocks.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
410
+ "visual.blocks.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
411
+ "visual.blocks.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
412
+ "visual.blocks.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
413
+ "visual.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
414
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
415
+ "visual.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
416
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
417
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
418
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
419
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
420
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
421
+ "visual.blocks.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
422
+ "visual.blocks.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
423
+ "visual.blocks.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
424
+ "visual.blocks.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
425
+ "visual.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
426
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
427
+ "visual.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
428
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
429
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
430
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
431
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
432
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
433
+ "visual.blocks.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
434
+ "visual.blocks.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
435
+ "visual.blocks.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
436
+ "visual.blocks.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
437
+ "visual.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
438
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
439
+ "visual.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
440
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
441
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
442
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.23.norm1.bias": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.23.norm2.bias": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.24.mlp.fc1.bias": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.24.mlp.fc1.weight": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.24.mlp.fc2.bias": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.24.mlp.fc2.weight": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.24.norm2.bias": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.25.mlp.fc1.bias": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.25.mlp.fc1.weight": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.25.mlp.fc2.bias": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.25.mlp.fc2.weight": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.25.norm1.bias": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.25.norm2.bias": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.26.mlp.fc1.bias": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.26.mlp.fc2.bias": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.26.norm1.bias": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.26.norm2.bias": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.27.mlp.fc1.bias": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.27.mlp.fc1.weight": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.27.mlp.fc2.bias": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.27.mlp.fc2.weight": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.27.norm1.bias": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.27.norm2.bias": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.28.mlp.fc1.bias": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.28.mlp.fc1.weight": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.28.mlp.fc2.bias": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.28.mlp.fc2.weight": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.28.norm1.bias": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.28.norm2.bias": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.29.mlp.fc1.bias": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.29.mlp.fc1.weight": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.29.mlp.fc2.bias": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.29.mlp.fc2.weight": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.29.norm1.bias": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.29.norm2.bias": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.30.mlp.fc1.bias": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.30.mlp.fc1.weight": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.30.mlp.fc2.bias": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.30.mlp.fc2.weight": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.30.norm1.bias": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.30.norm2.bias": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.31.mlp.fc1.bias": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.31.mlp.fc1.weight": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.31.mlp.fc2.bias": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.31.mlp.fc2.weight": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.31.norm1.bias": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.31.norm2.bias": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
673
+ "visual.blocks.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
674
+ "visual.blocks.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
675
+ "visual.blocks.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
676
+ "visual.blocks.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
677
+ "visual.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
678
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
679
+ "visual.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
680
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
681
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
682
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
683
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
684
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
685
+ "visual.blocks.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
686
+ "visual.blocks.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
687
+ "visual.blocks.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
688
+ "visual.blocks.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
689
+ "visual.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
690
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
691
+ "visual.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
692
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
693
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
694
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
695
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
696
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
697
+ "visual.blocks.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
698
+ "visual.blocks.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
699
+ "visual.blocks.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
700
+ "visual.blocks.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
701
+ "visual.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
702
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
703
+ "visual.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
704
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
705
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
706
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
707
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
708
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
709
+ "visual.blocks.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
710
+ "visual.blocks.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
711
+ "visual.blocks.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
712
+ "visual.blocks.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
713
+ "visual.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
714
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
715
+ "visual.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
716
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
717
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
718
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
719
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
720
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
721
+ "visual.blocks.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
722
+ "visual.blocks.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
723
+ "visual.blocks.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
724
+ "visual.blocks.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
725
+ "visual.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
726
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
727
+ "visual.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
728
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
729
+ "visual.merger.ln_q.bias": "model-00001-of-00004.safetensors",
730
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
731
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
732
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
733
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
734
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
735
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
736
+ }
737
+ }
ops_mm_embedding_v1.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List, Optional, TypeAlias, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from PIL import Image
7
+ from tqdm import tqdm
8
+ from transformers import AutoModelForImageTextToText, AutoProcessor
9
+
10
+ ImageInput: TypeAlias = Union[Image.Image, List[Image.Image]]
11
+ BatchImageInput: TypeAlias = Union[List[Image.Image], List[List[Image.Image]]]
12
+
13
+
14
class OpsMMEmbeddingV1(nn.Module):
    """Embedding wrapper around an image-text-to-text backbone.

    Each input (text, image(s), or both) is wrapped in a chat-style prompt
    that ends with ``<|endoftext|>``. The prompt is run through the backbone
    and the final-layer hidden state at the last sequence position is
    L2-normalized and returned as the embedding.
    """

    def __init__(
        self,
        model_name: str,
        device: str = "cuda",
        max_length: Optional[int] = None,
        attn_implementation: Optional[str] = None,
    ):
        """Load the backbone model and its processor.

        Args:
            model_name: Identifier or path handed to ``from_pretrained``.
            device: Device the model and every input batch are moved to.
            max_length: Truncation length forwarded to the processor.
            attn_implementation: Forwarded unchanged to ``from_pretrained``.
        """
        super().__init__()
        self.device = device
        self.max_length = max_length
        self.default_instruction = "You are a helpful assistant."
        self.base_model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            attn_implementation=attn_implementation,
        ).to(self.device)

        self.processor = AutoProcessor.from_pretrained(model_name, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28)
        # Left padding keeps the final prompt token at index -1 for every row,
        # which is the position _pooling reads.
        self.processor.tokenizer.padding_side = "left"
        self.eval()

    def encode_input(self, input):
        """Run the backbone on processor outputs and return pooled embeddings."""
        outputs = self.base_model(**input, return_dict=True, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        return self._pooling(last_hidden_state)

    def _pooling(self, last_hidden_state):
        """Return the L2-normalized hidden state at the last sequence position."""
        reps = last_hidden_state[:, -1, :]
        return torch.nn.functional.normalize(reps, p=2, dim=-1)

    @staticmethod
    def _batch_length(texts: Optional[List[str]], images: "Optional[BatchImageInput]") -> int:
        """Return the number of inputs, checking texts and images agree.

        Raises:
            ValueError: If both are ``None`` or their lengths differ.
        """
        if texts is None and images is None:
            raise ValueError("Either texts or images must be provided")
        if texts is not None and images is not None and len(texts) != len(images):
            raise ValueError(f"texts ({len(texts)}) and images ({len(images)}) must have the same length")
        return len(texts) if texts is not None else len(images)  # type: ignore

    def _validate_instructions(
        self,
        texts: Optional[List[str]],
        images: "Optional[BatchImageInput]",
        instruction: Optional[Union[str, List[str]]],
    ) -> List[str]:
        """Validate and format instructions to match batch size.

        Returns:
            One instruction per input: the default instruction when
            ``instruction`` is ``None``, a repeated string when it is a
            ``str``, or the list itself when its length matches.

        Raises:
            ValueError: If a list of the wrong length is given.
            TypeError: If ``instruction`` is not ``str``, ``list`` or ``None``.
        """
        batch_size = max(len(x) if x is not None else 0 for x in [texts, images])

        if instruction is None:
            return [self.default_instruction] * batch_size

        if isinstance(instruction, str):
            return [instruction] * batch_size

        if isinstance(instruction, list):
            if len(instruction) != batch_size:
                raise ValueError(f"Length of instruction list ({len(instruction)}) must match batch size ({batch_size}) when texts/images are provided")
            return instruction

        raise TypeError("instruction must be str, List[str] or None")

    def _process_images(self, images: "ImageInput") -> "List[Image.Image]":
        """Convert a single image or a list of images into a list of fetched images."""
        if isinstance(images, (Image.Image, str)):
            return [fetch_image(images)]
        return [fetch_image(i) for i in images]

    def embed(
        self,
        texts: Optional[List[str]] = None,
        images: "Optional[BatchImageInput]" = None,
        instruction: Optional[Union[str, List[str]]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Generate embeddings for text, images, or combined inputs.

        Args:
            texts: List of text inputs (optional)
            images: Can be:
                - List[Image.Image]: Single image per input
                - List[List[Image.Image]]: Multiple images per input
            instruction: Instruction(s) for the model. Can be:
                - None: use default instruction
                - str: use same instruction for all inputs
                - List[str]: per-input instructions (must match batch size)

        Returns:
            One normalized embedding row per input, on ``self.device``.

        Raises:
            ValueError: If neither texts nor images is given, if their
                lengths differ, or if the instruction list has the wrong
                length.
        """
        batch_size = self._batch_length(texts, images)
        instructions = self._validate_instructions(texts, images, instruction)

        input_texts, input_images = [], []
        for i in range(batch_size):
            text = texts[i] if texts is not None else None
            image = images[i] if images is not None else None

            input_str = ""
            processed_image = None
            if image is not None:
                processed_image = self._process_images(image)
                # One vision placeholder per image belonging to this input.
                input_str += "<|vision_start|><|image_pad|><|vision_end|>" * len(processed_image)

            if text is not None:
                input_str += text

            msg = f"<|im_start|>system\n{instructions[i]}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"

            input_texts.append(msg)
            input_images.append(processed_image)

        # Only pass to processor if we actually have images
        processed_images = input_images if any(img is not None for img in input_images) else None

        inputs = self.processor(
            text=input_texts,
            images=processed_images,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.inference_mode():
            embeddings = self.encode_input(inputs)

        return embeddings

    def get_text_embeddings(
        self,
        texts: List[str],
        instruction: Optional[Union[str, List[str]]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Convenience method for text-only embeddings."""
        return self.get_fused_embeddings(texts=texts, instruction=instruction, **kwargs)

    def get_image_embeddings(
        self,
        images: "BatchImageInput",
        instruction: Optional[Union[str, List[str]]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Convenience method for image-only embeddings.

        Args:
            images: Can be:
                - List[Image.Image]: Single image per input
                - List[List[Image.Image]]: Multiple images per input
        """
        return self.get_fused_embeddings(images=images, instruction=instruction, **kwargs)

    def get_fused_embeddings(
        self,
        texts: Optional[List[str]] = None,
        images: "Optional[BatchImageInput]" = None,
        instruction: Optional[Union[str, List[str]]] = None,
        batch_size: int = 8,
        show_progress: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """Batch processing for large collections of texts/images.

        Args:
            texts: List of text inputs (optional)
            images: Can be:
                - List[Image.Image]: Single image per input
                - List[List[Image.Image]]: Multiple images per input
            instruction: Instruction(s) for the model. A list must contain
                one entry per input across the whole collection.
            batch_size: Number of items to process at once
            show_progress: Whether to display progress bar

        Returns:
            All embeddings concatenated along dim 0, on ``self.device``.

        Raises:
            ValueError: If neither texts nor images is given, if their
                lengths differ, if ``batch_size`` is not positive, or if the
                instruction list length does not match the number of inputs.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        total_items = self._batch_length(texts, images)
        # Resolve instructions once for the whole collection so that a
        # per-input list can be sliced in step with each mini-batch.
        instructions = self._validate_instructions(texts, images, instruction)
        num_batches = math.ceil(total_items / batch_size)

        all_embeddings = []
        # The context manager closes the bar even if a batch raises.
        with tqdm(total=num_batches, disable=not show_progress, desc="Processing") as progress:
            for start in range(0, total_items, batch_size):
                stop = start + batch_size
                batch_texts = texts[start:stop] if texts is not None else None
                batch_images = images[start:stop] if images is not None else None
                batch_emb = self.embed(
                    texts=batch_texts,
                    images=batch_images,
                    instruction=instructions[start:stop],
                )

                # Move each batch to CPU while the remaining batches run.
                all_embeddings.append(batch_emb.cpu())
                progress.update(1)

        return torch.cat(all_embeddings, dim=0).to(self.device)

    def forward(self, **inputs) -> torch.Tensor:
        """Alias for encode_input."""
        return self.encode_input(inputs)
209
+
210
+
211
+ ### Modified from qwen_vl_utils.vision_process.py
212
+ import base64
213
+ import logging
214
+ import math
215
+ from io import BytesIO
216
+
217
+ import requests
218
+
219
# Defaults shared by smart_resize and fetch_image.
# Resized height and width are multiples of this factor.
IMAGE_FACTOR = 28
# Lower and upper bounds on the resized pixel count (height * width).
MIN_PIXELS = 256 * 28**2
MAX_PIXELS = 1280 * 28**2
# Largest long-side / short-side ratio smart_resize lets through unclamped.
MAX_RATIO = 200
223
+
224
+
225
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of ``factor`` nearest to ``number``.

    Uses the built-in ``round``, so exact halves go to the even multiple.
    """
    multiples = round(number / factor)
    return multiples * factor
228
+
229
+
230
+ def ceil_by_factor(number: int | float, factor: int) -> int:
231
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
232
+ return math.ceil(number / factor) * factor
233
+
234
+
235
+ def floor_by_factor(number: int | float, factor: int) -> int:
236
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
237
+ return math.floor(number / factor) * factor
238
+
239
+
240
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
    """Compute a target size for an image.

    Rescales the image so that the following conditions are met:
        1. Both dimensions (height and width) are divisible by 'factor'.
        2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
        3. The aspect ratio of the image is maintained as closely as possible.

    If the resulting aspect ratio still exceeds ``MAX_RATIO``, a warning is
    logged and the longer side is clamped to ``MAX_RATIO`` times the shorter
    one. This clamp runs last, so the clamped size may fall below
    ``min_pixels``.

    Args:
        height: Original image height in pixels (must be positive).
        width: Original image width in pixels (must be positive).
        factor: Both returned dimensions are multiples of this value.
        min_pixels: Lower bound on ``height * width`` of the result.
        max_pixels: Upper bound on ``height * width`` of the result.

    Returns:
        ``(resized_height, resized_width)``.

    Raises:
        ValueError: If ``height`` or ``width`` is not positive.
    """
    if height <= 0 or width <= 0:
        raise ValueError(f"height and width must be positive, got {height} x {width}")

    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        # Flooring can reach 0 for a very thin image (e.g. 50 x 100000), which
        # made the ratio check below divide by zero; keep at least one factor.
        h_bar = max(factor, floor_by_factor(height / beta, factor))
        w_bar = max(factor, floor_by_factor(width / beta, factor))
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if max(h_bar, w_bar) / min(h_bar, w_bar) > MAX_RATIO:
        logging.warning(f"Absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(h_bar, w_bar) / min(h_bar, w_bar)}")
        # Clamp the longer side; the product stays a multiple of factor.
        if h_bar > w_bar:
            h_bar = w_bar * MAX_RATIO
        else:
            w_bar = h_bar * MAX_RATIO
    return h_bar, w_bar
271
+
272
+
273
def fetch_image(
    image: str | Image.Image,
    size_factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
    timeout: float = 30.0,
) -> Image.Image:
    """Load an image, convert it to RGB and resize it via ``smart_resize``.

    Args:
        image: A ``PIL.Image.Image``, an ``http(s)://`` URL, a ``file://``
            path, a ``data:image...;base64,`` URI, or a local file path.
        size_factor: Passed to ``smart_resize`` as ``factor``.
        min_pixels: Passed to ``smart_resize``.
        max_pixels: Passed to ``smart_resize``.
        timeout: Seconds to wait for the HTTP server when ``image`` is a URL.

    Returns:
        The RGB image resized to the dimensions chosen by ``smart_resize``.

    Raises:
        ValueError: If the input form is not recognized (for example a
            ``data:image`` URI without a ``base64,`` payload).
        requests.HTTPError: If a URL download returns an error status.
    """
    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif image.startswith(("http://", "https://")):
        # Read the full body so the connection can be released, and fail on
        # HTTP errors instead of handing an error page to PIL.
        with requests.get(image, timeout=timeout) as response:
            response.raise_for_status()
            image_obj = Image.open(BytesIO(response.content))
    elif image.startswith("file://"):
        image_obj = Image.open(image[7:])
    elif image.startswith("data:image"):
        if "base64," in image:
            _, base64_data = image.split("base64,", 1)
            data = base64.b64decode(base64_data)
            image_obj = Image.open(BytesIO(data))
    else:
        image_obj = Image.open(image)
    if image_obj is None:
        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")

    rgb_image = image_obj.convert("RGB")
    width, height = rgb_image.size
    resized_height, resized_width = smart_resize(
        height,
        width,
        factor=size_factor,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )
    # PIL expects (width, height).
    return rgb_image.resize((resized_width, resized_height))
307
+
308
+
309
+ ###
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
score/Ops-MM-embedding-v1-7B.json ADDED
@@ -0,0 +1,2289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "model_name": "Ops-MM-embedding-v1-7B",
4
+ "model_backbone": "Qwen2-VL-7B",
5
+ "model_size": 8.29,
6
+ "embedding_dimension": 3584,
7
+ "max_length_tokens": null,
8
+ "model_release_date": "2025-07-03",
9
+ "data_source": "OpenSearch-AI",
10
+ "url": "https://huggingface.co/OpenSearch-AI/Ops-MM-embedding-v1-7B",
11
+ "report_generated_date": "2025-07-02T23:11:24.116227"
12
+ },
13
+ "metrics": {
14
+ "image": {
15
+ "ImageNet-1K": {
16
+ "hit@1": 0.811,
17
+ "hit@5": 0.964,
18
+ "hit@10": 0.982,
19
+ "ndcg_linear@1": 0.811,
20
+ "ndcg_linear@5": 0.8972089841991893,
21
+ "ndcg_linear@10": 0.903203094426922,
22
+ "ndcg_exponential@1": 0.811,
23
+ "ndcg_exponential@5": 0.8972089841991893,
24
+ "ndcg_exponential@10": 0.903203094426922,
25
+ "precision@1": 0.811,
26
+ "precision@5": 0.1928,
27
+ "precision@10": 0.09820000000000002,
28
+ "recall@1": 0.811,
29
+ "recall@5": 0.964,
30
+ "recall@10": 0.982,
31
+ "f1@1": 0.811,
32
+ "f1@5": 0.32133333333333336,
33
+ "f1@10": 0.17854545454545453,
34
+ "map@1": 0.811,
35
+ "map@5": 0.8745833333333333,
36
+ "map@10": 0.8771603174603174,
37
+ "mrr@1": 0.811,
38
+ "mrr@5": 0.8745833333333333,
39
+ "mrr@10": 0.8771603174603174,
40
+ "num_pred": 1000,
41
+ "num_data": 1000
42
+ },
43
+ "N24News": {
44
+ "hit@1": 0.821,
45
+ "hit@5": 0.974,
46
+ "hit@10": 0.998,
47
+ "ndcg_linear@1": 0.821,
48
+ "ndcg_linear@5": 0.908003904347735,
49
+ "ndcg_linear@10": 0.9159637568629064,
50
+ "ndcg_exponential@1": 0.821,
51
+ "ndcg_exponential@5": 0.908003904347735,
52
+ "ndcg_exponential@10": 0.9159637568629064,
53
+ "precision@1": 0.821,
54
+ "precision@5": 0.1948,
55
+ "precision@10": 0.09980000000000001,
56
+ "recall@1": 0.821,
57
+ "recall@5": 0.974,
58
+ "recall@10": 0.998,
59
+ "f1@1": 0.821,
60
+ "f1@5": 0.3246666666666666,
61
+ "f1@10": 0.18145454545454545,
62
+ "map@1": 0.821,
63
+ "map@5": 0.8855333333333333,
64
+ "map@10": 0.8889365079365079,
65
+ "mrr@1": 0.821,
66
+ "mrr@5": 0.8855333333333333,
67
+ "mrr@10": 0.8889365079365079,
68
+ "num_pred": 1000,
69
+ "num_data": 1000
70
+ },
71
+ "HatefulMemes": {
72
+ "hit@1": 0.757,
73
+ "hit@5": 1.0,
74
+ "hit@10": 1.0,
75
+ "ndcg_linear@1": 0.757,
76
+ "ndcg_linear@5": 0.9103159301178643,
77
+ "ndcg_linear@10": 0.9103159301178643,
78
+ "ndcg_exponential@1": 0.757,
79
+ "ndcg_exponential@5": 0.9103159301178643,
80
+ "ndcg_exponential@10": 0.9103159301178643,
81
+ "precision@1": 0.757,
82
+ "precision@5": 0.20000000000000004,
83
+ "precision@10": 0.10000000000000002,
84
+ "recall@1": 0.757,
85
+ "recall@5": 1.0,
86
+ "recall@10": 1.0,
87
+ "f1@1": 0.757,
88
+ "f1@5": 0.3333333333333333,
89
+ "f1@10": 0.18181818181818182,
90
+ "map@1": 0.757,
91
+ "map@5": 0.8785,
92
+ "map@10": 0.8785,
93
+ "mrr@1": 0.757,
94
+ "mrr@5": 0.8785,
95
+ "mrr@10": 0.8785,
96
+ "num_pred": 1000,
97
+ "num_data": 1000
98
+ },
99
+ "VOC2007": {
100
+ "hit@1": 0.848,
101
+ "hit@5": 0.977,
102
+ "hit@10": 0.992,
103
+ "ndcg_linear@1": 0.848,
104
+ "ndcg_linear@5": 0.9241752272291508,
105
+ "ndcg_linear@10": 0.9290490649008084,
106
+ "ndcg_exponential@1": 0.848,
107
+ "ndcg_exponential@5": 0.9241752272291508,
108
+ "ndcg_exponential@10": 0.9290490649008084,
109
+ "precision@1": 0.848,
110
+ "precision@5": 0.19540000000000005,
111
+ "precision@10": 0.09920000000000001,
112
+ "recall@1": 0.848,
113
+ "recall@5": 0.977,
114
+ "recall@10": 0.992,
115
+ "f1@1": 0.848,
116
+ "f1@5": 0.3256666666666666,
117
+ "f1@10": 0.18036363636363634,
118
+ "map@1": 0.848,
119
+ "map@5": 0.9059333333333334,
120
+ "map@10": 0.9079559523809524,
121
+ "mrr@1": 0.848,
122
+ "mrr@5": 0.9059333333333334,
123
+ "mrr@10": 0.9079559523809524,
124
+ "num_pred": 1000,
125
+ "num_data": 1000
126
+ },
127
+ "SUN397": {
128
+ "hit@1": 0.81,
129
+ "hit@5": 0.975,
130
+ "hit@10": 0.993,
131
+ "ndcg_linear@1": 0.81,
132
+ "ndcg_linear@5": 0.9044714381294062,
133
+ "ndcg_linear@10": 0.9104367712161627,
134
+ "ndcg_exponential@1": 0.81,
135
+ "ndcg_exponential@5": 0.9044714381294062,
136
+ "ndcg_exponential@10": 0.9104367712161627,
137
+ "precision@1": 0.81,
138
+ "precision@5": 0.19500000000000003,
139
+ "precision@10": 0.09930000000000001,
140
+ "recall@1": 0.81,
141
+ "recall@5": 0.975,
142
+ "recall@10": 0.993,
143
+ "f1@1": 0.81,
144
+ "f1@5": 0.325,
145
+ "f1@10": 0.1805454545454545,
146
+ "map@1": 0.81,
147
+ "map@5": 0.8804500000000001,
148
+ "map@10": 0.8829964285714286,
149
+ "mrr@1": 0.81,
150
+ "mrr@5": 0.8804500000000001,
151
+ "mrr@10": 0.8829964285714286,
152
+ "num_pred": 1000,
153
+ "num_data": 1000
154
+ },
155
+ "Place365": {
156
+ "hit@1": 0.456,
157
+ "hit@5": 0.742,
158
+ "hit@10": 0.832,
159
+ "ndcg_linear@1": 0.456,
160
+ "ndcg_linear@5": 0.612464933944354,
161
+ "ndcg_linear@10": 0.6421212605535749,
162
+ "ndcg_exponential@1": 0.456,
163
+ "ndcg_exponential@5": 0.612464933944354,
164
+ "ndcg_exponential@10": 0.6421212605535749,
165
+ "precision@1": 0.456,
166
+ "precision@5": 0.14839999999999998,
167
+ "precision@10": 0.08320000000000002,
168
+ "recall@1": 0.456,
169
+ "recall@5": 0.742,
170
+ "recall@10": 0.832,
171
+ "f1@1": 0.456,
172
+ "f1@5": 0.24733333333333338,
173
+ "f1@10": 0.15127272727272728,
174
+ "map@1": 0.456,
175
+ "map@5": 0.5689833333333334,
176
+ "map@10": 0.5815511904761905,
177
+ "mrr@1": 0.456,
178
+ "mrr@5": 0.5689833333333334,
179
+ "mrr@10": 0.5815511904761905,
180
+ "num_pred": 1000,
181
+ "num_data": 1000
182
+ },
183
+ "ImageNet-A": {
184
+ "hit@1": 0.578,
185
+ "hit@5": 0.814,
186
+ "hit@10": 0.873,
187
+ "ndcg_linear@1": 0.578,
188
+ "ndcg_linear@5": 0.7067527294684098,
189
+ "ndcg_linear@10": 0.7260494542589042,
190
+ "ndcg_exponential@1": 0.578,
191
+ "ndcg_exponential@5": 0.7067527294684098,
192
+ "ndcg_exponential@10": 0.7260494542589042,
193
+ "precision@1": 0.578,
194
+ "precision@5": 0.1628,
195
+ "precision@10": 0.08730000000000002,
196
+ "recall@1": 0.578,
197
+ "recall@5": 0.814,
198
+ "recall@10": 0.873,
199
+ "f1@1": 0.578,
200
+ "f1@5": 0.27133333333333337,
201
+ "f1@10": 0.15872727272727272,
202
+ "map@1": 0.578,
203
+ "map@5": 0.67075,
204
+ "map@10": 0.6788373015873016,
205
+ "mrr@1": 0.578,
206
+ "mrr@5": 0.67075,
207
+ "mrr@10": 0.6788373015873016,
208
+ "num_pred": 1000,
209
+ "num_data": 1000
210
+ },
211
+ "ImageNet-R": {
212
+ "hit@1": 0.902,
213
+ "hit@5": 0.981,
214
+ "hit@10": 0.992,
215
+ "ndcg_linear@1": 0.902,
216
+ "ndcg_linear@5": 0.9490524502565482,
217
+ "ndcg_linear@10": 0.9525844230009524,
218
+ "ndcg_exponential@1": 0.902,
219
+ "ndcg_exponential@5": 0.9490524502565482,
220
+ "ndcg_exponential@10": 0.9525844230009524,
221
+ "precision@1": 0.902,
222
+ "precision@5": 0.19620000000000004,
223
+ "precision@10": 0.09920000000000001,
224
+ "recall@1": 0.902,
225
+ "recall@5": 0.981,
226
+ "recall@10": 0.992,
227
+ "f1@1": 0.902,
228
+ "f1@5": 0.327,
229
+ "f1@10": 0.18036363636363634,
230
+ "map@1": 0.902,
231
+ "map@5": 0.9379833333333334,
232
+ "map@10": 0.9394273809523809,
233
+ "mrr@1": 0.902,
234
+ "mrr@5": 0.9379833333333334,
235
+ "mrr@10": 0.9394273809523809,
236
+ "num_pred": 1000,
237
+ "num_data": 1000
238
+ },
239
+ "ObjectNet": {
240
+ "hit@1": 0.697,
241
+ "hit@5": 0.876,
242
+ "hit@10": 0.912,
243
+ "ndcg_linear@1": 0.697,
244
+ "ndcg_linear@5": 0.7998091251553962,
245
+ "ndcg_linear@10": 0.8114293964429173,
246
+ "ndcg_exponential@1": 0.697,
247
+ "ndcg_exponential@5": 0.7998091251553962,
248
+ "ndcg_exponential@10": 0.8114293964429173,
249
+ "precision@1": 0.697,
250
+ "precision@5": 0.17520000000000002,
251
+ "precision@10": 0.09120000000000002,
252
+ "recall@1": 0.697,
253
+ "recall@5": 0.876,
254
+ "recall@10": 0.912,
255
+ "f1@1": 0.697,
256
+ "f1@5": 0.292,
257
+ "f1@10": 0.1658181818181818,
258
+ "map@1": 0.697,
259
+ "map@5": 0.7738333333333334,
260
+ "map@10": 0.778622619047619,
261
+ "mrr@1": 0.697,
262
+ "mrr@5": 0.7738333333333334,
263
+ "mrr@10": 0.778622619047619,
264
+ "num_pred": 1000,
265
+ "num_data": 1000
266
+ },
267
+ "Country211": {
268
+ "hit@1": 0.285,
269
+ "hit@5": 0.506,
270
+ "hit@10": 0.609,
271
+ "ndcg_linear@1": 0.285,
272
+ "ndcg_linear@5": 0.40328589269051235,
273
+ "ndcg_linear@10": 0.4360567158341879,
274
+ "ndcg_exponential@1": 0.285,
275
+ "ndcg_exponential@5": 0.40328589269051235,
276
+ "ndcg_exponential@10": 0.4360567158341879,
277
+ "precision@1": 0.285,
278
+ "precision@5": 0.10119999999999998,
279
+ "precision@10": 0.06089999999999999,
280
+ "recall@1": 0.285,
281
+ "recall@5": 0.506,
282
+ "recall@10": 0.609,
283
+ "f1@1": 0.285,
284
+ "f1@5": 0.1686666666666667,
285
+ "f1@10": 0.11072727272727274,
286
+ "map@1": 0.285,
287
+ "map@5": 0.36905,
288
+ "map@10": 0.3822527777777777,
289
+ "mrr@1": 0.285,
290
+ "mrr@5": 0.36905,
291
+ "mrr@10": 0.3822527777777777,
292
+ "num_pred": 1000,
293
+ "num_data": 1000
294
+ },
295
+ "OK-VQA": {
296
+ "hit@1": 0.706,
297
+ "hit@5": 0.935,
298
+ "hit@10": 0.97,
299
+ "ndcg_linear@1": 0.706,
300
+ "ndcg_linear@5": 0.8344075880155006,
301
+ "ndcg_linear@10": 0.8459473006000523,
302
+ "ndcg_exponential@1": 0.706,
303
+ "ndcg_exponential@5": 0.8344075880155006,
304
+ "ndcg_exponential@10": 0.8459473006000523,
305
+ "precision@1": 0.706,
306
+ "precision@5": 0.18700000000000003,
307
+ "precision@10": 0.097,
308
+ "recall@1": 0.706,
309
+ "recall@5": 0.935,
310
+ "recall@10": 0.97,
311
+ "f1@1": 0.706,
312
+ "f1@5": 0.3116666666666667,
313
+ "f1@10": 0.17636363636363633,
314
+ "map@1": 0.706,
315
+ "map@5": 0.8004,
316
+ "map@10": 0.805288492063492,
317
+ "mrr@1": 0.706,
318
+ "mrr@5": 0.8004,
319
+ "mrr@10": 0.805288492063492,
320
+ "num_pred": 1000,
321
+ "num_data": 1000
322
+ },
323
+ "A-OKVQA": {
324
+ "hit@1": 0.6,
325
+ "hit@5": 0.885,
326
+ "hit@10": 0.941,
327
+ "ndcg_linear@1": 0.6,
328
+ "ndcg_linear@5": 0.7567001853131614,
329
+ "ndcg_linear@10": 0.774920654838386,
330
+ "ndcg_exponential@1": 0.6,
331
+ "ndcg_exponential@5": 0.7567001853131614,
332
+ "ndcg_exponential@10": 0.774920654838386,
333
+ "precision@1": 0.6,
334
+ "precision@5": 0.177,
335
+ "precision@10": 0.09410000000000002,
336
+ "recall@1": 0.6,
337
+ "recall@5": 0.885,
338
+ "recall@10": 0.941,
339
+ "f1@1": 0.6,
340
+ "f1@5": 0.295,
341
+ "f1@10": 0.17109090909090907,
342
+ "map@1": 0.6,
343
+ "map@5": 0.7134833333333334,
344
+ "map@10": 0.7210642857142857,
345
+ "mrr@1": 0.6,
346
+ "mrr@5": 0.7134833333333334,
347
+ "mrr@10": 0.7210642857142857,
348
+ "num_pred": 1000,
349
+ "num_data": 1000
350
+ },
351
+ "DocVQA": {
352
+ "hit@1": 0.947,
353
+ "hit@5": 0.995,
354
+ "hit@10": 0.996,
355
+ "ndcg_linear@1": 0.947,
356
+ "ndcg_linear@5": 0.9755232833635987,
357
+ "ndcg_linear@10": 0.9758794905507068,
358
+ "ndcg_exponential@1": 0.947,
359
+ "ndcg_exponential@5": 0.9755232833635987,
360
+ "ndcg_exponential@10": 0.9758794905507068,
361
+ "precision@1": 0.947,
362
+ "precision@5": 0.19900000000000004,
363
+ "precision@10": 0.09960000000000002,
364
+ "recall@1": 0.947,
365
+ "recall@5": 0.995,
366
+ "recall@10": 0.996,
367
+ "f1@1": 0.947,
368
+ "f1@5": 0.3316666666666666,
369
+ "f1@10": 0.18109090909090905,
370
+ "map@1": 0.947,
371
+ "map@5": 0.9687833333333334,
372
+ "map@10": 0.9689500000000001,
373
+ "mrr@1": 0.947,
374
+ "mrr@5": 0.9687833333333334,
375
+ "mrr@10": 0.9689500000000001,
376
+ "num_pred": 1000,
377
+ "num_data": 1000
378
+ },
379
+ "InfographicsVQA": {
380
+ "hit@1": 0.736,
381
+ "hit@5": 0.906,
382
+ "hit@10": 0.932,
383
+ "ndcg_linear@1": 0.736,
384
+ "ndcg_linear@5": 0.8277407513883303,
385
+ "ndcg_linear@10": 0.8362488999302439,
386
+ "ndcg_exponential@1": 0.736,
387
+ "ndcg_exponential@5": 0.8277407513883303,
388
+ "ndcg_exponential@10": 0.8362488999302439,
389
+ "precision@1": 0.736,
390
+ "precision@5": 0.18120000000000003,
391
+ "precision@10": 0.09320000000000002,
392
+ "recall@1": 0.736,
393
+ "recall@5": 0.906,
394
+ "recall@10": 0.932,
395
+ "f1@1": 0.736,
396
+ "f1@5": 0.302,
397
+ "f1@10": 0.16945454545454544,
398
+ "map@1": 0.736,
399
+ "map@5": 0.8015666666666666,
400
+ "map@10": 0.805140873015873,
401
+ "mrr@1": 0.736,
402
+ "mrr@5": 0.8015666666666666,
403
+ "mrr@10": 0.805140873015873,
404
+ "num_pred": 1000,
405
+ "num_data": 1000
406
+ },
407
+ "ChartQA": {
408
+ "hit@1": 0.652,
409
+ "hit@5": 0.81,
410
+ "hit@10": 0.873,
411
+ "ndcg_linear@1": 0.652,
412
+ "ndcg_linear@5": 0.7383237284140184,
413
+ "ndcg_linear@10": 0.758789100394747,
414
+ "ndcg_exponential@1": 0.652,
415
+ "ndcg_exponential@5": 0.7383237284140184,
416
+ "ndcg_exponential@10": 0.758789100394747,
417
+ "precision@1": 0.652,
418
+ "precision@5": 0.162,
419
+ "precision@10": 0.08730000000000002,
420
+ "recall@1": 0.652,
421
+ "recall@5": 0.81,
422
+ "recall@10": 0.873,
423
+ "f1@1": 0.652,
424
+ "f1@5": 0.27,
425
+ "f1@10": 0.15872727272727274,
426
+ "map@1": 0.652,
427
+ "map@5": 0.7142833333333333,
428
+ "map@10": 0.7227880952380953,
429
+ "mrr@1": 0.652,
430
+ "mrr@5": 0.7142833333333333,
431
+ "mrr@10": 0.7227880952380953,
432
+ "num_pred": 1000,
433
+ "num_data": 1000
434
+ },
435
+ "Visual7W": {
436
+ "hit@1": 0.584,
437
+ "hit@5": 0.904,
438
+ "hit@10": 0.951,
439
+ "ndcg_linear@1": 0.584,
440
+ "ndcg_linear@5": 0.7607038779643807,
441
+ "ndcg_linear@10": 0.7762948323430009,
442
+ "ndcg_exponential@1": 0.584,
443
+ "ndcg_exponential@5": 0.7607038779643807,
444
+ "ndcg_exponential@10": 0.7762948323430009,
445
+ "precision@1": 0.584,
446
+ "precision@5": 0.18080000000000002,
447
+ "precision@10": 0.09510000000000002,
448
+ "recall@1": 0.584,
449
+ "recall@5": 0.904,
450
+ "recall@10": 0.951,
451
+ "f1@1": 0.584,
452
+ "f1@5": 0.3013333333333334,
453
+ "f1@10": 0.1729090909090909,
454
+ "map@1": 0.584,
455
+ "map@5": 0.7124833333333334,
456
+ "map@10": 0.7191464285714286,
457
+ "mrr@1": 0.584,
458
+ "mrr@5": 0.7124833333333334,
459
+ "mrr@10": 0.7191464285714286,
460
+ "num_pred": 1000,
461
+ "num_data": 1000
462
+ },
463
+ "ScienceQA": {
464
+ "hit@1": 0.499,
465
+ "hit@5": 0.809,
466
+ "hit@10": 0.911,
467
+ "ndcg_linear@1": 0.499,
468
+ "ndcg_linear@5": 0.6739528358410788,
469
+ "ndcg_linear@10": 0.7062005471589817,
470
+ "ndcg_exponential@1": 0.499,
471
+ "ndcg_exponential@5": 0.6739528358410788,
472
+ "ndcg_exponential@10": 0.7062005471589817,
473
+ "precision@1": 0.499,
474
+ "precision@5": 0.1618,
475
+ "precision@10": 0.09110000000000001,
476
+ "recall@1": 0.499,
477
+ "recall@5": 0.809,
478
+ "recall@10": 0.911,
479
+ "f1@1": 0.499,
480
+ "f1@5": 0.26966666666666667,
481
+ "f1@10": 0.16563636363636364,
482
+ "map@1": 0.499,
483
+ "map@5": 0.6281666666666667,
484
+ "map@10": 0.641040873015873,
485
+ "mrr@1": 0.499,
486
+ "mrr@5": 0.6281666666666667,
487
+ "mrr@10": 0.641040873015873,
488
+ "num_pred": 1000,
489
+ "num_data": 1000
490
+ },
491
+ "VizWiz": {
492
+ "hit@1": 0.566,
493
+ "hit@5": 0.719,
494
+ "hit@10": 0.751,
495
+ "ndcg_linear@1": 0.566,
496
+ "ndcg_linear@5": 0.6514422712364567,
497
+ "ndcg_linear@10": 0.6616436437103389,
498
+ "ndcg_exponential@1": 0.566,
499
+ "ndcg_exponential@5": 0.6514422712364567,
500
+ "ndcg_exponential@10": 0.6616436437103389,
501
+ "precision@1": 0.566,
502
+ "precision@5": 0.14379999999999998,
503
+ "precision@10": 0.07510000000000001,
504
+ "recall@1": 0.566,
505
+ "recall@5": 0.719,
506
+ "recall@10": 0.751,
507
+ "f1@1": 0.566,
508
+ "f1@5": 0.23966666666666672,
509
+ "f1@10": 0.13654545454545455,
510
+ "map@1": 0.566,
511
+ "map@5": 0.6285833333333333,
512
+ "map@10": 0.6327095238095238,
513
+ "mrr@1": 0.566,
514
+ "mrr@5": 0.6285833333333333,
515
+ "mrr@10": 0.6327095238095238,
516
+ "num_pred": 1000,
517
+ "num_data": 1000
518
+ },
519
+ "GQA": {
520
+ "hit@1": 0.799,
521
+ "hit@5": 0.946,
522
+ "hit@10": 0.972,
523
+ "ndcg_linear@1": 0.799,
524
+ "ndcg_linear@5": 0.8811134365876296,
525
+ "ndcg_linear@10": 0.8897748874138558,
526
+ "ndcg_exponential@1": 0.799,
527
+ "ndcg_exponential@5": 0.8811134365876296,
528
+ "ndcg_exponential@10": 0.8897748874138558,
529
+ "precision@1": 0.799,
530
+ "precision@5": 0.1892,
531
+ "precision@10": 0.09720000000000002,
532
+ "recall@1": 0.799,
533
+ "recall@5": 0.946,
534
+ "recall@10": 0.972,
535
+ "f1@1": 0.799,
536
+ "f1@5": 0.31533333333333335,
537
+ "f1@10": 0.17672727272727268,
538
+ "map@1": 0.799,
539
+ "map@5": 0.8592000000000001,
540
+ "map@10": 0.8629210317460316,
541
+ "mrr@1": 0.799,
542
+ "mrr@5": 0.8592000000000001,
543
+ "mrr@10": 0.8629210317460316,
544
+ "num_pred": 1000,
545
+ "num_data": 1000
546
+ },
547
+ "TextVQA": {
548
+ "hit@1": 0.869,
549
+ "hit@5": 0.926,
550
+ "hit@10": 0.939,
551
+ "ndcg_linear@1": 0.869,
552
+ "ndcg_linear@5": 0.9007951815016322,
553
+ "ndcg_linear@10": 0.9049302047486061,
554
+ "ndcg_exponential@1": 0.869,
555
+ "ndcg_exponential@5": 0.9007951815016322,
556
+ "ndcg_exponential@10": 0.9049302047486061,
557
+ "precision@1": 0.869,
558
+ "precision@5": 0.18520000000000006,
559
+ "precision@10": 0.09390000000000001,
560
+ "recall@1": 0.869,
561
+ "recall@5": 0.926,
562
+ "recall@10": 0.939,
563
+ "f1@1": 0.869,
564
+ "f1@5": 0.3086666666666667,
565
+ "f1@10": 0.1707272727272727,
566
+ "map@1": 0.869,
567
+ "map@5": 0.8922833333333333,
568
+ "map@10": 0.8939476190476191,
569
+ "mrr@1": 0.869,
570
+ "mrr@5": 0.8922833333333333,
571
+ "mrr@10": 0.8939476190476191,
572
+ "num_pred": 1000,
573
+ "num_data": 1000
574
+ },
575
+ "VisDial": {
576
+ "hit@1": 0.818,
577
+ "hit@5": 0.96,
578
+ "hit@10": 0.981,
579
+ "ndcg_linear@1": 0.818,
580
+ "ndcg_linear@5": 0.8981579586925184,
581
+ "ndcg_linear@10": 0.9050024514275745,
582
+ "ndcg_exponential@1": 0.818,
583
+ "ndcg_exponential@5": 0.8981579586925184,
584
+ "ndcg_exponential@10": 0.9050024514275745,
585
+ "precision@1": 0.818,
586
+ "precision@5": 0.192,
587
+ "precision@10": 0.09810000000000002,
588
+ "recall@1": 0.818,
589
+ "recall@5": 0.96,
590
+ "recall@10": 0.981,
591
+ "f1@1": 0.818,
592
+ "f1@5": 0.32,
593
+ "f1@10": 0.17836363636363634,
594
+ "map@1": 0.818,
595
+ "map@5": 0.8771666666666667,
596
+ "map@10": 0.8800242063492063,
597
+ "mrr@1": 0.818,
598
+ "mrr@5": 0.8771666666666667,
599
+ "mrr@10": 0.8800242063492063,
600
+ "num_pred": 1000,
601
+ "num_data": 1000
602
+ },
603
+ "CIRR": {
604
+ "hit@1": 0.552,
605
+ "hit@5": 0.911,
606
+ "hit@10": 0.955,
607
+ "ndcg_linear@1": 0.552,
608
+ "ndcg_linear@5": 0.7544132207910468,
609
+ "ndcg_linear@10": 0.7687461843165859,
610
+ "ndcg_exponential@1": 0.552,
611
+ "ndcg_exponential@5": 0.7544132207910468,
612
+ "ndcg_exponential@10": 0.7687461843165859,
613
+ "precision@1": 0.552,
614
+ "precision@5": 0.18220000000000003,
615
+ "precision@10": 0.09550000000000002,
616
+ "recall@1": 0.552,
617
+ "recall@5": 0.911,
618
+ "recall@10": 0.955,
619
+ "f1@1": 0.552,
620
+ "f1@5": 0.30366666666666664,
621
+ "f1@10": 0.17363636363636362,
622
+ "map@1": 0.552,
623
+ "map@5": 0.7012666666666667,
624
+ "map@10": 0.7072380952380952,
625
+ "mrr@1": 0.552,
626
+ "mrr@5": 0.7012666666666667,
627
+ "mrr@10": 0.7072380952380952,
628
+ "num_pred": 1000,
629
+ "num_data": 1000
630
+ },
631
+ "VisualNews_t2i": {
632
+ "hit@1": 0.801,
633
+ "hit@5": 0.907,
634
+ "hit@10": 0.938,
635
+ "ndcg_linear@1": 0.801,
636
+ "ndcg_linear@5": 0.8594919255925174,
637
+ "ndcg_linear@10": 0.8692591058077485,
638
+ "ndcg_exponential@1": 0.801,
639
+ "ndcg_exponential@5": 0.8594919255925174,
640
+ "ndcg_exponential@10": 0.8692591058077485,
641
+ "precision@1": 0.801,
642
+ "precision@5": 0.18140000000000003,
643
+ "precision@10": 0.09380000000000001,
644
+ "recall@1": 0.801,
645
+ "recall@5": 0.907,
646
+ "recall@10": 0.938,
647
+ "f1@1": 0.801,
648
+ "f1@5": 0.3023333333333334,
649
+ "f1@10": 0.17054545454545453,
650
+ "map@1": 0.801,
651
+ "map@5": 0.8435,
652
+ "map@10": 0.8473805555555555,
653
+ "mrr@1": 0.801,
654
+ "mrr@5": 0.8435,
655
+ "mrr@10": 0.8473805555555555,
656
+ "num_pred": 1000,
657
+ "num_data": 1000
658
+ },
659
+ "VisualNews_i2t": {
660
+ "hit@1": 0.843,
661
+ "hit@5": 0.936,
662
+ "hit@10": 0.953,
663
+ "ndcg_linear@1": 0.843,
664
+ "ndcg_linear@5": 0.8943970763350075,
665
+ "ndcg_linear@10": 0.8998863523941975,
666
+ "ndcg_exponential@1": 0.843,
667
+ "ndcg_exponential@5": 0.8943970763350075,
668
+ "ndcg_exponential@10": 0.8998863523941975,
669
+ "precision@1": 0.843,
670
+ "precision@5": 0.18720000000000003,
671
+ "precision@10": 0.09530000000000001,
672
+ "recall@1": 0.843,
673
+ "recall@5": 0.936,
674
+ "recall@10": 0.953,
675
+ "f1@1": 0.843,
676
+ "f1@5": 0.312,
677
+ "f1@10": 0.17327272727272724,
678
+ "map@1": 0.843,
679
+ "map@5": 0.8803666666666666,
680
+ "map@10": 0.8826269841269841,
681
+ "mrr@1": 0.843,
682
+ "mrr@5": 0.8803666666666666,
683
+ "mrr@10": 0.8826269841269841,
684
+ "num_pred": 1000,
685
+ "num_data": 1000
686
+ },
687
+ "MSCOCO_t2i": {
688
+ "hit@1": 0.793,
689
+ "hit@5": 0.949,
690
+ "hit@10": 0.978,
691
+ "ndcg_linear@1": 0.793,
692
+ "ndcg_linear@5": 0.8803368672984685,
693
+ "ndcg_linear@10": 0.8896818613565582,
694
+ "ndcg_exponential@1": 0.793,
695
+ "ndcg_exponential@5": 0.8803368672984685,
696
+ "ndcg_exponential@10": 0.8896818613565582,
697
+ "precision@1": 0.793,
698
+ "precision@5": 0.18980000000000005,
699
+ "precision@10": 0.09780000000000001,
700
+ "recall@1": 0.793,
701
+ "recall@5": 0.949,
702
+ "recall@10": 0.978,
703
+ "f1@1": 0.793,
704
+ "f1@5": 0.31633333333333336,
705
+ "f1@10": 0.17781818181818182,
706
+ "map@1": 0.793,
707
+ "map@5": 0.8571333333333333,
708
+ "map@10": 0.860975,
709
+ "mrr@1": 0.793,
710
+ "mrr@5": 0.8571333333333333,
711
+ "mrr@10": 0.860975,
712
+ "num_pred": 1000,
713
+ "num_data": 1000
714
+ },
715
+ "MSCOCO_i2t": {
716
+ "hit@1": 0.721,
717
+ "hit@5": 0.943,
718
+ "hit@10": 0.98,
719
+ "ndcg_linear@1": 0.721,
720
+ "ndcg_linear@5": 0.8402135232694316,
721
+ "ndcg_linear@10": 0.8526142772096339,
722
+ "ndcg_exponential@1": 0.721,
723
+ "ndcg_exponential@5": 0.8402135232694316,
724
+ "ndcg_exponential@10": 0.8526142772096339,
725
+ "precision@1": 0.721,
726
+ "precision@5": 0.18860000000000002,
727
+ "precision@10": 0.09800000000000002,
728
+ "recall@1": 0.721,
729
+ "recall@5": 0.943,
730
+ "recall@10": 0.98,
731
+ "f1@1": 0.721,
732
+ "f1@5": 0.3143333333333333,
733
+ "f1@10": 0.17818181818181816,
734
+ "map@1": 0.721,
735
+ "map@5": 0.8058666666666666,
736
+ "map@10": 0.811238492063492,
737
+ "mrr@1": 0.721,
738
+ "mrr@5": 0.8058666666666666,
739
+ "mrr@10": 0.811238492063492,
740
+ "num_pred": 1000,
741
+ "num_data": 1000
742
+ },
743
+ "NIGHTS": {
744
+ "hit@1": 0.662,
745
+ "hit@5": 0.977,
746
+ "hit@10": 0.997,
747
+ "ndcg_linear@1": 0.662,
748
+ "ndcg_linear@5": 0.8428332649702934,
749
+ "ndcg_linear@10": 0.8493732941342598,
750
+ "ndcg_exponential@1": 0.662,
751
+ "ndcg_exponential@5": 0.8428332649702934,
752
+ "ndcg_exponential@10": 0.8493732941342598,
753
+ "precision@1": 0.662,
754
+ "precision@5": 0.19540000000000005,
755
+ "precision@10": 0.09970000000000001,
756
+ "recall@1": 0.662,
757
+ "recall@5": 0.977,
758
+ "recall@10": 0.997,
759
+ "f1@1": 0.662,
760
+ "f1@5": 0.3256666666666666,
761
+ "f1@10": 0.18127272727272725,
762
+ "map@1": 0.662,
763
+ "map@5": 0.797,
764
+ "map@10": 0.7997424603174603,
765
+ "mrr@1": 0.662,
766
+ "mrr@5": 0.797,
767
+ "mrr@10": 0.7997424603174603,
768
+ "num_pred": 1000,
769
+ "num_data": 1000
770
+ },
771
+ "WebQA": {
772
+ "hit@1": 0.919,
773
+ "hit@5": 0.994,
774
+ "hit@10": 0.998,
775
+ "ndcg_linear@1": 0.919,
776
+ "ndcg_linear@5": 0.9619522053693657,
777
+ "ndcg_linear@10": 0.963298983072579,
778
+ "ndcg_exponential@1": 0.919,
779
+ "ndcg_exponential@5": 0.9619522053693657,
780
+ "ndcg_exponential@10": 0.963298983072579,
781
+ "precision@1": 0.919,
782
+ "precision@5": 0.1988,
783
+ "precision@10": 0.09980000000000001,
784
+ "recall@1": 0.919,
785
+ "recall@5": 0.994,
786
+ "recall@10": 0.998,
787
+ "f1@1": 0.919,
788
+ "f1@5": 0.3313333333333333,
789
+ "f1@10": 0.18145454545454545,
790
+ "map@1": 0.919,
791
+ "map@5": 0.9510166666666666,
792
+ "map@10": 0.9516039682539683,
793
+ "mrr@1": 0.919,
794
+ "mrr@5": 0.9510166666666666,
795
+ "mrr@10": 0.9516039682539683,
796
+ "num_pred": 1000,
797
+ "num_data": 1000
798
+ },
799
+ "FashionIQ": {
800
+ "hit@1": 0.243,
801
+ "hit@5": 0.511,
802
+ "hit@10": 0.592,
803
+ "ndcg_linear@1": 0.243,
804
+ "ndcg_linear@5": 0.38383730216588574,
805
+ "ndcg_linear@10": 0.41007610225616226,
806
+ "ndcg_exponential@1": 0.243,
807
+ "ndcg_exponential@5": 0.38383730216588574,
808
+ "ndcg_exponential@10": 0.41007610225616226,
809
+ "precision@1": 0.243,
810
+ "precision@5": 0.10219999999999999,
811
+ "precision@10": 0.05919999999999999,
812
+ "recall@1": 0.243,
813
+ "recall@5": 0.511,
814
+ "recall@10": 0.592,
815
+ "f1@1": 0.243,
816
+ "f1@5": 0.17033333333333336,
817
+ "f1@10": 0.10763636363636366,
818
+ "map@1": 0.243,
819
+ "map@5": 0.3416666666666666,
820
+ "map@10": 0.3525230158730159,
821
+ "mrr@1": 0.243,
822
+ "mrr@5": 0.3416666666666666,
823
+ "mrr@10": 0.3525230158730159,
824
+ "num_pred": 1000,
825
+ "num_data": 1000
826
+ },
827
+ "Wiki-SS-NQ": {
828
+ "hit@1": 0.743,
829
+ "hit@5": 0.909,
830
+ "hit@10": 0.942,
831
+ "ndcg_linear@1": 0.743,
832
+ "ndcg_linear@5": 0.836321433713043,
833
+ "ndcg_linear@10": 0.8472817749175067,
834
+ "ndcg_exponential@1": 0.743,
835
+ "ndcg_exponential@5": 0.836321433713043,
836
+ "ndcg_exponential@10": 0.8472817749175067,
837
+ "precision@1": 0.743,
838
+ "precision@5": 0.18180000000000002,
839
+ "precision@10": 0.09420000000000002,
840
+ "recall@1": 0.743,
841
+ "recall@5": 0.909,
842
+ "recall@10": 0.942,
843
+ "f1@1": 0.743,
844
+ "f1@5": 0.303,
845
+ "f1@10": 0.17127272727272724,
846
+ "map@1": 0.743,
847
+ "map@5": 0.81165,
848
+ "map@10": 0.8163456349206349,
849
+ "mrr@1": 0.743,
850
+ "mrr@5": 0.81165,
851
+ "mrr@10": 0.8163456349206349,
852
+ "num_pred": 1000,
853
+ "num_data": 1000
854
+ },
855
+ "OVEN": {
856
+ "hit@1": 0.732,
857
+ "hit@5": 0.88,
858
+ "hit@10": 0.925,
859
+ "ndcg_linear@1": 0.732,
860
+ "ndcg_linear@5": 0.813023413986076,
861
+ "ndcg_linear@10": 0.8277770061315273,
862
+ "ndcg_exponential@1": 0.732,
863
+ "ndcg_exponential@5": 0.813023413986076,
864
+ "ndcg_exponential@10": 0.8277770061315273,
865
+ "precision@1": 0.732,
866
+ "precision@5": 0.176,
867
+ "precision@10": 0.0925,
868
+ "recall@1": 0.732,
869
+ "recall@5": 0.88,
870
+ "recall@10": 0.925,
871
+ "f1@1": 0.732,
872
+ "f1@5": 0.2933333333333334,
873
+ "f1@10": 0.16818181818181818,
874
+ "map@1": 0.732,
875
+ "map@5": 0.7905333333333333,
876
+ "map@10": 0.796740476190476,
877
+ "mrr@1": 0.732,
878
+ "mrr@5": 0.7905333333333333,
879
+ "mrr@10": 0.796740476190476,
880
+ "num_pred": 1000,
881
+ "num_data": 1000
882
+ },
883
+ "EDIS": {
884
+ "hit@1": 0.944,
885
+ "hit@5": 0.997,
886
+ "hit@10": 1.0,
887
+ "ndcg_linear@1": 0.944,
888
+ "ndcg_linear@5": 0.9750516728538741,
889
+ "ndcg_linear@10": 0.9760124097440858,
890
+ "ndcg_exponential@1": 0.944,
891
+ "ndcg_exponential@5": 0.9750516728538741,
892
+ "ndcg_exponential@10": 0.9760124097440858,
893
+ "precision@1": 0.944,
894
+ "precision@5": 0.19940000000000002,
895
+ "precision@10": 0.10000000000000002,
896
+ "recall@1": 0.944,
897
+ "recall@5": 0.997,
898
+ "recall@10": 1.0,
899
+ "f1@1": 0.944,
900
+ "f1@5": 0.3323333333333333,
901
+ "f1@10": 0.18181818181818182,
902
+ "map@1": 0.944,
903
+ "map@5": 0.9675,
904
+ "map@10": 0.9678916666666667,
905
+ "mrr@1": 0.944,
906
+ "mrr@5": 0.9675,
907
+ "mrr@10": 0.9678916666666667,
908
+ "num_pred": 1000,
909
+ "num_data": 1000
910
+ },
911
+ "MSCOCO": {
912
+ "hit@1": 0.739,
913
+ "hit@5": 0.879,
914
+ "hit@10": 0.912,
915
+ "ndcg_linear@1": 0.739,
916
+ "ndcg_linear@5": 0.8170358283919806,
917
+ "ndcg_linear@10": 0.8278094286114062,
918
+ "ndcg_exponential@1": 0.739,
919
+ "ndcg_exponential@5": 0.8170358283919806,
920
+ "ndcg_exponential@10": 0.8278094286114062,
921
+ "precision@1": 0.739,
922
+ "precision@5": 0.1758,
923
+ "precision@10": 0.09120000000000002,
924
+ "recall@1": 0.739,
925
+ "recall@5": 0.879,
926
+ "recall@10": 0.912,
927
+ "f1@1": 0.739,
928
+ "f1@5": 0.293,
929
+ "f1@10": 0.1658181818181818,
930
+ "map@1": 0.739,
931
+ "map@5": 0.7961166666666667,
932
+ "map@10": 0.8006210317460317,
933
+ "mrr@1": 0.739,
934
+ "mrr@5": 0.7961166666666667,
935
+ "mrr@10": 0.8006210317460317,
936
+ "num_pred": 1000,
937
+ "num_data": 1000
938
+ },
939
+ "RefCOCO": {
940
+ "hit@1": 0.904,
941
+ "hit@5": 0.995,
942
+ "hit@10": 0.997,
943
+ "ndcg_linear@1": 0.904,
944
+ "ndcg_linear@5": 0.9580666714637504,
945
+ "ndcg_linear@10": 0.9587154696738694,
946
+ "ndcg_exponential@1": 0.904,
947
+ "ndcg_exponential@5": 0.9580666714637504,
948
+ "ndcg_exponential@10": 0.9587154696738694,
949
+ "precision@1": 0.904,
950
+ "precision@5": 0.19900000000000004,
951
+ "precision@10": 0.09970000000000001,
952
+ "recall@1": 0.904,
953
+ "recall@5": 0.995,
954
+ "recall@10": 0.997,
955
+ "f1@1": 0.904,
956
+ "f1@5": 0.3316666666666666,
957
+ "f1@10": 0.18127272727272725,
958
+ "map@1": 0.904,
959
+ "map@5": 0.9452833333333333,
960
+ "map@10": 0.9455511904761904,
961
+ "mrr@1": 0.904,
962
+ "mrr@5": 0.9452833333333333,
963
+ "mrr@10": 0.9455511904761904,
964
+ "num_pred": 1000,
965
+ "num_data": 1000
966
+ },
967
+ "RefCOCO-Matching": {
968
+ "hit@1": 0.927,
969
+ "hit@5": 1.0,
970
+ "hit@10": 1.0,
971
+ "ndcg_linear@1": 0.927,
972
+ "ndcg_linear@5": 0.9748197315178593,
973
+ "ndcg_linear@10": 0.9748197315178593,
974
+ "ndcg_exponential@1": 0.927,
975
+ "ndcg_exponential@5": 0.9748197315178593,
976
+ "ndcg_exponential@10": 0.9748197315178593,
977
+ "precision@1": 0.927,
978
+ "precision@5": 0.20000000000000004,
979
+ "precision@10": 0.10000000000000002,
980
+ "recall@1": 0.927,
981
+ "recall@5": 1.0,
982
+ "recall@10": 1.0,
983
+ "f1@1": 0.927,
984
+ "f1@5": 0.3333333333333333,
985
+ "f1@10": 0.18181818181818182,
986
+ "map@1": 0.927,
987
+ "map@5": 0.9663333333333334,
988
+ "map@10": 0.9663333333333334,
989
+ "mrr@1": 0.927,
990
+ "mrr@5": 0.9633333333333334,
991
+ "mrr@10": 0.9633333333333334,
992
+ "num_pred": 1000,
993
+ "num_data": 1000
994
+ },
995
+ "Visual7W-Pointing": {
996
+ "hit@1": 0.916,
997
+ "hit@5": 0.971,
998
+ "hit@10": 0.985,
999
+ "ndcg_linear@1": 0.916,
1000
+ "ndcg_linear@5": 0.9457068095214094,
1001
+ "ndcg_linear@10": 0.9502811890192044,
1002
+ "ndcg_exponential@1": 0.916,
1003
+ "ndcg_exponential@5": 0.9457068095214094,
1004
+ "ndcg_exponential@10": 0.9502811890192044,
1005
+ "precision@1": 0.916,
1006
+ "precision@5": 0.19420000000000004,
1007
+ "precision@10": 0.0985,
1008
+ "recall@1": 0.916,
1009
+ "recall@5": 0.971,
1010
+ "recall@10": 0.985,
1011
+ "f1@1": 0.916,
1012
+ "f1@5": 0.32366666666666666,
1013
+ "f1@10": 0.17909090909090908,
1014
+ "map@1": 0.916,
1015
+ "map@5": 0.93725,
1016
+ "map@10": 0.939163492063492,
1017
+ "mrr@1": 0.916,
1018
+ "mrr@5": 0.93725,
1019
+ "mrr@10": 0.939163492063492,
1020
+ "num_pred": 1000,
1021
+ "num_data": 1000
1022
+ }
1023
+ },
1024
+ "visdoc": {
1025
+ "ViDoRe_arxivqa": {
1026
+ "hit@1": 0.724,
1027
+ "hit@5": 0.832,
1028
+ "hit@10": 0.87,
1029
+ "ndcg_linear@1": 0.724,
1030
+ "ndcg_linear@5": 0.7816980877667238,
1031
+ "ndcg_linear@10": 0.794154083491937,
1032
+ "ndcg_exponential@1": 0.724,
1033
+ "ndcg_exponential@5": 0.7816980877667238,
1034
+ "ndcg_exponential@10": 0.794154083491937,
1035
+ "precision@1": 0.724,
1036
+ "precision@5": 0.16639999999999996,
1037
+ "precision@10": 0.087,
1038
+ "recall@1": 0.724,
1039
+ "recall@5": 0.832,
1040
+ "recall@10": 0.87,
1041
+ "f1@1": 0.724,
1042
+ "f1@5": 0.2773333333333334,
1043
+ "f1@10": 0.15818181818181817,
1044
+ "map@1": 0.724,
1045
+ "map@5": 0.765,
1046
+ "map@10": 0.7702428571428571,
1047
+ "mrr@1": 0.724,
1048
+ "mrr@5": 0.765,
1049
+ "mrr@10": 0.7702428571428571,
1050
+ "num_pred": 500,
1051
+ "num_data": 500
1052
+ },
1053
+ "ViDoRe_docvqa": {
1054
+ "hit@1": 0.4035476718403548,
1055
+ "hit@5": 0.5809312638580931,
1056
+ "hit@10": 0.656319290465632,
1057
+ "ndcg_linear@1": 0.4035476718403548,
1058
+ "ndcg_linear@5": 0.49108570677106456,
1059
+ "ndcg_linear@10": 0.5147397758959128,
1060
+ "ndcg_exponential@1": 0.4035476718403548,
1061
+ "ndcg_exponential@5": 0.49108570677106456,
1062
+ "ndcg_exponential@10": 0.5147397758959128,
1063
+ "precision@1": 0.4035476718403548,
1064
+ "precision@5": 0.11707317073170731,
1065
+ "precision@10": 0.06651884700665188,
1066
+ "recall@1": 0.3976164079822616,
1067
+ "recall@5": 0.5727827050997782,
1068
+ "recall@10": 0.644660014781966,
1069
+ "f1@1": 0.39886671593988665,
1070
+ "f1@5": 0.19345167028093863,
1071
+ "f1@10": 0.1199606503375905,
1072
+ "map@1": 0.4035476718403548,
1073
+ "map@5": 0.4620349839862035,
1074
+ "map@10": 0.4718157181571816,
1075
+ "mrr@1": 0.4035476718403548,
1076
+ "mrr@5": 0.46855136733185515,
1077
+ "mrr@10": 0.47878523915109283,
1078
+ "num_pred": 451,
1079
+ "num_data": 451
1080
+ },
1081
+ "ViDoRe_infovqa": {
1082
+ "hit@1": 0.7995951417004049,
1083
+ "hit@5": 0.9230769230769231,
1084
+ "hit@10": 0.9473684210526315,
1085
+ "ndcg_linear@1": 0.7995951417004049,
1086
+ "ndcg_linear@5": 0.8658530746439195,
1087
+ "ndcg_linear@10": 0.8742677543682404,
1088
+ "ndcg_exponential@1": 0.7995951417004049,
1089
+ "ndcg_exponential@5": 0.8658530746439195,
1090
+ "ndcg_exponential@10": 0.8742677543682404,
1091
+ "precision@1": 0.7995951417004049,
1092
+ "precision@5": 0.18461538461538463,
1093
+ "precision@10": 0.09493927125506073,
1094
+ "recall@1": 0.7995951417004049,
1095
+ "recall@5": 0.9220647773279352,
1096
+ "recall@10": 0.9473684210526315,
1097
+ "f1@1": 0.7995951417004049,
1098
+ "f1@5": 0.3075959128590708,
1099
+ "f1@10": 0.17255551466077784,
1100
+ "map@1": 0.7995951417004049,
1101
+ "map@5": 0.8468960863697707,
1102
+ "map@10": 0.8505888117730224,
1103
+ "mrr@1": 0.7995951417004049,
1104
+ "mrr@5": 0.8472334682860998,
1105
+ "mrr@10": 0.8505888117730224,
1106
+ "num_pred": 494,
1107
+ "num_data": 494
1108
+ },
1109
+ "ViDoRe_tabfquad": {
1110
+ "hit@1": 0.8714285714285714,
1111
+ "hit@5": 0.95,
1112
+ "hit@10": 0.975,
1113
+ "ndcg_linear@1": 0.8714285714285714,
1114
+ "ndcg_linear@5": 0.9139664212357709,
1115
+ "ndcg_linear@10": 0.9218222423868873,
1116
+ "ndcg_exponential@1": 0.8714285714285714,
1117
+ "ndcg_exponential@5": 0.9139664212357709,
1118
+ "ndcg_exponential@10": 0.9218222423868873,
1119
+ "precision@1": 0.8714285714285714,
1120
+ "precision@5": 0.18999999999999997,
1121
+ "precision@10": 0.09749999999999999,
1122
+ "recall@1": 0.8714285714285714,
1123
+ "recall@5": 0.95,
1124
+ "recall@10": 0.975,
1125
+ "f1@1": 0.8714285714285714,
1126
+ "f1@5": 0.3166666666666667,
1127
+ "f1@10": 0.1772727272727273,
1128
+ "map@1": 0.8714285714285714,
1129
+ "map@5": 0.9019642857142858,
1130
+ "map@10": 0.9050736961451247,
1131
+ "mrr@1": 0.8714285714285714,
1132
+ "mrr@5": 0.9019642857142858,
1133
+ "mrr@10": 0.9050736961451247,
1134
+ "num_pred": 280,
1135
+ "num_data": 280
1136
+ },
1137
+ "ViDoRe_tatdqa": {
1138
+ "hit@1": 0.41433778857837184,
1139
+ "hit@5": 0.6834750911300121,
1140
+ "hit@10": 0.7885783718104495,
1141
+ "ndcg_linear@1": 0.41433778857837184,
1142
+ "ndcg_linear@5": 0.5558704694199323,
1143
+ "ndcg_linear@10": 0.5903675001014173,
1144
+ "ndcg_exponential@1": 0.41433778857837184,
1145
+ "ndcg_exponential@5": 0.5558704694199323,
1146
+ "ndcg_exponential@10": 0.5903675001014173,
1147
+ "precision@1": 0.41433778857837184,
1148
+ "precision@5": 0.13681652490886997,
1149
+ "precision@10": 0.0789793438639125,
1150
+ "recall@1": 0.413730255164034,
1151
+ "recall@5": 0.6825637910085055,
1152
+ "recall@10": 0.7879708383961118,
1153
+ "f1@1": 0.4139327663021466,
1154
+ "f1@5": 0.2278828907018458,
1155
+ "f1@10": 0.14354357671490117,
1156
+ "map@1": 0.41433778857837184,
1157
+ "map@5": 0.5136644390441474,
1158
+ "map@10": 0.5281519412139096,
1159
+ "mrr@1": 0.41433778857837184,
1160
+ "mrr@5": 0.5141454029971648,
1161
+ "mrr@10": 0.5285721518254932,
1162
+ "num_pred": 1646,
1163
+ "num_data": 1646
1164
+ },
1165
+ "ViDoRe_shiftproject": {
1166
+ "hit@1": 0.61,
1167
+ "hit@5": 0.89,
1168
+ "hit@10": 0.97,
1169
+ "ndcg_linear@1": 0.61,
1170
+ "ndcg_linear@5": 0.7617245745315876,
1171
+ "ndcg_linear@10": 0.7890774568114949,
1172
+ "ndcg_exponential@1": 0.61,
1173
+ "ndcg_exponential@5": 0.7617245745315876,
1174
+ "ndcg_exponential@10": 0.7890774568114949,
1175
+ "precision@1": 0.61,
1176
+ "precision@5": 0.17799999999999996,
1177
+ "precision@10": 0.09699999999999998,
1178
+ "recall@1": 0.61,
1179
+ "recall@5": 0.89,
1180
+ "recall@10": 0.97,
1181
+ "f1@1": 0.61,
1182
+ "f1@5": 0.2966666666666667,
1183
+ "f1@10": 0.17636363636363647,
1184
+ "map@1": 0.61,
1185
+ "map@5": 0.7188333333333334,
1186
+ "map@10": 0.7309761904761904,
1187
+ "mrr@1": 0.61,
1188
+ "mrr@5": 0.7188333333333334,
1189
+ "mrr@10": 0.7309761904761904,
1190
+ "num_pred": 100,
1191
+ "num_data": 100
1192
+ },
1193
+ "ViDoRe_syntheticDocQA_artificial_intelligence": {
1194
+ "hit@1": 0.8,
1195
+ "hit@5": 0.99,
1196
+ "hit@10": 1.0,
1197
+ "ndcg_linear@1": 0.8,
1198
+ "ndcg_linear@5": 0.9099357497170402,
1199
+ "ndcg_linear@10": 0.912826397980219,
1200
+ "ndcg_exponential@1": 0.8,
1201
+ "ndcg_exponential@5": 0.9099357497170402,
1202
+ "ndcg_exponential@10": 0.912826397980219,
1203
+ "precision@1": 0.8,
1204
+ "precision@5": 0.19799999999999998,
1205
+ "precision@10": 0.09999999999999998,
1206
+ "recall@1": 0.8,
1207
+ "recall@5": 0.99,
1208
+ "recall@10": 1.0,
1209
+ "f1@1": 0.8,
1210
+ "f1@5": 0.3300000000000001,
1211
+ "f1@10": 0.1818181818181819,
1212
+ "map@1": 0.8,
1213
+ "map@5": 0.8826666666666667,
1214
+ "map@10": 0.8836666666666666,
1215
+ "mrr@1": 0.8,
1216
+ "mrr@5": 0.8826666666666667,
1217
+ "mrr@10": 0.8836666666666666,
1218
+ "num_pred": 100,
1219
+ "num_data": 100
1220
+ },
1221
+ "ViDoRe_syntheticDocQA_energy": {
1222
+ "hit@1": 0.81,
1223
+ "hit@5": 0.93,
1224
+ "hit@10": 0.96,
1225
+ "ndcg_linear@1": 0.81,
1226
+ "ndcg_linear@5": 0.8753378444481007,
1227
+ "ndcg_linear@10": 0.8856166369581185,
1228
+ "ndcg_exponential@1": 0.81,
1229
+ "ndcg_exponential@5": 0.8753378444481007,
1230
+ "ndcg_exponential@10": 0.8856166369581185,
1231
+ "precision@1": 0.81,
1232
+ "precision@5": 0.18599999999999994,
1233
+ "precision@10": 0.09599999999999997,
1234
+ "recall@1": 0.81,
1235
+ "recall@5": 0.93,
1236
+ "recall@10": 0.96,
1237
+ "f1@1": 0.81,
1238
+ "f1@5": 0.31000000000000005,
1239
+ "f1@10": 0.17454545454545464,
1240
+ "map@1": 0.81,
1241
+ "map@5": 0.8569999999999999,
1242
+ "map@10": 0.8615833333333334,
1243
+ "mrr@1": 0.81,
1244
+ "mrr@5": 0.8569999999999999,
1245
+ "mrr@10": 0.8615833333333334,
1246
+ "num_pred": 100,
1247
+ "num_data": 100
1248
+ },
1249
+ "ViDoRe_syntheticDocQA_government_reports": {
1250
+ "hit@1": 0.83,
1251
+ "hit@5": 0.98,
1252
+ "hit@10": 0.99,
1253
+ "ndcg_linear@1": 0.83,
1254
+ "ndcg_linear@5": 0.9155750345909591,
1255
+ "ndcg_linear@10": 0.9187296833588163,
1256
+ "ndcg_exponential@1": 0.83,
1257
+ "ndcg_exponential@5": 0.9155750345909591,
1258
+ "ndcg_exponential@10": 0.9187296833588163,
1259
+ "precision@1": 0.83,
1260
+ "precision@5": 0.19599999999999998,
1261
+ "precision@10": 0.09899999999999999,
1262
+ "recall@1": 0.83,
1263
+ "recall@5": 0.98,
1264
+ "recall@10": 0.99,
1265
+ "f1@1": 0.83,
1266
+ "f1@5": 0.32666666666666677,
1267
+ "f1@10": 0.18000000000000008,
1268
+ "map@1": 0.83,
1269
+ "map@5": 0.8936666666666667,
1270
+ "map@10": 0.8949166666666667,
1271
+ "mrr@1": 0.83,
1272
+ "mrr@5": 0.8936666666666667,
1273
+ "mrr@10": 0.8949166666666667,
1274
+ "num_pred": 100,
1275
+ "num_data": 100
1276
+ },
1277
+ "ViDoRe_syntheticDocQA_healthcare_industry": {
1278
+ "hit@1": 0.85,
1279
+ "hit@5": 0.99,
1280
+ "hit@10": 1.0,
1281
+ "ndcg_linear@1": 0.85,
1282
+ "ndcg_linear@5": 0.9338868640816542,
1283
+ "ndcg_linear@10": 0.9372201974149877,
1284
+ "ndcg_exponential@1": 0.85,
1285
+ "ndcg_exponential@5": 0.9338868640816542,
1286
+ "ndcg_exponential@10": 0.9372201974149877,
1287
+ "precision@1": 0.85,
1288
+ "precision@5": 0.19799999999999998,
1289
+ "precision@10": 0.09999999999999998,
1290
+ "recall@1": 0.85,
1291
+ "recall@5": 0.99,
1292
+ "recall@10": 1.0,
1293
+ "f1@1": 0.85,
1294
+ "f1@5": 0.3300000000000001,
1295
+ "f1@10": 0.1818181818181819,
1296
+ "map@1": 0.85,
1297
+ "map@5": 0.9145,
1298
+ "map@10": 0.9159285714285714,
1299
+ "mrr@1": 0.85,
1300
+ "mrr@5": 0.9145,
1301
+ "mrr@10": 0.9159285714285714,
1302
+ "num_pred": 100,
1303
+ "num_data": 100
1304
+ },
1305
+ "ViDoRe_esg_reports_human_labeled_v2": {
1306
+ "hit@1": 0.6346153846153846,
1307
+ "hit@5": 0.9038461538461539,
1308
+ "hit@10": 0.9423076923076923,
1309
+ "ndcg_linear@1": 0.6121794871794872,
1310
+ "ndcg_linear@5": 0.662739973618026,
1311
+ "ndcg_linear@10": 0.6801813086670259,
1312
+ "ndcg_exponential@1": 0.6053113553113553,
1313
+ "ndcg_exponential@5": 0.6592882828242518,
1314
+ "ndcg_exponential@10": 0.6769628635131526,
1315
+ "precision@1": 0.6346153846153846,
1316
+ "precision@5": 0.25769230769230766,
1317
+ "precision@10": 0.15192307692307688,
1318
+ "recall@1": 0.4403846153846154,
1319
+ "recall@5": 0.7235431235431237,
1320
+ "recall@10": 0.7684232434232434,
1321
+ "f1@1": 0.4834530853761623,
1322
+ "f1@5": 0.34303453899042136,
1323
+ "f1@10": 0.2309966899616221,
1324
+ "map@1": 0.6346153846153846,
1325
+ "map@5": 0.6022649572649572,
1326
+ "map@10": 0.6106303418803418,
1327
+ "mrr@1": 0.6346153846153846,
1328
+ "mrr@5": 0.7346153846153847,
1329
+ "mrr@10": 0.7389423076923076,
1330
+ "num_pred": 52,
1331
+ "num_data": 52
1332
+ },
1333
+ "ViDoRe_biomedical_lectures_v2": {
1334
+ "hit@1": 0.5375,
1335
+ "hit@5": 0.83125,
1336
+ "hit@10": 0.90625,
1337
+ "ndcg_linear@1": 0.5375,
1338
+ "ndcg_linear@5": 0.5841363021080512,
1339
+ "ndcg_linear@10": 0.6249174529201162,
1340
+ "ndcg_exponential@1": 0.5375,
1341
+ "ndcg_exponential@5": 0.5841363021080511,
1342
+ "ndcg_exponential@10": 0.6249174529201162,
1343
+ "precision@1": 0.5375,
1344
+ "precision@5": 0.26875,
1345
+ "precision@10": 0.17625000000000002,
1346
+ "recall@1": 0.3166321699134199,
1347
+ "recall@5": 0.6228635886172651,
1348
+ "recall@10": 0.7383215161064426,
1349
+ "f1@1": 0.36593299062049056,
1350
+ "f1@5": 0.33357090132090134,
1351
+ "f1@10": 0.25719967294415824,
1352
+ "map@1": 0.5375,
1353
+ "map@5": 0.5106197916666666,
1354
+ "map@10": 0.5325731233465609,
1355
+ "mrr@1": 0.5375,
1356
+ "mrr@5": 0.6533333333333333,
1357
+ "mrr@10": 0.6637103174603174,
1358
+ "num_pred": 160,
1359
+ "num_data": 160
1360
+ },
1361
+ "ViDoRe_biomedical_lectures_v2_multilingual": {
1362
+ "hit@1": 0.490625,
1363
+ "hit@5": 0.8,
1364
+ "hit@10": 0.890625,
1365
+ "ndcg_linear@1": 0.490625,
1366
+ "ndcg_linear@5": 0.5434414494513081,
1367
+ "ndcg_linear@10": 0.582249778808178,
1368
+ "ndcg_exponential@1": 0.490625,
1369
+ "ndcg_exponential@5": 0.5434414494513081,
1370
+ "ndcg_exponential@10": 0.5822497788081779,
1371
+ "precision@1": 0.490625,
1372
+ "precision@5": 0.254375,
1373
+ "precision@10": 0.16359375000000004,
1374
+ "recall@1": 0.2916960107906799,
1375
+ "recall@5": 0.58253104508849,
1376
+ "recall@10": 0.6977005104059648,
1377
+ "f1@1": 0.33520032918470416,
1378
+ "f1@5": 0.3138615559787435,
1379
+ "f1@10": 0.23995111426223562,
1380
+ "map@1": 0.490625,
1381
+ "map@5": 0.47009505208333324,
1382
+ "map@10": 0.48930132895171957,
1383
+ "mrr@1": 0.490625,
1384
+ "mrr@5": 0.613671875,
1385
+ "mrr@10": 0.6260515873015873,
1386
+ "num_pred": 640,
1387
+ "num_data": 640
1388
+ },
1389
+ "ViDoRe_economics_reports_v2": {
1390
+ "hit@1": 0.7586206896551724,
1391
+ "hit@5": 0.9655172413793104,
1392
+ "hit@10": 0.9655172413793104,
1393
+ "ndcg_linear@1": 0.7586206896551724,
1394
+ "ndcg_linear@5": 0.6741969787919236,
1395
+ "ndcg_linear@10": 0.6190122003564679,
1396
+ "ndcg_exponential@1": 0.7586206896551724,
1397
+ "ndcg_exponential@5": 0.6741969787919236,
1398
+ "ndcg_exponential@10": 0.6190122003564678,
1399
+ "precision@1": 0.7586206896551724,
1400
+ "precision@5": 0.5758620689655172,
1401
+ "precision@10": 0.42241379310344823,
1402
+ "recall@1": 0.13585704413718444,
1403
+ "recall@5": 0.35288134780268793,
1404
+ "recall@10": 0.45080357139107263,
1405
+ "f1@1": 0.1922091657419343,
1406
+ "f1@5": 0.356547764866831,
1407
+ "f1@10": 0.3541634848772098,
1408
+ "map@1": 0.7586206896551724,
1409
+ "map@5": 0.5760680076628353,
1410
+ "map@10": 0.48829512775736095,
1411
+ "mrr@1": 0.7586206896551724,
1412
+ "mrr@5": 0.8482758620689655,
1413
+ "mrr@10": 0.8482758620689655,
1414
+ "num_pred": 58,
1415
+ "num_data": 58
1416
+ },
1417
+ "ViDoRe_economics_reports_v2_multilingual": {
1418
+ "hit@1": 0.7025862068965517,
1419
+ "hit@5": 0.9267241379310345,
1420
+ "hit@10": 0.9655172413793104,
1421
+ "ndcg_linear@1": 0.7025862068965517,
1422
+ "ndcg_linear@5": 0.6092499057967938,
1423
+ "ndcg_linear@10": 0.5716310478677091,
1424
+ "ndcg_exponential@1": 0.7025862068965517,
1425
+ "ndcg_exponential@5": 0.6092499057967938,
1426
+ "ndcg_exponential@10": 0.571631047867709,
1427
+ "precision@1": 0.7025862068965517,
1428
+ "precision@5": 0.5137931034482759,
1429
+ "precision@10": 0.39525862068965517,
1430
+ "recall@1": 0.11976302485978202,
1431
+ "recall@5": 0.3134997052445495,
1432
+ "recall@10": 0.4232996166582797,
1433
+ "f1@1": 0.1708464717573616,
1434
+ "f1@5": 0.3146703201409769,
1435
+ "f1@10": 0.330190439964704,
1436
+ "map@1": 0.7025862068965517,
1437
+ "map@5": 0.5082686781609195,
1438
+ "map@10": 0.43586978908741014,
1439
+ "mrr@1": 0.7025862068965517,
1440
+ "mrr@5": 0.7956896551724137,
1441
+ "mrr@10": 0.8007714148877941,
1442
+ "num_pred": 232,
1443
+ "num_data": 232
1444
+ },
1445
+ "ViDoRe_esg_reports_v2": {
1446
+ "hit@1": 0.6491228070175439,
1447
+ "hit@5": 0.8596491228070176,
1448
+ "hit@10": 0.8947368421052632,
1449
+ "ndcg_linear@1": 0.6491228070175439,
1450
+ "ndcg_linear@5": 0.5997641652064848,
1451
+ "ndcg_linear@10": 0.634484891188961,
1452
+ "ndcg_exponential@1": 0.6491228070175439,
1453
+ "ndcg_exponential@5": 0.5997641652064848,
1454
+ "ndcg_exponential@10": 0.634484891188961,
1455
+ "precision@1": 0.6491228070175439,
1456
+ "precision@5": 0.2771929824561403,
1457
+ "precision@10": 0.18771929824561406,
1458
+ "recall@1": 0.31802422723475354,
1459
+ "recall@5": 0.5807435254803677,
1460
+ "recall@10": 0.7037802840434418,
1461
+ "f1@1": 0.3830497579317905,
1462
+ "f1@5": 0.3241369172563792,
1463
+ "f1@10": 0.2614094743270236,
1464
+ "map@1": 0.6491228070175439,
1465
+ "map@5": 0.5182261208576998,
1466
+ "map@10": 0.5330643158822983,
1467
+ "mrr@1": 0.6491228070175439,
1468
+ "mrr@5": 0.7368421052631579,
1469
+ "mrr@10": 0.7407407407407407,
1470
+ "num_pred": 57,
1471
+ "num_data": 57
1472
+ },
1473
+ "ViDoRe_esg_reports_v2_multilingual": {
1474
+ "hit@1": 0.5614035087719298,
1475
+ "hit@5": 0.8552631578947368,
1476
+ "hit@10": 0.8903508771929824,
1477
+ "ndcg_linear@1": 0.5614035087719298,
1478
+ "ndcg_linear@5": 0.5682060214897536,
1479
+ "ndcg_linear@10": 0.5985226385169606,
1480
+ "ndcg_exponential@1": 0.5614035087719298,
1481
+ "ndcg_exponential@5": 0.5682060214897536,
1482
+ "ndcg_exponential@10": 0.5985226385169606,
1483
+ "precision@1": 0.5614035087719298,
1484
+ "precision@5": 0.2754385964912281,
1485
+ "precision@10": 0.18026315789473688,
1486
+ "recall@1": 0.27056182121971595,
1487
+ "recall@5": 0.5850093984962406,
1488
+ "recall@10": 0.6978070175438598,
1489
+ "f1@1": 0.32801009939848413,
1490
+ "f1@5": 0.32522762720398035,
1491
+ "f1@10": 0.25474615009098467,
1492
+ "map@1": 0.5614035087719298,
1493
+ "map@5": 0.47951510721247564,
1494
+ "map@10": 0.49127398937157707,
1495
+ "mrr@1": 0.5614035087719298,
1496
+ "mrr@5": 0.6827485380116959,
1497
+ "mrr@10": 0.6871031746031746,
1498
+ "num_pred": 228,
1499
+ "num_data": 228
1500
+ },
1501
+ "VisRAG_ArxivQA": {
1502
+ "hit@1": 0.7120098039215687,
1503
+ "hit@5": 0.8382352941176471,
1504
+ "hit@10": 0.8700980392156863,
1505
+ "ndcg_linear@1": 0.7120098039215687,
1506
+ "ndcg_linear@5": 0.7817991444138218,
1507
+ "ndcg_linear@10": 0.7919346368535662,
1508
+ "ndcg_exponential@1": 0.7120098039215687,
1509
+ "ndcg_exponential@5": 0.7817991444138218,
1510
+ "ndcg_exponential@10": 0.7919346368535662,
1511
+ "precision@1": 0.7120098039215687,
1512
+ "precision@5": 0.1676470588235294,
1513
+ "precision@10": 0.08700980392156861,
1514
+ "recall@1": 0.7120098039215687,
1515
+ "recall@5": 0.8382352941176471,
1516
+ "recall@10": 0.8700980392156863,
1517
+ "f1@1": 0.7120098039215687,
1518
+ "f1@5": 0.2794117647058824,
1519
+ "f1@10": 0.15819964349376117,
1520
+ "map@1": 0.7120098039215687,
1521
+ "map@5": 0.7628063725490196,
1522
+ "map@10": 0.766889880952381,
1523
+ "mrr@1": 0.7120098039215687,
1524
+ "mrr@5": 0.7628063725490196,
1525
+ "mrr@10": 0.766889880952381,
1526
+ "num_pred": 816,
1527
+ "num_data": 816
1528
+ },
1529
+ "VisRAG_ChartQA": {
1530
+ "hit@1": 0.6984126984126984,
1531
+ "hit@5": 0.873015873015873,
1532
+ "hit@10": 0.9206349206349206,
1533
+ "ndcg_linear@1": 0.6984126984126984,
1534
+ "ndcg_linear@5": 0.7952874267735981,
1535
+ "ndcg_linear@10": 0.8103095637633947,
1536
+ "ndcg_exponential@1": 0.6984126984126984,
1537
+ "ndcg_exponential@5": 0.7952874267735981,
1538
+ "ndcg_exponential@10": 0.8103095637633947,
1539
+ "precision@1": 0.6984126984126984,
1540
+ "precision@5": 0.17460317460317454,
1541
+ "precision@10": 0.09206349206349201,
1542
+ "recall@1": 0.6984126984126984,
1543
+ "recall@5": 0.873015873015873,
1544
+ "recall@10": 0.9206349206349206,
1545
+ "f1@1": 0.6984126984126984,
1546
+ "f1@5": 0.291005291005291,
1547
+ "f1@10": 0.16738816738816742,
1548
+ "map@1": 0.6984126984126984,
1549
+ "map@5": 0.7690476190476191,
1550
+ "map@10": 0.775,
1551
+ "mrr@1": 0.6984126984126984,
1552
+ "mrr@5": 0.7690476190476191,
1553
+ "mrr@10": 0.775,
1554
+ "num_pred": 63,
1555
+ "num_data": 63
1556
+ },
1557
+ "VisRAG_MP-DocVQA": {
1558
+ "hit@1": 0.6903553299492385,
1559
+ "hit@5": 0.8595600676818951,
1560
+ "hit@10": 0.9204737732656514,
1561
+ "ndcg_linear@1": 0.6903553299492385,
1562
+ "ndcg_linear@5": 0.7820349036494127,
1563
+ "ndcg_linear@10": 0.8015893567306576,
1564
+ "ndcg_exponential@1": 0.6903553299492385,
1565
+ "ndcg_exponential@5": 0.7820349036494127,
1566
+ "ndcg_exponential@10": 0.8015893567306576,
1567
+ "precision@1": 0.6903553299492385,
1568
+ "precision@5": 0.171912013536379,
1569
+ "precision@10": 0.09204737732656514,
1570
+ "recall@1": 0.6903553299492385,
1571
+ "recall@5": 0.8595600676818951,
1572
+ "recall@10": 0.9204737732656514,
1573
+ "f1@1": 0.6903553299492385,
1574
+ "f1@5": 0.28652002256063175,
1575
+ "f1@10": 0.16735886786648213,
1576
+ "map@1": 0.6903553299492385,
1577
+ "map@5": 0.7560631697687535,
1578
+ "map@10": 0.7640507077055301,
1579
+ "mrr@1": 0.6903553299492385,
1580
+ "mrr@5": 0.7560631697687535,
1581
+ "mrr@10": 0.7640507077055301,
1582
+ "num_pred": 591,
1583
+ "num_data": 591
1584
+ },
1585
+ "VisRAG_SlideVQA": {
1586
+ "hit@1": 0.8794964028776978,
1587
+ "hit@5": 0.9658273381294964,
1588
+ "hit@10": 0.9748201438848921,
1589
+ "ndcg_linear@1": 0.8794964028776978,
1590
+ "ndcg_linear@5": 0.9111322782066683,
1591
+ "ndcg_linear@10": 0.9166794486497265,
1592
+ "ndcg_exponential@1": 0.8794964028776978,
1593
+ "ndcg_exponential@5": 0.9111322782066683,
1594
+ "ndcg_exponential@10": 0.9166794486497265,
1595
+ "precision@1": 0.8794964028776978,
1596
+ "precision@5": 0.23776978417266184,
1597
+ "precision@10": 0.12122302158273382,
1598
+ "recall@1": 0.7562949640287769,
1599
+ "recall@5": 0.947841726618705,
1600
+ "recall@10": 0.9631294964028777,
1601
+ "f1@1": 0.7973621103117506,
1602
+ "f1@5": 0.37332990750256945,
1603
+ "f1@10": 0.2128569871375627,
1604
+ "map@1": 0.8794964028776978,
1605
+ "map@5": 0.8898381294964028,
1606
+ "map@10": 0.8929377926230444,
1607
+ "mrr@1": 0.8794964028776978,
1608
+ "mrr@5": 0.9164568345323741,
1609
+ "mrr@10": 0.9175181283544593,
1610
+ "num_pred": 556,
1611
+ "num_data": 556
1612
+ },
1613
+ "VisRAG_InfoVQA": {
1614
+ "hit@1": 0.786908077994429,
1615
+ "hit@5": 0.9401114206128134,
1616
+ "hit@10": 0.9665738161559888,
1617
+ "ndcg_linear@1": 0.786908077994429,
1618
+ "ndcg_linear@5": 0.871620682507705,
1619
+ "ndcg_linear@10": 0.8802332479350842,
1620
+ "ndcg_exponential@1": 0.786908077994429,
1621
+ "ndcg_exponential@5": 0.871620682507705,
1622
+ "ndcg_exponential@10": 0.8802332479350842,
1623
+ "precision@1": 0.786908077994429,
1624
+ "precision@5": 0.18802228412256264,
1625
+ "precision@10": 0.09665738161559888,
1626
+ "recall@1": 0.786908077994429,
1627
+ "recall@5": 0.9401114206128134,
1628
+ "recall@10": 0.9665738161559888,
1629
+ "f1@1": 0.786908077994429,
1630
+ "f1@5": 0.31337047353760455,
1631
+ "f1@10": 0.17574069384654348,
1632
+ "map@1": 0.786908077994429,
1633
+ "map@5": 0.8485143918291551,
1634
+ "map@10": 0.8521018481673078,
1635
+ "mrr@1": 0.786908077994429,
1636
+ "mrr@5": 0.8485143918291551,
1637
+ "mrr@10": 0.8521018481673078,
1638
+ "num_pred": 718,
1639
+ "num_data": 718
1640
+ },
1641
+ "VisRAG_PlotQA": {
1642
+ "hit@1": 0.47740440324449596,
1643
+ "hit@5": 0.7450753186558516,
1644
+ "hit@10": 0.8342989571263036,
1645
+ "ndcg_linear@1": 0.47740440324449596,
1646
+ "ndcg_linear@5": 0.6171386649075443,
1647
+ "ndcg_linear@10": 0.6460193087434463,
1648
+ "ndcg_exponential@1": 0.47740440324449596,
1649
+ "ndcg_exponential@5": 0.6171386649075443,
1650
+ "ndcg_exponential@10": 0.6460193087434463,
1651
+ "precision@1": 0.47740440324449596,
1652
+ "precision@5": 0.14901506373117032,
1653
+ "precision@10": 0.08342989571263036,
1654
+ "recall@1": 0.47740440324449596,
1655
+ "recall@5": 0.7450753186558516,
1656
+ "recall@10": 0.8342989571263036,
1657
+ "f1@1": 0.47740440324449596,
1658
+ "f1@5": 0.24835843955195064,
1659
+ "f1@10": 0.15169071947750978,
1660
+ "map@1": 0.47740440324449596,
1661
+ "map@5": 0.5747779065276168,
1662
+ "map@10": 0.5867033603707996,
1663
+ "mrr@1": 0.47740440324449596,
1664
+ "mrr@5": 0.5747779065276168,
1665
+ "mrr@10": 0.5867033603707996,
1666
+ "num_pred": 863,
1667
+ "num_data": 863
1668
+ },
1669
+ "ViDoSeek-page": {
1670
+ "hit@1": 0.028021015761821366,
1671
+ "hit@5": 0.4115586690017513,
1672
+ "hit@10": 0.6514886164623468,
1673
+ "ndcg_linear@1": 0.028021015761821366,
1674
+ "ndcg_linear@5": 0.22470566490499208,
1675
+ "ndcg_linear@10": 0.3023957738482507,
1676
+ "ndcg_exponential@1": 0.028021015761821366,
1677
+ "ndcg_exponential@5": 0.2247056649049921,
1678
+ "ndcg_exponential@10": 0.3023957738482508,
1679
+ "precision@1": 0.028021015761821366,
1680
+ "precision@5": 0.08231173380035026,
1681
+ "precision@10": 0.06514886164623468,
1682
+ "recall@1": 0.028021015761821366,
1683
+ "recall@5": 0.4115586690017513,
1684
+ "recall@10": 0.6514886164623468,
1685
+ "f1@1": 0.028021015761821366,
1686
+ "f1@5": 0.13718622300058378,
1687
+ "f1@10": 0.11845247572042672,
1688
+ "map@1": 0.028021015761821366,
1689
+ "map@5": 0.16317863397548157,
1690
+ "map@10": 0.19531176159897704,
1691
+ "mrr@1": 0.028021015761821366,
1692
+ "mrr@5": 0.16317863397548157,
1693
+ "mrr@10": 0.19531176159897704,
1694
+ "num_pred": 1142,
1695
+ "num_data": 1142
1696
+ },
1697
+ "ViDoSeek-doc": {
1698
+ "hit@1": 0.9929947460595446,
1699
+ "hit@5": 0.9964973730297724,
1700
+ "hit@10": 0.9973730297723292,
1701
+ "ndcg_linear@1": 0.6780502043199066,
1702
+ "ndcg_linear@5": 0.8347155048681709,
1703
+ "ndcg_linear@10": 0.8283316874780556,
1704
+ "ndcg_exponential@1": 0.4530898173630222,
1705
+ "ndcg_exponential@5": 0.7153063162905082,
1706
+ "ndcg_exponential@10": 0.7524802642874342,
1707
+ "precision@1": 0.9929947460595446,
1708
+ "precision@5": 0.9224168126094572,
1709
+ "precision@10": 0.8474605954465849,
1710
+ "recall@1": 0.05858093829304035,
1711
+ "recall@5": 0.27097789072938616,
1712
+ "recall@10": 0.4882164962990025,
1713
+ "f1@1": 0.10988695226799426,
1714
+ "f1@5": 0.4104084006565447,
1715
+ "f1@10": 0.605623879196985,
1716
+ "map@1": 0.9929947460595446,
1717
+ "map@5": 0.9156625802685348,
1718
+ "map@10": 0.8379593772710485,
1719
+ "mrr@1": 0.9929947460595446,
1720
+ "mrr@5": 0.9943812025685932,
1721
+ "mrr@10": 0.9944687682428489,
1722
+ "num_pred": 1142,
1723
+ "num_data": 1142
1724
+ },
1725
+ "MMLongBench-page": {
1726
+ "hit@1": 0.07517899761336516,
1727
+ "hit@5": 0.30668257756563244,
1728
+ "hit@10": 0.40692124105011934,
1729
+ "ndcg_linear@1": 0.07517899761336516,
1730
+ "ndcg_linear@5": 0.15889672106004182,
1731
+ "ndcg_linear@10": 0.19237673084837761,
1732
+ "ndcg_exponential@1": 0.07517899761336516,
1733
+ "ndcg_exponential@5": 0.15889672106004182,
1734
+ "ndcg_exponential@10": 0.19237673084837764,
1735
+ "precision@1": 0.07517899761336516,
1736
+ "precision@5": 0.07136038186157519,
1737
+ "precision@10": 0.05286396181384249,
1738
+ "recall@1": 0.04922434367541766,
1739
+ "recall@5": 0.22613081031935447,
1740
+ "recall@10": 0.3179622684395954,
1741
+ "f1@1": 0.05696844921188596,
1742
+ "f1@5": 0.10252626544217168,
1743
+ "f1@10": 0.08628360174493102,
1744
+ "map@1": 0.07517899761336516,
1745
+ "map@5": 0.11930389817024661,
1746
+ "map@10": 0.134453706071397,
1747
+ "mrr@1": 0.07517899761336516,
1748
+ "mrr@5": 0.16205250596658713,
1749
+ "mrr@10": 0.17527607303860288,
1750
+ "num_pred": 838,
1751
+ "num_data": 838
1752
+ },
1753
+ "MMLongBench-doc": {
1754
+ "hit@1": 0.6992840095465394,
1755
+ "hit@5": 0.788782816229117,
1756
+ "hit@10": 0.8305489260143198,
1757
+ "ndcg_linear@1": 0.4908512330946698,
1758
+ "ndcg_linear@5": 0.5151318143032525,
1759
+ "ndcg_linear@10": 0.4942756345433957,
1760
+ "ndcg_exponential@1": 0.3419706784861916,
1761
+ "ndcg_exponential@5": 0.43117309505779183,
1762
+ "ndcg_exponential@10": 0.439350643034243,
1763
+ "precision@1": 0.6992840095465394,
1764
+ "precision@5": 0.5708830548926014,
1765
+ "precision@10": 0.5034606205250597,
1766
+ "recall@1": 0.022724297502752358,
1767
+ "recall@5": 0.09002044278616353,
1768
+ "recall@10": 0.1555140961544712,
1769
+ "f1@1": 0.04359714278969275,
1770
+ "f1@5": 0.14945001865951169,
1771
+ "f1@10": 0.2231273977054642,
1772
+ "map@1": 0.6992840095465394,
1773
+ "map@5": 0.5389498806682578,
1774
+ "map@10": 0.4554091058327336,
1775
+ "mrr@1": 0.6992840095465394,
1776
+ "mrr@5": 0.7340294351630867,
1777
+ "mrr@10": 0.7398544342160094,
1778
+ "num_pred": 838,
1779
+ "num_data": 838
1780
+ }
1781
+ },
1782
+ "video": {
1783
+ "DiDeMo": {
1784
+ "hit@1": 0.46115537848605576,
1785
+ "hit@5": 0.7141434262948207,
1786
+ "hit@10": 0.7928286852589641,
1787
+ "ndcg_linear@1": 0.46115537848605576,
1788
+ "ndcg_linear@5": 0.597854832187236,
1789
+ "ndcg_linear@10": 0.623638688484401,
1790
+ "ndcg_exponential@1": 0.46115537848605576,
1791
+ "ndcg_exponential@5": 0.597854832187236,
1792
+ "ndcg_exponential@10": 0.623638688484401,
1793
+ "precision@1": 0.46115537848605576,
1794
+ "precision@5": 0.14282868525896414,
1795
+ "precision@10": 0.0792828685258964,
1796
+ "recall@1": 0.46115537848605576,
1797
+ "recall@5": 0.7141434262948207,
1798
+ "recall@10": 0.7928286852589641,
1799
+ "f1@1": 0.46115537848605576,
1800
+ "f1@5": 0.2380478087649403,
1801
+ "f1@10": 0.1441506700470844,
1802
+ "map@1": 0.46115537848605576,
1803
+ "map@5": 0.5588977423638777,
1804
+ "map@10": 0.5697341586036806,
1805
+ "mrr@1": 0.46115537848605576,
1806
+ "mrr@5": 0.5588977423638777,
1807
+ "mrr@10": 0.5697341586036806,
1808
+ "num_pred": 1004,
1809
+ "num_data": 1004
1810
+ },
1811
+ "MSR-VTT": {
1812
+ "hit@1": 0.494,
1813
+ "hit@5": 0.718,
1814
+ "hit@10": 0.809,
1815
+ "ndcg_linear@1": 0.494,
1816
+ "ndcg_linear@5": 0.6126583935599356,
1817
+ "ndcg_linear@10": 0.6418663414853761,
1818
+ "ndcg_exponential@1": 0.494,
1819
+ "ndcg_exponential@5": 0.6126583935599356,
1820
+ "ndcg_exponential@10": 0.6418663414853761,
1821
+ "precision@1": 0.494,
1822
+ "precision@5": 0.1436,
1823
+ "precision@10": 0.0809,
1824
+ "recall@1": 0.494,
1825
+ "recall@5": 0.718,
1826
+ "recall@10": 0.809,
1827
+ "f1@1": 0.494,
1828
+ "f1@5": 0.2393333333333334,
1829
+ "f1@10": 0.14709090909090913,
1830
+ "map@1": 0.494,
1831
+ "map@5": 0.5777,
1832
+ "map@10": 0.5896333333333333,
1833
+ "mrr@1": 0.494,
1834
+ "mrr@5": 0.5777,
1835
+ "mrr@10": 0.5896333333333333,
1836
+ "num_pred": 1000,
1837
+ "num_data": 1000
1838
+ },
1839
+ "MSVD": {
1840
+ "hit@1": 0.6716417910447762,
1841
+ "hit@5": 0.9044776119402985,
1842
+ "hit@10": 0.9507462686567164,
1843
+ "ndcg_linear@1": 0.6716417910447762,
1844
+ "ndcg_linear@5": 0.8018114248883265,
1845
+ "ndcg_linear@10": 0.8170350100109979,
1846
+ "ndcg_exponential@1": 0.6716417910447762,
1847
+ "ndcg_exponential@5": 0.8018114248883265,
1848
+ "ndcg_exponential@10": 0.8170350100109979,
1849
+ "precision@1": 0.6716417910447762,
1850
+ "precision@5": 0.18089552238805967,
1851
+ "precision@10": 0.09507462686567163,
1852
+ "recall@1": 0.6716417910447762,
1853
+ "recall@5": 0.9044776119402985,
1854
+ "recall@10": 0.9507462686567164,
1855
+ "f1@1": 0.6716417910447762,
1856
+ "f1@5": 0.3014925373134329,
1857
+ "f1@10": 0.17286295793758483,
1858
+ "map@1": 0.6716417910447762,
1859
+ "map@5": 0.7671144278606964,
1860
+ "map@10": 0.7735459606728263,
1861
+ "mrr@1": 0.6716417910447762,
1862
+ "mrr@5": 0.7671144278606964,
1863
+ "mrr@10": 0.7735459606728263,
1864
+ "num_pred": 670,
1865
+ "num_data": 670
1866
+ },
1867
+ "VATEX": {
1868
+ "hit@1": 0.43948191156766414,
1869
+ "hit@5": 0.7168378740509156,
1870
+ "hit@10": 0.8164359088878964,
1871
+ "ndcg_linear@1": 0.43948191156766414,
1872
+ "ndcg_linear@5": 0.5876683667478259,
1873
+ "ndcg_linear@10": 0.6199711102927952,
1874
+ "ndcg_exponential@1": 0.43948191156766414,
1875
+ "ndcg_exponential@5": 0.5876683667478259,
1876
+ "ndcg_exponential@10": 0.6199711102927952,
1877
+ "precision@1": 0.43948191156766414,
1878
+ "precision@5": 0.1433675748101831,
1879
+ "precision@10": 0.08164359088878963,
1880
+ "recall@1": 0.43948191156766414,
1881
+ "recall@5": 0.7168378740509156,
1882
+ "recall@10": 0.8164359088878964,
1883
+ "f1@1": 0.43948191156766414,
1884
+ "f1@5": 0.2389459580169719,
1885
+ "f1@10": 0.14844289252507206,
1886
+ "map@1": 0.43948191156766414,
1887
+ "map@5": 0.544606967396159,
1888
+ "map@10": 0.5579919819647376,
1889
+ "mrr@1": 0.43948191156766414,
1890
+ "mrr@5": 0.544606967396159,
1891
+ "mrr@10": 0.5579919819647376,
1892
+ "num_pred": 4478,
1893
+ "num_data": 4478
1894
+ },
1895
+ "YouCook2": {
1896
+ "hit@1": 0.2195659012268009,
1897
+ "hit@5": 0.44510852469329976,
1898
+ "hit@10": 0.5558351682919157,
1899
+ "ndcg_linear@1": 0.2195659012268009,
1900
+ "ndcg_linear@5": 0.33707072315082415,
1901
+ "ndcg_linear@10": 0.3726523748344782,
1902
+ "ndcg_exponential@1": 0.2195659012268009,
1903
+ "ndcg_exponential@5": 0.33707072315082415,
1904
+ "ndcg_exponential@10": 0.3726523748344782,
1905
+ "precision@1": 0.2195659012268009,
1906
+ "precision@5": 0.08902170493865996,
1907
+ "precision@10": 0.05558351682919157,
1908
+ "recall@1": 0.2195659012268009,
1909
+ "recall@5": 0.44510852469329976,
1910
+ "recall@10": 0.5558351682919157,
1911
+ "f1@1": 0.2195659012268009,
1912
+ "f1@5": 0.14836950823109996,
1913
+ "f1@10": 0.10106093968943923,
1914
+ "map@1": 0.2195659012268009,
1915
+ "map@5": 0.30132117017930166,
1916
+ "map@10": 0.31586977036804026,
1917
+ "mrr@1": 0.2195659012268009,
1918
+ "mrr@5": 0.30132117017930166,
1919
+ "mrr@10": 0.31586977036804026,
1920
+ "num_pred": 3179,
1921
+ "num_data": 3179
1922
+ },
1923
+ "QVHighlight": {
1924
+ "hit@1": 0.5817174515235457,
1925
+ "hit@5": 0.8688827331486612,
1926
+ "hit@10": 1.0,
1927
+ "ndcg_linear@1": 0.5817174515235457,
1928
+ "ndcg_linear@5": 0.7355674322372523,
1929
+ "ndcg_linear@10": 0.778061976394854,
1930
+ "ndcg_exponential@1": 0.5817174515235457,
1931
+ "ndcg_exponential@5": 0.7355674322372523,
1932
+ "ndcg_exponential@10": 0.778061976394854,
1933
+ "precision@1": 0.5817174515235457,
1934
+ "precision@5": 0.17377654662973221,
1935
+ "precision@10": 0.09999999999999998,
1936
+ "recall@1": 0.5817174515235457,
1937
+ "recall@5": 0.8688827331486612,
1938
+ "recall@10": 1.0,
1939
+ "f1@1": 0.5817174515235457,
1940
+ "f1@5": 0.28962757771622044,
1941
+ "f1@10": 0.18181818181818185,
1942
+ "map@1": 0.5817174515235457,
1943
+ "map@5": 0.6910587873191752,
1944
+ "map@10": 0.7086550440428556,
1945
+ "mrr@1": 0.5817174515235457,
1946
+ "mrr@5": 0.6910587873191752,
1947
+ "mrr@10": 0.7086550440428556,
1948
+ "num_pred": 1083,
1949
+ "num_data": 1083
1950
+ },
1951
+ "Charades-STA": {
1952
+ "hit@1": 0.24896836313617607,
1953
+ "hit@5": 0.6189821182943603,
1954
+ "hit@10": 1.0,
1955
+ "ndcg_linear@1": 0.24896836313617607,
1956
+ "ndcg_linear@5": 0.4350379203932738,
1957
+ "ndcg_linear@10": 0.554874249964459,
1958
+ "ndcg_exponential@1": 0.24896836313617607,
1959
+ "ndcg_exponential@5": 0.4350379203932738,
1960
+ "ndcg_exponential@10": 0.554874249964459,
1961
+ "precision@1": 0.24896836313617607,
1962
+ "precision@5": 0.12379642365887207,
1963
+ "precision@10": 0.09999999999999998,
1964
+ "recall@1": 0.24896836313617607,
1965
+ "recall@5": 0.6189821182943603,
1966
+ "recall@10": 1.0,
1967
+ "f1@1": 0.24896836313617607,
1968
+ "f1@5": 0.20632737276478683,
1969
+ "f1@10": 0.18181818181818185,
1970
+ "map@1": 0.24896836313617607,
1971
+ "map@5": 0.3748509857863365,
1972
+ "map@10": 0.4223728739547171,
1973
+ "mrr@1": 0.24896836313617607,
1974
+ "mrr@5": 0.3748509857863365,
1975
+ "mrr@10": 0.4223728739547171,
1976
+ "num_pred": 727,
1977
+ "num_data": 727
1978
+ },
1979
+ "MomentSeeker": {
1980
+ "hit@1": 0.46555555555555556,
1981
+ "hit@5": 0.9027777777777778,
1982
+ "hit@10": 1.0,
1983
+ "ndcg_linear@1": 0.46555555555555556,
1984
+ "ndcg_linear@5": 0.7032832283395833,
1985
+ "ndcg_linear@10": 0.7383141215502083,
1986
+ "ndcg_exponential@1": 0.46555555555555556,
1987
+ "ndcg_exponential@5": 0.7032832283395833,
1988
+ "ndcg_exponential@10": 0.7383141215502083,
1989
+ "precision@1": 0.46555555555555556,
1990
+ "precision@5": 0.2011111111111111,
1991
+ "precision@10": 0.11288888888888891,
1992
+ "recall@1": 0.4385185185185185,
1993
+ "recall@5": 0.8939583333333333,
1994
+ "recall@10": 0.9981481481481482,
1995
+ "f1@1": 0.44680511463844796,
1996
+ "f1@5": 0.32284955784955793,
1997
+ "f1@10": 0.20018091476424807,
1998
+ "map@1": 0.46555555555555556,
1999
+ "map@5": 0.635270524691358,
2000
+ "map@10": 0.651133616255144,
2001
+ "mrr@1": 0.46555555555555556,
2002
+ "mrr@5": 0.6413055555555555,
2003
+ "mrr@10": 0.6548293650793651,
2004
+ "num_pred": 1800,
2005
+ "num_data": 1800
2006
+ },
2007
+ "K700": {
2008
+ "hit@1": 0.59,
2009
+ "hit@5": 0.806,
2010
+ "hit@10": 0.857,
2011
+ "ndcg_linear@1": 0.59,
2012
+ "ndcg_linear@5": 0.7077746148817649,
2013
+ "ndcg_linear@10": 0.724000273666642,
2014
+ "ndcg_exponential@1": 0.59,
2015
+ "ndcg_exponential@5": 0.7077746148817649,
2016
+ "ndcg_exponential@10": 0.724000273666642,
2017
+ "precision@1": 0.59,
2018
+ "precision@5": 0.16119999999999998,
2019
+ "precision@10": 0.08570000000000001,
2020
+ "recall@1": 0.59,
2021
+ "recall@5": 0.806,
2022
+ "recall@10": 0.857,
2023
+ "f1@1": 0.59,
2024
+ "f1@5": 0.26866666666666666,
2025
+ "f1@10": 0.1558181818181818,
2026
+ "map@1": 0.59,
2027
+ "map@5": 0.6748333333333334,
2028
+ "map@10": 0.681372619047619,
2029
+ "mrr@1": 0.59,
2030
+ "mrr@5": 0.6748333333333334,
2031
+ "mrr@10": 0.681372619047619,
2032
+ "num_pred": 1000,
2033
+ "num_data": 1000
2034
+ },
2035
+ "SmthSmthV2": {
2036
+ "hit@1": 0.646,
2037
+ "hit@5": 0.866,
2038
+ "hit@10": 0.919,
2039
+ "ndcg_linear@1": 0.646,
2040
+ "ndcg_linear@5": 0.7665689935313655,
2041
+ "ndcg_linear@10": 0.7835822912819611,
2042
+ "ndcg_exponential@1": 0.646,
2043
+ "ndcg_exponential@5": 0.7665689935313655,
2044
+ "ndcg_exponential@10": 0.7835822912819611,
2045
+ "precision@1": 0.646,
2046
+ "precision@5": 0.17320000000000002,
2047
+ "precision@10": 0.09190000000000001,
2048
+ "recall@1": 0.646,
2049
+ "recall@5": 0.866,
2050
+ "recall@10": 0.919,
2051
+ "f1@1": 0.646,
2052
+ "f1@5": 0.2886666666666667,
2053
+ "f1@10": 0.16709090909090907,
2054
+ "map@1": 0.646,
2055
+ "map@5": 0.7331333333333333,
2056
+ "map@10": 0.7400765873015873,
2057
+ "mrr@1": 0.646,
2058
+ "mrr@5": 0.7331333333333333,
2059
+ "mrr@10": 0.7400765873015873,
2060
+ "num_pred": 1000,
2061
+ "num_data": 1000
2062
+ },
2063
+ "HMDB51": {
2064
+ "hit@1": 0.593,
2065
+ "hit@5": 0.811,
2066
+ "hit@10": 0.885,
2067
+ "ndcg_linear@1": 0.593,
2068
+ "ndcg_linear@5": 0.7090441379199225,
2069
+ "ndcg_linear@10": 0.733036952422186,
2070
+ "ndcg_exponential@1": 0.593,
2071
+ "ndcg_exponential@5": 0.7090441379199225,
2072
+ "ndcg_exponential@10": 0.733036952422186,
2073
+ "precision@1": 0.593,
2074
+ "precision@5": 0.16219999999999998,
2075
+ "precision@10": 0.0885,
2076
+ "recall@1": 0.593,
2077
+ "recall@5": 0.811,
2078
+ "recall@10": 0.885,
2079
+ "f1@1": 0.593,
2080
+ "f1@5": 0.27033333333333337,
2081
+ "f1@10": 0.16090909090909095,
2082
+ "map@1": 0.593,
2083
+ "map@5": 0.6751,
2084
+ "map@10": 0.6850373015873016,
2085
+ "mrr@1": 0.593,
2086
+ "mrr@5": 0.6751,
2087
+ "mrr@10": 0.6850373015873016,
2088
+ "num_pred": 1000,
2089
+ "num_data": 1000
2090
+ },
2091
+ "UCF101": {
2092
+ "hit@1": 0.788,
2093
+ "hit@5": 0.965,
2094
+ "hit@10": 0.989,
2095
+ "ndcg_linear@1": 0.788,
2096
+ "ndcg_linear@5": 0.8873256149390782,
2097
+ "ndcg_linear@10": 0.895344170842387,
2098
+ "ndcg_exponential@1": 0.788,
2099
+ "ndcg_exponential@5": 0.8873256149390782,
2100
+ "ndcg_exponential@10": 0.895344170842387,
2101
+ "precision@1": 0.788,
2102
+ "precision@5": 0.19300000000000003,
2103
+ "precision@10": 0.0989,
2104
+ "recall@1": 0.788,
2105
+ "recall@5": 0.965,
2106
+ "recall@10": 0.989,
2107
+ "f1@1": 0.788,
2108
+ "f1@5": 0.3216666666666667,
2109
+ "f1@10": 0.1798181818181818,
2110
+ "map@1": 0.788,
2111
+ "map@5": 0.8610833333333333,
2112
+ "map@10": 0.8645432539682539,
2113
+ "mrr@1": 0.788,
2114
+ "mrr@5": 0.8610833333333333,
2115
+ "mrr@10": 0.8645432539682539,
2116
+ "num_pred": 1000,
2117
+ "num_data": 1000
2118
+ },
2119
+ "Breakfast": {
2120
+ "hit@1": 0.3672055427251732,
2121
+ "hit@5": 0.789838337182448,
2122
+ "hit@10": 1.0,
2123
+ "ndcg_linear@1": 0.3672055427251732,
2124
+ "ndcg_linear@5": 0.5786460297178996,
2125
+ "ndcg_linear@10": 0.6478264872451965,
2126
+ "ndcg_exponential@1": 0.3672055427251732,
2127
+ "ndcg_exponential@5": 0.5786460297178996,
2128
+ "ndcg_exponential@10": 0.6478264872451965,
2129
+ "precision@1": 0.3672055427251732,
2130
+ "precision@5": 0.15796766743648963,
2131
+ "precision@10": 0.1,
2132
+ "recall@1": 0.3672055427251732,
2133
+ "recall@5": 0.789838337182448,
2134
+ "recall@10": 1.0,
2135
+ "f1@1": 0.3672055427251732,
2136
+ "f1@5": 0.26327944572748274,
2137
+ "f1@10": 0.1818181818181818,
2138
+ "map@1": 0.3672055427251732,
2139
+ "map@5": 0.5095073133179369,
2140
+ "map@10": 0.5387706660801349,
2141
+ "mrr@1": 0.3672055427251732,
2142
+ "mrr@5": 0.5095073133179369,
2143
+ "mrr@10": 0.5387706660801349,
2144
+ "num_pred": 433,
2145
+ "num_data": 433
2146
+ },
2147
+ "MVBench": {
2148
+ "hit@1": 0.56975,
2149
+ "hit@5": 1.0,
2150
+ "hit@10": 1.0,
2151
+ "ndcg_linear@1": 0.56975,
2152
+ "ndcg_linear@5": 0.8110591965257652,
2153
+ "ndcg_linear@10": 0.8110591965257652,
2154
+ "ndcg_exponential@1": 0.56975,
2155
+ "ndcg_exponential@5": 0.8110591965257652,
2156
+ "ndcg_exponential@10": 0.8110591965257652,
2157
+ "precision@1": 0.56975,
2158
+ "precision@5": 0.20000000000000004,
2159
+ "precision@10": 0.10000000000000002,
2160
+ "recall@1": 0.56975,
2161
+ "recall@5": 1.0,
2162
+ "recall@10": 1.0,
2163
+ "f1@1": 0.56975,
2164
+ "f1@5": 0.3333333333333333,
2165
+ "f1@10": 0.18181818181818182,
2166
+ "map@1": 0.56975,
2167
+ "map@5": 0.746825,
2168
+ "map@10": 0.746825,
2169
+ "mrr@1": 0.56975,
2170
+ "mrr@5": 0.746825,
2171
+ "mrr@10": 0.746825,
2172
+ "num_pred": 4000,
2173
+ "num_data": 4000
2174
+ },
2175
+ "Video-MME": {
2176
+ "hit@1": 0.5088888888888888,
2177
+ "hit@5": 1.0,
2178
+ "hit@10": 1.0,
2179
+ "ndcg_linear@1": 0.5088888888888888,
2180
+ "ndcg_linear@5": 0.7738633421698715,
2181
+ "ndcg_linear@10": 0.7738633421698715,
2182
+ "ndcg_exponential@1": 0.5088888888888888,
2183
+ "ndcg_exponential@5": 0.7738633421698715,
2184
+ "ndcg_exponential@10": 0.7738633421698715,
2185
+ "precision@1": 0.5088888888888888,
2186
+ "precision@5": 0.19999999999999996,
2187
+ "precision@10": 0.09999999999999998,
2188
+ "recall@1": 0.5088888888888888,
2189
+ "recall@5": 1.0,
2190
+ "recall@10": 1.0,
2191
+ "f1@1": 0.5088888888888888,
2192
+ "f1@5": 0.3333333333333334,
2193
+ "f1@10": 0.18181818181818188,
2194
+ "map@1": 0.5088888888888888,
2195
+ "map@5": 0.6979012345679012,
2196
+ "map@10": 0.6979012345679012,
2197
+ "mrr@1": 0.5088888888888888,
2198
+ "mrr@5": 0.6979012345679012,
2199
+ "mrr@10": 0.6979012345679012,
2200
+ "num_pred": 2700,
2201
+ "num_data": 2700
2202
+ },
2203
+ "NExTQA": {
2204
+ "hit@1": 0.6705978514712752,
2205
+ "hit@5": 1.0,
2206
+ "hit@10": 1.0,
2207
+ "ndcg_linear@1": 0.6705978514712752,
2208
+ "ndcg_linear@5": 0.8495446170574841,
2209
+ "ndcg_linear@10": 0.8495446170574841,
2210
+ "ndcg_exponential@1": 0.6705978514712752,
2211
+ "ndcg_exponential@5": 0.8495446170574841,
2212
+ "ndcg_exponential@10": 0.8495446170574841,
2213
+ "precision@1": 0.6705978514712752,
2214
+ "precision@5": 0.20000000000000004,
2215
+ "precision@10": 0.10000000000000002,
2216
+ "recall@1": 0.6705978514712752,
2217
+ "recall@5": 1.0,
2218
+ "recall@10": 1.0,
2219
+ "f1@1": 0.6705978514712752,
2220
+ "f1@5": 0.3333333333333333,
2221
+ "f1@10": 0.1818181818181818,
2222
+ "map@1": 0.6705978514712752,
2223
+ "map@5": 0.7991339716643312,
2224
+ "map@10": 0.7991339716643312,
2225
+ "mrr@1": 0.6705978514712752,
2226
+ "mrr@5": 0.7989588198661062,
2227
+ "mrr@10": 0.7989588198661062,
2228
+ "num_pred": 8564,
2229
+ "num_data": 8564
2230
+ },
2231
+ "EgoSchema": {
2232
+ "hit@1": 0.596,
2233
+ "hit@5": 1.0,
2234
+ "hit@10": 1.0,
2235
+ "ndcg_linear@1": 0.596,
2236
+ "ndcg_linear@5": 0.815692468749886,
2237
+ "ndcg_linear@10": 0.815692468749886,
2238
+ "ndcg_exponential@1": 0.596,
2239
+ "ndcg_exponential@5": 0.815692468749886,
2240
+ "ndcg_exponential@10": 0.815692468749886,
2241
+ "precision@1": 0.596,
2242
+ "precision@5": 0.2,
2243
+ "precision@10": 0.1,
2244
+ "recall@1": 0.596,
2245
+ "recall@5": 1.0,
2246
+ "recall@10": 1.0,
2247
+ "f1@1": 0.596,
2248
+ "f1@5": 0.3333333333333333,
2249
+ "f1@10": 0.18181818181818182,
2250
+ "map@1": 0.596,
2251
+ "map@5": 0.7538333333333332,
2252
+ "map@10": 0.7538333333333332,
2253
+ "mrr@1": 0.596,
2254
+ "mrr@5": 0.7538333333333332,
2255
+ "mrr@10": 0.7538333333333332,
2256
+ "num_pred": 500,
2257
+ "num_data": 500
2258
+ },
2259
+ "ActivityNetQA": {
2260
+ "hit@1": 0.766,
2261
+ "hit@5": 1.0,
2262
+ "hit@10": 1.0,
2263
+ "ndcg_linear@1": 0.766,
2264
+ "ndcg_linear@5": 0.9136375623357211,
2265
+ "ndcg_linear@10": 0.9136375623357211,
2266
+ "ndcg_exponential@1": 0.766,
2267
+ "ndcg_exponential@5": 0.9136375623357211,
2268
+ "ndcg_exponential@10": 0.9136375623357211,
2269
+ "precision@1": 0.766,
2270
+ "precision@5": 0.20000000000000004,
2271
+ "precision@10": 0.10000000000000002,
2272
+ "recall@1": 0.766,
2273
+ "recall@5": 1.0,
2274
+ "recall@10": 1.0,
2275
+ "f1@1": 0.766,
2276
+ "f1@5": 0.3333333333333333,
2277
+ "f1@10": 0.18181818181818182,
2278
+ "map@1": 0.766,
2279
+ "map@5": 0.883,
2280
+ "map@10": 0.883,
2281
+ "mrr@1": 0.766,
2282
+ "mrr@5": 0.883,
2283
+ "mrr@10": 0.883,
2284
+ "num_pred": 1000,
2285
+ "num_data": 1000
2286
+ }
2287
+ }
2288
+ }
2289
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
3
+ size 11420371
tokenizer_config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
134
+ "clean_up_tokenization_spaces": false,
135
+ "eos_token": "<|im_end|>",
136
+ "errors": "replace",
137
+ "extra_special_tokens": {},
138
+ "model_max_length": 32768,
139
+ "pad_token": "<|endoftext|>",
140
+ "padding_side": "left",
141
+ "processor_class": "Qwen2VLProcessor",
142
+ "split_special_tokens": false,
143
+ "tokenizer_class": "Qwen2Tokenizer",
144
+ "unk_token": null
145
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff