can-gaa-hou committed
Commit d5db947
1 Parent(s): d45ffa3
README.md CHANGED
@@ -1,3 +1,52 @@
- ---
- license: mit
- ---
+ ---
+ license: apache-2.0
+ language:
+ - multilingual
+ pipeline_tag: image-text-to-text
+ tags:
+ - got
+ - vision-language
+ - ocr2.0
+ - custom_code
+ base_model:
+ - stepfun-ai/GOT-OCR2_0
+ base_model_relation: quantized
+ ---
+
+ This is the OpenVINO-accelerated version of GOT-OCR2.0.
+ To use this model, download all files from the original repo [stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0/tree/main) and copy everything into the **weight** folder. The file structure should look like this:
+ ```
+ .
+ │ app.py
+ │ convert_model.py
+ ├─weight
+ │ config.json
+ │ generation_config.json
+ │ got_vision_b.py
+ │ modeling_GOT.py
+ │ openvino_language_model.bin
+ │ openvino_language_model.xml
+ │ openvino_text_embeddings_model.bin
+ │ openvino_text_embeddings_model.xml
+ │ openvino_vision_embeddings_merger_model.bin
+ │ openvino_vision_embeddings_merger_model.xml
+ │ openvino_vision_embeddings_model.bin
+ │ openvino_vision_embeddings_model.xml
+ │ qwen.tiktoken
+ │ render_tools.py
+ │ special_tokens_map.json
+ │ tokenization_qwen.json
+ │ tokenizer_config.json
+ ```
+
+ Required libraries:
+ ```bash
+ pip install "openvino" "torch" "transformers" "torchvision" "Pillow" "nncf" "requests" "numpy"
+ ```
+
+ Then simply run:
+ ```bash
+ python app.py --image-file /path/to/image
+ ```
+
+ For more instructions, refer to the [GitHub page](https://github.com/can-gaa-hou/GOT-OCR2.0-OpenVINO).
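
As a rough sketch of driving the model from Python instead of the CLI, the snippet below mirrors the `__main__` block of `app.py` added in this commit; the `sample.png` path and the `./` weight directory are placeholders, not part of the commit.

```python
# Hedged sketch: programmatic use of the pieces added by this commit (app.py / convert_model.py).
import torch
import nncf
from transformers import AutoTokenizer
from convert_model import OVGotOcrModel
from app import eval_model

model_dir = "./"  # same default as app.py's --weight-dir
compression_configuration = {
    "mode": nncf.CompressWeightsMode.INT4_ASYM,
    "group_size": 128,
    "ratio": 1.0,
}

model = OVGotOcrModel(model_dir, "CPU", compression_configuration=compression_configuration)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

with torch.no_grad():
    eval_model("sample.png", model, tokenizer)  # the OCR result is printed by the TextStreamer
```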
app.py ADDED
@@ -0,0 +1,239 @@
+ import argparse
+ import torch
+ import requests
+ import dataclasses
+ import nncf
+ from PIL import Image
+ from io import BytesIO
+ from typing import List
+ from enum import auto, Enum
+ from convert_model import OVGotOcrModel
+ from transformers import AutoTokenizer, TextStreamer, StoppingCriteria
+ from torchvision import transforms
+ from torchvision.transforms.functional import InterpolationMode
+
+
+ class SeparatorStyle(Enum):
+     """Different separator style."""
+     SINGLE = auto()
+     TWO = auto()
+     MPT = auto()
+
+
+ @dataclasses.dataclass
+ class Conversation:
+     """A class that keeps all conversation history."""
+     system: str
+     roles: List[str]
+     messages: List[List[str]]
+     offset: int
+     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+     sep: str = "<|im_end|>"
+     sep2: str = None
+     version: str = "Unknown"
+
+     skip_next: bool = False
+
+     def get_prompt(self):
+         if self.sep_style == SeparatorStyle.SINGLE:
+             ret = self.system + self.sep + '\n'
+             for role, message in self.messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + ": " + message + self.sep
+                 else:
+                     ret += role + ":"
+             return ret
+         elif self.sep_style == SeparatorStyle.TWO:
+             seps = [self.sep, self.sep2]
+             ret = self.system + seps[0]
+             for i, (role, message) in enumerate(self.messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + ": " + message + seps[i % 2]
+                 else:
+                     ret += role + ":"
+             return ret
+         if self.sep_style == SeparatorStyle.MPT:
+             if self.system:
+                 ret = self.system + self.sep
+             else:
+                 ret = ''
+             for role, message in self.messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + message + self.sep
+                 else:
+                     ret += role
+             return ret
+         else:
+             raise ValueError(f"Invalid style: {self.sep_style}")
+
+     def append_message(self, role, message):
+         self.messages.append([role, message])
+
+     def copy(self):
+         return Conversation(
+             system=self.system,
+             roles=self.roles,
+             messages=[[x, y] for x, y in self.messages],
+             offset=self.offset,
+             sep_style=self.sep_style,
+             sep=self.sep,
+             sep2=self.sep2)
+
+
+ class KeywordsStoppingCriteria(StoppingCriteria):
+     def __init__(self, keywords, tokenizer, input_ids):
+         self.keywords = keywords
+         self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
+         self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(keyword_id) is list and len(keyword_id) == 1]
+         self.tokenizer = tokenizer
+         self.start_len = None
+         self.input_ids = input_ids
+
+     def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         if self.start_len is None:
+             self.start_len = self.input_ids.shape[1]
+         else:
+             for keyword_id in self.keyword_ids:
+                 if output_ids[0, -1] == keyword_id:
+                     return True
+             outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
+             for keyword in self.keywords:
+                 if keyword in outputs:
+                     return True
+         return False
+
+
+ class GOTImageEvalProcessor:
+     def __init__(self, image_size=384, mean=None, std=None):
+         if mean is None:
+             mean = (0.48145466, 0.4578275, 0.40821073)
+         if std is None:
+             std = (0.26862954, 0.26130258, 0.27577711)
+
+         self.normalize = transforms.Normalize(mean, std)
+
+         self.transform = transforms.Compose(
+             [
+                 transforms.Resize(
+                     (image_size, image_size), interpolation=InterpolationMode.BICUBIC
+                 ),
+                 transforms.ToTensor(),
+                 self.normalize,
+             ]
+         )
+
+     def __call__(self, item):
+         return self.transform(item)
+
+
+ def load_image(image_file):
+     if image_file.startswith('http') or image_file.startswith('https'):
+         response = requests.get(image_file)
+         image = Image.open(BytesIO(response.content)).convert('RGB')
+     else:
+         image = Image.open(image_file).convert('RGB')
+     return image
+
+
+ def eval_model(image_file, model, tokenizer):
+
+     DEFAULT_IMAGE_TOKEN = "<image>"
+     DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
+     DEFAULT_IM_START_TOKEN = '<img>'
+     DEFAULT_IM_END_TOKEN = '</img>'
+     # Model
+
+     # TODO: old code carried over from Vary, should be removed
+     image_processor = GOTImageEvalProcessor(image_size=1024)
+
+     use_im_start_end = True
+
+     image_token_len = 256
+
+     image = load_image(image_file)
+
+     qs = 'OCR: '
+
+     if use_im_start_end:
+         qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len + DEFAULT_IM_END_TOKEN + '\n' + qs
+     else:
+         qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+     conv_mpt = Conversation(
+         system="""<|im_start|>system
+ You should follow the instructions carefully and explain your answers in detail.""",
+         # system = None,
+         roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+         version="mpt",
+         messages=(),
+         offset=0,
+         sep_style=SeparatorStyle.MPT,
+         sep="<|im_end|>",
+     )
+
+     conv = conv_mpt.copy()
+     conv.append_message(conv.roles[0], qs)
+     conv.append_message(conv.roles[1], None)
+     prompt = conv.get_prompt()
+
+     inputs = tokenizer([prompt])
+
+     image_tensor = image_processor(image)
+
+     input_ids = torch.as_tensor(inputs.input_ids).cpu()
+
+     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+     keywords = [stop_str]
+     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+     streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     import time
+
+     start = time.time()
+     output_ids = model.generate(
+         input_ids,
+         images=[image_tensor.unsqueeze(0).cpu()],
+         do_sample=False,
+         num_beams=1,
+         no_repeat_ngram_size=20,
+         streamer=streamer,
+         max_new_tokens=4096,
+         stopping_criteria=[stopping_criteria],
+     )
+     end = time.time()
+     print(f"\n Generate time {end - start}s")
+
+     outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
+
+     if outputs.endswith(stop_str):
+         outputs = outputs[:-len(stop_str)]
+     outputs = outputs.strip()
+     return output_ids.size(-1) / (end - start)  # returns tokens/sec; the line below is unreachable
+     return outputs
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--weight-dir", type=str, default="./")
+     parser.add_argument("--image-file", type=str, required=True)
+     args = parser.parse_args()
+     model_dir = args.weight_dir
+
+     compression_configuration = {
+         "mode": nncf.CompressWeightsMode.INT4_ASYM,
+         "group_size": 128,
+         "ratio": 1.0,
+     }
+     model = OVGotOcrModel(model_dir, "CPU", compression_configuration=compression_configuration)
+     tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+     with torch.no_grad():
+         eval_model(args.image_file, model, tokenizer)
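
`app.py` builds an INT4 weight-compression configuration (`INT4_ASYM`, group size 128, ratio 1.0) and hands it to `OVGotOcrModel`. Below is a minimal sketch of how such a configuration is typically applied with NNCF, assuming it targets the language-model IR in `weight/`; exactly where the wrapper consumes the dictionary is not visible in this diff.

```python
# Hedged sketch, not part of the commit: INT4 weight compression of the language-model IR
# using the same settings as app.py's compression_configuration.
import openvino as ov
import nncf

core = ov.Core()
lm = core.read_model("weight/openvino_language_model.xml")

compressed_lm = nncf.compress_weights(
    lm,
    mode=nncf.CompressWeightsMode.INT4_ASYM,  # asymmetric 4-bit weight quantization
    group_size=128,
    ratio=1.0,                                # compress all eligible layers
)
ov.save_model(compressed_lm, "weight/openvino_language_model_int4.xml")
```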
convert_model.py ADDED
@@ -0,0 +1,417 @@
+ from pathlib import Path
+ from typing import Optional, Tuple, Union, List
+ import openvino as ov
+ import numpy as np
+ import torch
+ from transformers import AutoConfig
+ from transformers.generation import GenerationConfig, GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+ core = ov.Core()
+
+
+ LANGUAGE_MODEL_NAME = "openvino_language_model.xml"
+ VISION_TOWER_HIGH_NAME = "openvino_vision_tower_high_model.xml"
+ TEXT_EMBEDDING_NAME = "openvino_text_embeddings_model.xml"
+ PROJECTOR_VARY_NAME = "openvino_projector_vary_model.xml"
+ LM_HAED_NAME = "openvino_lm_head_model.xml"
+
+
+ class OvModelForCausalLMWithEmb(GenerationMixin):
+     def __init__(self, model_dir, device="CPU", config=None, ov_config=None, compile=True) -> None:
+         self._supports_cache_class = False
+         self.config = AutoConfig.from_pretrained(model_dir) if config is None else config
+         self.config.is_decoder = True
+         self.config.is_encoder_decoder = False
+         self.generation_config = GenerationConfig.from_model_config(self.config)
+         model_dir = Path(model_dir)
+         self.model = core.read_model(model_dir / LANGUAGE_MODEL_NAME)
+         self.token_emb = core.read_model(model_dir / TEXT_EMBEDDING_NAME)
+         self.request = None
+         self.token_emb_request = None
+         self._device = device.upper()
+         self.device = torch.device("cpu")
+         self.ov_config = ov_config
+         self.next_beam_idx = None
+         self._past_length = None
+         self.input_names = [input_t.get_any_name() for input_t in self.model.inputs]
+         self.main_input_name = "input_ids"
+         if compile:
+             self.compile()
+
+     def compile(self):
+         if self.request is None:
+             self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request()
+         self._compile_token_emb()
+
+     def _compile_token_emb(self):
+         if self.token_emb_request is None:
+             self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config)
+
+     def to(self, device: str):
+         if isinstance(device, str):
+             self._device = device.upper()
+             self.clear_requests()
+
+         return self
+
+     def clear_requests(self):
+         del self.request
+         del self.token_emb_request
+         self.request = None
+         self.token_emb_request = None
+
+     def embed_tokens(self, input_ids: torch.LongTensor):
+         self._compile_token_emb()
+         res = self.token_emb_request(input_ids, share_inputs=True)
+         return res[0]
+
+     def prepare_inputs(
+         self,
+         input_ids: torch.LongTensor,
+         attention_mask: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         **kwargs,
+     ):
+         batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+
+         inputs = {}
+         # past_key_values are not used explicitly, instead they are handled inside the model
+         if past_key_values is None:
+             # This is the first iteration in a sequence, reset all states
+             if self.request is not None:
+                 self.request.reset_state()
+                 # Set initial value for the next beam_idx input that will be used at the current iteration
+                 # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
+                 self.next_beam_idx = np.arange(batch_size, dtype=int)
+                 self._past_length = 0
+         past_len = self._get_past_length(past_key_values)
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:])
+
+             if hasattr(self.config, "scale_emb"):
+                 inputs_embeds = inputs_embeds * self.config.scale_emb
+         inputs["inputs_embeds"] = inputs_embeds
+
+         # Add the attention_mask inputs when needed
+         if "attention_mask" in self.input_names or "position_ids" in self.input_names:
+             if attention_mask is not None:
+                 attention_mask = np.array(attention_mask)
+             else:
+                 attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int)
+
+             if "attention_mask" in self.input_names:
+                 inputs["attention_mask"] = attention_mask
+
+             if "position_ids" in self.input_names:
+                 if position_ids is not None:
+                     position_ids = np.array(position_ids)
+                 else:
+                     position_ids = np.cumsum(attention_mask, axis=1) - 1
+                     position_ids[attention_mask == 0] = 1
+                     if past_key_values:
+                         position_ids = position_ids[:, -input_ids.shape[1]:]
+
+                 inputs["position_ids"] = position_ids
+
+         if "beam_idx" in self.input_names:
+             inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
+
+         return inputs
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         attention_mask: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.LongTensor] = None,
+         **kwargs,
+     ):
+         self.compile()
+
+         inputs = self.prepare_inputs(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             position_ids=position_ids,
+             inputs_embeds=inputs_embeds,
+             **kwargs,
+         )
+
+         # Run inference
+         self.request.start_async(inputs, share_inputs=True)
+         self.request.wait()
+         logits = self.request.get_tensor("logits").data
+         logits = torch.from_numpy(logits).to(self.device)
+         past_key_values = ((),)
+         self._past_length += inputs["inputs_embeds"].shape[1]
+
+         return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)
+
+     # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+         # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+         attention_mask = kwargs.get("attention_mask", None)
+         use_cache = kwargs.get("use_cache", None)
+
+         if past_key_values is not None:
+             past_len = self._get_past_length(past_key_values)
+             # Keep only the unprocessed tokens:
+             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+             #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
+             if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_len):]
+             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+             #     input_ids based on the past_length.
+             elif input_ids is not None and past_len < input_ids.shape[1]:
+                 input_ids = input_ids[:, past_len:]
+             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens
+         position_ids = kwargs.get("position_ids", None)
+         if attention_mask is not None and position_ids is None and "position_ids" in self.input_names:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values and input_ids is not None:
+                 position_ids = position_ids[:, -input_ids.shape[1]:]
+
+         model_inputs = {
+             "input_ids": input_ids,
+             "past_key_values": past_key_values,
+             "use_cache": use_cache,
+             "position_ids": position_ids,
+             "attention_mask": attention_mask,
+             "inputs_embeds": inputs_embeds if past_key_values is None else None,
+         }
+
+         return model_inputs
+
+     def _get_past_length(self, past_key_values=None):
+         if past_key_values is None:
+             return 0
+         return self._past_length
+
+     # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache
+     def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
+         """
+         This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+         [`~PreTrainedModel.beam_sample`] is called.
+         This is required to match `past_key_values` with the correct beam_idx at every generation step.
+         """
+         self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
+         return past_key_values
+
+     def can_generate(self):
+         """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
+         return True
+
+     def __call__(self, *args, **kwargs):
+         return self.forward(*args, **kwargs)
+
+
+ class OVGotOcrModel(GenerationMixin):
+     def __init__(self, model_dir, device, ov_config=None, compression_configuration=None):
+         model_dir = Path(model_dir)
+         self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True, local_files_only=True)
+         self.generation_config = GenerationConfig.from_model_config(self.config)
+         self.vision_tower_high = core.compile_model(model_dir / VISION_TOWER_HIGH_NAME, device, ov_config)
+         self.mm_projector_vary = core.compile_model(model_dir / PROJECTOR_VARY_NAME, device, ov_config)
+         self.embed_tokens = core.compile_model(model_dir / TEXT_EMBEDDING_NAME, device)
+         self.lm_head = core.compile_model(model_dir / LM_HAED_NAME, device)
+         self.language_model = OvModelForCausalLMWithEmb(model_dir, device, self.config, ov_config)
+         self.main_input_name = "input_ids"
+         self.device = torch.device("cpu")
+         self._supports_cache_class = False
+         self.next_beam_idx = None
+         self._past_length = None
+         self.first = True
+         self.im_start_token = self.config.im_start_token
+
+     def can_generate(self):
+         """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
+         return True
+
+     def __call__(self, *args, **kwargs) -> CausalLMOutputWithPast:
+         return self.forward(
+             *args,
+             **kwargs,
+         )
+
+     def _reorder_cache(self, *args, **kwargs) -> Tuple[Tuple[torch.Tensor]]:
+         """
+         This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
+         [`~PreTrainedModel.beam_sample`] is called.
+         This is required to match `past_key_values` with the correct beam_idx at every generation step.
+         """
+         return self.language_model._reorder_cache(*args, **kwargs)
+
+     def prepare_inputs_for_generation(
+         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+     ):
+         # Omit tokens covered by past_key_values
+         if past_key_values is not None:
+             cache_length = past_length = self.language_model._get_past_length(past_key_values)
+             max_cache_length = None
+
+             # Keep only the unprocessed tokens:
+             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+             #     some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as input)
+             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
+             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+             #     input_ids based on the past_length.
+             elif past_length < input_ids.shape[1]:
+                 input_ids = input_ids[:, past_length:]
+             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+             # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+             if (
+                 max_cache_length is not None
+                 and attention_mask is not None
+                 and cache_length + input_ids.shape[1] > max_cache_length
+             ):
+                 attention_mask = attention_mask[:, -max_cache_length:]
+
+         position_ids = kwargs.get("position_ids", None)
+         if attention_mask is not None and position_ids is None:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values:
+                 position_ids = position_ids[:, -input_ids.shape[1]:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {"inputs_embeds": inputs_embeds}
+         else:
+             model_inputs = {"input_ids": input_ids}
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "past_key_values": past_key_values,
+                 "use_cache": kwargs.get("use_cache"),
+                 "attention_mask": attention_mask,
+                 "images": kwargs.get("images", None),
+             }
+         )
+         return model_inputs
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         if inputs_embeds is None:
+             inputs_embeds = torch.from_numpy(self.language_model.embed_tokens(input_ids))
+
+         if self.vision_tower_high is not None and (input_ids.shape[1] != 1) and images is not None:
+             use_im_start_end = getattr(self.config, "use_im_start_end", -1)
+
+             vision_select_layer = getattr(self.config, "vision_select_layer", -1)
+             im_patch_token = getattr(self.config, "im_patch_token", -1)
+             im_start_token = getattr(self.config, "im_start_token", -1)
+             im_end_token = getattr(self.config, "im_end_token", -1)
+             freeze_vision_tower = getattr(self.config, "freeze_vision_tower", False)
+
+             im_patch_token = 151859
+
+             im_start_token = 151857
+
+             im_end_token = 151858
+
+             image_features = []
+
+             for image in images:
+                 P, C, H, W = image.shape
+                 if P == 1:
+                     with torch.set_grad_enabled(False):
+                         cnn_feature = self.vision_tower_high(image)[0]
+                         cnn_feature = torch.from_numpy(cnn_feature).flatten(2).permute(0, 2, 1).numpy()  # 256*1024
+                     image_feature = self.mm_projector_vary(cnn_feature)[0]
+                     image_features.append(torch.from_numpy(image_feature))
+
+                 else:
+                     image_patches = torch.unbind(image)
+                     image_patches_features = []
+                     for image_patch in image_patches:
+                         image_p = torch.stack([image_patch])
+
+                         with torch.set_grad_enabled(False):
+                             cnn_feature_p = self.vision_tower_high(image_p)[0]
+                             cnn_feature_p = torch.from_numpy(cnn_feature_p).flatten(2).permute(0, 2, 1).numpy()
+                         image_feature_p = self.mm_projector_vary(cnn_feature_p)[0]
+                         image_patches_features.append(torch.from_numpy(image_feature_p))
+                     image_feature = torch.cat(image_patches_features, dim=1)
+                     image_features.append(image_feature)
+
+             dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+             dummy_image_features = dummy_image_features_2
+             use_im_start_end = True
+             new_input_embeds = []
+             for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
+                 if (cur_input_ids == im_patch_token).sum() == 0:
+                     cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
+                     new_input_embeds.append(cur_input_embeds)
+                     continue
+
+                 if use_im_start_end:
+                     if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
+                         raise ValueError("The number of image start tokens and image end tokens should be the same.")
+
+                     image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
+                     for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
+                         per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
+                         num_patches = per_cur_image_features.shape[0]
+
+                         if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
+                             raise ValueError("The image end token should follow the image start token.")
+
+                         cur_input_embeds = torch.cat(
+                             (
+                                 cur_input_embeds[:image_start_token_pos + 1],
+                                 per_cur_image_features,
+                                 cur_input_embeds[image_start_token_pos + num_patches + 1:]
+                             ),
+                             dim=0
+                         )
+
+                     new_input_embeds.append(cur_input_embeds)
+                 else:
+                     raise NotImplementedError
+
+             inputs_embeds = torch.stack(new_input_embeds, dim=0)
+
+         outputs = self.language_model(
+             None, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=True
+         )
+         logits = outputs[0]
+         logits = self.lm_head(logits[0])[0]
+         logits = torch.from_numpy(logits).to(self.device)
+         logits = logits.unsqueeze(0)
+
+         return CausalLMOutputWithPast(
+             loss=None,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+         )
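
For orientation, the `openvino_lm_head_model.xml` IR added below is a single 1024-to-151860 matmul (the vocabulary projection) that `OVGotOcrModel.forward` applies to the language model's hidden states. A minimal standalone sketch of running it follows; the random input is purely illustrative and not part of the commit.

```python
# Hedged sketch, not part of the commit: running the lm_head IR on its own.
import numpy as np
import openvino as ov

core = ov.Core()
lm_head = core.compile_model("weight/openvino_lm_head_model.xml", "CPU")

hidden_states = np.random.rand(5, 1024).astype(np.float32)  # 5 token positions, hidden size 1024
logits = lm_head(hidden_states)[0]  # first (and only) model output
print(logits.shape)                 # (5, 151860): one score per vocabulary entry
```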
weight/openvino_language_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dff5d3046cd2ece93d5acd35f02acb950039dc9b1c64354dbd2e7031f9735d8
+ size 161798428
weight/openvino_language_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
weight/openvino_lm_head_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89c9f07b311b28e94dd9500ad721920be96af641c3c696e582001d631de9cef4
+ size 311009280
weight/openvino_lm_head_model.xml ADDED
@@ -0,0 +1,81 @@
+ <?xml version="1.0"?>
+ <net name="Model8839" version="11">
+ <layers>
+ <layer id="0" name="input" type="Parameter" version="opset1">
+ <data shape="?,?" element_type="f32" />
+ <output>
+ <port id="0" precision="FP32" names="input">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="1" name="self.weight" type="Const" version="opset1">
+ <data element_type="bf16" shape="151860, 1024" offset="0" size="311009280" />
+ <output>
+ <port id="0" precision="BF16" names="self.weight">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="2" name="ov_ext::linear/ConvertLike" type="Convert" version="opset1">
+ <data destination_type="f32" />
+ <rt_info>
+ <attribute name="decompression" version="0" />
+ </rt_info>
+ <input>
+ <port id="0" precision="BF16">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="1" precision="FP32">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="3" name="ov_ext::linear/MatMul" type="MatMul" version="opset1">
+ <data transpose_a="false" transpose_b="true" />
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ <port id="1" precision="FP32">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="2" precision="FP32">
+ <dim>-1</dim>
+ <dim>151860</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="4" name="Result_1246350" type="Result" version="opset1">
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>151860</dim>
+ </port>
+ </input>
+ </layer>
+ </layers>
+ <edges>
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+ <edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+ <edge from-layer="3" from-port="2" to-layer="4" to-port="0" />
+ </edges>
+ <rt_info>
+ <Runtime_version value="2025.0.0-17942-1f68be9f594-releases/2025/0" />
+ <conversion_parameters>
+ <framework value="pytorch" />
+ <is_python_object value="True" />
+ </conversion_parameters>
+ </rt_info>
+ </net>
weight/openvino_projector_vary_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd69d514aac29cfa24c6fb34ef6b82b9c64d7ddf8b9636a135201a6cd4e8a4d4
+ size 2099200
weight/openvino_projector_vary_model.xml ADDED
@@ -0,0 +1,132 @@
+ <?xml version="1.0"?>
+ <net name="Model8836" version="11">
+ <layers>
+ <layer id="0" name="input" type="Parameter" version="opset1">
+ <data shape="?,?,?" element_type="f32" />
+ <output>
+ <port id="0" precision="FP32" names="input">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="1" name="self.weight" type="Const" version="opset1">
+ <data element_type="bf16" shape="1024, 1024" offset="0" size="2097152" />
+ <output>
+ <port id="0" precision="BF16" names="self.weight">
+ <dim>1024</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="2" name="ov_ext::linear/ConvertLike" type="Convert" version="opset1">
+ <data destination_type="f32" />
+ <rt_info>
+ <attribute name="decompression" version="0" />
+ </rt_info>
+ <input>
+ <port id="0" precision="BF16">
+ <dim>1024</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="1" precision="FP32">
+ <dim>1024</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="3" name="ov_ext::linear/MatMul" type="MatMul" version="opset1">
+ <data transpose_a="false" transpose_b="true" />
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ <port id="1" precision="FP32">
+ <dim>1024</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="2" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="4" name="self.bias" type="Const" version="opset1">
+ <data element_type="bf16" shape="1024" offset="2097152" size="2048" />
+ <output>
+ <port id="0" precision="BF16" names="self.bias">
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="5" name="ov_ext::linear/ConvertLike_1" type="Convert" version="opset1">
+ <data destination_type="f32" />
+ <rt_info>
+ <attribute name="decompression" version="0" />
+ </rt_info>
+ <input>
+ <port id="0" precision="BF16">
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="1" precision="FP32">
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="6" name="ov_ext::linear/Add" type="Add" version="opset1">
+ <data auto_broadcast="numpy" />
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ <port id="1" precision="FP32">
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="2" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="7" name="Result_1244540" type="Result" version="opset1">
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ </layer>
+ </layers>
+ <edges>
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+ <edge from-layer="2" from-port="1" to-layer="3" to-port="1" />
+ <edge from-layer="3" from-port="2" to-layer="6" to-port="0" />
+ <edge from-layer="4" from-port="0" to-layer="5" to-port="0" />
+ <edge from-layer="5" from-port="1" to-layer="6" to-port="1" />
+ <edge from-layer="6" from-port="2" to-layer="7" to-port="0" />
+ </edges>
+ <rt_info>
+ <Runtime_version value="2025.0.0-17942-1f68be9f594-releases/2025/0" />
+ <conversion_parameters>
+ <framework value="pytorch" />
+ <is_python_object value="True" />
+ </conversion_parameters>
+ </rt_info>
+ </net>
weight/openvino_text_embeddings_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:acfbb7cf3eaf87dbd8172828ae169bde88f7871c96cbb15af3313afc198015e4
+ size 311009284
weight/openvino_text_embeddings_model.xml ADDED
@@ -0,0 +1,107 @@
+ <?xml version="1.0"?>
+ <net name="Model0" version="11">
+ <layers>
+ <layer id="0" name="input" type="Parameter" version="opset1">
+ <data shape="?,?" element_type="i64" />
+ <output>
+ <port id="0" precision="I64" names="input">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="1" name="self.weight" type="Const" version="opset1">
+ <data element_type="bf16" shape="151860, 1024" offset="0" size="311009280" />
+ <output>
+ <port id="0" precision="BF16" names="self.weight">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="2" name="ov_ext::embedding/Convert" type="Convert" version="opset1">
+ <data destination_type="f32" />
+ <rt_info>
+ <attribute name="decompression" version="0" />
+ </rt_info>
+ <input>
+ <port id="0" precision="BF16">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ <output>
+ <port id="1" precision="FP32">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="3" name="ov_ext::embedding/Convert_1" type="Convert" version="opset1">
+ <data destination_type="i32" />
+ <input>
+ <port id="0" precision="I64">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ </input>
+ <output>
+ <port id="1" precision="I32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="4" name="ov_ext::embedding/Constant" type="Const" version="opset1">
+ <data element_type="i32" shape="" offset="311009280" size="4" />
+ <output>
+ <port id="0" precision="I32" />
+ </output>
+ </layer>
+ <layer id="5" name="ov_ext::embedding/Gather" type="Gather" version="opset8">
+ <data batch_dims="0" />
+ <input>
+ <port id="0" precision="FP32">
+ <dim>151860</dim>
+ <dim>1024</dim>
+ </port>
+ <port id="1" precision="I32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ </port>
+ <port id="2" precision="I32" />
+ </input>
+ <output>
+ <port id="3" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ </output>
+ </layer>
+ <layer id="6" name="Result_9" type="Result" version="opset1">
+ <input>
+ <port id="0" precision="FP32">
+ <dim>-1</dim>
+ <dim>-1</dim>
+ <dim>1024</dim>
+ </port>
+ </input>
+ </layer>
+ </layers>
+ <edges>
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
+ <edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
+ <edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
+ <edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
+ <edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
+ </edges>
+ <rt_info>
+ <Runtime_version value="2025.0.0-17942-1f68be9f594-releases/2025/0" />
+ <conversion_parameters>
+ <framework value="pytorch" />
+ <is_python_object value="True" />
+ </conversion_parameters>
+ </rt_info>
+ </net>
weight/openvino_vision_tower_high_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1216dd9cd19adce02fa1d8c40f28e57d568a68123788b2ca84bafe357ac7d2f3
+ size 59177496
weight/openvino_vision_tower_high_model.xml ADDED
The diff for this file is too large to render. See raw diff