khang119966 committed
Commit d28b41c · verified · 1 Parent(s): 49b8ae9

Update README.md

Files changed (1):
  1. README.md +148 -1
README.md CHANGED

## Quickstart

The code snippet below shows how to load the tokenizer and the model and how to generate content.
To run inference with the model, you can also follow the steps outlined in our Colab inference notebook
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZD1oB56PF0lF66RCuTVJYLTEV0tM3CFf?usp=sharing)

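Before running the snippet, a quick environment check can save time. The sketch below is not part of the original Quickstart: the package list is inferred from the imports in the snippet, versions are not pinned by this README, and a CUDA GPU is assumed because the model is moved to the device with `.cuda()`.

```python
# Minimal environment check (sketch). Package names are inferred from the
# imports used in the Quickstart snippet; this README does not pin versions.
import importlib.util

import torch

for pkg in ("torch", "torchvision", "transformers", "PIL", "numpy"):
    assert importlib.util.find_spec(pkg) is not None, f"missing dependency: {pkg}"

# The Quickstart calls .eval().cuda(), so a CUDA-capable device is expected.
assert torch.cuda.is_available(), "no CUDA device detected"
print(torch.cuda.get_device_name(0))
```

If this check passes, the snippet below should run as-is, provided the GPU has enough memory to hold a 3B-parameter model in bfloat16.
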
```python
import numpy as np
import torch
import torchvision.transforms as T
# from decord import VideoReader, cpu  # only needed for video inputs
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

model = AutoModel.from_pretrained(
    "5CD-AI/Vintern-3B-R-beta",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    use_flash_attn=False,
).eval().cuda()

tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-3B-R-beta", trust_remote_code=True, use_fast=False)

test_image = 'test-image.jpg'

# Vietnamese prompt template for the structured "reasoning" answer format.
# Rough gloss: "You are very careful and skeptical; answer the question below in
# Vietnamese, listing evidence for the plausible answers and explaining what you
# keep or reject before giving the final answer, using the tags below."
# (A usage sketch follows this code block.)
think_prompt_format = """<image>\nBạn là người rất cẩn thận và đa nghi, vui lòng trả lời câu hỏi dưới đây bằng tiếng Việt. Khi suy luận bạn thường liệt kê ra các bằng chứng để chỉ ra các đáp án khả thi, suy luận và giải thích tại sao lại lựa chọn và loại bỏ trước khi đưa ra câu trả lời cuối cùng.
Câu hỏi:
{question_input}
Hãy trả lời rất dài theo định dạng sau:
<SUMMARY>...</SUMMARY>
<CAPTION>...</CAPTION>
<INFORMATION_EXTRACT>...</INFORMATION_EXTRACT>
<EXTERNAL_KNOWLEDGE_EXPANSION>...</EXTERNAL_KNOWLEDGE_EXPANSION>
<FIND_CANDIDATES_REASONING>...</FIND_CANDIDATES_REASONING>
<TOP3_CANDIDATES>...</TOP3_CANDIDATES>
<REASONING_PLAN>...</REASONING_PLAN>
<REASONING>...</REASONING>
<COUNTER_ARGUMENTS>...</COUNTER_ARGUMENTS>
<VALIDATION_REASONING>...</VALIDATION_REASONING>
<CONCLUSION>...</CONCLUSION>
"""

pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=False, num_beams=3, repetition_penalty=2.5)

# "Extract the main information in the image and return it as markdown."
question = '<image>\nTrích xuất thông tin chính trong ảnh và trả về dạng markdown.'

response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# Follow-up turns can reuse the returned history:
# question = "Câu hỏi khác ......"  # "Another question ......"
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')
```
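
The `think_prompt_format` template above is defined but never exercised in the snippet. The sketch below (not part of the original README) shows one way to plug it into `model.chat`; it reuses `model`, `tokenizer`, `pixel_values`, and `generation_config` from the Quickstart, and the question text is only an illustrative placeholder.

```python
# Sketch: wrap a question in the structured "think" template defined above.
# The placeholder question means "What is the main content of this document?".
question_input = "Nội dung chính của tài liệu này là gì?"
think_question = think_prompt_format.format(question_input=question_input)

response, history = model.chat(
    tokenizer,
    pixel_values,
    think_question,
    generation_config,
    history=None,
    return_history=True,
)
print(f'User: {question_input}\nAssistant: {response}')
```

With this prompt the model is asked to answer in the tagged sections requested by the template, from `<SUMMARY>` through `<CONCLUSION>`; `max_new_tokens` may need to be raised if the structured answer gets truncated.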

## Citation

```
@misc{doan2024vintern1befficientmultimodallarge,
      title={Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese},
      author={Khang T. Doan and Bao G. Huynh and Dung T. Hoang and Thuc D. Pham and Nhat H. Pham and Quan T. M. Nguyen and Bang Q. Vo and Suong N. Hoang},
      year={2024},
      eprint={2408.12480},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2408.12480},
}
```