Upload folder using huggingface_hub
README.md (CHANGED)
@@ -78,7 +78,7 @@ Here, we have conducted only a simple performance comparison. For more detailed
 
 We provide an example code to run InternVL-Chat-V1-1 using `transformers`.
 
-We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
+We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
 
 > Please use transformers==4.37.2 to ensure the model works normally.
 
@@ -172,9 +172,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
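For orientation, here is a minimal, self-contained sketch of the pure-text conversation flow this hunk touches, with the updated tokenizer and generation settings in place. The `path` value is an assumed repository id; the surrounding lines simply follow the README fragments shown in the hunks.

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Assumed model id; substitute a local path if the weights are already downloaded.
path = 'OpenGVLab/InternVL-Chat-V1-1'

model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
# Slow tokenizer, as required by the updated README line.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Greedy decoding with at most 1024 new tokens, matching the new generation_config.
generation_config = dict(max_new_tokens=1024, do_sample=False)

question = 'Hello, who are you?'
# pixel_values=None means a pure-text turn; return_history=True keeps the dialogue state.
response, history = model.chat(tokenizer, None, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```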
@@ -199,13 +199,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
 print(f'User: {question}')
@@ -225,13 +225,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
@@ -258,7 +258,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -267,7 +267,7 @@ image2 = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
@@ -296,7 +296,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -306,7 +306,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
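The `num_patches_list` argument in this hunk is what lets `model.chat` map each `<image>` placeholder in the prompt to the right slice of the concatenated `pixel_values`. A minimal sketch of that bookkeeping, with names reused from the fragment above and tensor shapes assumed from the 448x448 preprocessing shown earlier:

```python
# Each preprocessed image contributes N_i rows of shape (3, 448, 448) to pixel_values.
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# One entry per <image> placeholder, in prompt order, giving that image's patch count.
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
assert sum(num_patches_list) == pixel_values.size(0)
```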
@@ -333,7 +333,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -343,7 +343,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
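After `batch_chat` returns, each entry of `responses` lines up with the question at the same index. A short usage sketch, reusing the variable names from the fragment above:

```python
# Print each prompt next to the answer generated for it.
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')
```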
@@ -403,9 +403,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8)
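The video fragment calls a `load_video` helper that is defined elsewhere in the README and not shown in this diff. As a rough illustration only, a simplified helper with the same call signature might sample frames uniformly and preprocess them like the single-image examples; the decord dependency, the reuse of the earlier `image_processor`, and the one-patch-per-frame assumption below are mine, not the README's:

```python
import numpy as np
import torch
from PIL import Image
from decord import VideoReader, cpu  # assumed video-decoding backend

def load_video(video_path, num_segments=8):
    # Decode the video and pick num_segments evenly spaced frame indices.
    vr = VideoReader(video_path, ctx=cpu(0))
    indices = np.linspace(0, len(vr) - 1, num_segments).astype(np.int64)
    frames = [Image.fromarray(vr[int(i)].asnumpy()).resize((448, 448)) for i in indices]
    # Reuses the CLIPImageProcessor created earlier in the README examples.
    pixel_values = image_processor(images=frames, return_tensors='pt').pixel_values
    pixel_values = pixel_values.to(torch.bfloat16).cuda()
    # One 448x448 patch per frame under this simplified preprocessing.
    num_patches_list = [1] * num_segments
    return pixel_values, num_patches_list
```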
@@ -436,7 +436,7 @@ from threading import Thread
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
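The hunk above only sets up the streamer and the background chat call; the part that actually consumes the stream is unchanged by this commit. A sketch of that consumer loop, assuming `thread` and `streamer` from the fragment above:

```python
# Launch generation in the background, then read text chunks as they arrive.
thread.start()
generated_text = ''
for new_text in streamer:
    generated_text += new_text
    print(new_text, end='', flush=True)
thread.join()
```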