Upload folder using huggingface_hub
README.md CHANGED
@@ -78,7 +78,7 @@ Here, we have conducted only a simple performance comparison. For more detailed
 
 We provide an example code to run InternVL-Chat-V1-1 using `transformers`.
 
-We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
+We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
 
 > Please use transformers==4.37.2 to ensure the model works normally.
 
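For convenience, a quick runtime check of the pinned version can catch mismatches before the model is loaded. This is a small illustrative sketch, not part of the README itself:

```python
import transformers

# The README pins transformers==4.37.2; fail fast if a different version is installed.
assert transformers.__version__ == '4.37.2', (
    f'Expected transformers==4.37.2, found {transformers.__version__}')
```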
@@ -172,9 +172,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
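Stitched together with the surrounding README lines, the updated pure-text example would read roughly as follows. The repo id assigned to `path` is an assumption here (substitute your own checkpoint path); everything else mirrors the lines shown in the hunk above:

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL-Chat-V1-1'  # assumed repo id; replace with your local path if needed
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
# use_fast=False keeps the slow (Python) tokenizer, as added in this commit.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Greedy decoding with at most 1024 new tokens, as in the updated README.
generation_config = dict(max_new_tokens=1024, do_sample=False)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}')
print(f'Assistant: {response}')
```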
@@ -199,13 +199,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
 print(f'User: {question}')
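The single-image hunks follow the same pattern. Continuing from the model and tokenizer loaded in the sketch above, the image branch of the updated example looks roughly like this (assembled from the context lines; the final `print` of the response is added for illustration):

```python
import torch
from PIL import Image
from transformers import CLIPImageProcessor

# Preprocess one 448x448 image into bfloat16 pixel values on the GPU.
image_processor = CLIPImageProcessor.from_pretrained(path)
image = Image.open('./examples/image2.jpg').resize((448, 448))
pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=1024, do_sample=False)
# The <image> placeholder in the prompt marks where the visual tokens are inserted.
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}')
print(f'Assistant: {response}')
```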
@@ -225,13 +225,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
@@ -258,7 +258,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -267,7 +267,7 @@ image2 = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
@@ -296,7 +296,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -306,7 +306,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
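This hunk and the previous one belong to the multi-image, multi-round conversation example. Put back together (again with an assumed `path`, and with the `pixel_values1` line reconstructed symmetrically to the `pixel_values2` line shown in the context), the updated flow is roughly:

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor

path = 'OpenGVLab/InternVL-Chat-V1-1'  # assumed repo id
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

image_processor = CLIPImageProcessor.from_pretrained(path)
image1 = Image.open('./examples/image1.jpg').resize((448, 448))
image2 = Image.open('./examples/image2.jpg').resize((448, 448))
pixel_values1 = image_processor(images=image1, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# One entry per image: how many patches in pixel_values belong to each <image> placeholder.
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

generation_config = dict(max_new_tokens=1024, do_sample=False)
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}')
print(f'Assistant: {response}')
```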
@@ -333,7 +333,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -343,7 +343,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
@@ -403,9 +403,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8)
@@ -436,7 +436,7 @@ from threading import Thread
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
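For the streaming hunk, only the generation configuration changes; the consumption side is not shown here. Below is a minimal sketch of how the streamer is typically drained, assuming `model`, `tokenizer`, `pixel_values`, and `question` from the examples above, and assuming that `generation_config` is the remaining keyword argument passed to `model.chat` (the hunk cuts off before the full kwargs):

```python
from threading import Thread
from transformers import TextIteratorStreamer

# Initialize the streamer (as in the README).
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Route the streamer through the generation configuration, as this commit does.
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)

# Run model.chat in a background thread so the main thread can print tokens as they arrive.
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
    generation_config=generation_config))
thread.start()

# Standard TextIteratorStreamer consumption loop (illustrative, not part of the diff).
generated_text = ''
for new_text in streamer:
    generated_text += new_text
    print(new_text, end='', flush=True)
thread.join()
```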