Upload folder using huggingface_hub
README.md CHANGED
@@ -78,7 +78,7 @@ Here, we have conducted only a simple performance comparison. For more detailed
 
 We provide an example code to run InternVL-Chat-V1-1 using `transformers`.
 
-We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
+We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
 
 > Please use transformers==4.37.2 to ensure the model works normally.
 
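For convenience, a quick runtime check of the pinned version can catch mismatches before the model is loaded. This is a small illustrative sketch, not part of the README itself:

```python
import transformers

# The README pins transformers==4.37.2; fail fast if a different version is installed.
assert transformers.__version__ == '4.37.2', (
    f'Expected transformers==4.37.2, found {transformers.__version__}')
```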
@@ -172,9 +172,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
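Stitched together with the surrounding README lines, the updated pure-text example would read roughly as follows. The repo id assigned to `path` is an assumption here (substitute your own checkpoint path); everything else mirrors the lines shown in the hunk above:

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL-Chat-V1-1'  # assumed repo id; replace with your local path if needed
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
# use_fast=False keeps the slow (Python) tokenizer, as added in this commit.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Greedy decoding with at most 1024 new tokens, as in the updated README.
generation_config = dict(max_new_tokens=1024, do_sample=False)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}')
print(f'Assistant: {response}')
```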
@@ -199,13 +199,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
 print(f'User: {question}')
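The single-image hunks follow the same pattern. Continuing from the model and tokenizer loaded in the sketch above, the image branch of the updated example looks roughly like this (assembled from the context lines; the final `print` of the response is added for illustration):

```python
import torch
from PIL import Image
from transformers import CLIPImageProcessor

# Preprocess one 448x448 image into bfloat16 pixel values on the GPU.
image_processor = CLIPImageProcessor.from_pretrained(path)
image = Image.open('./examples/image2.jpg').resize((448, 448))
pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=1024, do_sample=False)
# The <image> placeholder in the prompt marks where the visual tokens are inserted.
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}')
print(f'Assistant: {response}')
```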
@@ -225,13 +225,13 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
 print(f'User: {question}')
@@ -258,7 +258,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -267,7 +267,7 @@ image2 = Image.open('./examples/image2.jpg').resize((448, 448))
 pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
@@ -296,7 +296,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -306,7 +306,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
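This hunk and the previous one belong to the multi-image, multi-round conversation example. Put back together (again with an assumed `path`, and with the `pixel_values1` line reconstructed symmetrically to the `pixel_values2` line shown in the context), the updated flow is roughly:

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor

path = 'OpenGVLab/InternVL-Chat-V1-1'  # assumed repo id
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

image_processor = CLIPImageProcessor.from_pretrained(path)
image1 = Image.open('./examples/image1.jpg').resize((448, 448))
image2 = Image.open('./examples/image2.jpg').resize((448, 448))
pixel_values1 = image_processor(images=image1, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values.to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# One entry per image: how many patches in pixel_values belong to each <image> placeholder.
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

generation_config = dict(max_new_tokens=1024, do_sample=False)
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}')
print(f'Assistant: {response}')
```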
@@ -333,7 +333,7 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
 image_processor = CLIPImageProcessor.from_pretrained(path)
 image1 = Image.open('./examples/image1.jpg').resize((448, 448))
@@ -343,7 +343,7 @@ pixel_values2 = image_processor(images=image2, return_tensors='pt').pixel_values
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
@@ -403,9 +403,9 @@ model = AutoModel.from_pretrained(
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
 
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False)
 
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8)
@@ -436,7 +436,7 @@ from threading import Thread
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
-generation_config = dict(
+generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
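For the streaming hunk, only the generation configuration changes; the consumption side is not shown here. Below is a minimal sketch of how the streamer is typically drained, assuming `model`, `tokenizer`, `pixel_values`, and `question` from the examples above, and assuming that `generation_config` is the remaining keyword argument passed to `model.chat` (the hunk cuts off before the full kwargs):

```python
from threading import Thread
from transformers import TextIteratorStreamer

# Initialize the streamer (as in the README).
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Route the streamer through the generation configuration, as this commit does.
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)

# Run model.chat in a background thread so the main thread can print tokens as they arrive.
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
    generation_config=generation_config))
thread.start()

# Standard TextIteratorStreamer consumption loop (illustrative, not part of the diff).
generated_text = ''
for new_text in streamer:
    generated_text += new_text
    print(new_text, end='', flush=True)
thread.join()
```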