VARGPT-family committed
Commit 51ff6cc · verified · 1 Parent(s): fcee4cf

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,3 +1,118 @@
- ---
- license: apache-2.0
- ---
+ <h3>VARGPT: Unified Understanding and Generation in a Visual Autoregressive Multimodal Large Language Model</h3>
+ 
+ VARGPT (7B+2B) models understanding and generation as two distinct paradigms within a unified model: **predicting the next token for visual understanding and predicting the next scale for visual generation**.
+ 
+ We provide a simple inference walkthrough for using our model. For more details, please refer to the GitHub repository.
+ 
+ ### Multimodal Understanding
+ Inference demo for **Multimodal Understanding**. You can execute the following code:
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import AutoTokenizer
+ from vargpt_llava.modeling_vargpt_llava import VARGPTLlavaForConditionalGeneration
+ from vargpt_llava.prepare_vargpt_llava import prepare_vargpt_llava
+ from vargpt_llava.processing_vargpt_llava import VARGPTLlavaProcessor
+ from patching_utils.patching import patching
+ 
+ model_id = "VARGPT_LLaVA-v1"
+ prepare_vargpt_llava(model_id)
+ 
+ # Load the checkpoint in float32 on GPU 0, then apply the VARGPT patches.
+ model = VARGPTLlavaForConditionalGeneration.from_pretrained(
+     model_id,
+     torch_dtype=torch.float32,
+     low_cpu_mem_usage=True,
+ ).to(0)
+ patching(model)
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ processor = VARGPTLlavaProcessor.from_pretrained(model_id)
+ 
+ # Define a chat history and use `apply_chat_template` to get the correctly formatted prompt.
+ # Each value in "content" has to be a list of dicts with types ("text", "image").
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "Please explain the meme in detail."},
+             {"type": "image"},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+ image_file = "./assets/llava_bench_demo.png"
+ print(prompt)
+ 
+ raw_image = Image.open(image_file)
+ inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float32)
+ 
+ output = model.generate(
+     **inputs,
+     max_new_tokens=2048,
+     do_sample=False)
+ 
+ print(processor.decode(output[0], skip_special_tokens=True))
+ ```
+ ### Multimodal Generation
+ 
+ Inference demo for **Text-to-Image Generation**. You can execute the following code:
+ ```python
+ import torch
+ from transformers import AutoTokenizer
+ from vargpt_llava.modeling_vargpt_llava import VARGPTLlavaForConditionalGeneration
+ from vargpt_llava.prepare_vargpt_llava import prepare_vargpt_llava
+ from vargpt_llava.processing_vargpt_llava import VARGPTLlavaProcessor
+ from patching_utils.patching import patching
+ 
+ model_id = "VARGPT_LLaVA-v1"
+ prepare_vargpt_llava(model_id)
+ 
+ model = VARGPTLlavaForConditionalGeneration.from_pretrained(
+     model_id,
+     torch_dtype=torch.float32,
+     low_cpu_mem_usage=True,
+ ).to(0)
+ patching(model)
+ 
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ processor = VARGPTLlavaProcessor.from_pretrained(model_id)
+ 
+ # Some instruction examples:
+ # Please design a drawing of a butterfly on a flower.
+ # Please create a painting of a black weasel is standing in the grass.
+ # Can you generate a rendered photo of a rabbit sitting in the grass.
+ # I need a designed photo of a lighthouse is seen in the distance.
+ # Please create a rendered drawing of an old photo of an aircraft carrier in the water.
+ # Please produce a designed photo of a squirrel is standing in the snow.
+ 
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "Please design a drawing of a butterfly on a flower."},
+         ],
+     },
+ ]
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+ print(prompt)
+ 
+ inputs = processor(text=prompt, return_tensors='pt').to(0, torch.float32)
+ # Path where the generated image will be written.
+ model._IMAGE_GEN_PATH = "output.png"
+ output = model.generate(
+     **inputs,
+     max_new_tokens=2048,
+     do_sample=False)
+ 
+ print(processor.decode(output[0], skip_special_tokens=True))
+ ```
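For reference, a minimal follow-up sketch (not part of the original README): the text-to-image demo writes the synthesized image to the path set on `model._IMAGE_GEN_PATH`, so it can simply be reopened with PIL afterwards.

```python
from PIL import Image

# The generation demo above saves the synthesized image to this path.
generated = Image.open("output.png")
print(generated.size)
generated.show()
```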
added_tokens.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "<image>": 32000,
+   "<pad>": 32001,
+   "<|image_gen_end|>": 32003,
+   "<|image_gen_pad|>": 32004,
+   "<|image_gen_start|>": 32002
+ }
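These are the vocabulary entries added on top of the Vicuna tokenizer: the LLaVA-style `<image>` and `<pad>` tokens plus three control tokens that, by their names, bracket and pad the visual-generation span. A minimal sketch (assuming the checkpoint has been downloaded to a local `VARGPT_LLaVA-v1` directory, as in the README demo) to confirm the mapping:

```python
from transformers import AutoTokenizer

# Hypothetical local checkpoint directory, as used in the README demo.
tokenizer = AutoTokenizer.from_pretrained("VARGPT_LLaVA-v1")

# These should resolve to ids 32000-32004, matching added_tokens.json.
for token in ["<image>", "<pad>", "<|image_gen_start|>", "<|image_gen_end|>", "<|image_gen_pad|>"]:
    print(token, tokenizer.convert_tokens_to_ids(token))
```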
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 11.995625227852717,
+   "total_flos": 1.7779223938308506e+17,
+   "train_loss": 6.699234984606801,
+   "train_runtime": 169455.6516,
+   "train_samples_per_second": 99.441,
+   "train_steps_per_second": 0.097
+ }
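As a rough, back-of-the-envelope reading of these logs (not an official figure), the throughput numbers imply roughly 99.441 / 0.097 ≈ 1025 samples per optimizer step over a run of about 47 hours:

```python
# Rough sanity check derived from all_results.json above.
train_runtime = 169455.6516            # seconds (~47 hours)
samples_per_second = 99.441
steps_per_second = 0.097

print(samples_per_second / steps_per_second)   # ~1025 samples per optimizer step
print(steps_per_second * train_runtime)        # ~16,437 optimizer steps
print(samples_per_second * train_runtime)      # ~16.9M training samples seen
```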
chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"
+ }
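This is the processor-level template used by `processor.apply_chat_template` in the README demo: each non-system turn renders the upper-cased role, then any image entries as `<image>\n`, then the text entries, and finally appends `ASSISTANT:` when `add_generation_prompt=True`. A small sketch (assuming a local checkpoint directory) of rendering this template directly through the tokenizer:

```python
import json
from transformers import AutoTokenizer

# Hypothetical local checkpoint directory; adjust as needed.
model_dir = "VARGPT_LLaVA-v1"
with open(f"{model_dir}/chat_template.json") as f:
    template = json.load(f)["chat_template"]

tokenizer = AutoTokenizer.from_pretrained(model_dir)
conversation = [
    {"role": "user", "content": [
        {"type": "text", "text": "Please explain the meme in detail."},
        {"type": "image"},
    ]},
]
# Expected rendering: "USER: <image>\nPlease explain the meme in detail. ASSISTANT:"
prompt = tokenizer.apply_chat_template(
    conversation, chat_template=template, tokenize=False, add_generation_prompt=True
)
print(prompt)
```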
config.json ADDED
@@ -0,0 +1,192 @@
+ {
+   "_name_or_path": "/mnt/bn/yufan-lf/pretrained_models/VARGPT_LLaVA-7B-stage3/VARGPT_LLaVA-7B-v1",
+   "architectures": [
+     "VARGPTLlavaForConditionalGeneration"
+   ],
+   "hidden_size": 4096,
+   "ignore_index": -100,
+   "image_seq_length": 576,
+   "image_token_index": 32000,
+   "model_type": "vargpt_llava",
+   "pad_token_id": 32001,
+   "padding_side": "left",
+   "projector_hidden_act": "gelu",
+   "special_tokens": {
+     "image_gen_end": "<|image_gen_end|>",
+     "image_gen_end_token_id": 32003,
+     "image_gen_pad": "<|image_gen_pad|>",
+     "image_gen_pad_token_id": 32004,
+     "image_gen_start": "<|image_gen_start|>",
+     "image_gen_start_token_id": 32002
+   },
+   "text_config": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "lmsys/vicuna-7b-v1.5",
+     "add_cross_attention": false,
+     "architectures": [
+       "LlamaForCausalLM"
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 1,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 11008,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 4096,
+     "min_length": 0,
+     "mlp_bias": false,
+     "model_type": "llama",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 32,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 32,
+     "num_key_value_heads": 32,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "prefix": null,
+     "pretraining_tp": 1,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rms_norm_eps": 1e-05,
+     "rope_scaling": null,
+     "rope_theta": 10000.0,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": false,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "float16",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 32064
+   },
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "train_from_scratch": false,
+   "transformers_version": "4.46.1",
+   "use_cache": false,
+   "vision_config": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 336,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 768,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 32000
+   },
+   "vision_feature_layer": -2,
+   "vision_feature_select_strategy": "default",
+   "vocab_size": 32064
+ }
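A couple of consistency checks one can read off this config (a sketch, assuming a local copy of the file): the CLIP vision tower takes 336×336 inputs with 14×14 patches, so each image contributes (336 / 14)² = 576 visual tokens, matching `image_seq_length`; the image-generation token ids match `added_tokens.json`.

```python
import json

# Hypothetical local path to the uploaded config.json; adjust as needed.
with open("VARGPT_LLaVA-v1/config.json") as f:
    cfg = json.load(f)

vision = cfg["vision_config"]
patches_per_side = vision["image_size"] // vision["patch_size"]   # 336 // 14 = 24
assert patches_per_side ** 2 == cfg["image_seq_length"]           # 24 * 24 = 576

# Image-generation control tokens declared in the config (ids 32002-32004).
print(cfg["special_tokens"])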
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 32001,
+   "transformers_version": "4.46.1"
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:788c7bfdae26ef9c00297dddd4b47737b43a99202da9e27e7a5c61d2d986d546
+ size 4992930688
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5851ee22d392303c11c957a5babb95de98de3d4b8f853c238c761c337137acc
+ size 4957878552
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bff7b86adfc3f7602c564ce6ce6e99dea4d626bde98c56b8e5b6ca23e87d9c0
+ size 4978083414
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0520b548f43ce52ac29bd4103562328851d442fabe58682b1eede96db52f983
+ size 3628686856
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 336,
+     "width": 336
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "VARGPTLlavaProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 336
+   }
+ }
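These are the standard CLIP ViT-L/14-336 preprocessing settings (as used by LLaVA-1.5): resize the shortest edge to 336, center-crop to 336×336, rescale by 1/255, and normalize with the CLIP mean/std above. A minimal sketch (assuming a local checkpoint directory) showing the resulting tensor shape:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Hypothetical local checkpoint directory containing preprocessor_config.json.
image_processor = CLIPImageProcessor.from_pretrained("VARGPT_LLaVA-v1")

image = Image.open("./assets/llava_bench_demo.png")
pixel_values = image_processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)   # expected: torch.Size([1, 3, 336, 336])
```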
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "image_token": "<image>",
+   "patch_size": 14,
+   "processor_class": "VARGPTLlavaProcessor",
+   "vision_feature_select_strategy": "default"
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "additional_special_tokens": [
+     "<|image_gen_start|>",
+     "<|image_gen_end|>",
+     "<|image_gen_pad|>"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,95 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32000": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32002": {
+       "content": "<|image_gen_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32003": {
+       "content": "<|image_gen_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32004": {
+       "content": "<|image_gen_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|image_gen_start|>",
+     "<|image_gen_end|>",
+     "<|image_gen_pad|>"
+   ],
+   "bos_token": "<s>",
+   "chat_template": "{% set system_message = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'USER: ' + content + ' ASSISTANT:' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {
+     "image_token": "<image>"
+   },
+   "image_token": "<image>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "padding_side": "right",
+   "processor_class": "VARGPTLlavaProcessor",
+   "sp_model_kwargs": {},
+   "split_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "trust_remote_code": false,
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
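Note that the tokenizer-level `chat_template` here is the plain Vicuna v1.5 text format (default system prompt, then `USER: ... ASSISTANT:` turns, with `</s>` closing each assistant reply), whereas `chat_template.json` above carries the multimodal processor-level template. A small sketch (assuming the same local checkpoint directory and a dummy conversation) of rendering it:

```python
from transformers import AutoTokenizer

# Hypothetical local checkpoint directory; adjust as needed.
tokenizer = AutoTokenizer.from_pretrained("VARGPT_LLaVA-v1")

messages = [
    {"role": "user", "content": "Please explain the meme in detail."},
    {"role": "assistant", "content": "Sure, here is a detailed explanation."},
]
# Renders: "<system prompt>USER: ... ASSISTANT:<assistant reply></s>"
print(tokenizer.apply_chat_template(messages, tokenize=False))
```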