yujiepan committed · verified · Commit a8a0682 · 1 Parent(s): cb07b8c

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,236 @@
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- stepfun-ai/step3
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [stepfun-ai/step3](https://huggingface.co/stepfun-ai/step3).

Note: for the vLLM-compatible version, see [tiny-random/step3-vllm](https://huggingface.co/tiny-random/step3-vllm).
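
A minimal serving sketch for that variant (assuming a vLLM build with Step3 support; the prompt and sampling settings are illustrative, not taken from this repo):

```python
from vllm import LLM, SamplingParams

# Hypothetical quick check that the vLLM-flavoured tiny checkpoint loads and generates.
llm = LLM(model="tiny-random/step3-vllm", trust_remote_code=True)
outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```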

### Example usage:

```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "tiny-random/step3"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda", torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "What's in this picture?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
print(decoded)
```
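
Because the weights are random, the generated text is meaningless; the point is that the checkpoint is small enough to load and iterate on quickly. A quick size check (a minimal sketch; the ~18.6 MB bfloat16 safetensors file corresponds to roughly 9M parameters):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiny-random/step3", torch_dtype=torch.bfloat16, trust_remote_code=True
)
# A debugging model should be in the single-digit millions of parameters.
num_params = sum(p.numel() for p in model.parameters())
print(f"{num_params / 1e6:.2f}M parameters")
```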

### Code used to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    set_seed,
)

source_model_id = "stepfun-ai/step3"
save_folder = "/tmp/tiny-random/step3"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

def rewrite_automap(filepath: str, source_model_id: str, overrides: dict = None):
    # Prefix auto_map entries with the source repo id so the remote code is
    # resolved from stepfun-ai/step3 instead of being shipped in this repo.
    with open(filepath, 'r', encoding='utf-8') as f:
        config = json.load(f)
    for k, v in config['auto_map'].items():
        v = v.split('--')[-1]
        config['auto_map'][k] = f'{source_model_id}--{v}'
    if overrides is not None:
        config.update(overrides)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

rewrite_automap(f'{save_folder}/processor_config.json', source_model_id)
rewrite_automap(f'{save_folder}/tokenizer_config.json', source_model_id)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

for k, v in config_json['auto_map'].items():
    config_json['auto_map'][k] = f'{source_model_id}--{v}'
config_json['architectures'] = ["Step3VLForConditionalGeneration"]
config_json['text_config'].update({
    "hidden_size": 32,
    "intermediate_size": 64,
    "num_hidden_layers": 2,
    "num_attention_heads": 2,
    "num_attention_groups": 1,
    "head_dim": 256,
    "share_q_dim": 512,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "moe_intermediate_size": 64,
    "share_expert_dim": 64,
    "tie_word_embeddings": True,
})
config_json['vision_config'].update({
    "hidden_size": 64,
    "output_hidden_size": 64,
    "intermediate_size": 128,
    "num_hidden_layers": 2,
    "num_attention_heads": 2
})

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
# key_mapping = {
#     "^vision_model": "model.vision_model",
#     r"^model(?!\.(language_model|vision_model))": "model.language_model",
#     "vit_downsampler": "model.vit_downsampler",
#     "vit_downsampler2": "model.vit_downsampler2",
#     "vit_large_projector": "model.vit_large_projector",
# }
automap = config_json['auto_map']
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # cpu is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)
model.save_pretrained(save_folder)
print(model)
rewrite_automap(f'{save_folder}/config.json', source_model_id)

# Remove the remote-code .py files copied by save_pretrained; they are fetched
# from the source repo at load time via the rewritten auto_map.
for python_file in Path(save_folder).glob('*.py'):
    if python_file.name.startswith('modeling_') or python_file.name.startswith('configuration_') or python_file.name.endswith('.py'):
        python_file.unlink()
```
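
Before uploading, it is worth checking that the saved folder round-trips through `transformers` and that the rewritten `auto_map` entries resolve. A minimal sketch, reusing the paths from the script above:

```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

save_folder = "/tmp/tiny-random/step3"
# Both calls should pull the remote code from stepfun-ai/step3 via auto_map.
processor = AutoProcessor.from_pretrained(save_folder, trust_remote_code=True)
reloaded = AutoModelForCausalLM.from_pretrained(
    save_folder, torch_dtype=torch.bfloat16, trust_remote_code=True
)
print(type(reloaded).__name__)
```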

### Printing the model:

```text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
```
chat_template.jinja ADDED
@@ -0,0 +1,20 @@
{% macro render_content(content) %} {% if content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %} {% endmacro %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message.role == 'system' %}{{ render_content(message['content']) }}{% endif %}{% endfor %}{% if tools is defined and tools %}{% set ns = namespace(data='') %}{% for tool in tools %}{% set ns.data = ns.data + (tool | tojson(ensure_ascii=False)) + '
' %}{% endfor %}{% set tool_schemas_var = ns.data %}# Tools
You may call one or more tools to assist with the user query. You are provided with tool schemas within <tools></tools> XML tags: <tools>{{ tool_schemas_var }}</tools> When making tool calls, use XML format to invoke tools and pass parameters: <|tool_calls_begin|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name0"><steptml:parameter name="parameter_name0">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name1"><steptml:parameter name="parameter_name1">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_calls_end|>
Note: * You can invoke one or more tools in parallel. * Each tool call must be complete and self-contained within a single <steptml:toolcall></steptml:toolcall> block. {% endif %}{% for message in messages %}{% if message.role == 'tool_description' %}{{ render_content(message['content']) }}{% elif message.role == 'user' %}{{- '<|BOT|>' + message.role + '\n' + render_content(message['content']) }}{{- '<|EOT|>' }}{% elif message.role == 'tool_response' %}<|tool_outputs_begin|>
{% for tool_output in message['content'] %}<|tool_output_begin|>
{{ render_content(tool_output) }}<|tool_output_end|>{% endfor %}
<|tool_outputs_end|>
{% else %}{{- '<|BOT|>' + message.role + '
' }}{% if message['content'] is defined %}{{- render_content(message['content']) }}{% endif %}{% if message.tool_calls is defined %}<|tool_calls_begin|>
{% for tool in message.tool_calls %}<|tool_call_begin>|>
{{ tool['type'] }}<|tool_sep|>{{- '<steptml:invoke name="' + tool['function']['name'] + '">' }}{% for name, param in tool['function']['arguments'].items() %} {{- '<steptml:parameter name="' + name + '">' + param | string + '</steptml:parameter>' }}{% endfor %}</steptml:invoke><|tool_call_end|>
{% endfor %}<|tool_calls_end|>
{% endif %}<|EOT|>{% endif %}{% endfor %}{% if add_generation_prompt %}{{- '<|BOT|>assistant
<think>
' }}{% endif %}
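
For reference, a minimal sketch of how this template renders a plain user turn (output shown schematically; the exact whitespace depends on the template):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiny-random/step3", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
# Roughly:
#   <|begin▁of▁sentence|><|BOT|>user
#   Hello!<|EOT|><|BOT|>assistant
#   <think>
```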
config.json ADDED
@@ -0,0 +1,62 @@
{
  "architectures": [
    "Step3vForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "stepfun-ai/step3--configuration_step3.Step3VLConfig",
    "AutoModelForCausalLM": "stepfun-ai/step3--modeling_step3.Step3vForConditionalGeneration"
  },
  "bos_token_id": 0,
  "eos_token_id": 128805,
  "hidden_size": 32,
  "im_end_token": "<im_end>",
  "im_patch_token": "<im_patch>",
  "im_start_token": "<im_start>",
  "image_token_id": 128001,
  "image_token_len": 169,
  "model_type": "step3_vl",
  "patch_token_len": 81,
  "projector_bias": false,
  "text_config": {
    "architectures": [
      "Step3TextForCausalLM"
    ],
    "head_dim": 256,
    "hidden_size": 32,
    "intermediate_size": 64,
    "max_position_embedding": 65536,
    "max_seq_len": 65536,
    "model_type": "step3_text",
    "moe_intermediate_size": 64,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "norm_expert_weight": false,
    "num_attention_groups": 1,
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 500000,
    "share_expert_dim": 64,
    "share_q_dim": 512,
    "torch_dtype": "bfloat16",
    "vocab_size": 128815
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.1",
  "understand_projector_stride": 2,
  "vision_config": {
    "hidden_act": "quick_gelu",
    "hidden_size": 64,
    "image_size": 728,
    "intermediate_size": 128,
    "layer_norm_eps": 1e-05,
    "model_type": "step3_vision_encoder",
    "num_attention_heads": 2,
    "num_channels": 3,
    "num_hidden_layers": 2,
    "output_hidden_size": 64,
    "patch_size": 14
  }
}
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "bos_token_id": 0,
  "do_sample": true,
  "eos_token_id": 128805,
  "temperature": 0.7,
  "top_p": 0.95,
  "transformers_version": "4.54.1",
  "trust_remote_code": true
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7bc0f1f5812da20e92c3a73a7d4996a1bc320147f8e462c19ff81ad846d6fca2
size 18611328
processor_config.json ADDED
@@ -0,0 +1,6 @@
{
  "auto_map": {
    "AutoProcessor": "stepfun-ai/step3--processing_step3.Step3VLProcessor"
  },
  "processor_class": "Step3VLProcessor"
}
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|EOT|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff