p1atdev commited on
Commit
3b5d4a2
·
verified ·
1 Parent(s): b5e658d

Upload processor

Browse files
processor_config.json CHANGED
@@ -1,9 +1,6 @@
1
  {
2
  "auto_map": {
3
- "AutoProcessor": [
4
- "processor_multitokenizers.MultiTokenizersProcessor",
5
- null
6
- ]
7
  },
8
- "processor_class": "MultiTokenizersProcessor"
9
  }
 
1
  {
2
  "auto_map": {
3
+ "AutoProcessor": "processor_multi.MultiProcessor"
 
 
 
4
  },
5
+ "processor_class": "MultiProcessor"
6
  }
processor_multi.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import warnings
4
+ from pathlib import Path
5
+
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from transformers import (
11
+ PreTrainedTokenizer,
12
+ PreTrainedTokenizerBase,
13
+ ProcessorMixin,
14
+ BatchFeature,
15
+ )
16
+ from transformers.utils import (
17
+ logging,
18
+ direct_transformers_import,
19
+ PROCESSOR_NAME,
20
+ CHAT_TEMPLATE_NAME,
21
+ )
22
+ from transformers.image_utils import ImageInput
23
+ from transformers.dynamic_module_utils import custom_object_save
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ # Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
28
+ transformers_module = direct_transformers_import(Path(__file__).parent)
29
+
30
+
31
class MultiProcessorKwargs:
    """Container for the default keyword arguments of each tokenizer.

    ``_defaults`` maps ``"tokenizer_1_kwargs"`` and ``"tokenizer_2_kwargs"`` to
    their own dictionary of default tokenizer call arguments.
    """

    # Each tokenizer gets a separate dict object, with padding disabled by default.
    _defaults = {
        f"tokenizer_{index}_kwargs": {"padding": False} for index in (1, 2)
    }
40
+
41
+
42
class MultiProcessor(ProcessorMixin):
    """Processor that bundles two independent tokenizers.

    ``tokenizer_1`` and ``tokenizer_2`` are saved to, and loaded from,
    sub-folders named after the attribute (``tokenizer_1/`` and
    ``tokenizer_2/``). Calling the processor tokenizes ``text_1`` with
    ``tokenizer_1`` and ``text_2`` with ``tokenizer_2`` and returns both
    results in a single ``BatchFeature``.
    """

    attributes = ["tokenizer_1", "tokenizer_2"]
    valid_kwargs = ["chat_template"]
    tokenizer_1_class = "AutoTokenizer"
    tokenizer_2_class = "AutoTokenizer"

    tokenizer_1: PreTrainedTokenizer
    tokenizer_2: PreTrainedTokenizer

    def __init__(
        self,
        tokenizer_1=None,
        tokenizer_2=None,
        chat_template=None,
        **kwargs,
    ):
        """Store both tokenizers and the optional chat template on the processor."""
        super().__init__(
            tokenizer_1,
            tokenizer_2,
            chat_template=chat_template,
            **kwargs,
        )

    @staticmethod
    def _as_text_list(text, name: str) -> list[str]:
        """Validate one text input and return it as a list of strings.

        Args:
            text: A single string or a list of non-empty strings.
            name: Argument name used in error messages.

        Raises:
            TypeError: If `text` is neither a str nor a list of str.
            ValueError: If a list contains an empty string.
        """
        if isinstance(text, str):
            return [text]
        if not isinstance(text, list):
            raise TypeError(
                f"Expected `{name}` to be a str or a list of str but got {type(text)}"
            )
        if not all(isinstance(item, str) for item in text):
            bad_types = sorted(
                {type(item).__name__ for item in text if not isinstance(item, str)}
            )
            raise TypeError(
                f"Expected `{name}` to be a list of str but it contains {bad_types}"
            )
        if not all(text):
            raise ValueError(f"Expected non-empty strings in `{name}`")
        return text

    def __call__(
        self,
        text_1: str | list[str] | None = None,
        text_2: str | list[str] | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize `text_1` with `tokenizer_1` and `text_2` with `tokenizer_2`.

        Args:
            text_1: Text (or list of texts) for the first tokenizer. Required
                despite the `None` default.
            text_2: Text (or list of texts) for the second tokenizer. Required
                despite the `None` default.
            **kwargs: Extra arguments forwarded to both tokenizers. They take
                precedence over the defaults (`padding=False`,
                `return_tensors="pt"`).

        Returns:
            A `BatchFeature` with `input_ids` / `attention_mask` from
            `tokenizer_1` and `input_ids_2` / `attention_mask_2` from
            `tokenizer_2`.

        Raises:
            TypeError: If an input is not a str or a list of str.
            ValueError: If a list input contains an empty string.
        """
        # Explicit raises instead of `assert`, so validation survives `python -O`.
        text_1_list = self._as_text_list(text_1, "text_1")
        text_2_list = self._as_text_list(text_2, "text_2")

        # Later entries win: class defaults < return_tensors="pt" < caller kwargs.
        tokenizer_1_output_kwargs = {
            **MultiProcessorKwargs._defaults["tokenizer_1_kwargs"],
            "return_tensors": "pt",
            **kwargs,
        }
        tokenizer_2_output_kwargs = {
            **MultiProcessorKwargs._defaults["tokenizer_2_kwargs"],
            "return_tensors": "pt",
            **kwargs,
        }

        # tokenize
        text_1_inputs = self.tokenizer_1(
            text_1_list,
            **tokenizer_1_output_kwargs,
        )
        text_2_inputs = self.tokenizer_2(
            text_2_list,
            **tokenizer_2_output_kwargs,
        )

        return BatchFeature(
            data={
                "input_ids": text_1_inputs.get("input_ids"),
                "attention_mask": text_1_inputs.get("attention_mask"),
                "input_ids_2": text_2_inputs.get("input_ids"),
                "attention_mask_2": text_2_inputs.get("attention_mask"),
            }
        )

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the `batch_decode` method of `tokenizer_2`. Please refer to the
        docstring of [`~PreTrainedTokenizer.batch_decode`] for more information.
        """
        # The attribute is `tokenizer_2`; `tokenizer_2_tokenizer` never existed on this class.
        return self.tokenizer_2.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the `decode` method of `tokenizer_2`. Please refer to the
        docstring of [`~PreTrainedTokenizer.decode`] for more information.
        """
        # The attribute is `tokenizer_2`; `tokenizer_2_tokenizer` never existed on this class.
        return self.tokenizer_2.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Names of the text arguments accepted by `__call__`."""
        return ["text_1", "text_2"]

    # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L980-L995
    @classmethod
    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load every attribute in `cls.attributes` from its own sub-folder.

        Each attribute is loaded with `from_pretrained(..., subfolder=<attribute name>)`
        using the class named by `<attribute name>_class`.

        Returns:
            The loaded attributes, in the order of `cls.attributes`.

        Raises:
            ValueError: If no class could be resolved for an attribute.
        """
        args = []
        for attribute_name in cls.attributes:
            class_name = getattr(cls, f"{attribute_name}_class")
            subfolder = attribute_name  # subfolder is the same as attribute_name
            if isinstance(class_name, tuple):
                # (slow, fast) pair: prefer the fast class when allowed and available.
                classes = tuple(
                    getattr(transformers_module, n) if n is not None else None
                    for n in class_name
                )
                use_fast = kwargs.get("use_fast", True)
                if use_fast and classes[1] is not None:
                    attribute_class = classes[1]
                else:
                    attribute_class = classes[0]
            else:
                attribute_class = getattr(transformers_module, class_name)

            if attribute_class is None:
                raise ValueError(f"Missing attribute class: {class_name}")
            args.append(
                attribute_class.from_pretrained(
                    pretrained_model_name_or_path,
                    subfolder=subfolder,
                    **kwargs,
                )
            )
        return args

    # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L460-L560
    def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
        """
        Saves the attributes of this processor (both tokenizers) in the specified directory so that it can be
        reloaded using the [`~ProcessorMixin.from_pretrained`] method.

        Each tokenizer is written with its own `save_pretrained` into a sub-folder of `save_directory` named after
        the attribute (`tokenizer_1/`, `tokenizer_2/`). The chat template, if any, and the processor config are
        written to `save_directory` itself.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the processor files and the tokenizer sub-folders will be saved (directory will
                be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            A list with the path of the processor config file, or an empty list when no processor config file was
            written.

        Raises:
            ValueError: If both `token` and the deprecated `use_auth_token` are given.
        """
        use_auth_token = kwargs.pop("use_auth_token", None)

        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # `os.fspath` lets `save_directory` be a `pathlib.Path` as well as a str.
            default_repo_id = os.fspath(save_directory).split(os.path.sep)[-1]
            repo_id = kwargs.pop("repo_id", default_repo_id)
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)
        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            attrs = [
                getattr(self, attribute_name) for attribute_name in self.attributes
            ]
            configs = [
                (a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a)
                for a in attrs
            ]
            configs.append(self)
            custom_object_save(self, save_directory, config=configs)

        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name)
            # Include the processor class in the attribute config so this processor can then be reloaded with the
            # `AutoProcessor` API.
            if hasattr(attribute, "_set_processor_class"):
                attribute._set_processor_class(self.__class__.__name__)
            attribute.save_pretrained(
                os.path.join(
                    save_directory,
                    attribute_name,  # CHANGED: save to subfolder
                ),
            )

        if self._auto_class is not None:
            # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
            for attribute_name in self.attributes:
                attribute = getattr(self, attribute_name)
                if isinstance(attribute, PreTrainedTokenizerBase):
                    # `pop` with a default so a missing key cannot abort the save half-way.
                    attribute.init_kwargs.pop("auto_map", None)

        # If we save using the predefined names, we can load using `from_pretrained`
        # plus we save chat_template in its own file
        output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
        output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME)

        processor_dict = self.to_dict()
        # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
        # to avoid serializing chat template in json config file. So let's get it from `self` directly
        if self.chat_template is not None:
            chat_template_json_string = (
                json.dumps(
                    {"chat_template": self.chat_template}, indent=2, sort_keys=True
                )
                + "\n"
            )
            with open(output_chat_template_file, "w", encoding="utf-8") as writer:
                writer.write(chat_template_json_string)
            logger.info(f"chat template saved in {output_chat_template_file}")

        # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
        # `auto_map` is not specified.
        has_processor_config = set(processor_dict.keys()) != {"processor_class"}
        if has_processor_config:
            self.to_json_file(output_processor_file)
            logger.info(f"processor saved in {output_processor_file}")

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )

        if not has_processor_config:
            return []
        return [output_processor_file]
tokenizer_1/tokenizer_config.json CHANGED
@@ -68,10 +68,7 @@
68
  }
69
  },
70
  "auto_map": {
71
- "AutoProcessor": [
72
- "processor_multitokenizers.MultiTokenizersProcessor",
73
- null
74
- ]
75
  },
76
  "bos_token": "<s>",
77
  "clean_up_tokenization_spaces": false,
@@ -82,7 +79,7 @@
82
  "mask_token": "<MASK|LLM-jp>",
83
  "model_max_length": 1000000000000000019884624838656,
84
  "pad_token": "<PAD|LLM-jp>",
85
- "processor_class": "MultiTokenizersProcessor",
86
  "sep_token": "<SEP|LLM-jp>",
87
  "sp_model_kwargs": {},
88
  "tokenizer_class": "PreTrainedTokenizerFast",
 
68
  }
69
  },
70
  "auto_map": {
71
+ "AutoProcessor": "processor_multi.MultiProcessor"
 
 
 
72
  },
73
  "bos_token": "<s>",
74
  "clean_up_tokenization_spaces": false,
 
79
  "mask_token": "<MASK|LLM-jp>",
80
  "model_max_length": 1000000000000000019884624838656,
81
  "pad_token": "<PAD|LLM-jp>",
82
+ "processor_class": "MultiProcessor",
83
  "sep_token": "<SEP|LLM-jp>",
84
  "sp_model_kwargs": {},
85
  "tokenizer_class": "PreTrainedTokenizerFast",
tokenizer_2/tokenizer_config.json CHANGED
@@ -195,10 +195,7 @@
195
  "<|video_pad|>"
196
  ],
197
  "auto_map": {
198
- "AutoProcessor": [
199
- "processor_multitokenizers.MultiTokenizersProcessor",
200
- null
201
- ]
202
  },
203
  "bos_token": null,
204
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -207,7 +204,7 @@
207
  "errors": "replace",
208
  "model_max_length": 32768,
209
  "pad_token": "<|endoftext|>",
210
- "processor_class": "MultiTokenizersProcessor",
211
  "split_special_tokens": false,
212
  "tokenizer_class": "Qwen2Tokenizer",
213
  "unk_token": null
 
195
  "<|video_pad|>"
196
  ],
197
  "auto_map": {
198
+ "AutoProcessor": "processor_multi.MultiProcessor"
 
 
 
199
  },
200
  "bos_token": null,
201
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
 
204
  "errors": "replace",
205
  "model_max_length": 32768,
206
  "pad_token": "<|endoftext|>",
207
+ "processor_class": "MultiProcessor",
208
  "split_special_tokens": false,
209
  "tokenizer_class": "Qwen2Tokenizer",
210
  "unk_token": null