p1atdev committed
Commit 111e8a3 · verified · 1 Parent(s): 3b5d4a2

Delete processor_multitokenizers.py

Files changed (1)
  1. processor_multitokenizers.py +0 -285
processor_multitokenizers.py DELETED
@@ -1,285 +0,0 @@
- import os
- import json
- import warnings
- from pathlib import Path
-
-
- import torch
- import torch.nn as nn
-
- from transformers import (
-     PreTrainedTokenizer,
-     PreTrainedTokenizerBase,
-     ProcessorMixin,
-     BatchFeature,
- )
- from transformers.utils import (
-     logging,
-     direct_transformers_import,
-     PROCESSOR_NAME,
-     CHAT_TEMPLATE_NAME,
- )
- from transformers.image_utils import ImageInput
- from transformers.dynamic_module_utils import custom_object_save
-
- logger = logging.get_logger(__name__)
-
- # Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
- transformers_module = direct_transformers_import(Path(__file__).parent)
-
-
- class MultiTokenizersProcessorKwargs:
-     _defaults = {
-         "tokenizer_1_kwargs": {
-             "padding": False,
-         },
-         "tokenizer_2_kwargs": {
-             "padding": False,
-         },
-     }
-
-
- class MultiTokenizersProcessor(ProcessorMixin):
-     attributes = ["tokenizer_1", "tokenizer_2"]
-     valid_kwargs = ["chat_template"]
-     tokenizer_1_class = "AutoTokenizer"
-     tokenizer_2_class = "AutoTokenizer"
-
-     tokenizer_1: PreTrainedTokenizer
-     tokenizer_2: PreTrainedTokenizer
-
-     def __init__(
-         self,
-         tokenizer_1=None,
-         tokenizer_2=None,
-         chat_template=None,
-         **kwargs,
-     ):
-         super().__init__(
-             tokenizer_1,
-             tokenizer_2,
-             chat_template=chat_template,
-             **kwargs,
-         )
-
-     def __call__(
-         self,
-         text_1: str | list[str] | None = None,
-         text_2: str | list[str] | None = None,
-         **kwargs,
-     ) -> BatchFeature:
-         def _validate_text_input(text) -> str | list[str]:
-             if isinstance(text, list):
-                 assert all(
-                     isinstance(t, str) for t in text
-                 ), f"Expected list of str but got {type(text)}"
-                 assert all(len(t) > 0 for t in text), "Expected non-empty strings"
-             else:
-                 assert isinstance(text, str), f"Expected str but got {type(text)}"
-             return text
-
-         def _normalize_text_input(text: str | list[str]) -> list[str]:
-             if isinstance(text, str):
-                 return [text]
-             return text
-
-         _text_1: str | list[str] = _validate_text_input(text_1)
-         text_1_list: list[str] = _normalize_text_input(_text_1)
-         _text_2: str | list[str] = _validate_text_input(text_2)
-         text_2_list: list[str] = _normalize_text_input(_text_2)
-
-         tokenizer_1_output_kwargs = {
-             **MultiTokenizersProcessorKwargs._defaults["tokenizer_1_kwargs"],
-             "return_tensors": "pt",
-             **kwargs,
-         }
-         tokenizer_2_output_kwargs = {
-             **MultiTokenizersProcessorKwargs._defaults["tokenizer_2_kwargs"],
-             "return_tensors": "pt",
-             **kwargs,
-         }
-
-         # tokenize
-         text_1_inputs = self.tokenizer_1(
-             text_1_list,
-             **tokenizer_1_output_kwargs,
-         )
-         text_2_inputs = self.tokenizer_2(
-             text_2_list,
-             **tokenizer_2_output_kwargs,
-         )
-
-         return BatchFeature(
-             data={
-                 "input_ids": text_1_inputs.get("input_ids"),
-                 "attention_mask": text_1_inputs.get("attention_mask"),
-                 "input_ids_2": text_2_inputs.get("input_ids"),
-                 "attention_mask_2": text_2_inputs.get("attention_mask"),
-             }
-         )
-
-     def batch_decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to `tokenizer_2`'s [`~PreTrainedTokenizer.batch_decode`]. Please
-         refer to the docstring of this method for more information.
-         """
-         return self.tokenizer_2.batch_decode(*args, **kwargs)
-
-     def decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to `tokenizer_2`'s [`~PreTrainedTokenizer.decode`]. Please refer to
-         the docstring of this method for more information.
-         """
-         return self.tokenizer_2.decode(*args, **kwargs)
-
-     @property
-     def model_input_names(self):
-         return ["text_1", "text_2"]
-
-     # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L980-L995
-     @classmethod
-     def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-         args = []
-         for attribute_name in cls.attributes:
-             class_name = getattr(cls, f"{attribute_name}_class")
-             subfolder = attribute_name  # subfolder is the same as attribute_name
-             if isinstance(class_name, tuple):
-                 classes = tuple(
-                     getattr(transformers_module, n) if n is not None else None
-                     for n in class_name
-                 )
-                 use_fast = kwargs.get("use_fast", True)
-                 if use_fast and classes[1] is not None:
-                     attribute_class = classes[1]
-                 else:
-                     attribute_class = classes[0]
-             else:
-                 attribute_class = getattr(transformers_module, class_name)
-
-             assert attribute_class is not None, f"Missing attribute class: {class_name}"
-             args.append(
-                 attribute_class.from_pretrained(
-                     pretrained_model_name_or_path,
-                     subfolder=subfolder,
-                     **kwargs,
-                 )
-             )
-         return args
-
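-     # Illustrative note (assumption from the subfolder logic above and the save logic below):
-     # a saved checkpoint of this processor is expected to look roughly like
-     #   processor_config.json
-     #   chat_template.json   (only when a chat template is set)
-     #   tokenizer_1/ ...tokenizer files...
-     #   tokenizer_2/ ...tokenizer files...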
-     # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L460-L560
-     def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
-         """
-         Saves the attributes of this processor (tokenizer_1, tokenizer_2) in the specified directory so that it
-         can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
-
-         <Tip>
-
-         This method simply calls [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] on each
-         tokenizer attribute. Please refer to the docstring of that method for more information.
-
-         </Tip>
-
-         Args:
-             save_directory (`str` or `os.PathLike`):
-                 Directory where the processor JSON file and the tokenizer files will be saved (directory will
-                 be created if it does not exist).
-             push_to_hub (`bool`, *optional*, defaults to `False`):
-                 Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
-                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
-                 namespace).
-             kwargs (`Dict[str, Any]`, *optional*):
-                 Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
-         """
-         use_auth_token = kwargs.pop("use_auth_token", None)
-
-         if use_auth_token is not None:
-             warnings.warn(
-                 "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
-                 FutureWarning,
-             )
-             if kwargs.get("token", None) is not None:
-                 raise ValueError(
-                     "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
-                 )
-             kwargs["token"] = use_auth_token
-
-         os.makedirs(save_directory, exist_ok=True)
-
-         if push_to_hub:
-             commit_message = kwargs.pop("commit_message", None)
-             repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
-             repo_id = self._create_repo(repo_id, **kwargs)
-             files_timestamps = self._get_files_timestamps(save_directory)
-         # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
-         # loaded from the Hub.
-         if self._auto_class is not None:
-             attrs = [
-                 getattr(self, attribute_name) for attribute_name in self.attributes
-             ]
-             configs = [
-                 (a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a)
-                 for a in attrs
-             ]
-             configs.append(self)
-             custom_object_save(self, save_directory, config=configs)
-
-         for attribute_name in self.attributes:
-             attribute = getattr(self, attribute_name)
-             # Include the processor class in the attribute config so this processor can then be reloaded with the
-             # `AutoProcessor` API.
-             if hasattr(attribute, "_set_processor_class"):
-                 attribute._set_processor_class(self.__class__.__name__)
-             attribute.save_pretrained(
-                 os.path.join(
-                     save_directory,
-                     attribute_name,  # CHANGED: save to subfolder
-                 ),
-             )
-
-         if self._auto_class is not None:
-             # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
-             for attribute_name in self.attributes:
-                 attribute = getattr(self, attribute_name)
-                 if isinstance(attribute, PreTrainedTokenizerBase):
-                     del attribute.init_kwargs["auto_map"]
-
-         # If we save using the predefined names, we can load using `from_pretrained`
-         # plus we save chat_template in its own file
-         output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
-         output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME)
-
-         processor_dict = self.to_dict()
-         # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
-         # to avoid serializing chat template in json config file. So let's get it from `self` directly
-         if self.chat_template is not None:
-             chat_template_json_string = (
-                 json.dumps(
-                     {"chat_template": self.chat_template}, indent=2, sort_keys=True
-                 )
-                 + "\n"
-             )
-             with open(output_chat_template_file, "w", encoding="utf-8") as writer:
-                 writer.write(chat_template_json_string)
-             logger.info(f"chat template saved in {output_chat_template_file}")
-
-         # Debug output; `auto_map` is only set when the processor is registered as a custom object.
-         logger.debug("auto_map: %s", getattr(self, "auto_map", None))
-
-         # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
-         # `auto_map` is not specified.
-         if set(processor_dict.keys()) != {"processor_class"}:
-             self.to_json_file(output_processor_file)
-             logger.info(f"processor saved in {output_processor_file}")
-
-         if push_to_hub:
-             self._upload_modified_files(
-                 save_directory,
-                 repo_id,
-                 files_timestamps,
-                 commit_message=commit_message,
-                 token=kwargs.get("token"),
-             )
-
-         if set(processor_dict.keys()) == {"processor_class"}:
-             return []
-         return [output_processor_file]
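
For reference, a minimal usage sketch of the class defined in the deleted file. It assumes the file (as shown above) is still importable and that the installed transformers version provides the utilities it imports; the checkpoint names are placeholders, not part of this repository.

# Minimal sketch (illustrative): build the processor from two placeholder tokenizers,
# tokenize a pair of texts, and save everything to subfolders.
from transformers import AutoTokenizer

from processor_multitokenizers import MultiTokenizersProcessor  # module removed by this commit

tokenizer_1 = AutoTokenizer.from_pretrained("gpt2")               # placeholder checkpoint
tokenizer_2 = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint

processor = MultiTokenizersProcessor(tokenizer_1=tokenizer_1, tokenizer_2=tokenizer_2)

# Each text is tokenized by its own tokenizer; extra kwargs are forwarded to both.
batch = processor(
    text_1="a photo of a cat",
    text_2="a photo of a cat",
)
print(batch["input_ids"].shape, batch["input_ids_2"].shape)

# Saving writes each tokenizer into its own subfolder (tokenizer_1/, tokenizer_2/)
# next to the processor config, matching the subfolder logic in save_pretrained above.
processor.save_pretrained("multi_tokenizers_processor")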