Upload processor

Browse files

Files changed (4) hide show

processor_config.json +2 -5
processor_multi.py +283 -0
tokenizer_1/tokenizer_config.json +2 -5
tokenizer_2/tokenizer_config.json +2 -5

processor_config.json CHANGED Viewed

@@ -1,9 +1,6 @@
 {
   "auto_map": {
-    "AutoProcessor": [
-      "processor_multitokenizers.MultiTokenizersProcessor",
-      null
-    ]
   },
-  "processor_class": "MultiTokenizersProcessor"
 }

 {
   "auto_map": {
+    "AutoProcessor": "processor_multi.MultiProcessor"
   },
+  "processor_class": "MultiProcessor"
 }

processor_multi.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import os
+import json
+import warnings
+from pathlib import Path
+import torch
+import torch.nn as nn
+from transformers import (
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    BatchFeature,
+)
+from transformers.utils import (
+    logging,
+    direct_transformers_import,
+    PROCESSOR_NAME,
+    CHAT_TEMPLATE_NAME,
+)
+from transformers.image_utils import ImageInput
+from transformers.dynamic_module_utils import custom_object_save
+logger = logging.get_logger(__name__)
+# Dynamically import the Transformers module to grab the attribute classes of the processor form their names.
+transformers_module = direct_transformers_import(Path(__file__).parent)
+class MultiProcessorKwargs:
+    # Default keyword arguments for each tokenizer call. `MultiProcessor.__call__` merges the
+    # matching entry with `return_tensors="pt"` and then with the caller's kwargs, so values
+    # supplied by the caller take precedence over these defaults.
+    _defaults = {
+        # Defaults used when calling `tokenizer_1`.
+        "tokenizer_1_kwargs": {
+            "padding": False,
+        },
+        # Defaults used when calling `tokenizer_2`.
+        "tokenizer_2_kwargs": {
+            "padding": False,
+        },
+    }
+class MultiProcessor(ProcessorMixin):
+    attributes = ["tokenizer_1", "tokenizer_2"]
+    valid_kwargs = ["chat_template"]
+    tokenizer_1_class = "AutoTokenizer"
+    tokenizer_2_class = "AutoTokenizer"
+    tokenizer_1: PreTrainedTokenizer
+    tokenizer_2: PreTrainedTokenizer
+    def __init__(
+        self,
+        tokenizer_1=None,
+        tokenizer_2=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer_1,
+            tokenizer_2,
+            chat_template=chat_template,
+            **kwargs,
+        )
+    def __call__(
+        self,
+        text_1: str | list[str] | None = None,
+        text_2: str | list[str] | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        def _validate_text_input(text) -> str | list[str]:
+            if isinstance(text, list):
+                assert all(
+                    isinstance(t, str) for t in text
+                ), f"Expected list of str but got {type(text)}"
+                assert all(len(t) > 0 for t in text), "Expected non-empty strings"
+            else:
+                assert isinstance(text, str), f"Expected str but got {type(text)}"
+            return text
+        def _normalize_text_input(text: str | list[str]) -> list[str]:
+            if isinstance(text, str):
+                return [text]
+            return text
+        _text_1: str | list[str] = _validate_text_input(text_1)
+        text_1_list: list[str] = _normalize_text_input(_text_1)
+        _text_2: str | list[str] = _validate_text_input(text_2)
+        text_2_list: list[str] = _normalize_text_input(_text_2)
+        tokenizer_1_output_kwargs = {
+            **MultiProcessorKwargs._defaults["tokenizer_1_kwargs"],
+            "return_tensors": "pt",
+            **kwargs,
+        }
+        tokenizer_2_output_kwargs = {
+            **MultiProcessorKwargs._defaults["tokenizer_2_kwargs"],
+            "return_tensors": "pt",
+            **kwargs,
+        }
+        # tokenize
+        text_1_inputs = self.tokenizer_1(
+            text_1_list,
+            **tokenizer_1_output_kwargs,
+        )
+        text_2_inputs = self.tokenizer_2(
+            text_2_list,
+            **tokenizer_2_output_kwargs,
+        )
+        return BatchFeature(
+            data={
+                "input_ids": text_1_inputs.get("input_ids"),
+                "attention_mask": text_1_inputs.get("attention_mask"),
+                "input_ids_2": text_2_inputs.get("input_ids"),
+                "attention_mask_2": text_2_inputs.get("attention_mask"),
+            }
+        )
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer_2_tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer_2_tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        return ["text_1", "text_2"]
+    # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L980-L995
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        args = []
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            subfolder = attribute_name  # subfolder is the same as attribute_name
+            if isinstance(class_name, tuple):
+                classes = tuple(
+                    getattr(transformers_module, n) if n is not None else None
+                    for n in class_name
+                )
+                use_fast = kwargs.get("use_fast", True)
+                if use_fast and classes[1] is not None:
+                    attribute_class = classes[1]
+                else:
+                    attribute_class = classes[0]
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+            assert attribute_class is not None, f"Missing attribute class: {class_name}"
+            args.append(
+                attribute_class.from_pretrained(
+                    pretrained_model_name_or_path,
+                    subfolder=subfolder,
+                    **kwargs,
+                )
+            )
+        return args
+    # edit from: https://github.com/huggingface/transformers/blob/1d063793318b20654ebb850f48f43e0a247ab7bb/src/transformers/processing_utils.py#L460-L560
+    def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
+        """
+        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
+        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
+        <Tip>
+        This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
+        methods above for more information.
+        </Tip>
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
+                be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                FutureWarning,
+            )
+            if kwargs.get("token", None) is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            kwargs["token"] = use_auth_token
+        os.makedirs(save_directory, exist_ok=True)
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
+        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
+        # loaded from the Hub.
+        if self._auto_class is not None:
+            attrs = [
+                getattr(self, attribute_name) for attribute_name in self.attributes
+            ]
+            configs = [
+                (a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a)
+                for a in attrs
+            ]
+            configs.append(self)
+            custom_object_save(self, save_directory, config=configs)
+        for attribute_name in self.attributes:
+            attribute = getattr(self, attribute_name)
+            # Include the processor class in the attribute config so this processor can then be reloaded with the
+            # `AutoProcessor` API.
+            if hasattr(attribute, "_set_processor_class"):
+                attribute._set_processor_class(self.__class__.__name__)
+            attribute.save_pretrained(
+                os.path.join(
+                    save_directory,
+                    attribute_name,  # CHANGED: save to subfolder
+                ),
+            )
+        if self._auto_class is not None:
+            # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
+            for attribute_name in self.attributes:
+                attribute = getattr(self, attribute_name)
+                if isinstance(attribute, PreTrainedTokenizerBase):
+                    del attribute.init_kwargs["auto_map"]
+        # If we save using the predefined names, we can load using `from_pretrained`
+        # plus we save chat_template in its own file
+        output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
+        output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME)
+        processor_dict = self.to_dict()
+        # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
+        # to avoid serializing chat template in json config file. So let's get it from `self` directly
+        if self.chat_template is not None:
+            chat_template_json_string = (
+                json.dumps(
+                    {"chat_template": self.chat_template}, indent=2, sort_keys=True
+                )
+                + "\n"
+            )
+            with open(output_chat_template_file, "w", encoding="utf-8") as writer:
+                writer.write(chat_template_json_string)
+            logger.info(f"chat template saved in {output_chat_template_file}")
+        # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
+        # `auto_map` is not specified.
+        if set(processor_dict.keys()) != {"processor_class"}:
+            self.to_json_file(output_processor_file)
+            logger.info(f"processor saved in {output_processor_file}")
+        if push_to_hub:
+            self._upload_modified_files(
+                save_directory,
+                repo_id,
+                files_timestamps,
+                commit_message=commit_message,
+                token=kwargs.get("token"),
+            )
+        if set(processor_dict.keys()) == {"processor_class"}:
+            return []
+        return [output_processor_file]

tokenizer_1/tokenizer_config.json CHANGED Viewed

@@ -68,10 +68,7 @@
     }
   },
   "auto_map": {
-    "AutoProcessor": [
-      "processor_multitokenizers.MultiTokenizersProcessor",
-      null
-    ]
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
@@ -82,7 +79,7 @@
   "mask_token": "<MASK|LLM-jp>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<PAD|LLM-jp>",
-  "processor_class": "MultiTokenizersProcessor",
   "sep_token": "<SEP|LLM-jp>",
   "sp_model_kwargs": {},
   "tokenizer_class": "PreTrainedTokenizerFast",

     }
   },
   "auto_map": {
+    "AutoProcessor": "processor_multi.MultiProcessor"
   },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "mask_token": "<MASK|LLM-jp>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<PAD|LLM-jp>",
+  "processor_class": "MultiProcessor",
   "sep_token": "<SEP|LLM-jp>",
   "sp_model_kwargs": {},
   "tokenizer_class": "PreTrainedTokenizerFast",

tokenizer_2/tokenizer_config.json CHANGED Viewed

@@ -195,10 +195,7 @@
     "<|video_pad|>"
   ],
   "auto_map": {
-    "AutoProcessor": [
-      "processor_multitokenizers.MultiTokenizersProcessor",
-      null
-    ]
   },
   "bos_token": null,
   "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -207,7 +204,7 @@
   "errors": "replace",
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
-  "processor_class": "MultiTokenizersProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

     "<|video_pad|>"
   ],
   "auto_map": {
+    "AutoProcessor": "processor_multi.MultiProcessor"
   },
   "bos_token": null,
   "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. 
You should think step-by-step.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
   "errors": "replace",
   "model_max_length": 32768,
   "pad_token": "<|endoftext|>",
+  "processor_class": "MultiProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null