# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processing saving/loading class for common processors.
"""

import os
from pathlib import Path

from .dynamic_module_utils import custom_object_save
from .tokenization_utils_base import PreTrainedTokenizerBase
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging


logger = logging.get_logger(__name__)

# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
transformers_module = direct_transformers_import(Path(__file__).parent)


AUTO_TO_BASE_CLASS_MAPPING = {
    "AutoTokenizer": "PreTrainedTokenizerBase",
    "AutoFeatureExtractor": "FeatureExtractionMixin",
    "AutoImageProcessor": "ImageProcessingMixin",
}


class ProcessorMixin(PushToHubMixin):
    """
    This is a mixin used to provide saving/loading functionality for all processor classes.
    """

    attributes = ["feature_extractor", "tokenizer"]
    # Names need to be attr_class for attr in attributes
    feature_extractor_class = None
    tokenizer_class = None
    _auto_class = None

    # args have to match the attributes class attribute
    def __init__(self, *args, **kwargs):
        # Sanitize args and kwargs
        for key in kwargs:
            if key not in self.attributes:
                raise TypeError(f"Unexpected keyword argument {key}.")
        for arg, attribute_name in zip(args, self.attributes):
            if attribute_name in kwargs:
                raise TypeError(f"Got multiple values for argument {attribute_name}.")
            else:
                kwargs[attribute_name] = arg

        if len(kwargs) != len(self.attributes):
            raise ValueError(
                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
                f"{len(args)} arguments instead."
            )

        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
        for attribute_name, arg in kwargs.items():
            class_name = getattr(self, f"{attribute_name}_class")
            # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
            if isinstance(class_name, tuple):
                proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
            else:
                proper_class = getattr(transformers_module, class_name)

            if not isinstance(arg, proper_class):
                raise ValueError(
                    f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
                )

            setattr(self, attribute_name, arg)

    def __repr__(self):
        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
        attributes_repr = "\n".join(attributes_repr)
        return f"{self.__class__.__name__}:\n{attributes_repr}"
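
    # Illustrative sketch (not part of the library): a subclass is expected to declare matching `attributes` and
    # `<attribute>_class` entries, which `__init__` above uses to validate and store the positional arguments.
    # The class and argument names below are hypothetical:
    #
    #     class MyProcessor(ProcessorMixin):
    #         attributes = ["feature_extractor", "tokenizer"]
    #         feature_extractor_class = "AutoFeatureExtractor"
    #         tokenizer_class = "AutoTokenizer"
    #
    #         def __init__(self, feature_extractor, tokenizer):
    #             super().__init__(feature_extractor, tokenizer)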

    def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
        """
        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.

        This method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
        [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
        methods above for more information.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
            configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
            custom_object_save(self, save_directory, config=configs)

        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name)
            # Include the processor class in the attribute config so this processor can then be reloaded with the
            # `AutoProcessor` API.
            if hasattr(attribute, "_set_processor_class"):
                attribute._set_processor_class(self.__class__.__name__)
            attribute.save_pretrained(save_directory)

        if self._auto_class is not None:
            # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
            for attribute_name in self.attributes:
                attribute = getattr(self, attribute_name)
                if isinstance(attribute, PreTrainedTokenizerBase):
                    del attribute.init_kwargs["auto_map"]

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("use_auth_token"),
            )
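
    # Illustrative usage sketch (not part of the library; the directory and repo names below are placeholders):
    #
    #     processor = MyProcessor(feature_extractor, tokenizer)
    #     processor.save_pretrained("./my_processor")  # writes the feature extractor and tokenizer files
    #     processor.save_pretrained("./my_processor", push_to_hub=True, repo_id="username/my-processor")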

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a processor associated with a pretrained model.

        This class method is simply calling the feature extractor
        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] and the tokenizer
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of
        the methods above for more information.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - a path to a *directory* containing a feature extractor file saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                - a path or url to a saved feature extractor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            **kwargs
                Additional keyword arguments passed along to both
                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        """
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(*args)

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoProcessor"):
        """
        Register this class with a given auto class. This should only be used for custom processors as the ones in
        the library are already mapped with `AutoProcessor`.

        This API is experimental and may have some slight breaking changes in the next releases.

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
                The auto class to register this new processor with.
        """
        if not isinstance(auto_class, str):
            auto_class = auto_class.__name__

        import transformers.models.auto as auto_module

        if not hasattr(auto_module, auto_class):
            raise ValueError(f"{auto_class} is not a valid auto class.")

        cls._auto_class = auto_class

    @classmethod
    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # Instantiate each attribute (feature extractor, tokenizer, ...) from its declared class.
        args = []
        for attribute_name in cls.attributes:
            class_name = getattr(cls, f"{attribute_name}_class")
            if isinstance(class_name, tuple):
                # A (slow, fast) tokenizer pair: prefer the fast class unless `use_fast=False` or it is unavailable.
                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
                use_fast = kwargs.get("use_fast", True)
                if use_fast and classes[1] is not None:
                    attribute_class = classes[1]
                else:
                    attribute_class = classes[0]
            else:
                attribute_class = getattr(transformers_module, class_name)

            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
        return args

    @property
    def model_input_names(self):
        first_attribute = getattr(self, self.attributes[0])
        return getattr(first_attribute, "model_input_names", None)


ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub)
if ProcessorMixin.push_to_hub.__doc__ is not None:
    ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format(
        object="processor", object_class="AutoProcessor", object_files="processor files"
    )
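
# Illustrative usage sketch (not part of the library; the checkpoint name is only a well-known example):
#
#     from transformers import AutoProcessor
#
#     processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
#     processor.save_pretrained("./local_clip_processor")
#     reloaded = AutoProcessor.from_pretrained("./local_clip_processor")
#
# Custom processor classes can additionally call `register_for_auto_class()` before `save_pretrained` so that
# `AutoProcessor` can reload them from a Hub repository when `trust_remote_code=True` is passed.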