""" |
Processing saving/loading class for common processors.
"""

import os
from pathlib import Path

from .dynamic_module_utils import custom_object_save
from .tokenization_utils_base import PreTrainedTokenizerBase
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging


logger = logging.get_logger(__name__)

# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
transformers_module = direct_transformers_import(Path(__file__).parent)


AUTO_TO_BASE_CLASS_MAPPING = {
    "AutoTokenizer": "PreTrainedTokenizerBase",
    "AutoFeatureExtractor": "FeatureExtractionMixin",
    "AutoImageProcessor": "ImageProcessingMixin",
}
|
|
class ProcessorMixin(PushToHubMixin): |
    """
    This is a mixin used to provide saving/loading functionality for all processor classes.
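
    Example (a minimal subclass sketch; `MyProcessor` is hypothetical, and the attribute classes shown are one
    plausible choice, not a pairing required by the library):

    ```python
    class MyProcessor(ProcessorMixin):
        attributes = ["feature_extractor", "tokenizer"]
        feature_extractor_class = "AutoFeatureExtractor"
        tokenizer_class = "AutoTokenizer"
    ```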
|
""" |
|
    attributes = ["feature_extractor", "tokenizer"]
    # Names need to be `{attribute}_class` for each attribute in `attributes`.
    feature_extractor_class = None
    tokenizer_class = None
    _auto_class = None
|
    def __init__(self, *args, **kwargs):
        # Sanitize args and kwargs
        for key in kwargs:
            if key not in self.attributes:
                raise TypeError(f"Unexpected keyword argument {key}.")
        for arg, attribute_name in zip(args, self.attributes):
            if attribute_name in kwargs:
                raise TypeError(f"Got multiple values for argument {attribute_name}.")
            else:
                kwargs[attribute_name] = arg

        # `args` were merged into `kwargs` above, so `kwargs` now holds every argument received.
        if len(kwargs) != len(self.attributes):
            raise ValueError(
                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
                f"{len(kwargs)} arguments instead."
            )
|
        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order).
        for attribute_name, arg in kwargs.items():
            class_name = getattr(self, f"{attribute_name}_class")
            # Nothing is ever going to be an instance of "AutoXxx", so check against the base class instead.
            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
            if isinstance(class_name, tuple):
                proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
            else:
                proper_class = getattr(transformers_module, class_name)

            if not isinstance(arg, proper_class):
                raise ValueError(
                    f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
                )

            setattr(self, attribute_name, arg)
|
    def __repr__(self):
        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
        attributes_repr = "\n".join(attributes_repr)
        return f"{self.__class__.__name__}:\n{attributes_repr}"
|
    def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
        """
        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.

        <Tip>

        This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
        [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
        methods above for more information.

        </Tip>

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
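
        Example (illustrative; `openai/clip-vit-base-patch32` is just one public checkpoint that ships a processor,
        and `./my_processor` is an arbitrary local path):

        ```python
        >>> from transformers import CLIPProcessor

        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        >>> # Writes the tokenizer files and `preprocessor_config.json` into the directory.
        >>> processor.save_pretrained("./my_processor")
        ```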
|
""" |
        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # `save_directory` may be an `os.PathLike`, so normalize it to `str` before splitting.
            repo_id = kwargs.pop("repo_id", str(save_directory).split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)
|
        # If we have a custom config, we copy the file defining it in the folder and set the attributes so the
        # processor can be loaded from the Hub.
        if self._auto_class is not None:
            attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
            configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
            custom_object_save(self, save_directory, config=configs)
|
        for attribute_name in self.attributes:
            attribute = getattr(self, attribute_name)
            # Include the processor class in the attribute config so this processor can then be reloaded with the
            # `AutoProcessor` API.
            if hasattr(attribute, "_set_processor_class"):
                attribute._set_processor_class(self.__class__.__name__)
            attribute.save_pretrained(save_directory)
|
        if self._auto_class is not None:
            # `custom_object_save` added an `auto_map` entry to the tokenizers' `init_kwargs`; now that saving is
            # done, remove it so the in-memory tokenizers are left unchanged. `pop` avoids a `KeyError` if the entry
            # is missing.
            for attribute_name in self.attributes:
                attribute = getattr(self, attribute_name)
                if isinstance(attribute, PreTrainedTokenizerBase):
                    attribute.init_kwargs.pop("auto_map", None)
|
        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("use_auth_token"),
            )
|
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a processor associated with a pretrained model.

        <Tip>

        This class method is simply calling the feature extractor
        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
        [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] and the tokenizer
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of
        the methods above for more information.

        </Tip>

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - a path to a *directory* containing a feature extractor file saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                - a path or url to a saved feature extractor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            **kwargs
                Additional keyword arguments passed along to both
                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
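
        Example (illustrative; `openai/clip-vit-base-patch32` is just one public checkpoint whose repo contains both
        a tokenizer and an image processor):

        ```python
        >>> from transformers import CLIPProcessor

        >>> # Instantiates each attribute class from the checkpoint, then bundles them into the processor.
        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        ```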
|
""" |
        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(*args)
|
    @classmethod
    def register_for_auto_class(cls, auto_class="AutoProcessor"):
        """
        Register this class with a given auto class. This should only be used for custom processor classes, as the
        ones in the library are already mapped with `AutoProcessor`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
                The auto class to register this new processor with.
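
        Example (a sketch; `MyProcessor` is a hypothetical custom processor class):

        ```python
        >>> from transformers import ProcessorMixin


        >>> class MyProcessor(ProcessorMixin):
        ...     attributes = ["feature_extractor", "tokenizer"]
        ...     feature_extractor_class = "AutoFeatureExtractor"
        ...     tokenizer_class = "AutoTokenizer"


        >>> MyProcessor.register_for_auto_class()  # defaults to "AutoProcessor"
        ```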
|
""" |
        if not isinstance(auto_class, str):
            auto_class = auto_class.__name__

        import transformers.models.auto as auto_module

        if not hasattr(auto_module, auto_class):
            raise ValueError(f"{auto_class} is not a valid auto class.")

        cls._auto_class = auto_class
|
    @classmethod
    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        args = []
        for attribute_name in cls.attributes:
            class_name = getattr(cls, f"{attribute_name}_class")
            if isinstance(class_name, tuple):
                # A tuple holds the (slow, fast) variants of the class; pick the fast one when requested and present.
                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
                use_fast = kwargs.get("use_fast", True)
                if use_fast and classes[1] is not None:
                    attribute_class = classes[1]
                else:
                    attribute_class = classes[0]
            else:
                attribute_class = getattr(transformers_module, class_name)

            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
        return args
|
    @property
    def model_input_names(self):
        """The input names expected by the model, read from the first attribute (e.g. the feature extractor)."""
        first_attribute = getattr(self, self.attributes[0])
        return getattr(first_attribute, "model_input_names", None)
|
|
# `copy_func` gives `ProcessorMixin` its own copy of `push_to_hub`, so that formatting the docstring below does not
# mutate the method shared with other classes through `PushToHubMixin`.
ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub)
if ProcessorMixin.push_to_hub.__doc__ is not None:
    ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format(
        object="processor", object_class="AutoProcessor", object_files="processor files"
    )