|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Utilities to dynamically load objects from the Hub.""" |
|
|
|
import importlib |
|
import os |
|
import re |
|
import shutil |
|
import subprocess |
|
import sys |
|
import tempfile |
|
from pathlib import Path |
|
from typing import Dict, Optional, Union |
|
|
|
from huggingface_hub import model_info |
|
|
|
from .utils import HF_MODULES_CACHE, TRANSFORMERS_DYNAMIC_MODULE_NAME, cached_file, is_offline_mode, logging |
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
|
|
|
|
|
def init_hf_modules():
    """
    Creates the cache directory for modules with an init, and adds it to the Python path.

    Nothing is done when the cache directory is already listed in `sys.path`.
    """
    if HF_MODULES_CACHE not in sys.path:
        sys.path.append(HF_MODULES_CACHE)
        os.makedirs(HF_MODULES_CACHE, exist_ok=True)

        # An (empty) `__init__.py` is written at the root of the cache directory, unless one is already there.
        package_marker = Path(HF_MODULES_CACHE, "__init__.py")
        if not package_marker.exists():
            package_marker.touch()
|
|
|
|
|
def create_dynamic_module(name: Union[str, os.PathLike]):
    """
    Creates a dynamic module in the cache directory for modules.

    Args:
        name (`str` or `os.PathLike`):
            Path of the module to create; it is joined onto the modules cache directory.
    """
    init_hf_modules()
    module_dir = Path(HF_MODULES_CACHE) / name
    parent_dir = module_dir.parent

    # Build the missing ancestors first, so each of them also receives its own `__init__.py`.
    if not parent_dir.exists():
        create_dynamic_module(parent_dir)

    os.makedirs(module_dir, exist_ok=True)
    package_marker = module_dir / "__init__.py"
    if not package_marker.exists():
        package_marker.touch()
|
|
|
|
|
def get_relative_imports(module_file):
    """
    Get the list of modules that are relatively imported in a module file.

    Args:
        module_file (`str` or `os.PathLike`): The module file to inspect.

    Returns:
        `List[str]`: The distinct names found after the leading dot of each relative import, in no particular order.
    """
    with open(module_file, "r", encoding="utf-8") as handle:
        source = handle.read()

    # First pattern: lines of the form `import .xxx`; second pattern: lines of the form `from .xxx import yyy`.
    patterns = (
        r"^\s*import\s+\.(\S+)\s*$",
        r"^\s*from\s+\.(\S+)\s+import",
    )
    matches = []
    for pattern in patterns:
        matches += re.findall(pattern, source, flags=re.MULTILINE)

    # Going through a set removes duplicates.
    return list(set(matches))
|
|
|
|
|
def get_relative_import_files(module_file):
    """
    Get the list of all files that are needed for a given module. Note that this function recurses through the relative
    imports (if a imports b and b imports c, it will return module files for b and c).

    Every file is listed at most once and `module_file` itself is never part of the result, so circular relative
    imports terminate instead of being followed forever.

    Args:
        module_file (`str` or `os.PathLike`): The module file to inspect.

    Returns:
        `List[str]`: The paths (ending in `.py`) of the relatively imported files, all resolved against the folder
        containing `module_file`.
    """
    module_path = Path(module_file).parent
    # Paths are tracked in the same "<folder>/<name>.py" form that is returned. Seeding with the root file prevents
    # a cycle leading back to it from being scanned again or reported as its own dependency.
    seen = {str(Path(module_file))}
    files_to_check = [module_file]
    all_relative_imports = []

    # Breadth-first walk: each round inspects only the files discovered during the previous round.
    while files_to_check:
        new_imports = []
        for f in files_to_check:
            new_imports.extend(get_relative_imports(f))

        files_to_check = []
        for m in new_imports:
            candidate = f"{module_path / m}.py"
            if candidate not in seen:
                seen.add(candidate)
                files_to_check.append(candidate)

        all_relative_imports.extend(files_to_check)

    return all_relative_imports
|
|
|
|
|
def check_imports(filename):
    """
    Check if the current Python environment contains all the libraries that are imported in a file.

    Imports located between a `try:` and its `except ...:` clause are treated as optional and are not checked.

    Args:
        filename (`str` or `os.PathLike`): The module file to check.

    Returns:
        `List[str]`: The relative imports of the file, as returned by `get_relative_imports`.

    Raises:
        `ImportError`: If at least one of the (non-relative) imported top-level packages cannot be imported.
    """
    with open(filename, "r", encoding="utf-8") as f:
        content = f.read()

    # Remove everything from a `try:` up to the matching `except ...:` so guarded imports are not required.
    # `re.DOTALL` lets the guarded body span several lines, and `except\s*.*?:` also accepts typed handlers such as
    # `except ImportError:` (a bare `except:` alone is not enough).
    content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL)

    # Lines of the form `import xxx`
    imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
    # Lines of the form `from xxx import yyy`
    imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
    # Keep only the top-level package of absolute imports; sorting makes the error message deterministic.
    imports = sorted({imp.split(".")[0] for imp in imports if not imp.startswith(".")})

    missing_packages = []
    for imp in imports:
        try:
            importlib.import_module(imp)
        except ImportError:
            missing_packages.append(imp)

    if missing_packages:
        raise ImportError(
            "This modeling file requires the following packages that were not found in your environment: "
            f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
        )

    return get_relative_imports(filename)
|
|
|
|
|
def get_class_in_module(class_name, module_path):
    """
    Import a module on the cache directory for modules and extract a class from it.

    Args:
        class_name (`str`): Name of the attribute to fetch from the imported module.
        module_path (`str`):
            Path of the module, without the `.py` extension, using `os.path.sep` as separator. It is resolved against
            the modules cache directory to locate the file, and converted to a dotted name for the import.

    Returns:
        The attribute named `class_name` of the imported module.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        module_dir = Path(HF_MODULES_CACHE) / os.path.dirname(module_path)
        module_file_name = module_path.split(os.path.sep)[-1] + ".py"

        # NOTE(review): the copy-out / delete / copy-back sequence below presumably exists so that the import system
        # sees a freshly written file -- confirm the exact motivation before changing the order of these steps.

        # Keep a copy of the module file in the temporary directory.
        shutil.copy(f"{module_dir}/{module_file_name}", tmp_dir)

        # Command that deletes the cached file; the path is embedded as a raw string literal so backslashes in it
        # are not interpreted as escapes.
        cmd = f'import os; os.remove(r"{module_dir}{os.path.sep}{module_file_name}")'

        # The deletion runs in a separate interpreter process. `subprocess.run` raises `FileNotFoundError` when the
        # executable itself is missing, so `python3` is tried when `python` is not available; if neither is found
        # the deletion is skipped silently. A failure inside the child process is not raised here.
        try:
            subprocess.run(["python", "-c", cmd])
        except FileNotFoundError:
            try:
                subprocess.run(["python3", "-c", cmd])
            except FileNotFoundError:
                pass

        # Put the module file back in place from the temporary copy.
        shutil.copyfile(f"{tmp_dir}/{module_file_name}", f"{module_dir}/{module_file_name}")

        # Turn the path into a dotted module name and import it.
        module_path = module_path.replace(os.path.sep, ".")
        module = importlib.import_module(module_path)

    return getattr(module, class_name)
|
|
|
|
|
def get_cached_module_file(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    module_file: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
):
    """
    Prepares a module file from a local folder, or downloads it from a distant repo, and returns its path inside the
    cached Transformers module.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        module_file (`str`):
            The name of the module file containing the class to look for.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `str`: The path to the module inside the cache, relative to the modules cache directory.
    """
    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    # `normpath` drops a trailing separator, so `./my_model_directory/` gives `my_model_directory` rather than an
    # empty name (which would put the files of every such folder directly in the shared package root).
    last_component = os.path.basename(os.path.normpath(pretrained_model_name_or_path))
    if os.path.isdir(pretrained_model_name_or_path):
        submodule = last_component
    else:
        submodule = pretrained_model_name_or_path.replace("/", os.path.sep)

    try:
        # Resolve the module file (local file or cached download).
        resolved_module_file = cached_file(
            pretrained_model_name_or_path,
            module_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
            revision=revision,
        )
    except EnvironmentError:
        logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
        raise

    # Fail early if a required package is missing, and collect the relatively imported modules.
    modules_needed = check_imports(resolved_module_file)

    # Create the package that will hold the module inside the modules cache.
    full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
    create_dynamic_module(full_submodule)
    submodule_path = Path(HF_MODULES_CACHE) / full_submodule

    # NOTE(review): a root-level model id (no "/") that is not a local folder also satisfies this comparison and is
    # therefore handled by the "local folder" branch -- confirm that this is intended.
    if submodule == last_component:
        # Local folder: the files are copied over on every call, so local edits are always picked up.
        shutil.copy(resolved_module_file, submodule_path / module_file)
        for module_needed in modules_needed:
            module_needed = f"{module_needed}.py"
            shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
    else:
        # Remote repo: files are stored in a sub-package named after the commit hash of the requested revision.
        commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=use_auth_token).sha

        submodule_path = submodule_path / commit_hash
        full_submodule = full_submodule + os.path.sep + commit_hash
        create_dynamic_module(full_submodule)

        if not (submodule_path / module_file).exists():
            shutil.copy(resolved_module_file, submodule_path / module_file)

        # Fetch every relatively imported module that is not in the cache yet. The names in `modules_needed` carry
        # no extension, so `.py` has to be appended for the existence check to ever succeed.
        for module_needed in modules_needed:
            if not (submodule_path / f"{module_needed}.py").exists():
                get_cached_module_file(
                    pretrained_model_name_or_path,
                    f"{module_needed}.py",
                    cache_dir=cache_dir,
                    force_download=force_download,
                    resume_download=resume_download,
                    proxies=proxies,
                    use_auth_token=use_auth_token,
                    revision=revision,
                    local_files_only=local_files_only,
                )
    return os.path.join(full_submodule, module_file)
|
|
|
|
|
def get_class_from_dynamic_module(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    module_file: str,
    class_name: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    **kwargs,
):
    """
    Extracts a class from a module file, present in the local folder or repository of a model.

    <Tip warning={true}>

    Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
    therefore only be called on trusted repos.

    </Tip>

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        module_file (`str`):
            The name of the module file containing the class to look for.
        class_name (`str`):
            The name of the class to import in the module.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        kwargs:
            Additional keyword arguments are accepted but not used by this function.

    <Tip>

    Passing `use_auth_token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `type`: The class, dynamically imported from the module.

    Examples:

    ```python
    # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
    # module.
    cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
    ```"""
    # Make sure the module file (and its relative imports) is available in the modules cache.
    final_module = get_cached_module_file(
        pretrained_model_name_or_path,
        module_file,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        use_auth_token=use_auth_token,
        revision=revision,
        local_files_only=local_files_only,
    )
    # Strip only the trailing extension: `str.replace(".py", "")` would also remove a ".py" occurring earlier in the
    # path (for instance inside a folder name).
    if final_module.endswith(".py"):
        final_module = final_module[: -len(".py")]
    return get_class_in_module(class_name, final_module)
|
|
|
|
|
def custom_object_save(obj, folder, config=None):
    """
    Save the modeling files corresponding to a custom model/configuration/tokenizer etc. in a given folder. Optionally
    adds the proper fields in a config.

    When `obj` is defined in `__main__`, a warning is logged and nothing is saved nor registered.

    Args:
        obj (`Any`): The object for which to save the module files.
        folder (`str` or `os.PathLike`): The folder where to save.
        config (`PretrainedConfig`, dictionary, or a list/tuple of those, `optional`):
            A config (or several configs) in which to register the auto_map corresponding to this custom object.
    """
    if obj.__module__ == "__main__":
        logger.warning(
            f"We can't save the code defining {obj} in {folder} as it's been defined in __main__. You should put "
            "this code in a separate module so we can include it in the saved folder and make it easier to share via "
            "the Hub."
        )
        # Stop here: going on would register an unusable `__main__.<class>` entry in the auto_map and try to copy
        # the file of `__main__`, which does not exist in an interactive session.
        return

    def _set_auto_map_in_config(_config):
        # The auto_map entry has the form "<last part of the module name>.<class name>".
        module_name = obj.__class__.__module__
        last_module = module_name.split(".")[-1]
        full_name = f"{last_module}.{obj.__class__.__name__}"

        # Tokenizers are registered as a `(slow_tokenizer_class, fast_tokenizer_class)` pair.
        if "Tokenizer" in full_name:
            slow_tokenizer_class = None
            fast_tokenizer_class = None
            if obj.__class__.__name__.endswith("Fast"):
                # Fast tokenizer: the slow class is only known when exposed as the `slow_tokenizer_class` attribute.
                fast_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"
                if getattr(obj, "slow_tokenizer_class", None) is not None:
                    slow_tokenizer = getattr(obj, "slow_tokenizer_class")
                    slow_tok_module_name = slow_tokenizer.__module__
                    last_slow_tok_module = slow_tok_module_name.split(".")[-1]
                    slow_tokenizer_class = f"{last_slow_tok_module}.{slow_tokenizer.__name__}"
            else:
                # Slow tokenizer: the fast entry is left to `None`.
                slow_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"

            full_name = (slow_tokenizer_class, fast_tokenizer_class)

        # Register under `obj._auto_class`, keeping any entries already present in the auto_map.
        if isinstance(_config, dict):
            auto_map = _config.get("auto_map", {})
            auto_map[obj._auto_class] = full_name
            _config["auto_map"] = auto_map
        elif getattr(_config, "auto_map", None) is not None:
            _config.auto_map[obj._auto_class] = full_name
        else:
            _config.auto_map = {obj._auto_class: full_name}

    # Register the custom object in the config(s), if any.
    if isinstance(config, (list, tuple)):
        for cfg in config:
            _set_auto_map_in_config(cfg)
    elif config is not None:
        _set_auto_map_in_config(config)

    # Copy the file defining the object into the folder.
    object_file = sys.modules[obj.__module__].__file__
    dest_file = Path(folder) / (Path(object_file).name)
    shutil.copy(object_file, dest_file)

    # Copy every file reached through relative imports next to it.
    for needed_file in get_relative_import_files(object_file):
        dest_file = Path(folder) / (Path(needed_file).name)
        shutil.copy(needed_file, dest_file)
|
|