|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | """A script running `create_dummy_models.py` with a pre-defined set of arguments. | 
					
						
						|  |  | 
					
						
						|  | This file is intended to be used in a CI workflow file without the need of specifying arguments. It creates and uploads | 
					
						
						|  | tiny models for all model classes (if their tiny versions are not on the Hub yet), as well as produces an updated | 
					
						
						|  | version of `tests/utils/tiny_model_summary.json`. That updated file should be merged into the `main` branch of | 
					
						
						|  | `transformers` so the pipeline testing will use the latest created/updated tiny models. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | import argparse | 
					
						
						|  | import copy | 
					
						
						|  | import json | 
					
						
						|  | import multiprocessing | 
					
						
						|  | import os | 
					
						
						|  | import time | 
					
						
						|  |  | 
					
						
						|  | from create_dummy_models import COMPOSITE_MODELS, create_tiny_models | 
					
						
						|  | from huggingface_hub import ModelFilter, hf_api | 
					
						
						|  |  | 
					
						
						|  | import transformers | 
					
						
						|  | from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoTokenizer | 
					
						
						|  | from transformers.image_processing_utils import BaseImageProcessor | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_all_model_names(): | 
					
						
						|  | model_names = set() | 
					
						
						|  |  | 
					
						
						|  | for module_name in ["modeling_auto", "modeling_tf_auto", "modeling_flax_auto"]: | 
					
						
						|  | module = getattr(transformers.models.auto, module_name, None) | 
					
						
						|  | if module is None: | 
					
						
						|  | continue | 
					
						
						|  |  | 
					
						
						|  | mapping_names = [ | 
					
						
						|  | x | 
					
						
						|  | for x in dir(module) | 
					
						
						|  | if x.endswith("_MAPPING_NAMES") | 
					
						
						|  | and (x.startswith("MODEL_") or x.startswith("TF_MODEL_") or x.startswith("FLAX_MODEL_")) | 
					
						
						|  | ] | 
					
						
						|  | for name in mapping_names: | 
					
						
						|  | mapping = getattr(module, name) | 
					
						
						|  | if mapping is not None: | 
					
						
						|  | for v in mapping.values(): | 
					
						
						|  | if isinstance(v, (list, tuple)): | 
					
						
						|  | model_names.update(v) | 
					
						
						|  | elif isinstance(v, str): | 
					
						
						|  | model_names.add(v) | 
					
						
						|  |  | 
					
						
						|  | return sorted(model_names) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_tiny_model_names_from_repo(): | 
					
						
						|  |  | 
					
						
						|  | model_names = set(get_all_model_names()) | 
					
						
						|  |  | 
					
						
						|  | with open("tests/utils/tiny_model_summary.json") as fp: | 
					
						
						|  | tiny_model_info = json.load(fp) | 
					
						
						|  | tiny_models_names = set() | 
					
						
						|  | for model_base_name in tiny_model_info: | 
					
						
						|  | tiny_models_names.update(tiny_model_info[model_base_name]["model_classes"]) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | not_on_hub = model_names.difference(tiny_models_names) | 
					
						
						|  | for model_name in copy.copy(tiny_models_names): | 
					
						
						|  | if not model_name.startswith("TF") and f"TF{model_name}" in not_on_hub: | 
					
						
						|  | tiny_models_names.remove(model_name) | 
					
						
						|  | elif model_name.startswith("TF") and model_name[2:] in not_on_hub: | 
					
						
						|  | tiny_models_names.remove(model_name) | 
					
						
						|  |  | 
					
						
						|  | return sorted(tiny_models_names) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_tiny_model_summary_from_hub(output_path): | 
					
						
						|  | special_models = COMPOSITE_MODELS.values() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model_names = get_all_model_names() | 
					
						
						|  | models = hf_api.list_models( | 
					
						
						|  | filter=ModelFilter( | 
					
						
						|  | author="hf-internal-testing", | 
					
						
						|  | ) | 
					
						
						|  | ) | 
					
						
						|  | _models = set() | 
					
						
						|  | for x in models: | 
					
						
						|  | model = x.modelId | 
					
						
						|  | org, model = model.split("/") | 
					
						
						|  | if not model.startswith("tiny-random-"): | 
					
						
						|  | continue | 
					
						
						|  | model = model.replace("tiny-random-", "") | 
					
						
						|  | if not model[0].isupper(): | 
					
						
						|  | continue | 
					
						
						|  | if model not in model_names and model not in special_models: | 
					
						
						|  | continue | 
					
						
						|  | _models.add(model) | 
					
						
						|  |  | 
					
						
						|  | models = sorted(_models) | 
					
						
						|  |  | 
					
						
						|  | summary = {} | 
					
						
						|  | for model in models: | 
					
						
						|  | repo_id = f"hf-internal-testing/tiny-random-{model}" | 
					
						
						|  | model = model.split("-")[0] | 
					
						
						|  | try: | 
					
						
						|  | repo_info = hf_api.repo_info(repo_id) | 
					
						
						|  | content = { | 
					
						
						|  | "tokenizer_classes": set(), | 
					
						
						|  | "processor_classes": set(), | 
					
						
						|  | "model_classes": set(), | 
					
						
						|  | "sha": repo_info.sha, | 
					
						
						|  | } | 
					
						
						|  | except Exception: | 
					
						
						|  | continue | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | tokenizer_fast = AutoTokenizer.from_pretrained(repo_id) | 
					
						
						|  | content["tokenizer_classes"].add(tokenizer_fast.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | tokenizer_slow = AutoTokenizer.from_pretrained(repo_id, use_fast=False) | 
					
						
						|  | content["tokenizer_classes"].add(tokenizer_slow.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | img_p = AutoImageProcessor.from_pretrained(repo_id) | 
					
						
						|  | content["processor_classes"].add(img_p.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | feat_p = AutoFeatureExtractor.from_pretrained(repo_id) | 
					
						
						|  | if not isinstance(feat_p, BaseImageProcessor): | 
					
						
						|  | content["processor_classes"].add(feat_p.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | model_class = getattr(transformers, model) | 
					
						
						|  | m = model_class.from_pretrained(repo_id) | 
					
						
						|  | content["model_classes"].add(m.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  | try: | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | model_class = getattr(transformers, f"TF{model}") | 
					
						
						|  | m = model_class.from_pretrained(repo_id) | 
					
						
						|  | content["model_classes"].add(m.__class__.__name__) | 
					
						
						|  | except Exception: | 
					
						
						|  | pass | 
					
						
						|  |  | 
					
						
						|  | content["tokenizer_classes"] = sorted(content["tokenizer_classes"]) | 
					
						
						|  | content["processor_classes"] = sorted(content["processor_classes"]) | 
					
						
						|  | content["model_classes"] = sorted(content["model_classes"]) | 
					
						
						|  |  | 
					
						
						|  | summary[model] = content | 
					
						
						|  | with open(os.path.join(output_path, "hub_tiny_model_summary.json"), "w") as fp: | 
					
						
						|  | json.dump(summary, fp, ensure_ascii=False, indent=4) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | parser = argparse.ArgumentParser() | 
					
						
						|  | parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.") | 
					
						
						|  | args = parser.parse_args() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | multiprocessing.set_start_method("spawn") | 
					
						
						|  |  | 
					
						
						|  | output_path = "tiny_models" | 
					
						
						|  | all = True | 
					
						
						|  | model_types = None | 
					
						
						|  | models_to_skip = get_tiny_model_names_from_repo() | 
					
						
						|  | no_check = True | 
					
						
						|  | upload = True | 
					
						
						|  | organization = "hf-internal-testing" | 
					
						
						|  |  | 
					
						
						|  | create_tiny_models( | 
					
						
						|  | output_path, | 
					
						
						|  | all, | 
					
						
						|  | model_types, | 
					
						
						|  | models_to_skip, | 
					
						
						|  | no_check, | 
					
						
						|  | upload, | 
					
						
						|  | organization, | 
					
						
						|  | token=os.environ.get("TOKEN", None), | 
					
						
						|  | num_workers=args.num_workers, | 
					
						
						|  | ) | 
					
						
						|  |  |