import os
import shutil
import subprocess

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import scan_cache_dir

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path  # why... just follow the name template the next time pls
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
JIT_DIR = "/data/jit_models"
HF_TOKEN = os.environ["hf_read"]
# should be a temporary solution
models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))
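
# Optional sanity step (my addition, not in the original build): the nested
# loops above may produce the same repo id more than once, so dedupe while
# preserving order before downloading.
models = list(dict.fromkeys(models))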

# every model in this Space shares the same base tokenizer
tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    # Ensure the JIT model directory exists
    os.makedirs(JIT_DIR, exist_ok=True)

    # All models share the same tokenizer, so load it once
    tokenizer = AutoTokenizer.from_pretrained(tokenizers[0])

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")

        # Download (and cache) the model weights
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            token=HF_TOKEN,
            device_map="auto"
        )

        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            print(f"⏩ Skipping JIT - already exists: {traced_model_path}")
        else:
            print(f"✍️ Tracing and saving: {traced_model_path}")
            model.eval()

            # Dummy input for tracing, moved onto the model's device so the
            # trace also works when device_map places the model on a GPU
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=256
            )
            dummy_input = {k: v.to(model.device) for k, v in dummy_input.items()}

            # JIT trace without strict output checks (the model returns a dict)
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"✍️ Saved JIT model to: {traced_model_path}")

def df_h():
    """Print host disk usage (df -H)."""
    result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print(result.stdout)

def scan_cache():
    # scan_cache_dir() expects a hub-style cache layout; fall back to the
    # default transformers location when TRANSFORMERS_CACHE is not set
    cache_dir = os.environ.get("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers"))
    scan_result = scan_cache_dir(cache_dir)

    print("=== Model Cache Report ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}) - {repo.size_on_disk / 1e6:.2f} MB")

def set_hf_cache_dir(path: str):
    os.environ['TRANSFORMERS_CACHE'] = path
    os.environ['HF_HOME'] = path
    os.environ['HF_DATASETS_CACHE'] = path
    os.environ['TORCH_HOME'] = path
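
# Usage note (an assumption about load order, not verified against this
# Space's entrypoint): transformers and torch resolve these variables when
# they first compute their cache paths, so set_hf_cache_dir() should run
# before any from_pretrained() call, e.g.:
#
#   set_hf_cache_dir("/data/hf_cache")  # hypothetical path
#   download_hf_models()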

def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024 ** 3)
    return free_gb < min_free_space_in_GB
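
# Minimal driver sketch (my assumption: this module doubles as a startup
# script; the original file defines the helpers but shows no entrypoint).
if __name__ == "__main__":
    if is_disk_full():
        df_h()
        raise SystemExit("Less than 10 GB free - aborting model downloads.")
    download_hf_models()
    scan_cache()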