import os
import shutil
import glob
import subprocess
import json
from contextlib import contextmanager

import torch
import pandas as pd
from google.oauth2 import service_account
from pandas_gbq import to_gbq
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import scan_cache_dir

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes
from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media import build_huggingface_path as hf_cap_media_path
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

JIT_DIR = "/data/jit_models"
HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_cap_minor_path("", "social"),
    hf_ontolisst_path(""),
]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap media2
models.append(hf_cap_media2_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes (domains is a dict, so iterate over its values)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    os.makedirs(JIT_DIR, exist_ok=True)

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"⏩ Skipping JIT (already exists): {traced_model_path}")
        else:
            print(f"⚙️ Tracing and saving: {traced_model_path}")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_id, token=HF_TOKEN, device_map="auto"
            )
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
            model.eval()

            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64,
            )

            # JIT trace
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False,
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"✔️ Saved JIT model to: {traced_model_path}")

def df_h():
    df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print("=== Disk Free Space (df -H) ===")
    print(df_result.stdout)

    du_result = subprocess.run(
        ["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True
    )
    print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
    print(du_result.stdout)

def delete_unused_bin_files(model_id: str):
    target_path = f"/data/models--poltextlab--{model_id}"

    # delete files in blobs/
    blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
    # delete .bin files in snapshots/, except config.json
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    files_to_delete = blob_bins + snapshot_bins
    for file_path in files_to_delete:
        if os.path.basename(file_path) == "config.json":
            continue
        if os.path.isfile(file_path):
            print(f"Deleting file: {file_path}")
            os.remove(file_path)
        elif os.path.isdir(file_path):
            print(f"Deleting directory: {file_path}")
            shutil.rmtree(file_path)
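
# Background note (about the standard Hugging Face hub cache layout, not stated
# in the original code): the actual file contents live under blobs/ while
# snapshots/<revision>/ mostly holds symlinks to them, which is why clearing
# blobs/ frees the bulk of the disk space for an already-traced model.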

def delete_http_folders():
    http_folders = glob.glob("/data/http*")
    for folder in http_folders:
        if os.path.isdir(folder):
            print(f"Deleting: {folder}")
            shutil.rmtree(folder)

@contextmanager
def hf_cleanup():
    delete_http_folders()
    try:
        yield
    finally:
        delete_http_folders()
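
# Usage sketch (illustrative, not called anywhere in this module): the context
# manager above removes stray http* download folders both before and after the
# wrapped work, e.g.
#
#     with hf_cleanup():
#         download_hf_models()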

def scan_cache():
    # Scan Hugging Face model cache
    cache_dir = os.environ.get(
        "TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers")
    )
    scan_result = scan_cache_dir(cache_dir)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}): {repo.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧠 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    total_size = 0
    for filename in os.listdir(JIT_DIR):
        if filename.endswith(".pt"):
            path = os.path.join(JIT_DIR, filename)
            size = os.path.getsize(path)
            total_size += size
            print(f"- {filename}: {size / 1e6:.2f} MB")

    print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")

def set_hf_cache_dir(path: str):
    os.environ["TRANSFORMERS_CACHE"] = path
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path

def set_torch_threads():
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"

def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024**3)
    return free_gb < min_free_space_in_GB
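
# Illustrative guard (hypothetical wiring, not present in the original module):
#
#     if is_disk_full(min_free_space_in_GB=10):
#         df_h()                  # log where the space went
#         delete_http_folders()   # reclaim space from stray download folders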

def update_bq_model_table():
    try:
        # Load BQ credentials from HF secret
        service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info
        )

        project_id = os.environ.get("BQ_PROJECT_ID", None)
        dataset_id = os.environ.get("BQ_DATASET_ID", None)
        table_id = os.environ.get("BQ_TABLE_ID", None)  # hf_space_models
        full_table_id = f"{dataset_id}.{table_id}"

        to_gbq(
            pd.DataFrame({"model_id": models}),
            destination_table=full_table_id,
            project_id=project_id,
            if_exists="replace",
            credentials=credentials,
        )
        # TO-DO: add timestamp column?
        print("Updated BigQuery model table!")
    except Exception as e:
        print(f"BigQuery model table update failed: {e}")