Spaces:

poltextlab
/

babel_machine

Running

File size: 6,851 Bytes

e390ccc
4bba8df
b1b87fb
4bba8df
3f77878
e390ccc
ca62943
 
4bba8df
e390ccc
c554973
 
 
4bba8df
 
 
 
e390ccc
4bba8df
af68a82
17ff73c
0c9d7b1
e390ccc
 
 
4bba8df
 
 
 
e390ccc
44d3c68
 
caa0374
d68fe8b
e390ccc
 
c554973
4bba8df
 
 
 
6d39e54
 
c554973
4bba8df
2926563
 
af68a82
0c9d7b1
 
 
af68a82
 
2926563
4bba8df
 
 
 
 
 
 
c554973
e390ccc
 
3abd99d
caa0374
 
41bc8d2
caa0374
4bba8df
caa0374
 
654bf8b
fb1a253
8453705
654bf8b
 
 
8453705
 
 
 
 
 
 
654bf8b
 
 
 
 
 
 
 
 
06f6aab
654bf8b
 
 
 
 
 
 
 
 
 
 
 
4bba8df
 
04d7b9c
 
 
 
 
 
 
4bba8df
8453705
 
 
bf07f99
65e6711
10307a1
bf07f99
 
 
 
 
 
 
 
 
8d3cc6e
 
 
 
 
 
8453705
b1b87fb
3f77878
b1b87fb
 
 
 
 
3f77878
 
 
 
 
 
 
 
 
b1b87fb
 
af77a1c
0a394ee
44d3c68
 
af77a1c
0a394ee
44d3c68
 
 
 
0a394ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bba8df
 
 
 
 
 
8027e9b
 
 
 
 
 
4bba8df
e390ccc
4bba8df

import os
import shutil
import glob
import subprocess
from contextlib import contextmanager

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap

from interfaces.emotion9 import languages as languages_emotion9

from interfaces.illframes import domains as domains_illframes

from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

from huggingface_hub import scan_cache_dir

# Directory where download_hf_models() stores the TorchScript-traced models
# (one "<model id>.pt" file per model).
JIT_DIR = "/data/jit_models"

# Hugging Face access token, read from the "hf_read" environment variable.
# Importing this module raises KeyError if that variable is not set.
HF_TOKEN = os.environ["hf_read"]

# Model identifiers to download and trace, produced by each interface's
# build_huggingface_path helper. Should be a temporary solution.
# NOTE(review): the empty-string arguments are presumably placeholders for
# interfaces whose model path does not depend on language/domain -- confirm
# against the interfaces package.
models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_ontolisst_path("")]

# CAP is more involved: one model per (language, domain) combination.
# The imported domains_cap mapping is rebound here to a list of its values.
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap media2
models.append(hf_cap_media2_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9: one model per language
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes: domains is a dict here, so iterate over its values
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

# Tokenizer checkpoint name(s). download_hf_models() passes the same
# "xlm-roberta-large" literal directly rather than reading this list.
tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    """Download every model in ``models`` and save a TorchScript trace of it.

    For each model id the traced file is written to
    ``JIT_DIR/<model id with "/" replaced by "_">.pt``.

    * If that file already exists, tracing is skipped and
      ``delete_unused_bin_files`` is called for the model instead.
    * Otherwise the model is loaded with
      ``AutoModelForSequenceClassification.from_pretrained`` (using
      ``HF_TOKEN``), put into eval mode, traced with ``torch.jit.trace`` on a
      short dummy sentence and saved.

    The "xlm-roberta-large" tokenizer used to build the dummy input is loaded
    at most once per call, and only if at least one model needs tracing.
    """
    os.makedirs(JIT_DIR, exist_ok=True)

    # Every model is traced with the same tokenizer, so it is loaded lazily
    # on the first model that needs tracing and reused afterwards instead of
    # being re-instantiated on every loop iteration.
    tokenizer = None

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")

        # "/" is not allowed in a file name, so flatten the repo id.
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
            continue

        print(f"⚙️  Tracing and saving: {traced_model_path}")

        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            token=HF_TOKEN,
            device_map="auto"
        )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

        model.eval()

        # Dummy input for tracing
        dummy_input = tokenizer(
            "Hello, world!",
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=64
        )

        # JIT trace; strict=False is passed through to torch.jit.trace.
        traced_model = torch.jit.trace(
            model,
            (dummy_input["input_ids"], dummy_input["attention_mask"]),
            strict=False
        )

        # Save traced model
        traced_model.save(traced_model_path)
        print(f"✔️ Saved JIT model to: {traced_model_path}")
        
def df_h():
    """Print the output of ``df -H`` and of ``du -h --max-depth=2 /data/``.

    Each command is run with its output captured, then a banner line and the
    captured stdout are printed. Return codes and stderr are not inspected.
    """
    reports = (
        ("=== Disk Free Space (df -H) ===", ["df", "-H"]),
        (
            "=== Disk Usage for /data/ (du -h --max-depth=2) ===",
            ["du", "-h", "--max-depth=2", "/data/"],
        ),
    )

    for banner, command in reports:
        completed = subprocess.run(command, capture_output=True, text=True)
        print(banner)
        print(completed.stdout)
    

def delete_unused_bin_files(model_id: str, cache_root: str = "/data"):
    """Delete the cached weight files of one model from the Hugging Face cache.

    Removes everything below ``<repo dir>/blobs/`` and every ``*.bin`` entry
    below ``<repo dir>/snapshots/``. Entries named ``config.json`` are kept.

    NOTE(review): blobs are matched by path, not by content, so a blob that a
    snapshot's ``config.json`` links to is removed as well -- confirm this is
    intended.

    Args:
        model_id: Either a bare model name (looked up under the "poltextlab"
            organisation, as before) or a full ``"org/name"`` repo id, which
            is mapped to the cache directory ``models--org--name``.
        cache_root: Directory that contains the ``models--*`` cache folders.
            Defaults to ``/data``, the previously hard-coded location.
    """
    if "/" in model_id:
        # Full repo id: the cache folder name replaces "/" with "--".
        repo_dir = "models--" + model_id.replace("/", "--")
    else:
        repo_dir = f"models--poltextlab--{model_id}"
    target_path = os.path.join(cache_root, repo_dir)

    # everything in blobs/
    blob_entries = glob.glob(
        os.path.join(target_path, "blobs", "**", "*"), recursive=True
    )

    # .bin files in snapshots/
    snapshot_bins = glob.glob(
        os.path.join(target_path, "snapshots", "**", "*.bin"), recursive=True
    )

    for path in blob_entries + snapshot_bins:
        if os.path.basename(path) == "config.json":
            continue
        # islink() first: a snapshot entry may be a symlink whose blob was
        # already removed above; isfile()/isdir() are both False for such a
        # dangling link, so it would otherwise be left behind.
        if os.path.islink(path) or os.path.isfile(path):
            print(f"Deleting file: {path}")
            os.remove(path)
        elif os.path.isdir(path):
            print(f"Deleting directory: {path}")
            shutil.rmtree(path)
    
    
def delete_http_folders():
    """Recursively delete every directory matching ``/data/http*``.

    Matches that are not directories are left alone. Each removed directory
    is announced on stdout before it is deleted.
    """
    # filter() is lazy, so the isdir check still happens right before each
    # individual deletion.
    for http_dir in filter(os.path.isdir, glob.glob("/data/http*")):
        print(f"Deleting: {http_dir}")
        shutil.rmtree(http_dir)
       
            
@contextmanager
def hf_cleanup():
    """Context manager that runs delete_http_folders() around a block.

    delete_http_folders() is called once before the ``with`` body starts and
    once more when the body finishes, including when it exits by raising.
    Nothing is yielded to the ``as`` target.
    """
    delete_http_folders()
    try:
        yield
    finally:
        # Runs on both normal exit and exceptions from the with-body.
        delete_http_folders()
    
    
def scan_cache():
    """Print a size report for the Hugging Face cache and the JIT model dir.

    The Hugging Face cache location is taken from the ``TRANSFORMERS_CACHE``
    environment variable, falling back to
    ``~/.cache/huggingface/transformers``; it is scanned with
    ``scan_cache_dir`` and the total size plus one line per repo is printed.

    Afterwards every ``.pt`` file directly inside ``JIT_DIR`` is listed with
    its size, followed by the summed size. If ``JIT_DIR`` does not exist a
    notice is printed instead.
    """
    fallback_cache = os.path.expanduser("~/.cache/huggingface/transformers")
    hf_cache_path = os.environ.get("TRANSFORMERS_CACHE", fallback_cache)
    report = scan_cache_dir(hf_cache_path)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {report.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(report.repos)}")
    for cached_repo in report.repos:
        repo_mb = cached_repo.size_on_disk / 1e6
        print(f"- {cached_repo.repo_id} ({cached_repo.repo_type}) — {repo_mb:.2f} MB")

    print("\n=== 🧊 TorchScript JIT Cache ===")
    if os.path.exists(JIT_DIR):
        jit_bytes = 0
        for entry in os.listdir(JIT_DIR):
            # Only traced model files count towards the report.
            if not entry.endswith(".pt"):
                continue
            entry_bytes = os.path.getsize(os.path.join(JIT_DIR, entry))
            jit_bytes += entry_bytes
            print(f"- {entry}: {entry_bytes / 1e6:.2f} MB")

        print(f"Total JIT cache size: {jit_bytes / 1e6:.2f} MB")
    else:
        print(f"(Directory does not exist: {JIT_DIR})")
    
def set_hf_cache_dir(path: str):
    """Point the Hugging Face and Torch cache environment variables at *path*.

    Sets ``TRANSFORMERS_CACHE``, ``HF_HOME``, ``HF_DATASETS_CACHE`` and
    ``TORCH_HOME`` in ``os.environ`` to the same value.
    """
    cache_variables = (
        'TRANSFORMERS_CACHE',
        'HF_HOME',
        'HF_DATASETS_CACHE',
        'TORCH_HOME',
    )
    for variable in cache_variables:
        os.environ[variable] = path
  
    
def set_torch_threads():
    """Restrict Torch, OpenMP and MKL to a single thread.

    Calls ``torch.set_num_threads(1)`` and then sets the ``OMP_NUM_THREADS``
    and ``MKL_NUM_THREADS`` environment variables to ``"1"``.
    """
    torch.set_num_threads(1)
    for thread_variable in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ[thread_variable] = "1"


def is_disk_full(min_free_space_in_GB=10):
    """Report whether the root filesystem has too little free space.

    Args:
        min_free_space_in_GB: Required free space on "/" in GiB
            (bytes / 1024**3).

    Returns:
        False when at least that much space is free, True otherwise.
    """
    free_bytes = shutil.disk_usage("/").free
    free_gib = free_bytes / (1024 ** 3)
    has_enough_space = free_gib >= min_free_space_in_GB
    return not has_enough_space