Spaces:

poltextlab
/

babel_machine

Running

kovacsvi committed on Jun 3

Commit

8453705

1 Parent(s): 7cbaea3

delete unused model weights (before JIT)

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -63,27 +63,26 @@ for domain in domains_illframes.values():
 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
-    # Ensure the JIT model directory exists
     os.makedirs(JIT_DIR, exist_ok=True)
     for model_id in models:
         print(f"Downloading + JIT tracing model: {model_id}")
-        # Load model and tokenizer
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_id,
-            token=HF_TOKEN,
-            device_map="auto"
-        )
-        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
         safe_model_name = model_id.replace("/", "_")
         traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
         if os.path.exists(traced_model_path):
             print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
         else:
             print(f"⚙️  Tracing and saving: {traced_model_path}")
             model.eval()
@@ -116,6 +115,15 @@ def df_h():
     print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
     print(du_result.stdout)
 def delete_http_folders():
     http_folders = glob.glob("/data/http*")

 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
     os.makedirs(JIT_DIR, exist_ok=True)
     for model_id in models:
         print(f"Downloading + JIT tracing model: {model_id}")
         safe_model_name = model_id.replace("/", "_")
         traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
         if os.path.exists(traced_model_path):
+            delete_unused_bin_files(model_id)
             print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
         else:
             print(f"⚙️  Tracing and saving: {traced_model_path}")
+            model = AutoModelForSequenceClassification.from_pretrained(
+                model_id,
+                token=HF_TOKEN,
+                device_map="auto"
+            )
+            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
             model.eval()
     print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
     print(du_result.stdout)
+def delete_unused_bin_files(model_id: str):
+    target_path = f"/data/models--poltextlab--{model_id}"
+    bin_files = glob.glob(f"{target_path}/**/*.bin", recursive=True)
+    for file_path in bin_files:
+        if os.path.isfile(file_path):
+            print(f"Deleting: {file_path}")
+            os.remove(file_path)
 def delete_http_folders():
     http_folders = glob.glob("/data/http*")