kovacsvi committed on
Commit
8453705
·
1 Parent(s): 7cbaea3

delete unused model weights (before JIT)

Browse files
Files changed (1) hide show
  1. utils.py +17 -9
utils.py CHANGED
@@ -63,27 +63,26 @@ for domain in domains_illframes.values():
63
  tokenizers = ["xlm-roberta-large"]
64
 
65
  def download_hf_models():
66
- # Ensure the JIT model directory exists
67
  os.makedirs(JIT_DIR, exist_ok=True)
68
 
69
  for model_id in models:
70
  print(f"Downloading + JIT tracing model: {model_id}")
71
 
72
- # Load model and tokenizer
73
- model = AutoModelForSequenceClassification.from_pretrained(
74
- model_id,
75
- token=HF_TOKEN,
76
- device_map="auto"
77
- )
78
- tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
79
-
80
  safe_model_name = model_id.replace("/", "_")
81
  traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
82
 
83
  if os.path.exists(traced_model_path):
 
84
  print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
85
  else:
86
  print(f"⚙️ Tracing and saving: {traced_model_path}")
 
 
 
 
 
 
 
87
 
88
  model.eval()
89
 
@@ -116,6 +115,15 @@ def df_h():
116
  print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
117
  print(du_result.stdout)
118
 
 
 
 
 
 
 
 
 
 
119
 
120
  def delete_http_folders():
121
  http_folders = glob.glob("/data/http*")
 
63
  tokenizers = ["xlm-roberta-large"]
64
 
65
  def download_hf_models():
 
66
  os.makedirs(JIT_DIR, exist_ok=True)
67
 
68
  for model_id in models:
69
  print(f"Downloading + JIT tracing model: {model_id}")
70
 
 
 
 
 
 
 
 
 
71
  safe_model_name = model_id.replace("/", "_")
72
  traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
73
 
74
  if os.path.exists(traced_model_path):
75
+ delete_unused_bin_files(model_id)
76
  print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
77
  else:
78
  print(f"⚙️ Tracing and saving: {traced_model_path}")
79
+
80
+ model = AutoModelForSequenceClassification.from_pretrained(
81
+ model_id,
82
+ token=HF_TOKEN,
83
+ device_map="auto"
84
+ )
85
+ tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
86
 
87
  model.eval()
88
 
 
115
  print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
116
  print(du_result.stdout)
117
 
118
+
119
def delete_unused_bin_files(model_id: str, cache_dir: str = "/data"):
    """Delete the cached ``*.bin`` weight files of one downloaded model.

    The model's cache folder is searched recursively and every regular file
    ending in ``.bin`` is removed; all other files are left in place.

    Args:
        model_id: Either a full hub repo id (``"org/name"``) or a bare model
            name. A bare name is looked up under the ``poltextlab``
            organisation, which yields the same path as the historical
            ``/data/models--poltextlab--{model_id}``.
        cache_dir: Directory that holds the ``models--*`` cache folders.

    Returns:
        The paths that were deleted, in the order ``glob`` reported them
        (empty when the folder does not exist or holds no ``.bin`` files).
    """
    # The hub cache flattens "org/name" into "models--org--name". Inserting
    # a repo id that still contains "/" verbatim would point at a nested
    # directory that is never created, so nothing would be deleted.
    repo_id = model_id if "/" in model_id else f"poltextlab/{model_id}"
    folder_name = "models--" + repo_id.replace("/", "--")
    target_path = os.path.join(cache_dir, folder_name)

    # Escape the directory part so glob metacharacters in a model name are
    # matched literally; only the trailing "**/*.bin" is a real pattern.
    pattern = os.path.join(glob.escape(target_path), "**", "*.bin")

    # NOTE(review): if the cache stores snapshot entries as symlinks into a
    # blobs/ directory, removing the entry removes only the link and the
    # blob keeps its disk space - confirm against the deployed cache layout.
    deleted = []
    for file_path in glob.glob(pattern, recursive=True):
        if not os.path.isfile(file_path):
            continue
        print(f"Deleting: {file_path}")
        os.remove(file_path)
        deleted.append(file_path)
    return deleted
126
+
127
 
128
  def delete_http_folders():
129
  http_folders = glob.glob("/data/http*")