import os
import shutil
import glob
import subprocess
import json
from contextlib import contextmanager

import torch
import pandas as pd
from google.oauth2 import service_account
from pandas_gbq import to_gbq
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import scan_cache_dir

from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes
from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media import build_huggingface_path as hf_cap_media_path
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path

JIT_DIR = "/data/jit_models"
HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_cap_minor_path("", "social"),
    hf_ontolisst_path(""),
]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))

# cap media2
models.append(hf_cap_media2_path("", ""))

# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes (domains is a dict, so iterate over its values)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]

def download_hf_models():
    os.makedirs(JIT_DIR, exist_ok=True)

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"⏩ Skipping JIT (already exists): {traced_model_path}")
        else:
            print(f"⚙️ Tracing and saving: {traced_model_path}")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_id, token=HF_TOKEN, device_map="auto"
            )
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
            model.eval()

            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64,
            )

            # JIT trace
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False,
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"✔️ Saved JIT model to: {traced_model_path}")

def df_h():
    df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print("=== Disk Free Space (df -H) ===")
    print(df_result.stdout)

    du_result = subprocess.run(
        ["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True
    )
    print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
    print(du_result.stdout)

def delete_unused_bin_files(model_id: str):
    target_path = f"/data/models--poltextlab--{model_id}"

    # delete files in blobs/
    blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
    # delete .bin files in snapshots/, except config.json
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    files_to_delete = blob_bins + snapshot_bins
    for file_path in files_to_delete:
        if os.path.basename(file_path) == "config.json":
            continue
        if os.path.isfile(file_path):
            print(f"Deleting file: {file_path}")
            os.remove(file_path)
        elif os.path.isdir(file_path):
            print(f"Deleting directory: {file_path}")
            shutil.rmtree(file_path)
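
# Background note (about the standard Hugging Face hub cache layout, not stated
# in the original code): the actual file contents live under blobs/ while
# snapshots/<revision>/ mostly holds symlinks to them, which is why clearing
# blobs/ frees the bulk of the disk space for an already-traced model.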

def delete_http_folders():
    http_folders = glob.glob("/data/http*")
    for folder in http_folders:
        if os.path.isdir(folder):
            print(f"Deleting: {folder}")
            shutil.rmtree(folder)

@contextmanager
def hf_cleanup():
    delete_http_folders()
    try:
        yield
    finally:
        delete_http_folders()
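
# Usage sketch (illustrative, not called anywhere in this module): the context
# manager above removes stray http* download folders both before and after the
# wrapped work, e.g.
#
#     with hf_cleanup():
#         download_hf_models()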

def scan_cache():
    # Scan Hugging Face model cache
    cache_dir = os.environ.get(
        "TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers")
    )
    scan_result = scan_cache_dir(cache_dir)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}): {repo.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧠 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    total_size = 0
    for filename in os.listdir(JIT_DIR):
        if filename.endswith(".pt"):
            path = os.path.join(JIT_DIR, filename)
            size = os.path.getsize(path)
            total_size += size
            print(f"- {filename}: {size / 1e6:.2f} MB")

    print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")

def set_hf_cache_dir(path: str):
    os.environ["TRANSFORMERS_CACHE"] = path
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path

def set_torch_threads():
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"

def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024**3)
    return free_gb < min_free_space_in_GB
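
# Illustrative guard (hypothetical wiring, not present in the original module):
#
#     if is_disk_full(min_free_space_in_GB=10):
#         df_h()                  # log where the space went
#         delete_http_folders()   # reclaim space from stray download folders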

def update_bq_model_table():
    try:
        # Load BQ credentials from HF secret
        service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info
        )

        project_id = os.environ.get("BQ_PROJECT_ID", None)
        dataset_id = os.environ.get("BQ_DATASET_ID", None)
        table_id = os.environ.get("BQ_TABLE_ID", None)  # hf_space_models
        full_table_id = f"{dataset_id}.{table_id}"

        to_gbq(
            pd.DataFrame({"model_id": models}),
            destination_table=full_table_id,
            project_id=project_id,
            if_exists="replace",
            credentials=credentials,
        )
        # TO-DO: add timestamp column?
        print("Updated BigQuery model table!")
    except Exception as e:
        print(f"BigQuery model table update failed: {e}")