Spaces: Running on A100
PR #42 by Blazgo (opened): Fix long queue waits with a mechanism to prevent running duplicate jobs
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
import pathlib
|
|
|
3 |
import random
|
4 |
import string
|
5 |
import tempfile
|
@@ -113,12 +114,40 @@ examples = [[str(f)] for f in pathlib.Path("examples").glob("*.yaml")]
|
|
113 |
COMMUNITY_HF_TOKEN = os.getenv("COMMUNITY_HF_TOKEN")
|
114 |
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]]:
|
117 |
runner = LogsViewRunner()
|
118 |
|
119 |
if not yaml_config:
|
120 |
yield runner.log("Empty yaml, pick an example below", level="ERROR")
|
121 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
try:
|
123 |
merge_config = MergeConfiguration.model_validate(yaml.safe_load(yaml_config))
|
124 |
except Exception as e:
|
@@ -170,7 +199,7 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
170 |
return
|
171 |
|
172 |
# Set tmp HF_HOME to avoid filling up disk Space
|
173 |
-
tmp_env = os.environ.copy()
|
174 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
175 |
full_cli = cli + f" --lora-merge-cache {tmpdirname}/.lora_cache"
|
176 |
yield from runner.run_command(full_cli.split(), cwd=merged_path, env=tmp_env)
|
@@ -188,6 +217,9 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
188 |
)
|
189 |
yield runner.log(f"Model successfully uploaded to HF: {repo_url.repo_id}")
|
190 |
|
|
|
|
|
|
|
191 |
# This is workaround. As the space always getting stuck.
|
192 |
def _restart_space():
|
193 |
huggingface_hub.HfApi().restart_space(repo_id="arcee-ai/mergekit-gui", token=COMMUNITY_HF_TOKEN, factory_reboot=False)
|
|
|
1 |
import os
|
2 |
import pathlib
|
3 |
+
import hashlib
|
4 |
import random
|
5 |
import string
|
6 |
import tempfile
|
|
|
114 |
COMMUNITY_HF_TOKEN = os.getenv("COMMUNITY_HF_TOKEN")
|
115 |
|
116 |
|
117 |
+
# A dictionary to store active jobs and their respective job IDs (which will be used to track them)
|
118 |
+
active_jobs = {}
|
119 |
+
|
120 |
+
def get_yaml_hash(yaml_config: str) -> str:
|
121 |
+
"""Generate a hash for the YAML config to detect duplicates."""
|
122 |
+
return hashlib.sha256(yaml_config.encode("utf-8")).hexdigest()
|
123 |
+
|
124 |
def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]]:
|
125 |
runner = LogsViewRunner()
|
126 |
|
127 |
if not yaml_config:
|
128 |
yield runner.log("Empty yaml, pick an example below", level="ERROR")
|
129 |
return
|
130 |
+
|
131 |
+
yaml_hash = get_yaml_hash(yaml_config)
|
132 |
+
|
133 |
+
# Check if this YAML job is already running
|
134 |
+
if yaml_hash in active_jobs:
|
135 |
+
old_job_id = active_jobs[yaml_hash]
|
136 |
+
yield runner.log(f"Duplicate job detected! An identical job is already running with Job ID: {old_job_id}.", level="WARNING")
|
137 |
+
user_input = yield gradio.inputs.Button(label="Continue with new job", info="Do you want to cancel the old job and continue with the new one?")
|
138 |
+
|
139 |
+
if user_input == "Continue with new job":
|
140 |
+
# Cancel the old job and remove it from active jobs
|
141 |
+
runner.log(f"Cancelling the old job with Job ID: {old_job_id}")
|
142 |
+
# This part assumes you have the ability to cancel the previous job if needed
|
143 |
+
# In real implementation, you'd stop the old task/process here
|
144 |
+
active_jobs.pop(yaml_hash) # Remove the old job from the active jobs list
|
145 |
+
else:
|
146 |
+
# If user chooses not to continue, exit
|
147 |
+
yield runner.log("Duplicate job detected. Operation aborted.", level="ERROR")
|
148 |
+
return
|
149 |
+
|
150 |
+
# Proceed with the merge
|
151 |
try:
|
152 |
merge_config = MergeConfiguration.model_validate(yaml.safe_load(yaml_config))
|
153 |
except Exception as e:
|
|
|
199 |
return
|
200 |
|
201 |
# Set tmp HF_HOME to avoid filling up disk Space
|
202 |
+
tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
|
203 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
204 |
full_cli = cli + f" --lora-merge-cache {tmpdirname}/.lora_cache"
|
205 |
yield from runner.run_command(full_cli.split(), cwd=merged_path, env=tmp_env)
|
|
|
217 |
)
|
218 |
yield runner.log(f"Model successfully uploaded to HF: {repo_url.repo_id}")
|
219 |
|
220 |
+
# Track this YAML as an active job
|
221 |
+
active_jobs[yaml_hash] = "new_job_id"
|
222 |
+
|
223 |
# This is workaround. As the space always getting stuck.
|
224 |
def _restart_space():
|
225 |
huggingface_hub.HfApi().restart_space(repo_id="arcee-ai/mergekit-gui", token=COMMUNITY_HF_TOKEN, factory_reboot=False)
|