remove Phi-4 and DeepSeek Lite, change model key for ggufs
app.py CHANGED
@@ -35,36 +35,26 @@ os.makedirs("performance_metrics", exist_ok=True)
 
 # Model configuration dictionary
 MODEL_CONFIG = {
-    "Llama 2 Chat": {
+    "Llama 2 Chat GGUF": {
         "name": "TheBloke/Llama-2-7B-Chat-GGUF",
         "description": "Llama 2 7B Chat model with good general performance",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "TinyLlama Chat": {
+    "TinyLlama Chat GGUF": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         "description": "Lightweight model with 1.1B parameters, fast and efficient",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Mistral Instruct": {
+    "Mistral Instruct GGUF": {
         "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
         "description": "7B instruction-tuned model with excellent reasoning",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Phi-4 Mini Instruct": {
-        "name": "microsoft/Phi-4-mini-instruct",
-        "description": "Lightweight model from Microsoft suitable for instructional tasks",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
         "description": "1.3B model for code and data analysis",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "DeepSeek Lite Chat": {
-        "name": "deepseek-ai/DeepSeek-V2-Lite-Chat",
-        "description": "Light but powerful chat model from DeepSeek",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "Qwen2.5 Coder Instruct": {
         "name": "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",
         "description": "3B model specialized for code and technical applications",
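The renamed keys above carry an explicit "GGUF" suffix, so the rest of app.py can tell quantized GGUF repos apart from standard Hugging Face checkpoints by the model key alone. The Space's actual GGUF loader is outside this hunk; the sketch below assumes llama-cpp-python as the backend, and the quantization filename pattern is a hypothetical choice.

# Sketch only -- not the Space's real loader. Assumes llama-cpp-python;
# a ctransformers-based loader would branch the same way on the key.
from llama_cpp import Llama

def load_gguf_if_needed(model_key: str, config: dict):
    entry = config[model_key]
    if "GGUF" in model_key:
        model = Llama.from_pretrained(
            repo_id=entry["name"],      # e.g. "TheBloke/Llama-2-7B-Chat-GGUF"
            filename="*Q4_K_M.gguf",    # hypothetical quantized file to fetch
            n_ctx=2048,
        )
        return model, True              # (model, is_gguf)
    return None, False                  # fall through to the transformers path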
@@ -174,70 +164,6 @@ def initialize_model_once(model_key):
             )
             MODEL_CACHE["is_gguf"] = False
 
-        # For Phi-4 specifically
-        elif "Phi-4" in model_key:
-            MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-            # Load model with optimized memory
-            try:
-                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    device_map="cpu",  # Force CPU explicitly
-                    torch_dtype=torch.float32,  # Use float32 for CPU
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=True,
-                    offload_folder="model_offload",
-                    offload_state_dict=True,
-                    max_memory={"cpu": "1.7GiB"}  # Limit memory usage
-                )
-
-            except Exception as e:
-                print(f"Error loading Phi-4 with full settings: {str(e)}")
-                print("Trying with minimal configuration...")
-
-                # Fallback with minimum configuration
-                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    device_map="cpu",
-                    torch_dtype=torch.float32,
-                    trust_remote_code=True,
-                    offload_folder="model_offload",
-                    low_cpu_mem_usage=True
-                )
-
-            MODEL_CACHE["is_gguf"] = False
-
-        # Special handling for DeepSeek Lite Chat
-        elif model_key == "DeepSeek Lite Chat":
-            MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-            # Load model with optimized memory
-            try:
-                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    device_map="cpu",  # Force CPU
-                    torch_dtype=torch.float32,  # Use float32 for CPU
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=True,
-                    max_memory={"cpu": "1.7GiB"}
-                )
-            except Exception as e:
-                print(f"Error loading DeepSeek with full settings: {str(e)}")
-                print("Trying with lightweight approach...")
-
-                # Fallback to lighter approach
-                import torch.nn as nn
-                from transformers import PreTrainedModel
-
-                # Trying to load model with smaller fraction
-                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    device_map="cpu",
-                    torch_dtype=torch.float32,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True
-                )
-
-            MODEL_CACHE["is_gguf"] = False
-
         # Handle standard HF models
         else:
             # Only use quantization if CUDA is available
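With the Phi-4 and DeepSeek Lite special cases deleted, every non-GGUF model now falls through to the standard Hugging Face branch referenced by the surviving context lines. A minimal sketch of that path, assuming the "quantization if CUDA is available" check means bitsandbytes 4-bit (the Space's exact arguments are not visible in this hunk) and using a hypothetical helper name:

# Hypothetical helper illustrating the standard-HF path; argument values are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_standard_hf(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if torch.cuda.is_available():
        # Quantize only on GPU, mirroring "Only use quantization if CUDA is available".
        quant = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant,
            trust_remote_code=True,
        )
    else:
        # CPU fallback: full-precision float32 with reduced peak memory during load.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cpu",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    return tokenizer, model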
@@ -296,28 +222,13 @@ def initialize_model_once(model_key):
 def get_fallback_model(current_model):
     """Get appropriate fallback model for problematic models"""
     fallback_map = {
-        "
-        "DeepSeek Lite Chat": "DeepSeek Coder Instruct",
-        "Flan T5 Small": "Llama 2 Chat"
+        "Flan T5 Small": "Llama 2 Chat GGUF"
     }
-    return fallback_map.get(current_model, "Llama 2 Chat")
+    return fallback_map.get(current_model, "Llama 2 Chat GGUF")
 
-# Optimized pipeline for
+# Optimized pipeline for models
 def create_optimized_pipeline(model, tokenizer, model_key):
-    """Optimized pipeline for
+    """Optimized pipeline for models"""
-    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
-        # Use minimum parameter
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=128,  # Reduce the number of generated tokens
-            temperature=0.3,
-            top_p=0.9,
-            return_full_text=False,
-        )
-        return HuggingFacePipeline(pipeline=pipe)
-    else:
     # Default pipeline for other models
     pipe = pipeline(
         "text-generation",
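After this hunk, create_optimized_pipeline no longer special-cases any model: the minimal 128-token branch for Phi-4 Mini Instruct and DeepSeek Lite Chat is gone and only the default pipeline remains. The sketch below shows what that default path reduces to, assuming the langchain_community import for HuggingFacePipeline; the generation parameters are illustrative, since the default branch's actual values sit outside this hunk.

# Illustrative sketch of the surviving default path; parameter values are assumptions.
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

def build_default_pipeline(model, tokenizer):
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,        # assumed; the removed special case used 128
        temperature=0.3,
        top_p=0.9,
        return_full_text=False,    # return only newly generated text
    )
    # Wrap the transformers pipeline so LangChain chains can call it as an LLM.
    return HuggingFacePipeline(pipeline=pipe)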
@@ -428,12 +339,12 @@ def handle_model_loading_error(model_key, session_id):
     # Regular suggestion logic for when fallbacks don't work or aren't applicable
     suggested_models = [
         "DeepSeek Coder Instruct",  # 1.3B model
-        "TinyLlama Chat",  # 1.1B model
+        "TinyLlama Chat GGUF",  # 1.1B model
         "Qwen2.5 Coder Instruct"  # Another option
     ]
 
     # Remove problematic models and current model from suggestions
-    problem_models = ["
+    problem_models = ["Flan T5 Small"]
     suggested_models = [m for m in suggested_models if m not in problem_models and m != model_key]
 
     suggestions = ", ".join(suggested_models[:3])  # Only show top 3 suggestions
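The suggestion list and the problem-model filter now use the new GGUF keys and no longer mention the removed models. A quick illustration of how the updated values combine, with a hypothetical currently selected model:

# Illustration only; model_key is a hypothetical current selection.
fallback_map = {"Flan T5 Small": "Llama 2 Chat GGUF"}
suggested_models = ["DeepSeek Coder Instruct", "TinyLlama Chat GGUF", "Qwen2.5 Coder Instruct"]
problem_models = ["Flan T5 Small"]

model_key = "Flan T5 Small"
fallback = fallback_map.get(model_key, "Llama 2 Chat GGUF")
filtered = [m for m in suggested_models if m not in problem_models and m != model_key]
print(fallback)                 # Llama 2 Chat GGUF
print(", ".join(filtered[:3]))  # DeepSeek Coder Instruct, TinyLlama Chat GGUF, Qwen2.5 Coder Instruct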