hmrizal committed
Commit 31d5efd · verified · 1 Parent(s): 872a597

remove Phi-4 and DeepSeek Lite, change model keys for GGUFs
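The renamed keys make each GGUF-backed entry in MODEL_CONFIG self-describing. A minimal sketch of how a loader could branch on them, assuming the GGUF path is chosen from the key or repository id (is_gguf_model is a hypothetical helper, not code from app.py):

# Hypothetical helper: pick the GGUF load path from the new naming convention
# or from the repository id itself (both end in "GGUF" for these entries).
def is_gguf_model(model_key: str, model_config: dict) -> bool:
    repo_id = model_config[model_key]["name"]
    return model_key.endswith("GGUF") or repo_id.endswith("GGUF")

# e.g. is_gguf_model("Llama 2 Chat GGUF", MODEL_CONFIG)       -> True
#      is_gguf_model("Qwen2.5 Coder Instruct", MODEL_CONFIG)  -> True  (repo id ends in GGUF)
#      is_gguf_model("DeepSeek Coder Instruct", MODEL_CONFIG) -> False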

Files changed (1): app.py (+9, -98)
app.py CHANGED
@@ -35,36 +35,26 @@ os.makedirs("performance_metrics", exist_ok=True)
 
 # Model configuration dictionary
 MODEL_CONFIG = {
-    "Llama 2 Chat": {
+    "Llama 2 Chat GGUF": {
         "name": "TheBloke/Llama-2-7B-Chat-GGUF",
         "description": "Llama 2 7B Chat model with good general performance",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "TinyLlama Chat": {
+    "TinyLlama Chat GGUF": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         "description": "Lightweight model with 1.1B parameters, fast and efficient",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Mistral Instruct": {
+    "Mistral Instruct GGUF": {
         "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
         "description": "7B instruction-tuned model with excellent reasoning",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Phi-4 Mini Instruct": {
-        "name": "microsoft/Phi-4-mini-instruct",
-        "description": "Lightweight model from Microsoft suitable for instructional tasks",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
         "description": "1.3B model for code and data analysis",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "DeepSeek Lite Chat": {
-        "name": "deepseek-ai/DeepSeek-V2-Lite-Chat",
-        "description": "Light but powerful chat model from DeepSeek",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "Qwen2.5 Coder Instruct": {
         "name": "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",
         "description": "3B model specialized for code and technical applications",
@@ -174,70 +164,6 @@ def initialize_model_once(model_key):
         )
         MODEL_CACHE["is_gguf"] = False
 
-    # For Phi-4 specifically
-    elif "Phi-4" in model_key:
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU explicitly
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                offload_state_dict=True,
-                max_memory={"cpu": "1.7GiB"}  # Limit memory usage
-            )
-
-        except Exception as e:
-            print(f"Error loading Phi-4 with full settings: {str(e)}")
-            print("Trying with minimal configuration...")
-
-            # Fallback with minimum configuration
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                low_cpu_mem_usage=True
-            )
-
-        MODEL_CACHE["is_gguf"] = False
-
-    # Special handling for DeepSeek Lite Chat
-    elif model_key == "DeepSeek Lite Chat":
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                max_memory={"cpu": "1.7GiB"}
-            )
-        except Exception as e:
-            print(f"Error loading DeepSeek with full settings: {str(e)}")
-            print("Trying with lightweight approach...")
-
-            # Fallback to lighter approach
-            import torch.nn as nn
-            from transformers import PreTrainedModel
-
-            # Trying to load model with smaller fraction
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-
-        MODEL_CACHE["is_gguf"] = False
-
     # Handle standard HF models
     else:
         # Only use quantization if CUDA is available
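With both special-case branches deleted, any non-GGUF entry presumably falls through to the standard Hugging Face path in the trailing context lines. A minimal sketch of such a CPU-only load, reusing the same transformers calls the removed code made (the real else branch is truncated in this diff):

# Minimal sketch of a CPU-only Hugging Face load path, mirroring the calls the
# removed branches used; not the actual "else" branch from app.py.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def load_standard_hf_model(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",           # force CPU, as the removed code did
        torch_dtype=torch.float32,  # float32 is the safe dtype on CPU
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    return tokenizer, model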
@@ -296,28 +222,13 @@ def initialize_model_once(model_key):
 def get_fallback_model(current_model):
     """Get appropriate fallback model for problematic models"""
     fallback_map = {
-        "Phi-4 Mini Instruct": "TinyLlama Chat",
-        "DeepSeek Lite Chat": "DeepSeek Coder Instruct",
-        "Flan T5 Small": "Llama 2 Chat"
+        "Flan T5 Small": "Llama 2 Chat GGUF"
     }
-    return fallback_map.get(current_model, "Llama 2 Chat")
+    return fallback_map.get(current_model, "Llama 2 Chat GGUF")
 
-# Optimized pipeline for "problematic" models
+# Optimized pipeline for models
 def create_optimized_pipeline(model, tokenizer, model_key):
-    """Optimized pipeline for problematic models"""
-    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
-        # Use minimal parameters
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=128,  # Reduce the number of generated tokens
-            temperature=0.3,
-            top_p=0.9,
-            return_full_text=False,
-        )
-        return HuggingFacePipeline(pipeline=pipe)
-    else:
+    """Optimized pipeline for models"""
         # Default pipeline for other models
         pipe = pipeline(
             "text-generation",
@@ -428,12 +339,12 @@ def handle_model_loading_error(model_key, session_id):
     # Regular suggestion logic for when fallbacks don't work or aren't applicable
     suggested_models = [
         "DeepSeek Coder Instruct",  # 1.3B model
-        "TinyLlama Chat",  # 1.1B model
+        "TinyLlama Chat GGUF",  # 1.1B model
        "Qwen2.5 Coder Instruct"  # Another option
     ]
 
     # Remove problematic models and current model from suggestions
-    problem_models = ["Phi-4 Mini Instruct", "DeepSeek Lite Chat", "Flan T5 Small"]
+    problem_models = ["Flan T5 Small"]
     suggested_models = [m for m in suggested_models if m not in problem_models and m != model_key]
 
     suggestions = ", ".join(suggested_models[:3])  # Only show top 3 suggestions
 