hmrizal committed
Commit 872a597 · verified · 1 parent: 3dda2b6

Fix Phi-4 and DeepSeek Lite Chat by limiting max new tokens and max memory, and by optimizing pipeline creation
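For context, the memory limit mentioned in this commit is applied through arguments to transformers' from_pretrained. Below is a minimal sketch of that loading pattern, not the code in app.py itself: the model id is a stand-in chosen for illustration, and the try/except mirrors the commit because max_memory combined with an explicit device_map="cpu" may not be honored on every transformers/accelerate version.

# Minimal sketch of the CPU-constrained loading pattern (illustrative only).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "sshleifer/tiny-gpt2"  # assumption: stand-in model, not the one used in app.py

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
try:
    # Full settings: cap CPU RAM and allow spilling weights to disk.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        offload_folder="model_offload",
        max_memory={"cpu": "1.7GiB"},
    )
except Exception as err:
    # Fallback: drop the memory cap if the installed versions reject the combination.
    print(f"Falling back to a plain CPU load: {err}")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )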

Files changed (1): app.py (+78, -18)
app.py CHANGED
@@ -177,25 +177,65 @@ def initialize_model_once(model_key):
     # For Phi-4 specifically
     elif "Phi-4" in model_key:
         MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+        # Load model with optimized memory
+        try:
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="cpu",  # Force CPU explicitly
             torch_dtype=torch.float32,  # Use float32 for CPU
             low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
+            trust_remote_code=True,
+            offload_folder="model_offload",
+            offload_state_dict=True,
+            max_memory={"cpu": "1.7GiB"}  # Limit memory usage
+            )
+
+        except Exception as e:
+            print(f"Error loading Phi-4 with full settings: {str(e)}")
+            print("Trying with minimal configuration...")
+
+            # Fallback with minimum configuration
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map="cpu",
+                torch_dtype=torch.float32,
+                trust_remote_code=True,
+                offload_folder="model_offload",
+                low_cpu_mem_usage=True
+            )
+
         MODEL_CACHE["is_gguf"] = False

     # Special handling for DeepSeek Lite Chat
     elif model_key == "DeepSeek Lite Chat":
         MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+        # Load model with optimized memory
+        try:
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="cpu",  # Force CPU
             torch_dtype=torch.float32,  # Use float32 for CPU
             low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
+            trust_remote_code=True,
+            max_memory={"cpu": "1.7GiB"}
+            )
+        except Exception as e:
+            print(f"Error loading DeepSeek with full settings: {str(e)}")
+            print("Trying with lightweight approach...")
+
+            # Fallback to lighter approach
+            import torch.nn as nn
+            from transformers import PreTrainedModel
+
+            # Trying to load model with smaller fraction
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map="cpu",
+                torch_dtype=torch.float32,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
         MODEL_CACHE["is_gguf"] = False

     # Handle standard HF models
@@ -262,6 +302,36 @@ def get_fallback_model(current_model):
     }
     return fallback_map.get(current_model, "Llama 2 Chat")

+# Optimized pipeline for "problematic" models
+def create_optimized_pipeline(model, tokenizer, model_key):
+    """Optimized pipeline for problematic models"""
+    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
+        # Use minimum parameters
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=128,  # Reduce the number of generated tokens
+            temperature=0.3,
+            top_p=0.9,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
+    else:
+        # Default pipeline for other models
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.3,
+            top_p=0.9,
+            top_k=30,
+            repetition_penalty=1.2,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
+
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model with better error handling"""
     try:
@@ -310,18 +380,8 @@ def create_llm_pipeline(model_key):
                 # Remove return_full_text parameter for T5 models
             )
         else:
-            print("Creating causal LM pipeline")
-            pipe = pipeline(
-                "text-generation",
-                model=model,
-                tokenizer=tokenizer,
-                max_new_tokens=256,  # Increased for more comprehensive answers
-                temperature=0.3,
-                top_p=0.9,
-                top_k=30,
-                repetition_penalty=1.2,
-                return_full_text=False,
-            )
+            # Use optimized pipeline for problematic model
+            return create_optimized_pipeline(model, tokenizer, model_key)

         print("Pipeline created successfully")
         return HuggingFacePipeline(pipeline=pipe)
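For reference, here is a self-contained sketch of what the causal-LM branch now produces for the "problematic" models: a text-generation pipeline capped at 128 new tokens and wrapped for LangChain. The tiny model id and the langchain_community import path are assumptions made for illustration; app.py's own imports and model names are not shown in this diff.

# Illustrative sketch of the capped pipeline that create_optimized_pipeline() builds.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline  # assumed import path

model_name = "sshleifer/tiny-gpt2"  # assumption: stand-in model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,  # the cap this commit introduces for Phi-4 / DeepSeek
    temperature=0.3,
    top_p=0.9,
    return_full_text=False,
)
llm = HuggingFacePipeline(pipeline=pipe)
print(llm.invoke("Hello"))  # HuggingFacePipeline is a Runnable; output length is bounded by max_new_tokens

In app.py itself this path is reached through create_llm_pipeline(model_key), whose non-T5 branch now returns create_optimized_pipeline(model, tokenizer, model_key).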