Fix Phi-4 and DeepSeek Lite Chat by limiting max new tokens and max memory, and by optimizing pipeline creation
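The core of the change is to load both models on CPU with a hard memory cap and to fall back to a minimal configuration when the capped load fails. Below is a minimal sketch of that loading pattern, not the app's actual code: the load_with_memory_cap helper name is hypothetical, and only the keyword arguments and the 1.7 GiB budget mirror the diff.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_with_memory_cap(model_name):
    # Hypothetical helper illustrating the capped-then-fallback load used in the diff.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    try:
        # Capped load mirroring the diff: CPU only, float32, 1.7 GiB budget, offload settings enabled.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cpu",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            offload_folder="model_offload",
            offload_state_dict=True,
            max_memory={"cpu": "1.7GiB"},
        )
    except Exception as err:
        # Fallback: retry without the memory cap, keeping the low-memory CPU load.
        print(f"Capped load failed ({err}); retrying with a minimal configuration")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cpu",
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
    return tokenizer, model
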
app.py CHANGED

@@ -177,25 +177,65 @@ def initialize_model_once(model_key):
     # For Phi-4 specifically
     elif "Phi-4" in model_key:
         MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+        # Load the model with optimized memory settings
+        try:
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="cpu",  # Force CPU explicitly
             torch_dtype=torch.float32,  # Use float32 for CPU
             low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
+            trust_remote_code=True,
+            offload_folder="model_offload",
+            offload_state_dict=True,
+            max_memory={"cpu": "1.7GiB"}  # Limit memory usage
+            )
+
+        except Exception as e:
+            print(f"Error loading Phi-4 with full settings: {str(e)}")
+            print("Trying with minimal configuration...")
+
+            # Fallback to a minimal configuration
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map="cpu",
+                torch_dtype=torch.float32,
+                trust_remote_code=True,
+                offload_folder="model_offload",
+                low_cpu_mem_usage=True
+            )
+
         MODEL_CACHE["is_gguf"] = False
 
     # Special handling for DeepSeek Lite Chat
     elif model_key == "DeepSeek Lite Chat":
         MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+        # Load the model with optimized memory settings
+        try:
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map="cpu",  # Force CPU
             torch_dtype=torch.float32,  # Use float32 for CPU
             low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
+            trust_remote_code=True,
+            max_memory={"cpu": "1.7GiB"}
+            )
+        except Exception as e:
+            print(f"Error loading DeepSeek with full settings: {str(e)}")
+            print("Trying with lightweight approach...")
+
+            # Fallback to a lighter approach
+            import torch.nn as nn
+            from transformers import PreTrainedModel
+
+            # Try to load the model with a smaller memory footprint
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map="cpu",
+                torch_dtype=torch.float32,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            )
+
         MODEL_CACHE["is_gguf"] = False
 
     # Handle standard HF models
@@ -262,6 +302,36 @@ def get_fallback_model(current_model):
     }
     return fallback_map.get(current_model, "Llama 2 Chat")
 
+# Optimized pipeline for "problematic" models
+def create_optimized_pipeline(model, tokenizer, model_key):
+    """Optimized pipeline for problematic models"""
+    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
+        # Use minimal generation parameters
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=128,  # Reduce the number of generated tokens
+            temperature=0.3,
+            top_p=0.9,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
+    else:
+        # Default pipeline for other models
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.3,
+            top_p=0.9,
+            top_k=30,
+            repetition_penalty=1.2,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
+
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model with better error handling"""
     try:
@@ -310,18 +380,8 @@ def create_llm_pipeline(model_key):
             # Remove return_full_text parameter for T5 models
         )
     else:
-
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=256,  # Increased for more comprehensive answers
-            temperature=0.3,
-            top_p=0.9,
-            top_k=30,
-            repetition_penalty=1.2,
-            return_full_text=False,
-        )
+        # Use the optimized pipeline for problematic models
+        return create_optimized_pipeline(model, tokenizer, model_key)
 
     print("Pipeline created successfully")
     return HuggingFacePipeline(pipeline=pipe)
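For context, a hedged sketch of how the output of the new create_optimized_pipeline would be exercised end to end. The checkpoint name and the final invoke call are illustrative assumptions, not taken from app.py, and the import path for HuggingFacePipeline is likewise an assumption, since the diff does not show the app's imports.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline  # assumed import path

# Example repo id; app.py resolves model_name from model_key instead.
model_name = "microsoft/Phi-4-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Same settings as the "problematic model" branch: fewer sampling knobs, 128 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.3,
    top_p=0.9,
    return_full_text=False,
)
llm = HuggingFacePipeline(pipeline=pipe)
print(llm.invoke("Reply with one short sentence: what does this pipeline do?"))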