hmrizal committed
Commit 8036e11 · verified · 1 Parent(s): 0ae8a86

update initialize_model_once, create_llm_pipeline, force cpu only

Files changed (1):
  1. app.py +38 -23
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import os
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU only
 import uuid
 import threading
 import pandas as pd
@@ -88,26 +89,37 @@ def initialize_model_once(model_key):
     model_info = MODEL_CONFIG[model_key]
     model_name = model_info["name"]
     MODEL_CACHE["model_name"] = model_key
-
-    # Handle T5 models separately
-    if model_info.get("is_t5", False):
-        MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
-        MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
-            model_name,
-            torch_dtype=model_info["dtype"],
-            device_map="auto",
-            low_cpu_mem_usage=True
-        )
-    else:
-        # Load tokenizer and model with appropriate configuration
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
-        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=model_info["dtype"],
-            device_map="auto",
-            low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
+
+    try:
+        print(f"Loading model: {model_name}")
+        # Handle T5 models separately
+        if model_info.get("is_t5", False):
+            MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
+            MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
+                model_name,
+                torch_dtype=model_info["dtype"],
+                device_map="auto" if torch.cuda.is_available() else None,
+                low_cpu_mem_usage=True
+            )
+        else:
+            # Load tokenizer and model with appropriate configuration
+            MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
+            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=model_info["dtype"],
+                device_map="auto" if torch.cuda.is_available() else None,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True
+            )
+        print(f"Model {model_name} loaded successfully")
+    except Exception as e:
+        import traceback
+        print(f"Error loading model {model_name}: {str(e)}")
+        print(traceback.format_exc())
+        raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
+
+    if MODEL_CACHE["model"] is None or MODEL_CACHE["tokenizer"] is None:
+        raise ValueError(f"Model or tokenizer not initialized properly for {model_key}")
 
     return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], model_info.get("is_t5", False)
 
@@ -117,6 +129,9 @@ def create_llm_pipeline(model_key):
         print(f"Creating pipeline for model: {model_key}")
         tokenizer, model, is_t5 = initialize_model_once(model_key)
 
+        if model is None or tokenizer is None:
+            raise ValueError(f"Model or tokenizer is None for {model_key}")
+
         # Create appropriate pipeline based on model type
         if is_t5:
             print("Creating T5 pipeline")
@@ -124,7 +139,7 @@ def create_llm_pipeline(model_key):
                 "text2text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=256,
+                max_new_tokens=128,  # Reduced for better performance
                 temperature=0.3,
                 top_p=0.9,
                 return_full_text=False,
@@ -135,7 +150,7 @@ def create_llm_pipeline(model_key):
                 "text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=256,
+                max_new_tokens=128,  # Reduced for better performance
                 temperature=0.3,
                 top_p=0.9,
                 top_k=30,
@@ -150,7 +165,7 @@ def create_llm_pipeline(model_key):
         import traceback
         print(f"Error creating pipeline: {str(e)}")
         print(traceback.format_exc())
-        raise
+        raise RuntimeError(f"Failed to create pipeline: {str(e)}")
 
 def create_conversational_chain(db, file_path, model_key):
     llm = create_llm_pipeline(model_key)
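
For reference, a minimal sketch (not part of the commit) of why the CUDA_VISIBLE_DEVICES change forces CPU-only loading: hiding all GPUs before torch initializes CUDA makes torch.cuda.is_available() return False, so the new `device_map="auto" if torch.cuda.is_available() else None` expressions resolve to None and the weights are loaded on CPU. The model name below is a placeholder, not one of the entries in MODEL_CONFIG.

import os

# Hide every CUDA device; this must run before torch touches CUDA,
# which is why app.py sets it immediately after `import os`.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(torch.cuda.is_available())  # False: no devices are visible

model_name = "gpt2"  # placeholder; app.py reads the real name from MODEL_CONFIG

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # float32 is the safe dtype on CPU
    device_map="auto" if torch.cuda.is_available() else None,  # resolves to None -> plain CPU load
    low_cpu_mem_usage=True,  # requires the accelerate package, as app.py's loader does
)
print(next(model.parameters()).device)  # cpu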