ajsbsd committed on
Commit d32f90c · verified · 1 Parent(s): 6d6c49f

Update app.py

Files changed (1)
  1. app.py +59 -73
app.py CHANGED
@@ -4,107 +4,88 @@ import os
 import time
 
 # --- Try to import ctransformers for GGUF, provide helpful message if not found ---
-# We try to import ctransformers first as it's the preferred method for ZeroCPU efficiency
 try:
     from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
-    # We still need AutoTokenizer from transformers for standard tokenizing
     from transformers import AutoTokenizer, AutoModelForCausalLM
     GGUF_AVAILABLE = True
 except ImportError:
     GGUF_AVAILABLE = False
     print("WARNING: 'ctransformers' not found. This app relies on it for efficient CPU inference.")
     print("Please install it with: pip install ctransformers transformers")
-    # If ctransformers isn't available, we'll fall back to standard transformers loading, which is slower on CPU.
     from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # --- Configuration for Models and Generation ---
-# Original model (for reference, or if a GPU is detected, though ZeroCPU is target)
 ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
-
-# !!! IMPORTANT !!! For efficient ZeroCPU (CPU-only) inference,
-# a GGUF quantized model is HIGHLY RECOMMENDED.
-# SmolLM2-360M-Instruct does NOT have a readily available GGUF version from common providers.
-# Therefore, for ZeroCPU deployment, this app will use a common, small GGUF model by default.
-# If you find a GGUF for SmolLM2 later, you can update these:
-GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" # Recommended GGUF placeholder for ZeroCPU
-GGUF_MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # Corresponding GGUF file name
+GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+GGUF_MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
 
 # --- Generation Parameters ---
 MAX_NEW_TOKENS = 256
 TEMPERATURE = 0.7
 TOP_K = 50
 TOP_P = 0.95
-DO_SAMPLE = True # Important for varied responses
+DO_SAMPLE = True
 
-# Global model and tokenizer variables
+# Global model and tokenizer
 model = None
 tokenizer = None
-device = "cpu" # Explicitly set to CPU for ZeroCPU deployment
+device = "cpu"
 
 # --- Model Loading Function ---
 def load_model_for_zerocpu():
     global model, tokenizer, device
 
-    # Attempt to load the GGUF model first for efficiency on ZeroCPU
     if GGUF_AVAILABLE:
         print(f"Attempting to load GGUF model '{GGUF_MODEL_ID}' (file: '{GGUF_MODEL_FILENAME}') for ZeroCPU...")
         try:
             model = AutoModelForCausalLM_GGUF.from_pretrained(
                 GGUF_MODEL_ID,
                 model_file=GGUF_MODEL_FILENAME,
-                model_type="llama", # Most GGUF models are Llama-based (TinyLlama is)
-                gpu_layers=0 # Ensures it runs on CPU, not GPU
+                model_type="llama",
+                gpu_layers=0
             )
-            # Use the tokenizer from the original SmolLM2 for chat template consistency
             tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
             print(f"GGUF model '{GGUF_MODEL_ID}' loaded successfully for CPU.")
-            return # Exit function if GGUF model loaded successfully
+            return
         except Exception as e:
             print(f"WARNING: Could not load GGUF model '{GGUF_MODEL_ID}' from '{GGUF_MODEL_FILENAME}': {e}")
             print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
-            # Continue to the next block to try loading the standard HF model
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
 
-    # Fallback/alternative: Load the standard Hugging Face model (will be slower on CPU without GGUF)
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
         tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-        model.to(device) # Explicitly move model to CPU
+        model.to(device)
         print(f"Standard model '{ORIGINAL_MODEL_ID}' loaded successfully on CPU.")
     except Exception as e:
         print(f"CRITICAL ERROR: Could not load standard model '{ORIGINAL_MODEL_ID}' on CPU: {e}")
         print("Please ensure the model ID is correct, you have enough RAM, and dependencies are installed.")
-        model = None # Indicate failure to load
-        tokenizer = None # Indicate failure to load
+        model = None
+        tokenizer = None
 
 # --- Inference Function for Gradio ChatInterface ---
 def predict_chat(message: str, history: list):
-    # 'history' is a list of lists, where each inner list is [user_message, bot_message]
-    # 'message' is the current user input
-
     if model is None or tokenizer is None:
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
 
-    # Build the full conversation history for the model's chat template
+    # history contains [user_message, bot_message] tuples, convert to messages format for apply_chat_template
     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
     for human_msg, ai_msg in history:
         messages.append({"role": "user", "content": human_msg})
         messages.append({"role": "assistant", "content": ai_msg})
-    messages.append({"role": "user", "content": message}) # Add the current user message
+    messages.append({"role": "user", "content": message})
 
     generated_text = ""
-
-    start_time = time.time() # Start timing for the current turn
+    start_time = time.time()
 
-    if isinstance(model, AutoModelForCausalLM_GGUF): # Check if the loaded model is from ctransformers
-        # For ctransformers (GGUF), manually construct a simple prompt string
+    if isinstance(model, AutoModelForCausalLM_GGUF):
         prompt_input = ""
         for msg in messages:
             if msg["role"] == "system":
@@ -113,9 +94,8 @@ def predict_chat(message: str, history: list):
                 prompt_input += f"User: {msg['content']}\n"
             elif msg["role"] == "assistant":
                 prompt_input += f"Assistant: {msg['content']}\n"
-        prompt_input += "Assistant:" # Instruct the model to generate the assistant's response
+        prompt_input += "Assistant:"
 
-        # Use the GGUF model's generate method
        for token in model.generate(
             prompt_input,
             max_new_tokens=MAX_NEW_TOKENS,
@@ -123,18 +103,16 @@
             top_k=TOP_K,
             top_p=TOP_P,
             do_sample=DO_SAMPLE,
-            repetition_penalty=1.1, # Common for GGUF models
-            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"] # Common stop tokens
+            repetition_penalty=1.1,
+            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
         ):
             generated_text += token
-            yield generated_text # Yield partial response for streaming in Gradio
+            yield generated_text
 
-    else: # If standard Hugging Face transformers model was loaded (slower on CPU)
-        # Apply the tokenizer's chat template
+    else:
         input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
 
-        # Generate the response
         outputs = model.generate(
             inputs,
             max_new_tokens=MAX_NEW_TOKENS,
@@ -142,55 +120,63 @@
             top_k=TOP_K,
             top_p=TOP_P,
             do_sample=DO_SAMPLE,
-            pad_token_id=tokenizer.pad_token_id # Important for generation
+            pad_token_id=tokenizer.pad_token_id
         )
-        # Decode only the newly generated tokens
        generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
-        yield generated_text # Yield the full response at once (transformers.generate doesn't stream by default)
+        yield generated_text
 
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
 
-
 # --- Gradio Interface Setup ---
 if __name__ == "__main__":
-    # Load the model globally when the Gradio app starts
     load_model_for_zerocpu()
 
-    # Define a custom startup message for the chatbot
     initial_chatbot_message = (
         "Hello! I'm an AI assistant. I'm currently running in a CPU-only "
         "environment for efficient demonstration. How can I help you today?"
     )
 
-    demo = gr.ChatInterface(
-        fn=predict_chat, # The function that handles chat prediction
-        chatbot=gr.Chatbot(height=500), # The chat display area
-        textbox=gr.Textbox(
-            placeholder="Ask me a question...",
-            container=False,
-            scale=7
-        ),
-        title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
-        description=(
+    # Use gr.Chatbot with type='messages' to avoid the deprecation warning
+    chatbot_component = gr.Chatbot(height=500, type='messages')
+
+    with gr.Blocks(theme="soft") as demo: # Use gr.Blocks to lay out components
+        gr.Markdown(
+            f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
             f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
            f"without GGUF. Expect varied responses each run due to randomized generation."
-        ),
-        theme="soft",
-        examples=[ # Pre-defined examples for quick testing
-            ["What is the capital of France?"],
-            ["Can you tell me a fun fact about outer space?"],
-            ["What's the best way to stay motivated?"],
-        ],
-        cache_examples=False, # Important: Ensures examples run inference each time, not from cache
-        clear_btn="Clear Chat", # Button to clear the conversation
-        # Custom message to start the conversation from the assistant
-        initial_chatbot_message=initial_chatbot_message
-    )
+        )
+
+        chatbot_component.render() # Render the chatbot
+
+        # Use gr.ChatInterface for the core chat functionality
+        # It handles the textbox, send button, and history implicitly
+        chat_interface = gr.ChatInterface(
+            fn=predict_chat,
+            chatbot=chatbot_component, # Link to the rendered chatbot component
+            textbox=gr.Textbox(
+                placeholder="Ask me a question...",
+                container=False,
+                scale=7
+            ),
+            # clear_btn is removed from ChatInterface constructor
+            examples=[
+                ["What is the capital of France?"],
+                ["Can you tell me a fun fact about outer space?"],
+                ["What's the best way to stay motivated?"],
+            ],
+            cache_examples=False,
+            # initial_chatbot_message will be set after chat_interface is rendered
+        )
+
+        # Manually add a clear button that links to the chatbot component
+        gr.ClearButton(components=[chatbot_component])
+
+        # Set the initial message for the chatbot
+        # This needs to be done *after* the chatbot_component is defined
+        chatbot_component.value = [[None, initial_chatbot_message]]
+
 
-    # Launch the Gradio app
-    # `share=True` creates a public link (useful for testing, but not needed on HF Spaces)
-    # `server_name="0.0.0.0"` and `server_port=7860` are typically default for HF Spaces
     demo.launch()
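
For a quick local check of the GGUF path kept by this commit, the following stand-alone sketch (not part of the commit) loads the same TinyLlama GGUF file through ctransformers with every layer pinned to the CPU. It assumes `pip install ctransformers`; the prompt and generation settings are illustrative only.

from ctransformers import AutoModelForCausalLM

# Load the same quantized checkpoint the Space falls back to, CPU-only.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    model_type="llama",  # TinyLlama uses the Llama architecture
    gpu_layers=0,        # keep all layers on the CPU, matching load_model_for_zerocpu
)

# ctransformers models are callable; stream=True yields text chunks as they are decoded.
for chunk in llm(
    "User: What is the capital of France?\nAssistant:",
    max_new_tokens=64,
    temperature=0.7,
    stop=["User:"],
    stream=True,
):
    print(chunk, end="", flush=True)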
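
On the Gradio side, the commit swaps the old single gr.ChatInterface(...) call for a gr.Blocks layout with a gr.Chatbot(type='messages') component and a manual gr.ClearButton. The core wiring that layout relies on is sketched below, with a hypothetical echo_chat generator standing in for predict_chat; it assumes a Gradio release recent enough for gr.Chatbot to accept type="messages", and it omits the Blocks/ClearButton wrapping to stay minimal.

import gradio as gr

def echo_chat(message, history):
    # Hypothetical stand-in for predict_chat: stream the reply a few characters at a time.
    reply = f"You said: {message}"
    for i in range(1, len(reply) + 1):
        yield reply[:i]

# A customized ChatInterface: streaming fn, messages-style chatbot, custom textbox.
demo = gr.ChatInterface(
    fn=echo_chat,
    chatbot=gr.Chatbot(height=500, type="messages"),
    textbox=gr.Textbox(placeholder="Ask me a question...", container=False, scale=7),
    examples=["What is the capital of France?"],
    cache_examples=False,
    theme="soft",
)

if __name__ == "__main__":
    demo.launch()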