ajsbsd committed
Commit 9656c26 · verified · 1 Parent(s): d32f90c

Update app.py

Files changed (1): app.py (+13, -19)
app.py CHANGED
@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
         print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
-
+
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
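Context for the prints in this hunk: the app prefers a GGUF build served through ctransformers and only falls back to the standard transformers weights when that path is unavailable. A minimal sketch of that fallback logic, with an assumed ORIGINAL_MODEL_ID and placeholder GGUF repo/file names (the real values live elsewhere in app.py):

ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"  # assumed value

try:
    from ctransformers import AutoModelForCausalLM as CTAutoModel
    CTRANSFORMERS_AVAILABLE = True
except ImportError:
    CTRANSFORMERS_AVAILABLE = False

model = None
if CTRANSFORMERS_AVAILABLE:
    try:
        # Load a quantized GGUF build for fast CPU inference.
        model = CTAutoModel.from_pretrained(
            "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",           # placeholder repo
            model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # placeholder file
            model_type="llama",
        )
    except Exception:
        print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
else:
    print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")

if model is None:
    print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)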
@@ -75,7 +75,6 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
 
-    # history contains [user_message, bot_message] tuples, convert to messages format for apply_chat_template
     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
     for human_msg, ai_msg in history:
         messages.append({"role": "user", "content": human_msg})
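The loop in this hunk rebuilds the running conversation in the role/content messages format that tokenizer.apply_chat_template consumes. A minimal sketch of the full conversion, assuming tuple-style history (the hunk ends before the assistant turn is appended, so that part is inferred):

from transformers import AutoTokenizer

ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"  # assumed value
tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)

def build_prompt(message: str, history: list) -> str:
    # Start with the system turn, then replay [user, bot] pairs as messages.
    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
    for human_msg, ai_msg in history:
        messages.append({"role": "user", "content": human_msg})
        if ai_msg is not None:
            messages.append({"role": "assistant", "content": ai_msg})
    messages.append({"role": "user", "content": message})
    # Render the messages into the model's chat format, ending with the
    # generation cue for the assistant's reply.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

Note that when the linked gr.Chatbot uses type='messages', recent Gradio versions deliver history entries as {"role": ..., "content": ...} dicts rather than pairs, so the tuple unpacking above would need a branch on the entry shape.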
@@ -124,7 +123,7 @@ def predict_chat(message: str, history: list):
     )
     generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
     yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
 
@@ -137,10 +136,9 @@ if __name__ == "__main__":
         "environment for efficient demonstration. How can I help you today?"
     )
 
-    # Use gr.Chatbot with type='messages' to avoid the deprecation warning
     chatbot_component = gr.Chatbot(height=500, type='messages')
-
-    with gr.Blocks(theme="soft") as demo:  # Use gr.Blocks to lay out components
+
+    with gr.Blocks(theme="soft") as demo:
         gr.Markdown(
             f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
@@ -148,34 +146,30 @@ if __name__ == "__main__":
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
         )
-
-        chatbot_component.render()  # Render the chatbot
-
-        # Use gr.ChatInterface for the core chat functionality
-        # It handles the textbox, send button, and history implicitly
+
+        # This is the key change: explicitly placing the chat_interface component
         chat_interface = gr.ChatInterface(
             fn=predict_chat,
-            chatbot=chatbot_component,  # Link to the rendered chatbot component
+            chatbot=chatbot_component,
             textbox=gr.Textbox(
                 placeholder="Ask me a question...",
                 container=False,
                 scale=7
             ),
-            # clear_btn is removed from ChatInterface constructor
             examples=[
                 ["What is the capital of France?"],
                 ["Can you tell me a fun fact about outer space?"],
                 ["What's the best way to stay motivated?"],
             ],
-            cache_examples=False,
-            # initial_chatbot_message will be set after chat_interface is rendered
+            cache_examples=False,
         )
-
-        # Manually add a clear button that links to the chatbot component
+
+        # Now explicitly place the chat_interface component into the Blocks layout
+        chat_interface.render()
+
+        # The clear button is typically below the chat interface
        gr.ClearButton(components=[chatbot_component])
 
-        # Set the initial message for the chatbot
-        # This needs to be done *after* the chatbot_component is defined
        chatbot_component.value = [[None, initial_chatbot_message]]
 
 
 
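For reference, a self-contained sketch of the layout pattern this commit lands on, assuming a recent Gradio 4.x release: the Chatbot and ChatInterface are built up front, then placed into the Blocks layout with render(). The commit constructs the ChatInterface inside the with block; building it outside, as here, makes the render() call unambiguous. The greeting text and Markdown title are placeholders, and the sketch seeds the greeting through the Chatbot constructor in messages format instead of assigning [[None, ...]] afterwards, since type='messages' expects role/content dicts:

import gradio as gr

def predict_chat(message, history):
    # Stand-in for the real streaming generator in app.py.
    yield f"You said: {message}"

initial_chatbot_message = "Hello! How can I help you today?"  # placeholder text

chatbot_component = gr.Chatbot(
    height=500,
    type='messages',
    # Seed the greeting in messages format, matching type='messages'.
    value=[{"role": "assistant", "content": initial_chatbot_message}],
)

chat_interface = gr.ChatInterface(
    fn=predict_chat,
    chatbot=chatbot_component,
    textbox=gr.Textbox(placeholder="Ask me a question...", container=False, scale=7),
    examples=[
        ["What is the capital of France?"],
        ["Can you tell me a fun fact about outer space?"],
        ["What's the best way to stay motivated?"],
    ],
    cache_examples=False,
)

with gr.Blocks(theme="soft") as demo:
    gr.Markdown("# SmolLM2-360M-Instruct on ZeroCPU")  # placeholder title
    chat_interface.render()  # place the prebuilt chat UI into this layout
    gr.ClearButton(components=[chatbot_component])

if __name__ == "__main__":
    demo.launch()

Keeping one owner for the layout this way means the ClearButton only needs the Chatbot component to wipe the visible history, while the ChatInterface keeps handling the textbox, submit flow, and examples.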