Update app.py
app.py CHANGED
@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
         print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
-
+
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
@@ -75,7 +75,6 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
 
-    # history contains [user_message, bot_message] tuples, convert to messages format for apply_chat_template
     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
     for human_msg, ai_msg in history:
         messages.append({"role": "user", "content": human_msg})
@@ -124,7 +123,7 @@ def predict_chat(message: str, history: list):
     )
     generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
     yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
 
@@ -137,10 +136,9 @@ if __name__ == "__main__":
         "environment for efficient demonstration. How can I help you today?"
     )
 
-    # Use gr.Chatbot with type='messages' to avoid the deprecation warning
     chatbot_component = gr.Chatbot(height=500, type='messages')
-
-    with gr.Blocks(theme="soft") as demo:
+
+    with gr.Blocks(theme="soft") as demo:
         gr.Markdown(
             f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
@@ -148,34 +146,30 @@ if __name__ == "__main__":
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
         )
-
-
-
-        # Use gr.ChatInterface for the core chat functionality
-        # It handles the textbox, send button, and history implicitly
+
+        # This is the key change: explicitly placing the chat_interface component
         chat_interface = gr.ChatInterface(
             fn=predict_chat,
-            chatbot=chatbot_component,
+            chatbot=chatbot_component,
             textbox=gr.Textbox(
                 placeholder="Ask me a question...",
                 container=False,
                 scale=7
             ),
-            # clear_btn is removed from ChatInterface constructor
             examples=[
                 ["What is the capital of France?"],
                 ["Can you tell me a fun fact about outer space?"],
                 ["What's the best way to stay motivated?"],
             ],
-            cache_examples=False,
-            # initial_chatbot_message will be set after chat_interface is rendered
+            cache_examples=False,
         )
-
-        #
+
+        # Now explicitly place the chat_interface component into the Blocks layout
+        chat_interface.render()
+
+        # The clear button is typically below the chat interface
        gr.ClearButton(components=[chatbot_component])
 
-        # Set the initial message for the chatbot
-        # This needs to be done *after* the chatbot_component is defined
         chatbot_component.value = [[None, initial_chatbot_message]]
 
 
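For reference, the layout pattern this commit ends up with can be reduced to a short, self-contained Gradio script: create the gr.Chatbot up front, wire a gr.ChatInterface to it, place the interface into the gr.Blocks layout explicitly with .render(), and put a gr.ClearButton underneath. The sketch below is illustrative only and assumes a recent Gradio 4.x: echo_chat stands in for the Space's predict_chat, the Markdown title and greeting text are invented, and the ChatInterface is built before the Blocks context (the commit builds it inside the context and then renders it).

# Illustrative sketch only (assumes Gradio 4.x); echo_chat is a placeholder
# for the Space's real predict_chat function.
import gradio as gr

def echo_chat(message: str, history: list):
    # Stand-in generator: the real function streams model output here.
    yield f"You said: {message}"

# Create the chatbot up front so the ChatInterface and the ClearButton can
# share the same component, as in the commit.
chatbot_component = gr.Chatbot(
    height=500,
    type="messages",
    # With type="messages", the initial value is a list of role/content dicts.
    value=[{"role": "assistant", "content": "Hello! How can I help you today?"}],
)

chat_interface = gr.ChatInterface(
    fn=echo_chat,
    chatbot=chatbot_component,
    textbox=gr.Textbox(placeholder="Ask me a question...", container=False, scale=7),
    examples=[["What is the capital of France?"]],
    cache_examples=False,
)

with gr.Blocks(theme="soft") as demo:
    gr.Markdown("# Demo chat")
    # Explicitly place the ChatInterface into this Blocks layout.
    chat_interface.render()
    # Clear button below the chat area, wired to the shared chatbot component.
    gr.ClearButton(components=[chatbot_component])

if __name__ == "__main__":
    demo.launch()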