Update app.py
app.py CHANGED
@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
         print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
-
+
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
        model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
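Note on this hunk's fallback path: the print statements describe a GGUF-first strategy (via ctransformers) with plain transformers as the fallback. Below is a minimal sketch of that pattern, not the Space's actual code; the repo IDs, GGUF filename, and model_type are assumptions for illustration only.

# Sketch only: GGUF-first load with a transformers fallback, mirroring
# the logic this hunk's print statements describe. All model IDs and the
# GGUF filename are assumed values, not taken from this diff.
from transformers import AutoModelForCausalLM, AutoTokenizer

GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"   # assumed value
ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"  # assumed value

def load_model_for_zerocpu():
    try:
        # ctransformers runs GGUF-quantized models efficiently on CPU
        from ctransformers import AutoModelForCausalLM as CTAutoModel
        model = CTAutoModel.from_pretrained(
            GGUF_MODEL_ID,
            model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # assumed filename
            model_type="llama",
        )
        return model, None  # ctransformers models tokenize internally
    except Exception:
        # Fall back to the standard (unquantized) Hugging Face model
        print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
        model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
        tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
        return model, tokenizer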
@@ -123,7 +123,7 @@ def predict_chat(message: str, history: list):
     )
     generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
     yield generated_text
-
+
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")

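For reference, the `outputs[0][inputs.shape[-1]:]` slice in this hunk decodes only the tokens generated after the prompt, and the two timestamps bracket one full turn. A self-contained sketch of the same pattern follows; the model ID and generation settings are illustrative, not from this diff.

# Sketch: decode only the newly generated tokens, then time the turn.
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"  # assumed model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

start_time = time.time()
inputs = tokenizer("What is the capital of France?", return_tensors="pt").input_ids
with torch.no_grad():
    # do_sample=True matches the "randomized generation" the description mentions
    outputs = model.generate(inputs, max_new_tokens=64, do_sample=True)

# outputs[0] contains prompt + completion; slicing at inputs.shape[-1]
# keeps only the completion, matching the decode call in the hunk above.
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()

end_time = time.time()
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")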
@@ -136,41 +136,35 @@ if __name__ == "__main__":
         "environment for efficient demonstration. How can I help you today?"
     )

-
-
-
-
-
+    # Use gr.ChatInterface directly without gr.Blocks wrapper for simplicity
+    # This often works better when ChatInterface is the sole component
+    demo = gr.ChatInterface(
+        fn=predict_chat,
+        # Define the chatbot here, with type='messages'
+        chatbot=gr.Chatbot(height=500, type='messages',
+                           value=[[None, initial_chatbot_message]]),  # Set initial message directly here
+        textbox=gr.Textbox(
+            placeholder="Ask me a question...",
+            container=False,
+            scale=7
+        ),
+        title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
+        description=(
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
             f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
-        )
-
-
-
-
-
-
-
-
-
-
-
-            ["What is the capital of France?"],
-            ["Can you tell me a fun fact about outer space?"],
-            ["What's the best way to stay motivated?"],
-        ],
-        cache_examples=False,
-        )
-
-        # Now explicitly place the chat_interface component into the Blocks layout
-        chat_interface.render()
-
-        # The clear button is typically below the chat interface
-        gr.ClearButton(components=[chatbot_component])
-
-        chatbot_component.value = [[None, initial_chatbot_message]]
-
+        ),
+        theme="soft",
+        examples=[
+            ["What is the capital of France?"],
+            ["Can you tell me a fun fact about outer space?"],
+            ["What's the best way to stay motivated?"],
+        ],
+        cache_examples=False,
+        # Gradio 4.x has `clear_btn` directly on ChatInterface again
+        # but if this causes issues, you might need to revert to a gr.ClearButton() below
+        clear_btn="Clear Chat"  # Re-added clear_btn as it seems to be supported again in latest Gradio versions
+    )

     demo.launch()
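One caveat on the added `chatbot=gr.Chatbot(..., type='messages', value=[[None, initial_chatbot_message]])` lines: Gradio's `messages` format expects role/content dicts, while `[[None, message]]` is the older pair (`tuples`) format, so the seed value may need converting. A hedged sketch of the dict-style equivalent, reusing the diff's variable names with an abridged message text:

import gradio as gr

# Sketch: with type="messages", seed the chat with an openai-style dict
# instead of a [user, assistant] pair (pairs belong to type="tuples").
initial_chatbot_message = "Hello! How can I help you today?"  # abridged stand-in

chatbot = gr.Chatbot(
    height=500,
    type="messages",
    value=[{"role": "assistant", "content": initial_chatbot_message}],
)

If `clear_btn` is rejected by the installed Gradio version, the commit's own comment already names the fallback: drop the argument and render a separate `gr.ClearButton` below the interface.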