Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
|
|
| 54 |
print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
|
| 55 |
else:
|
| 56 |
print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
|
| 57 |
-
|
| 58 |
print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
|
| 59 |
try:
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
|
|
@@ -123,7 +123,7 @@ def predict_chat(message: str, history: list):
|
|
| 123 |
)
|
| 124 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 125 |
yield generated_text
|
| 126 |
-
|
| 127 |
end_time = time.time()
|
| 128 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 129 |
|
|
@@ -136,41 +136,35 @@ if __name__ == "__main__":
|
|
| 136 |
"environment for efficient demonstration. How can I help you today?"
|
| 137 |
)
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
f"This Space demonstrates an LLM for efficient CPU-only inference. "
|
| 145 |
f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
|
| 146 |
f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
|
| 147 |
f"without GGUF. Expect varied responses each run due to randomized generation."
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
["What is the capital of France?"],
|
| 161 |
-
["Can you tell me a fun fact about outer space?"],
|
| 162 |
-
["What's the best way to stay motivated?"],
|
| 163 |
-
],
|
| 164 |
-
cache_examples=False,
|
| 165 |
-
)
|
| 166 |
-
|
| 167 |
-
# Now explicitly place the chat_interface component into the Blocks layout
|
| 168 |
-
chat_interface.render()
|
| 169 |
-
|
| 170 |
-
# The clear button is typically below the chat interface
|
| 171 |
-
gr.ClearButton(components=[chatbot_component])
|
| 172 |
-
|
| 173 |
-
chatbot_component.value = [[None, initial_chatbot_message]]
|
| 174 |
-
|
| 175 |
|
| 176 |
demo.launch()
|
|
|
|
| 54 |
print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
|
| 55 |
else:
|
| 56 |
print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
|
| 57 |
+
|
| 58 |
print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
|
| 59 |
try:
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
|
|
|
|
| 123 |
)
|
| 124 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 125 |
yield generated_text
|
| 126 |
+
|
| 127 |
end_time = time.time()
|
| 128 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 129 |
|
|
|
|
| 136 |
"environment for efficient demonstration. How can I help you today?"
|
| 137 |
)
|
| 138 |
|
| 139 |
+
# Use gr.ChatInterface directly without gr.Blocks wrapper for simplicity
|
| 140 |
+
# This often works better when ChatInterface is the sole component
|
| 141 |
+
demo = gr.ChatInterface(
|
| 142 |
+
fn=predict_chat,
|
| 143 |
+
# Define the chatbot here, with type='messages'
|
| 144 |
+
chatbot=gr.Chatbot(height=500, type='messages',
|
| 145 |
+
value=[[None, initial_chatbot_message]]), # Set initial message directly here
|
| 146 |
+
textbox=gr.Textbox(
|
| 147 |
+
placeholder="Ask me a question...",
|
| 148 |
+
container=False,
|
| 149 |
+
scale=7
|
| 150 |
+
),
|
| 151 |
+
title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
|
| 152 |
+
description=(
|
| 153 |
f"This Space demonstrates an LLM for efficient CPU-only inference. "
|
| 154 |
f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
|
| 155 |
f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
|
| 156 |
f"without GGUF. Expect varied responses each run due to randomized generation."
|
| 157 |
+
),
|
| 158 |
+
theme="soft",
|
| 159 |
+
examples=[
|
| 160 |
+
["What is the capital of France?"],
|
| 161 |
+
["Can you tell me a fun fact about outer space?"],
|
| 162 |
+
["What's the best way to stay motivated?"],
|
| 163 |
+
],
|
| 164 |
+
cache_examples=False,
|
| 165 |
+
# Gradio 4.x has `clear_btn` directly on ChatInterface again
|
| 166 |
+
# but if this causes issues, you might need to revert to a gr.ClearButton() below
|
| 167 |
+
clear_btn="Clear Chat" # Re-added clear_btn as it seems to be supported again in latest Gradio versions
|
| 168 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
demo.launch()
|