Spaces:

ajsbsd
/

smollm2-zerocpu-demo

Running

App Files Files Community

ajsbsd committed on Jun 16

Commit

ee2d859

verified ·

1 Parent(s): 9656c26

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -34

app.py CHANGED Viewed

@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
             print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
@@ -123,7 +123,7 @@ def predict_chat(message: str, history: list):
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
@@ -136,41 +136,35 @@ if __name__ == "__main__":
         "environment for efficient demonstration. How can I help you today?"
     )
-    chatbot_component = gr.Chatbot(height=500, type='messages')
-    with gr.Blocks(theme="soft") as demo:
-        gr.Markdown(
-            f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
             f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
-        )
-        # This is the key change: explicitly placing the chat_interface component
-        chat_interface = gr.ChatInterface(
-            fn=predict_chat,
-            chatbot=chatbot_component,
-            textbox=gr.Textbox(
-                placeholder="Ask me a question...",
-                container=False,
-                scale=7
-            ),
-            examples=[
-                ["What is the capital of France?"],
-                ["Can you tell me a fun fact about outer space?"],
-                ["What's the best way to stay motivated?"],
-            ],
-            cache_examples=False,
-        )
-        # Now explicitly place the chat_interface component into the Blocks layout
-        chat_interface.render()
-        # The clear button is typically below the chat interface
-        gr.ClearButton(components=[chatbot_component])
-        chatbot_component.value = [[None, initial_chatbot_message]]
     demo.launch()

             print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
         "environment for efficient demonstration. How can I help you today?"
     )
+    # Use gr.ChatInterface directly without gr.Blocks wrapper for simplicity
+    # This often works better when ChatInterface is the sole component
+    demo = gr.ChatInterface(
+        fn=predict_chat,
+        # Define the chatbot here, with type='messages'
+        chatbot=gr.Chatbot(height=500, type='messages',
+                           value=[[None, initial_chatbot_message]]), # Set initial message directly here
+        textbox=gr.Textbox(
+            placeholder="Ask me a question...",
+            container=False,
+            scale=7
+        ),
+        title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
+        description=(
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
             f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
+        ),
+        theme="soft",
+        examples=[
+            ["What is the capital of France?"],
+            ["Can you tell me a fun fact about outer space?"],
+            ["What's the best way to stay motivated?"],
+        ],
+        cache_examples=False,
+        # Gradio 4.x has `clear_btn` directly on ChatInterface again
+        # but if this causes issues, you might need to revert to a gr.ClearButton() below
+        clear_btn="Clear Chat" # Re-added clear_btn as it seems to be supported again in latest Gradio versions
+    )
     demo.launch()