hackergeek committed
Commit 4098b12 · verified · 1 Parent(s): 5809bb3

Update app.py

Files changed (1):
  1. app.py +8 -40
app.py CHANGED
@@ -2,18 +2,16 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-# Load model and tokenizer with CPU optimizations
+# Load model with CPU optimizations
 model = AutoModelForCausalLM.from_pretrained(
     "hackergeek/gemma-finetuned",
-    torch_dtype=torch.float32,  # Changed to float32 for CPU compatibility
-    device_map="cpu"  # Force CPU usage
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    low_cpu_mem_usage=True  # Now works with Accelerate installed
 )
 tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
 tokenizer.pad_token = tokenizer.eos_token
 
-# Explicitly move model to CPU (redundant but safe)
-model.to("cpu")
-
 def format_prompt(message, history):
     """Format the prompt with conversation history"""
     system_prompt = "You are a knowledgeable space expert assistant. Answer questions about astronomy, space exploration, and related topics in a clear and engaging manner."
@@ -26,50 +24,20 @@ def format_prompt(message, history):
     return prompt
 
 def respond(message, history):
-    # Format the prompt with conversation history
     full_prompt = format_prompt(message, history)
-
-    # Tokenize input (keep on CPU)
     inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
 
-    # Generate response with CPU-friendly parameters
     outputs = model.generate(
-        input_ids=inputs.input_ids,
+        inputs.input_ids,
         attention_mask=inputs.attention_mask,
-        max_new_tokens=512,  # Reduced for faster CPU processing
+        max_new_tokens=256,  # Reduced for CPU safety
         temperature=0.7,
         top_p=0.85,
         repetition_penalty=1.1,
-        do_sample=True,
-        no_repeat_ngram_size=2  # Added to reduce repetition
+        do_sample=True
     )
 
-    # Decode response
     response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
     return response
 
-# Simplified CSS for better CPU rendering
-space_css = """
-.gradio-container { background: #000000; color: #ffffff; }
-.chatbot { background: #0a0a2a !important; }
-"""
-
-with gr.Blocks(css=space_css) as demo:
-    gr.Markdown("# 🚀 CPU Space Chatbot 🌌")
-    gr.Markdown("Note: Responses may be slower due to CPU processing")
-
-    chatbot = gr.ChatInterface(
-        respond,
-        examples=[
-            "What is a neutron star?",
-            "Explain the Big Bang theory",
-            "How do rockets work?",
-            "What's the temperature on Venus?"
-        ],
-        clear_btn="Clear",
-    )
-    chatbot.chatbot.height = 500
-
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+# ... (rest of the Gradio interface code remains the same)
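
For reference, a minimal standalone sketch of the loading path after this commit. It assumes `transformers`, `torch`, and `accelerate` are installed; `low_cpu_mem_usage=True` depends on the Accelerate package, as the new inline comment notes:

```python
# Sketch of the revised CPU loading path.
# Assumes: pip install transformers torch accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    "hackergeek/gemma-finetuned",
    torch_dtype=torch.float32,  # full precision; float16 is slow or unsupported on most CPUs
    device_map="cpu",           # keep all weights on the CPU
    low_cpu_mem_usage=True,     # load weights incrementally to cut peak RAM; needs Accelerate
)
tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
```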
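
And a self-contained sketch of the updated generate-and-decode pattern in `respond`, reusing the app's own `format_prompt` (assuming it accepts an empty history list) and one of the example questions from the removed interface block. The slice in the last line drops the prompt tokens so only newly generated text is decoded:

```python
# Sketch of the updated generation call; model, tokenizer, and
# format_prompt are assumed to be defined as in app.py above.
prompt = format_prompt("What is a neutron star?", [])  # empty conversation history
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

outputs = model.generate(
    inputs.input_ids,                    # now passed positionally
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,                  # lowered from 512 for CPU safety
    temperature=0.7,
    top_p=0.85,
    repetition_penalty=1.1,
    do_sample=True,
)

# generate() returns prompt + completion, so skip the prompt's tokens
# before decoding to keep only the model's reply.
reply = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(reply)
```

Note that the placeholder comment is the last line of the new file: the Gradio interface it refers to (the `gr.Blocks` / `gr.ChatInterface` block and the `demo.launch(...)` call) now exists only among the removed lines above, so that block would need to be kept in `app.py` for the Space to continue serving the chat UI.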