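# CPU-only Gradio chat demo for the "hackergeek/gemma-finetuned" model.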
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer with CPU optimizations
model = AutoModelForCausalLM.from_pretrained(
    "hackergeek/gemma-finetuned",
    torch_dtype=torch.float32,  # float32: most CPUs lack fast half-precision support
    device_map="cpu"            # force CPU placement
)
tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
tokenizer.pad_token = tokenizer.eos_token

# Explicitly move model to CPU (redundant but safe)
model.to("cpu")

def format_prompt(message, history):
    """Format the prompt with conversation history"""
    system_prompt = "You are a knowledgeable space expert assistant. Answer questions about astronomy, space exploration, and related topics in a clear and engaging manner."
    prompt = f"<system>{system_prompt}</system>\n"
    
    for user_msg, bot_msg in history:
        prompt += f"<user>{user_msg}</user>\n<assistant>{bot_msg}</assistant>\n"
    
    prompt += f"<user>{message}</user>\n<assistant>"
    return prompt

def respond(message, history):
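    """Generate a reply from the fine-tuned model for the latest message and chat history."""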
    # Format the prompt with conversation history
    full_prompt = format_prompt(message, history)
    
    # Tokenize input (keep on CPU)
    inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
    
    # Generate a response with CPU-friendly parameters
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=512,         # kept modest so generation stays responsive on CPU
        temperature=0.7,
        top_p=0.85,
        repetition_penalty=1.1,
        do_sample=True,
        no_repeat_ngram_size=2,     # discourage repeated phrases
        pad_token_id=tokenizer.eos_token_id  # avoid the missing-pad-token warning
    )
    
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

    # The <user>/<assistant> tags are ordinary text, so trim anything past the assistant's turn
    response = response.split("<user>")[0].replace("</assistant>", "").strip()

    return response

# Minimal dark "space" theme for the chat interface
space_css = """
.gradio-container { background: #000000; color: #ffffff; }
.chatbot { background: #0a0a2a !important; }
"""

with gr.Blocks(css=space_css) as demo:
    gr.Markdown("# πŸš€ CPU Space Chatbot 🌌")
    gr.Markdown("Note: Responses may be slower due to CPU processing")
    
    chatbot = gr.ChatInterface(
        respond,
        examples=[
            "What is a neutron star?",
            "Explain the Big Bang theory",
            "How do rockets work?",
            "What's the temperature on Venus?"
        ],
        clear_btn="Clear",
    )
    chatbot.chatbot.height = 500  # enlarge the chat window after the interface is built

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)