hackergeek committed
Commit 4098b12 · verified · 1 Parent(s): 5809bb3

Update app.py

Files changed (1):
  1. app.py +8 -40
app.py CHANGED
@@ -2,18 +2,16 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-# Load model and tokenizer with CPU optimizations
+# Load model with CPU optimizations
 model = AutoModelForCausalLM.from_pretrained(
     "hackergeek/gemma-finetuned",
-    torch_dtype=torch.float32,  # Changed to float32 for CPU compatibility
-    device_map="cpu"  # Force CPU usage
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    low_cpu_mem_usage=True  # Now works with Accelerate installed
 )
 tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
 tokenizer.pad_token = tokenizer.eos_token
 
-# Explicitly move model to CPU (redundant but safe)
-model.to("cpu")
-
 def format_prompt(message, history):
     """Format the prompt with conversation history"""
     system_prompt = "You are a knowledgeable space expert assistant. Answer questions about astronomy, space exploration, and related topics in a clear and engaging manner."
@@ -26,50 +24,20 @@ def format_prompt(message, history):
     return prompt
 
 def respond(message, history):
-    # Format the prompt with conversation history
     full_prompt = format_prompt(message, history)
-
-    # Tokenize input (keep on CPU)
     inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
 
-    # Generate response with CPU-friendly parameters
     outputs = model.generate(
-        input_ids=inputs.input_ids,
+        inputs.input_ids,
         attention_mask=inputs.attention_mask,
-        max_new_tokens=512,  # Reduced for faster CPU processing
+        max_new_tokens=256,  # Reduced for CPU safety
         temperature=0.7,
         top_p=0.85,
         repetition_penalty=1.1,
-        do_sample=True,
-        no_repeat_ngram_size=2  # Added to reduce repetition
+        do_sample=True
     )
 
-    # Decode response
     response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
     return response
 
-# Simplified CSS for better CPU rendering
-space_css = """
-.gradio-container { background: #000000; color: #ffffff; }
-.chatbot { background: #0a0a2a !important; }
-"""
-
-with gr.Blocks(css=space_css) as demo:
-    gr.Markdown("# 🚀 CPU Space Chatbot 🌌")
-    gr.Markdown("Note: Responses may be slower due to CPU processing")
-
-    chatbot = gr.ChatInterface(
-        respond,
-        examples=[
-            "What is a neutron star?",
-            "Explain the Big Bang theory",
-            "How do rockets work?",
-            "What's the temperature on Venus?"
-        ],
-        clear_btn="Clear",
-    )
-    chatbot.chatbot.height = 500
-
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+# ... (rest of the Gradio interface code remains the same)
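
For reference, a minimal standalone sketch of the loading path after this commit. It assumes `transformers`, `torch`, and `accelerate` are installed; `low_cpu_mem_usage=True` depends on the Accelerate package, as the new inline comment notes:

```python
# Sketch of the revised CPU loading path.
# Assumes: pip install transformers torch accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    "hackergeek/gemma-finetuned",
    torch_dtype=torch.float32,  # full precision; float16 is slow or unsupported on most CPUs
    device_map="cpu",           # keep all weights on the CPU
    low_cpu_mem_usage=True,     # load weights incrementally to cut peak RAM; needs Accelerate
)
tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
```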
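
And a self-contained sketch of the updated generate-and-decode pattern in `respond`, reusing the app's own `format_prompt` (assuming it accepts an empty history list) and one of the example questions from the removed interface block. The slice in the last line drops the prompt tokens so only newly generated text is decoded:

```python
# Sketch of the updated generation call; model, tokenizer, and
# format_prompt are assumed to be defined as in app.py above.
prompt = format_prompt("What is a neutron star?", [])  # empty conversation history
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

outputs = model.generate(
    inputs.input_ids,                    # now passed positionally
    attention_mask=inputs.attention_mask,
    max_new_tokens=256,                  # lowered from 512 for CPU safety
    temperature=0.7,
    top_p=0.85,
    repetition_penalty=1.1,
    do_sample=True,
)

# generate() returns prompt + completion, so skip the prompt's tokens
# before decoding to keep only the model's reply.
reply = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(reply)
```

Note that the placeholder comment is the last line of the new file: the Gradio interface it refers to (the `gr.Blocks` / `gr.ChatInterface` block and the `demo.launch(...)` call) now exists only among the removed lines above, so that block would need to be kept in `app.py` for the Space to continue serving the chat UI.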