import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model with CPU optimizations
model = AutoModelForCausalLM.from_pretrained(
    "hackergeek/gemma-finetuned",
    torch_dtype=torch.float32,
    device_map="cpu",
    low_cpu_mem_usage=True,  # Now works with Accelerate installed
)
tokenizer = AutoTokenizer.from_pretrained("hackergeek/gemma-finetuned")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def format_prompt(message, history):
    """Format the prompt with conversation history.

    The prompt is the system prompt, then every past turn, then the new
    message, each on its own line and terminated by a newline.

    Args:
        message: The new user message.
        history: Previous turns. Either ``(user_msg, bot_msg)`` pairs or
            dicts carrying a ``"content"`` key (Gradio "messages" style).
            ``None`` or an empty sequence means no history.

    Returns:
        The full prompt string to feed to the tokenizer.
    """
    system_prompt = "You are a knowledgeable space expert assistant. Answer questions about astronomy, space exploration, and related topics in a clear and engaging manner."
    lines = [system_prompt]
    for turn in history or ():
        if isinstance(turn, dict):
            # Dict-style entry: only plain-text content can go into a text
            # prompt, so anything else (e.g. attachments) is skipped.
            content = turn.get("content")
            if isinstance(content, str):
                lines.append(content)
            continue
        # Pair-style entry: skip missing halves (e.g. a reply that is still
        # pending) instead of writing the literal text "None".
        lines.extend(f"{text}" for text in turn if text is not None)
    lines.append(f"{message}")
    return "\n".join(lines) + "\n"


def respond(message, history):
    """Generate the assistant's reply to ``message`` given ``history``.

    Args:
        message: The new user message.
        history: Previous turns, in any form ``format_prompt`` accepts.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        is sliced off and special tokens are removed).
    """
    full_prompt = format_prompt(message, history)
    # NOTE(review): add_special_tokens=False means no BOS token is prepended;
    # confirm this matches how the fine-tuned checkpoint was trained.
    inputs = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,  # Reduced for CPU safety
        temperature=0.7,
        top_p=0.85,
        repetition_penalty=1.1,
        do_sample=True,
        # Keep generation's padding id in line with the tokenizer override above.
        pad_token_id=tokenizer.pad_token_id,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response


# ... (rest of the Gradio interface code remains the same)