# Spaces: Running on Zero
import gradio as gr | |
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
import torch | |
# Configuration
# Hugging Face Hub repository id of the checkpoint to load.
MODEL_NAME = "RekaAI/reka-flash-3"
# Default cap on newly generated tokens. Kept small for efficiency and chosen
# to sit inside the 128-512 range of the "Max Length" slider it seeds.
DEFAULT_MAX_LENGTH = 512
# Default sampling temperature shown in the UI.
DEFAULT_TEMPERATURE = 0.7

# System prompt with reasoning instructions: the model is asked to put its
# step-by-step reasoning inside <thinking> tags and answer after </thinking>.
SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
When responding, think step-by-step within <thinking> tags and conclude your answer after </thinking>.
For example:
User: What is 2+2?
Assistant: <thinking>Let me calculate that. 2 plus 2 equals 4.</thinking> The answer is 4."""
# Load model and tokenizer with 4-bit quantization.
# Everything is loaded once at import time so that request handlers can reuse
# the module-level `tokenizer` and `model`.
try:
    # NF4 4-bit weights with double quantization; matmuls run in float16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        # "auto" lets the loader decide placement across whatever devices are
        # available; it does not by itself pin the model to the CPU.
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer.pad_token = tokenizer.eos_token  # Ensure padding works
except Exception as e:
    # Re-raise with an actionable message while keeping the original
    # traceback attached as the explicit cause. RuntimeError is a subclass of
    # Exception, so existing `except Exception` handlers still catch it.
    raise RuntimeError(
        f"Failed to load model: {e}. Ensure access to {MODEL_NAME} and sufficient CPU memory."
    ) from e
def _format_history(chat_history):
    """Render earlier turns as ``human: ... <sep> assistant: ... <sep> `` text.

    Accepts both the role/content dict entries that this module appends to the
    history and legacy ``(user, assistant)`` pairs. Dict entries whose role is
    neither "user" nor "assistant", or whose content is not a string, are
    skipped.
    """
    speakers = {"user": "human", "assistant": "assistant"}
    parts = []
    for turn in chat_history:
        if isinstance(turn, dict):
            speaker = speakers.get(turn.get("role"))
            content = turn.get("content")
            if speaker is None or not isinstance(content, str):
                continue
            parts.append(f"{speaker}: {content} <sep> ")
        else:
            user_msg, assistant_msg = turn
            parts.append(f"human: {user_msg} <sep> assistant: {assistant_msg} <sep> ")
    return "".join(parts)


def _split_reasoning(response):
    """Split generated text into ``(reasoning, final_answer)`` at ``</thinking>``.

    When no closing tag is present the whole text is returned as the answer
    and the reasoning is empty.
    """
    if "</thinking>" not in response:
        return "", response
    reasoning, final_answer = response.split("</thinking>", 1)
    return reasoning.replace("<thinking>", "").strip(), final_answer.strip()


def generate_response(
    message,
    chat_history,
    system_prompt,
    max_length,
    temperature,
    top_p,
    top_k,
    repetition_penalty,
    show_reasoning
):
    """Generate a response from Reka Flash-3 with reasoning tags.

    Args:
        message: The new user message.
        chat_history: Conversation so far; a list of role/content dicts (the
            format this function appends) or legacy (user, assistant) pairs.
            It is extended in place with the new user and assistant turns.
        system_prompt: Text placed at the start of the prompt.
        max_length: Maximum number of new tokens to generate.
        temperature, top_p, top_k, repetition_penalty: Sampling parameters
            forwarded to ``model.generate``.
        show_reasoning: Whether to return the text found before
            ``</thinking>`` for display.

    Returns:
        A 3-tuple ``("", chat_history, reasoning_text)``: an empty string
        (used to clear the input box), the updated history, and either the
        formatted reasoning, an empty string, or an error message.
    """
    if chat_history is None:
        chat_history = []
    # Nothing to answer: leave the conversation untouched instead of spending
    # a generation pass on an empty prompt.
    if not message or not message.strip():
        return "", chat_history, ""
    try:
        # Format chat history and prompt (multi-round conversation). The
        # earlier turns are placed between the system prompt and the new
        # message so the model actually sees the conversation context.
        history_str = _format_history(chat_history)
        prompt = (
            f"{system_prompt} <sep> {history_str}"
            f"human: {message} <sep> assistant: <thinking>\n"
        )
        # Tokenize input and move it to wherever the model was placed.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]
        # Stop at <sep> (the turn separator) and at the tokenizer's own EOS.
        # An unknown "<sep>" may resolve to the unk id, which must not be used
        # as a stop token.
        sep_token_id = tokenizer.convert_tokens_to_ids("<sep>")
        if sep_token_id == getattr(tokenizer, "unk_token_id", None):
            sep_token_id = None
        stop_token_ids = list(dict.fromkeys(
            token_id
            for token_id in (sep_token_id, tokenizer.eos_token_id)
            if isinstance(token_id, int)
        ))
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_length),  # sliders may deliver floats
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            repetition_penalty=repetition_penalty,
            do_sample=True,
            eos_token_id=stop_token_ids or None,
            pad_token_id=tokenizer.eos_token_id
        )
        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) is unreliable because decoding with
        # skip_special_tokens can change the prompt's text length.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        response = response.split("<sep>")[0].strip()
        # Parse reasoning and final answer
        reasoning, final_answer = _split_reasoning(response)
        # Update chat history (drop reasoning to save tokens)
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": final_answer})
        # Display reasoning if requested
        reasoning_display = f"**Reasoning:**\n{reasoning}" if show_reasoning and reasoning else ""
        return "", chat_history, reasoning_display
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the event handler.
        error_msg = f"Error: {str(e)}"
        gr.Warning(error_msg)
        return "", chat_history, error_msg
# Gradio Interface
# Builds the chat page: header, deployment notes, conversation + reasoning
# panes, message box, generation options, and the event wiring.
with gr.Blocks(title="Reka Flash-3 Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# Reka Flash-3 Chat Interface
*Powered by [Reka AI](https://www.reka.ai/)* - A 21B parameter reasoning model optimized for CPU.
""")
    with gr.Accordion("Deployment Instructions", open=True):
        gr.Textbox(
            value="""To deploy on Hugging Face Spaces:
1. Request access to RekaAI/reka-flash-3 from Reka AI.
2. Use a Pro subscription with zero-GPU (CPU-only) hardware.
3. Ensure 32GB+ CPU memory for 4-bit quantization.
4. Install dependencies: gradio, transformers, torch, bitsandbytes.""",
            label="How to Deploy",
            interactive=False
        )
    with gr.Row():
        # type="messages": the history is a list of role/content dicts.
        chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")
        reasoning_display = gr.Textbox(label="Model Reasoning", interactive=False, lines=8)
    with gr.Row():
        message = gr.Textbox(label="Your Message", placeholder="Ask me anything...", lines=2)
        submit_btn = gr.Button("Send", variant="primary")
    with gr.Accordion("Options", open=True):
        # Clamp the configured default into the slider's own range so the
        # initial value is always one the slider can represent.
        max_length = gr.Slider(
            128,
            512,
            value=min(max(DEFAULT_MAX_LENGTH, 128), 512),
            label="Max Length",
            step=64,
        )
        temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature", step=0.1)
        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p", step=0.05)
        top_k = gr.Slider(1, 100, value=50, label="Top-k", step=1)
        repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty", step=0.1)
        system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT, lines=4)
        show_reasoning = gr.Checkbox(label="Show Reasoning", value=True)
    # Event handling
    # The order of `inputs` must match generate_response's parameter order,
    # and `outputs` must match its 3-tuple return value.
    inputs = [message, chatbot, system_prompt, max_length, temperature, top_p, top_k, repetition_penalty, show_reasoning]
    outputs = [message, chatbot, reasoning_display]
    # The Send button and pressing Enter in the message box run the same handler.
    for register in (submit_btn.click, message.submit):
        register(generate_response, inputs=inputs, outputs=outputs)
demo.launch(debug=True)