Spaces:

ZoroaStrella
/

RekaFlash

Running on Zero

App Files Files Community

ZoroaStrella commited on 22 days ago

Commit

e970aef

1 Parent(s): 646a0c2

Add transformers dependency and correct errors

Browse files

Files changed (2) hide show

app.py +65 -109
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,30 +1,37 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
-DEFAULT_MAX_LENGTH = 1024
 DEFAULT_TEMPERATURE = 0.7
-# System prompt with instructions for reasoning
-SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
-Provide detailed, helpful answers while maintaining safety.
-Format responses clearly using markdown when appropriate.
-When asked a question, think step by step inside <thinking> tags, then provide your final answer after </thinking> tags. For example:
 User: What is 2+2?
-Assistant: <thinking>
-Let me calculate that. 2 plus 2 equals 4.
-</thinking>
-The answer is 4."""
-# Load model and tokenizer (assuming CPU-only for zero GPU)
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu", torch_dtype=torch.float32)
 except Exception as e:
-    raise Exception(f"Failed to load model: {str(e)}. Ensure you have access to {MODEL_NAME} and sufficient CPU memory.")
 def generate_response(
     message,
@@ -35,21 +42,20 @@ def generate_response(
     top_p,
     top_k,
     repetition_penalty,
-    presence_penalty,
-    frequency_penalty,
     show_reasoning
 ):
-    """
-    Generate a response from Reka Flash-3, parsing reasoning and final answer.
-    """
     try:
-        # Format the prompt with thinking tags
-        formatted_prompt = f"{system_prompt}\n\nUser: {message}\n\nAssistant: <thinking>\n"
         # Tokenize input
-        inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
-        # Generate response
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_length,
@@ -57,127 +63,77 @@ def generate_response(
             top_p=top_p,
             top_k=top_k,
             repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            frequency_penalty=frequency_penalty,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 0
         )
-        # Decode the generated text
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response = response[len(formatted_prompt):]  # Remove the prompt from the output
         # Parse reasoning and final answer
         if "</thinking>" in response:
             reasoning, final_answer = response.split("</thinking>", 1)
-            reasoning = reasoning.strip()
             final_answer = final_answer.strip()
         else:
             reasoning = ""
-            final_answer = response.strip()
-        # Update chat history with final answer
-        chat_history.append((message, final_answer))
         # Display reasoning if requested
-        reasoning_display = reasoning if show_reasoning and reasoning else ""
-        if reasoning_display:
-            reasoning_display = f"**Reasoning:**\n{reasoning_display}"
         return "", chat_history, reasoning_display
     except Exception as e:
-        error_msg = f"Error generating response: {str(e)}"
         gr.Warning(error_msg)
         return "", chat_history, error_msg
-# UI Components
-with gr.Blocks(title="Reka Flash-3 Chat Demo", theme=gr.themes.Soft()) as demo:
-    # Header Section
     gr.Markdown("""
     # Reka Flash-3 Chat Interface
-    *Powered by [Reka Core AI](https://www.reka.ai/)*
     """)
-    # Deployment Notice
-    with gr.Accordion("Important Deployment Notice", open=True):
         gr.Textbox(
-            value="""To deploy this model on Hugging Face Spaces:
-1. Request the Reka Flash-3 OSS model from Reka AI (https://www.reka.ai/).
-2. Use a Hugging Face Pro subscription for deployment.
-3. Configure your Space with zero GPU (CPU-only) hardware.
-4. Ensure sufficient CPU memory for the 3B parameter model.""",
-            label="Deployment Instructions",
-            lines=5,
             interactive=False
         )
-    # Chat Interface
     with gr.Row():
-        chatbot = gr.Chatbot(height=500, label="Conversation")
-        reasoning_display = gr.Textbox(
-            label="Model Reasoning",
-            interactive=False,
-            visible=True,
-            lines=10,
-            max_lines=20
-        )
-    # Input Section
     with gr.Row():
-        message = gr.Textbox(
-            label="Your Message",
-            placeholder="Type your message here...",
-            lines=3,
-            max_lines=6
-        )
         submit_btn = gr.Button("Send", variant="primary")
-    # Normal Options
-    with gr.Accordion("Normal Options", open=True):
-        with gr.Row():
-            max_length = gr.Slider(128, 4096, value=DEFAULT_MAX_LENGTH, label="Max Length", step=128)
-            temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature", step=0.1)
-    # Advanced Options
-    with gr.Accordion("Advanced Options", open=False):
-        with gr.Row():
-            top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p", step=0.05)
-            top_k = gr.Slider(1, 100, value=50, label="Top-k", step=1)
-            repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty", step=0.1)
-        with gr.Row():
-            presence_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Presence Penalty", step=0.1)
-            frequency_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Frequency Penalty", step=0.1)
-    # System Prompt
-    system_prompt = gr.Textbox(
-        label="System Prompt",
-        value=SYSTEM_PROMPT,
-        lines=5,
-        max_lines=10
-    )
-    # Debug Options
-    show_reasoning = gr.Checkbox(label="Show Model Reasoning", value=True)
-    # Event Handling
-    inputs = [
-        message,
-        chatbot,
-        system_prompt,
-        max_length,
-        temperature,
-        top_p,
-        top_k,
-        repetition_penalty,
-        presence_penalty,
-        frequency_penalty,
-        show_reasoning
-    ]
-    outputs = [message, chatbot, reasoning_display]
     submit_btn.click(generate_response, inputs=inputs, outputs=outputs)
     message.submit(generate_response, inputs=inputs, outputs=outputs)
-# Launch the interface
 demo.launch(debug=True)

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 # Configuration
 MODEL_NAME = "RekaAI/reka-flash-3"
+DEFAULT_MAX_LENGTH = 4096  # Reduced for CPU efficiency
 DEFAULT_TEMPERATURE = 0.7
+# System prompt with reasoning instructions
+SYSTEM_PROMPT = """You are Reka Flash-3, a helpful AI assistant created by Reka AI.
+When responding, think step-by-step within <thinking> tags and conclude your answer after </thinking>.
+For example:
 User: What is 2+2?
+Assistant: <thinking>Let me calculate that. 2 plus 2 equals 4.</thinking> The answer is 4."""
+# Load model and tokenizer with 4-bit quantization
 try:
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4"
+    )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        quantization_config=quantization_config,
+        device_map="auto",  # Maps to CPU
+        torch_dtype=torch.float16
+    )
+    tokenizer.pad_token = tokenizer.eos_token  # Ensure padding works
 except Exception as e:
+    raise Exception(f"Failed to load model: {str(e)}. Ensure access to {MODEL_NAME} and sufficient CPU memory.")
 def generate_response(
     message,
     top_p,
     top_k,
     repetition_penalty,
     show_reasoning
 ):
+    """Generate a response from Reka Flash-3 with reasoning tags."""
     try:
+        # Format chat history and prompt (multi-round conversation)
+        history_str = ""
+        for user_msg, assistant_msg in chat_history:
+            history_str += f"human: {user_msg} <sep> assistant: {assistant_msg} <sep> "
+        prompt = f"{system_prompt} <sep> human: {message} <sep> assistant: <thinking>\n"
         # Tokenize input
+        inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+        # Generate response with budget forcing
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_length,
             top_p=top_p,
             top_k=top_k,
             repetition_penalty=repetition_penalty,
             do_sample=True,
+            eos_token_id=tokenizer.convert_tokens_to_ids("<sep>"),  # Stop at <sep>
+            pad_token_id=tokenizer.eos_token_id
         )
+        # Decode and clean response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        response = response[len(prompt):].split("<sep>")[0].strip()  # Extract assistant response
         # Parse reasoning and final answer
         if "</thinking>" in response:
             reasoning, final_answer = response.split("</thinking>", 1)
+            reasoning = reasoning.replace("<thinking>", "").strip()
             final_answer = final_answer.strip()
         else:
             reasoning = ""
+            final_answer = response
+        # Update chat history (drop reasoning to save tokens)
+        chat_history.append({"role": "user", "content": message})
+        chat_history.append({"role": "assistant", "content": final_answer})
         # Display reasoning if requested
+        reasoning_display = f"**Reasoning:**\n{reasoning}" if show_reasoning and reasoning else ""
         return "", chat_history, reasoning_display
     except Exception as e:
+        error_msg = f"Error: {str(e)}"
         gr.Warning(error_msg)
         return "", chat_history, error_msg
+# Gradio Interface
+with gr.Blocks(title="Reka Flash-3 Chat", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # Reka Flash-3 Chat Interface
+    *Powered by [Reka AI](https://www.reka.ai/)* - A 21B parameter reasoning model optimized for CPU.
     """)
+    with gr.Accordion("Deployment Instructions", open=True):
         gr.Textbox(
+            value="""To deploy on Hugging Face Spaces:
+1. Request access to RekaAI/reka-flash-3 from Reka AI.
+2. Use a Pro subscription with zero-GPU (CPU-only) hardware.
+3. Ensure 32GB+ CPU memory for 4-bit quantization.
+4. Install dependencies: gradio, transformers, torch, bitsandbytes.""",
+            label="How to Deploy",
             interactive=False
         )
     with gr.Row():
+        chatbot = gr.Chatbot(type="messages", height=400, label="Conversation")
+        reasoning_display = gr.Textbox(label="Model Reasoning", interactive=False, lines=8)
     with gr.Row():
+        message = gr.Textbox(label="Your Message", placeholder="Ask me anything...", lines=2)
         submit_btn = gr.Button("Send", variant="primary")
+    with gr.Accordion("Options", open=True):
+        max_length = gr.Slider(128, 512, value=DEFAULT_MAX_LENGTH, label="Max Length", step=64)
+        temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMPERATURE, label="Temperature", step=0.1)
+        top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p", step=0.05)
+        top_k = gr.Slider(1, 100, value=50, label="Top-k", step=1)
+        repetition_penalty = gr.Slider(0.1, 2.0, value=1.1, label="Repetition Penalty", step=0.1)
+    system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT, lines=4)
+    show_reasoning = gr.Checkbox(label="Show Reasoning", value=True)
+    # Event handling
+    inputs = [message, chatbot, system_prompt, max_length, temperature, top_p, top_k, repetition_penalty, show_reasoning]
+    outputs = [message, chatbot, reasoning_display]
     submit_btn.click(generate_response, inputs=inputs, outputs=outputs)
     message.submit(generate_response, inputs=inputs, outputs=outputs)
 demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
 gradio>=3.50
 huggingface_hub==0.25.2
-torch

 gradio>=3.50
 huggingface_hub==0.25.2
+torch
+transformers
+bitsandbytes