hanzla committed
Commit 9be27a9 · verified · 1 Parent(s): 2cf380d

Update app.py

Files changed (1)
  1. app.py +150 -89
app.py CHANGED
@@ -1,105 +1,166 @@
-import gradio as gr
 import subprocess
 import sys
-import os
 import spaces

-# Install the necessary packages that require CUDA
-try:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
-except Exception as e:
-    print(f"Warning: Could not install CUDA extensions: {e}")
-    print("The model might not work correctly or will be slower.")

-# Now import the required libraries
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch

-# Define model repository
-repo_name = "hanzla/Falcon3-Mamba-R1-v0"

-# Load tokenizer
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(repo_name)

-# Load model with appropriate settings
-print("Loading model... (this may take some time)")
-model = None

-try:
-    # Try to load the model with GPU acceleration
-    model = AutoModelForCausalLM.from_pretrained(
-        repo_name,
-        device_map="auto",
-        torch_dtype=torch.bfloat16,
-    )
-except Exception as e:
-    print(f"Error loading model with GPU: {e}")
-    print("Attempting to load with CPU only...")
-    try:
-        model = AutoModelForCausalLM.from_pretrained(
-            repo_name,
-            device_map="cpu",
-            torch_dtype=torch.float32,
-        )
-    except Exception as e2:
-        print(f"Error loading model with CPU: {e2}")
-
-if model is None:
-    print("Could not load the model. Please check the logs.")
-else:
-    print("Model loaded successfully!")

  @spaces.GPU
-def generate_response(message, history):
-    print(message)
-    if model is None:
-        return "Sorry, the model could not be loaded. Please check the logs."
-
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant. You think out loud before answering anything"},
-    ]
-
-    # Add chat history to messages
-    for h in history:
-        messages.append({"role": "user", "content": h[0]})
-        messages.append({"role": "assistant", "content": h[1]})
-
-    # Add current message
-    messages.append({"role": "user", "content": message})
-
-    # Generate input text using chat template
-    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    # Tokenize input
-    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
-
-    # Generate response
-    outputs = model.generate(
-        input_ids,
-        max_new_tokens=8000,
-        temperature=0.7,
-        do_sample=True,
     )
-
-    # Decode the generated tokens
-    generated_tokens = outputs[0][len(input_ids[0]):]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
-    return response
-
-# Create Gradio interface
-demo = gr.ChatInterface(
-    generate_response,
-    title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model.",
-    examples=[
-        "How does the surface area of moon compare with that of earth?",
-        "Why it takes 8 minutes for sunlight to reach earth?"],
-    theme="soft"
-)
-
-# Launch the interface
-demo.launch()

 import subprocess
 import sys
+import shlex
 import spaces
+import torch
+import uuid
+import os
+import json
+from pathlib import Path
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread

+# install packages for mamba
+def install_mamba():
+    subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
+    subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))

+install_mamba()

+MODEL = "hanzla/Falcon3-Mamba-R1-v0"

+TITLE = "<h1><center>Falcon3-Mamba-R1-v0 playground</center></h1>"

+SUB_TITLE = """<center>Falcon3 Mamba R1 is a Selective State Space model (Mamba) that scales on test time compute for reasoning.</center>"""
+SYSTEM_PROMPT = os.getenv('SYSTEM_PROMPT')

+CSS = """
+.duplicate-button {
+    margin: auto !important;
+    color: white !important;
+    background: black !important;
+    border-radius: 100vh !important;
+}
+h3 {
+    text-align: center;
+/* Fix for chat container */
+.chat-container {
+    height: 600px !important;
+    overflow-y: auto !important;
+    flex-direction: column !important;
+}
+.messages-container {
+    flex-grow: 1 !important;
+    overflow-y: auto !important;
+    padding-right: 10px !important;
+}
+/* Ensure consistent height */
+.contain {
+    height: 100% !important;
+}
+"""
+
+END_MESSAGE = """
+\n
+**The conversation has reached to its end, please press "Clear" to restart a new conversation**
+"""
+
+device = "cuda" # for GPU usage or "cpu" for CPU usage
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL,
+    torch_dtype=torch.bfloat16,
+).to(device)
+
+if device == "cuda":
+    model = torch.compile(model)

  @spaces.GPU
+def stream_chat(
+    message: str,
+    history: list,
+    temperature: float = 0.3,
+    max_new_tokens: int = 100,
+    top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.2,
+):
+    print(f'message: {message}')
+    print(f'history: {history}')
+
+    conversation = []
+    for prompt, answer in history:
+        conversation.extend([
+            {"role": 'system', "content": SYSTEM_PROMPT },
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": answer},
+        ])
+
+    conversation.append({"role": "user", "content": message})

+    input_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=40.0, skip_prompt=True, skip_special_tokens=True)

+    generate_kwargs = dict(
+        input_ids=inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        streamer=streamer,
+        pad_token_id=11,
     )
+
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("\nUser", "")
+        buffer = buffer.replace("\nSystem", "")
+        yield buffer
+
+    print(f'response: {buffer}')
+
+with gr.Blocks(css=CSS, theme="soft") as demo:
+    gr.HTML(TITLE)
+    gr.HTML(SUB_TITLE)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

+    chat_interface = gr.ChatInterface(
+        fn=stream_chat,
+        chatbot=gr.Chatbot(
+            height=600,
+            container=True,
+            elem_classes=["chat-container"]
+        ),
+        fill_height=True,
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.3, label="Temperature", render=False),
+            gr.Slider(minimum=128, maximum=32768, step=1, value=1024, label="Max new tokens", render=False),
+            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p", render=False),
+            gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k", render=False),
+            gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.2, label="Repetition penalty", render=False),
+        ],
+        examples=[
+            ["""Consider the following statements:
+
+1. If it rains, then the ground will be wet.
+2. It is raining.
+
+Using propositional logic, determine whether the conclusion "The ground is wet" is valid.
+Also, identify the rule of inference used to reach the conclusion.
+"""],
+            ["""A satellite is in a circular orbit around Earth at an altitude of 500 km above the surface. Calculate:
+
+1. The orbital velocity of the satellite.
+2. The orbital period of the satellite.
+
+Given:
+- Radius of Earth, R_E = 6.37 × 10^6 m
+- Gravitational constant, G = 6.674 × 10^−11 Nm²/kg²
+- Mass of Earth, M_E = 5.97 × 10^24 kg"""],
+        ],
+        cache_examples=False,
+    )
+
+if __name__ == "__main__":
+    demo.launch()
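
Note on the new generation path: `stream_chat` streams partial output by running `model.generate` in a background thread and reading decoded text from a `TextIteratorStreamer`. A minimal sketch of that pattern follows; it uses `gpt2` purely as a stand-in checkpoint (an assumption, so the sketch runs without the CUDA Mamba wheels installed by `install_mamba`).

```python
# Minimal sketch of the TextIteratorStreamer pattern used by stream_chat.
# "gpt2" is only a placeholder model so the sketch runs anywhere.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")

# Yields decoded text chunks as generate() produces tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while the main
# thread consumes the streamer and re-emits a growing buffer.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, max_new_tokens=40, do_sample=False, streamer=streamer),
)
thread.start()

buffer = ""
for chunk in streamer:
    buffer += chunk
    print(buffer)  # a Gradio ChatInterface generator would `yield buffer` here
thread.join()
```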