mjavaid committed
Commit
004ab91
1 Parent(s): c53d7c3
Files changed (1):
  1. app.py +18 -13
app.py CHANGED
@@ -55,7 +55,8 @@ else:
 @spaces.GPU
 def generate_response(message, history):
     if model is None:
-        return "Sorry, the model could not be loaded. Please check the logs."
+        yield "Sorry, the model could not be loaded. Please check the logs."
+        return
 
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
@@ -75,25 +76,29 @@ def generate_response(message, history):
     # Tokenize input
     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
 
-    # Generate response
-    outputs = model.generate(
+    # Stream response generation
+    streamer = ""
+    for new_token in model.generate(
         input_ids,
-        max_new_tokens=512,  # Reduced from 1024 to improve speed
+        max_new_tokens=2048,
         temperature=0.7,
         do_sample=True,
-    )
-
-    # Decode the generated tokens
-    generated_tokens = outputs[0][len(input_ids[0]):]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
-    return response
+        streamer=None,  # We're implementing our own streaming
+    ):
+        # Get the new token and add it to the stream
+        next_token = new_token[0, -1].unsqueeze(0)
+        token_text = tokenizer.decode(next_token, skip_special_tokens=True)
+
+        if token_text:
+            streamer += token_text
+            yield streamer
+
 
-# Create Gradio interface
+# Create Gradio interface with streaming
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model.",
+    description="Chat with the Falcon3-Mamba-R1-v0 model. Responses are streamed in real-time.",
     examples=[
         "How does the surface area of moon compare with that of earth?",
         "Why it takes 8 minutes for sunlight to reach earth?"],