mjavaid committed
Commit dc060e7 · 1 Parent(s): 004ab91
Files changed (1)
  1. app.py +13 -18
app.py CHANGED
@@ -55,8 +55,7 @@ else:
 @spaces.GPU
 def generate_response(message, history):
     if model is None:
-        yield "Sorry, the model could not be loaded. Please check the logs."
-        return
+        return "Sorry, the model could not be loaded. Please check the logs."
 
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
@@ -76,29 +75,25 @@ def generate_response(message, history):
     # Tokenize input
     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
 
-    # Stream response generation
-    streamer = ""
-    for new_token in model.generate(
+    # Generate response
+    outputs = model.generate(
         input_ids,
-        max_new_tokens=2048,
+        max_new_tokens=2048,  # Cap on response length
         temperature=0.7,
         do_sample=True,
-        streamer=None,  # We're implementing our own streaming
-    ):
-        # Get the new token and add it to the stream
-        next_token = new_token[0, -1].unsqueeze(0)
-        token_text = tokenizer.decode(next_token, skip_special_tokens=True)
-
-        if token_text:
-            streamer += token_text
-            yield streamer
-
+    )
+
+    # Decode only the newly generated tokens, skipping the echoed prompt
+    generated_tokens = outputs[0][len(input_ids[0]):]
+    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+    return response
 
-# Create Gradio interface with streaming
+# Create Gradio interface
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model. Responses are streamed in real-time.",
+    description="Chat with the Falcon3-Mamba-R1-v0 model.",
     examples=[
         "How does the surface area of the Moon compare with that of the Earth?",
         "Why does it take 8 minutes for sunlight to reach Earth?"],