mjavaid committed
Commit
004ab91
1 Parent(s): c53d7c3
Files changed (1):
  1. app.py +18 -13
app.py CHANGED
@@ -55,7 +55,8 @@ else:
 @spaces.GPU
 def generate_response(message, history):
     if model is None:
-        return "Sorry, the model could not be loaded. Please check the logs."
+        yield "Sorry, the model could not be loaded. Please check the logs."
+        return
 
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
@@ -75,25 +76,29 @@ def generate_response(message, history):
     # Tokenize input
     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
 
-    # Generate response
-    outputs = model.generate(
+    # Stream response generation
+    streamer = ""
+    for new_token in model.generate(
         input_ids,
-        max_new_tokens=512,  # Reduced from 1024 to improve speed
+        max_new_tokens=2048,
         temperature=0.7,
         do_sample=True,
-    )
-
-    # Decode the generated tokens
-    generated_tokens = outputs[0][len(input_ids[0]):]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
-    return response
+        streamer=None,  # We're implementing our own streaming
+    ):
+        # Get the new token and add it to the stream
+        next_token = new_token[0, -1].unsqueeze(0)
+        token_text = tokenizer.decode(next_token, skip_special_tokens=True)
+
+        if token_text:
+            streamer += token_text
+            yield streamer
+
 
-# Create Gradio interface
+# Create Gradio interface with streaming
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model.",
+    description="Chat with the Falcon3-Mamba-R1-v0 model. Responses are streamed in real-time.",
     examples=[
         "How does the surface area of moon compare with that of earth?",
         "Why it takes 8 minutes for sunlight to reach earth?"],