mjavaid committed
Commit dc060e7 · 1 Parent(s): 004ab91
Files changed (1)
  1. app.py +13 -18
app.py CHANGED
@@ -55,8 +55,7 @@ else:
 @spaces.GPU
 def generate_response(message, history):
     if model is None:
-        yield "Sorry, the model could not be loaded. Please check the logs."
-        return
+        return "Sorry, the model could not be loaded. Please check the logs."
 
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
@@ -76,29 +75,25 @@ def generate_response(message, history):
     # Tokenize input
     input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
 
-    # Stream response generation
-    streamer = ""
-    for new_token in model.generate(
+    # Generate response
+    outputs = model.generate(
         input_ids,
-        max_new_tokens=2048,
+        max_new_tokens=2048,  # Cap on response length
         temperature=0.7,
         do_sample=True,
-        streamer=None,  # We're implementing our own streaming
-    ):
-        # Get the new token and add it to the stream
-        next_token = new_token[0, -1].unsqueeze(0)
-        token_text = tokenizer.decode(next_token, skip_special_tokens=True)
-
-        if token_text:
-            streamer += token_text
-            yield streamer
-
+    )
+
+    # Decode only the newly generated tokens, skipping the echoed prompt
+    generated_tokens = outputs[0][len(input_ids[0]):]
+    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+    return response
 
-# Create Gradio interface with streaming
+# Create Gradio interface
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model. Responses are streamed in real-time.",
+    description="Chat with the Falcon3-Mamba-R1-v0 model.",
     examples=[
         "How does the surface area of the Moon compare with that of the Earth?",
         "Why does it take 8 minutes for sunlight to reach Earth?"],