hanzla committed
Commit 9be27a9 · verified · 1 Parent(s): 2cf380d

Update app.py

Files changed (1)
  1. app.py +150 -89
app.py CHANGED
@@ -1,105 +1,166 @@
-import gradio as gr
 import subprocess
 import sys
-import os
 import spaces

-# Install the necessary packages that require CUDA
-try:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
-except Exception as e:
-    print(f"Warning: Could not install CUDA extensions: {e}")
-    print("The model might not work correctly or will be slower.")

-# Now import the required libraries
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch

-# Define model repository
-repo_name = "hanzla/Falcon3-Mamba-R1-v0"

-# Load tokenizer
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(repo_name)

-# Load model with appropriate settings
-print("Loading model... (this may take some time)")
-model = None

-try:
-    # Try to load the model with GPU acceleration
-    model = AutoModelForCausalLM.from_pretrained(
-        repo_name,
-        device_map="auto",
-        torch_dtype=torch.bfloat16,
-    )
-except Exception as e:
-    print(f"Error loading model with GPU: {e}")
-    print("Attempting to load with CPU only...")
-    try:
-        model = AutoModelForCausalLM.from_pretrained(
-            repo_name,
-            device_map="cpu",
-            torch_dtype=torch.float32,
-        )
-    except Exception as e2:
-        print(f"Error loading model with CPU: {e2}")
-
-if model is None:
-    print("Could not load the model. Please check the logs.")
-else:
-    print("Model loaded successfully!")

  @spaces.GPU
-def generate_response(message, history):
-    print(message)
-    if model is None:
-        return "Sorry, the model could not be loaded. Please check the logs."
-
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant. You think out loud before answering anything"},
-    ]
-
-    # Add chat history to messages
-    for h in history:
-        messages.append({"role": "user", "content": h[0]})
-        messages.append({"role": "assistant", "content": h[1]})
-
-    # Add current message
-    messages.append({"role": "user", "content": message})
-
-    # Generate input text using chat template
-    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    # Tokenize input
-    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
-
-    # Generate response
-    outputs = model.generate(
-        input_ids,
-        max_new_tokens=8000,
-        temperature=0.7,
-        do_sample=True,
     )
-
-    # Decode the generated tokens
-    generated_tokens = outputs[0][len(input_ids[0]):]
-    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
-    return response
-
-# Create Gradio interface
-demo = gr.ChatInterface(
-    generate_response,
-    title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model.",
-    examples=[
-        "How does the surface area of moon compare with that of earth?",
-        "Why it takes 8 minutes for sunlight to reach earth?"],
-    theme="soft"
-)
-
-# Launch the interface
-demo.launch()

 import subprocess
 import sys
+import shlex
 import spaces
+import torch
+import uuid
+import os
+import json
+from pathlib import Path
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread

+# install packages for mamba
+def install_mamba():
+    subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
+    subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))

+install_mamba()

+MODEL = "hanzla/Falcon3-Mamba-R1-v0"

+TITLE = "<h1><center>Falcon3-Mamba-R1-v0 playground</center></h1>"

+SUB_TITLE = """<center>Falcon3 Mamba R1 is a Selective State Space model (Mamba) that scales on test time compute for reasoning.</center>"""
+SYSTEM_PROMPT = os.getenv('SYSTEM_PROMPT')

+CSS = """
+.duplicate-button {
+    margin: auto !important;
+    color: white !important;
+    background: black !important;
+    border-radius: 100vh !important;
+}
+h3 {
+    text-align: center;
+/* Fix for chat container */
+.chat-container {
+    height: 600px !important;
+    overflow-y: auto !important;
+    flex-direction: column !important;
+}
+.messages-container {
+    flex-grow: 1 !important;
+    overflow-y: auto !important;
+    padding-right: 10px !important;
+}
+/* Ensure consistent height */
+.contain {
+    height: 100% !important;
+}
+"""
+
+END_MESSAGE = """
+\n
+**The conversation has reached to its end, please press "Clear" to restart a new conversation**
+"""
+
+device = "cuda" # for GPU usage or "cpu" for CPU usage
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL,
+    torch_dtype=torch.bfloat16,
+).to(device)
+
+if device == "cuda":
+    model = torch.compile(model)

  @spaces.GPU
+def stream_chat(
+    message: str,
+    history: list,
+    temperature: float = 0.3,
+    max_new_tokens: int = 100,
+    top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.2,
+):
+    print(f'message: {message}')
+    print(f'history: {history}')
+
+    conversation = []
+    for prompt, answer in history:
+        conversation.extend([
+            {"role": 'system', "content": SYSTEM_PROMPT },
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": answer},
+        ])
+
+    conversation.append({"role": "user", "content": message})

+    input_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=40.0, skip_prompt=True, skip_special_tokens=True)

+    generate_kwargs = dict(
+        input_ids=inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False if temperature == 0 else True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        streamer=streamer,
+        pad_token_id=11,
     )
+
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("\nUser", "")
+        buffer = buffer.replace("\nSystem", "")
+        yield buffer
+
+    print(f'response: {buffer}')
+
+with gr.Blocks(css=CSS, theme="soft") as demo:
+    gr.HTML(TITLE)
+    gr.HTML(SUB_TITLE)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

+    chat_interface = gr.ChatInterface(
+        fn=stream_chat,
+        chatbot=gr.Chatbot(
+            height=600,
+            container=True,
+            elem_classes=["chat-container"]
+        ),
+        fill_height=True,
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.3, label="Temperature", render=False),
+            gr.Slider(minimum=128, maximum=32768, step=1, value=1024, label="Max new tokens", render=False),
+            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p", render=False),
+            gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k", render=False),
+            gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.2, label="Repetition penalty", render=False),
+        ],
+        examples=[
+            ["""Consider the following statements:
+
+1. If it rains, then the ground will be wet.
+2. It is raining.
+
+Using propositional logic, determine whether the conclusion "The ground is wet" is valid.
+Also, identify the rule of inference used to reach the conclusion.
+"""],
+            ["""A satellite is in a circular orbit around Earth at an altitude of 500 km above the surface. Calculate:
+
+1. The orbital velocity of the satellite.
+2. The orbital period of the satellite.
+
+Given:
+- Radius of Earth, R_E = 6.37 × 10^6 m
+- Gravitational constant, G = 6.674 × 10^−11 Nm²/kg²
+- Mass of Earth, M_E = 5.97 × 10^24 kg"""],
+        ],
+        cache_examples=False,
+    )
+
+if __name__ == "__main__":
+    demo.launch()
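
Note on the new generation path: `stream_chat` streams partial output by running `model.generate` in a background thread and reading decoded text from a `TextIteratorStreamer`. A minimal sketch of that pattern follows; it uses `gpt2` purely as a stand-in checkpoint (an assumption, so the sketch runs without the CUDA Mamba wheels installed by `install_mamba`).

```python
# Minimal sketch of the TextIteratorStreamer pattern used by stream_chat.
# "gpt2" is only a placeholder model so the sketch runs anywhere.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")

# Yields decoded text chunks as generate() produces tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while the main
# thread consumes the streamer and re-emits a growing buffer.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, max_new_tokens=40, do_sample=False, streamer=streamer),
)
thread.start()

buffer = ""
for chunk in streamer:
    buffer += chunk
    print(buffer)  # a Gradio ChatInterface generator would `yield buffer` here
thread.join()
```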