Akjava committed on
Commit 048aade · verified · 1 Parent(s): 8aaa2cd

Update app.py

Files changed (1)
  1. app.py +0 -106
app.py CHANGED
@@ -141,109 +141,3 @@ with gr.Blocks(fill_height=True, css=css) as demo:
 if __name__ == "__main__":
     demo.launch()
 
-
-
-
-import spaces
-import os
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from transformers import TextIteratorStreamer
-from threading import Thread
-
-import gradio as gr
-
-text_generator = None
-is_hugging_face = True
-model_id = "AXCXEPT/phi-4-deepseek-R1K-RL-EZO"
-model_id = "AXCXEPT/phi-4-open-R1-Distill-EZOv1"
-
-huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
-huggingface_token = None
-device = "auto" # torch.device("cuda" if torch.cuda.is_available() else "cpu")
-device = "cuda"
-dtype = torch.bfloat16
-dtype = torch.float16
-
-if not huggingface_token:
-    pass
-    print("no HUGGINGFACE_TOKEN if you need set secret ")
-    #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
-
-
-
-
-
-
-
-
-tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
-
-print(model_id,device,dtype)
-histories = []
-#model = None
-
-
-
-if not is_hugging_face:
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id, token=huggingface_token ,torch_dtype=dtype,device_map=device
-    )
-    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer,torch_dtype=dtype,device_map=device,stream=True ) #pipeline has not to(device)
-
-    if next(model.parameters()).is_cuda:
-        print("The model is on a GPU")
-    else:
-        print("The model is on a CPU")
-
-    #print(f"text_generator.device='{text_generator.device}")
-    if str(text_generator.device).strip() == 'cuda':
-        print("The pipeline is using a GPU")
-    else:
-        print("The pipeline is using a CPU")
-
-    print("initialized")
-
-
-def generate_text(messages):
-    if is_hugging_face:#need everytime initialize for ZeroGPU
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, token=huggingface_token ,torch_dtype=dtype,device_map=device
-        )
-        model.to(device)
-    question = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    question = tokenizer(question, return_tensors="pt").to(device)
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
-    generation_kwargs = dict(question, streamer=streamer, max_new_tokens=200)
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-
-    generated_output = ""
-    thread.start()
-    for new_text in streamer:
-        generated_output += new_text
-        yield generated_output
-generate_text.zerogpu = True
-
-
-
-@spaces.GPU(duration=60)
-def call_generate_text(message, history):
-    # history.append({"role": "user", "content": message})
-    #print(message)
-    #print(history)
-
-    messages = history+[{"role":"user","content":message}]
-    try:
-
-        for text in generate_text(messages):
-            yield text
-    except RuntimeError as e:
-        print(f"An unexpected error occurred: {e}")
-        yield ""
-
-demo = gr.ChatInterface(call_generate_text,type="messages")
-
-#if __name__ == "__main__":
-demo.queue()
-demo.launch()