MadsGalsgaard committed
Commit 783efce · verified · 1 parent: 98f4c1f

Update app.py

Files changed (1): app.py (+311 −139)
app.py CHANGED
@@ -112,153 +112,325 @@
# if __name__ == "__main__":
# demo.launch()

- ### 26 Use a pipeline as a high-level Logic
- import spaces
import os
- import subprocess
- from llama_cpp import Llama
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
- from llama_cpp_agent.providers import LlamaCppPythonProvider
- from llama_cpp_agent.chat_history import BasicChatHistory
- from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
- from huggingface_hub import hf_hub_download
-
- huggingface_token = os.getenv("HF_TOKEN")
-
- # Download the Meta-Llama-3.1-8B-Instruct model
- hf_hub_download(
-     repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-     filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
-     local_dir="./models",
-     token=huggingface_token
- )
-
- llm = None
- llm_model = None
-
- @spaces.GPU(duration=120)
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     model,
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     top_k,
-     repeat_penalty,
):
-     chat_template = MessagesFormatterType.GEMMA_2

-     global llm
-     global llm_model
-
-     # Load model only if it's not already loaded or if a new model is selected
-     if llm is None or llm_model != model:
-         try:
-             llm = Llama(
-                 model_path=f"models/{model}",
-                 flash_attn=True,
-                 n_gpu_layers=81, # Adjust based on available GPU resources
-                 n_batch=1024,
-                 n_ctx=8192,
-             )
-             llm_model = model
-         except Exception as e:
-             return f"Error loading model: {str(e)}"
-
-     provider = LlamaCppPythonProvider(llm)
-
-     agent = LlamaCppAgent(
-         provider,
-         system_prompt=f"{system_message}",
-         predefined_messages_formatter_type=chat_template,
-         debug_output=True
-     )

-     settings = provider.get_provider_default_settings()
-     settings.temperature = temperature
-     settings.top_k = top_k
-     settings.top_p = top_p
-     settings.max_tokens = max_tokens
-     settings.repeat_penalty = repeat_penalty
-     settings.stream = True
-
-     messages = BasicChatHistory()
-
-     # Add user and assistant messages to the history
-     for msn in history:
-         user = {'role': Roles.user, 'content': msn[0]}
-         assistant = {'role': Roles.assistant, 'content': msn[1]}
-         messages.add_message(user)
-         messages.add_message(assistant)

-     # Stream the response
-     try:
-         stream = agent.get_chat_response(
-             message,
-             llm_sampling_settings=settings,
-             chat_history=messages,
-             returns_streaming_generator=True,
-             print_output=False
-         )

-         outputs = ""
-         for output in stream:
-             outputs += output
-             yield outputs
-     except Exception as e:
-         yield f"Error during response generation: {str(e)}"
-
- description = """<p align="center">Using the Meta-Llama-3.1-8B-Instruct Model</p>"""
-
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Dropdown([
-                 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf'
-             ],
-             value="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
-             label="Model"
-         ),
-         gr.Textbox(value="You are a helpful assistant.", label="System message"),
-         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p",
-         ),
-         gr.Slider(
-             minimum=0,
-             maximum=100,
-             value=40,
-             step=1,
-             label="Top-k",
-         ),
-         gr.Slider(
-             minimum=0.0,
-             maximum=2.0,
-             value=1.1,
-             step=0.1,
-             label="Repetition penalty",
-         ),
-     ],
-     retry_btn="Retry",
-     undo_btn="Undo",
-     clear_btn="Clear",
-     submit_btn="Send",
-     title="Chat with Meta-Llama-3.1-8B-Instruct using llama.cpp",
-     description=description,
-     chatbot=gr.Chatbot(
-         scale=1,
-         likeable=False,
-         show_copy_button=True
    )
- )

if __name__ == "__main__":
    demo.launch()
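The hunk above is the file as it stood before this commit: a llama-cpp-python backend that downloaded a GGUF quant of Meta-Llama-3.1-8B-Instruct and streamed replies through llama-cpp-agent. For reference only (this is not part of the commit), here is a minimal sketch of the same streaming idea using plain llama-cpp-python; the model path and sampling values are illustrative assumptions.

# Illustrative sketch, not the committed code: stream tokens from a local GGUF
# model with llama-cpp-python's OpenAI-style chat API.
from llama_cpp import Llama

llm = Llama(
    model_path="models/Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",  # assumed local path
    n_gpu_layers=-1,  # offload all layers when a GPU is available (assumption)
    n_ctx=8192,
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# stream=True yields OpenAI-style chunks; the text lives in choices[0]["delta"].
for chunk in llm.create_chat_completion(messages=messages, stream=True, temperature=0.7):
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)

The hunk below shows the same region after the commit: the llama.cpp path is kept only as commented-out code, and a transformers-based 4-bit pipeline takes its place.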
 
# if __name__ == "__main__":
# demo.launch()

+ ### 26 aug Use a pipeline as a high-level Logic
+ # import spaces
+ # import os
+ # import subprocess
+ # from llama_cpp import Llama
+ # from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ # from llama_cpp_agent.providers import LlamaCppPythonProvider
+ # from llama_cpp_agent.chat_history import BasicChatHistory
+ # from llama_cpp_agent.chat_history.messages import Roles
+ # import gradio as gr
+ # from huggingface_hub import hf_hub_download
+
+ # huggingface_token = os.getenv("HF_TOKEN")
+
+ # # Download the Meta-Llama-3.1-8B-Instruct model
+ # hf_hub_download(
+ #     repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
+ #     filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
+ #     local_dir="./models",
+ #     token=huggingface_token
+ # )
+
+ # llm = None
+ # llm_model = None
+
+ # @spaces.GPU(duration=120)
+ # def respond(
+ #     message,
+ #     history: list[tuple[str, str]],
+ #     model,
+ #     system_message,
+ #     max_tokens,
+ #     temperature,
+ #     top_p,
+ #     top_k,
+ #     repeat_penalty,
+ # ):
+ #     chat_template = MessagesFormatterType.GEMMA_2
+
+ #     global llm
+ #     global llm_model
+
+ #     # Load model only if it's not already loaded or if a new model is selected
+ #     if llm is None or llm_model != model:
+ #         try:
+ #             llm = Llama(
+ #                 model_path=f"models/{model}",
+ #                 flash_attn=True,
+ #                 n_gpu_layers=81, # Adjust based on available GPU resources
+ #                 n_batch=1024,
+ #                 n_ctx=8192,
+ #             )
+ #             llm_model = model
+ #         except Exception as e:
+ #             return f"Error loading model: {str(e)}"
+
+ #     provider = LlamaCppPythonProvider(llm)
+
+ #     agent = LlamaCppAgent(
+ #         provider,
+ #         system_prompt=f"{system_message}",
+ #         predefined_messages_formatter_type=chat_template,
+ #         debug_output=True
+ #     )
+
+ #     settings = provider.get_provider_default_settings()
+ #     settings.temperature = temperature
+ #     settings.top_k = top_k
+ #     settings.top_p = top_p
+ #     settings.max_tokens = max_tokens
+ #     settings.repeat_penalty = repeat_penalty
+ #     settings.stream = True
+
+ #     messages = BasicChatHistory()
+
+ #     # Add user and assistant messages to the history
+ #     for msn in history:
+ #         user = {'role': Roles.user, 'content': msn[0]}
+ #         assistant = {'role': Roles.assistant, 'content': msn[1]}
+ #         messages.add_message(user)
+ #         messages.add_message(assistant)
+
+ #     # Stream the response
+ #     try:
+ #         stream = agent.get_chat_response(
+ #             message,
+ #             llm_sampling_settings=settings,
+ #             chat_history=messages,
+ #             returns_streaming_generator=True,
+ #             print_output=False
+ #         )
+
+ #         outputs = ""
+ #         for output in stream:
+ #             outputs += output
+ #             yield outputs
+ #     except Exception as e:
+ #         yield f"Error during response generation: {str(e)}"
+
+ # description = """<p align="center">Using the Meta-Llama-3.1-8B-Instruct Model</p>"""
+
+ # demo = gr.ChatInterface(
+ #     respond,
+ #     additional_inputs=[
+ #         gr.Dropdown([
+ #                 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf'
+ #             ],
+ #             value="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
+ #             label="Model"
+ #         ),
+ #         gr.Textbox(value="You are a helpful assistant.", label="System message"),
+ #         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+ #         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+ #         gr.Slider(
+ #             minimum=0.1,
+ #             maximum=1.0,
+ #             value=0.95,
+ #             step=0.05,
+ #             label="Top-p",
+ #         ),
+ #         gr.Slider(
+ #             minimum=0,
+ #             maximum=100,
+ #             value=40,
+ #             step=1,
+ #             label="Top-k",
+ #         ),
+ #         gr.Slider(
+ #             minimum=0.0,
+ #             maximum=2.0,
+ #             value=1.1,
+ #             step=0.1,
+ #             label="Repetition penalty",
+ #         ),
+ #     ],
+ #     retry_btn="Retry",
+ #     undo_btn="Undo",
+ #     clear_btn="Clear",
+ #     submit_btn="Send",
+ #     title="Chat with Meta-Llama-3.1-8B-Instruct using llama.cpp",
+ #     description=description,
+ #     chatbot=gr.Chatbot(
+ #         scale=1,
+ #         likeable=False,
+ #         show_copy_button=True
+ #     )
+ # )
+
+ # if __name__ == "__main__":
+ # demo.launch()
+
+
+
+ ####03 3.1 8b
import os
+ import time
+ import spaces
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
import gradio as gr
+ from threading import Thread
+
+ MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+ MODEL = os.environ.get("MODEL_LIST")
+
+ TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"
+
+ PLACEHOLDER = """
+ <center>
+ <p>Hi! How can I help you today?</p>
+ </center>
+ """
+
+
+ CSS = """
+ .duplicate-button {
+     margin: auto !important;
+     color: white !important;
+     background: black !important;
+     border-radius: 100vh !important;
+ }
+ h3 {
+     text-align: center;
+ }
+ """
+
+ device = "cuda" # for GPU usage or "cpu" for CPU usage
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type= "nf4")
+
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     quantization_config=quantization_config)
+
+ @spaces.GPU()
+ def stream_chat(
+     message: str,
+     history: list,
+     system_prompt: str,
+     temperature: float = 0.8,
+     max_new_tokens: int = 1024,
+     top_p: float = 1.0,
+     top_k: int = 20,
+     penalty: float = 1.2,
):
+     print(f'message: {message}')
+     print(f'history: {history}')

+     conversation = [
+         {"role": "system", "content": system_prompt}
+     ]
+     for prompt, answer in history:
+         conversation.extend([
+             {"role": "user", "content": prompt},
+             {"role": "assistant", "content": answer},
+         ])
+
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)

+     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

+     generate_kwargs = dict(
+         input_ids=input_ids,
+         max_new_tokens = max_new_tokens,
+         do_sample = False if temperature == 0 else True,
+         top_p = top_p,
+         top_k = top_k,
+         temperature = temperature,
+         repetition_penalty=penalty,
+         eos_token_id=[128001,128008,128009],
+         streamer=streamer,
+     )
+
+     with torch.no_grad():
+         thread = Thread(target=model.generate, kwargs=generate_kwargs)
+         thread.start()

+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         yield buffer
+
+
+ chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+
+ with gr.Blocks(css=CSS, theme="soft") as demo:
+     gr.HTML(TITLE)
+     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+     gr.ChatInterface(
+         fn=stream_chat,
+         chatbot=chatbot,
+         fill_height=True,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+         additional_inputs=[
+             gr.Textbox(
+                 value="You are a helpful assistant",
+                 label="System Prompt",
+                 render=False,
+             ),
+             gr.Slider(
+                 minimum=0,
+                 maximum=1,
+                 step=0.1,
+                 value=0.8,
+                 label="Temperature",
+                 render=False,
+             ),
+             gr.Slider(
+                 minimum=128,
+                 maximum=8192,
+                 step=1,
+                 value=1024,
+                 label="Max new tokens",
+                 render=False,
+             ),
+             gr.Slider(
+                 minimum=0.0,
+                 maximum=1.0,
+                 step=0.1,
+                 value=1.0,
+                 label="top_p",
+                 render=False,
+             ),
+             gr.Slider(
+                 minimum=1,
+                 maximum=20,
+                 step=1,
+                 value=20,
+                 label="top_k",
+                 render=False,
+             ),
+             gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 step=0.1,
+                 value=1.2,
+                 label="Repetition penalty",
+                 render=False,
+             ),
+         ],
+         examples=[
+             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
+             ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
+             ["Tell me a random fun fact about the Roman Empire."],
+             ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
+         ],
+         cache_examples=False,
    )
+

if __name__ == "__main__":
    demo.launch()
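The new backend loads the model once at import time with bitsandbytes NF4 4-bit quantization and streams generations from a background thread via TextIteratorStreamer. A condensed, self-contained sketch of that path follows; it is an illustration rather than the Space's exact configuration (the committed code reads the model id from a MODEL_LIST environment variable and wraps generation in a ZeroGPU-decorated Gradio callback), and the prompt and sampling values here are assumptions.

# Sketch of the 4-bit NF4 load + threaded streaming pattern used by the new app.py.
import torch
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # assumed; gated repo, needs an HF token

# Quantize weights to 4-bit NF4 with bfloat16 compute, as in the committed config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Build the prompt with the model's chat template.
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello!"},
]
input_ids = tokenizer.apply_chat_template(
    conversation, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# generate() runs in a worker thread and pushes decoded text into the streamer,
# which the main thread consumes incrementally (the Gradio callback yields it).
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=input_ids,
        max_new_tokens=256,  # assumed; the app exposes this as a slider
        do_sample=True,
        temperature=0.8,
        streamer=streamer,
    ),
).start()

for new_text in streamer:
    print(new_text, end="", flush=True)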