MadsGalsgaard commited on
Commit
7c364eb
·
verified ·
1 Parent(s): e97fc71
Files changed (1) hide show
  1. app.py +223 -50
app.py CHANGED
@@ -50,63 +50,236 @@
50
  # demo.launch()
51
 
52
 
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  import gradio as gr
55
- from huggingface_hub import InferenceClient
56
 
 
 
 
 
 
 
 
 
 
 
57
  """
58
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
 
 
 
 
 
 
 
 
 
 
59
  """
60
- client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
61
-
62
- def respond(
63
- message: str,
64
- history: list[tuple[str, str]], # This will not be used
65
- system_message: str,
66
- max_tokens: int,
67
- temperature: float,
68
- top_p: float,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  ):
70
- # Build the messages list
71
- messages = [{"role": "system", "content": system_message}]
72
- messages.append({"role": "user", "content": message})
73
-
74
- response = ""
75
-
76
- try:
77
- # Generate response from the model
78
- for msg in client.chat_completion(
79
- messages=messages,
80
- max_tokens=max_tokens,
81
- stream=True,
82
- temperature=temperature,
83
- top_p=top_p,
84
- ):
85
- if msg.choices[0].delta.content is not None:
86
- token = msg.choices[0].delta.content
87
- response += token
88
- yield response
89
- except Exception as e:
90
- yield f"An error occurred: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- """
93
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
94
- """
95
- demo = gr.ChatInterface(
96
- respond,
97
- additional_inputs=[
98
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
99
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
100
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
101
- gr.Slider(
102
- minimum=0.1,
103
- maximum=1.0,
104
- value=0.95,
105
- step=0.05,
106
- label="Top-p (nucleus sampling)",
107
- ),
108
- ],
109
- )
110
 
111
  if __name__ == "__main__":
112
  demo.launch()
 
50
  # demo.launch()
51
 
52
 
53
+ ## Running smoothly CHATBOT
54
 
55
+ # import gradio as gr
56
+ # from huggingface_hub import InferenceClient
57
+
58
+ # """
59
+ # For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
60
+ # """
61
+ # client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
62
+
63
+ # def respond(
64
+ # message: str,
65
+ # history: list[tuple[str, str]], # This will not be used
66
+ # system_message: str,
67
+ # max_tokens: int,
68
+ # temperature: float,
69
+ # top_p: float,
70
+ # ):
71
+ # # Build the messages list
72
+ # messages = [{"role": "system", "content": system_message}]
73
+ # messages.append({"role": "user", "content": message})
74
+
75
+ # response = ""
76
+
77
+ # try:
78
+ # # Generate response from the model
79
+ # for msg in client.chat_completion(
80
+ # messages=messages,
81
+ # max_tokens=max_tokens,
82
+ # stream=True,
83
+ # temperature=temperature,
84
+ # top_p=top_p,
85
+ # ):
86
+ # if msg.choices[0].delta.content is not None:
87
+ # token = msg.choices[0].delta.content
88
+ # response += token
89
+ # yield response
90
+ # except Exception as e:
91
+ # yield f"An error occurred: {e}"
92
+
93
+ # """
94
+ # For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
95
+ # """
96
+ # demo = gr.ChatInterface(
97
+ # respond,
98
+ # additional_inputs=[
99
+ # gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
100
+ # gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
101
+ # gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
102
+ # gr.Slider(
103
+ # minimum=0.1,
104
+ # maximum=1.0,
105
+ # value=0.95,
106
+ # step=0.05,
107
+ # label="Top-p (nucleus sampling)",
108
+ # ),
109
+ # ],
110
+ # )
111
+
112
+ # if __name__ == "__main__":
113
+ # demo.launch()
114
+
115
+
116
### 20 Aug — transformers-based streaming chat implementation

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
import gradio as gr
from threading import Thread

# Known-good model ids for this Space.
MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
HF_TOKEN = os.environ.get("HF_API_TOKEN", None)
# BUG FIX: default to the first known model when MODEL_ID is unset —
# previously this was None, which made from_pretrained(None) fail at startup.
MODEL = os.environ.get("MODEL_ID", MODEL_LIST[0])

TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"

# Shown in the empty chat area before the first message.
PLACEHOLDER = """
<center>
<p>Hi! How can I help you today?</p>
</center>
"""

# Styling for the duplicate button and centered headings.
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
"""
150
+
151
device = "cuda"  # for GPU usage or "cpu" for CPU usage
# NOTE(review): `device` is not referenced below — placement is handled by
# device_map="auto"; kept for parity with the original script.

# Load the model in 4-bit NF4 with bfloat16 compute to fit on a single GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Tokenizer and quantized model are loaded once at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
165
+
166
@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.8,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    """Stream a chat completion for `message`, given the prior `history`.

    Yields the accumulated response text after each new token so Gradio
    can render the reply incrementally.

    Args:
        message: latest user message.
        history: list of (user, assistant) message pairs from Gradio.
        system_prompt: system message prepended to the conversation.
        temperature: sampling temperature; 0 switches to greedy decoding.
        max_new_tokens: generation budget.
        top_p / top_k / penalty: nucleus, top-k and repetition-penalty knobs.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    # Rebuild the full conversation in chat-template form.
    conversation = [
        {"role": "system", "content": system_prompt}
    ]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
        ])
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=temperature != 0,  # greedy decoding when temperature == 0
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repetition_penalty=penalty,
        eos_token_id=[128001, 128008, 128009],  # Llama-3.1 end-of-turn ids
        streamer=streamer,
    )

    # BUG FIX: grad mode is thread-local in PyTorch, so the original
    # `with torch.no_grad(): thread.start()` did NOT disable gradients in
    # the generation thread. Run generate under no_grad inside the worker.
    def _generate():
        with torch.no_grad():
            model.generate(**generate_kwargs)

    thread = Thread(target=_generate)
    thread.start()

    # Relay tokens from the background generation to the UI as they arrive.
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
215
+
216
+
217
+ chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
218
+
219
+ with gr.Blocks(css=CSS, theme="soft") as demo:
220
+ gr.HTML(TITLE)
221
+ gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
222
+ gr.ChatInterface(
223
+ fn=stream_chat,
224
+ chatbot=chatbot,
225
+ fill_height=True,
226
+ additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
227
+ additional_inputs=[
228
+ gr.Textbox(
229
+ value="You are a helpful assistant",
230
+ label="System Prompt",
231
+ render=False,
232
+ ),
233
+ gr.Slider(
234
+ minimum=0,
235
+ maximum=1,
236
+ step=0.1,
237
+ value=0.8,
238
+ label="Temperature",
239
+ render=False,
240
+ ),
241
+ gr.Slider(
242
+ minimum=128,
243
+ maximum=8192,
244
+ step=1,
245
+ value=1024,
246
+ label="Max new tokens",
247
+ render=False,
248
+ ),
249
+ gr.Slider(
250
+ minimum=0.0,
251
+ maximum=1.0,
252
+ step=0.1,
253
+ value=1.0,
254
+ label="top_p",
255
+ render=False,
256
+ ),
257
+ gr.Slider(
258
+ minimum=1,
259
+ maximum=20,
260
+ step=1,
261
+ value=20,
262
+ label="top_k",
263
+ render=False,
264
+ ),
265
+ gr.Slider(
266
+ minimum=0.0,
267
+ maximum=2.0,
268
+ step=0.1,
269
+ value=1.2,
270
+ label="Repetition penalty",
271
+ render=False,
272
+ ),
273
+ ],
274
+ examples=[
275
+ ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
276
+ ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
277
+ ["Tell me a random fun fact about the Roman Empire."],
278
+ ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
279
+ ],
280
+ cache_examples=False,
281
+ )
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
  if __name__ == "__main__":
285
  demo.launch()