Akjava committed
Commit 801fec7 · verified · 1 Parent(s): e496267

Update app.py

Files changed (1):
  1. app.py +34 -319
app.py CHANGED
@@ -24,254 +24,31 @@ from exception import CustomExceptionHandling
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 os.makedirs("models",exist_ok=True)
-#mtsdurica/madlad400-3b-mt-Q8_0-GGUF
+
 hf_hub_download(
     repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
     filename="madlad400-3b-mt-q8_0.gguf",
     local_dir="./models",
 )

-# Define the prompt markers for Gemma 3
-gemma_3_prompt_markers = {
-    Roles.system: PromptMarkers("", "\n"),  # System prompt should be included within user message
-    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
-    Roles.assistant: PromptMarkers("<start_of_turn>model\n", "<end_of_turn>\n"),
-    Roles.tool: PromptMarkers("", ""),  # If you need tool support
-}
-
-# Create the formatter
-gemma_3_formatter = MessagesFormatter(
-    pre_prompt="",  # No pre-prompt
-    prompt_markers=gemma_3_prompt_markers,
-    include_sys_prompt_in_first_user_message=True,  # Include system prompt in first user message
-    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
-    strip_prompt=False,  # Don't strip whitespace from the prompt
-    bos_token="<bos>",  # Beginning of sequence token for Gemma 3
-    eos_token="<eos>",  # End of sequence token for Gemma 3
-)
-
-# Set the title and description
-title = "Gemma Llama.cpp"
-description = """Gemma 3 is a family of lightweight, multimodal open models that offers advanced capabilities like large context windows and multilingual support, enabling diverse applications on various devices."""
+# Set the title and description
+title = "madlad400-3b-mt Llama.cpp"
+description = """
+I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5); I'm not sure the current llama-cpp-python supports T5.
+
+[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF) [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp) [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
+"""

-llm = None
-llm_model = None
+llama = None

 import ctypes
 import os
 import multiprocessing

 import llama_cpp

-def low_level():
-
-
-    llama_cpp.llama_backend_init(numa=False)
-
-    N_THREADS = multiprocessing.cpu_count()
-    MODEL_PATH = "models/madlad400-3b-mt-q8_0.gguf"
-
-    prompt = b"translate English to German: The house is wonderful."
-
-    lparams = llama_cpp.llama_model_default_params()
-    model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-
-    vocab = llama_cpp.llama_model_get_vocab(model)
-
-    cparams = llama_cpp.llama_context_default_params()
-    cparams.no_perf = False
-    ctx = llama_cpp.llama_init_from_model(model, cparams)
-
-    sparams = llama_cpp.llama_sampler_chain_default_params()
-    smpl = llama_cpp.llama_sampler_chain_init(sparams)
-    llama_cpp.llama_sampler_chain_add(smpl, llama_cpp.llama_sampler_init_greedy())
-
-    n_past = 0
-
-    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-
-    n_of_tok = llama_cpp.llama_tokenize(
-        vocab,
-        prompt,
-        len(prompt),
-        embd_inp,
-        len(embd_inp),
-        True,
-        True,
-    )
-
-    embd_inp = embd_inp[:n_of_tok]
-
-    n_ctx = llama_cpp.llama_n_ctx(ctx)
-
-    n_predict = 20
-    n_predict = min(n_predict, n_ctx - len(embd_inp))
-
-    input_consumed = 0
-    input_noecho = False
-
-    remaining_tokens = n_predict
-
-    embd = []
-    last_n_size = 64
-    last_n_tokens_data = [0] * last_n_size
-    n_batch = 24
-    last_n_repeat = 64
-    repeat_penalty = 1
-    frequency_penalty = 0.0
-    presence_penalty = 0.0
-
-    batch = llama_cpp.llama_batch_init(n_batch, 0, 1)
-
-    # prepare batch for encoding containing the prompt
-    batch.n_tokens = len(embd_inp)
-    for i in range(batch.n_tokens):
-        batch.token[i] = embd_inp[i]
-        batch.pos[i] = i
-        batch.n_seq_id[i] = 1
-        batch.seq_id[i][0] = 0
-        batch.logits[i] = False
-
-    llama_cpp.llama_encode(
-        ctx,
-        batch
-    )
-
-    # now overwrite embd_inp so batch for decoding will initially contain only
-    # a single token with id acquired from llama_model_decoder_start_token(model)
-    embd_inp = [llama_cpp.llama_model_decoder_start_token(model)]
-
-    while remaining_tokens > 0:
-        if len(embd) > 0:
-
-            batch.n_tokens = len(embd)
-            for i in range(batch.n_tokens):
-                batch.token[i] = embd[i]
-                batch.pos[i] = n_past + i
-                batch.n_seq_id[i] = 1
-                batch.seq_id[i][0] = 0
-                batch.logits[i] = i == batch.n_tokens - 1
-
-            llama_cpp.llama_decode(
-                ctx,
-                batch
-            )
-
-        n_past += len(embd)
-        embd = []
-        if len(embd_inp) <= input_consumed:
-            id = llama_cpp.llama_sampler_sample(smpl, ctx, -1)
-
-            last_n_tokens_data = last_n_tokens_data[1:] + [id]
-            embd.append(id)
-            input_noecho = False
-            remaining_tokens -= 1
-        else:
-            while len(embd_inp) > input_consumed:
-                embd.append(embd_inp[input_consumed])
-                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
-                input_consumed += 1
-                if len(embd) >= n_batch:
-                    break
-        if not input_noecho:
-            for id in embd:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = llama_cpp.llama_token_to_piece(
-                    vocab, llama_cpp.llama_token(id), buffer, size, 0, True
-                )
-                assert n <= size
-                print(
-                    buffer[:n].decode("utf-8"),
-                    end="",
-                    flush=True,
-                )
-
-        if len(embd) > 0 and embd[-1] in [llama_cpp.llama_token_eos(vocab), llama_cpp.llama_token_eot(vocab)]:
-            break
-
-    print()
-
-
-def trans(text):
-    #test()
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
-    tokens = llama.tokenize(b"translate English to German: The house is wonderful.")
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        print(llama.detokenize([token]))
-        if token == llama.token_eos():
-            break
-
-    return None
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}"
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Get the BOS token and check it
-    bos_token = llm.token_bos()
-    print("BOS Token:", bos_token)
-    initial_tokens = [bos_token]
-    initial_tokens = [1]
-    print("Initial Tokens:", initial_tokens)
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}".encode('utf-8')
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Use the BOS token (assuming a decoder-only model)
-    initial_tokens = [llm.token_bos()]
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-
-    input_text = f"<2ja>{text}".encode('utf-8')
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-    initial_tokens = [llm.decoder_start_token()]
-    print("Initial Tokens:", initial_tokens)
-    return text
-    llama = llm
-    text = f"<2ja>{text}".encode()
-    tokens = llama.tokenize(text)
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    buf = ""
-    for token in llama.generate(tokens, top_k=0, top_p=0.95, temp=0, repeat_penalty=1.0):
-        buf += llama.detokenize([token]).decode()
-        if token == llama.token_eos():
-            break
-    return buf
-
 def respond(
     message: str,
     history: List[Tuple[str, str]],
@@ -283,24 +60,6 @@ def respond(
     top_k: int,
     repeat_penalty: float,
 ):
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
-        n_gpu_layers=0,
-        n_batch=16,
-        n_ctx=512,
-        n_threads=2,
-        n_threads_batch=8,)
-    #tokens = llama.tokenize(f"<2ja>{message}")#
-    tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    outputs =""
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        outputs+= llama.detokenize([token]).decode()
-        yield outputs
-        if token == llama.token_eos():
-            break
-    return outputs
-
     """
     Respond to a message using the Gemma3 model via Llama.cpp.

@@ -319,79 +78,35 @@ def respond(
         str: The response to the message.
     """
     try:
-        # Load the global variables
-        global llm
-        global llm_model
-
-        #llama = Llama("madlad400-3b-mt-q8_0.gguf")
-        # Load the model
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=f"models/{model}",
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=8,
-                n_ctx=2048,
-                n_threads=8,
-                n_threads_batch=8,
-            )
-            llm_model = model
-
-        trans(message)
-        #yield "done"
-
-        provider = LlamaCppPythonProvider(llm)
-
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            system_prompt=f"{system_message}",
-            # predefined_messages_formatter_type=GEMMA_2,
-            custom_messages_formatter=gemma_3_formatter,
-            debug_output=True,
-        )
-
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-
-        messages = BasicChatHistory()
-
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-
-        # Get the response stream
-        stream = agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-
-        # Log the success
-        logging.info("Response stream generated successfully")
-
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            #yield outputs
-
-        # Handle exceptions that may occur during the process
+        global llama
+        if llama == None:
+            llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
+                n_gpu_layers=0,
+                n_batch=32,
+                n_ctx=512,
+                n_threads=2,
+                n_threads_batch=16)
+
+        tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
+        llama.encode(tokens)
+        tokens = [llama.decoder_start_token()]
+        outputs =""
+        for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+            outputs+= llama.detokenize([token]).decode()
+            yield outputs
+            if token == llama.token_eos():
+                break
+        return outputs
     except Exception as e:
         # Custom exception handling
         raise CustomExceptionHandling(e, sys) from e

+    return None
+

 # Create a chat interface
 demo = gr.ChatInterface(
@@ -413,7 +128,7 @@ demo = gr.ChatInterface(
             value="You are a helpful assistant.",
             label="System Prompt",
             info="Define the AI assistant's personality and behavior",
-            lines=2,
+            lines=2,visible=False
         ),
         gr.Slider(
             minimum=512,
 
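For reference, the translation flow that this commit switches respond() to can also be run on its own. The sketch below is illustrative only: it assumes the [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5) build of llama-cpp-python mentioned in the description, which exposes the encoder-decoder calls used above (`Llama.encode()` and `Llama.decoder_start_token()`); the model path and the `<2ja>` (translate-to-Japanese) tag are the ones app.py uses, and the `translate()` helper is hypothetical.

```python
# Minimal sketch of the MADLAD-400 translation loop used in app.py.
# Assumes the fairydreaming/t5 branch of llama-cpp-python, which adds
# Llama.encode() and Llama.decoder_start_token() for encoder-decoder models.
from llama_cpp import Llama

llama = Llama(
    "models/madlad400-3b-mt-q8_0.gguf",  # downloaded by app.py via hf_hub_download
    n_gpu_layers=0,
    n_ctx=512,
)

def translate(text: str, lang_tag: str = "<2ja>") -> str:
    # MADLAD-400 takes the target language as a tag prefixed to the source text.
    prompt_tokens = llama.tokenize(f"{lang_tag}{text}".encode("utf-8"))
    llama.encode(prompt_tokens)             # run the T5 encoder over the tagged prompt
    tokens = [llama.decoder_start_token()]  # decoding starts from the decoder start token
    output = ""
    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=0.0, repeat_penalty=1.0):
        if token == llama.token_eos():
            break
        output += llama.detokenize([token]).decode("utf-8", errors="ignore")
    return output

print(translate("The house is wonderful."))
```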
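The new respond() streams by yielding the accumulated string, and gr.ChatInterface treats a generator handler as a streaming response. A stripped-down illustration of that wiring follows; the echo handler is a placeholder, not the app's actual respond().

```python
# Minimal example of streaming a generator through gr.ChatInterface,
# mirroring how app.py yields partial translations from respond().
import gradio as gr

def echo_stream(message, history):
    # Yielding progressively longer strings makes Gradio render the reply incrementally.
    partial = ""
    for ch in message:
        partial += ch
        yield partial

demo = gr.ChatInterface(echo_stream, title="madlad400-3b-mt Llama.cpp")

if __name__ == "__main__":
    demo.launch()
```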