Update app.py
app.py
CHANGED
@@ -24,254 +24,31 @@ from exception import CustomExceptionHandling
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 os.makedirs("models",exist_ok=True)
-
+
 hf_hub_download(
     repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
     filename="madlad400-3b-mt-q8_0.gguf",
     local_dir="./models",
 )
 
-
-gemma_3_prompt_markers = {
-    Roles.system: PromptMarkers("", "\n"),  # System prompt should be included within user message
-    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
-    Roles.assistant: PromptMarkers("<start_of_turn>model\n", "<end_of_turn>\n"),
-    Roles.tool: PromptMarkers("", ""),  # If you need tool support
-}
-
-# Create the formatter
-gemma_3_formatter = MessagesFormatter(
-    pre_prompt="",  # No pre-prompt
-    prompt_markers=gemma_3_prompt_markers,
-    include_sys_prompt_in_first_user_message=True,  # Include system prompt in first user message
-    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
-    strip_prompt=False,  # Don't strip whitespace from the prompt
-    bos_token="<bos>",  # Beginning of sequence token for Gemma 3
-    eos_token="<eos>",  # End of sequence token for Gemma 3
-)
-
-# Set the title and description
-title = "Gemma Llama.cpp"
-description = """Gemma 3 is a family of lightweight, multimodal open models that offers advanced capabilities like large context windows and multilingual support, enabling diverse applications on various devices."""
-
-llm = None
-llm_model = None
-
+# Set the title and description
+title = "madlad400-3b-mt Llama.cpp"
+description = """
+I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5); I'm not sure the current llama-cpp-python supports T5.
+
+[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF) [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp) [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
+"""
+
+llama = None
+
 import ctypes
 import os
 import multiprocessing
 
 import llama_cpp
 
-def low_level():
-    llama_cpp.llama_backend_init(numa=False)
-
-    N_THREADS = multiprocessing.cpu_count()
-    MODEL_PATH = "models/madlad400-3b-mt-q8_0.gguf"
-
-    prompt = b"translate English to German: The house is wonderful."
-
-    lparams = llama_cpp.llama_model_default_params()
-    model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-
-    vocab = llama_cpp.llama_model_get_vocab(model)
-
-    cparams = llama_cpp.llama_context_default_params()
-    cparams.no_perf = False
-    ctx = llama_cpp.llama_init_from_model(model, cparams)
-
-    sparams = llama_cpp.llama_sampler_chain_default_params()
-    smpl = llama_cpp.llama_sampler_chain_init(sparams)
-    llama_cpp.llama_sampler_chain_add(smpl, llama_cpp.llama_sampler_init_greedy())
-
-    n_past = 0
-
-    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-
-    n_of_tok = llama_cpp.llama_tokenize(
-        vocab,
-        prompt,
-        len(prompt),
-        embd_inp,
-        len(embd_inp),
-        True,
-        True,
-    )
-
-    embd_inp = embd_inp[:n_of_tok]
-
-    n_ctx = llama_cpp.llama_n_ctx(ctx)
-
-    n_predict = 20
-    n_predict = min(n_predict, n_ctx - len(embd_inp))
-
-    input_consumed = 0
-    input_noecho = False
-
-    remaining_tokens = n_predict
-
-    embd = []
-    last_n_size = 64
-    last_n_tokens_data = [0] * last_n_size
-    n_batch = 24
-    last_n_repeat = 64
-    repeat_penalty = 1
-    frequency_penalty = 0.0
-    presence_penalty = 0.0
-
-    batch = llama_cpp.llama_batch_init(n_batch, 0, 1)
-
-    # prepare batch for encoding containing the prompt
-    batch.n_tokens = len(embd_inp)
-    for i in range(batch.n_tokens):
-        batch.token[i] = embd_inp[i]
-        batch.pos[i] = i
-        batch.n_seq_id[i] = 1
-        batch.seq_id[i][0] = 0
-        batch.logits[i] = False
-
-    llama_cpp.llama_encode(
-        ctx,
-        batch
-    )
-
-    # now overwrite embd_inp so batch for decoding will initially contain only
-    # a single token with id acquired from llama_model_decoder_start_token(model)
-    embd_inp = [llama_cpp.llama_model_decoder_start_token(model)]
-
-    while remaining_tokens > 0:
-        if len(embd) > 0:
-            batch.n_tokens = len(embd)
-            for i in range(batch.n_tokens):
-                batch.token[i] = embd[i]
-                batch.pos[i] = n_past + i
-                batch.n_seq_id[i] = 1
-                batch.seq_id[i][0] = 0
-                batch.logits[i] = i == batch.n_tokens - 1
-
-            llama_cpp.llama_decode(
-                ctx,
-                batch
-            )
-
-        n_past += len(embd)
-        embd = []
-        if len(embd_inp) <= input_consumed:
-            id = llama_cpp.llama_sampler_sample(smpl, ctx, -1)
-
-            last_n_tokens_data = last_n_tokens_data[1:] + [id]
-            embd.append(id)
-            input_noecho = False
-            remaining_tokens -= 1
-        else:
-            while len(embd_inp) > input_consumed:
-                embd.append(embd_inp[input_consumed])
-                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
-                input_consumed += 1
-                if len(embd) >= n_batch:
-                    break
-        if not input_noecho:
-            for id in embd:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = llama_cpp.llama_token_to_piece(
-                    vocab, llama_cpp.llama_token(id), buffer, size, 0, True
-                )
-                assert n <= size
-                print(
-                    buffer[:n].decode("utf-8"),
-                    end="",
-                    flush=True,
-                )
-
-        if len(embd) > 0 and embd[-1] in [llama_cpp.llama_token_eos(vocab), llama_cpp.llama_token_eot(vocab)]:
-            break
-
-    print()
-
-
-def trans(text):
-    #test()
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
-    tokens = llama.tokenize(b"translate English to German: The house is wonderful.")
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        print(llama.detokenize([token]))
-        if token == llama.token_eos():
-            break
-
-    return None
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}"
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Get the BOS token and check it
-    bos_token = llm.token_bos()
-    print("BOS Token:", bos_token)
-    initial_tokens = [bos_token]
-    initial_tokens = [1]
-    print("Initial Tokens:", initial_tokens)
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}".encode('utf-8')
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Use the BOS token (assuming a decoder-only model)
-    initial_tokens = [llm.token_bos()]
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-
-    input_text = f"<2ja>{text}".encode('utf-8')
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-    initial_tokens = [llm.decoder_start_token()]
-    print("Initial Tokens:", initial_tokens)
-    return text
-    llama = llm
-    text = f"<2ja>{text}".encode()
-    tokens = llama.tokenize(text)
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    buf = ""
-    for token in llama.generate(tokens, top_k=0, top_p=0.95, temp=0, repeat_penalty=1.0):
-        buf += llama.detokenize([token]).decode()
-        if token == llama.token_eos():
-            break
-    return buf
-
 def respond(
     message: str,
     history: List[Tuple[str, str]],
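
Note on the hunk above: both the deleted helpers and the new code drive madlad400-3b-mt as an encoder-decoder (T5-style) model, which is why the prompt is run through encode() and generation is seeded with decoder_start_token() instead of a chat template. A minimal sketch of that translate loop, assuming the fairydreaming T5 branch of llama-cpp-python linked in the new description (mainline llama-cpp-python may not expose these methods):

    from llama_cpp import Llama

    llama = Llama("models/madlad400-3b-mt-q8_0.gguf", n_ctx=512)

    # madlad400 selects the target language with a tag such as <2ja> placed before the source text
    prompt = "<2ja>The house is wonderful.".encode("utf-8")

    llama.encode(llama.tokenize(prompt))    # run the encoder over the tagged prompt
    tokens = [llama.decoder_start_token()]  # seed the decoder

    translation = ""
    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
        if token == llama.token_eos():
            break
        translation += llama.detokenize([token]).decode("utf-8", errors="ignore")
    print(translation)
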
@@ -283,24 +60,6 @@ def respond(
     top_k: int,
     repeat_penalty: float,
 ):
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
-        n_gpu_layers=0,
-        n_batch=16,
-        n_ctx=512,
-        n_threads=2,
-        n_threads_batch=8,)
-    #tokens = llama.tokenize(f"<2ja>{message}")#
-    tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    outputs =""
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        outputs+= llama.detokenize([token]).decode()
-        yield outputs
-        if token == llama.token_eos():
-            break
-    return outputs
-
     """
     Respond to a message using the Gemma3 model via Llama.cpp.
 
@@ -319,79 +78,35 @@ def respond(
         str: The response to the message.
     """
     try:
-
-        #yield "done"
-
-        provider = LlamaCppPythonProvider(llm)
-
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            system_prompt=f"{system_message}",
-            # predefined_messages_formatter_type=GEMMA_2,
-            custom_messages_formatter=gemma_3_formatter,
-            debug_output=True,
-        )
-
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-
-        messages = BasicChatHistory()
-
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-
-        # Get the response stream
-        stream = agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-
-        # Log the success
-        logging.info("Response stream generated successfully")
-
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            #yield outputs
-
-    # Handle exceptions that may occur during the process
+        global llama
+        if llama == None:
+            llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
+                n_gpu_layers=0,
+                n_batch=32,
+                n_ctx=512,
+                n_threads=2,
+                n_threads_batch=16)
+
+        tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
+        llama.encode(tokens)
+        tokens = [llama.decoder_start_token()]
+        outputs =""
+        for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+            outputs+= llama.detokenize([token]).decode()
+            yield outputs
+            if token == llama.token_eos():
+                break
+        return outputs
     except Exception as e:
         # Custom exception handling
         raise CustomExceptionHandling(e, sys) from e
 
+    return None
+
+
 
 # Create a chat interface
 demo = gr.ChatInterface(
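
A note on the new respond() added in this hunk: it is a generator that yields the translation accumulated so far after each token, which is the contract gr.ChatInterface uses to stream partial replies. A toy, self-contained illustration of that pattern (echo_stream is a hypothetical stand-in, not part of app.py):

    import gradio as gr

    def echo_stream(message, history):
        out = ""
        for ch in message:
            out += ch
            yield out  # each yielded value replaces the shown reply, so it appears to stream

    demo = gr.ChatInterface(echo_stream)

    if __name__ == "__main__":
        demo.launch()
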
@@ -413,7 +128,7 @@ demo = gr.ChatInterface(
             value="You are a helpful assistant.",
             label="System Prompt",
             info="Define the AI assistant's personality and behavior",
-            lines=2,
+            lines=2,visible=False
         ),
         gr.Slider(
             minimum=512,