Akjava committed
Commit 801fec7 · verified · 1 Parent(s): e496267

Update app.py

Files changed (1):
  1. app.py +34 -319
app.py CHANGED
@@ -24,254 +24,31 @@ from exception import CustomExceptionHandling
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 os.makedirs("models",exist_ok=True)
-#mtsdurica/madlad400-3b-mt-Q8_0-GGUF
+
 hf_hub_download(
     repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
     filename="madlad400-3b-mt-q8_0.gguf",
     local_dir="./models",
 )

-# Define the prompt markers for Gemma 3
-gemma_3_prompt_markers = {
-    Roles.system: PromptMarkers("", "\n"),  # System prompt should be included within user message
-    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
-    Roles.assistant: PromptMarkers("<start_of_turn>model\n", "<end_of_turn>\n"),
-    Roles.tool: PromptMarkers("", ""),  # If you need tool support
-}
-
-# Create the formatter
-gemma_3_formatter = MessagesFormatter(
-    pre_prompt="",  # No pre-prompt
-    prompt_markers=gemma_3_prompt_markers,
-    include_sys_prompt_in_first_user_message=True,  # Include system prompt in first user message
-    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
-    strip_prompt=False,  # Don't strip whitespace from the prompt
-    bos_token="<bos>",  # Beginning of sequence token for Gemma 3
-    eos_token="<eos>",  # End of sequence token for Gemma 3
-)
-
-# Set the title and description
-title = "Gemma Llama.cpp"
-description = """Gemma 3 is a family of lightweight, multimodal open models that offers advanced capabilities like large context windows and multilingual support, enabling diverse applications on various devices."""
+# Set the title and description
+title = "madlad400-3b-mt Llama.cpp"
+description = """
+I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5); I'm not sure the current llama-cpp-python supports T5.
+
+[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF) [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp) [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
+"""

-llm = None
-llm_model = None
+llama = None

 import ctypes
 import os
 import multiprocessing

 import llama_cpp

-def low_level():
-
-
-    llama_cpp.llama_backend_init(numa=False)
-
-    N_THREADS = multiprocessing.cpu_count()
-    MODEL_PATH = "models/madlad400-3b-mt-q8_0.gguf"
-
-    prompt = b"translate English to German: The house is wonderful."
-
-    lparams = llama_cpp.llama_model_default_params()
-    model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-
-    vocab = llama_cpp.llama_model_get_vocab(model)
-
-    cparams = llama_cpp.llama_context_default_params()
-    cparams.no_perf = False
-    ctx = llama_cpp.llama_init_from_model(model, cparams)
-
-    sparams = llama_cpp.llama_sampler_chain_default_params()
-    smpl = llama_cpp.llama_sampler_chain_init(sparams)
-    llama_cpp.llama_sampler_chain_add(smpl, llama_cpp.llama_sampler_init_greedy())
-
-    n_past = 0
-
-    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-
-    n_of_tok = llama_cpp.llama_tokenize(
-        vocab,
-        prompt,
-        len(prompt),
-        embd_inp,
-        len(embd_inp),
-        True,
-        True,
-    )
-
-    embd_inp = embd_inp[:n_of_tok]
-
-    n_ctx = llama_cpp.llama_n_ctx(ctx)
-
-    n_predict = 20
-    n_predict = min(n_predict, n_ctx - len(embd_inp))
-
-    input_consumed = 0
-    input_noecho = False
-
-    remaining_tokens = n_predict
-
-    embd = []
-    last_n_size = 64
-    last_n_tokens_data = [0] * last_n_size
-    n_batch = 24
-    last_n_repeat = 64
-    repeat_penalty = 1
-    frequency_penalty = 0.0
-    presence_penalty = 0.0
-
-    batch = llama_cpp.llama_batch_init(n_batch, 0, 1)
-
-    # prepare batch for encoding containing the prompt
-    batch.n_tokens = len(embd_inp)
-    for i in range(batch.n_tokens):
-        batch.token[i] = embd_inp[i]
-        batch.pos[i] = i
-        batch.n_seq_id[i] = 1
-        batch.seq_id[i][0] = 0
-        batch.logits[i] = False
-
-    llama_cpp.llama_encode(
-        ctx,
-        batch
-    )
-
-    # now overwrite embd_inp so batch for decoding will initially contain only
-    # a single token with id acquired from llama_model_decoder_start_token(model)
-    embd_inp = [llama_cpp.llama_model_decoder_start_token(model)]
-
-    while remaining_tokens > 0:
-        if len(embd) > 0:
-
-            batch.n_tokens = len(embd)
-            for i in range(batch.n_tokens):
-                batch.token[i] = embd[i]
-                batch.pos[i] = n_past + i
-                batch.n_seq_id[i] = 1
-                batch.seq_id[i][0] = 0
-                batch.logits[i] = i == batch.n_tokens - 1
-
-            llama_cpp.llama_decode(
-                ctx,
-                batch
-            )
-
-        n_past += len(embd)
-        embd = []
-        if len(embd_inp) <= input_consumed:
-            id = llama_cpp.llama_sampler_sample(smpl, ctx, -1)
-
-            last_n_tokens_data = last_n_tokens_data[1:] + [id]
-            embd.append(id)
-            input_noecho = False
-            remaining_tokens -= 1
-        else:
-            while len(embd_inp) > input_consumed:
-                embd.append(embd_inp[input_consumed])
-                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
-                input_consumed += 1
-                if len(embd) >= n_batch:
-                    break
-        if not input_noecho:
-            for id in embd:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = llama_cpp.llama_token_to_piece(
-                    vocab, llama_cpp.llama_token(id), buffer, size, 0, True
-                )
-                assert n <= size
-                print(
-                    buffer[:n].decode("utf-8"),
-                    end="",
-                    flush=True,
-                )
-
-        if len(embd) > 0 and embd[-1] in [llama_cpp.llama_token_eos(vocab), llama_cpp.llama_token_eot(vocab)]:
-            break
-
-    print()
-
-
-def trans(text):
-    #test()
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
-    tokens = llama.tokenize(b"translate English to German: The house is wonderful.")
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        print(llama.detokenize([token]))
-        if token == llama.token_eos():
-            break
-
-    return None
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}"
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Get the BOS token and check it
-    bos_token = llm.token_bos()
-    print("BOS Token:", bos_token)
-    initial_tokens = [bos_token]
-    initial_tokens = [1]
-    print("Initial Tokens:", initial_tokens)
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-    # Add the language tag to the text and convert it to a byte string
-    input_text = f"<2ja>{text}".encode('utf-8')
-
-    # Tokenize
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-
-    # Use the BOS token (assuming a decoder-only model)
-    initial_tokens = [llm.token_bos()]
-
-    # Generate
-    buf = ""
-    for token in llm.generate(initial_tokens, top_p=0.95, temp=0.0, repeat_penalty=1.0):
-        decoded = llm.detokenize([token]).decode('utf-8', errors='ignore')
-        buf += decoded
-        if token == llm.token_eos():
-            break
-
-    return buf
-
-
-    input_text = f"<2ja>{text}".encode('utf-8')
-    tokens = llm.tokenize(input_text)
-    print("Tokens:", tokens)
-    initial_tokens = [llm.decoder_start_token()]
-    print("Initial Tokens:", initial_tokens)
-    return text
-    llama = llm
-    text = f"<2ja>{text}".encode()
-    tokens = llama.tokenize(text)
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    buf = ""
-    for token in llama.generate(tokens, top_k=0, top_p=0.95, temp=0, repeat_penalty=1.0):
-        buf += llama.detokenize([token]).decode()
-        if token == llama.token_eos():
-            break
-    return buf
-
 def respond(
     message: str,
     history: List[Tuple[str, str]],
@@ -283,24 +60,6 @@ def respond(
     top_k: int,
     repeat_penalty: float,
 ):
-    llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
-        n_gpu_layers=0,
-        n_batch=16,
-        n_ctx=512,
-        n_threads=2,
-        n_threads_batch=8,)
-    #tokens = llama.tokenize(f"<2ja>{message}")#
-    tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
-    llama.encode(tokens)
-    tokens = [llama.decoder_start_token()]
-    outputs =""
-    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
-        outputs+= llama.detokenize([token]).decode()
-        yield outputs
-        if token == llama.token_eos():
-            break
-    return outputs
-
     """
     Respond to a message using the Gemma3 model via Llama.cpp.

@@ -319,79 +78,35 @@ def respond(
         str: The response to the message.
     """
     try:
-        # Load the global variables
-        global llm
-        global llm_model
-
-        #llama = Llama("madlad400-3b-mt-q8_0.gguf")
-        # Load the model
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=f"models/{model}",
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=8,
-                n_ctx=2048,
-                n_threads=8,
-                n_threads_batch=8,
-            )
-            llm_model = model
-
-        trans(message)
-        #yield "done"
-
-        provider = LlamaCppPythonProvider(llm)
-
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            system_prompt=f"{system_message}",
-            # predefined_messages_formatter_type=GEMMA_2,
-            custom_messages_formatter=gemma_3_formatter,
-            debug_output=True,
-        )
-
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-
-        messages = BasicChatHistory()
-
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-
-        # Get the response stream
-        stream = agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-
-        # Log the success
-        logging.info("Response stream generated successfully")
-
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            #yield outputs
-
-        # Handle exceptions that may occur during the process
+        global llama
+        if llama == None:
+            llama = Llama("models/madlad400-3b-mt-q8_0.gguf",flash_attn=False,
+                n_gpu_layers=0,
+                n_batch=32,
+                n_ctx=512,
+                n_threads=2,
+                n_threads_batch=16)
+
+        tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
+        llama.encode(tokens)
+        tokens = [llama.decoder_start_token()]
+        outputs =""
+        for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+            outputs+= llama.detokenize([token]).decode()
+            yield outputs
+            if token == llama.token_eos():
+                break
+        return outputs
     except Exception as e:
         # Custom exception handling
         raise CustomExceptionHandling(e, sys) from e

+    return None
+

 # Create a chat interface
 demo = gr.ChatInterface(
@@ -413,7 +128,7 @@ demo = gr.ChatInterface(
             value="You are a helpful assistant.",
             label="System Prompt",
             info="Define the AI assistant's personality and behavior",
-            lines=2,
+            lines=2,visible=False
         ),
         gr.Slider(
             minimum=512,
 
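For reference, the translation flow that this commit switches respond() to can also be run on its own. The sketch below is illustrative only: it assumes the [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5) build of llama-cpp-python mentioned in the description, which exposes the encoder-decoder calls used above (`Llama.encode()` and `Llama.decoder_start_token()`); the model path and the `<2ja>` (translate-to-Japanese) tag are the ones app.py uses, and the `translate()` helper is hypothetical.

```python
# Minimal sketch of the MADLAD-400 translation loop used in app.py.
# Assumes the fairydreaming/t5 branch of llama-cpp-python, which adds
# Llama.encode() and Llama.decoder_start_token() for encoder-decoder models.
from llama_cpp import Llama

llama = Llama(
    "models/madlad400-3b-mt-q8_0.gguf",  # downloaded by app.py via hf_hub_download
    n_gpu_layers=0,
    n_ctx=512,
)

def translate(text: str, lang_tag: str = "<2ja>") -> str:
    # MADLAD-400 takes the target language as a tag prefixed to the source text.
    prompt_tokens = llama.tokenize(f"{lang_tag}{text}".encode("utf-8"))
    llama.encode(prompt_tokens)             # run the T5 encoder over the tagged prompt
    tokens = [llama.decoder_start_token()]  # decoding starts from the decoder start token
    output = ""
    for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=0.0, repeat_penalty=1.0):
        if token == llama.token_eos():
            break
        output += llama.detokenize([token]).decode("utf-8", errors="ignore")
    return output

print(translate("The house is wonderful."))
```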
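The new respond() streams by yielding the accumulated string, and gr.ChatInterface treats a generator handler as a streaming response. A stripped-down illustration of that wiring follows; the echo handler is a placeholder, not the app's actual respond().

```python
# Minimal example of streaming a generator through gr.ChatInterface,
# mirroring how app.py yields partial translations from respond().
import gradio as gr

def echo_stream(message, history):
    # Yielding progressively longer strings makes Gradio render the reply incrementally.
    partial = ""
    for ch in message:
        partial += ch
        yield partial

demo = gr.ChatInterface(echo_stream, title="madlad400-3b-mt Llama.cpp")

if __name__ == "__main__":
    demo.launch()
```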