llamacpp-flan-t5-large-grammar-synthesis

Sleeping

llamacpp-flan-t5-large-grammar-synthesis

File size: 5,723 Bytes

26a8369
 
 
 
eefa060
26a8369
 
 
ac18c71
26a8369
 
 
 
 
4ff0368
 
26a8369
 
 
 
 
 
 
 
c3b8348
358544d
801fec7
3debf7a
 
26a8369
0430bc7
 
26a8369
 
 
3debf7a
 
801fec7
1074fa0
801fec7
3debf7a
4ff0368
2efeef1
801fec7
4ff0368
 
801fec7
26a8369
 
9c2d729
 
 
 
 
 
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a66261e
 
26a8369
801fec7
 
0430bc7
3debf7a
801fec7
5e8e544
801fec7
8baeec8
3debf7a
33572bc
801fec7
 
 
0430bc7
8ce032d
57c0898
8ce032d
 
 
 
0430bc7
663a9c3
26a8369
 
 
663a9c3
26a8369
801fec7
 
 
 
 
26a8369
 
 
 
2efeef1
26a8369
 
 
 
 
 
1074fa0
26a8369
1074fa0
26a8369
 
8c8c1ee
26a8369
c3b8348
d4681be
c3b8348
 
801fec7
c3b8348
26a8369
 
95cd28c
 
26a8369
 
3debf7a
26a8369
 
 
 
3debf7a
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c2d729

# Importing required libraries
import warnings
warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from llama_cpp import Llama,llama_model_decoder_start_token
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling


# Download gguf model files
# HUGGINGFACE_TOKEN is optional; when it is set it is forwarded to the Hub so
# that the download is authenticated. An unset or empty variable becomes None,
# which lets huggingface_hub fall back to its default behaviour.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)

# The file lands at ./models/ggml-model-Q6_K.gguf, the path respond() loads.
hf_hub_download(
    repo_id="pszemraj/flan-t5-large-grammar-synthesis",
    filename="ggml-model-Q6_K.gguf",
    local_dir="./models",
    token=huggingface_token or None,
)



# Heading and markdown blurb passed to the chat interface below
title = "flan-t5-large-grammar-synthesis Llama.cpp"
description = (
    "\n"
    "I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5),"
    " I'm not sure current llama-cpp-python server support t5\n"
    "\n"
    "[Model-Q6_K-GGUF](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis-gguf),"
    " [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)\n"
)


# Module-wide model handle. It starts as None and is replaced by a Llama
# instance the first time respond() runs, then reused by later calls.
llama = None


import ctypes
import os
import multiprocessing

import llama_cpp

def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Stream a reply to a message from the local GGUF model via Llama.cpp.

    This is a generator: every yielded value is the full text produced so
    far, so a consumer can simply display the latest value. The model is
    loaded from ``models/ggml-model-Q6_K.gguf`` on the first call and kept in
    the module-level ``llama`` variable for later calls.

    Args:
        - message (str): The message to respond to.
        - history (List[Tuple[str, str]]): The chat history (not used; only
          the current message is sent to the model).
        - model (str): The selected model name. ``None`` produces no output;
          the value is otherwise not used to pick a file.
        - system_message (str): The system message (not used).
        - max_tokens (int): Context size used when the model is first loaded,
          and the upper bound on the number of tokens generated per call.
        - temperature (float): The sampling temperature.
        - top_p (float): The top-p (nucleus) sampling threshold.
        - top_k (int): The top-k sampling limit.
        - repeat_penalty (float): The repetition penalty.

    Yields:
        str: The response text accumulated so far.

    Returns:
        str: The final response text (as the generator's return value).

    Raises:
        CustomExceptionHandling: Wraps any exception raised while loading the
            model or generating text.
    """
    global llama

    if model is None:
        return
    if not message or not message.strip():
        # Nothing to encode; answer with an empty reply instead of invoking
        # the model on an empty prompt.
        yield ""
        return ""
    try:
        if llama is None:
            model_id = "ggml-model-Q6_K.gguf"
            # NOTE: n_ctx is taken from the first call only, because the
            # instance is reused afterwards.
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                n_gpu_layers=0,
                n_ctx=max_tokens,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # Run the encoder on the prompt, then seed the decoder with its
        # start token.
        prompt_tokens = llama.tokenize(message.encode("utf-8"))
        llama.encode(prompt_tokens)
        decoder_tokens = [llama.decoder_start_token()]
        eos_token = llama.token_eos()

        # Bytes are accumulated and decoded as a whole: a multi-byte UTF-8
        # character may be split across tokens, and decoding each token on
        # its own could raise UnicodeDecodeError. errors="ignore" hides an
        # incomplete trailing sequence until its remaining bytes arrive.
        raw = b""
        outputs = ""
        stream = llama.generate(
            decoder_tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        )
        for generated, token in enumerate(stream, start=1):
            finished = token == eos_token
            if not finished:
                raw += llama.detokenize([token])
                outputs = raw.decode("utf-8", errors="ignore")
            yield outputs
            # Stop on end-of-sequence, or once the token budget is used up so
            # generation cannot run without bound.
            if finished or generated >= max_tokens:
                break
        return outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e

            

    
    


# Build the chat UI. The widgets are created first, in the order the
# parameters appear in respond(), and then handed to gr.ChatInterface.
_example_prompts = [
    ["What are the capital of France?"],
    ["What real child was raise by wolves?"],
    ["What am gravity?"],
]

# Collapsed container that holds the extra inputs
_parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)

# Hidden inputs: model name, system prompt and max tokens
_model_dropdown = gr.Dropdown(
    choices=["ggml-model-Q6_K.gguf"],
    value="ggml-model-Q6_K.gguf",
    label="Model",
    info="Select the AI model to use for chat",
    visible=False,
)
_system_prompt_box = gr.Textbox(
    value="You are a helpful assistant.",
    label="System Prompt",
    info="Define the AI assistant's personality and behavior",
    lines=2,
    visible=False,
)
_max_tokens_slider = gr.Slider(
    minimum=512,
    maximum=512,
    value=512,
    step=1,
    label="Max Tokens",
    info="Maximum length of response (higher = longer replies)",
    visible=False,
)

# Visible sampling controls
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=2.0,
    value=0.4,
    step=0.1,
    label="Temperature",
    info="Creativity level (higher = more creative, lower = more focused)",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p",
    info="Nucleus sampling threshold",
)
_top_k_slider = gr.Slider(
    minimum=1,
    maximum=100,
    value=40,
    step=1,
    label="Top-k",
    info="Limit vocabulary choices to top K tokens",
)
_repeat_penalty_slider = gr.Slider(
    minimum=1.0,
    maximum=2.0,
    value=1.1,
    step=0.1,
    label="Repetition Penalty",
    info="Penalize repeated words (higher = less repetition)",
)

_chat_window = gr.Chatbot(scale=1, show_copy_button=True)

demo = gr.ChatInterface(
    respond,
    examples=_example_prompts,
    additional_inputs_accordion=_parameters_accordion,
    additional_inputs=[
        _model_dropdown,
        _system_prompt_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repeat_penalty_slider,
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=_chat_window,
    flagging_mode="never",
)


# Launch the chat interface
if __name__ == "__main__":
    # Only the launch belongs here: no `test` function is defined in this
    # module, so calling one after launch() returns would raise NameError.
    demo.launch(debug=False)