File size: 5,723 Bytes
26a8369
 
 
 
eefa060
26a8369
 
 
ac18c71
26a8369
 
 
 
 
4ff0368
 
26a8369
 
 
 
 
 
 
 
c3b8348
358544d
801fec7
3debf7a
 
26a8369
0430bc7
 
26a8369
 
 
3debf7a
 
801fec7
1074fa0
801fec7
3debf7a
4ff0368
2efeef1
801fec7
4ff0368
 
801fec7
26a8369
 
9c2d729
 
 
 
 
 
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a66261e
 
26a8369
801fec7
 
0430bc7
3debf7a
801fec7
5e8e544
801fec7
8baeec8
3debf7a
33572bc
801fec7
 
 
0430bc7
8ce032d
57c0898
8ce032d
 
 
 
0430bc7
663a9c3
26a8369
 
 
663a9c3
26a8369
801fec7
 
 
 
 
26a8369
 
 
 
2efeef1
26a8369
 
 
 
 
 
1074fa0
26a8369
1074fa0
26a8369
 
8c8c1ee
26a8369
c3b8348
d4681be
c3b8348
 
801fec7
c3b8348
26a8369
 
95cd28c
 
26a8369
 
3debf7a
26a8369
 
 
 
3debf7a
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c2d729
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# Importing required libraries
import warnings
warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from llama_cpp import Llama,llama_model_decoder_start_token
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling


# Download gguf model files
# The token is optional: an unset or empty HUGGINGFACE_TOKEN becomes None so the
# download falls back to huggingface_hub's default credential handling.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or None
os.makedirs("models", exist_ok=True)

# Fetch the quantized weights into ./models, where respond() loads them from.
hf_hub_download(
    repo_id="pszemraj/flan-t5-large-grammar-synthesis",
    filename="ggml-model-Q6_K.gguf",
    local_dir="./models",
    token=huggingface_token,
)



# Set the title and description
title = "flan-t5-large-grammar-synthesis Llama.cpp"

# Markdown text passed to the chat interface as its description. It is built
# from adjacent literals so each line break of the rendered text is explicit.
description = (
    "\n"
    "I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python server support t5\n"
    "\n"
    "[Model-Q6_K-GGUF](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis-gguf), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)\n"
)


# Module-level model handle; starts empty and is filled in on first use.
llama = None


import ctypes
import os
import multiprocessing

import llama_cpp

def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Stream a reply to a message from the local GGUF model via Llama.cpp.

    This is a generator: after every generated token it yields the full text
    produced so far, so the chat UI can update incrementally.

    Args:
        - message (str): The message to respond to.
        - history (List[Tuple[str, str]]): The chat history (accepted for the
          chat-interface signature; not used here).
        - model (str): The selected model. Only checked for None; the file
          that gets loaded is always "models/ggml-model-Q6_K.gguf".
        - system_message (str): The system message (accepted for the
          chat-interface signature; not used here).
        - max_tokens (int): Context size used when the model is first loaded,
          and the maximum number of tokens generated per call.
        - temperature (float): The sampling temperature.
        - top_p (float): The top-p (nucleus) sampling threshold.
        - top_k (int): The top-k sampling limit.
        - repeat_penalty (float): The repetition penalty.

    Yields:
        str: The response text accumulated so far.

    Raises:
        CustomExceptionHandling: Wraps any exception raised while loading the
        model or generating text.
    """
    global llama

    # No model selected: finish without yielding anything.
    if model is None:
        return

    try:
        if llama is None:
            # Load the model once and keep it in the module-level handle so
            # later calls reuse it.
            model_id = "ggml-model-Q6_K.gguf"
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                n_gpu_layers=0,
                n_ctx=max_tokens,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # The input is passed through llama.encode(), and generation is then
        # seeded with the decoder start token instead of the prompt tokens.
        prompt_tokens = llama.tokenize(str(message).encode("utf-8"))
        llama.encode(prompt_tokens)
        decoder_tokens = [llama.decoder_start_token()]
        eos_token = llama.token_eos()

        raw_output = b""
        generated = 0
        for token in llama.generate(
            decoder_tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            # Stop before emitting anything for the end-of-sequence token.
            if token == eos_token:
                break

            # Accumulate raw bytes and decode the whole buffer each time: a
            # multi-byte UTF-8 character may be split across tokens, and
            # decoding each piece on its own would raise UnicodeDecodeError.
            raw_output += llama.detokenize([token])
            generated += 1
            yield raw_output.decode("utf-8", errors="ignore")

            # Enforce the advertised generation limit.
            if generated >= max_tokens:
                break

        if generated == 0:
            # Always produce at least one (empty) update for the caller.
            yield ""
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e

            

    
    


# Create a chat interface
# The widgets are built first, in the same order in which they are handed to
# gr.ChatInterface, and then wired together in a single call below.
_example_prompts = [
    ["What are the capital of France?"],
    ["What real child was raise by wolves?"],
    ["What am gravity?"],
]

_parameters_accordion = gr.Accordion(
    label="⚙️ Parameters", open=False, render=False
)

# Hidden inputs: they still supply values to respond() but are not shown.
_model_dropdown = gr.Dropdown(
    choices=["ggml-model-Q6_K.gguf"],
    value="ggml-model-Q6_K.gguf",
    label="Model",
    info="Select the AI model to use for chat",
    visible=False,
)
_system_prompt_box = gr.Textbox(
    value="You are a helpful assistant.",
    label="System Prompt",
    info="Define the AI assistant's personality and behavior",
    lines=2,
    visible=False,
)
_max_tokens_slider = gr.Slider(
    minimum=512,
    maximum=512,
    value=512,
    step=1,
    label="Max Tokens",
    info="Maximum length of response (higher = longer replies)",
    visible=False,
)

# Visible sampling controls.
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=2.0,
    value=0.4,
    step=0.1,
    label="Temperature",
    info="Creativity level (higher = more creative, lower = more focused)",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p",
    info="Nucleus sampling threshold",
)
_top_k_slider = gr.Slider(
    minimum=1,
    maximum=100,
    value=40,
    step=1,
    label="Top-k",
    info="Limit vocabulary choices to top K tokens",
)
_repeat_penalty_slider = gr.Slider(
    minimum=1.0,
    maximum=2.0,
    value=1.1,
    step=0.1,
    label="Repetition Penalty",
    info="Penalize repeated words (higher = less repetition)",
)

_chat_window = gr.Chatbot(scale=1, show_copy_button=True)

# The order of additional_inputs matches respond()'s parameters after
# (message, history).
demo = gr.ChatInterface(
    respond,
    examples=_example_prompts,
    additional_inputs_accordion=_parameters_accordion,
    additional_inputs=[
        _model_dropdown,
        _system_prompt_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repeat_penalty_slider,
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=_chat_window,
    flagging_mode="never",
)


# Launch the chat interface
# Only runs when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch(debug=False)