File size: 5,231 Bytes
26a8369
 
 
 
eefa060
26a8369
 
 
ac18c71
26a8369
 
 
 
 
 
 
 
c3b8348
358544d
801fec7
26a8369
358544d
 
26a8369
 
 
801fec7
fc47a3f
801fec7
e594c8e
4ff0368
e594c8e
801fec7
4ff0368
 
801fec7
26a8369
 
9c2d729
 
 
 
 
 
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c73126b
 
26a8369
801fec7
 
 
 
a06586a
801fec7
 
c73126b
801fec7
 
 
 
 
 
 
 
 
 
663a9c3
26a8369
 
 
663a9c3
26a8369
801fec7
 
 
 
 
26a8369
 
 
 
c62145d
26a8369
 
 
 
 
 
358544d
26a8369
358544d
26a8369
 
 
c3b8348
d4681be
c3b8348
 
801fec7
c3b8348
26a8369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c2d729
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Importing required libraries
import warnings
warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from llama_cpp import Llama,llama_model_decoder_start_token
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling


# Download gguf model files
# The access token is optional: when HUGGINGFACE_TOKEN is unset or empty the
# download falls back to huggingface_hub's default (anonymous / cached) auth.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)

hf_hub_download(
    repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
    filename="madlad400-3b-mt-q8_0.gguf",
    local_dir="./models",
    token=huggingface_token or None,
)

# Set the title and description
title = "madlad400-3b-mt llama.cpp"
description = """
I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python support t5

[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp), [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
"""


# Lazily-initialised model handle; `respond` creates it on first use.
llama = None


import ctypes
import os
import multiprocessing

import llama_cpp

def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Stream the madlad400-3b-mt output for a message via llama.cpp.

    This is a generator: it yields the accumulated output text after every
    generated token, so the caller can display the reply incrementally.

    Args:
        - message (str): The input text; it is prefixed with "<2ja>" before tokenizing.
        - history (List[Tuple[str, str]]): The chat history (not used by this function).
        - model (str): The selected model name; only checked for None, the
          GGUF file path itself is fixed.
        - system_message (str): The system message (not used by this function).
        - max_tokens (int): The maximum number of tokens to generate; a falsy
          value (None or 0) disables the limit.
        - temperature (float): The sampling temperature.
        - top_p (float): The top-p sampling threshold.
        - top_k (int): The top-k sampling limit.
        - repeat_penalty (float): The repetition penalty.

    Yields:
        str: The output text generated so far.

    Raises:
        CustomExceptionHandling: Wraps any exception raised while loading the
        model or generating.
    """
    if model is None:
        return
    try:
        global llama
        if llama is None:
            # Load the model on first use and keep it in the module-level
            # handle so later calls reuse it.
            llama = Llama(
                "models/madlad400-3b-mt-q8_0.gguf",
                flash_attn=False,
                n_gpu_layers=0,
                n_batch=32,
                n_ctx=512,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # NOTE(review): "<2ja>" is presumably the MADLAD-400 target-language
        # tag for Japanese - confirm against the model card.
        prompt_tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
        llama.encode(prompt_tokens)

        token_limit = int(max_tokens) if max_tokens else None
        eos_token = llama.token_eos()

        # Accumulate raw bytes and decode the whole buffer each step: a
        # multi-byte UTF-8 character may be split across tokens, and decoding
        # each token on its own would raise UnicodeDecodeError.
        raw_output = b""
        outputs = ""
        generated = 0
        for token in llama.generate(
            [llama.decoder_start_token()],
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            # Stop before detokenizing so the end-of-sequence marker never
            # reaches the visible output.
            if token == eos_token:
                break
            raw_output += llama.detokenize([token])
            # errors="ignore" hides a still-incomplete trailing character
            # until the bytes that finish it arrive.
            outputs = raw_output.decode("utf-8", errors="ignore")
            yield outputs
            generated += 1
            if token_limit is not None and generated >= token_limit:
                break

        if generated == 0:
            # Always hand the caller one (empty) reply, even when generation
            # ended immediately.
            yield outputs
        return outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e

            

    
    


# Create a chat interface
# Components are instantiated in the order the interface lists them:
# accordion, model dropdown, hidden system prompt, sampling sliders, chatbot.
_parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)

_model_dropdown = gr.Dropdown(
    label="Model",
    info="Select the AI model to use for chat",
    choices=["madlad400-3b-mt-q8_0.gguf"],
    value="madlad400-3b-mt-q8_0.gguf",
)

# Kept as an input so `respond` receives its system_message argument, but
# hidden from the UI.
_system_prompt_box = gr.Textbox(
    label="System Prompt",
    info="Define the AI assistant's personality and behavior",
    value="You are a helpful assistant.",
    lines=2,
    visible=False,
)

# (label, info, minimum, maximum, default, step) for each slider, listed in
# the positional order `respond` takes them after the system prompt.
_SLIDER_SPECS = (
    ("Max Tokens", "Maximum length of response (higher = longer replies)", 512, 2048, 1024, 1),
    ("Temperature", "Creativity level (higher = more creative, lower = more focused)", 0.1, 2.0, 0.7, 0.1),
    ("Top-p", "Nucleus sampling threshold", 0.1, 1.0, 0.95, 0.05),
    ("Top-k", "Limit vocabulary choices to top K tokens", 1, 100, 40, 1),
    ("Repetition Penalty", "Penalize repeated words (higher = less repetition)", 1.0, 2.0, 1.1, 0.1),
)

_sampling_sliders = [
    gr.Slider(
        minimum=lowest,
        maximum=highest,
        value=default,
        step=increment,
        label=caption,
        info=hint,
    )
    for caption, hint, lowest, highest, default, increment in _SLIDER_SPECS
]

_example_prompts = [
    ["What is the capital of France?"],
    ["Tell me something about artificial intelligence."],
    ["What is gravity?"],
]

_chat_window = gr.Chatbot(scale=1, show_copy_button=True)

demo = gr.ChatInterface(
    respond,
    examples=_example_prompts,
    additional_inputs_accordion=_parameters_accordion,
    additional_inputs=[_model_dropdown, _system_prompt_box, *_sampling_sliders],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=_chat_window,
    flagging_mode="never",
)


# Launch the chat interface
if __name__ == "__main__":
    # Serve the UI when this file is run as a script; nothing else needs to
    # run once launch() returns.
    demo.launch(debug=False)