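# Gradio chat demo for llama-cpp-agent: chat with a selectable GGUF model
# (Llama-3-70B-Instruct, Llama-3-8B-Synthia, or Mistral-7B-Instruct) served via llama-cpp-python.
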
import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

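# Install the CUDA build of llama-cpp-python and llama-cpp-agent at runtime (pinned versions).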
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True, check=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True, check=True)

# Download the GGUF model weights into ./models at startup.
hf_hub_download(
    repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
    filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
    local_dir="./models"
)
hf_hub_download(
    repo_id="bartowski/Llama-3-8B-Synthia-v3.5-GGUF",
    filename="Llama-3-8B-Synthia-v3.5-f16.gguf",
    local_dir="./models"
)
hf_hub_download(
    repo_id="bartowski/Mistral-7B-Instruct-v0.3-GGUF",
    filename="Mistral-7B-Instruct-v0.3-f32.gguf",
    local_dir="./models"
)

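# Custom CSS for the chat bubbles; the dark-theme colors match the Gradio theme set below.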
css = """
.message-row {
    justify-content: space-evenly !important;
}
.message-bubble-border {
    border-radius: 6px !important;
}
.dark.message-bubble-border {
    border-color: #343140 !important;
}
.dark.user {
    background: #1e1c26 !important;
}
.dark.assistant.dark, .dark.pending.dark {
    background: #16141c !important;
}
"""

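# Pick the chat/prompt template that matches the selected model family.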
def get_messages_formatter_type(model_name):
    from llama_cpp_agent import MessagesFormatterType
    if "Llama" in model_name:
        return MessagesFormatterType.LLAMA_3
    elif "Mistral" in model_name:
        return MessagesFormatterType.MISTRAL
    else:
        raise ValueError(f"Unsupported model: {model_name}")

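# Run on the Space's GPU (ZeroGPU) for up to 120 seconds per call.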
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
    model,
):
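    # Imported here because these packages are installed at runtime by the pip calls above.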
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    chat_template = get_messages_formatter_type(model)

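    # Load the selected GGUF model with GPU offload, flash attention, and an 8K context window.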
    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_threads=40,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    # Wrap the model in an agent that applies the system prompt and chat template.
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )
    
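    # Per-request sampling settings taken from the UI sliders.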
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    messages = BasicChatHistory()

    # Replay prior (user, assistant) turns into llama-cpp-agent's chat history.
    for user_text, assistant_text in history:
        user = {
            'role': Roles.user,
            'content': user_text
        }
        assistant = {
            'role': Roles.assistant,
            'content': assistant_text
        }
        messages.add_message(user)
        messages.add_message(assistant)
    
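    # Ask the agent for a streaming generator and yield the partial response to Gradio.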
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )
    
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

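# HTML shown in the empty chatbot, describing the llama-cpp-agent framework.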
PLACEHOLDER = """
<div class="container" style="max-width: 600px; margin: 0 auto; padding: 30px; background-color: #fff; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);">
    <h1 style="font-size: 28px; margin-bottom: 15px;">llama-cpp-agent: Simplify LLM Interactions</h1>
    <p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The llama-cpp-agent framework simplifies interactions with Large Language Models (LLMs), providing an interface for chatting, executing function calls, generating structured output, performing retrieval augmented generation, and processing text using agentic chains with tools.</p>
    <p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The framework uses guided sampling to constrain model output to user-defined structures, enabling models not fine-tuned for function calling and JSON output to do so. It is compatible with the llama.cpp server, llama-cpp-python and its server, TGI, and vLLM servers.</p>
    <h2 style="font-size: 22px; margin-bottom: 10px;">Key Features</h2>
    <ul style="list-style-type: none; padding: 0;">
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Simple Chat Interface</strong>: Engage in seamless conversations with LLMs.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Structured Output</strong>: Generate structured output (objects) from LLMs.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Function Calling</strong>: Execute functions using LLMs.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>RAG</strong>: Perform retrieval augmented generation with ColBERT reranking.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Agent Chains</strong>: Process text using agent chains with tools.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Guided Sampling</strong>: Allows most 7B LLMs to do function calling and structured output.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Multiple Providers</strong>: Works with various servers and providers.</li>
        <li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Compatibility</strong>: Works with Python functions, Pydantic tools, llama-index tools, and OpenAI tool schemas.</li>
        <li style="font-size: 16px; line-height: 1.5;"><strong>Flexibility</strong>: Suitable for various applications, from casual chatting to specific function executions.</li>
    </ul>
</div>
"""

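# Chat UI: system prompt, sampling sliders, and model picker exposed as additional inputs.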
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
        gr.Dropdown([
                'Meta-Llama-3-70B-Instruct-Q3_K_M.gguf',
                'Llama-3-8B-Synthia-v3.5-f16.gguf',
                'Mistral-7B-Instruct-v0.3-f32.gguf'
            ],
            value="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
            label="Model"
        ),
    ],
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray", font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
        body_background_fill_dark="#16141c",
        block_background_fill_dark="#16141c",
        block_border_width="1px",
        block_title_background_fill_dark="#1e1c26",
        input_background_fill_dark="#292733",
        button_secondary_background_fill_dark="#24212b",
        border_color_primary_dark="#343140",
        background_fill_secondary_dark="#16141c",
        color_accent_soft_dark="transparent"
    ),
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="llama-cpp-agent: chat with a selectable LLM",
    chatbot=gr.Chatbot(scale=1, placeholder=PLACEHOLDER)
)

if __name__ == "__main__":
    demo.launch()