import spaces
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Install a CUDA 12.4 build of llama-cpp-python plus the llama-cpp-agent
# framework at startup; they are not part of the base environment.
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
# Download the GGUF model files into ./models so they can be selected at runtime.
hf_hub_download(
    repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
    filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
    local_dir="./models"
)
hf_hub_download(
    repo_id="bartowski/Llama-3-8B-Synthia-v3.5-GGUF",
    filename="Llama-3-8B-Synthia-v3.5-f16.gguf",
    local_dir="./models"
)
hf_hub_download(
    repo_id="bartowski/Mistral-7B-Instruct-v0.3-GGUF",
    filename="Mistral-7B-Instruct-v0.3-f32.gguf",
    local_dir="./models"
)
css = """ |
|
.message-row { |
|
justify-content: space-evenly !important; |
|
} |
|
.message-bubble-border { |
|
border-radius: 6px !important; |
|
} |
|
.dark.message-bubble-border { |
|
border-color: #343140 !important; |
|
} |
|
.dark.user { |
|
background: #1e1c26 !important; |
|
} |
|
.dark.assistant.dark, .dark.pending.dark { |
|
background: #16141c !important; |
|
} |
|
""" |
|
|
|
def get_messages_formatter_type(model_name):
    # Pick the prompt format that matches the selected model family.
    from llama_cpp_agent import MessagesFormatterType
    if "Llama" in model_name:
        return MessagesFormatterType.LLAMA_3
    elif "Mistral" in model_name:
        return MessagesFormatterType.MISTRAL
    else:
        raise ValueError(f"Unsupported model: {model_name}")
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
    model,
):
    # Keep these imports local to the GPU-decorated function; the packages
    # are installed at runtime by the pip calls above.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    chat_template = get_messages_formatter_type(model)

    # Load the selected GGUF model, offloading up to 81 layers to the GPU.
    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_threads=40,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )

    # Apply the UI sampling controls to the provider's default settings.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Replay the Gradio conversation history into the agent's chat history.
    messages = BasicChatHistory()
    for user_msg, assistant_msg in history:
        messages.add_message({
            'role': Roles.user,
            'content': user_msg
        })
        messages.add_message({
            'role': Roles.assistant,
            'content': assistant_msg
        })

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False
    )

    # Accumulate streamed tokens and yield the running text so the UI updates live.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
PLACEHOLDER = """ |
|
<div class="container" style="max-width: 600px; margin: 0 auto; padding: 30px; background-color: #fff; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);"> |
|
<h1 style="font-size: 28px; margin-bottom: 15px;">llama-cpp-agent: Simplify LLM Interactions</h1> |
|
<p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The llama-cpp-agent framework simplifies interactions with Large Language Models (LLMs), providing an interface for chatting, executing function calls, generating structured output, performing retrieval augmented generation, and processing text using agentic chains with tools.</p> |
|
<p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The framework uses guided sampling to constrain model output to user-defined structures, enabling models not fine-tuned for function calling and JSON output to do so. It is compatible with llama.cpp server, llama-cpp-python and its server, TGI, and vllm servers.</p> |
|
<h2 style="font-size: 22px; margin-bottom: 10px;">Key Features</h2> |
|
<ul style="list-style-type: none; padding: 0;"> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Simple Chat Interface</strong>: Engage in seamless conversations with LLMs.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Structured Output</strong>: Generate structured output (objects) from LLMs.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Function Calling</strong>: Execute functions using LLMs.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>RAG</strong>: Perform retrieval augmented generation with colbert reranking.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Agent Chains</strong>: Process text using agent chains with tools.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Guided Sampling</strong>: Allows most 7B LLMs to do function calling and structured output.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Multiple Providers</strong>: Works with various servers and providers.</li> |
|
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Compatibility</strong>: Works with python functions, pydantic tools, llama-index tools, and OpenAI tool schemas.</li> |
|
<li style="font-size: 16px; line-height: 1.5;"><strong>Flexibility</strong>: Suitable for various applications, from casual chatting to specific function executions.</li> |
|
</ul> |
|
</div> |
|
""" |
|
|
|
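# Illustrative sketch (not called by the app): the chat flow from respond()
# reduced to a single non-streaming call, using only APIs that already appear
# in this file. The function name and the default model filename are chosen
# here for illustration; any GGUF file downloaded into ./models would work.
def example_single_turn(prompt: str, model_file: str = "Mistral-7B-Instruct-v0.3-f32.gguf") -> str:
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent.providers import LlamaCppPythonProvider

    llm = Llama(model_path=f"models/{model_file}", n_gpu_layers=81, n_ctx=8192)
    agent = LlamaCppAgent(
        LlamaCppPythonProvider(llm),
        system_prompt="You are a helpful assistant.",
        predefined_messages_formatter_type=get_messages_formatter_type(model_file),
    )
    # Without returns_streaming_generator=True the response is expected to come
    # back as a single completed string rather than a token stream.
    return agent.get_chat_response(prompt, print_output=False)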
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
        gr.Dropdown(
            [
                'Meta-Llama-3-70B-Instruct-Q3_K_M.gguf',
                'Llama-3-8B-Synthia-v3.5-f16.gguf',
                'Mistral-7B-Instruct-v0.3-f32.gguf'
            ],
            value="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
            label="Model"
        ),
    ],
    theme=gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="violet",
        neutral_hue="gray",
        font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]
    ).set(
        body_background_fill_dark="#16141c",
        block_background_fill_dark="#16141c",
        block_border_width="1px",
        block_title_background_fill_dark="#1e1c26",
        input_background_fill_dark="#292733",
        button_secondary_background_fill_dark="#24212b",
        border_color_primary_dark="#343140",
        background_fill_secondary_dark="#16141c",
        color_accent_soft_dark="transparent"
    ),
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="llama-cpp-agent: chat with a selection of local LLMs",
    chatbot=gr.Chatbot(scale=1, placeholder=PLACEHOLDER)
)

if __name__ == "__main__":
    demo.launch()