# app.py: llama-cpp-agent multi-model chat demo (Hugging Face Space)
import spaces
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
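
# llama-cpp-python (CUDA 12.4 wheel) and llama-cpp-agent are installed at
# startup rather than as build-time requirements; since they only become
# importable after these commands run, they are imported lazily inside the
# functions below instead of at module level.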
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
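
# Download the GGUF weights for each selectable model into ./models.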
hf_hub_download(
repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
    local_dir="./models"
)
hf_hub_download(
repo_id="bartowski/Llama-3-8B-Synthia-v3.5-GGUF",
filename="Llama-3-8B-Synthia-v3.5-f16.gguf",
    local_dir="./models"
)
hf_hub_download(
repo_id="bartowski/Mistral-7B-Instruct-v0.3-GGUF",
filename="Mistral-7B-Instruct-v0.3-f32.gguf",
    local_dir="./models"
)
css = """
.message-row {
justify-content: space-evenly !important;
}
.message-bubble-border {
border-radius: 6px !important;
}
.dark.message-bubble-border {
border-color: #343140 !important;
}
.dark.user {
background: #1e1c26 !important;
}
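/* repeating .dark in the selector increases its specificity */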
.dark.assistant.dark, .dark.pending.dark {
background: #16141c !important;
}
"""
def get_messages_formatter_type(model_name):
from llama_cpp_agent import MessagesFormatterType
if "Llama" in model_name:
return MessagesFormatterType.LLAMA_3
elif "Mistral" in model_name:
return MessagesFormatterType.MISTRAL
else:
raise ValueError(f"Unsupported model: {model_name}")
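
# ZeroGPU: allocate a GPU for each generation call, for at most 120 seconds.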
@spaces.GPU(duration=120)
def respond(
message,
history: list[tuple[str, str]],
system_message,
max_tokens,
temperature,
top_p,
top_k,
repeat_penalty,
model,
):
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
chat_template = get_messages_formatter_type(model)
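    # Load the selected GGUF model with all layers offloaded to the GPU.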
llm = Llama(
model_path=f"models/{model}",
flash_attn=True,
n_threads=40,
n_gpu_layers=81,
n_batch=1024,
n_ctx=8192,
)
provider = LlamaCppPythonProvider(llm)
agent = LlamaCppAgent(
provider,
system_prompt=f"{system_message}",
predefined_messages_formatter_type=chat_template,
debug_output=True
)
settings = provider.get_provider_default_settings()
settings.temperature = temperature
settings.top_k = top_k
settings.top_p = top_p
settings.max_tokens = max_tokens
settings.repeat_penalty = repeat_penalty
settings.stream = True
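    # Replay the Gradio (user, assistant) history into a llama-cpp-agent chat history.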
messages = BasicChatHistory()
    for msg in history:
        user = {
            'role': Roles.user,
            'content': msg[0]
        }
        assistant = {
            'role': Roles.assistant,
            'content': msg[1]
        }
        messages.add_message(user)
        messages.add_message(assistant)
stream = agent.get_chat_response(
message,
llm_sampling_settings=settings,
chat_history=messages,
returns_streaming_generator=True,
print_output=False
)
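    # Accumulate streamed tokens and yield the growing response for live display.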
outputs = ""
for output in stream:
outputs += output
yield outputs
PLACEHOLDER = """
<div class="container" style="max-width: 600px; margin: 0 auto; padding: 30px; background-color: #fff; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);">
<h1 style="font-size: 28px; margin-bottom: 15px;">llama-cpp-agent: Simplify LLM Interactions</h1>
<p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The llama-cpp-agent framework simplifies interactions with Large Language Models (LLMs), providing an interface for chatting, executing function calls, generating structured output, performing retrieval augmented generation, and processing text using agentic chains with tools.</p>
<p style="font-size: 16px; line-height: 1.5; margin-bottom: 15px;">The framework uses guided sampling to constrain model output to user-defined structures, enabling models not fine-tuned for function calling and JSON output to do so. It is compatible with llama.cpp server, llama-cpp-python and its server, TGI, and vllm servers.</p>
<h2 style="font-size: 22px; margin-bottom: 10px;">Key Features</h2>
<ul style="list-style-type: none; padding: 0;">
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Simple Chat Interface</strong>: Engage in seamless conversations with LLMs.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Structured Output</strong>: Generate structured output (objects) from LLMs.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Function Calling</strong>: Execute functions using LLMs.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>RAG</strong>: Perform retrieval augmented generation with colbert reranking.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Agent Chains</strong>: Process text using agent chains with tools.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Guided Sampling</strong>: Allows most 7B LLMs to do function calling and structured output.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Multiple Providers</strong>: Works with various servers and providers.</li>
<li style="font-size: 16px; line-height: 1.5; margin-bottom: 8px;"><strong>Compatibility</strong>: Works with python functions, pydantic tools, llama-index tools, and OpenAI tool schemas.</li>
<li style="font-size: 16px; line-height: 1.5;"><strong>Flexibility</strong>: Suitable for various applications, from casual chatting to specific function executions.</li>
</ul>
</div>
"""
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="You are a helpful assistant.", label="System message"),
gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p",
),
gr.Slider(
minimum=0,
maximum=100,
value=40,
step=1,
label="Top-k",
),
gr.Slider(
minimum=0.0,
maximum=2.0,
value=1.1,
step=0.1,
label="Repetition penalty",
),
gr.Dropdown([
'Meta-Llama-3-70B-Instruct-Q3_K_M.gguf',
'Llama-3-8B-Synthia-v3.5-f16.gguf',
'Mistral-7B-Instruct-v0.3-f32.gguf'
],
value="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
label="Model"
),
],
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray", font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
body_background_fill_dark="#16141c",
block_background_fill_dark="#16141c",
block_border_width="1px",
block_title_background_fill_dark="#1e1c26",
input_background_fill_dark="#292733",
button_secondary_background_fill_dark="#24212b",
border_color_primary_dark="#343140",
background_fill_secondary_dark="#16141c",
color_accent_soft_dark="transparent"
),
css=css,
retry_btn="Retry",
undo_btn="Undo",
clear_btn="Clear",
submit_btn="Send",
description="Llama-cpp-agent: Chat multi llm selection",
chatbot=gr.Chatbot(scale=1, placeholder=PLACEHOLDER)
)
if __name__ == "__main__":
demo.launch()