llama-cpp-agent

Paused

File size: 2,005 Bytes

21b8ce0
b5c263a
d8a3c53
606c0ce
 
 
 
 
d8a3c53
b0f00f9
ec06a49
98758c3
d8a3c53
3b38821
d8a3c53
 
 
 
 
 
 
 
4d0808e
d8a3c53
606c0ce
ec06a49
606c0ce
 
 
 
 
6c9b3f7
d8a3c53
606c0ce
 
 
 
 
 
827a15f
606c0ce
d8a3c53
 
 
606c0ce
d8a3c53

import os
import subprocess
import sys

import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

# Best-effort runtime install of the pinned llama-cpp-python binary wheel from
# the cu124 wheel index.
#
# * The child environment is the current environment plus CMAKE_ARGS. Passing
#   a bare dict as ``env`` would replace the whole environment (PATH, HOME,
#   proxy settings, ...) instead of extending it.
# * ``sys.executable -m pip`` installs into the interpreter that is running
#   this app rather than whichever ``pip`` happens to be first on PATH.
# * Arguments are passed as a list, so no shell is involved.
#
# NOTE(review): ``llama_cpp`` is already imported at the top of this file, so
# a version change made here is presumably only picked up on the next process
# start -- confirm that this is the intended behaviour.
_pip_install = subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "llama-cpp-python==0.2.75",
        "--no-build-isolation",
        "--no-cache-dir",
        "--upgrade",
        "--only-binary=:all:",
        "--extra-index-url=https://abetlen.github.io/llama-cpp-python/whl/cu124",
    ],
    env={**os.environ, "CMAKE_ARGS": "-DLLAMA_CUDA=on"},
    check=False,  # keep the install best-effort: never abort start-up here
)
if _pip_install.returncode != 0:
    # Surface the failure instead of ignoring it silently.
    print(
        f"warning: pip install of llama-cpp-python exited with code "
        f"{_pip_install.returncode}; continuing with the installed version",
        file=sys.stderr,
    )

# Fetch the GGUF weights into ./models, the path that respond() loads from.
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` for the Gradio chat interface.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user, assistant)`` pairs; they are
            replayed into the agent's chat history so the model sees the
            whole conversation.
        system_message: Text used as the agent's system prompt.
        max_tokens: Value assigned to the sampling settings' ``max_tokens``.
        temperature: Value assigned to the sampling settings' ``temperature``.
        top_p: Value assigned to the sampling settings' ``top_p``.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the UI can
        update incrementally.
    """
    # Imported here so this function carries its own dependencies; both
    # modules belong to the llama_cpp_agent package the file already uses.
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    # NOTE(review): the model is constructed on every call; confirm whether
    # it could be cached across requests in this deployment.
    llama_model = Llama(
        r"models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_batch=1024,
        n_threads=0,
        n_gpu_layers=33,
        n_ctx=8192,
        verbose=False,
    )
    provider = LlamaCppPythonProvider(llama_model)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.stream = True
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p

    # Replay earlier turns; without this every request is answered as if it
    # were the first message of the conversation.
    past_turns = BasicChatHistory()
    for user_text, assistant_text in history or []:
        if user_text:
            past_turns.add_message({"role": Roles.user, "content": user_text})
        if assistant_text:
            past_turns.add_message(
                {"role": Roles.assistant, "content": assistant_text}
            )

    # Ask for a generator of text chunks rather than the finished string, so
    # that ``settings.stream = True`` actually reaches the UI.
    chunks = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=past_turns,
        returns_streaming_generator=True,
    )

    reply_so_far = ""
    for chunk in chunks:
        reply_so_far += chunk
        # Yield the full text accumulated so far, not just the new chunk.
        yield reply_so_far

# Extra controls shown alongside the chat box. Their order must match the
# parameters of respond() after (message, history):
# system_message, max_tokens, temperature, top_p.
_system_prompt_box = gr.Textbox(
    value="You are a helpful assistant.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat UI wired to respond(); the widgets above are passed as extra inputs.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_prompt_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()