llama-cpp-agent

Paused

File size: 2,776 Bytes

21b8ce0
b5c263a
d8a3c53
606c0ce
 
 
 
 
d8a3c53
ae70ddf
 
 
 
 
 
 
 
00a2173
ec06a49
98758c3
d8a3c53
3b38821
d8a3c53
 
 
 
 
 
 
 
63c66b0
becbc87
92d9ef4
 
ae70ddf
ec06a49
ae70ddf
 
 
 
 
 
7e5587a
ae70ddf
 
 
 
 
 
 
 
 
 
 
 
6c9b3f7
63c66b0
 
 
 
 
 
04095d9
63c66b0
 
 
 
606c0ce
d8a3c53
 
 
606c0ce
d8a3c53

import os
import subprocess

import spaces
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from llama_index.core.memory import ChatMemoryBuffer

# Install the CUDA build of llama-cpp-python when the app starts.
# The child environment is the current environment plus CMAKE_ARGS: passing a
# bare ``env={...}`` would replace the whole environment, so pip would run
# without PATH, HOME, proxy settings, etc.
# NOTE(review): ``llama_cpp`` is already imported above, so this install cannot
# change the module loaded in the current process — confirm this is intended.
subprocess.run(
    'pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124',
    env={**os.environ, 'CMAKE_ARGS': "-DLLAMA_CUDA=on"},
    shell=True,
)

# Download the GGUF weights into ./models; `respond` loads them from
# "models/mistral-7b-instruct-v0.2.Q6_K.gguf".
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The latest user message.
        history: Earlier (user, assistant) turns. Accepted for interface
            compatibility but currently NOT included in the prompt, so every
            reply is generated from ``system_message`` and ``message`` only.
        system_message: Instruction text placed ahead of the user message
            inside the ``[INST]`` block.
        max_tokens: Passed to the model as ``max_new_tokens``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        The text generated so far, re-yielded after each new chunk. Generation
        stops as soon as a chunk is exactly one of the stop markers; that
        marker is never yielded.
    """
    stop_tokens = ("</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] ")

    # Instruct-style prompt that carries the user-configurable system message.
    chat_template = f"<s>[INST] {system_message} {message} [/INST]"
    print(chat_template)

    llm = LlamaCPP(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        temperature=temperature,
        max_new_tokens=max_tokens,
        context_window=2048,
        generate_kwargs={
            "top_k": 50,
            "top_p": top_p,
            "repeat_penalty": 1.3,
        },
        model_kwargs={
            "n_threads": 0,
            "n_gpu_layers": 33,
        },
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )

    outputs = []
    # Send the prompt built above rather than the bare message; previously the
    # template was printed but never used, so the system message had no effect.
    # formatted=True tells LlamaCPP the prompt is already in its final form, so
    # it is not wrapped a second time by completion_to_prompt.
    for chunk in llm.stream_complete(chat_template, formatted=True):
        if chunk.delta in stop_tokens:
            break
        outputs.append(chunk.delta)
        yield "".join(outputs)

# Extra controls exposed next to the chat box. Their order lines up with the
# parameters `respond` takes after `message` and `history`:
# system_message, max_tokens, temperature, top_p.
generation_controls = [
    gr.Textbox(value="You are a helpful assistant.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]

# Chat UI that routes each user message, plus the control values, to `respond`.
demo = gr.ChatInterface(respond, additional_inputs=generation_controls)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()