import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Install the CUDA 12.4 build of llama-cpp-python plus the llama-cpp-agent wrapper at
# startup so generation runs on the GPU allocated by the Space.
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.8', shell=True)

# Download the quantized Mistral-7B-Instruct weights into ./models once at startup.
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir="./models")

# Request a ZeroGPU slot for up to 120 seconds per call.
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
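    """Stream a reply from the local Mistral GGUF model via llama-cpp-agent."""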
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles
    
    # Load the quantized model, offloading all layers to the GPU.
    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=33,
    )
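    # Expose the model to llama-cpp-agent through its llama-cpp-python provider.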
    provider = LlamaCppPythonProvider(llm)

    # Use the system message from the UI and format prompts in Mistral's chat format.
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True
    )
    
    # Apply the sampling values chosen in the UI instead of the provider defaults.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.stream = True

    # Rebuild the agent's chat history from Gradio's (user, assistant) tuples.
    messages = BasicChatHistory()
    for user_msg, assistant_msg in history:
        messages.add_message({
            'role': Roles.user,
            'content': user_msg
        })
        messages.add_message({
            'role': Roles.assistant,
            'content': assistant_msg
        })
    
    # Pass the rebuilt history along with the new message and stream the reply.
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )
    
    # Accumulate tokens and yield the partial text so Gradio streams the answer.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# Chat UI: the additional inputs are forwarded to respond() after message and history.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()