import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Install the prebuilt CUDA 12.4 wheel of llama-cpp-python plus llama-cpp-agent at startup.
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)

# Download both 70B Instruct GGUF quantizations (Q3_K_M) into ./models.
hf_hub_download(repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF", filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf", local_dir="./models")
hf_hub_download(repo_id="bartowski/Smaug-Llama-3-70B-Instruct-GGUF", filename="Smaug-Llama-3-70B-Instruct-Q3_K_M.gguf", local_dir="./models")

# Run each request on a ZeroGPU device, holding the GPU for up to 120 seconds.
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    model,
):
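    # llama_cpp / llama_cpp_agent are imported lazily because they are pip-installed at runtime above.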
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles
    # Load the selected GGUF with all layers offloaded to the GPU; size the context
    # window to match the "Max new tokens" slider.
    llm = Llama(
        model_path=f"models/{model}",
        n_gpu_layers=81,
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,  # use the system message from the UI rather than a hard-coded prompt
        predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3,
        debug_output=True
    )
    
    # Start from the provider defaults and apply the sampling controls from the UI.
    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p
    settings.stream = True

    # Replay the Gradio (user, assistant) history into the agent's chat history.
    messages = BasicChatHistory()
    for user_text, assistant_text in history:
        messages.add_message({'role': Roles.user, 'content': user_text})
        messages.add_message({'role': Roles.assistant, 'content': assistant_text})
    
    # Ask the agent for a streaming generator over the response tokens.
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )
    
    # Accumulate chunks and yield the running text so the chat UI streams incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# The additional inputs below map positionally onto respond()'s system_message,
# max_tokens, temperature, top_p, and model parameters.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=8192, value=8192, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
        gr.Dropdown(
            ['Meta-Llama-3-70B-Instruct-Q3_K_M.gguf', 'Smaug-Llama-3-70B-Instruct-Q3_K_M.gguf'],
            value="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
            label="Model",
        ),
    ],
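    # Soft theme with green/indigo accents, the "Exo 2" font, and a darker block background in dark mode.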
    theme=gr.themes.Soft(
        primary_hue="green",
        secondary_hue="indigo",
        neutral_hue="zinc",
        font=[gr.themes.GoogleFont("Exo 2"), "ui-sans-serif", "system-ui", "sans-serif"],
    ).set(
        block_background_fill_dark="*neutral_800"
    )
)

if __name__ == "__main__":
    demo.launch()