import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

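# Install pinned builds of llama-cpp-python (prebuilt CUDA cu124 wheel) and llama-cpp-agent at startup.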
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.8', shell=True)

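# Download the quantized Mistral-7B-Instruct-v0.2 GGUF weights into ./models.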
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir="./models")

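# Request a ZeroGPU allocation for up to 120 seconds per call.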
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Imports are deferred so they resolve against the packages installed at startup.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    # Load the GGUF model with all layers offloaded to the GPU; n_ctx matches the
    # 8192-token context used by the commented-out variant below.
    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=33,
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    # Build the agent with the user-supplied system prompt and Mistral chat formatting.
    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True
    )

    # Wire the UI controls into the sampling settings instead of hardcoding them.
    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p
    settings.stream = True

    # Rebuild the chat history; each Gradio history entry is a (user, assistant) pair.
    messages = BasicChatHistory()
    print(history)

    for msn in history:
        user = {
            'role': Roles.user,
            'content': msn[0]
        }
        assistant = {
            'role': Roles.assistant,
            'content': msn[1]
        }
        messages.add_message(user)
        messages.add_message(assistant)
    
    # Stream the response and yield the accumulated text so Gradio updates incrementally.
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
    )

    outputs = ""
    for output in stream:
        print(output)
        outputs += output
        yield outputs
    # from llama_cpp import Llama
    # from llama_cpp_agent import LlamaCppAgent
    # from llama_cpp_agent import MessagesFormatterType
    # from llama_cpp_agent.providers import LlamaCppPythonProvider
    
    # llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)

    # provider = LlamaCppPythonProvider(llama_model)

    # agent = LlamaCppAgent(
    #   provider,
    #   system_prompt=f"{system_message}",
    #   predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
    #   debug_output=True
    # )

    # settings = provider.get_provider_default_settings()
    # settings.stream = True
    # settings.max_tokens = max_tokens
    # settings.temperature = temperature
    # settings.top_p = top_p
    # partial_message = ""
    # for new_token in agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True):
    #     partial_message += new_token
    #     if '<|im_end|>' in partial_message:
    #         break
    #     yield partial_message
    
    # stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
    # chat_template = '<s>[INST] ' + system_message
    # # for human, assistant in history:
    # #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
    # chat_template += ' ' + message + ' [/INST]'

    # print(chat_template)
    
    # llm = LlamaCPP(
    #     model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    #     temperature=temperature,
    #     max_new_tokens=max_tokens,
    #     context_window=2048,
    #     generate_kwargs={
    #         "top_k": 50,
    #         "top_p": top_p,
    #         "repeat_penalty": 1.3
    #     },
    #     model_kwargs={
    #         "n_threads": 0,
    #         "n_gpu_layers": 33
    #     },
    #     messages_to_prompt=messages_to_prompt,
    #     completion_to_prompt=completion_to_prompt,
    #     verbose=True,
    # )
    # # response = ""
    # # for chunk in llm.stream_complete(message):
    # #     print(chunk.delta, end="", flush=True)
    # #     response += str(chunk.delta)
    # #     yield response
    # outputs = []
    # for chunk in llm.stream_complete(message):
    #     outputs.append(chunk.delta)
    #     if chunk.delta in stop_tokens:
    #         break
    #     yield "".join(outputs)

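# Gradio chat UI; the additional inputs feed the system prompt and sampling parameters into respond().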
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()