import spaces
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download


# from llama_index.core.llms import ChatMessage, MessageRole
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
#     messages_to_prompt,
#     completion_to_prompt,
# )
# from llama_index.core.memory import ChatMemoryBuffer

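# Install the CUDA 12.4 wheel of llama-cpp-python and the llama-cpp-agent package at
# runtime; llama_cpp is imported lazily inside respond() so this install runs first.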
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent', shell=True)

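# Fetch the Q6_K quantized Mistral-7B-Instruct-v0.2 GGUF weights into ./models at startup.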
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir="./models")

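# ZeroGPU: each call to respond() may hold a GPU for up to 120 seconds. Gradio's
# ChatInterface calls this function with the user message, the chat history, and the
# values of the additional_inputs defined below, in that order.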
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    from llama_cpp import Llama  # imported lazily so the runtime pip install above takes effect

    # Load the quantized model; n_gpu_layers=-1 offloads all layers to the allocated GPU.
    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=-1,
        n_ctx=8192,
    )

    # Rebuild the conversation from the ChatInterface history, then append the new user turn.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )

    # Accumulate streamed deltas and yield the growing reply so Gradio renders it incrementally.
    partial_message = ""
    for output in stream:
        delta = output["choices"][0]["delta"]
        if "content" in delta:
            partial_message += delta["content"]
            yield partial_message
    # from llama_cpp import Llama
    # from llama_cpp_agent import LlamaCppAgent
    # from llama_cpp_agent import MessagesFormatterType
    # from llama_cpp_agent.providers import LlamaCppPythonProvider
    
    # llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)

    # provider = LlamaCppPythonProvider(llama_model)

    # agent = LlamaCppAgent(
    #   provider,
    #   system_prompt=f"{system_message}",
    #   predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
    #   debug_output=True
    # )

    # settings = provider.get_provider_default_settings()
    # settings.stream = True
    # settings.max_tokens = max_tokens
    # settings.temperature = temperature
    # settings.top_p = top_p
    # partial_message = ""
    # for new_token in agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True):
    #     partial_message += new_token
    #     if '<|im_end|>' in partial_message:
    #         break
    #     yield partial_message
    
    # stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
    # chat_template = '<s>[INST] ' + system_message
    # # for human, assistant in history:
    # #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
    # chat_template += ' ' + message + ' [/INST]'

    # print(chat_template)
    
    # llm = LlamaCPP(
    #     model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    #     temperature=temperature,
    #     max_new_tokens=max_tokens,
    #     context_window=2048,
    #     generate_kwargs={
    #         "top_k": 50,
    #         "top_p": top_p,
    #         "repeat_penalty": 1.3
    #     },
    #     model_kwargs={
    #         "n_threads": 0,
    #         "n_gpu_layers": 33
    #     },
    #     messages_to_prompt=messages_to_prompt,
    #     completion_to_prompt=completion_to_prompt,
    #     verbose=True,
    # )
    # # response = ""
    # # for chunk in llm.stream_complete(message):
    # #     print(chunk.delta, end="", flush=True)
    # #     response += str(chunk.delta)
    # #     yield response
    # outputs = []
    # for chunk in llm.stream_complete(message):
    #     outputs.append(chunk.delta)
    #     if chunk.delta in stop_tokens:
    #         break
    #     yield "".join(outputs)

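# Chat UI: the extra controls map, in order, onto respond()'s system_message,
# max_tokens, temperature, and top_p parameters.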
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()