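"""Gradio chat Space for Mistral-7B-Instruct-v0.2.

Downloads the Q6_K GGUF model from TheBloke/Mistral-7B-Instruct-v0.2-GGUF and
serves it through llama-cpp-python and llama-cpp-agent, streaming responses
into a gr.ChatInterface. The respond() handler runs on a ZeroGPU worker via
@spaces.GPU.
"""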
import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
# from llama_index.core.llms import ChatMessage, MessageRole
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
#     messages_to_prompt,
#     completion_to_prompt,
# )
# from llama_index.core.memory import ChatMemoryBuffer
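
# llama-cpp-python is installed at startup from the prebuilt CUDA 12.4 wheel index,
# along with llama-cpp-agent, before either is imported inside respond().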
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent', shell=True)
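# Fetch the quantized GGUF weights into ./models; respond() loads them from this relative path.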
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # llama-cpp imports are deferred to the GPU-decorated handler; the
    # packages are installed at runtime above.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider

    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=33,
    )
    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    # Apply the sampling controls exposed as additional inputs in the UI.
    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p
    settings.stream = True

    # Stream the completion and yield the accumulated text so the
    # ChatInterface updates incrementally.
    stream = agent.get_chat_response(
        message, llm_sampling_settings=settings, returns_streaming_generator=True
    )
    outputs = ""
    for output in stream:
        print(output)  # debug: raw streaming chunks
        if "content" in output["choices"][0]["delta"]:
            outputs += output["choices"][0]["delta"]["content"]
            yield outputs
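
    # ------------------------------------------------------------------
    # Earlier implementations kept for reference (commented out):
    # a llama-cpp-agent token-streaming loop, and a LlamaIndex LlamaCPP
    # variant that builds a raw Mistral [INST] prompt.
    # ------------------------------------------------------------------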
    # from llama_cpp import Llama
    # from llama_cpp_agent import LlamaCppAgent
    # from llama_cpp_agent import MessagesFormatterType
    # from llama_cpp_agent.providers import LlamaCppPythonProvider
    # llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)
    # provider = LlamaCppPythonProvider(llama_model)
    # agent = LlamaCppAgent(
    #     provider,
    #     system_prompt=f"{system_message}",
    #     predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
    #     debug_output=True
    # )
    # settings = provider.get_provider_default_settings()
    # settings.stream = True
    # settings.max_tokens = max_tokens
    # settings.temperature = temperature
    # settings.top_p = top_p
    # partial_message = ""
    # for new_token in agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True):
    #     partial_message += new_token
    #     if '<|im_end|>' in partial_message:
    #         break
    #     yield partial_message

    # stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
    # chat_template = '<s>[INST] ' + system_message
    # # for human, assistant in history:
    # #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
    # chat_template += ' ' + message + ' [/INST]'
    # print(chat_template)
    # llm = LlamaCPP(
    #     model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    #     temperature=temperature,
    #     max_new_tokens=max_tokens,
    #     context_window=2048,
    #     generate_kwargs={
    #         "top_k": 50,
    #         "top_p": top_p,
    #         "repeat_penalty": 1.3
    #     },
    #     model_kwargs={
    #         "n_threads": 0,
    #         "n_gpu_layers": 33
    #     },
    #     messages_to_prompt=messages_to_prompt,
    #     completion_to_prompt=completion_to_prompt,
    #     verbose=True,
    # )
    # # response = ""
    # # for chunk in llm.stream_complete(message):
    # #     print(chunk.delta, end="", flush=True)
    # #     response += str(chunk.delta)
    # #     yield response
    # outputs = []
    # for chunk in llm.stream_complete(message):
    #     outputs.append(chunk.delta)
    #     if chunk.delta in stop_tokens:
    #         break
    #     yield "".join(outputs)
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
    demo.launch()