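# Gradio chat demo: Mistral-7B-Instruct-v0.2 (Q6_K GGUF) served with llama-cpp-python.
# The spaces.GPU decorator below targets Hugging Face ZeroGPU hardware; alternative
# backends (llama-cpp-agent, LlamaIndex's LlamaCPP wrapper) are kept below as
# commented-out variants.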
import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
# from llama_index.core.llms import ChatMessage, MessageRole
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
# messages_to_prompt,
# completion_to_prompt,
# )
# from llama_index.core.memory import ChatMemoryBuffer
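# Runtime installs: a prebuilt CUDA 12.4 wheel of llama-cpp-python from abetlen's
# wheel index, plus llama-cpp-agent (only needed by the commented-out agent-based
# variant further down).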
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent', shell=True)
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir="./models")
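# The Q6_K GGUF weights are fetched once at startup into ./models. The @spaces.GPU
# decorator below attaches a GPU for each call to respond() (up to 120 seconds per
# call), so the model itself is loaded lazily inside the handler rather than at
# import time.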
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # llama_cpp is imported inside the GPU-decorated handler so the wheel installed
    # above is used and the model is created while a GPU is attached.
    from llama_cpp import Llama
    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=-1,  # offload all layers to the GPU; the commented-out variants below use 33
        n_ctx=8192,  # same context window as the commented-out variants
    )
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": message},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    # Accumulate the streamed deltas into a growing reply; each yield refreshes
    # the chat bubble shown by gr.ChatInterface.
    response = ""
    for output in stream:
        delta = output["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response
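    # Sketch (not wired in): prior turns from `history` could be folded into the
    # same messages list before the latest user message, e.g.:
    #
    #     messages = [{"role": "system", "content": system_message}]
    #     for user_turn, assistant_turn in history:
    #         messages.append({"role": "user", "content": user_turn})
    #         messages.append({"role": "assistant", "content": assistant_turn})
    #     messages.append({"role": "user", "content": message})
    #
    # This mirrors the commented-out history loop in the chat-template variant below.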
# from llama_cpp import Llama
# from llama_cpp_agent import LlamaCppAgent
# from llama_cpp_agent import MessagesFormatterType
# from llama_cpp_agent.providers import LlamaCppPythonProvider
# llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)
# provider = LlamaCppPythonProvider(llama_model)
# agent = LlamaCppAgent(
# provider,
# system_prompt=f"{system_message}",
# predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
# debug_output=True
# )
# settings = provider.get_provider_default_settings()
# settings.stream = True
# settings.max_tokens = max_tokens
# settings.temperature = temperature
# settings.top_p = top_p
# partial_message = ""
# for new_token in agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True):
# partial_message += new_token
# if '<|im_end|>' in partial_message:
# break
# yield partial_message
# stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
# chat_template = '<s>[INST] ' + system_message
# # for human, assistant in history:
# # chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
# chat_template += ' ' + message + ' [/INST]'
# print(chat_template)
# llm = LlamaCPP(
# model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
# temperature=temperature,
# max_new_tokens=max_tokens,
# context_window=2048,
# generate_kwargs={
# "top_k": 50,
# "top_p": top_p,
# "repeat_penalty": 1.3
# },
# model_kwargs={
# "n_threads": 0,
# "n_gpu_layers": 33
# },
# messages_to_prompt=messages_to_prompt,
# completion_to_prompt=completion_to_prompt,
# verbose=True,
# )
# # response = ""
# # for chunk in llm.stream_complete(message):
# # print(chunk.delta, end="", flush=True)
# # response += str(chunk.delta)
# # yield response
# outputs = []
# for chunk in llm.stream_complete(message):
# outputs.append(chunk.delta)
# if chunk.delta in stop_tokens:
# break
# yield "".join(outputs)
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
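# Note: generator handlers like respond() stream their output through Gradio's
# queue, which recent Gradio releases enable by default.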
if __name__ == "__main__":
    demo.launch()