import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
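
# Install a CUDA 12.4 build of llama-cpp-python plus llama-cpp-agent at startup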
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.8', shell=True)
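
# Download the quantized Mistral-7B-Instruct-v0.2 GGUF weights into ./models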
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir="./models")
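
# respond() runs on the Space's GPU; duration=120 allows up to 120 s of GPU time per call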
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Deferred imports: these packages are installed by the pip calls above
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    # Load the GGUF model, offloading 33 layers to the GPU
    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=33,
    )
    provider = LlamaCppPythonProvider(llm)

    # Use the system message supplied through the UI instead of a hardcoded prompt
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True
    )

    # Wire the UI sampling controls into the provider settings
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.stream = True

    # Replay the Gradio history, a list of (user, assistant) tuples, into the agent's chat history
    messages = BasicChatHistory()
    for user_text, assistant_text in history:
        messages.add_message({'role': Roles.user, 'content': user_text})
        messages.add_message({'role': Roles.assistant, 'content': assistant_text})

    # Stream the reply, conditioning on the rebuilt chat history
    stream = agent.get_chat_response(message, llm_sampling_settings=settings, chat_history=messages, returns_streaming_generator=True, print_output=False)
    # Accumulate streamed tokens and yield the growing reply for live display
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
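
# Gradio chat UI; the extra inputs map to respond()'s system_message, max_tokens, temperature, and top_p arguments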
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
    demo.launch()