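# Gradio chat demo for Meta-Llama-3-70B-Instruct (Q3_K_M GGUF), served with
# llama-cpp-python through llama-cpp-agent on a Hugging Face ZeroGPU Space.
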
import spaces
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
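
# Install the CUDA 12.4 build of llama-cpp-python and the agent framework at
# runtime, so the prebuilt wheel matches the GPU host (a common pattern on
# ZeroGPU Spaces, where GPU-specific wheels cannot simply go in requirements.txt).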
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)

# Fetch the quantized GGUF weights from the Hugging Face Hub into ./models.
hf_hub_download(
    repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
    filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
    local_dir="./models",
)

# ZeroGPU: attach a GPU to this function for up to 120 seconds per call.
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Deferred imports: these packages are installed by the pip calls above,
    # so they are not importable at module load time.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    llm = Llama(
        model_path="models/Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
        n_gpu_layers=81,  # offload every layer of the 70B model to the GPU
        n_ctx=8192,  # assumption: widen the 512-token default so multi-turn chats fit
    )
    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,  # use the system prompt from the UI textbox
        predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3,
        debug_output=True,
    )
    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    settings.temperature = temperature  # apply the UI sliders; otherwise ignored
    settings.top_p = top_p
    settings.stream = True

    messages = BasicChatHistory()
print("history")
print(history)
for msn in history:
user = {
'role': Roles.user,
'content': msn[0]
}
assistant = {
'role': Roles.assistant,
'content': msn[1]
}
print(user)
print(assistant)
messages.add_message(user)
messages.add_message(assistant)
    # Stream tokens and yield the accumulated text so the chatbot updates live.
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
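
# Chat UI: the additional inputs map, in order, onto respond()'s
# system_message, max_tokens, temperature, and top_p parameters.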
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=4096, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    theme=gr.themes.Soft(
        primary_hue="green",
        secondary_hue="indigo",
        neutral_hue="zinc",
        font=[gr.themes.GoogleFont("Exo 2"), "ui-sans-serif", "system-ui", "sans-serif"],
    ).set(
        block_background_fill_dark="*neutral_800"
    ),
)

if __name__ == "__main__":
    demo.launch()