Update app.py
app.py CHANGED

@@ -7,6 +7,15 @@ from llama_cpp_agent import LlamaCppAgent
 from llama_cpp_agent import MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 
+from llama_index.core.llms import ChatMessage, MessageRole
+from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llama_cpp.llama_utils import (
+    messages_to_prompt,
+    completion_to_prompt,
+)
+from llama_index.storage.chat_store.redis import RedisChatStore
+from llama_index.core.memory import ChatMemoryBuffer
+
 subprocess.run('pip install llama-cpp-python==0.2.75 --no-build-isolation --no-cache-dir --upgrade --only-binary=:all: --extra-index-url=https://abetlen.github.io/llama-cpp-python/whl/cu124', env={'CMAKE_ARGS': "-DLLAMA_CUDA=on"}, shell=True)
 
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir = "./models")
@@ -20,24 +29,37 @@ def respond(
     temperature,
     top_p,
 ):
-
-
-
+    stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+    chat_template = '<s>[INST] ' + system_prompt
+    for human, assistant in history:
+        chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
+    chat_template += ' ' + message + ' [/INST]'
 
-
-
-
-
-
+    print(chat_template)
+
+    llm = LlamaCPP(
+        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
+        temperature=temperature,
+        max_new_tokens=max_tokens,
+        context_window=8192,
+        generate_kwargs={
+            "top_k": 50,
+            "top_p": top_p,
+            "repeat_penalty": 1.3
+        },
+        model_kwargs={
+            "n_threads": 0,
+            "n_gpu_layers": 33
+        },
+        messages_to_prompt=messages_to_prompt,
+        completion_to_prompt=completion_to_prompt,
+        verbose=True,
     )
-
-
-
-
-
-    settings.top_p = top_p
-
-    yield agent.get_chat_response(message, llm_sampling_settings=settings)
+    response = ""
+    for chunk in llm.stream_chat(chat_template):
+        print(chunk.delta, end="", flush=True)
+        response += str(chunk.delta)
+        yield response
 
 demo = gr.ChatInterface(
     respond,
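A note on the new streaming loop: in llama_index, LlamaCPP.stream_chat() expects a sequence of ChatMessage objects rather than a pre-rendered prompt string, so the hand-built chat_template would normally be sent through stream_complete() instead. The sketch below is a hypothetical variant (not part of this commit) that uses the ChatMessage / MessageRole imports added above and the same respond() arguments:

    # Hypothetical rewrite of the tail of respond(): let messages_to_prompt
    # render the Mistral [INST] template instead of concatenating it by hand.
    messages = [ChatMessage(role=MessageRole.SYSTEM, content=system_prompt)]
    for human, assistant in history:
        messages.append(ChatMessage(role=MessageRole.USER, content=human))
        messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=assistant))
    messages.append(ChatMessage(role=MessageRole.USER, content=message))

    response = ""
    for chunk in llm.stream_chat(messages):
        response += str(chunk.delta)  # chunk.delta is the newly generated text
        yield response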
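RedisChatStore and ChatMemoryBuffer are imported in this commit but not referenced yet. For reference, a minimal sketch of how they are typically wired together in llama_index; the Redis URL, TTL, token limit, and store key below are placeholder values, not taken from this commit:

# Sketch: persist chat history in Redis, capped by a token-limited memory buffer.
chat_store = RedisChatStore(redis_url="redis://localhost:6379", ttl=3600)  # placeholder URL and TTL
memory = ChatMemoryBuffer.from_defaults(
    token_limit=3000,               # placeholder limit
    chat_store=chat_store,
    chat_store_key="user_session",  # placeholder key
)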