pabloce committed
Commit ae70ddf · verified · 1 Parent(s): a786baa

Update app.py

Files changed (1):
  1. app.py  +38 -16
app.py CHANGED
@@ -7,6 +7,15 @@ from llama_cpp_agent import LlamaCppAgent
 from llama_cpp_agent import MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 
+from llama_index.core.llms import ChatMessage, MessageRole
+from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llama_cpp.llama_utils import (
+    messages_to_prompt,
+    completion_to_prompt,
+)
+from llama_index.storage.chat_store.redis import RedisChatStore
+from llama_index.core.memory import ChatMemoryBuffer
+
 subprocess.run('pip install llama-cpp-python==0.2.75 --no-build-isolation --no-cache-dir --upgrade --only-binary=:all: --extra-index-url=https://abetlen.github.io/llama-cpp-python/whl/cu124', env={'CMAKE_ARGS': "-DLLAMA_CUDA=on"}, shell=True)
 
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf", local_dir = "./models")
@@ -20,24 +29,37 @@ def respond(
     temperature,
     top_p,
 ):
-    llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)
-
-    provider = LlamaCppPythonProvider(llama_model)
+    stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+    chat_template = '<s>[INST] ' + system_message
+    for human, assistant in history:
+        chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
+    chat_template += ' ' + message + ' [/INST]'
 
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
-        debug_output=True
+    print(chat_template)
+
+    llm = LlamaCPP(
+        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
+        temperature=temperature,
+        max_new_tokens=max_tokens,
+        context_window=8192,
+        generate_kwargs={
+            "top_k": 50,
+            "top_p": top_p,
+            "repeat_penalty": 1.3
+        },
+        model_kwargs={
+            "n_threads": 0,
+            "n_gpu_layers": 33
+        },
+        messages_to_prompt=messages_to_prompt,
+        completion_to_prompt=completion_to_prompt,
+        verbose=True,
     )
-
-    settings = provider.get_provider_default_settings()
-    settings.stream = True
-    settings.max_tokens = max_tokens
-    settings.temperature = temperature
-    settings.top_p = top_p
-
-    yield agent.get_chat_response(message, llm_sampling_settings=settings)
+    response = ""
+    for chunk in llm.stream_complete(chat_template):
+        print(chunk.delta, end="", flush=True)
+        response += str(chunk.delta)
+        yield response
 
 demo = gr.ChatInterface(
     respond,
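
For reference, a minimal, self-contained sketch of how a streaming generator like the new respond() plugs into gr.ChatInterface. The function signature, the additional_inputs wiring, and the slider defaults below are assumptions for illustration (they are not visible in this diff); only the Mistral [INST] prompt construction and the LlamaCPP streaming loop mirror the committed code.

# Sketch only: assumes gr.ChatInterface passes (message, history, system_message,
# max_tokens, temperature, top_p) in this order via additional_inputs, and that the
# GGUF file has already been downloaded to models/.
import gradio as gr
from llama_index.llms.llama_cpp import LlamaCPP

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Build the Mistral-instruct prompt: <s>[INST] system + user turn [/INST] reply</s>[INST] ...
    prompt = '<s>[INST] ' + system_message
    for human, assistant in history:
        prompt += human + ' [/INST] ' + assistant + '</s>[INST]'
    prompt += ' ' + message + ' [/INST]'

    llm = LlamaCPP(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        temperature=temperature,
        max_new_tokens=max_tokens,
        context_window=8192,
        generate_kwargs={"top_p": top_p},
        model_kwargs={"n_gpu_layers": 33},
    )

    # stream_complete() yields CompletionResponse chunks; accumulating .delta and
    # yielding the running string lets the Gradio chat box update incrementally.
    response = ""
    for chunk in llm.stream_complete(prompt):
        response += chunk.delta
        yield response

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()

The sketch deliberately omits the messages_to_prompt/completion_to_prompt hooks that the commit passes to LlamaCPP: the llama_utils versions apply their own Llama-2-style [INST] wrapping, and with a hand-built prompt string that wrapping would be applied a second time.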