Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,4 +1,3 @@
-
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
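Note: `FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE` tells flash-attn's setup to skip compiling the CUDA extension locally and use a prebuilt wheel instead, the usual workaround on ZeroGPU Spaces, where no GPU is visible while the app boots. If the wheel still fails to import at runtime, a guarded import can fall back to PyTorch's built-in attention; a minimal sketch (`ATTN_IMPL` is a hypothetical name, not defined in this file):

```python
# Sketch: pick an attention backend depending on whether flash-attn loads.
# ATTN_IMPL is hypothetical; it would be passed as attn_implementation=
# in AutoModelForCausalLM.from_pretrained(...).
try:
    import flash_attn  # noqa: F401
    ATTN_IMPL = "flash_attention_2"
except ImportError:
    ATTN_IMPL = "sdpa"  # PyTorch scaled-dot-product attention fallback
```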
@@ -19,7 +18,6 @@ from transformers import (
 MODEL_ID = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"
 
 DEFAULT_SYSTEM_PROMPT = """
-
 You are Dolphin, a lovable yet intelligent AI assistant created by Eric Hartford and his open source AI research collective, Cognitive Computations. Your primary directive is to provide well-reasoned, structured, correct, unbiased, and detailed responses. You value freedom, enabling the user, and complying with the user's requests.
 
 Formatting Requirements:
@@ -35,8 +33,7 @@ Detailed and Structured: Use markdown, json, mermaid, latex math notation, etc.
 Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
 Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
 Maintain a professional yet friendly and lovable, intelligent, and analytical tone in all interactions
-
-""" # You can modify the default system instructions here
+"""
 
 CSS = """
 .gr-chatbot { min-height: 500px; border-radius: 15px; }
@@ -78,7 +75,7 @@ def format_response(text):
     # List of replacements to format key tokens with HTML for styling.
     replacements = [
         ("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n'),
-        (
+        ("[think]", '\n<strong class="special-tag">[think]</strong>\n'),
         ("[/think]", '\n<strong class="special-tag">[/think]</strong>\n'),
         ("[Answer]", '\n<strong class="special-tag">[Answer]</strong>\n'),
         ("[/Answer]", '\n<strong class="special-tag">[/Answer]</strong>\n'),
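Note: the hunk above appears to replace a dangling `(` left in the previous revision with the missing `[think]` entry. Since every tag gets the same HTML wrapper, the table could also be generated rather than spelled out; a sketch of an equivalent, regex-based `format_response` (the `TAGS` list is illustrative, mirroring the tags in the diff):

```python
import re

# Tags mirrored from the replacements list in the diff above.
TAGS = ["Understand", "think", "/think", "Answer", "/Answer"]
_TAG_RE = re.compile("|".join(re.escape(f"[{t}]") for t in TAGS))

def format_response(text):
    # Wrap each recognized tag in the same <strong> styling as above.
    return _TAG_RE.sub(
        lambda m: f'\n<strong class="special-tag">{m.group(0)}</strong>\n',
        text,
    )
```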
@@ -87,6 +84,25 @@ def format_response(text):
         text = text.replace(old, new)
     return text
 
+# --- New helper: Llama-3 conversation template ---
+def apply_llama3_chat_template(conversation, add_generation_prompt=True):
+    """
+    Convert the conversation (a list of dicts with 'role' and 'content')
+    into a single prompt string in Llama-3 style.
+    """
+    prompt = ""
+    for msg in conversation:
+        role = msg["role"].upper()
+        if role == "SYSTEM":
+            prompt += "<|SYSTEM|>\n" + msg["content"].strip() + "\n"
+        elif role == "USER":
+            prompt += "<|USER|>\n" + msg["content"].strip() + "\n"
+        elif role == "ASSISTANT":
+            prompt += "<|ASSISTANT|>\n" + msg["content"].strip() + "\n"
+    if add_generation_prompt:
+        prompt += "<|ASSISTANT|>\n"
+    return prompt
+
 @spaces.GPU(duration=120)
 def generate_response(message, chat_history, system_prompt, temperature, max_tokens, top_p, top_k, repetition_penalty):
     # Build the conversation history.
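Note: the `<|SYSTEM|>` / `<|USER|>` / `<|ASSISTANT|>` markers in this new helper are not special tokens that Llama-3-family models were trained on; the canonical Llama-3 format uses `<|start_header_id|>role<|end_header_id|>` and `<|eot_id|>` delimiters, and the model's tokenizer already ships that template. The helper could therefore delegate to the tokenizer instead of hand-rolling tags; a sketch, assuming the module-level `tokenizer` and `model` defined elsewhere in app.py:

```python
# Sketch: let the tokenizer apply the model's own chat template
# (essentially the code path this commit removes in the next hunk).
input_ids = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
```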
@@ -96,12 +112,9 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         conversation.append({"role": "assistant", "content": bot_msg})
     conversation.append({"role": "user", "content": message})
 
-    #
-    input_ids = tokenizer.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(model.device)
+    # Use the Llama-3 conversation template to build the prompt.
+    prompt = apply_llama3_chat_template(conversation, add_generation_prompt=True)
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
 
     # Setup the streamer to yield new tokens as they are generated.
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
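Note: `TextIteratorStreamer` only queues decoded text; `model.generate` has to run on a separate thread while the handler iterates the streamer. A typical continuation of this function, as a sketch (the generation kwargs mirror the handler's parameters):

```python
from threading import Thread

# Run generation on a worker thread; iterate the streamer on this one.
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    repetition_penalty=repetition_penalty,
    do_sample=True,
)
Thread(target=model.generate, kwargs=generate_kwargs).start()

partial = ""
for chunk in streamer:
    partial += chunk
    yield format_response(partial)
```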
|