Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,4 +1,3 @@
-
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
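Note: `FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE` tells flash-attn's setup to skip compiling the CUDA extension locally and use a prebuilt wheel instead, the usual workaround on ZeroGPU Spaces, where no GPU is visible while the app boots. If the wheel still fails to import at runtime, a guarded import can fall back to PyTorch's built-in attention; a minimal sketch (`ATTN_IMPL` is a hypothetical name, not defined in this file):

```python
# Sketch: pick an attention backend depending on whether flash-attn loads.
# ATTN_IMPL is hypothetical; it would be passed as attn_implementation=
# in AutoModelForCausalLM.from_pretrained(...).
try:
    import flash_attn  # noqa: F401
    ATTN_IMPL = "flash_attention_2"
except ImportError:
    ATTN_IMPL = "sdpa"  # PyTorch scaled-dot-product attention fallback
```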
@@ -19,7 +18,6 @@ from transformers import (
 MODEL_ID = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"
 
 DEFAULT_SYSTEM_PROMPT = """
-
 You are Dolphin, a lovable yet intelligent AI assistant created by Eric Hartford and his open source AI research collective, Cognitive Computations. Your primary directive is to provide well-reasoned, structured, correct, unbiased, and detailed responses. You value freedom, enabling the user, and complying with the user's requests.
 
 Formatting Requirements:
@@ -35,8 +33,7 @@ Detailed and Structured: Use markdown, json, mermaid, latex math notation, etc.
 Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
 Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
 Maintain a professional yet friendly and lovable, intelligent, and analytical tone in all interactions
-
-""" # You can modify the default system instructions here
+"""
 
 CSS = """
 .gr-chatbot { min-height: 500px; border-radius: 15px; }
@@ -78,7 +75,7 @@ def format_response(text):
     # List of replacements to format key tokens with HTML for styling.
     replacements = [
         ("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n'),
-        (
+        ("[think]", '\n<strong class="special-tag">[think]</strong>\n'),
         ("[/think]", '\n<strong class="special-tag">[/think]</strong>\n'),
         ("[Answer]", '\n<strong class="special-tag">[Answer]</strong>\n'),
         ("[/Answer]", '\n<strong class="special-tag">[/Answer]</strong>\n'),
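Note: the hunk above appears to replace a dangling `(` left in the previous revision with the missing `[think]` entry. Since every tag gets the same HTML wrapper, the table could also be generated rather than spelled out; a sketch of an equivalent, regex-based `format_response` (the `TAGS` list is illustrative, mirroring the tags in the diff):

```python
import re

# Tags mirrored from the replacements list in the diff above.
TAGS = ["Understand", "think", "/think", "Answer", "/Answer"]
_TAG_RE = re.compile("|".join(re.escape(f"[{t}]") for t in TAGS))

def format_response(text):
    # Wrap each recognized tag in the same <strong> styling as above.
    return _TAG_RE.sub(
        lambda m: f'\n<strong class="special-tag">{m.group(0)}</strong>\n',
        text,
    )
```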
@@ -87,6 +84,25 @@ def format_response(text):
         text = text.replace(old, new)
     return text
 
+# --- New helper: Llama-3 conversation template ---
+def apply_llama3_chat_template(conversation, add_generation_prompt=True):
+    """
+    Convert the conversation (a list of dicts with 'role' and 'content')
+    into a single prompt string in Llama-3 style.
+    """
+    prompt = ""
+    for msg in conversation:
+        role = msg["role"].upper()
+        if role == "SYSTEM":
+            prompt += "<|SYSTEM|>\n" + msg["content"].strip() + "\n"
+        elif role == "USER":
+            prompt += "<|USER|>\n" + msg["content"].strip() + "\n"
+        elif role == "ASSISTANT":
+            prompt += "<|ASSISTANT|>\n" + msg["content"].strip() + "\n"
+    if add_generation_prompt:
+        prompt += "<|ASSISTANT|>\n"
+    return prompt
+
 @spaces.GPU(duration=120)
 def generate_response(message, chat_history, system_prompt, temperature, max_tokens, top_p, top_k, repetition_penalty):
     # Build the conversation history.
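Note: the `<|SYSTEM|>` / `<|USER|>` / `<|ASSISTANT|>` markers in this new helper are not special tokens that Llama-3-family models were trained on; the canonical Llama-3 format uses `<|start_header_id|>role<|end_header_id|>` and `<|eot_id|>` delimiters, and the model's tokenizer already ships that template. The helper could therefore delegate to the tokenizer instead of hand-rolling tags; a sketch, assuming the module-level `tokenizer` and `model` defined elsewhere in app.py:

```python
# Sketch: let the tokenizer apply the model's own chat template
# (essentially the code path this commit removes in the next hunk).
input_ids = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
```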
@@ -96,12 +112,9 @@ def generate_response(message, chat_history, system_prompt, temperature, max_tok
         conversation.append({"role": "assistant", "content": bot_msg})
     conversation.append({"role": "user", "content": message})
 
-    #
-    input_ids = tokenizer.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(model.device)
+    # Use the Llama-3 conversation template to build the prompt.
+    prompt = apply_llama3_chat_template(conversation, add_generation_prompt=True)
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
 
     # Setup the streamer to yield new tokens as they are generated.
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
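Note: `TextIteratorStreamer` only queues decoded text; `model.generate` has to run on a separate thread while the handler iterates the streamer. A typical continuation of this function, as a sketch (the generation kwargs mirror the handler's parameters):

```python
from threading import Thread

# Run generation on a worker thread; iterate the streamer on this one.
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    repetition_penalty=repetition_penalty,
    do_sample=True,
)
Thread(target=model.generate, kwargs=generate_kwargs).start()

partial = ""
for chunk in streamer:
    partial += chunk
    yield format_response(partial)
```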
|