OscarFAI committed
Commit 9577ec2 · 1 Parent(s): 69d0c7f
Files changed (1):
  1. app.py +31 -16
app.py CHANGED

@@ -9,7 +9,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 DESCRIPTION = '''
 <div>
-<h1 style="text-align: center;">Mistral Chat</h1>
+<h1 style="text-align: center;">Mistral 8B Instruct</h1>
 </div>
 '''
 
@@ -20,7 +20,7 @@ LICENSE = """
 
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepSeek-R1-Distill-Llama-8B</h1>
+   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Mistral-8B</h1>
    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
 </div>
 """
@@ -49,17 +49,19 @@ terminators = [
 ]
 
 @spaces.GPU(duration=120)
-def chat_llama3_8b(message: str,
-                   history: list,
-                   temperature: float,
-                   max_new_tokens: int,
-                   system_prompt: str) -> str:
+def chat_mistral(message: str,
+                 history: list,
+                 temperature: float,
+                 top_p: float,
+                 max_new_tokens: int,
+                 system_prompt: str) -> str:
     """
     Generate a streaming response using the Mistral-8B model.
     Args:
         message (str): The input message.
         history (list): The conversation history used by ChatInterface.
         temperature (float): The temperature for generating the response.
+        top_p (float): The top-p (nucleus) sampling parameter.
         max_new_tokens (int): The maximum number of new tokens to generate.
         system_prompt (str): The system prompt to guide the assistant's behavior.
     Returns:
@@ -67,14 +69,24 @@ def chat_llama3_8b(message: str,
     """
     conversation = []
 
-    # Include system prompt at the beginning if provided
+    # Format system prompt correctly using [INST]
     if system_prompt:
-        conversation.append({"role": "system", "content": system_prompt})
-
-    for user, assistant in history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-
-    conversation.append({"role": "user", "content": message})
+        formatted_prompt = f"[INST] {system_prompt} [/INST]\n\n"
+    else:
+        formatted_prompt = ""
+
+    # Modify first user message to include system prompt
+    if history:
+        first_user_msg = f"{formatted_prompt}{history[0][0]}" if formatted_prompt else history[0][0]
+        conversation.append({"role": "user", "content": first_user_msg})
+        conversation.append({"role": "assistant", "content": history[0][1]})
+
+        for user, assistant in history[1:]:
+            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    else:
+        # First message in a new conversation
+        first_message = f"{formatted_prompt}{message}" if formatted_prompt else message
+        conversation.append({"role": "user", "content": first_message})
 
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
 
@@ -86,6 +98,7 @@ def chat_llama3_8b(message: str,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
+        top_p=top_p,
         eos_token_id=terminators,
     )
 
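For context, the keyword arguments in the hunk above (max_new_tokens, do_sample, temperature, top_p, eos_token_id) are the usual inputs to the transformers streaming pattern built on TextIteratorStreamer and a background generation thread. Below is a minimal sketch of that pattern; the stream_chat name is illustrative, and model, tokenizer, input_ids, and terminators are assumed to be the objects defined elsewhere in app.py. It is a sketch, not code from this commit.

from threading import Thread

from transformers import TextIteratorStreamer


def stream_chat(model, tokenizer, input_ids, temperature, top_p, max_new_tokens, terminators):
    """Yield the partially generated reply as new tokens arrive (illustrative sketch)."""
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=terminators,
    )
    # Run generation in a worker thread so this generator can yield while tokens stream in.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    partial = []
    for chunk in streamer:
        partial.append(chunk)
        yield "".join(partial)  # gr.ChatInterface renders each partial string it receives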
 
@@ -115,19 +128,21 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     )
 
     gr.ChatInterface(
-        fn=chat_llama3_8b,
+        fn=chat_mistral,
         chatbot=chatbot,
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             system_prompt_input,
             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False),
+            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Top-p", render=False),
             gr.Slider(minimum=128, maximum=4096, step=1, value=4096, label="Max new tokens", render=False),
         ],
         examples=[
             ['Are you a sentient being?']
         ],
-        cache_examples=False
+        cache_examples=False,
+        type='messages',
     )
 
 if __name__ == "__main__":
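A note on the new type='messages' argument in the hunk above: with it, gr.ChatInterface passes history as a list of {"role": ..., "content": ...} dicts rather than (user, assistant) tuples. Below is a minimal sketch of assembling the model conversation from that shape, with the [INST]-formatted system prompt folded into the first user turn; the build_conversation name is illustrative and not part of this commit.

def build_conversation(message: str, history: list, system_prompt: str) -> list:
    """Assemble a role/content conversation from messages-style Gradio history (sketch)."""
    prefix = f"[INST] {system_prompt} [/INST]\n\n" if system_prompt else ""

    # With type='messages', each history entry already carries "role" and "content" keys.
    conversation = [{"role": turn["role"], "content": turn["content"]} for turn in history]

    # Prepend the formatted system prompt to the first user turn, if there is one.
    for turn in conversation:
        if turn["role"] == "user":
            turn["content"] = f"{prefix}{turn['content']}"
            break
    else:
        # New conversation: attach the prefix to the incoming message instead.
        message = f"{prefix}{message}"

    conversation.append({"role": "user", "content": message})
    return conversation

The resulting list can be passed straight to tokenizer.apply_chat_template(conversation, return_tensors="pt"), as app.py already does.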