Spaces:

OscarFAI
/

inference

Running on Zero

App Files Files Community

OscarFAI committed 12 days ago

Commit

47dded2

1 Parent(s): f1d7efb

Llama

Browse files

Files changed (1) hide show

app.py +19 -48

app.py CHANGED Viewed

@@ -3,14 +3,13 @@ import os
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
-import torch
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DESCRIPTION = '''
 <div>
-<h1 style="text-align: center;">Mistral 8B Instruct</h1>
 </div>
 '''
@@ -21,7 +20,7 @@ LICENSE = """
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Mistral-8B</h1>
    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
 </div>
 """
@@ -41,12 +40,8 @@ h1 {
 """
 # Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("mistralai/Ministral-8B-Instruct-2410")
-model = AutoModelForCausalLM.from_pretrained("mistralai/Ministral-8B-Instruct-2410", device_map="auto")
-# Ensure we have a pad token
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
 terminators = [
     tokenizer.eos_token_id,
@@ -54,19 +49,17 @@ terminators = [
 ]
 @spaces.GPU(duration=120)
-def chat_mistral(message: str,
-                 history: list,
-                 temperature: float,
-                 top_p: float,
-                 max_new_tokens: int,
-                 system_prompt: str) -> str:
     """
     Generate a streaming response using the Mistral-8B model.
     Args:
         message (str): The input message.
         history (list): The conversation history used by ChatInterface.
         temperature (float): The temperature for generating the response.
-        top_p (float): The top-p (nucleus) sampling parameter.
         max_new_tokens (int): The maximum number of new tokens to generate.
         system_prompt (str): The system prompt to guide the assistant's behavior.
     Returns:
@@ -74,42 +67,25 @@ def chat_mistral(message: str,
     """
     conversation = []
-    # Format system prompt correctly using [INST]
     if system_prompt:
-        formatted_prompt = f"[INST] {system_prompt} [/INST]\n\n"
-    else:
-        formatted_prompt = ""
-    # Modify first user message to include system prompt
-    if history:
-        first_user_msg = f"{formatted_prompt}{history[0][0]}" if formatted_prompt else history[0][0]
-        conversation.append({"role": "user", "content": first_user_msg})
-        conversation.append({"role": "assistant", "content": history[0][1]})
-        for user, assistant in history[1:]:
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    else:
-        # First message in a new conversation
-        first_message = f"{formatted_prompt}{message}" if formatted_prompt else message
-        conversation.append({"role": "user", "content": first_message})
-    # Tokenize with padding and attention mask
-    input_data = tokenizer.apply_chat_template(conversation, return_tensors="pt", padding=True, truncation=True)
-    input_ids = input_data.to(model.device)
-    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(dtype=torch.long, device=model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
-        attention_mask=attention_mask,  # Fixes the warning
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-        top_p=top_p,
-        pad_token_id=tokenizer.pad_token_id,  # Explicitly set
         eos_token_id=terminators,
     )
@@ -139,22 +115,17 @@ with gr.Blocks(fill_height=True, css=css) as demo:
     )
     gr.ChatInterface(
-        fn=chat_mistral,
         chatbot=chatbot,
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             system_prompt_input,
             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False),
-            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Top-p", render=False),
             gr.Slider(minimum=128, maximum=4096, step=1, value=4096, label="Max new tokens", render=False),
         ],
         examples=[
-            ['How to setup a human base on Mars? Give short answer.'],
-            ['Explain theory of relativity to me like I’m 8 years old.'],
-            ['What is 9,000 * 9,000?'],
-            ['Write a pun-filled happy birthday message to my friend Alex.'],
-            ['Justify why a penguin might make a good king of the jungle.']
         ],
         cache_examples=False
     )

 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DESCRIPTION = '''
 <div>
+<h1 style="text-align: center;">Mistral Chat</h1>
 </div>
 '''
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Mistral Chat 8B</h1>
    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
 </div>
 """
 """
 # Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("Orenguteng/Llama-3-8B-Lexi-Uncensored")
+model = AutoModelForCausalLM.from_pretrained("Orenguteng/Llama-3-8B-Lexi-Uncensored", device_map="auto")
 terminators = [
     tokenizer.eos_token_id,
 ]
 @spaces.GPU(duration=120)
+def chat_llama3_8b(message: str,
+                    history: list,
+                    temperature: float,
+                    max_new_tokens: int,
+                    system_prompt: str) -> str:
     """
     Generate a streaming response using the Mistral-8B model.
     Args:
         message (str): The input message.
         history (list): The conversation history used by ChatInterface.
         temperature (float): The temperature for generating the response.
         max_new_tokens (int): The maximum number of new tokens to generate.
         system_prompt (str): The system prompt to guide the assistant's behavior.
     Returns:
     """
     conversation = []
+    # Include system prompt at the beginning if provided
     if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         eos_token_id=terminators,
     )
     )
     gr.ChatInterface(
+        fn=chat_llama3_8b,
         chatbot=chatbot,
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             system_prompt_input,
             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature", render=False),
             gr.Slider(minimum=128, maximum=4096, step=1, value=4096, label="Max new tokens", render=False),
         ],
         examples=[
+            ['Are you a sentient being?']
         ],
         cache_examples=False
     )