Llama-3.3-70B-Instruct-abliterated

Sleeping

App Files Files Community

DJStomp committed on Jan 13

Commit

ad24926

verified ·

1 Parent(s): f0f6bff

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -13

app.py CHANGED Viewed

@@ -4,15 +4,24 @@ import transformers
 import torch
 import spaces
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN is not set in environment variables!")
 model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
-pipeline = None
 @spaces.GPU
 def generate_response(
     message,
@@ -22,17 +31,7 @@ def generate_response(
     temperature,
     top_p,
 ):
-    global pipeline
-    if pipeline is None:
-        pipeline = transformers.pipeline(
-            "text-generation",
-            model=model_id,
-            use_auth_token=hf_token,
-            model_kwargs={"torch_dtype": torch.bfloat16},
-            device_map="auto",
-        )
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
         if user_msg:
@@ -40,8 +39,11 @@ def generate_response(
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
     conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
     try:
         outputs = pipeline(
             conversation,
@@ -51,11 +53,13 @@ def generate_response(
         )
         generated_text = outputs[0]["generated_text"]
         response = generated_text.split("\n")[-1].replace("assistant: ", "")
         return response
     except Exception as e:
         return f"Error: {str(e)}"
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[

 import torch
 import spaces
+# Load Hugging Face token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN is not set in environment variables!")
+# Model ID
 model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
+# Initialize the pipeline at startup
+pipeline = transformers.pipeline(
+    "text-generation",
+    model=model_id,
+    use_auth_token=hf_token,
+    model_kwargs={"torch_dtype": torch.bfloat16},  # Optimize memory usage
+    device_map="auto",  # Automatically map to available GPUs
+)
+# Define the inference function with GPU allocation
 @spaces.GPU
 def generate_response(
     message,
     temperature,
     top_p,
 ):
+    # Collect the system prompt, chat history, and new user message as a list of role/content dicts
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
         if user_msg:
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
+    # Format the conversation as a single string
     conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
+    # Generate a response using the preloaded pipeline
     try:
         outputs = pipeline(
             conversation,
         )
         generated_text = outputs[0]["generated_text"]
+        # Extract and return the assistant's response
         response = generated_text.split("\n")[-1].replace("assistant: ", "")
         return response
     except Exception as e:
         return f"Error: {str(e)}"
+# Define the Gradio Chat Interface
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[