DJStomp committed
Commit f0f6bff · verified · 1 Parent(s): 491841d

Update app.py

Files changed (1)
  1. app.py +12 -21
app.py CHANGED

@@ -4,29 +4,15 @@ import transformers
 import torch
 import spaces
 
-# Load Hugging Face token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN is not set in environment variables!")
 
-# Model ID
 model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
 
-# Load the model pipeline with ZeroGPU compatibility
-@spaces.GPU
-def load_pipeline():
-    return transformers.pipeline(
-        "text-generation",
-        model=model_id,
-        use_auth_token=hf_token,  # Pass the HF token
-        model_kwargs={"torch_dtype": torch.bfloat16},  # Use optimized dtype
-        device_map="auto",  # Automatically map across GPUs
-    )
+pipeline = None
 
-# Initialize the pipeline once
-pipeline = load_pipeline()
 
-# Define the function for response generation
 @spaces.GPU
 def generate_response(
     message,
@@ -36,7 +22,17 @@ def generate_response(
     temperature,
     top_p,
 ):
-    # Combine system, history, and user messages into a formatted input string
+    global pipeline
+
+    if pipeline is None:
+        pipeline = transformers.pipeline(
+            "text-generation",
+            model=model_id,
+            use_auth_token=hf_token,
+            model_kwargs={"torch_dtype": torch.bfloat16},
+            device_map="auto",
+        )
+
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
         if user_msg:
@@ -44,11 +40,8 @@
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-
-    # Format the conversation as a single string
     conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
 
-    # Generate a response using the pipeline
     try:
         outputs = pipeline(
             conversation,
@@ -58,13 +51,11 @@
         )
         generated_text = outputs[0]["generated_text"]
 
-        # Extract and return the assistant's response
        response = generated_text.split("\n")[-1].replace("assistant: ", "")
        return response
    except Exception as e:
        return f"Error: {str(e)}"
 
-# Define the Gradio Chat Interface
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[
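
The substance of the change: the pipeline is no longer built at import time through a @spaces.GPU-decorated load_pipeline() helper, but lazily on the first request, inside the @spaces.GPU-decorated handler itself. On ZeroGPU Spaces a GPU is attached only while a @spaces.GPU function is executing, so deferring the model load keeps startup light and ensures the load runs in a GPU-backed context. A minimal sketch of the pattern in isolation (names like generate and _pipeline are illustrative; token= is used here because recent transformers releases deprecate use_auth_token=):

import os

import spaces        # Hugging Face ZeroGPU helper package
import torch
import transformers

MODEL_ID = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"

_pipeline = None  # deliberately not created at import time


@spaces.GPU
def generate(prompt: str) -> str:
    # Runs only while ZeroGPU has a GPU attached, so the one-time
    # model load below happens in a GPU-backed context.
    global _pipeline
    if _pipeline is None:
        _pipeline = transformers.pipeline(
            "text-generation",
            model=MODEL_ID,
            token=os.getenv("HF_TOKEN"),  # use_auth_token is deprecated
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )
    return _pipeline(prompt, max_new_tokens=256)[0]["generated_text"]

The trade-off is that the first request pays the full load latency; every later call reuses the cached global.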
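One caveat in the unchanged generation path: the prompt is assembled as plain "role: content" lines and the reply is recovered by splitting on the last newline, which breaks whenever the model emits a multi-line answer. Recent transformers releases (roughly 4.34 and later) let a text-generation pipeline consume the messages list directly and apply the model's chat template. A hedged sketch of that variant, written as a drop-in inside generate_response and reusing its messages, temperature, and top_p:

# Hypothetical variant: pass the chat messages straight to the pipeline
# instead of a flattened "role: content" string (assumes transformers >= 4.34).
outputs = pipeline(
    messages,          # list of {"role": ..., "content": ...} dicts
    max_new_tokens=256,
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
)
# For chat input, generated_text is the conversation with the new
# assistant turn appended, so its content can be read off directly.
response = outputs[0]["generated_text"][-1]["content"]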