DJStomp committed
Commit ad24926 · verified · 1 Parent(s): f0f6bff

Update app.py

Files changed (1):
  1. app.py +17 -13
app.py CHANGED
@@ -4,15 +4,24 @@ import transformers
 import torch
 import spaces
 
+# Load Hugging Face token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN is not set in environment variables!")
 
+# Model ID
 model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
 
-pipeline = None
-
+# Initialize the pipeline at startup
+pipeline = transformers.pipeline(
+    "text-generation",
+    model=model_id,
+    use_auth_token=hf_token,
+    model_kwargs={"torch_dtype": torch.bfloat16},  # Optimize memory usage
+    device_map="auto",  # Automatically map to available GPUs
+)
 
+# Define the inference function with GPU allocation
 @spaces.GPU
 def generate_response(
     message,
@@ -22,17 +31,7 @@ def generate_response(
     temperature,
     top_p,
 ):
-    global pipeline
-
-    if pipeline is None:
-        pipeline = transformers.pipeline(
-            "text-generation",
-            model=model_id,
-            use_auth_token=hf_token,
-            model_kwargs={"torch_dtype": torch.bfloat16},
-            device_map="auto",
-        )
-
+    # Combine system, history, and user messages into a formatted input string
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
         if user_msg:
@@ -40,8 +39,11 @@ def generate_response(
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
+
+    # Format the conversation as a single string
     conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
 
+    # Generate a response using the preloaded pipeline
     try:
         outputs = pipeline(
             conversation,
@@ -51,11 +53,13 @@ def generate_response(
         )
         generated_text = outputs[0]["generated_text"]
 
+        # Extract and return the assistant's response
        response = generated_text.split("\n")[-1].replace("assistant: ", "")
         return response
     except Exception as e:
         return f"Error: {str(e)}"
 
+# Define the Gradio Chat Interface
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[
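Side note on the startup initialization added in this commit: recent transformers releases deprecate the use_auth_token argument in favor of token. A minimal sketch of the same eager pipeline setup with the newer argument name, assuming a current transformers version and the same model ID and HF_TOKEN environment variable used here:

    import os
    import torch
    import transformers

    hf_token = os.getenv("HF_TOKEN")
    model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"

    # Same eager, startup-time initialization as in the commit,
    # but passing the token via the newer `token` keyword.
    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        token=hf_token,                                # replaces deprecated use_auth_token
        model_kwargs={"torch_dtype": torch.bfloat16},  # bfloat16 roughly halves memory vs float32
        device_map="auto",                             # let accelerate place layers on available GPUs
    )

The rest of the file (the @spaces.GPU decorator, message formatting, and the gr.ChatInterface wiring) would stay unchanged with this variant.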