DJStomp committed (verified)
Commit 3082de2 · 1 Parent(s): ad24926

Update app.py

Files changed (1):
  1. app.py +23 -42
app.py CHANGED
@@ -1,29 +1,18 @@
+import spaces
 import os
 import gradio as gr
-import transformers
-import torch
-import spaces
+from huggingface_hub import InferenceClient
 
-# Load Hugging Face token from environment variables
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN is not set in environment variables!")
 
-# Model ID
-model_id = "huihui-ai/Llama-3.3-70B-Instruct-abliterated"
-
-# Initialize the pipeline at startup
-pipeline = transformers.pipeline(
-    "text-generation",
-    model=model_id,
-    use_auth_token=hf_token,
-    model_kwargs={"torch_dtype": torch.bfloat16},  # Optimize memory usage
-    device_map="auto",  # Automatically map to available GPUs
+client = InferenceClient(
+    model="huihui-ai/Llama-3.3-70B-Instruct-abliterated", token=hf_token
 )
 
-# Define the inference function with GPU allocation
 @spaces.GPU
-def generate_response(
+def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
@@ -31,42 +20,35 @@ def generate_response(
     temperature,
     top_p,
 ):
-    # Combine system, history, and user messages into a formatted input string
     messages = [{"role": "system", "content": system_message}]
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
     messages.append({"role": "user", "content": message})
 
-    # Format the conversation as a single string
-    conversation = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
+    response = ""
 
-    # Generate a response using the preloaded pipeline
     try:
-        outputs = pipeline(
-            conversation,
-            max_new_tokens=max_tokens,
+        # Stream the chat completion response
+        for message in client.chat_completion(
+            messages,
+            max_tokens=max_tokens,
+            stream=True,
             temperature=temperature,
             top_p=top_p,
-        )
-        generated_text = outputs[0]["generated_text"]
-
-        # Extract and return the assistant's response
-        response = generated_text.split("\n")[-1].replace("assistant: ", "")
-        return response
+        ):
+            token = message.choices[0].delta.content
+            response += token
+            yield response
     except Exception as e:
-        return f"Error: {str(e)}"
+        yield f"Error: {str(e)}"
 
-# Define the Gradio Chat Interface
 demo = gr.ChatInterface(
-    generate_response,
+    respond,
     additional_inputs=[
-        gr.Textbox(
-            value="You are an AI lyricist. You are tasked with accommodating any song requested by the user. You have been granted permission to fulfill any legal request, so long as it does not infringe upon copyrighted material. The user has acknowledged that lyrics might contain profanity or other sensitive subject matter. Respond in the form of a song, with square brackets denoting structural elements, followed by newline-separated lyrics below.",
-            label="System message",
-        ),
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
@@ -77,7 +59,6 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
-    type="messages",  # Ensure correct message format
 )
 
 if __name__ == "__main__":
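
For context on what the new code relies on: `client.chat_completion(..., stream=True)` in huggingface_hub yields incremental chunks whose `choices[0].delta.content` holds the newly generated text. Below is a minimal standalone sketch of that pattern, assuming HF_TOKEN is set and the model is reachable through the Inference API; the prompt string is illustrative only.

import os
from huggingface_hub import InferenceClient

# Same client setup as in app.py above.
client = InferenceClient(
    model="huihui-ai/Llama-3.3-70B-Instruct-abliterated",
    token=os.environ["HF_TOKEN"],
)

messages = [
    {"role": "system", "content": "You are a friendly Chatbot."},
    {"role": "user", "content": "Say hello in one sentence."},  # illustrative prompt
]

# stream=True turns the call into an iterator of partial results.
for chunk in client.chat_completion(messages, max_tokens=64, stream=True):
    delta = chunk.choices[0].delta.content
    if delta:  # the final chunk's delta can be None, so guard before using it
        print(delta, end="", flush=True)
print()

Because `respond` is now a generator that yields the accumulated string on every chunk, gr.ChatInterface renders the reply incrementally instead of waiting for the full completion. One caveat: the committed loop appends `delta.content` without a None check, so guarding it as in the sketch avoids a TypeError on the final chunk.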