Nymbo committed on
Commit cf508a7 · verified · 1 Parent(s): fde397b

Update app.py

Files changed (1)
app.py +112 -174
app.py CHANGED
@@ -1,231 +1,169 @@
  import gradio as gr
  from openai import OpenAI
  import os
- import time

  # Retrieve the access token from the environment variable
  ACCESS_TOKEN = os.getenv("HF_TOKEN")
- print("Access token loaded.")

- # Initialize the OpenAI client with the Hugging Face Inference API endpoint
  client = OpenAI(
      base_url="https://api-inference.huggingface.co/v1/",
      api_key=ACCESS_TOKEN,
  )
- print("OpenAI client initialized.")

  def respond(
      message,
-     history: list[tuple[str, str]],
      system_message,
      max_tokens,
      temperature,
      top_p,
      frequency_penalty,
-     seed,
-     model_filter,
-     model,
-     custom_model
  ):
-     """
-     This function handles the chatbot response. It takes in:
-     - message: the user's new message
-     - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-     - system_message: the system prompt
-     - max_tokens: the maximum number of tokens to generate in the response
-     - temperature: sampling temperature
-     - top_p: top-p (nucleus) sampling
-     - frequency_penalty: penalize repeated tokens in the output
-     - seed: a fixed seed for reproducibility; -1 will mean 'random'
-     - model_filter: search term to filter available models
-     - model: the selected model from the radio choices
-     - custom_model: manually entered HF model path
-     """
-
      print(f"Received message: {message}")
      print(f"History: {history}")
-     print(f"System message: {system_message}")
-     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
      print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-     print(f"Model Filter: {model_filter}, Selected Model: {model}, Custom Model: {custom_model}")

-     # Convert seed to None if -1 (meaning random)
      if seed == -1:
          seed = None

-     # Construct the messages array required by the API
      messages = [{"role": "system", "content": system_message}]

      # Add conversation history to the context
-     for val in history:
-         user_part = val[0]
-         assistant_part = val[1]
-         if user_part:
-             messages.append({"role": "user", "content": user_part})
-             print(f"Added user message to context: {user_part}")
-         if assistant_part:
-             messages.append({"role": "assistant", "content": assistant_part})
-             print(f"Added assistant message to context: {assistant_part}")
-
-     # Append the latest user message
      messages.append({"role": "user", "content": message})

-     # Determine the model to use
-     # Set the API URL based on the selected model or custom model
-     if custom_model.strip() != "":
-         api_model = custom_model.strip()
-     else:
-         if model == "Llama-3-70B-Instruct":
-             api_model = "meta-llama/Llama-3.3-70B-Instruct"
-         elif model == "Mistral-7B-Instruct-v0.2":
-             api_model = "mistralai/Mistral-7B-Instruct-v0.2"
-         elif model == "OpenHermes-2.5-Mistral-7B":
-             api_model = "teknium/OpenHermes-2.5-Mistral-7B"
-         elif model == "Phi-2":
-             api_model = "microsoft/Phi-2"
-         else:
-             api_model = "meta-llama/Llama-3.3-70B-Instruct"
-     print(f"Using model: {api_model}")
-
-     # Start with an empty string to build the response as tokens stream in
      response = ""
-     print(f"Sending request to OpenAI API, using model {api_model}.")

-     # Make the streaming request to the HF Inference API via the OpenAI-like client
-     for message_chunk in client.chat.completions.create(
-         model=api_model,
          max_tokens=max_tokens,
-         stream=True,  # Stream the response
          temperature=temperature,
          top_p=top_p,
          frequency_penalty=frequency_penalty,
          seed=seed,
-         messages=messages,
      ):
          # Extract the token text from the response chunk
-         token_text = message_chunk.choices[0].delta.content
-         print(f"Received token: {token_text}")
-
-         # Check if token_text is None before appending
-         if token_text is not None:
-             response += token_text
-             yield response
-
-     print("Completed response generation.")

- # Placeholder list of models for the accordion
- models_list = [
-     "Llama-3-70B-Instruct",
-     "Mistral-7B-Instruct-v0.2",
-     "OpenHermes-2.5-Mistral-7B",
-     "Phi-2",
- ]
-
- # Create a Chatbot component with a specified height
  chatbot = gr.Chatbot(height=600)
- print("Chatbot interface created.")

- # Create the Gradio ChatInterface
  demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="", label="System message"),
-         gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
-         gr.Slider(
-             minimum=-2.0,
-             maximum=2.0,
-             value=0.0,
-             step=0.1,
-             label="Frequency Penalty"
-         ),
-         gr.Slider(
-             minimum=-1,
-             maximum=65535,
-             value=-1,
-             step=1,
-             label="Seed (-1 for random)"
-         ),
-         gr.Textbox(label="Filter Featured Models", placeholder="Search...", lines=1),
-         gr.Radio(label="Select a Featured Model", choices=models_list, value="Llama-3-70B-Instruct"),
-         gr.Textbox(label="Custom Model", placeholder="Enter Hugging Face model path", lines=1),
-     ],
-     additional_inputs_accordion=gr.Accordion("Advanced Parameters", open=False),
-     fill_height=True,
      chatbot=chatbot,
      theme="Nymbo/Nymbo_Theme",
  )

- # Add the "Information" tab to the demo
- with gr.Tab("Information", parent=demo):
-     with gr.Accordion("Featured Models", open=True):
-         gr.HTML(
              """
-             <table style="width:100%; text-align:center; margin:auto;">
-                 <tr>
-                     <th>Model Name</th>
-                     <th>Provider</th>
-                     <th>Notes</th>
-                 </tr>
-                 <tr>
-                     <td>Llama-3-70B-Instruct</td>
-                     <td>Meta</td>
-                     <td>Powerful large language model.</td>
-                 </tr>
-                 <tr>
-                     <td>Mistral-7B-Instruct-v0.2</td>
-                     <td>Mistral AI</td>
-                     <td>Efficient and versatile model.</td>
-                 </tr>
-                 <tr>
-                     <td>OpenHermes-2.5-Mistral-7B</td>
-                     <td>Teknium</td>
-                     <td>Community-driven, fine-tuned model.</td>
-                 </tr>
-                 <tr>
-                     <td>Phi-2</td>
-                     <td>Microsoft</td>
-                     <td>Compact yet powerful model.</td>
-                 </tr>
-             </table>
-             """
          )
      with gr.Accordion("Parameters Overview", open=False):
          gr.Markdown(
-             """
-             ## System Message
-             ###### The system message sets the behavior and persona of the chatbot. It's a way to provide context and instructions to the AI. For example, you can tell it to act as a helpful assistant, a storyteller, or any other role.
-             ## Max New Tokens
-             ###### This setting limits the length of the response generated by the AI. A higher number allows for longer, more detailed responses, while a lower number keeps the responses concise.
-             ## Temperature
-             ###### Temperature controls the randomness of the AI's output. A higher temperature makes the responses more creative and varied, while a lower temperature makes them more predictable and focused.
-             ## Top-P (Nucleus Sampling)
-             ###### Top-P sampling is a way to control the diversity of the AI's responses. It sets a threshold for the cumulative probability of the most likely next words. The AI then randomly selects from the words whose probabilities add up to this threshold. A lower Top-P value means less diversity.
-             ## Frequency Penalty
-             ###### Frequency penalty discourages the AI from repeating the same words or phrases too often in its responses. A higher penalty means the AI is less likely to repeat itself.
-             ## Seed
-             ###### The seed is a starting point for the random number generator that influences the AI's responses. If you set a specific seed, you'll get the same response every time you use that seed with the same prompt and settings. If you set it to -1, the AI will generate a new seed each time, leading to different responses.
-             ## Featured Models
-             ###### This section lists pre-selected models that are known to perform well. You can filter the list by typing in the search box.
-             ## Custom Model
-             ###### If you want to use a model that's not in the featured list, you can enter its Hugging Face model path here.
-             ### Feel free to experiment with these settings to see how they affect the AI's responses. Happy chatting!
-             """
-         )

- # Filter models function
- def filter_models(search_term, model_radio):
-     filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
-     if not filtered_models:
-         filtered_models = ["No matching models"]  # Provide feedback
-     return gr.Radio.update(choices=filtered_models)

- # Update model list when search box is used
- demo.additional_inputs[6].change(filter_models, inputs=[demo.additional_inputs[6], demo.additional_inputs[7]], outputs=demo.additional_inputs[7])

- print("Gradio interface initialized.")

- if __name__ == "__main__":
-     print("Launching the demo application.")
-     demo.queue().launch()

  import gradio as gr
  from openai import OpenAI
  import os

  # Retrieve the access token from the environment variable
  ACCESS_TOKEN = os.getenv("HF_TOKEN")

+ # Initialize the OpenAI API client
  client = OpenAI(
      base_url="https://api-inference.huggingface.co/v1/",
      api_key=ACCESS_TOKEN,
  )

  def respond(
      message,
+     history,
      system_message,
      max_tokens,
      temperature,
      top_p,
      frequency_penalty,
+     seed
  ):
+     # Process the incoming message
      print(f"Received message: {message}")
      print(f"History: {history}")
+     print(f"System Message: {system_message}")
+     print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
      print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")

+     # Convert seed to None if -1 (random)
      if seed == -1:
          seed = None

+     # Construct the messages list for the API
      messages = [{"role": "system", "content": system_message}]

      # Add conversation history to the context
+     for user_message, assistant_message in history:
+         if user_message:
+             messages.append({"role": "user", "content": user_message})
+             print(f"Added user message: {user_message}")
+         if assistant_message:
+             messages.append({"role": "assistant", "content": assistant_message})
+             print(f"Added assistant message: {assistant_message}")
+
+     # Append the latest message
      messages.append({"role": "user", "content": message})

+     # Initialize response
      response = ""

+     # Make the streaming API request
+     for chunk in client.chat.completions.create(
+         model="meta-llama/Llama-3.3-70B-Instruct",
+         messages=messages,
          max_tokens=max_tokens,
          temperature=temperature,
          top_p=top_p,
          frequency_penalty=frequency_penalty,
          seed=seed,
+         stream=True,
      ):
          # Extract the token text from the response chunk
+         token = chunk.choices[0].delta.content
+         # Skip empty deltas (the final chunk's content can be None)
+         if token is not None:
+             response += token
+             yield response

+ # Create the Gradio Chatbot component
  chatbot = gr.Chatbot(height=600)

+ # Define the Gradio ChatInterface
  demo = gr.ChatInterface(
      chatbot=chatbot,
+     fn=respond,
+     additional_inputs=[
+         gr.Textbox(label="System Message"),
+         gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
+         gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
+         gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
+         gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
+     ],
      theme="Nymbo/Nymbo_Theme",
  )

+ # Create the "Featured Models" accordion
+ with gr.Accordion("Featured Models", open=True) as featured_models:
+     # Textbox for searching models
+     model_search = gr.Textbox(label="Filter Models")
+     # List of featured models
+     models = [
+         "meta-llama/Llama-3.3-70B-Instruct",
+         "meta-llama/Llama-2-70B-Chat-hf",
+         "TheBloke/Llama-2-13B-Chat-GGML",
+         "TheBloke/Llama-2-70B-Chat-GGML",
+         "TheBloke/Llama-2-13B-Chat-GGML-v2",
+         "TheBloke/Llama-2-70B-Chat-GGML-v2",
+         "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
+         "TheBloke/Llama-2-70b-chat-hf",
+         "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+         "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
+         # Add more models as needed...
+     ]
+     # Radio buttons for selecting a model
+     model_radio = gr.Radio(choices=models, label="Select a Model")
+
+     # Update the model list based on search input
+     def filter_models(search_term):
+         filtered_models = [model for model in models if search_term.lower() in model.lower()]
+         return gr.update(choices=filtered_models)
+
+     # Update the model list when the search box is used
+     model_search.change(filter_models, inputs=model_search, outputs=model_radio)
+
+     # Create a "Custom Model" textbox
+     custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")
+
+ # Create the "Information" tab
+ with gr.Tab("Information"):
+     # Featured Models accordion
+     with gr.Accordion("Featured Models", open=False):
+         gr.Markdown(
+             """
+             # Featured Models
+
+             Here's a list of some popular models available on Hugging Face:
+
+             - meta-llama/Llama-3.3-70B-Instruct
+             - meta-llama/Llama-2-70B-Chat-hf
+             - TheBloke/Llama-2-13B-Chat-GGML
+             - TheBloke/Llama-2-70B-Chat-GGML
+             - TheBloke/Llama-2-13B-Chat-GGML-v2
+             - TheBloke/Llama-2-70B-Chat-GGML-v2
+             - ... (and many more)
+
+             You can search and select a model from the list above, or use your own custom model path.
              """
          )
+
+     # Parameters Overview accordion
      with gr.Accordion("Parameters Overview", open=False):
          gr.Markdown(
+             """
+             # Parameters Overview
+
+             Here's a brief explanation of the parameters you can adjust:
+
+             - **Max Tokens**: The maximum number of tokens to generate in the response.
+             - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
+             - **Top P**: Also known as nucleus sampling; the model samples only from the smallest set of tokens whose cumulative probability reaches P, cutting off the unlikely tail.
+             - **Frequency Penalty**: Penalizes repeated tokens to reduce repetition.
+             - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.
+
+             Feel free to experiment with these settings to achieve the desired output.
+             """
+         )

+ # Launch the Gradio interface
+ demo.launch(share=True)
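
A note on the streaming pattern the new respond() relies on: with stream=True, the OpenAI-compatible client yields chunks whose incremental text lives in choices[0].delta.content (not message.content), and that delta can be None, for example on the final chunk, so it is guarded before appending. A minimal, self-contained sketch, assuming a valid HF_TOKEN in the environment and the same endpoint and model as the commit:

import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.getenv("HF_TOKEN"),
)

# Stream a short completion and accumulate the guarded deltas.
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello."}],
    max_tokens=32,
    stream=True,
)
text = ""
for chunk in stream:
    token = chunk.choices[0].delta.content
    if token is not None:  # the final chunk's delta can be empty
        text += token
print(text)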
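
A minimal sketch of how gr.ChatInterface routes the extra controls into the handler: the message textbox and the chatbot are built in, so they do not belong in additional_inputs; each additional_inputs component is passed positionally after (message, history), in declaration order. The echo handler below is a hypothetical stand-in for the real streaming call:

import gradio as gr

# Stand-in handler: ChatInterface always passes (message, history) first,
# then one positional argument per component in additional_inputs.
def respond(message, history, system_message, max_tokens, temperature,
            top_p, frequency_penalty, seed):
    yield f"system={system_message!r}, max_tokens={max_tokens}, seed={seed}"

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System Message"),
        gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
        gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
        gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
        gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
    ],
)

if __name__ == "__main__":
    demo.launch()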
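
A minimal sketch of the model-filter wiring, assuming Gradio 4.x, where gr.update(choices=...) replaces the older gr.Radio.update used by the previous version; components and their .change() handlers only render when created inside a gr.Blocks() (or Interface) context:

import gradio as gr

models = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-2-70B-Chat-hf",
    "TheBloke/Llama-2-13B-Chat-GGML",
]

def filter_models(search_term):
    # Case-insensitive substring match over the featured list.
    filtered = [m for m in models if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)

with gr.Blocks() as model_picker:
    model_search = gr.Textbox(label="Filter Models")
    model_radio = gr.Radio(choices=models, label="Select a Model")
    model_search.change(filter_models, inputs=model_search, outputs=model_radio)

if __name__ == "__main__":
    model_picker.launch()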