import os
import gradio as gr
from openai import OpenAI
# Load your Hugging Face Inference API token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI-like client that points to the HF Inference endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,  # Selected from "Featured Models" radio
    custom_model,  # Optional user-provided custom model path
):
"""
Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
Parameters:
- message (str): The latest user message
- history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
- system_message (str): System-level instruction or context
- max_tokens (int): Max tokens to generate
- temperature (float): Sampling temperature
- top_p (float): Nucleus sampling (top-p)
- frequency_penalty (float): Penalize repeated tokens
- seed (int): Fixed seed; if -1 => random
- featured_model (str): The featured model name selected in the UI
- custom_model (str): A custom model path (HF repo) provided by the user
"""
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Featured Model (chosen): {featured_model}")
print(f"Custom Model (if any): {custom_model}")
# Decide which model to use. If the user typed a custom model, we use that.
# Otherwise, we use the featured model they picked from the radio.
if custom_model.strip():
model_to_use = custom_model.strip()
else:
model_to_use = featured_model
print(f"Final model to use: {model_to_use}")
# Convert seed to None if -1 => means random
if seed == -1:
seed = None
# Prepare the conversation
messages = [{"role": "system", "content": system_message}]
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context: {user_part}")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Add the latest user message
messages.append({"role": "user", "content": message})
# Generate the response in a streaming manner
response = ""
print("Sending request to HF Inference API via OpenAI-like client.")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Some stream chunks (e.g. the final one) carry no content; skip them
        token_text = message_chunk.choices[0].delta.content
        if token_text:
            print(f"Received token: {token_text}")
            response += token_text
            # Yield partial responses to get streaming in Gradio
            yield response

    print("Completed response generation.")
# ----------------------------
# DEFINE THE GRADIO INTERFACE
# ----------------------------
def build_demo():
    """
    Build the entire Gradio Blocks interface, featuring:
    - A Tab for the chatbot (with featured models, custom model)
    - An Information tab with model table, parameter overview, etc.
    """
    # Define your placeholder featured models
    featured_models_list = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "google/gemma-2-2b-it",
        "microsoft/Phi-3-mini-4k-instruct",
    ]

    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
        gr.Markdown("## Serverless Text Generation Hub")

        with gr.Tabs():
            # -------------------- CHAT TAB --------------------
            with gr.Tab("Chat"):
                with gr.Row():
                    with gr.Column():
                        # "Featured Models" Accordion
                        with gr.Accordion("Featured Models", open=False):
                            model_search = gr.Textbox(
                                label="Filter Featured Models",
                                placeholder="Search featured models...",
                                lines=1,
                            )
                            # Radio for selecting a featured model
                            featured_models = gr.Radio(
                                label="Pick a Featured Model",
                                choices=featured_models_list,
                                value=featured_models_list[0],
                                interactive=True,
                            )

                            # Function to filter the model list by search text
                            def filter_models(search_term):
                                filtered = [
                                    m
                                    for m in featured_models_list
                                    if search_term.lower() in m.lower()
                                ]
                                return gr.update(choices=filtered)

                            # Update the radio choices when the user enters text in the search box
                            model_search.change(
                                filter_models,
                                inputs=model_search,
                                outputs=featured_models,
                            )

                        # "Custom Model" text box
                        custom_model = gr.Textbox(
                            label="Custom Model",
                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
                            lines=1,
                        )
                        gr.Markdown(
                            "If you provide a custom model path above, it will override your featured model selection."
                        )

                    with gr.Column():
                        # Create the Gradio Chatbot
                        chatbot = gr.Chatbot(height=600, label="Chat Output")

                        # Additional controls for system prompt & generation parameters
                        # (gr.Group replaces gr.Box, which was removed in Gradio 4.x)
                        with gr.Group():
                            system_message = gr.Textbox(
                                value="",
                                label="System message",
                                placeholder="System-level instruction or context here...",
                            )
                            max_tokens = gr.Slider(
                                minimum=1,
                                maximum=4096,
                                value=512,
                                step=1,
                                label="Max new tokens",
                            )
                            temperature = gr.Slider(
                                minimum=0.1,
                                maximum=4.0,
                                value=0.7,
                                step=0.1,
                                label="Temperature",
                            )
                            top_p = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.95,
                                step=0.05,
                                label="Top-P",
                            )
                            frequency_penalty = gr.Slider(
                                minimum=-2.0,
                                maximum=2.0,
                                value=0.0,
                                step=0.1,
                                label="Frequency Penalty",
                            )
                            seed = gr.Slider(
                                minimum=-1,
                                maximum=65535,
                                value=-1,
                                step=1,
                                label="Seed (-1 for random)",
                            )

                # We will attach a ChatInterface-like set of controls manually.
                # Keep track of conversation state
                state = gr.State([])  # Holds conversation as a list of (user, assistant)

                # Define "user" event function
                def user_message(user_text, history):
                    """
                    When the user sends a message, add it to history as (user_text, "").
                    The assistant's response will fill the second part of the tuple later.
                    """
                    if not user_text:
                        return gr.update(), history
                    new_history = history + [(user_text, "")]  # user question, empty answer
                    return gr.update(value=""), new_history

                # Define "bot" event function
                def bot_message(history, system_message, max_tokens, temperature, top_p,
                                frequency_penalty, seed, featured_models, custom_model):
                    """
                    Generate the assistant reply given the entire chat history,
                    system prompt, and generation params. The function streams
                    tokens from respond().
                    """
                    # Nothing to do if there is no pending user message
                    if not history:
                        yield history
                        return

                    user_text = history[-1][0]

                    # Call respond() as a generator so tokens can be streamed back
                    bot_stream = respond(
                        message=user_text,
                        history=history[:-1],
                        system_message=system_message,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        frequency_penalty=frequency_penalty,
                        seed=seed,
                        featured_model=featured_models,
                        custom_model=custom_model,
                    )

                    # Build up the assistant's reply token by token
                    final_assistant_text = ""
                    for token in bot_stream:
                        final_assistant_text = token
                        # Yield partial updates so the chatbot streams
                        yield history[:-1] + [(user_text, final_assistant_text)]

                    # Once complete, update the conversation in state
                    history[-1] = (user_text, final_assistant_text)
                    yield history

                # Textbox for the user to type a message
                with gr.Row():
                    with gr.Column(scale=8):
                        user_textbox = gr.Textbox(
                            label="Your message",
                            placeholder="Type your question or prompt here...",
                            lines=2,
                            interactive=True,
                        )
                    with gr.Column(scale=2):
                        send_button = gr.Button(
                            value="Send",
                            variant="primary",
                        )

                # When the user clicks "Send", first call user_message(), then bot_message()
                send_button.click(
                    fn=user_message,
                    inputs=[user_textbox, state],
                    outputs=[user_textbox, state],
                ).then(
                    fn=bot_message,
                    inputs=[
                        state,
                        system_message,
                        max_tokens,
                        temperature,
                        top_p,
                        frequency_penalty,
                        seed,
                        featured_models,
                        custom_model,
                    ],
                    outputs=chatbot,
                )

            # -------------------- INFORMATION TAB --------------------
            with gr.Tab("Information"):
                # Put information about featured models
                with gr.Accordion("Featured Models", open=False):
                    gr.HTML(
                        """
                        <table style="width:100%; text-align:center; margin:auto;">
                          <tr>
                            <th>Model Name</th>
                            <th>Description</th>
                            <th>Status</th>
                          </tr>
                          <tr>
                            <td>meta-llama/Llama-3.3-70B-Instruct</td>
                            <td>Powerful large model by Meta, fine-tuned to follow instructions.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>Qwen/Qwen2.5-7B-Instruct</td>
                            <td>Instruction-tuned LLM with good accuracy and speed.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>google/gemma-2-2b-it</td>
                            <td>Compact 2B-parameter model for quick text generation tasks.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>microsoft/Phi-3-mini-4k-instruct</td>
                            <td>Small but effective model, optimized for instruction-based tasks.</td>
                            <td>✅</td>
                          </tr>
                        </table>
                        """
                    )

                # Put general parameter info
                with gr.Accordion("Parameters Overview", open=False):
                    gr.Markdown(
                        """
                        ## Parameters Overview

                        - **System Message**
                          A special prompt that sets the behavior or context for the AI.
                        - **Max New Tokens**
                          The maximum length of the AI's reply, in tokens.
                        - **Temperature**
                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
                        - **Top-P**
                          Nucleus sampling: the smallest set of tokens whose probabilities sum to at least `top_p` is kept for generation.
                        - **Frequency Penalty**
                          Discourages the model from repeating tokens that already appeared.
                        - **Seed**
                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.

                        ### Model Selection

                        - **Featured Models**
                          A curated set of recommended or widely-used LLMs you can pick from.
                        - **Custom Model**
                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override the featured selection.

                        ***

                        Feel free to experiment with different settings to see how they affect the response!
                        """
                    )

    return demo

# Actually build and launch the app
if __name__ == "__main__":
print("Launching the demo application.")
demo = build_demo()
demo.launch() |
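
# Usage sketch (assumptions: the `gradio` and `openai` packages are installed,
# an HF token with Inference API access is available, and the script is saved
# under the conventional Spaces filename app.py):
#
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   python app.py
#
# Gradio then prints a local URL to open in a browser.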