import os
import gradio as gr
from openai import OpenAI
# Load your Hugging Face Inference API token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI-like client that points to the HF Inference endpoint
client = OpenAI(
base_url="https://api-inference.huggingface.co/v1/",
api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
message,
history: list[tuple[str, str]],
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
featured_model, # Selected from "Featured Models" radio
custom_model # Optional user-provided custom model path
):
"""
Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
Parameters:
- message (str): The latest user message
- history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
- system_message (str): System-level instruction or context
- max_tokens (int): Max tokens to generate
- temperature (float): Sampling temperature
- top_p (float): Nucleus sampling (top-p)
- frequency_penalty (float): Penalize repeated tokens
- seed (int): Fixed seed; if -1 => random
- featured_model (str): The featured model name selected in the UI
- custom_model (str): A custom model path (HF repo) provided by the user
"""
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Featured Model (chosen): {featured_model}")
print(f"Custom Model (if any): {custom_model}")
# Decide which model to use. If the user typed a custom model, we use that.
# Otherwise, we use the featured model they picked from the radio.
if custom_model.strip():
model_to_use = custom_model.strip()
else:
model_to_use = featured_model
print(f"Final model to use: {model_to_use}")
# Convert seed to None if -1 => means random
if seed == -1:
seed = None
# Prepare the conversation
messages = [{"role": "system", "content": system_message}]
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context: {user_part}")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Add the latest user message
messages.append({"role": "user", "content": message})
# Generate the response in a streaming manner
response = ""
print("Sending request to HF Inference API via OpenAI-like client.")
for message_chunk in client.chat.completions.create(
model=model_to_use,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
messages=messages,
):
        token_text = message_chunk.choices[0].delta.content
        # Some chunks (e.g. the final one) can arrive with no content; skip them so
        # we never concatenate None into the response string.
        if token_text is None:
            continue
        print(f"Received token: {token_text}")
        response += token_text
        # Yield partial responses to get streaming in Gradio
        yield response
print("Completed response generation.")
# ----------------------------
# DEFINE THE GRADIO INTERFACE
# ----------------------------
def build_demo():
"""
Build the entire Gradio Blocks interface, featuring:
- A Tab for the chatbot (with featured models, custom model)
- An Information tab with model table, parameter overview, etc.
"""
# Define your placeholder featured models
featured_models_list = [
"meta-llama/Llama-3.3-70B-Instruct",
"Qwen/Qwen2.5-7B-Instruct",
"google/gemma-2-2b-it",
"microsoft/Phi-3-mini-4k-instruct",
]
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
gr.Markdown("## Serverless Text Generation Hub")
with gr.Tabs():
# -------------------- CHAT TAB --------------------
with gr.Tab("Chat"):
with gr.Row():
with gr.Column():
# "Featured Models" Accordion
with gr.Accordion("Featured Models", open=False):
model_search = gr.Textbox(
label="Filter Featured Models",
placeholder="Search featured models...",
lines=1,
)
# Radio for selecting a featured model
featured_models = gr.Radio(
label="Pick a Featured Model",
choices=featured_models_list,
value=featured_models_list[0],
interactive=True,
)
# Function to filter the model list by search text
def filter_models(search_term):
filtered = [
m
for m in featured_models_list
if search_term.lower() in m.lower()
]
return gr.update(choices=filtered)
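                            # gr.update(choices=...) swaps out the Radio's options in place;
                            # if the current selection is filtered away, the user simply
                            # picks again from the narrowed list.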
# Update the radio choices when user enters text in the search box
model_search.change(
filter_models,
inputs=model_search,
outputs=featured_models,
)
# "Custom Model" text box
custom_model = gr.Textbox(
label="Custom Model",
placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
lines=1,
)
gr.Markdown(
"If you provide a custom model path above, it will override your featured model selection."
)
with gr.Column():
# Create the Gradio Chatbot
chatbot = gr.Chatbot(height=600, label="Chat Output")
# Additional controls for system prompt & generation parameters
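                # NOTE: gr.Box exists in Gradio 3.x but was removed in 4.x; on Gradio 4+
                # gr.Group() is the closest drop-in replacement for this container.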
with gr.Box():
system_message = gr.Textbox(
value="",
label="System message",
placeholder="System-level instruction or context here...",
)
max_tokens = gr.Slider(
minimum=1,
maximum=4096,
value=512,
step=1,
label="Max new tokens",
)
temperature = gr.Slider(
minimum=0.1,
maximum=4.0,
value=0.7,
step=0.1,
label="Temperature",
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-P",
)
frequency_penalty = gr.Slider(
minimum=-2.0,
maximum=2.0,
value=0.0,
step=0.1,
label="Frequency Penalty",
)
seed = gr.Slider(
minimum=-1,
maximum=65535,
value=-1,
step=1,
label="Seed (-1 for random)",
)
# We will attach a ChatInterface-like set of controls manually.
# Keep track of conversation state
state = gr.State([]) # Holds conversation as a list of (user, assistant)
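                # gr.State keeps a separate copy of this list per user session, so
                # concurrent visitors do not share conversation history.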
# Define "user" event function
def user_message(user_text, history):
"""
When the user sends a message, add it to history as (user_text, "")
The assistant's response will fill the second part of the tuple later.
"""
if not user_text:
return gr.update(), history
new_history = history + [(user_text, "")] # user question, empty answer
return gr.update(value=""), new_history
# Define "bot" event function
def bot_message(history, system_message, max_tokens, temperature, top_p,
frequency_penalty, seed, featured_models, custom_model):
"""
Generate assistant reply given the entire chat history,
system prompt, and generation params. The function will stream
tokens from respond().
"""
user_text = history[-1][0] if history else ""
# We'll call respond() as a generator, so we can stream back tokens.
bot_stream = respond(
message=user_text,
history=history[:-1],
system_message=system_message,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
featured_model=featured_models,
custom_model=custom_model,
)
# We'll build up the assistant's reply token by token
final_assistant_text = ""
for token in bot_stream:
final_assistant_text = token
# We yield partial updates to the chatbot
yield history[:-1] + [(user_text, final_assistant_text)]
# Once complete, update the conversation in state
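                    # (history is the same list object held in gr.State, so this in-place
                    # edit keeps the stored conversation in sync even though only the
                    # chatbot is wired as an output below.)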
                    if history:
                        history[-1] = (user_text, final_assistant_text)
yield history
# Textbox for the user to type a message
with gr.Row():
with gr.Column(scale=8):
user_textbox = gr.Textbox(
label="Your message",
placeholder="Type your question or prompt here...",
lines=2,
interactive=True,
)
with gr.Column(scale=2):
send_button = gr.Button(
value="Send",
variant="primary"
)
# When user clicks "Send", first call user_message(), then bot_message()
send_button.click(
fn=user_message,
inputs=[user_textbox, state],
outputs=[user_textbox, state],
).then(
fn=bot_message,
inputs=[
state,
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
featured_models,
custom_model,
],
outputs=chatbot,
)
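                # Pressing Enter is not wired up here; binding the same user_message ->
                # bot_message chain to user_textbox.submit(...) would add that behavior.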
# -------------------- INFORMATION TAB --------------------
with gr.Tab("Information"):
# Put information about featured models
with gr.Accordion("Featured Models", open=False):
gr.HTML(
"""
<table style="width:100%; text-align:center; margin:auto;">
<tr>
<th>Model Name</th>
<th>Description</th>
<th>Status</th>
</tr>
<tr>
<td>meta-llama/Llama-3.3-70B-Instruct</td>
<td>Powerful large model from Meta, fine-tuned to follow instructions.</td>
<td>✅</td>
</tr>
<tr>
<td>Qwen/Qwen2.5-7B-Instruct</td>
<td>Instruction-tuned LLM with good accuracy and speed.</td>
<td>✅</td>
</tr>
<tr>
<td>google/gemma-2-2b-it</td>
<td>Compact 2B parameter model for quick text generation tasks.</td>
<td>✅</td>
</tr>
<tr>
<td>microsoft/Phi-3-mini-4k-instruct</td>
<td>Small but effective model, optimized for instruction-based tasks.</td>
<td>✅</td>
</tr>
</table>
"""
)
# Put general parameter info
with gr.Accordion("Parameters Overview", open=False):
gr.Markdown(
"""
## Parameters Overview
- **System Message**
This is a special prompt that sets the behavior or context for the AI.
- **Max New Tokens**
The maximum length of the AI's reply in tokens.
- **Temperature**
Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
- **Top-P**
Nucleus sampling: the model samples only from the smallest set of tokens whose cumulative probability reaches `top_p`.
- **Frequency Penalty**
Discourages the model from repeating tokens that already appeared.
- **Seed**
For reproducible outputs. If set to `-1`, a random seed is chosen each time.
### Model Selection
- **Featured Models**
A curated set of recommended or widely-used LLMs you can pick from.
- **Custom Model**
If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.
***
Feel free to experiment with different settings to see how they affect the response!
"""
)
return demo
# Actually build and launch the app
if __name__ == "__main__":
print("Launching the demo application.")
demo = build_demo()
demo.launch()