import gradio as gr
from openai import OpenAI
import os
# --------------------------------------------------------------------------------
# Serverless-TextGen-Hub
# This application is a Gradio-based UI for text generation using
# Hugging Face's serverless Inference API. We also incorporate features
# inspired by the ImgGen-Hub, such as:
# - A "Featured Models" accordion with text filtering.
# - A "Custom Model" textbox for specifying a non-featured model.
# - An "Information" tab with accordions for "Featured Models" and
# "Parameters Overview" containing helpful user guides.
# --------------------------------------------------------------------------------
# Retrieve the access token from environment variables
ACCESS_TOKEN = os.getenv("HF_TOKEN") # HF_TOKEN is your Hugging Face Inference API key
print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    # NEW inputs for model selection
    model_search,
    selected_model,
    custom_model,
):
    """
    Handle a chatbot response, streaming tokens back as they arrive.

    Parameters:
    - message: The user's newest message (string).
    - history: The previous messages in the conversation, each as a tuple (user_msg, assistant_msg).
    - system_message: The system prompt.
    - max_tokens: The maximum number of tokens to generate in the response.
    - temperature: Sampling temperature (float).
    - top_p: Top-p (nucleus) sampling (float).
    - frequency_penalty: Penalty for repeated tokens in the output (float).
    - seed: A fixed seed for reproducibility; -1 means 'random'.
    - model_search: The text used to filter the "Featured Models" radio list (not used directly here, but updated by the UI).
    - selected_model: The model selected via the "Featured Models" radio buttons.
    - custom_model: If not empty, overrides selected_model with this custom path.
    """
    # DEBUG LOGGING
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Model search text: {model_search}")
    print(f"Selected featured model: {selected_model}")
    print(f"Custom model (overrides if not empty): {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine the final model name to use:
    # if the custom_model textbox is non-empty, use that;
    # otherwise, use the model selected via the radio buttons.
    if custom_model.strip():
        model_to_use = custom_model.strip()
    else:
        model_to_use = selected_model

    # Construct the messages array required by the OpenAI-style HF API
    messages = [{"role": "system", "content": system_message}]  # System prompt

    # Add the conversation history to the context
    for user_part, assistant_part in history:
        if user_part:
            messages.append({"role": "user", "content": user_part})
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Start with an empty string and build the response as tokens stream in
    response = ""
    print(f"Using model: {model_to_use}")
    print("Sending request to OpenAI API...")

    # Make the streaming request to the HF Inference API via the OpenAI-style client.
    # 'model_to_use' is passed instead of a hard-coded model.
    for message_chunk in client.chat.completions.create(
        model=model_to_use,  # Model is dynamically selected
        max_tokens=max_tokens,
        stream=True,  # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Extract the token text from the response chunk; the delta may carry
        # no content on some chunks (e.g. role-only or final chunks), so guard
        # against None before appending
        token_text = message_chunk.choices[0].delta.content
        if token_text is not None:
            response += token_text
            # As new tokens arrive, stream the accumulated response back to the user
            yield response

    print("Completed response generation.")
# Create a Chatbot component with a specified height
chatbot = gr.Chatbot(height=600)
# ------------------------------------------------------------
# Below: We define the UI with additional features integrated.
# We'll replicate some of the style from the ImgGen-Hub code:
# - A "Featured Models" accordion with the ability to filter
# - A "Custom Model" text box
# - An "Information" tab with "Featured Models" table and
# "Parameters Overview" containing markdown descriptions.
# ------------------------------------------------------------
# List of placeholder "Featured Models" for demonstration
featured_models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-2-70B-chat-hf",
    "meta-llama/Llama-2-13B-chat-hf",
    "bigscience/bloom",
    "google/flan-t5-xxl",
]
# This function filters the models in featured_models_list based on user input
def filter_models(search_term):
    """
    Filter featured_models_list based on the text in 'search_term'.
    """
    filtered = [m for m in featured_models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
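# Illustrative example (not part of the app's flow): with the list above,
# filter_models("llama") narrows the radio choices to the three
# "meta-llama/..." entries, and filter_models("bloom") to just "bigscience/bloom".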
print("Initializing Gradio interface...") # Debug log
# We build a custom Blocks layout to incorporate tabs and advanced UI elements
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Top-level heading for clarity
    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")

    with gr.Tab("Chat"):
        # Place the ChatInterface within this tab, with the additional
        # model-selection UI elements in a collapsible layout above it
        with gr.Accordion("Featured Models", open=False):
            with gr.Row():
                model_search = gr.Textbox(
                    label="Filter Models",
                    placeholder="Search for a featured model...",
                    lines=1,
                )
            with gr.Row():
                model_radio = gr.Radio(
                    label="Select a featured model below",
                    choices=featured_models_list,
                    value="meta-llama/Llama-3.3-70B-Instruct",
                    interactive=True,
                )
            # When the text in model_search changes, update the radio choices
            model_search.change(
                filter_models,
                inputs=model_search,
                outputs=model_radio,
            )

        # Textbox for specifying a custom model that overrides the featured selection if not empty
        custom_model = gr.Textbox(
            label="Custom Model Path (overrides Featured Models if not empty)",
            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
            lines=1,
        )

        # Build the chat interface itself.
        # "model_search", "model_radio", and "custom_model" are passed as additional
        # inputs so that 'respond' can see them and decide which model to use.
        chatbot_interface = gr.ChatInterface(
            fn=respond,  # The function that generates the text
            additional_inputs=[
                gr.Textbox(
                    value="You are a helpful AI assistant.",
                    label="System message",
                    lines=2,
                ),  # system_message
                gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),  # max_tokens
                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),  # temperature
                gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),  # top_p
                gr.Slider(
                    minimum=-2.0,
                    maximum=2.0,
                    value=0.0,
                    step=0.1,
                    label="Frequency Penalty",
                ),  # frequency_penalty
                gr.Slider(
                    minimum=-1,
                    maximum=65535,
                    value=-1,
                    step=1,
                    label="Seed (-1 for random)",
                ),  # seed
                model_search,  # Exposed to 'respond' but not typed into during conversation
                model_radio,
                custom_model,
            ],
            chatbot=chatbot,
            title="Serverless-TextGen-Hub",
            fill_height=True,  # Ensures the chat area expands to fill the tab
        )
    # A new tab for "Information" about Featured Models and Parameters
    with gr.Tab("Information"):
        gr.Markdown("## Learn More About the Parameters and Models")

        # Accordion for "Featured Models"
        with gr.Accordion("Featured Models (WiP)", open=False):
            gr.HTML(
                """
                <p>Below is a small table of example models. In practice, you can pick from
                thousands of available text generation models on Hugging Face.
                <br>
                Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion
                in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
                <table style="width:100%; text-align:center; margin:auto;">
                    <tr>
                        <th>Model Name</th>
                        <th>Size</th>
                        <th>Notes</th>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
                        <td>Large</td>
                        <td>Placeholder example</td>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-2-13B-chat-hf</td>
                        <td>Medium</td>
                        <td>Placeholder example</td>
                    </tr>
                    <tr>
                        <td>google/flan-t5-xxl</td>
                        <td>Large</td>
                        <td>Placeholder example</td>
                    </tr>
                </table>
                """
            )

        # Accordion for "Parameters Overview"
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                ### Max New Tokens
                Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.

                ### Temperature
                A higher temperature makes the AI more 'creative' and random in its responses. A lower temperature keeps it more focused and deterministic.

                ### Top-P
                This is 'nucleus sampling': it sets the proportion of probability mass the model considers. At 1.0, it considers all words; lower it to focus on the most likely words.

                ### Frequency Penalty
                Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce it.

                ### Seed
                If set to -1, the output is different each time. Setting a specific number produces the same result on every run, making responses reproducible.

                ### Custom Model
                If this field is filled, it overrides the selection from Featured Models, so you can try any text generation model on the HF Hub, e.g.
                <code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
                """
            )
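        # Cross-reference (added note): each parameter documented above maps
        # directly to a keyword argument of client.chat.completions.create(...)
        # inside the respond() function defined earlier.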
print("Gradio interface initialized.")
# ------------------------------------------------------------
# Finally, we launch the app if the script is run directly.
# ------------------------------------------------------------
if __name__ == "__main__":
    print("Launching the demo application...")
    demo.launch()