import os
import gradio as gr
from openai import OpenAI

# Load your Hugging Face Inference API token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI-like client that points to the HF Inference endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,  # Selected from the "Featured Models" radio
    custom_model,    # Optional user-provided custom model path
):
    """
    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.

    Parameters:
    - message (str): The latest user message
    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
    - system_message (str): System-level instruction or context
    - max_tokens (int): Max tokens to generate
    - temperature (float): Sampling temperature
    - top_p (float): Nucleus sampling (top-p)
    - frequency_penalty (float): Penalize repeated tokens
    - seed (int): Fixed seed; -1 means random
    - featured_model (str): The featured model name selected in the UI
    - custom_model (str): A custom model path (HF repo) provided by the user
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Featured Model (chosen): {featured_model}")
    print(f"Custom Model (if any): {custom_model}")

    # Decide which model to use: a custom model, if provided, overrides the
    # featured model picked from the radio.
    if custom_model.strip():
        model_to_use = custom_model.strip()
    else:
        model_to_use = featured_model
    print(f"Final model to use: {model_to_use}")

    # A seed of -1 means "random"; the API expects None in that case
    if seed == -1:
        seed = None

    # Prepare the conversation
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Add the latest user message
    messages.append({"role": "user", "content": message})

    # Generate the response in a streaming manner
    response = ""
    print("Sending request to HF Inference API via OpenAI-like client.")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Some streamed chunks carry no content (e.g. the final chunk), so guard against None
        token_text = message_chunk.choices[0].delta.content or ""
        print(f"Received token: {token_text}")
        response += token_text
        # Yield partial responses to get streaming in Gradio
        yield response

    print("Completed response generation.")

# ----------------------------
# DEFINE THE GRADIO INTERFACE
# ----------------------------

def build_demo():
    """
    Build the entire Gradio Blocks interface, featuring:
    - A Tab for the chatbot (with featured models, custom model)
    - An Information tab with model table, parameter overview, etc.
    """
    # Define your placeholder featured models
    featured_models_list = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "google/gemma-2-2b-it",
        "microsoft/Phi-3-mini-4k-instruct",
    ]

    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
        gr.Markdown("## Serverless Text Generation Hub")

        with gr.Tabs():
            # -------------------- CHAT TAB --------------------
            with gr.Tab("Chat"):
                with gr.Row():
                    with gr.Column():
                        # "Featured Models" Accordion
                        with gr.Accordion("Featured Models", open=False):
                            model_search = gr.Textbox(
                                label="Filter Featured Models",
                                placeholder="Search featured models...",
                                lines=1,
                            )
                            # Radio for selecting a featured model
                            featured_models = gr.Radio(
                                label="Pick a Featured Model",
                                choices=featured_models_list,
                                value=featured_models_list[0],
                                interactive=True,
                            )

                            # Function to filter the model list by search text
                            def filter_models(search_term):
                                filtered = [
                                    m
                                    for m in featured_models_list
                                    if search_term.lower() in m.lower()
                                ]
                                return gr.update(choices=filtered)

                            # Update the radio choices when the user enters text in the search box
                            model_search.change(
                                filter_models,
                                inputs=model_search,
                                outputs=featured_models,
                            )

                        # "Custom Model" text box
                        custom_model = gr.Textbox(
                            label="Custom Model",
                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
                            lines=1,
                        )
                        gr.Markdown(
                            "If you provide a custom model path above, it will override your featured model selection."
                        )

                    with gr.Column():
                        # Create the Gradio Chatbot
                        chatbot = gr.Chatbot(height=600, label="Chat Output")

                        # Additional controls for system prompt & generation parameters.
                        # gr.Group is used here because gr.Box was removed in newer Gradio releases.
                        with gr.Group():
                            system_message = gr.Textbox(
                                value="",
                                label="System message",
                                placeholder="System-level instruction or context here...",
                            )
                            max_tokens = gr.Slider(
                                minimum=1,
                                maximum=4096,
                                value=512,
                                step=1,
                                label="Max new tokens",
                            )
                            temperature = gr.Slider(
                                minimum=0.1,
                                maximum=4.0,
                                value=0.7,
                                step=0.1,
                                label="Temperature",
                            )
                            top_p = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.95,
                                step=0.05,
                                label="Top-P",
                            )
                            frequency_penalty = gr.Slider(
                                minimum=-2.0,
                                maximum=2.0,
                                value=0.0,
                                step=0.1,
                                label="Frequency Penalty",
                            )
                            seed = gr.Slider(
                                minimum=-1,
                                maximum=65535,
                                value=-1,
                                step=1,
                                label="Seed (-1 for random)",
                            )

                # We attach a ChatInterface-like set of controls manually.
                # Keep track of conversation state
                state = gr.State([])  # Holds conversation as a list of (user, assistant) tuples
# Define "user" event function | |
def user_message(user_text, history): | |
""" | |
When the user sends a message, add it to history as (user_text, "") | |
The assistant's response will fill the second part of the tuple later. | |
""" | |
if not user_text: | |
return gr.update(), history | |
new_history = history + [(user_text, "")] # user question, empty answer | |
return gr.update(value=""), new_history | |
# Define "bot" event function | |
def bot_message(history, system_message, max_tokens, temperature, top_p, | |
frequency_penalty, seed, featured_models, custom_model): | |
""" | |
Generate assistant reply given the entire chat history, | |
system prompt, and generation params. The function will stream | |
tokens from respond(). | |
""" | |
user_text = history[-1][0] if history else "" | |
# We'll call respond() as a generator, so we can stream back tokens. | |
bot_stream = respond( | |
message=user_text, | |
history=history[:-1], | |
system_message=system_message, | |
max_tokens=max_tokens, | |
temperature=temperature, | |
top_p=top_p, | |
frequency_penalty=frequency_penalty, | |
seed=seed, | |
featured_model=featured_models, | |
custom_model=custom_model, | |
) | |
# We'll build up the assistant's reply token by token | |
final_assistant_text = "" | |
for token in bot_stream: | |
final_assistant_text = token | |
# We yield partial updates to the chatbot | |
yield history[:-1] + [(user_text, final_assistant_text)] | |
# Once complete, update the conversation in state | |
history[-1] = (user_text, final_assistant_text) | |
yield history | |

                # Textbox for the user to type a message
                with gr.Row():
                    with gr.Column(scale=8):
                        user_textbox = gr.Textbox(
                            label="Your message",
                            placeholder="Type your question or prompt here...",
                            lines=2,
                            interactive=True,
                        )
                    with gr.Column(scale=2):
                        send_button = gr.Button(
                            value="Send",
                            variant="primary",
                        )

                # When the user clicks "Send", first call user_message(), then bot_message()
                send_button.click(
                    fn=user_message,
                    inputs=[user_textbox, state],
                    outputs=[user_textbox, state],
                ).then(
                    fn=bot_message,
                    inputs=[
                        state,
                        system_message,
                        max_tokens,
                        temperature,
                        top_p,
                        frequency_penalty,
                        seed,
                        featured_models,
                        custom_model,
                    ],
                    outputs=chatbot,
                )

            # -------------------- INFORMATION TAB --------------------
            with gr.Tab("Information"):
                # Put information about featured models
                with gr.Accordion("Featured Models", open=False):
                    gr.HTML(
                        """
                        <table style="width:100%; text-align:center; margin:auto;">
                          <tr>
                            <th>Model Name</th>
                            <th>Description</th>
                            <th>Status</th>
                          </tr>
                          <tr>
                            <td>meta-llama/Llama-3.3-70B-Instruct</td>
                            <td>Powerful large model by Meta, fine-tuned to follow instructions.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>Qwen/Qwen2.5-7B-Instruct</td>
                            <td>Instruction-tuned LLM with good accuracy and speed.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>google/gemma-2-2b-it</td>
                            <td>Compact 2B-parameter model for quick text generation tasks.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>microsoft/Phi-3-mini-4k-instruct</td>
                            <td>Small but effective model, optimized for instruction-based tasks.</td>
                            <td>✅</td>
                          </tr>
                        </table>
                        """
                    )

                # Put general parameter info
                with gr.Accordion("Parameters Overview", open=False):
                    gr.Markdown(
"""
## Parameters Overview

- **System Message**: A special prompt that sets the behavior or context for the AI.
- **Max New Tokens**: The maximum length of the AI's reply, in tokens.
- **Temperature**: Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
- **Top-P**: Nucleus sampling: only the smallest set of tokens whose cumulative probability reaches `top_p` is kept for sampling.
- **Frequency Penalty**: Discourages the model from repeating tokens that have already appeared.
- **Seed**: For reproducible outputs. If set to `-1`, a random seed is chosen each time.

### Model Selection
- **Featured Models**: A curated set of recommended or widely used LLMs you can pick from.
- **Custom Model**: If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override the featured selection.

***
Feel free to experiment with different settings to see how they affect the response!
"""
                    )

    return demo

# Actually build and launch the app
if __name__ == "__main__":
    print("Launching the demo application.")
    demo = build_demo()
    demo.launch()
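
# --- Setup notes (assumptions, for running this script locally) ---------------
# The imports above correspond to the `gradio` and `openai` PyPI packages:
#     pip install gradio openai
# HF_TOKEN should be a Hugging Face access token with permission to call the
# Inference API:
#     export HF_TOKEN=hf_xxx
# On a Hugging Face Space, HF_TOKEN is typically supplied as a Space secret.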