import gradio as gr
from openai import OpenAI
import os
# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    print("Warning: HF_TOKEN is not set; requests to the Inference API will fail.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
):
    """
    This function handles the chatbot response. It takes in:
    - message: the user's new message
    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random' (no fixed seed)
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})
    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    # Make the streaming request to the HF Inference API via the OpenAI-compatible client
    for message_chunk in client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",  # You can update this to your specific model
        max_tokens=max_tokens,
        stream=True,  # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Extract the token text from the response chunk; the final chunk's
        # delta may carry no content, so guard against None before appending
        token_text = message_chunk.choices[0].delta.content
        if token_text is not None:
            print(f"Received token: {token_text}")
            response += token_text
            # As streaming progresses, yield partial output
            yield response

    print("Completed response generation.")
# Create a Chatbot component with a specified height
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")
MODELS_LIST = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
]
def filter_models(search_term):
    """
    Filter the placeholder model list based on the user's search input.
    """
    filtered_models = [m for m in MODELS_LIST if search_term.lower() in m.lower()]
    return gr.update(choices=filtered_models)
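# For example, filter_models("phi") narrows the choices to
# ["microsoft/Phi-3.5-mini-instruct"], while filter_models("") restores both.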
# --------------------------------------
# BUILD THE INTERFACE USING BLOCKS
# --------------------------------------
print("Building Gradio interface with Blocks...")
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Title
    gr.Markdown("# Serverless-TextGen-Hub")

    # Accordion: Parameters (sliders, etc.)
    with gr.Accordion("Parameters", open=True):
        system_message = gr.Textbox(value="", label="System message")
        max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
        frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
        seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

    # Accordion: Featured Models (below the parameters)
    with gr.Accordion("Featured Models", open=False):
        model_search = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1,
        )
        model_radio = gr.Radio(
            label="Select a model below",
            value=MODELS_LIST[0],  # default
            choices=MODELS_LIST,
            interactive=True,
        )
        model_search.change(filter_models, inputs=model_search, outputs=model_radio)
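    # Note: the selected model is not currently passed into `respond`, which
    # hardcodes its model name. Wiring it up (a sketch, not applied here)
    # would mean adding `model_radio` to `additional_inputs` below and a
    # matching `model` parameter on `respond`.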
    # The main ChatInterface
    chat_interface = gr.ChatInterface(
        fn=respond,
        additional_inputs=[
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
        ],
        fill_height=True,
        chatbot=chatbot,
        theme="Nymbo/Nymbo_Theme",
        title="Serverless-TextGen-Hub",
        description="A comprehensive UI for text generation using the HF Inference API.",
    )
print("Gradio interface initialized.")
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()