Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Serverless-TextGen-Hub / app.py

Nymbo

Update app.py

8696822 verified 6 months ago

raw

history blame

11.5 kB

	import gradio as gr
	from openai import OpenAI
	import os

	# =============================
	# GLOBAL SETUP / CLIENT
	# =============================

	# Read the Hugging Face access token from the HF_TOKEN environment variable.
	ACCESS_TOKEN = os.getenv("HF_TOKEN")
	if ACCESS_TOKEN:
	    print("Access token loaded.")
	else:
	    # os.getenv returns None when the variable is unset; report that instead
	    # of claiming success so a missing secret shows up in the startup log.
	    print("Warning: HF_TOKEN is not set; requests to the Inference API may fail.")

	# OpenAI-compatible client pointed at the Hugging Face Inference API endpoint
	client = OpenAI(
	    base_url="https://api-inference.huggingface.co/v1/",
	    api_key=ACCESS_TOKEN,
	)
	print("OpenAI client initialized.")

	# =============================
	# MODEL CONFIG / LOGIC
	# =============================

	# Placeholder catalogue of "featured" model ids shown in the Featured Models tab.
	# Order matters: the first entry is the radio button's initial selection.
	featured_models_list = [
	    "meta-llama/Llama-2-13B-chat-hf",
	    "bigscience/bloom",
	    "microsoft/DialoGPT-large",
	    "OpenAssistant/oasst-sft-1-pythia-12b",
	    "tiiuae/falcon-7b-instruct",
	    "meta-llama/Llama-3.3-70B-Instruct",
	]

	def filter_featured_models(search_term: str):
	    """
	    Narrow the featured-model choices to those matching a search term.

	    A model matches when the search term occurs anywhere in its name, ignoring
	    case. The matches are returned as a gr.update that replaces the target
	    component's choices.
	    """

	    def matches(model_name):
	        # Case-insensitive substring test.
	        return search_term.lower() in model_name.lower()

	    return gr.update(choices=list(filter(matches, featured_models_list)))


	def _build_messages(message, history, system_message):
	    """Assemble the API messages list: optional system prompt, prior turns, then the new user message."""
	    messages = []
	    # Only send a system prompt when it has visible content.
	    if system_message and system_message.strip():
	        messages.append({"role": "system", "content": system_message})

	    # Add conversation history to the context; empty halves of a turn are skipped.
	    for val in history or []:
	        user_part = val[0]
	        assistant_part = val[1]
	        if user_part:
	            messages.append({"role": "user", "content": user_part})
	            print(f"Added user message to context: {user_part}")
	        if assistant_part:
	            messages.append({"role": "assistant", "content": assistant_part})
	            print(f"Added assistant message to context: {assistant_part}")

	    # Append the latest user message
	    messages.append({"role": "user", "content": message})
	    return messages


	def _choose_model(custom_model, selected_featured_model):
	    """Pick the model id: a non-blank custom model wins, then the featured selection, else the default."""
	    for candidate in (custom_model, selected_featured_model):
	        if candidate and candidate.strip():
	            return candidate.strip()
	    return "meta-llama/Llama-3.3-70B-Instruct"  # Default


	def respond(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	    frequency_penalty,
	    seed,
	    custom_model,
	    selected_featured_model
	):
	    """
	    Stream a chatbot reply for the latest user message.

	    This is a generator: after each streamed chunk that carries text it yields
	    the response accumulated so far, so the UI can render it incrementally.

	    Parameters:
	    - message: the user's new message
	    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
	    - system_message: the system prompt (omitted from the request when blank)
	    - max_tokens, temperature, top_p, frequency_penalty: generation params passed to the API
	    - seed: generation seed; -1 is converted to None (random)
	    - custom_model: user-provided custom model path/name; overrides the featured model
	    - selected_featured_model: model chosen from the featured radio list
	    """
	    print(f"Received message: {message}")
	    print(f"History: {history}")
	    print(f"System message: {system_message}")
	    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
	    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
	    print(f"Custom model: {custom_model}")
	    print(f"Selected featured model: {selected_featured_model}")

	    # Convert seed to None if -1 (meaning random)
	    if seed == -1:
	        seed = None

	    # Construct the messages array required by the API
	    messages = _build_messages(message, history, system_message)

	    # Custom model overrides the featured selection; the default applies when both are blank.
	    model_to_use = _choose_model(custom_model, selected_featured_model)
	    print(f"Model selected for inference: {model_to_use}")

	    # Start building the streaming response
	    response = ""
	    print("Sending request to OpenAI API.")

	    # Make the streaming request to the HF Inference API via openai-like client
	    stream = client.chat.completions.create(
	        model=model_to_use,
	        max_tokens=max_tokens,
	        stream=True,  # Stream the response
	        temperature=temperature,
	        top_p=top_p,
	        frequency_penalty=frequency_penalty,
	        seed=seed,
	        messages=messages,
	    )
	    for message_chunk in stream:
	        # A chunk without choices has no delta to read; indexing [0] would raise IndexError.
	        if not message_chunk.choices:
	            continue

	        # Extract the token text from the response chunk
	        token_text = message_chunk.choices[0].delta.content
	        # delta.content is nullable in the streaming schema; concatenating None
	        # to the response string would raise TypeError, so skip such chunks.
	        if token_text is None:
	            continue

	        print(f"Received token: {token_text}", flush=True)
	        response += token_text
	        # Yield the partial response to Gradio so it can display in real-time
	        yield response

	    print("Completed response generation.")

	# =============================
	# MAIN UI
	# =============================

	def build_app():
	    """
	    Build the Gradio Blocks interface and return it without launching.

	    The interface contains three tabs:
	    - "Chat Interface": a ChatInterface driven by respond(), with the generation
	      controls as additional inputs
	    - "Featured Models": a filterable radio list plus a button that stores the
	      chosen model
	    - "Information": static reference material on the models and parameters

	    Returns the gr.Blocks object.
	    """
	    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as main_interface:

	        # gr.State holding the user's chosen featured model ("" until one is chosen)
	        selected_featured_model_state = gr.State("")

	        with gr.Tab("Chat Interface"):
	            gr.Markdown("## Serverless-TextGen-Hub")

	            # Here we embed the ChatInterface for streaming conversation.
	            # The "Selected Featured Model" input is hidden, so the user can't
	            # edit it directly but its value still flows into respond().
	            # The order of additional_inputs must match respond()'s parameters
	            # after (message, history).
	            demo = gr.ChatInterface(
	                fn=respond,
	                additional_inputs=[
	                    gr.Textbox(value="", label="System message", lines=2),
	                    gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
	                    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
	                    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
	                    gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty"),
	                    gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)"),
	                    gr.Textbox(value="", label="Custom Model", info="(Optional) Provide a custom HF model path"),
	                    gr.Textbox(value="", label="Selected Featured Model (from tab)", visible=False),
	                ],
	                fill_height=True,
	                chatbot=gr.Chatbot(height=600),
	                theme="Nymbo/Nymbo_Theme",
	            )

	            # Pass-through used to copy the state value into the hidden text box
	            def set_featured_model_in_chatbox(val):
	                return val

	            # Whenever the selected_featured_model_state changes, update the hidden field in the ChatInterface
	            selected_featured_model_state.change(
	                fn=set_featured_model_in_chatbox,
	                inputs=selected_featured_model_state,
	                outputs=demo.additional_inputs[-1],  # The last additional input is the "Selected Featured Model"
	            )

	        # ==========================
	        # Featured Models Tab
	        # ==========================
	        with gr.Tab("Featured Models"):
	            gr.Markdown("### Choose from our Featured Models")

	            # A text box for searching/filtering
	            model_search = gr.Textbox(
	                label="Filter Models",
	                placeholder="Search for a featured model..."
	            )

	            # A radio component listing the featured models (default to first).
	            # NOTE(review): the state above starts as "", so this pre-selected
	            # entry only reaches respond() after the button below is clicked.
	            model_radio = gr.Radio(
	                choices=featured_models_list,
	                label="Select a model below",
	                value=featured_models_list[0],
	                interactive=True
	            )

	            # Define how to update the radio choices when the search box changes
	            model_search.change(
	                fn=filter_featured_models,
	                inputs=model_search,
	                outputs=model_radio
	            )

	            # Handler for the confirmation button
	            def select_featured_model(radio_val):
	                """
	                Updates the hidden state with the user-chosen featured model.
	                """
	                return radio_val

	            choose_btn = gr.Button("Use this Featured Model", variant="primary")

	            choose_btn.click(
	                fn=select_featured_model,
	                inputs=model_radio,
	                outputs=selected_featured_model_state
	            )

	            gr.Markdown(
	                """
	                Tip: If you type a Custom Model in the "Chat Interface" tab, it overrides the
	                featured model you selected here.
	                """
	            )

	        # ==========================
	        # Information Tab
	        # ==========================
	        with gr.Tab("Information"):
	            gr.Markdown("## Learn More About These Models and Parameters")

	            with gr.Accordion("Featured Models (Table)", open=False):
	                gr.HTML(
	                    """
	                    <p>Below is a small sample table showing some featured models.</p>
	                    <table style="width:100%; text-align:center; margin:auto;">
	                    <tr>
	                    <th>Model Name</th>
	                    <th>Type</th>
	                    <th>Notes</th>
	                    </tr>
	                    <tr>
	                    <td>meta-llama/Llama-2-13B-chat-hf</td>
	                    <td>Chat</td>
	                    <td>Good for multi-turn dialogue.</td>
	                    </tr>
	                    <tr>
	                    <td>bigscience/bloom</td>
	                    <td>Language Model</td>
	                    <td>Large multilingual model.</td>
	                    </tr>
	                    <tr>
	                    <td>microsoft/DialoGPT-large</td>
	                    <td>Chat</td>
	                    <td>Well-known smaller chat model.</td>
	                    </tr>
	                    </table>
	                    """
	                )

	            with gr.Accordion("Parameters Overview", open=False):
	                # Help text fixes: Top-P now describes cumulative-probability
	                # (nucleus) sampling, and the Custom Model example is a chat
	                # model rather than a speech-recognition model.
	                gr.Markdown(
	                    """
	                    ### Explanation of Key Parameters

	                    - System Message: Provides context or initial instructions to the model.
	                    - Max Tokens: The maximum number of tokens (roughly pieces of words) in the generated response.
	                    - Temperature: Higher values produce more random/creative outputs, while lower values make the output more focused and deterministic.
	                    - Top-P: Controls nucleus sampling. The model samples only from the smallest set of most-likely tokens whose cumulative probability reaches this value.
	                    - Frequency Penalty: Penalizes repeated tokens. Positive values (like 1.0) reduce repetition in the output. Negative values can increase repetition.
	                    - Seed: Determines reproducibility. Set it to a fixed integer for consistent results; `-1` is random each time.
	                    - Custom Model: Overrides the featured model. Provide the Hugging Face path of a text-generation model (e.g., `meta-llama/Llama-3.3-70B-Instruct`) for your own usage.

	                    Use these settings to guide how the model generates text. If in doubt, stick to defaults and experiment in small increments.
	                    """
	                )

	    return main_interface

	# Script entry point: construct the interface and start serving it.
	if __name__ == "__main__":
	    print("Building and launching the Serverless-TextGen-Hub interface...")
	    build_app().launch()