Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Serverless-TextGen-Hub / app.py

Nymbo

Update app.py

542c2ac verified 6 months ago

raw

history blame

12.4 kB

	import gradio as gr
	from openai import OpenAI
	import os

	# --------------------------------------------------------------------------------
	# Serverless-TextGen-Hub
	# This application is a Gradio-based UI for text generation using
	# Hugging Face's serverless Inference API. We also incorporate features
	# inspired by the ImgGen-Hub, such as:
	# - A "Featured Models" accordion with text filtering.
	# - A "Custom Model" textbox for specifying a non-featured model.
	# - An "Information" tab with accordions for "Featured Models" and
	# "Parameters Overview" containing helpful user guides.
	# --------------------------------------------------------------------------------

	# Read the Hugging Face Inference API key from the HF_TOKEN environment variable.
	ACCESS_TOKEN = os.getenv("HF_TOKEN")
	# Report the real state: os.getenv returns None when the variable is unset,
	# and claiming the token was loaded in that case hides the cause of auth errors.
	if ACCESS_TOKEN:
	    print("Access token loaded.")
	else:
	    print("Warning: HF_TOKEN is not set; no access token loaded.")

	# OpenAI-compatible client pointed at the Hugging Face Inference API endpoint.
	client = OpenAI(
	    base_url="https://api-inference.huggingface.co/v1/",
	    api_key=ACCESS_TOKEN,
	)
	print("OpenAI client initialized.")

	def respond(
	    message,
	    history: list[tuple[str, str]],
	    system_message,
	    max_tokens,
	    temperature,
	    top_p,
	    frequency_penalty,
	    seed,
	    # Inputs for model selection
	    model_search,
	    selected_model,
	    custom_model,
	):
	    """
	    Stream the chatbot's reply to the newest user message.

	    Parameters:
	    - message: The user's newest message (string).
	    - history: Previous turns of the conversation, each indexable as
	      (user_msg, assistant_msg). Empty parts are skipped.
	    - system_message: The system prompt placed first in the request.
	    - max_tokens: The maximum number of tokens to generate in the response.
	    - temperature: Sampling temperature (float).
	    - top_p: Top-p (nucleus) sampling (float).
	    - frequency_penalty: Penalize repeated tokens in the output (float).
	    - seed: A fixed seed for reproducibility; -1 means 'random'.
	    - model_search: Text of the "Featured Models" filter box. Only logged here.
	    - selected_model: The model selected via the "Featured Models" Radio button.
	    - custom_model: If not blank, overrides selected_model with this custom path.

	    Yields:
	    - The response text accumulated so far, once per streamed chunk that
	      carries a choice.
	    """

	    # DEBUG LOGGING
	    print(f"Received message: {message}")
	    print(f"History: {history}")
	    print(f"System message: {system_message}")
	    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
	    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
	    print(f"Model search text: {model_search}")
	    print(f"Selected featured model: {selected_model}")
	    print(f"Custom model (overrides if not empty): {custom_model}")

	    # -1 is the UI's sentinel for "no fixed seed"; send None to the API instead.
	    if seed == -1:
	        seed = None

	    # A non-blank custom model wins over the featured selection.
	    # `or ""` guards against the textbox delivering None instead of a string.
	    custom_name = (custom_model or "").strip()
	    model_to_use = custom_name if custom_name else selected_model

	    # Build the messages array for the OpenAI-style chat API:
	    # system prompt, then prior turns, then the new user message.
	    messages = [{"role": "system", "content": system_message}]
	    for turn in history or []:
	        user_part = turn[0]
	        assistant_part = turn[1]
	        if user_part:
	            messages.append({"role": "user", "content": user_part})
	        if assistant_part:
	            messages.append({"role": "assistant", "content": assistant_part})
	    messages.append({"role": "user", "content": message})

	    # The reply is built up incrementally as chunks stream in.
	    response = ""
	    print(f"Using model: {model_to_use}")
	    print("Sending request to OpenAI API...")

	    stream = client.chat.completions.create(
	        model=model_to_use,  # dynamically selected above
	        max_tokens=max_tokens,
	        stream=True,
	        temperature=temperature,
	        top_p=top_p,
	        frequency_penalty=frequency_penalty,
	        seed=seed,
	        messages=messages,
	    )
	    for message_chunk in stream:
	        # Some chunks carry no choices at all; indexing [0] would raise IndexError.
	        if not message_chunk.choices:
	            continue
	        # delta.content is None on chunks without text (e.g. role-only or
	        # finish chunks); concatenating None to a str would raise TypeError.
	        token_text = message_chunk.choices[0].delta.content or ""
	        response += token_text
	        # Yield the running text so the UI shows the reply growing in place.
	        yield response

	    print("Completed response generation.")

	# Chat display component (fixed height), handed to the ChatInterface later on.
	chatbot = gr.Chatbot(
	    height=600,
	)

	# ------------------------------------------------------------
	# UI overview. The layout mirrors parts of the ImgGen-Hub code:
	# - "Featured Models" accordion whose list can be filtered
	# - "Custom Model" text box
	# - "Information" tab holding a "Featured Models" table and a
	#   "Parameters Overview" section written in markdown.
	# ------------------------------------------------------------

	# Placeholder "Featured Models" offered in the Radio selector (demo values).
	featured_models_list = [
	    "meta-llama/Llama-3.3-70B-Instruct",
	    "meta-llama/Llama-2-70B-chat-hf",
	    "meta-llama/Llama-2-13B-chat-hf",
	    "bigscience/bloom",
	    "google/flan-t5-xxl",
	]

	# Narrows featured_models_list to the entries matching the user's filter text
	def filter_models(search_term):
	    """
	    Return a Gradio update restricting the Radio choices to matching models.

	    Matching is a case-insensitive substring test against each entry of
	    featured_models_list. Surrounding whitespace in 'search_term' is ignored,
	    and a missing (None) or blank term matches every model.
	    """
	    # Normalise once, outside the loop; `or ""` tolerates a None search term.
	    needle = (search_term or "").strip().lower()
	    filtered = [m for m in featured_models_list if needle in m.lower()]
	    return gr.update(choices=filtered)

	print("Initializing Gradio interface...") # Debug log

	# Custom Blocks layout with two tabs: "Chat" and "Information".
	with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:

	    # Page heading
	    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")

	    with gr.Tab("Chat"):
	        # Model pickers come first, then the sampling controls, then the chat itself.

	        with gr.Accordion("Featured Models", open=False):
	            with gr.Row():
	                model_search = gr.Textbox(
	                    label="Filter Models",
	                    placeholder="Search for a featured model...",
	                    lines=1,
	                )
	            with gr.Row():
	                model_radio = gr.Radio(
	                    label="Select a featured model below",
	                    choices=featured_models_list,
	                    value="meta-llama/Llama-3.3-70B-Instruct",
	                    interactive=True,
	                )
	            # Typing in the filter box refreshes the Radio's choices.
	            model_search.change(
	                filter_models,
	                inputs=model_search,
	                outputs=model_radio,
	            )

	        # A non-empty custom model path takes precedence over the featured selection.
	        custom_model = gr.Textbox(
	            label="Custom Model Path (overrides Featured Models if not empty)",
	            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
	            lines=1,
	        )

	        # Generation controls, created in the order 'respond' receives them.
	        system_message_box = gr.Textbox(
	            value="You are a helpful AI assistant.",
	            label="System message",
	            lines=2,
	        )
	        max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
	        temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
	        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
	        frequency_penalty_slider = gr.Slider(
	            minimum=-2.0,
	            maximum=2.0,
	            value=0.0,
	            step=0.1,
	            label="Frequency Penalty",
	        )
	        seed_slider = gr.Slider(
	            minimum=-1,
	            maximum=65535,
	            value=-1,
	            step=1,
	            label="Seed (-1 for random)",
	        )

	        # The chat interface forwards every control below to 'respond' after
	        # (message, history), so the list order must match its signature.
	        chatbot_interface = gr.ChatInterface(
	            fn=respond,
	            additional_inputs=[
	                system_message_box,        # system_message
	                max_tokens_slider,         # max_tokens
	                temperature_slider,        # temperature
	                top_p_slider,              # top_p
	                frequency_penalty_slider,  # frequency_penalty
	                seed_slider,               # seed
	                model_search,              # filter text; passed along but only logged
	                model_radio,               # selected_model
	                custom_model,              # custom_model
	            ],
	            chatbot=chatbot,
	            title="Serverless-TextGen-Hub",
	            # fill_height lets the chat area expand
	            fill_height=True,
	        )

	    # Second tab: reference material on the models and the parameters
	    with gr.Tab("Information"):
	        gr.Markdown("## Learn More About the Parameters and Models")

	        # Static table of example models
	        with gr.Accordion("Featured Models (WiP)", open=False):
	            gr.HTML(
	                """
	<p>Below is a small table of example models. In practice, you can pick from
	thousands of available text generation models on Hugging Face.
	<br>
	Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion
	in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
	<table style="width:100%; text-align:center; margin:auto;">
	<tr>
	<th>Model Name</th>
	<th>Is It Large?</th>
	<th>Notes</th>
	</tr>
	<tr>
	<td>meta-llama/Llama-3.3-70B-Instruct</td>
	<td>Yes</td>
	<td>Placeholder example</td>
	</tr>
	<tr>
	<td>meta-llama/Llama-2-13B-chat-hf</td>
	<td>Medium</td>
	<td>Placeholder example</td>
	</tr>
	<tr>
	<td>google/flan-t5-xxl</td>
	<td>Yes</td>
	<td>Placeholder example</td>
	</tr>
	</table>
	"""
	            )

	        # User guide for each generation parameter
	        with gr.Accordion("Parameters Overview", open=False):
	            gr.Markdown(
	                """
	### Max New Tokens
	Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.

	### Temperature
	A higher temperature makes the AI more 'creative' and random in its responses. Lower temperature keeps it more focused and deterministic.

	### Top-P
	This is 'nucleus sampling.' It dictates the proportion of probability mass the model considers. At 1.0, it considers all words. Lower it to focus on the most likely words.

	### Frequency Penalty
	Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce the repetition.

	### Seed
	If set to -1, the randomness is different each time. Setting a specific number ensures the same result each run, making responses reproducible.

	### Custom Model
	If this field is filled, it overrides the selection from Featured Models. This way, you can try out any model on the HF Hub, e.g.
	<code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
	"""
	            )

	print("Gradio interface initialized.")

	# ------------------------------------------------------------
	# Entry point: start the Gradio app only when this file is
	# executed as a script, not when it is imported.
	# ------------------------------------------------------------
	if __name__ == "__main__":
	    print("Launching the demo application...")
	    demo.launch()