import gradio as gr
from huggingface_hub import InferenceClient

# -- 1) DEFINE YOUR MODELS HERE --
models = [
    {
        "name": "Tiny Model",
        "description": "A small chat model.",
        "id": "amusktweewt/tiny-model-500M-chat-v2",  # The one you say works already
        "enabled": True
    },
    {
        "name": "Another Model",
        "description": "A bigger chat model (disabled).",
        "id": "another-model",
        "enabled": False
    }
]

# Build the custom HTML for a disabled-capable dropdown.
# NOTE: the original markup was lost here; the code below is a hedged
# reconstruction. It assumes a plain <select> whose inline `onchange` handler
# copies the chosen model ID into the hidden Gradio textbox defined further
# down (elem_id="hidden_model") and dispatches an `input` event so Gradio
# registers the change. It also assumes the hidden textbox still renders a
# <textarea> in the DOM even when visible=False.
dropdown_options = ""
for m in models:
    disabled_attr = "" if m["enabled"] else "disabled"
    dropdown_options += (
        f'<option value="{m["id"]}" {disabled_attr}>'
        f'{m["name"]}: {m["description"]}</option>\n'
    )

dropdown_html = f"""
<label for="model_select"><b>Select a model:</b></label>
<select id="model_select" onchange="
    const hidden = document.querySelector('#hidden_model textarea');
    hidden.value = this.value;
    hidden.dispatchEvent(new Event('input', {{bubbles: true}}));
">
{dropdown_options}
</select>
"""


def respond(message, history: list[tuple[str, str]], model_id, system_message,
            max_tokens, temperature, top_p):
    """
    Builds a chat prompt using a simple template:
      - Optionally includes a system message.
      - Iterates over the conversation history (each exchange as a
        (user, assistant) tuple).
      - Adds the new user message.
    Then it streams the response from the model.
    """
    # -- 2) Instantiate the InferenceClient using the chosen model --
    client = InferenceClient(model_id)

    # Build the messages list.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    if history:
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    response_text = ""
    # Stream the response token by token.
    for resp in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = resp.choices[0].delta.content
        if token:  # some streamed chunks carry no content (e.g. role-only deltas)
            response_text += token
            yield response_text


# -- 3) BUILD THE UI IN A BLOCKS CONTEXT (so we can add custom HTML above the chat) --
with gr.Blocks() as demo:
    # Our custom HTML dropdown (shows model + description, supports disabled).
    gr.HTML(value=dropdown_html)

    # Hidden textbox to store the current model ID (read by `respond`).
    hidden_model = gr.Textbox(
        value=models[0]["id"],  # Default to the first model
        visible=False,
        elem_id="hidden_model"
    )

    # The ChatInterface is almost the same as the original code;
    # we simply add `hidden_model` as one more input argument.
    chat = gr.ChatInterface(
        respond,
        additional_inputs=[
            hidden_model,
            gr.Textbox(
                value="You are a friendly Chatbot.",
                label="System message"
            ),
            gr.Slider(
                minimum=1,
                maximum=2048,
                value=512,
                step=1,
                label="Max new tokens"
            ),
            gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)"
            ),
        ]
    )

if __name__ == "__main__":
    demo.launch()
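
# A minimal way to run this script locally (the filename `app.py` is an
# assumption; any name works, and a Hugging Face token may be required for
# the Inference API depending on the model):
#
#   pip install gradio huggingface_hub
#   python app.py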