import gradio as gr
from huggingface_hub import InferenceClient
import os

# --- Installation Note ---
# Ensure you have the necessary libraries installed:
# pip install gradio huggingface_hub

# --- Hugging Face Hub Token ---
# The InferenceClient might require a Hugging Face Hub token for certain models or usage.
# Set it as an environment variable HUGGING_FACE_HUB_TOKEN, or log in via `huggingface-cli login`.
# If the model is public and doesn't require login, this might work without a token.
# HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN") # Optional: explicitly get token if needed
client = None
try:
    client = InferenceClient(
        "HuggingFaceH4/zephyr-7b-beta",
        # token=HUGGING_FACE_HUB_TOKEN # Uncomment if you want to pass token explicitly
    )
    print("InferenceClient initialized successfully.")
except Exception as e:
    print(f"Error initializing InferenceClient: {e}")
    print("Please ensure the model identifier is correct and you have necessary permissions/token.")
    # You might want to exit or raise the error depending on your application structure
    # For this Gradio app, we'll let the respond function handle the missing client.
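
# --- Optional connectivity check ---
# A minimal, commented-out sketch for verifying the endpoint responds before
# launching the UI; the "ping" prompt is an arbitrary test message, not part
# of the app. Uses the non-streaming chat_completion API.
# if client is not None:
#     ping = client.chat_completion(
#         messages=[{"role": "user", "content": "ping"}],
#         max_tokens=5,
#     )
#     print("Connectivity check reply:", ping.choices[0].message.content)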


def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str = "You are a friendly and helpful chatbot.", # Default value matching UI
    max_tokens: int = 512, # Default value matching UI
    temperature: float = 0.7, # Default value matching UI
    top_p: float = 0.95, # Default value matching UI
):
    """
    Chat response function for the Gradio interface.
    """
    # --- Client Check ---
    if client is None:
        yield "Error: InferenceClient could not be initialized. Please check server logs."
        return # Stop generation if client is not available

    # --- Input Validation (Basic) ---
    if not message:
        yield "Error: Please enter a message."
        return
    if not system_message:
        system_message = "You are a helpful assistant." # Fallback system message

    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    response_text = ""

    try:
        # Stream the response
        for message_chunk in client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # The delta may carry no content (e.g. role-only or empty chunks)
            token = message_chunk.choices[0].delta.content

            # --- Robust Token Handling ---
            if token is not None:
                response_text += token
                yield response_text # Yield the accumulated response incrementally

    except Exception as e:
        print(f"Error during API call: {e}")
        # Yield a user-friendly error message
        yield f"An error occurred while generating the response: {e}"


# --- Gradio Interface Definition ---
demo = gr.ChatInterface(
    respond,
    chatbot=gr.Chatbot(
        height=500,
        label="Zephyr 7B Beta",
        show_label=True,
        bubble_full_width=False, # Optional: don't stretch message bubbles to full width
    ),
    title="🤖 Zephyr 7B Beta Chat",
    description="Chat with the Zephyr 7B Beta model using the Hugging Face Inference API. \nEnter your message and adjust settings below.",
    examples=[
        ["Hello, how are you today?"],
        ["What is the capital of France?"],
        ["Explain the concept of large language models in simple terms."],
        ["Write a short poem about the rain."]
    ],
    cache_examples=False, # Set to True to cache example results if desired
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly and helpful chatbot.", # Default system message
            label="System Message",
            info="The instruction given to the chatbot to guide its behavior.",
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512, # Default max tokens
            step=1,
            label="Max New Tokens",
            info="Maximum number of tokens to generate."
        ),
        gr.Slider(
            minimum=0.1,
            # Max temperature adjusted: values > 1.0 often degrade quality
            maximum=1.0,
            value=0.7, # Default temperature
            step=0.1,
            label="Temperature",
            info="Controls randomness. Lower values make output more focused, higher values make it more diverse."
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95, # Default top-p
            step=0.05,
            label="Top-p (nucleus sampling)",
            info="Considers only the most probable tokens with cumulative probability p. Helps prevent low-probability tokens."
        ),
    ],
    additional_inputs_accordion_name="⚙️ Advanced Settings" # Group settings (renamed to additional_inputs_accordion in Gradio 4+)
)


if __name__ == "__main__":
    # Launch the Gradio app
    demo.launch(
        # share=True # Uncomment to create a temporary public link (use with caution)
        # server_name="0.0.0.0" # Uncomment to allow access from your local network
        # auth=("user", "password") # Optional: Add basic authentication
    )