import gradio as gr
from huggingface_hub import InferenceClient
# Import the correct exception class
from huggingface_hub.utils import HfHubHTTPError
import os

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference

**Note:** You might need to authenticate with Hugging Face for this to work reliably.
Run `huggingface-cli login` in your terminal or set the HUGGING_FACE_HUB_TOKEN environment variable.
Alternatively, pass your token directly: InferenceClient(token="hf_YOUR_TOKEN")
"""
# Initialize the Inference Client
# It will try to use HUGGING_FACE_HUB_TOKEN environment variable or cached login
try:
    # You might need to provide a token if you haven't logged in via CLI
    # token = os.getenv("HUGGING_FACE_HUB_TOKEN")
    # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", token=token)
    client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
except Exception as e:
    print(f"Error initializing InferenceClient: {e}")
    raise ValueError("Could not initialize InferenceClient. Ensure you are logged in or provide a token.") from e
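# A minimal connectivity check, kept commented out so the app makes no network
# call at import time. It uses the same `chat_completion` API as `respond`
# below, just without streaming (a sketch, not required for the app):
#
#     probe = client.chat_completion(
#         messages=[{"role": "user", "content": "ping"}],
#         max_tokens=5,
#     )
#     print(probe.choices[0].message.content)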


def respond(
    message: str,
    history: list[tuple[str | None, str | None]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """
    Generates a response using the Hugging Face Inference API.

    Args:
        message: The user's input message.
        history: A list of tuples representing the conversation history.
                 Each tuple is (user_message, bot_message).
        system_message: The system prompt to guide the model.
        max_tokens: The maximum number of new tokens to generate.
        temperature: Controls randomness (higher = more random).
        top_p: Nucleus sampling parameter.

    Yields:
        The generated response incrementally.
    """
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})

    # Add the latest user message
    messages.append({"role": "user", "content": message})
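    # At this point `messages` has the OpenAI-style shape that
    # `chat_completion` expects, e.g.:
    #   [{"role": "system", "content": system_message},
    #    {"role": "user", "content": "..."},
    #    {"role": "assistant", "content": "..."},
    #    {"role": "user", "content": message}]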

    response = ""
    try:
        # Start streaming the response
        for msg_chunk in client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # Some providers emit a final chunk with an empty `choices` list
            # (e.g. carrying only usage info); skip those defensively.
            if not msg_chunk.choices:
                continue
            token = msg_chunk.choices[0].delta.content
            if token:  # Skip empty/None tokens
                response += token
                yield response  # Yield the accumulated response so far

    # Catch HTTP errors from the Hugging Face Hub API
    except HfHubHTTPError as e:
        error_message = f"Inference API Error: {e}"
        # Try to get more details from the response if available
        # Note: `if e.response:` would be False for error statuses, since
        # requests.Response.__bool__ returns `ok`; check for None explicitly.
        if e.response is not None:
            try:
                details = e.response.json()
                error_message += f"\nDetails: {details.get('error', 'N/A')}"
            except Exception: # Catch potential JSON decoding errors
                pass # Keep the original error message
        print(error_message)
        yield f"Sorry, I encountered an error communicating with the model service: {e}" # Display a user-friendly message

    # Catch other potential errors
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Sorry, an unexpected error occurred: {e}"


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    chatbot=gr.Chatbot(height=400), # Adjust chatbot height if desired
    textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
    title="Zephyr 7B Beta Chat",
    description="Chat with the Zephyr 7B Beta model using the Hugging Face Inference API.",
    theme="soft", # Optional: Apply a theme
    examples=[
        ["Hello!"],
        ["Explain the concept of Large Language Models in simple terms."],
        ["Write a short poem about the moon."],
    ],
    cache_examples=False, # Set to True to cache example results
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    additional_inputs=[
        gr.Textbox(value="You are a friendly and helpful chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),  # Some backends accept up to 2.0; 1.0 is a conservative cap here
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    additional_inputs_accordion=gr.Accordion(label="Advanced Options", open=False),  # Group additional inputs
)


if __name__ == "__main__":
    # Ensure huggingface_hub library is up-to-date: pip install --upgrade huggingface_hub
    print("Launching Gradio Interface...")
    demo.launch()
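    # Other standard launch options: demo.launch(share=True) creates a
    # temporary public link, and demo.launch(server_name="0.0.0.0",
    # server_port=7860) binds to a specific host/port.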