# /// script
# dependencies = [
#     "fastrtc[vad, stt]==0.0.26.rc1",
#     "openai",
# ]
# ///
# PEP 723 inline metadata above: a PEP 723-aware runner such as `uv run`
# installs the dependencies automatically before executing the script.
import gradio as gr
import huggingface_hub
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    WebRTC,
    WebRTCData,
    WebRTCError,
    get_hf_turn_credentials,
    get_stt_model,
)
from gradio.utils import get_space
from openai import OpenAI

# Local speech-to-text model used to transcribe the user's audio turns.
stt_model = get_stt_model()

conversations = {}


def response(
    data: WebRTCData,
    conversation: list[dict],
    token: str | None = None,
    model: str = "meta-llama/Llama-3.2-3B-Instruct",
    provider: str = "sambanova",
):
    """Transcribe the user's turn, query the LLM, and stream the updated chat."""
    print("conversation before", conversation)
    if not provider.startswith("http") and not token:
        raise WebRTCError("Please add your HF token.")

    # Prefer the spoken input when audio is present; otherwise fall back to
    # the typed message from the textbox variant of the component.
    if data.audio is not None and data.audio[1].size > 0:
        user_audio_text = stt_model.stt(data.audio)
        conversation.append({"role": "user", "content": user_audio_text})
    else:
        conversation.append({"role": "user", "content": data.textbox})

    # Surface the user's message in the chatbot before the LLM answers.
    yield AdditionalOutputs(conversation)

    if provider.startswith("http"):
        # A custom URL is treated as an OpenAI-compatible server (e.g. Ollama).
        client = OpenAI(base_url=provider, api_key="ollama")
    else:
        client = huggingface_hub.InferenceClient(
            api_key=token,
            provider=provider,  # type: ignore
        )

    request = client.chat.completions.create(
        model=model,
        messages=conversation,  # type: ignore
        temperature=1,
        top_p=0.1,
    )
    # Renamed from `response` to avoid shadowing this generator's own name.
    assistant_message = {
        "role": "assistant",
        "content": request.choices[0].message.content,
    }
    conversation.append(assistant_message)
    print("conversation after", conversation)

    yield AdditionalOutputs(conversation)


# Hide the Gradio footer.
css = """
footer {
    display: none !important;
}
"""

providers = [
    "black-forest-labs",
    "cerebras",
    "cohere",
    "fal-ai",
    "fireworks-ai",
    "hf-inference",
    "hyperbolic",
    "nebius",
    "novita",
    "openai",
    "replicate",
    "sambanova",
    "together",
]


def hide_token(provider: str):
    # Hide the token field for custom server URLs (no HF token required) and
    # show it again when a named Hugging Face provider is selected.
    if provider.startswith("http"):
        return gr.Textbox(visible=False)
    return gr.Textbox(visible=True)


with gr.Blocks(css=css) as demo:
    gr.HTML(
        """

        <h1 style="text-align: center">Streaming Huggy FastRTC Chat</h1>

""" ) with gr.Sidebar(): token = gr.Textbox( placeholder="Place your HF token here", type="password", label="HF Token" ) model = gr.Dropdown( choices=["meta-llama/Llama-3.2-3B-Instruct"], allow_custom_value=True, label="Model", ) provider = gr.Dropdown( label="Provider", choices=providers, value="sambanova", info="Select a hf-compatible provider or type the url of your server, e.g. http://127.0.0.1:11434/v1 for ollama", allow_custom_value=True, ) provider.change(hide_token, inputs=[provider], outputs=[token]) cb = gr.Chatbot(type="messages", height=600) webrtc = WebRTC( modality="audio", mode="send", variant="textbox", rtc_configuration=get_hf_turn_credentials if get_space() else None, server_rtc_configuration=get_hf_turn_credentials(ttl=3_600 * 24 * 30) if get_space() else None, ) webrtc.stream( ReplyOnPause(response), # type: ignore inputs=[webrtc, cb, token, model, provider], outputs=[cb], concurrency_limit=100, ) webrtc.on_additional_outputs( lambda old, new: new, inputs=[cb], outputs=[cb], concurrency_limit=100 ) if __name__ == "__main__": demo.launch(server_port=7860)