# /// script
# dependencies = [
#     "fastrtc[vad, stt]==0.0.26.rc1",
#     "openai",
# ]
# ///
# PEP 723 inline metadata above: a PEP 723-aware runner such as `uv run`
# installs the dependencies automatically before executing the script.
import gradio as gr
import huggingface_hub
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    WebRTC,
    WebRTCData,
    WebRTCError,
    get_hf_turn_credentials,
    get_stt_model,
)
from gradio.utils import get_space
from openai import OpenAI

# Local speech-to-text model used to transcribe the user's audio turns.
stt_model = get_stt_model()

conversations = {}


def response(
    data: WebRTCData,
    conversation: list[dict],
    token: str | None = None,
    model: str = "meta-llama/Llama-3.2-3B-Instruct",
    provider: str = "sambanova",
):
    """Transcribe the user's turn, query the LLM, and stream the updated chat."""
    print("conversation before", conversation)
    if not provider.startswith("http") and not token:
        raise WebRTCError("Please add your HF token.")

    # Prefer the spoken input when audio is present; otherwise fall back to
    # the typed message from the textbox variant of the component.
    if data.audio is not None and data.audio[1].size > 0:
        user_audio_text = stt_model.stt(data.audio)
        conversation.append({"role": "user", "content": user_audio_text})
    else:
        conversation.append({"role": "user", "content": data.textbox})

    # Surface the user's message in the chatbot before the LLM answers.
    yield AdditionalOutputs(conversation)

    if provider.startswith("http"):
        # A custom URL is treated as an OpenAI-compatible server (e.g. Ollama).
        client = OpenAI(base_url=provider, api_key="ollama")
    else:
        client = huggingface_hub.InferenceClient(
            api_key=token,
            provider=provider,  # type: ignore
        )

    request = client.chat.completions.create(
        model=model,
        messages=conversation,  # type: ignore
        temperature=1,
        top_p=0.1,
    )
    # Renamed from `response` to avoid shadowing this generator's own name.
    assistant_message = {
        "role": "assistant",
        "content": request.choices[0].message.content,
    }
    conversation.append(assistant_message)
    print("conversation after", conversation)

    yield AdditionalOutputs(conversation)


# Hide the Gradio footer.
css = """
footer {
    display: none !important;
}
"""

providers = [
    "black-forest-labs",
    "cerebras",
    "cohere",
    "fal-ai",
    "fireworks-ai",
    "hf-inference",
    "hyperbolic",
    "nebius",
    "novita",
    "openai",
    "replicate",
    "sambanova",
    "together",
]


def hide_token(provider: str):
    # Hide the token field for custom server URLs (no HF token required) and
    # show it again when a named Hugging Face provider is selected.
    if provider.startswith("http"):
        return gr.Textbox(visible=False)
    return gr.Textbox(visible=True)


with gr.Blocks(css=css) as demo:
    gr.HTML(
        """

        <h1 style="text-align: center">Streaming Huggy FastRTC Chat</h1>

""" ) with gr.Sidebar(): token = gr.Textbox( placeholder="Place your HF token here", type="password", label="HF Token" ) model = gr.Dropdown( choices=["meta-llama/Llama-3.2-3B-Instruct"], allow_custom_value=True, label="Model", ) provider = gr.Dropdown( label="Provider", choices=providers, value="sambanova", info="Select a hf-compatible provider or type the url of your server, e.g. http://127.0.0.1:11434/v1 for ollama", allow_custom_value=True, ) provider.change(hide_token, inputs=[provider], outputs=[token]) cb = gr.Chatbot(type="messages", height=600) webrtc = WebRTC( modality="audio", mode="send", variant="textbox", rtc_configuration=get_hf_turn_credentials if get_space() else None, server_rtc_configuration=get_hf_turn_credentials(ttl=3_600 * 24 * 30) if get_space() else None, ) webrtc.stream( ReplyOnPause(response), # type: ignore inputs=[webrtc, cb, token, model, provider], outputs=[cb], concurrency_limit=100, ) webrtc.on_additional_outputs( lambda old, new: new, inputs=[cb], outputs=[cb], concurrency_limit=100 ) if __name__ == "__main__": demo.launch(server_port=7860)