import gradio as gr
import torch
import llava
from peft import PeftModel
import os
from huggingface_hub import snapshot_download

# ---------------------------------
# MULTI-TURN MODEL SETUP
# ---------------------------------
MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")

# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
model_multi = llava.load(MODEL_BASE_MULTI, model_base=None)
model_multi = model_multi.to("cuda")
generation_config_multi = model_multi.default_generation_config

# ---------------------------------
# MULTI-TURN INFERENCE FUNCTION
# ---------------------------------
def multi_turn_chat(user_input, audio_file, history, current_audio):
    """Answer one chat turn, grounded in the most recently uploaded audio clip."""
    try:
        if audio_file is not None:
            current_audio = audio_file  # Update state if a new file is uploaded
        if current_audio is None:
            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
        sound = llava.Sound(current_audio)
        # "<sound>" is the llava media token marking where the clip enters the prompt.
        prompt = f"<sound>\n{user_input}"
        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
        history.append((user_input, response))
        return history, history, current_audio
    except Exception as e:
        history.append((user_input, f"❌ Error: {str(e)}"))
        return history, history, current_audio


def speech_prompt_infer(audio_prompt_file):
    """Use the uploaded recording itself as the (spoken) prompt."""
    try:
        sound = llava.Sound(audio_prompt_file)
        # "<speech>" is the llava media token for a spoken prompt with no extra text.
        full_prompt = "<speech>"
        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_multi)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
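
# ---------------------------------
# HEADLESS USAGE (SKETCH)
# ---------------------------------
# A minimal sketch of querying the model without the Gradio UI, mirroring the
# handlers above; it uses one of the bundled example clips, and the question is
# illustrative. Kept commented out so the app does not run an extra inference
# at startup.
#
#   sound = llava.Sound("static/chat/audio1.mp3")
#   reply = model_multi.generate_content(
#       [sound, "<sound>\nDescribe this clip in one sentence."],
#       generation_config=generation_config_multi,
#   )
#   print(reply)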
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container { max-width: 100% !important; width: 100% !important; margin: 0 !important; padding: 0 !important; }
#component-0, .gr-block.gr-box { width: 100% !important; }
.gr-block.gr-box, .gr-column, .gr-row { padding: 0 !important; margin: 0 !important; }
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
          <!-- TODO: restore the logo <img alt="Audio Flamingo 3 Logo"> and the
               arXiv / Demo Page / GitHub / GitHub Stars badge links here -->
          <h1>Audio Flamingo 3</h1>
          <p><em>Advancing Audio Intelligence with Fully Open Large Audio-Language Models</em></p>
        </div>
""") # gr.Markdown("#### NVIDIA (2025)") with gr.Tabs(): # ---------------- MULTI-TURN CHAT ---------------- with gr.Tab("๐Ÿ’ฌ Multi-Turn Chat"): chatbot = gr.Chatbot(label="Audio Chatbot") audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context") user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8) btn_multi = gr.Button("Send") history_state = gr.State([]) # Chat history current_audio_state = gr.State(None) # Most recent audio file path btn_multi.click( fn=multi_turn_chat, inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state], outputs=[chatbot, history_state, current_audio_state] ) gr.Examples( examples=[ ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"], ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"], ], inputs=[audio_input_multi, user_input_multi], label="๐Ÿงช Try Examples" ) with gr.Tab("๐Ÿ—ฃ๏ธ Speech Prompt"): gr.Markdown("Use your **voice** to talk to the model.") with gr.Row(): with gr.Column(): speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio") btn_speech = gr.Button("Submit") gr.Examples( examples=[ ["static/voice/voice_0.mp3"], ["static/voice/voice_1.mp3"], ["static/voice/voice_2.mp3"], ], inputs=speech_input, label="๐Ÿงช Try Examples" ) with gr.Column(): response_box = gr.Textbox(label="Model Response", lines=15) btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box) # ---------------- ABOUT ---------------- with gr.Tab("๐Ÿ“„ About"): gr.Markdown(""" ### ๐Ÿ“š Overview **Audio Flamingo 3** is a fully open state-of-the-art (SOTA) large audio-language model that advances reasoning and understanding across speech, sound, and music. AF3 introduces: (i) AF-Whisper, a unified audio encoder trained using a novel strategy for joint representation learning across all 3 modalities of speech, sound, and music; (ii) flexible, on-demand thinking, allowing the model to do chain-of-thought reasoning before answering; (iii) multi-turn, multi-audio chat; (iv) long audio understanding and reasoning (including speech) up to 10 minutes; and (v) voice-to-voice interaction. To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained on only open-source audio data, AF3 achieves new SOTA results on over 20+ (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets. **Key Features:** ๐Ÿ’ก Audio Flamingo 3 has strong audio, music and speech understanding capabilities. ๐Ÿ’ก Audio Flamingo 3 supports on-demand thinking for chain-of-thought reasoning. ๐Ÿ’ก Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes. ๐Ÿ’ก Audio Flamingo 3 can have multi-turn, multi-audio chat with users under complex context. ๐Ÿ’ก Audio Flamingo 3 has voice-to-voice conversation abilities. """) gr.Markdown("ยฉ 2025 NVIDIA | Built with โค๏ธ using Gradio + PyTorch") # ----------------------- # Launch App # ----------------------- if __name__ == "__main__": demo.launch(share=True)