import gradio as gr
import torch
import llava
from huggingface_hub import snapshot_download

# ---------------------------------
# MULTI-TURN MODEL SETUP
# ---------------------------------
MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")

model_multi = llava.load(MODEL_BASE_MULTI, model_base=None)
# Fall back to CPU when no GPU is available instead of crashing at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_multi = model_multi.to(device)
generation_config_multi = model_multi.default_generation_config
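
# Optional decoding tweaks (a hedged sketch, not part of the original app):
# `default_generation_config` is assumed to expose transformers-style
# GenerationConfig fields, so a token budget could be set like this:
# generation_config_multi.max_new_tokens = 512  # hypothetical value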

# ---------------------------------
# MULTI-TURN INFERENCE FUNCTION
# ---------------------------------
def multi_turn_chat(user_input, audio_file, history, current_audio):
    """
    Handle multi-turn chat interactions with audio context.

    Args:
        user_input: The user's text message/question about the audio
        audio_file: New audio file path if uploaded, otherwise None
        history: List of previous conversation turns as (user_msg, bot_response) tuples
        current_audio: Path to the currently active audio file in the conversation

    Returns:
        Tuple of (updated_chatbot_display, updated_history, updated_current_audio)
    """
    try:
        if audio_file is not None:
            current_audio = audio_file  # Update state if a new file is uploaded
        if current_audio is None:
            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
        sound = llava.Sound(current_audio)
        prompt = f"<sound>\n{user_input}"
        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
        history.append((user_input, response))
        return history, history, current_audio
    except Exception as e:
        history.append((user_input, f"❌ Error: {str(e)}"))
        return history, history, current_audio
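
# A minimal sketch of driving the handler outside the UI, reusing an example
# clip bundled with this Space. The first call attaches the audio; follow-up
# turns pass `None` and reuse the returned `current_audio` state.
# chat, hist, cur = multi_turn_chat(
#     "What instruments do you hear?", "static/chat/audio1.mp3", [], None
# )
# chat, hist, cur = multi_turn_chat("And what mood does it convey?", None, hist, cur)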

def speech_prompt_infer(audio_prompt_file):
    """
    Process speech/audio input and generate a text response.

    Args:
        audio_prompt_file: Path to the audio file containing the user's speech prompt

    Returns:
        String containing the model's text response or error message
    """
    try:
        sound = llava.Sound(audio_prompt_file)
        full_prompt = "<sound>"
        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_multi)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"

# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 !important;
}
#component-0, .gr-block.gr-box {
    width: 100% !important;
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
          <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="120" style="margin-bottom: 10px;">
          <h2><strong>Audio Flamingo 3</strong></h2>
          <p><em>Advancing Audio Intelligence with Fully Open Large Audio-Language Models</em></p>
        </div>
        <div align="center" style="margin-top: 10px;">
| <a href="https://arxiv.org/abs/2507.08128"> | |
| <img src="https://img.shields.io/badge/arXiv-2503.03983-AD1C18" alt="arXiv" style="display:inline;"> | |
| </a> | |
| <a href="https://research.nvidia.com/labs/adlr/AF3/"> | |
| <img src="https://img.shields.io/badge/Demo%20page-228B22" alt="Demo Page" style="display:inline;"> | |
| </a> | |
| <a href="https://github.com/NVIDIA/audio-flamingo"> | |
| <img src="https://img.shields.io/badge/Github-Audio_Flamingo_3-9C276A" alt="GitHub" style="display:inline;"> | |
| </a> | |
| <a href="https://github.com/NVIDIA/audio-flamingo/stargazers"> | |
| <img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="GitHub Stars" style="display:inline;"> | |
| </a> | |
| </div> | |
| <div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;"> | |
| <a href="https://huggingface.co/nvidia/audio-flamingo-3"> | |
| <img src="https://img.shields.io/badge/π€-Checkpoints-ED5A22.svg"> | |
| </a> | |
| <a href="https://huggingface.co/nvidia/audio-flamingo-3-chat"> | |
| <img src="https://img.shields.io/badge/π€-Checkpoints_(Chat)-ED5A22.svg"> | |
| </a> | |
| </div> | |
| <div align="center" style="display: flex; justify-content: center; margin-top: 10px; flex-wrap: wrap; gap: 5px;"> | |
| <a href="https://huggingface.co/datasets/nvidia/AudioSkills"> | |
| <img src="https://img.shields.io/badge/π€-Dataset:_AudioSkills--XL-ED5A22.svg"> | |
| </a> | |
| <a href="https://huggingface.co/datasets/nvidia/LongAudio"> | |
| <img src="https://img.shields.io/badge/π€-Dataset:_LongAudio--XL-ED5A22.svg"> | |
| </a> | |
| <a href="https://huggingface.co/datasets/nvidia/AF-Chat"> | |
| <img src="https://img.shields.io/badge/π€-Dataset:_AF--Chat-ED5A22.svg"> | |
| </a> | |
| <a href="https://huggingface.co/datasets/nvidia/AF-Think"> | |
| <img src="https://img.shields.io/badge/π€-Dataset:_AF--Think-ED5A22.svg"> | |
| </a> | |
| </div> | |
| """) | |

        # gr.Markdown("#### NVIDIA (2025)")
        with gr.Tabs():
            # ---------------- MULTI-TURN CHAT ----------------
            with gr.Tab("💬 Multi-Turn Chat"):
                chatbot = gr.Chatbot(label="Audio Chatbot")
                audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
                user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
                btn_multi = gr.Button("Send")
                history_state = gr.State([])          # Chat history as (user, bot) tuples
                current_audio_state = gr.State(None)  # Most recent audio file path

                btn_multi.click(
                    fn=multi_turn_chat,
                    inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
                    outputs=[chatbot, history_state, current_audio_state]
                )

                gr.Examples(
                    examples=[
                        ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
                        ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
                    ],
                    inputs=[audio_input_multi, user_input_multi],
                    label="🧪 Try Examples"
                )
| with gr.Tab("π£οΈ Speech Prompt"): | |
| gr.Markdown("Use your **voice** to talk to the model.") | |
| with gr.Row(): | |
| with gr.Column(): | |
| speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio") | |
| btn_speech = gr.Button("Submit") | |
| gr.Examples( | |
| examples=[ | |
| ["static/voice/voice_0.mp3"], | |
| ["static/voice/voice_1.mp3"], | |
| ["static/voice/voice_2.mp3"], | |
| ], | |
| inputs=speech_input, | |
| label="π§ͺ Try Examples" | |
| ) | |
| with gr.Column(): | |
| response_box = gr.Textbox(label="Model Response", lines=15) | |
| btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box) | |

            # ---------------- ABOUT ----------------
            with gr.Tab("📖 About"):
                gr.Markdown("""
### 📖 Overview

**Audio Flamingo 3** is a fully open state-of-the-art (SOTA) large audio-language model that advances reasoning and understanding across speech, sound, and music. AF3 introduces:

(i) AF-Whisper, a unified audio encoder trained using a novel strategy for joint representation learning across all three modalities of speech, sound, and music;
(ii) flexible, on-demand thinking, allowing the model to do chain-of-thought reasoning before answering;
(iii) multi-turn, multi-audio chat;
(iv) long audio understanding and reasoning (including speech) up to 10 minutes; and
(v) voice-to-voice interaction.

To enable these capabilities, we propose several large-scale training datasets curated using novel strategies, including AudioSkills-XL, LongAudio-XL, AF-Think, and AF-Chat, and train AF3 with a novel five-stage curriculum-based training strategy. Trained only on open-source audio data, AF3 achieves new SOTA results on over 20 (long) audio understanding and reasoning benchmarks, surpassing both open-weight and closed-source models trained on much larger datasets.

**Key Features:**

💡 Audio Flamingo 3 has strong audio, music, and speech understanding capabilities.

💡 Audio Flamingo 3 supports on-demand thinking for chain-of-thought reasoning.

💡 Audio Flamingo 3 supports long audio and speech understanding for inputs up to 10 minutes.

💡 Audio Flamingo 3 can hold multi-turn, multi-audio conversations with users in complex contexts.

💡 Audio Flamingo 3 has voice-to-voice conversation abilities.
""")

                gr.Markdown("© 2025 NVIDIA | Built with ❤️ using Gradio + PyTorch")

# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    demo.launch(share=True, mcp_server=True)
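
# Note (added): `mcp_server=True` assumes a Gradio release with MCP support
# (the `gradio[mcp]` extra); on older versions, drop the flag. `share=True`
# is ignored with a warning when the app runs on Hugging Face Spaces.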