Spaces:
Running
on
A100
Running
on
A100
Update app.py
Browse files
app.py
CHANGED
@@ -18,46 +18,24 @@ generation_config_multi = model_multi.default_generation_config
|
|
18 |
# ---------------------------------
|
19 |
# MULTI-TURN INFERENCE FUNCTION
|
20 |
# ---------------------------------
|
21 |
-
|
22 |
-
# try:
|
23 |
-
# if audio_file is not None:
|
24 |
-
# current_audio = audio_file # Update state if a new file is uploaded
|
25 |
-
|
26 |
-
# if current_audio is None:
|
27 |
-
# return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
|
28 |
-
|
29 |
-
# sound = llava.Sound(current_audio)
|
30 |
-
# prompt = f"<sound>\n{user_input}"
|
31 |
-
|
32 |
-
# response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
|
33 |
-
|
34 |
-
# history.append((user_input, response))
|
35 |
-
# return history, history, current_audio
|
36 |
-
# except Exception as e:
|
37 |
-
# history.append((user_input, f"❌ Error: {str(e)}"))
|
38 |
-
# return history, history, current_audio
|
39 |
-
|
40 |
-
def multi_turn_chat(user_input, audio_file, history, audio_history):
|
41 |
try:
|
42 |
if audio_file is not None:
|
43 |
-
|
44 |
-
|
45 |
-
if not audio_history:
|
46 |
-
return history + [("System", "❌ Please upload an audio file before chatting.")], history, audio_history
|
47 |
|
48 |
-
|
49 |
-
|
50 |
|
51 |
-
|
52 |
prompt = f"<sound>\n{user_input}"
|
53 |
-
response = model_multi.generate_content(audio_sounds + [prompt], generation_config=generation_config_multi)
|
54 |
|
55 |
-
|
56 |
-
return history, history, audio_history
|
57 |
|
|
|
|
|
58 |
except Exception as e:
|
59 |
history.append((user_input, f"❌ Error: {str(e)}"))
|
60 |
-
return history, history,
|
61 |
|
62 |
|
63 |
def speech_prompt_infer(audio_prompt_file):
|
@@ -142,16 +120,13 @@ with gr.Blocks(css="""
|
|
142 |
user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
|
143 |
btn_multi = gr.Button("Send")
|
144 |
history_state = gr.State([]) # Chat history
|
145 |
-
|
146 |
-
audio_history_state = gr.State([]) # List of audio file paths
|
147 |
|
148 |
|
149 |
btn_multi.click(
|
150 |
fn=multi_turn_chat,
|
151 |
-
inputs=[user_input_multi, audio_input_multi, history_state,
|
152 |
-
outputs=[chatbot, history_state,
|
153 |
-
# inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
|
154 |
-
# outputs=[chatbot, history_state, current_audio_state]
|
155 |
)
|
156 |
gr.Examples(
|
157 |
examples=[
|
@@ -207,7 +182,7 @@ To enable these capabilities, we propose several large-scale training datasets c
|
|
207 |
|
208 |
💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
|
209 |
|
210 |
-
💡 Audio Flamingo 3 supports on-demand thinking for chain-of-
|
211 |
|
212 |
💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
|
213 |
|
|
|
18 |
# ---------------------------------
|
19 |
# MULTI-TURN INFERENCE FUNCTION
|
20 |
# ---------------------------------
|
21 |
+
def multi_turn_chat(user_input, audio_file, history, current_audio):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
try:
|
23 |
if audio_file is not None:
|
24 |
+
current_audio = audio_file # Update state if a new file is uploaded
|
|
|
|
|
|
|
25 |
|
26 |
+
if current_audio is None:
|
27 |
+
return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
|
28 |
|
29 |
+
sound = llava.Sound(current_audio)
|
30 |
prompt = f"<sound>\n{user_input}"
|
|
|
31 |
|
32 |
+
response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
|
|
|
33 |
|
34 |
+
history.append((user_input, response))
|
35 |
+
return history, history, current_audio
|
36 |
except Exception as e:
|
37 |
history.append((user_input, f"❌ Error: {str(e)}"))
|
38 |
+
return history, history, current_audio
|
39 |
|
40 |
|
41 |
def speech_prompt_infer(audio_prompt_file):
|
|
|
120 |
user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
|
121 |
btn_multi = gr.Button("Send")
|
122 |
history_state = gr.State([]) # Chat history
|
123 |
+
current_audio_state = gr.State(None) # Most recent audio file path
|
|
|
124 |
|
125 |
|
126 |
btn_multi.click(
|
127 |
fn=multi_turn_chat,
|
128 |
+
inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
|
129 |
+
outputs=[chatbot, history_state, current_audio_state]
|
|
|
|
|
130 |
)
|
131 |
gr.Examples(
|
132 |
examples=[
|
|
|
182 |
|
183 |
💡 Audio Flamingo 3 has strong audio, music and speech understanding capabilities.
|
184 |
|
185 |
+
💡 Audio Flamingo 3 supports on-demand thinking for chain-of-thought reasoning.
|
186 |
|
187 |
💡 Audio Flamingo 3 supports long audio and speech understanding for audios up to 10 minutes.
|
188 |
|