Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -635,20 +635,25 @@ def submit_caption(image_input, state, generated_caption, text_refiner, visual_c
|
|
| 635 |
state = state + [(None, f"{focus_info}")]
|
| 636 |
print("new_cap",focus_info)
|
| 637 |
|
| 638 |
-
refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
|
| 639 |
-
|
| 640 |
try:
|
| 641 |
-
waveform_visual, audio_output = tts.predict(focus_info
|
| 642 |
-
return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
|
|
|
|
|
|
| 643 |
except Exception as e:
|
| 644 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
| 645 |
print(f"Error during TTS prediction: {str(e)}")
|
| 646 |
-
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
|
|
|
|
| 647 |
|
| 648 |
else:
|
| 649 |
try:
|
| 650 |
waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
|
| 651 |
-
return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
|
|
|
|
|
|
| 652 |
except Exception as e:
|
| 653 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
| 654 |
print(f"Error during TTS prediction: {str(e)}")
|
|
|
|
| 635 |
state = state + [(None, f"{focus_info}")]
|
| 636 |
print("new_cap",focus_info)
|
| 637 |
|
| 638 |
+
# refined_image_input = create_bubble_frame(np.array(origin_image_input), focus_info, click_index, input_mask,
|
| 639 |
+
# input_points=input_points, input_labels=input_labels)
|
| 640 |
try:
|
| 641 |
+
waveform_visual, audio_output = tts.predict(focus_info, input_language, input_audio, input_mic, use_mic, agree)
|
| 642 |
+
# return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
| 643 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
| 644 |
+
|
| 645 |
except Exception as e:
|
| 646 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
| 647 |
print(f"Error during TTS prediction: {str(e)}")
|
| 648 |
+
# return state, state, refined_image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, None, None
|
| 649 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
| 650 |
|
| 651 |
else:
|
| 652 |
try:
|
| 653 |
waveform_visual, audio_output = tts.predict(generated_caption, input_language, input_audio, input_mic, use_mic, agree)
|
| 654 |
+
# return state, state, image_input, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
| 655 |
+
return state, state, click_index_state, input_mask_state, input_points_state, input_labels_state, out_state, waveform_visual, audio_output
|
| 656 |
+
|
| 657 |
except Exception as e:
|
| 658 |
state = state + [(None, f"Error during TTS prediction: {str(e)}")]
|
| 659 |
print(f"Error during TTS prediction: {str(e)}")
|