Upload app.py
app.py
CHANGED
@@ -29,11 +29,11 @@ def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
-    return 16000, synthesised_speech
+    return [translated_text, (16000, synthesised_speech)]
 
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 
 
@@ -44,7 +44,7 @@ demo = gr.Blocks()
 mic_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    outputs=['text', gr.Audio(label="Generated Speech", type="numpy")],
     title=title,
     description=description,
 )
@@ -52,7 +52,7 @@ mic_translate = gr.Interface(
 file_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    outputs=['text', gr.Audio(label="Generated Speech", type="numpy")],
     examples=[["./example.wav"]],
     title=title,
     description=description,
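The hunks above only touch the function's return value and the interfaces' outputs; the rest of app.py is not shown in this diff. Below is a minimal sketch of how the surrounding file typically fits together, assuming the standard transformers pipeline for Whisper and the SpeechT5 processor/vocoder setup. The asr_pipe and tts_model names, the cmu-arctic-xvectors speaker embedding, and the Russian generate_kwargs are illustrative assumptions, not taken from this commit.

```python
# A minimal sketch of the full app.py around the hunks above, assuming the usual
# Whisper + SpeechT5 cascade (Gradio 3.x API, as implied by gr.Audio(source=...)).
# Helper names, the speaker-embedding index, and the Russian generate_kwargs are
# illustrative assumptions, not part of this commit.
import numpy as np
import torch
import gradio as gr
from datasets import load_dataset
from transformers import (
    pipeline,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Speech translation: Whisper Base. Forcing Russian output is an assumption;
# Whisper's built-in "translate" task targets English.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Text-to-speech: SpeechT5 plus the HiFi-GAN vocoder and a fixed speaker embedding.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

max_range = np.iinfo(np.int16).max  # scale float waveform to 16-bit PCM


def translate(audio):
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "russian"},  # assumption
    )
    return outputs["text"]


def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = tts_model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
    # Two outputs now: the intermediate translation and the 16 kHz waveform,
    # matching outputs=['text', gr.Audio(...)] in both interfaces.
    return [translated_text, (16000, synthesised_speech)]


title = "Cascaded STST"
description = """..."""  # as in the diff

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=['text', gr.Audio(label="Generated Speech", type="numpy")],
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=['text', gr.Audio(label="Generated Speech", type="numpy")],
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()
```

Returning the translated text alongside the audio tuple, and adding a 'text' component to outputs, lets the UI show the intermediate translation next to the generated speech, which makes the cascaded pipeline easier to debug.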