Spaces:

TeamTonic
/

SeamlessOnDevice

Running

File size: 1,724 Bytes

3eefcff
 
 
 
678c468
 
 
 
 
 
 
 
3eefcff
678c468
 
3eefcff
 
678c468
3eefcff
 
678c468
 
3eefcff
 
678c468
3eefcff
 
 
 
 
 
 
678c468
 
 
 
3eefcff
 
 
 
 
 
678c468
 
 
 
3eefcff
 
 
 
8fa6df8

import functools
import tempfile

import gradio as gr
import torch
import torchaudio

# Target languages offered in the UI: the key is the name shown in the
# dropdown, the value is the three-letter code handed to the models as
# `tgt_lang`.
languages = dict(
    English="eng",
    Hindi="hin",
    Portuguese="por",
    Russian="rus",
    Spanish="spa",
)

# Sample rate the input is resampled to before inference.
# NOTE(review): 16 kHz is what the upstream SeamlessM4T docs ask for; confirm
# if the bundled model file is ever replaced.
_S2T_INPUT_SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=1)
def _load_s2t_model():
    """Load the speech-to-text TorchScript model once and reuse it afterwards."""
    return torch.jit.load("unity_on_device_s2t.ptl")


def speech_to_text(audio_data, tgt_lang):
    """Run the speech-to-text model on an audio file.

    Args:
        audio_data: Audio source accepted by ``torchaudio.load`` (e.g. a file
            path).
        tgt_lang: Display name of the target language; must be a key of
            ``languages``.

    Returns:
        Whatever the model returns for the given audio and target language
        (the text output).

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``languages``.
    """
    audio_input, sample_rate = torchaudio.load(audio_data)
    # Uploads and microphone recordings are rarely 16 kHz already; feeding the
    # raw waveform would present the model with audio at the wrong rate.
    if sample_rate != _S2T_INPUT_SAMPLE_RATE:
        audio_input = torchaudio.functional.resample(
            audio_input, sample_rate, _S2T_INPUT_SAMPLE_RATE
        )
    # The model is cached so it is not re-read from disk on every request.
    s2t_model = _load_s2t_model()
    with torch.no_grad():
        text = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
    return text

# Sample rate the input is resampled to before inference.
# NOTE(review): 16 kHz is what the upstream SeamlessM4T docs ask for; confirm
# if the bundled model file is ever replaced.
_S2ST_INPUT_SAMPLE_RATE = 16000
# Sample rate used when writing the generated waveform to disk.
_S2ST_OUTPUT_SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=1)
def _load_s2st_model():
    """Load the speech-to-speech TorchScript model once and reuse it afterwards."""
    return torch.jit.load("unity_on_device.ptl")


def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate an audio file and write the generated speech to a WAV file.

    Args:
        audio_data: Audio source accepted by ``torchaudio.load`` (e.g. a file
            path).
        tgt_lang: Display name of the target language; must be a key of
            ``languages``.

    Returns:
        A ``(text, output_file)`` tuple: the model's text output and the path
        of the WAV file holding the generated waveform.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``languages``.
    """
    audio_input, sample_rate = torchaudio.load(audio_data)
    # Uploads and microphone recordings are rarely 16 kHz already; feeding the
    # raw waveform would present the model with audio at the wrong rate.
    if sample_rate != _S2ST_INPUT_SAMPLE_RATE:
        audio_input = torchaudio.functional.resample(
            audio_input, sample_rate, _S2ST_INPUT_SAMPLE_RATE
        )
    # The model is cached so it is not re-read from disk on every request.
    s2st_model = _load_s2st_model()
    with torch.no_grad():
        text, _units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])
    # A unique file per call: a single fixed path would let concurrent requests
    # overwrite each other's result. delete=False because the returned path
    # must still exist after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    torchaudio.save(
        output_file, waveform.unsqueeze(0), sample_rate=_S2ST_OUTPUT_SAMPLE_RATE
    )
    return text, output_file

# Gradio interfaces
# Speech-to-text tab: one audio input plus a target-language dropdown, text out.
iface_s2t = gr.Interface(
    fn=speech_to_text,
    inputs=[
        # Top-level components replace the deprecated `gr.inputs` namespace.
        # type="filepath" passes the handler a path on disk, which is what
        # torchaudio.load() expects; the default ("numpy") would pass a
        # (sample_rate, ndarray) tuple instead.
        gr.Audio(type="filepath", label="Upload or Record Audio for Speech to Text"),
        gr.Dropdown(list(languages), label="Select Target Language"),
    ],
    outputs="text",
    title="Speech to Text",
)

# Speech-to-speech tab: one audio input plus a target-language dropdown;
# outputs the text and the generated audio file.
iface_s2st = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        # Top-level components replace the deprecated `gr.inputs` namespace.
        # type="filepath" passes the handler a path on disk, which is what
        # torchaudio.load() expects; the default ("numpy") would pass a
        # (sample_rate, ndarray) tuple instead.
        gr.Audio(
            type="filepath",
            label="Upload or Record Audio for Speech to Speech Translation",
        ),
        gr.Dropdown(list(languages), label="Select Target Language"),
    ],
    outputs=["text", "audio"],
    title="Speech to Speech Translation",
)

# Combine both interfaces into a single app with one tab per task.
# gr.Accordion is a layout container meant for use inside gr.Blocks: it cannot
# wrap Interface objects and has no launch() method. gr.TabbedInterface is the
# Gradio class for presenting several Interfaces together. The variable keeps
# its original name so anything importing `accordion` continues to work.
accordion = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    tab_names=["Speech to Text", "Speech to Speech Translation"],
)

# Launch the application
accordion.launch()