Spaces:
Runtime error
Runtime error
File size: 2,915 Bytes
d347764 275ebc6 d347764 7f18a5d f2eb61e 99623ea d347764 7219472 c88b4e1 99623ea fd6ca3f c88b4e1 fd6ca3f c88b4e1 d347764 c88b4e1 d347764 593ca04 ff2df5d 593ca04 99623ea c88b4e1 d347764 9e55989 c88b4e1 f2eb61e d347764 b2319dd c88b4e1 d347764 f805e49 b5787bb f2eb61e f805e49 c737803 d347764 593ca04 d347764 f805e49 d347764 c737803 593ca04 c737803 3946ba6 c737803 d347764 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
import librosa
from transformers import pipeline
from transformers import BarkModel, BarkProcessor
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Speech-translation model and its processor (feature extractor + tokenizer).
asr_model = Speech2TextForConditionalGeneration.from_pretrained(
    "facebook/s2t-medium-mustc-multilingual-st"
).to(device)
asr_processor = Speech2TextProcessor.from_pretrained(
    "facebook/s2t-medium-mustc-multilingual-st"
)

# Text-to-speech model and its processor.
bark_model = BarkModel.from_pretrained("suno/bark-small").to(device)
bark_processor = BarkProcessor.from_pretrained("suno/bark-small")
def translate(audio):
    """Translate a speech recording into Italian text.

    Args:
        audio: A ``(sample_rate, samples)`` pair, where ``samples`` is a
            NumPy array of audio samples.

    Returns:
        The list of decoded strings produced by
        ``asr_processor.batch_decode`` (one entry per generated sequence).

    Raises:
        ValueError: If the recording contains no samples.
    """
    sr, y = audio
    y = y.astype(np.float32)

    if y.size == 0:
        raise ValueError("Received an empty audio recording.")

    # The feature extractor expects a 1-D waveform. Downmix multi-channel
    # input; assumes channels are on the last axis, i.e. (samples, channels)
    # as Gradio delivers them -- confirm if audio comes from another source.
    if y.ndim > 1:
        y = y.mean(axis=-1)

    # Peak-normalise, but leave an all-zero (silent) signal untouched:
    # dividing by a zero peak would fill the waveform with NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # The processor is called with sampling_rate=16000, so resample to match.
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)

    inputs = asr_processor(y, sampling_rate=16000, return_tensors="pt")

    # The model lives on `device`; its inputs must be on the same device.
    input_features = inputs["input_features"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    generated_ids = asr_model.generate(
        input_features,
        attention_mask=attention_mask,
        # Force Italian as the target language of the translation.
        forced_bos_token_id=asr_processor.tokenizer.lang_code_to_id['it'],
    )
    translation = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return translation
def synthesise(text):
    """Generate speech for ``text`` with Bark using the Italian voice preset.

    Args:
        text: The text to speak; passed unchanged to ``bark_processor``.

    Returns:
        The generated waveform as a NumPy array with singleton dimensions
        removed.
    """
    inputs = bark_processor(text=text, voice_preset="v2/it_speaker_4", return_tensors="pt")

    # The model lives on `device`; move every processor output there as well.
    # Done per entry so that the nested voice-preset batch ("history_prompt")
    # is moved too, not only the top-level tensors.
    inputs = {
        key: value.to(device) if hasattr(value, "to") else value
        for key, value in inputs.items()
    }

    speech = bark_model.generate(**inputs, do_sample=True)
    return speech.cpu().numpy().squeeze()
def speech_to_speech_translation(audio):
    """Translate a recording and speak the translation.

    Args:
        audio: A ``(sample_rate, samples)`` pair, forwarded to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` pair where ``samples`` is the
        synthesised speech as 16-bit integers.
    """
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)

    # Clip to [-1, 1] before scaling so out-of-range samples saturate
    # instead of wrapping around in the int16 cast.
    synthesised_speech = np.clip(synthesised_speech, -1.0, 1.0)
    synthesised_speech = (synthesised_speech * 32767).astype(np.int16)

    # Report the rate Bark actually generates at rather than a hard-coded
    # 16 kHz; a mismatched rate makes playback slow and low-pitched.
    return bark_model.generation_config.sample_rate, synthesised_speech
# Heading passed as `title` to each Gradio interface.
title = "Cascaded STST"

# Markdown blurb passed as `description` to each Gradio interface.
# NOTE(review): the source language is stated as English because the MuST-C
# speech-translation checkpoint used by this app translates from English
# (per its model card) -- confirm if the checkpoint is ever swapped.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in English to target speech in Italian. Demo uses Meta's [Speech2Text](https://huggingface.co/facebook/s2t-medium-mustc-multilingual-st) model for speech translation, and Suno's
[Bark](https://huggingface.co/suno/bark-small) model for text-to-speech.
"""
# Arguments that are identical for both tabs.
_shared_interface_kwargs = {
    "fn": speech_to_speech_translation,
    "title": title,
    "description": description,
}

demo = gr.Blocks()

# Tab 1: record from the microphone.
mic_translate = gr.Interface(
    inputs=gr.Audio(source="microphone"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    **_shared_interface_kwargs,
)

# Tab 2: upload an audio file; ships with one bundled example clip.
file_translate = gr.Interface(
    inputs=gr.Audio(source="upload"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    **_shared_interface_kwargs,
)

# Present both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()
|