import os
import torch
import gradio as gr
import numpy as np
from datasets import Audio, load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
from speechbrain.pretrained import EncoderClassifier
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
# load text-to-speech checkpoint and speaker embeddings
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
model = SpeechT5ForTextToSpeech.from_pretrained(
    "JanLilan/speecht5_finetuned_openslr-slr69-cat"
).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
######################################################################################
################################## SPEAKER EMBEDDING #################################
######################################################################################
# Build the speaker embedding from the Catalan dataset itself and use that voice for synthesis.
dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
# Load the SpeechBrain x-vector speaker encoder.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
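
# Sanity-check sketch (assumes a 1-D float32 waveform at 16 kHz): the
# normalized x-vector should come back as a 512-dimensional numpy vector.
# _embedding = create_speaker_embedding(np.zeros(16000, dtype=np.float32))
# assert _embedding.shape == (512,)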
# We need a single speaker embedding for inference; the SpeechT5 processor
# loaded above (same "microsoft/speecht5_tts" checkpoint) is reused below.
# Prepare one example: tokenize the text, build the target features, and
# attach an x-vector speaker embedding.
def prepare_dataset(example):
    audio = example["audio"]
    example = processor(
        text=example["transcription"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )
    # strip off the batch dimension
    example["labels"] = example["labels"][0]
    # use SpeechBrain to obtain an x-vector
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
    return example
processed_example = prepare_dataset(dataset[0])
speaker_embeddings = torch.tensor(processed_example["speaker_embeddings"]).unsqueeze(0)
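# After unsqueeze(0) the embedding has shape (1, 512): x-vectors from
# spkrec-xvect-voxceleb are 512-dimensional, and generate_speech expects a
# batch dimension.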
# Alternative: a pre-computed x-vector from the CMU Arctic embeddings dataset.
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
def translate(audio):
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "catalan"})
    return outputs["text"]
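
# Note: Whisper's built-in "translate" task only targets English. Forcing
# task="transcribe" with language="catalan" makes the model emit Catalan text
# even for non-Catalan input, which is what gives us the any-to-Catalan step.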
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
    return speech.cpu()
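
# Standalone usage sketch (assumes the soundfile package is installed):
# import soundfile as sf
# sf.write("sample.wav", synthesise("Bon dia!").numpy(), samplerate=16000)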
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # scale the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech
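
# Local smoke test sketch, using the example file bundled with this Space:
# rate, wav = speech_to_speech_translation("./example.wav")
# print(rate, wav.dtype, wav.shape)  # 16000, int16, (num_samples,)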
title = "STST Demo - Multilingual to Catalan Speech"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Catalan. The demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model to render the source speech as Catalan text, and Microsoft's
[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model, fine-tuned on [projecte-aina/openslr-slr69-ca-trimmed-denoised](https://huggingface.co/datasets/projecte-aina/openslr-slr69-ca-trimmed-denoised), for text-to-speech. The demo could be improved by switching to the [projecte-aina/tts-ca-coqui-vits-multispeaker](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker) model:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
demo = gr.Blocks()
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
demo.launch()