speech-to-speech-translation-audiocourse-unit7

Sleeping

EwoutLagendijk committed on Dec 26, 2024

Commit

b1962e4

verified ·

1 Parent(s): de757ed

Update app.py

Added translation to French

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,22 +8,31 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-    return outputs["text"]
 def synthesise(text):

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# load speech recognition checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+# load translation model for translating transcribed text to French
+translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
+translation_model = MarianMTModel.from_pretrained(translation_model_name).to(device)
+translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def translate(audio):
+    # Transcribe speech to text (Whisper ASR)
+    transcription = asr_pipe(audio)["text"]
+    # Translate the transcribed text from English to French
+    translated = translation_model.generate(**translation_tokenizer(transcription, return_tensors="pt", padding=True).to(device))
+    translated_text = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
+    return translated_text
 def synthesise(text):