import torch
from transformers import pipeline

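# Speech recognition stage: Whisper base, run on GPU when one is available.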
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# %%

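# Whisper's built-in "translate" task only targets English; forcing
# task="transcribe" with language="nl" makes it emit Dutch text for the
# input speech, which serves as the translation step here.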
def translate(audio):
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "nl"},
    )
    return outputs["text"]

# %%

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

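# Text-to-speech stage: a SpeechT5 checkpoint fine-tuned for Dutch, paired
# with the standard HiFi-GAN vocoder.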
processor = SpeechT5Processor.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")

model = SpeechT5ForTextToSpeech.from_pretrained("sanchit-gandhi/speecht5_tts_vox_nl")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# %%

model.to(device)
vocoder.to(device)

# %%

from datasets import load_dataset, Audio

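# SpeechT5 conditions its voice on a speaker embedding; use an x-vector
# from the CMU ARCTIC dataset.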
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# %%

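# Convert text to a 16 kHz waveform with SpeechT5, returning the tensor on CPU.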
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()

# %%

import numpy as np

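# Scale the float waveform from [-1, 1] to int16 PCM for Gradio's audio output.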
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


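# Full cascaded pipeline: input speech -> Dutch text (Whisper) -> Dutch speech
# (SpeechT5), returned as a (sample_rate, waveform) tuple for Gradio.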
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * max_range).astype(target_dtype)
    return 16000, synthesised_speech


# %%

import gradio as gr

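# Demo: two tabs sharing the same pipeline, one fed from the microphone and
# one from an uploaded file. The `source=` keyword targets the Gradio 3.x API;
# newer Gradio releases use `sources=[...]` instead.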
demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()