import IPython
from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import wave
import espeakng
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os
import numpy as np


def synth_mms(text: str, model: str):
    '''
    Use the Hugging Face inference pipeline to synthesize text.
    (Can be replaced by the Inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Numpy waveform array and sampling rate.
    '''
    if model is not None:
        # Run the MMS checkpoint through the text-to-speech pipeline on CPU (device=-1).
        pipe = pipeline("text-to-speech", model=model, device=-1)
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None
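
# Illustrative usage of synth_mms (a sketch, not part of the module itself): the
# checkpoint name "facebook/mms-tts-eng" is an assumption for an English MMS voice;
# any checkpoint of the form mms-tts-LAN should behave the same way. The pipeline's
# audio array may carry a leading batch dimension, hence the squeeze() before writing.
#
#   result = synth_mms("Hello, world.", "facebook/mms-tts-eng")
#   if result is not None:
#       audio, rate = result
#       wavfile.write("mms_example.wav", rate, audio.squeeze())
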
def synth_coqui(text: str, model: str):
    '''
    Use the Coqui TTS library to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Numpy waveform array and sampling rate.

    IMPORTANT: The current implementation assumes a 22050 Hz sampling rate;
    this should be verified when adding a new model.
    '''
    if model is not None:
        # Use the GPU when available, otherwise fall back to CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = TTS(model, progress_bar=False).to(device)
        wav = tts.tts(text=text)
        return np.array(wav), 22050
    else:
        return None
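
# Illustrative usage of synth_coqui (a sketch): the model id
# "tts_models/en/ljspeech/vits" is an assumption taken from the Coqui model
# catalogue; substitute whatever model code the caller actually passes in. Keep in
# mind that the hard-coded 22050 Hz only matches models trained at that rate.
#
#   result = synth_coqui("Hello, world.", "tts_models/en/ljspeech/vits")
#   if result is not None:
#       audio, rate = result
#       wavfile.write("coqui_example.wav", rate, audio.astype(np.float32))
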
def synth_espeakng(text: str, model: str):
    '''
    Use eSpeak NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Voice/language code
    Returns:
        Numpy waveform array and sampling rate.
    '''
    if model is not None:
        # Write the synthesized audio to a temporary WAV file, read it back,
        # then clean up. The -w flag and its filename must be separate arguments.
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text])
        sampling_rate, wav = wavfile.read('test.wav')
        os.remove('test.wav')
        return wav, sampling_rate
    else:
        return None
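
# Illustrative usage of synth_espeakng (a sketch): "en" is a standard eSpeak NG
# voice code; run `espeak-ng --voices` to list the codes available locally. The
# espeak-ng binary must be on PATH for this to work.
#
#   result = synth_espeakng("Hello, world.", "en")
#   if result is not None:
#       audio, rate = result
#       wavfile.write("espeak_example.wav", rate, audio)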