File size: 2,058 Bytes
fe62fb4 bdb4f02 02bf1ff dafcadc c7362aa 02bf1ff bdb4f02 dafcadc 02bf1ff c7362aa bdb4f02 dafcadc c7362aa 02bf1ff dafcadc 02bf1ff bdb4f02 dafcadc c7362aa 02bf1ff bdb4f02 02bf1ff c7362aa 02bf1ff c7362aa bc08da5 c7362aa 780c8d5 c7362aa bdb4f02 780c8d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import numpy as np
import soundfile
import msinference  # Note: api.py / live_demo.py (unlike this demo.py) also split long-form text into sentences to avoid OOM
from audiocraft.builders import AudioGen # has custom accelerations for long form text - needs 14 GB of cuda
def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape='birds fomig'):  # deliberately misspelled prompt for AudioGen (author's note: behaves as controllable top-p)
    """Run ``msinference`` on ``text`` and optionally blend in an AudioGen background.

    The ``voice`` string selects one of three paths:

    * it contains ``'en_US/'`` or ``'en_UK/'``: a style vector is computed from
      a wav file under ``assets/wavs/style_vector/`` and handed to
      ``msinference.inference`` together with ``text``;
    * otherwise it contains ``'_'``: same as above, but the wav file is looked
      up under ``assets/wavs/mimic3_foreign_4x/``;
    * otherwise: ``voice`` is forwarded as ``lang`` to ``msinference.foreign``.

    The wav file name is ``voice`` with ``'/'`` and ``'#'`` turned into
    ``'_'``, ``'cmu-arctic'`` turned into ``'cmu_arctic'``, ``'_low'`` removed
    and ``'.wav'`` appended.

    The signal is then divided in place by ``1.02 * peak + 1e-7``. When
    ``soundscape`` is not ``None`` a fresh ``AudioGen`` model is created on
    ``'cuda:0'``, asked for ``len(x) / 16000 + 0.74`` seconds of audio, and the
    result is ``0.6 * signal + 0.4 * background`` with the background cut to
    ``len(x)`` items.

    Args:
        text: Text passed to ``msinference``.
        voice: Voice identifier (or language name for the fallback path).
        soundscape: Prompt for ``AudioGen.generate``; ``None`` skips the
            background entirely.

    Returns:
        The scaled signal, or the signal/background mix when ``soundscape``
        is given.
        NOTE(review): presumably a 1-D NumPy waveform sampled at 16 kHz (the
        duration is computed as ``len(x) / 16000``) -- confirm against
        ``msinference``.
    """
    # Pick the directory holding the reference wav; None means "no style
    # vector, use the msinference.foreign fallback".
    if 'en_US/' in voice or 'en_UK/' in voice:
        wav_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        wav_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        wav_dir = None

    if wav_dir is None:
        x = msinference.foreign(text=text, lang=voice)
    else:
        # Substitutions are applied in this exact order.
        wav_stem = voice
        for old, new in (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', '')):
            wav_stem = wav_stem.replace(old, new)
        style_vector = msinference.compute_style(wav_dir + wav_stem + '.wav')
        x = msinference.inference(text, style_vector)

    # Scale so the peak sits just below 1; the 1e-7 avoids dividing by zero.
    # In-place on purpose: this mutates the array returned by msinference.
    peak = np.abs(x).max()
    x /= 1.02 * peak + 1e-7

    if soundscape is None:
        return x

    sound_gen = AudioGen().to('cuda:0').eval()
    # Extra .74 s presumably keeps the background at least as long as x -- confirm.
    seconds = len(x) / 16000 + .74
    generated = sound_gen.generate(soundscape, duration=seconds)
    background = generated.detach().cpu().numpy()
    return .6 * x + .4 * background[:len(x)]
# Run tts_entry() with its default arguments and write the result to
# 'demo.wav' with a sample rate of 16000.
# NOTE: this is a top-level statement, so it also runs when the module is imported.
soundfile.write('demo.wav', tts_entry(), 16000)
|