File size: 2,058 Bytes
fe62fb4
bdb4f02
02bf1ff
 
dafcadc
c7362aa
02bf1ff
 
bdb4f02
dafcadc
02bf1ff
c7362aa
bdb4f02
 
 
dafcadc
c7362aa
02bf1ff
dafcadc
02bf1ff
bdb4f02
 
 
 
dafcadc
c7362aa
02bf1ff
bdb4f02
02bf1ff
c7362aa
02bf1ff
c7362aa
bc08da5
 
c7362aa
780c8d5
c7362aa
bdb4f02
 
780c8d5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import numpy as np
import soundfile
import msinference  # api.py / live_demo.py (unlike this demo.py) also split long-form text into sentences to avoid OOM
from audiocraft.builders import AudioGen  # has custom accelerations for long form text - needs 14 GB of cuda

def _style_wav_path(voice, folder):
    """Return the path of the reference style wav for *voice* inside *folder*."""
    # Voice ids such as 'en_US/m-ailabs_low#mary_ann' are flattened into a file
    # stem; the replacements are applied in this exact order.
    stem = voice.replace('/', '_')
    stem = stem.replace('#', '_')
    stem = stem.replace('cmu-arctic', 'cmu_arctic')
    stem = stem.replace('_low', '')
    return folder + stem + '.wav'


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann', # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape = 'birds fomig'):         # purposeful spells for AudioGen (behaves as controllable top-p)
    """Synthesize speech for *text* and optionally mix in a generated soundscape.

    Args:
        text: Text to speak; forwarded to ``msinference.inference`` or
            ``msinference.foreign``.
        voice: Voice identifier. If it contains ``'en_US/'`` or ``'en_UK/'``
            the style wav is read from ``assets/wavs/style_vector/``; otherwise,
            if it contains ``'_'``, from ``assets/wavs/mimic3_foreign_4x/``;
            otherwise it is passed as ``lang`` to ``msinference.foreign``.
        soundscape: Prompt handed to ``AudioGen.generate`` for the background
            sound. ``None`` skips background generation entirely.

    Returns:
        The peak-normalised speech waveform, mixed as ``0.6 * speech +
        0.4 * background`` when *soundscape* is not ``None``.
        NOTE(review): assumes ``msinference`` returns a float numpy array
        sampled at 16 kHz - confirm against msinference.
    """
    # Pick the folder holding the reference style wav; None means the voice is
    # handled by msinference.foreign, which takes no style vector here.
    if ('en_US/' in voice) or ('en_UK/' in voice):
        style_folder = 'assets/wavs/style_vector/'
    elif '_' in voice:
        style_folder = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_folder = None

    if style_folder is None:
        x = msinference.foreign(text=text, lang=voice)
    else:
        style_vector = msinference.compute_style(_style_wav_path(voice, style_folder))
        x = msinference.inference(text, style_vector)

    # Scale in place so the peak sits just below full scale; the 1e-7 keeps the
    # divisor non-zero for an all-zero signal.
    x /= 1.02 * np.abs(x).max() + 1e-7

    if soundscape is not None:
        sound_gen = AudioGen().to('cuda:0').eval()
        # Request slightly more audio than the speech lasts (len(x) / 16000
        # seconds plus 0.74 s) and trim it to the speech length below.
        background = sound_gen.generate(soundscape, duration=len(x)/16000 + .74,  # sound duration seconds
                                        ).detach().cpu().numpy()
        # NOTE(review): assumes `background` is indexable along its first axis
        # with at least len(x) samples - confirm against AudioGen.generate.
        x = .6 * x + .4 * background[:len(x)]
    return x

# Run the demo with its default arguments and save the result as demo.wav,
# declaring a 16000 Hz sample rate.
soundfile.write('demo.wav', tts_entry(), samplerate=16000)