File size: 2,506 Bytes
fe62fb4
bdb4f02
 
 
 
 
dd7320e
 
 
bdb4f02
 
dafcadc
bdb4f02
dafcadc
bdb4f02
dafcadc
9146509
dafcadc
 
 
9146509
bdb4f02
 
dafcadc
 
bdb4f02
 
dafcadc
bdb4f02
 
 
 
 
dafcadc
bdb4f02
9146509
dafcadc
 
 
 
bdb4f02
 
 
 
dafcadc
bdb4f02
9146509
dafcadc
bdb4f02
 
 
 
 
dd7320e
 
bdb4f02
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import soundfile
import msinference


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA_google-nwu_1919',  # e.g. 'serbian', 'en_US/vctk_low#p276', 'isl'
              speed=1.4,  # forwarded to the MMS TTS fallback only
              affect=True  # False = higher clarity sound for partially sight
              ):
    '''Synthesise `text` with the requested `voice` and return the waveform.

    The result is documented by the author as a 24kHZ np.array; before it is
    returned it is rescaled in place so that its peak magnitude is ~1.

    Routing is decided purely by the form of `voice`:

      * contains 'en_US/' or 'en_UK/' -> StyleTTS2 with an English voice,
        e.g. 'en_US/vctk_low#p276'
        (voice list: https://audeering.github.io/shift/).
        `affect` picks the reference-wav folder:
        'assets/wavs/style_vector/' when True, its 'v2/' sub-folder when False.

      * otherwise contains '_' -> StyleTTS2 with a non-native English accent,
        e.g. 'af_ZA_google-nwu_1919'
        (voice list: https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6).
        The reference wav is read from 'assets/wavs/mimic3_foreign_4x/'.

      * anything else -> MMS TTS, with `voice` passed as the language,
        e.g. 'deu', 'romanian', 'serbian', 'hungarian'
        (language list: https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv).
        `speed` is only used on this route.
    '''
    english_voice = ('en_US/' in voice) or ('en_UK/' in voice)

    if not english_voice and '_' not in voice:
        # Fallback - MMS TTS - Non-English.
        # The text is handed over as a one-element list of sentences.
        x = msinference.foreign(text=[text], lang=voice, speed=speed)
    else:
        # StyleTTS2 - En: both mimic-3 style voice formats map to a reference
        # wav whose file name is derived from the voice string the same way.
        wav_stem = voice.replace('/', '_')
        wav_stem = wav_stem.replace('#', '_')
        wav_stem = wav_stem.replace('cmu-arctic', 'cmu_arctic')
        wav_stem = wav_stem.replace('_low', '')

        if english_voice:
            # English txt - English accent
            wav_dir = 'assets/wavs/style_vector/' + ('' if affect else 'v2/')
        else:
            # English text - Foreign accent
            wav_dir = 'assets/wavs/mimic3_foreign_4x/'

        style_vector = msinference.compute_style(wav_dir + wav_stem + '.wav')
        x = msinference.inference(text, style_vector)

    # volume: amplify speech to full [-1,1]; the epsilon avoids division by
    # zero when the signal is all zeros. The division is done in place.
    peak = np.abs(x).max()
    x /= peak + 1e-7
    print(x.shape, 'TTS OK')
    return x

# Demo: synthesise the default sentence with the default voice and save it.
soundfile.write(
    'demo.wav',    # output file
    tts_entry(),   # waveform produced with all default arguments
    24000,         # sample rate passed to soundfile
)