File size: 2,506 Bytes
fe62fb4 bdb4f02 dd7320e bdb4f02 dafcadc bdb4f02 dafcadc bdb4f02 dafcadc 9146509 dafcadc 9146509 bdb4f02 dafcadc bdb4f02 dafcadc bdb4f02 dafcadc bdb4f02 9146509 dafcadc bdb4f02 dafcadc bdb4f02 9146509 dafcadc bdb4f02 dd7320e bdb4f02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import numpy as np
import soundfile
import msinference
def _style_wav_path(folder, voice):
    '''Return the reference-wav path inside `folder` for a mimic-3 style `voice` id.'''
    # Turn the voice id into a file stem: '/' and '#' become '_',
    # 'cmu-arctic' becomes 'cmu_arctic' and the '_low' tag is dropped.
    # The order of the replacements is significant ('/low' -> '_low' -> '').
    stem = voice.replace('/', '_')
    stem = stem.replace('#', '_')
    stem = stem.replace('cmu-arctic', 'cmu_arctic')
    stem = stem.replace('_low', '')
    return folder + stem + '.wav'


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
              speed=1.4,  # only for MMS TTS
              affect=True  # False = higher clarity sound for partially sight
              ):
    '''Synthesise `text` with the requested `voice` and return the waveform.

    returns 24kHZ np.array TTS, peak-normalised towards [-1, 1]

    text   : sentence(s) to speak
    voice  : selects the synthesis path
        'en_US/vctk_low#p276'   # from English voices -> https://audeering.github.io/shift/
      or
        'af_ZA_google-nwu_1919' # from english non-native accents -> https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
      or
        'deu'                   # foreign langs -> https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
    speed  : forwarded only to the MMS TTS fallback (msinference.foreign)
    affect : only used for 'en_US/' / 'en_UK/' voices; False reads the style
             wav from the 'v2/' sub-folder instead of the default one
    '''
    # StyleTTS2 - En
    if ('en_US/' in voice) or ('en_UK/' in voice):
        # mimic-3 format of voice (English txt - English accent)
        style_folder = 'assets/wavs/style_vector/' + ('' if affect else 'v2/')
    elif '_' in voice:
        # mimic-3 format of voice (English text - Foreign accent)
        style_folder = 'assets/wavs/mimic3_foreign_4x/'
    else:
        # Fallback - MMS TTS - Non-English (no style wav involved)
        style_folder = None

    if style_folder is None:
        # MMS TTS - list of sentences
        x = msinference.foreign(text=[text],
                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                speed=speed)  # normalisation externally
    else:
        style_vector = msinference.compute_style(
            _style_wav_path(style_folder, voice))
        x = msinference.inference(text, style_vector)

    # volume
    # NOTE(review): assumes msinference returns a float np.ndarray - confirm.
    # A zero-length waveform has no peak (np.max would raise ValueError), so
    # it is returned as-is instead of crashing.
    if np.size(x):
        x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
    print(x.shape, 'TTS OK')
    return x
# Demo: synthesise the default sentence with the default voice and save it
# to 'demo.wav' at a 24000 Hz sample rate.
soundfile.write(
    file='demo.wav',
    data=tts_entry(),
    samplerate=24000,
)
|