Spaces:
Running
on
T4
Running
on
T4
File size: 6,556 Bytes
68a11d5 ec11a77 185fc75 6cd09aa 62d7978 6cd09aa c69f215 6cd09aa 185fc75 6cd09aa 185fc75 6cd09aa 185fc75 6cd09aa 185fc75 6cd09aa aed09b0 6cd09aa aed09b0 6cd09aa aed09b0 6cd09aa aed09b0 6cd09aa aed09b0 6cd09aa 99b69ce 6a66802 62d7978 6cd09aa 5ad4ca1 1b1179a 6cd09aa 6a66802 9ad08b6 6a66802 62d7978 6a66802 62d7978 6a66802 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import os
from run_model_downloader import download_models
#if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
# download_models()
import gradio as gr
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
from Utility.utils import float2pcm
import os
import torch
from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Utility.storage_config import MODELS_DIR
class ControllableInterface(torch.nn.Module):
    """Couple the ToucanTTS synthesizer with the speaker-embedding GAN.

    The instance owns a ``ToucanTTSInterface`` (``self.model``) and a
    ``GanWrapper`` (``self.wgan``). ``read`` configures both from plain
    Python values and returns the synthesized audio.
    """

    # Upper bound applied to both the raw prompt and its phonemized form.
    MAX_INPUT_LENGTH = 1800

    def __init__(self, available_artificial_voices=1000, device=None):
        """Load the TTS model and the embedding GAN.

        Args:
            available_artificial_voices: Stored on the instance; not used
                by any method visible in this class.
            device: Device string handed to both models. When ``None``
                (the default) CUDA is used if it is available and the CPU
                otherwise, so the class can also be constructed on hosts
                without a GPU.
        """
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = ToucanTTSInterface(device=device, tts_model_path="Meta", language="eng")
        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=device)
        # Initialized here but not read by any method visible in this class.
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        # Last language passed to the model; lets ``read`` skip redundant switches.
        self.current_language = ""
        # Initialized here but not read or updated by ``read``.
        self.current_accent = ""

    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        """Synthesize ``prompt`` with the requested language and voice settings.

        Args:
            prompt: Text to synthesize.
            language: Language code passed to ``self.model.set_language``.
            accent: Accepted for interface compatibility; currently unused.
            voice_seed: Seed passed to ``self.wgan.set_latent``.
            duration_scaling_factor: Forwarded to the TTS model.
            pause_duration_scaling_factor: Forwarded to the TTS model.
            pitch_variance_scale: Forwarded to the TTS model.
            energy_variance_scale: Forwarded to the TTS model.
            emb_slider_1 .. emb_slider_6: The six components of the vector
                passed to ``self.wgan.modify_embed``.
            loudness_in_db: Forwarded to the TTS model.

        Returns:
            A tuple ``(sr, wav, fig)`` in that order, taken from the model's
            ``(wav, sr, fig)`` result.

        Raises:
            AssertionError: If the prompt or its phoneme string is longer
                than ``MAX_INPUT_LENGTH``.
        """
        # Reject oversized text before any model state is touched.
        if len(prompt) > self.MAX_INPUT_LENGTH:
            raise AssertionError("The input is too long!")

        if self.current_language != language:
            self.model.set_language(language)
            self.current_language = language

        # The phoneme string is produced by the model's current phonemizer,
        # so this check has to come after the language switch but can still
        # run before any embedding work.
        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > self.MAX_INPUT_LENGTH:
            raise AssertionError("The input is too long!")

        self.wgan.set_latent(voice_seed)
        controllability_vector = torch.tensor([emb_slider_1,
                                               emb_slider_2,
                                               emb_slider_3,
                                               emb_slider_4,
                                               emb_slider_5,
                                               emb_slider_6], dtype=torch.float32)
        embedding = self.wgan.modify_embed(controllability_vector)
        self.model.set_utterance_embedding(embedding=embedding)

        print("\n\n")
        print(prompt)
        print(language)
        print("\n\n")

        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
                                  return_plot_as_filepath=True,
                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig
# Static texts shown on the demo page.
title = "Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"

# Also used as the upper bound of the voice-seed slider in the UI.
available_artificial_voices = 1000

# Mapping from ISO code to language name; each dropdown entry is rendered as
# "<name> Text (<iso code>)" so the code can be parsed back out of the entry.
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{full_name} Text ({iso_code})" for iso_code, full_name in iso_to_name.items()]

# Models are loaded once at import time and shared by every request.
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
def read(prompt,
         language,
         voice_seed,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    """Gradio callback: synthesize ``prompt`` and return audio plus a plot.

    ``language`` is the dropdown entry; the code between the parentheses of
    its last space-separated token is used as both language and accent.
    Only the first two embedding sliders are exposed; the remaining four
    are fixed at 0, the pause scaling at 1 and the loudness at -24.

    Returns:
        ``((sr, pcm_wave), fig)`` where ``pcm_wave`` is ``float2pcm`` applied
        to the synthesized waveform.
    """
    last_token = language.split(" ")[-1]
    iso_code = last_token.split("(")[1].split(")")[0]

    with torch.no_grad():
        sample_rate, waveform, figure = controllable_ui.read(
            prompt=prompt,
            language=iso_code,
            accent=iso_code,
            voice_seed=voice_seed,
            duration_scaling_factor=duration_scaling_factor,
            pause_duration_scaling_factor=1.,
            pitch_variance_scale=pitch_variance_scale,
            energy_variance_scale=energy_variance_scale,
            emb_slider_1=emb1,
            emb_slider_2=emb2,
            emb_slider_3=0.,
            emb_slider_4=0.,
            emb_slider_5=0.,
            emb_slider_6=0.,
            loudness_in_db=-24.,
        )

    return (sample_rate, float2pcm(waveform)), figure
# Input widgets, in the positional order expected by ``read``.
input_components = [
    gr.Textbox(lines=2,
               placeholder="write what you want the synthesis to read here...",
               value="What I cannot create, I do not understand.",
               label="Text input"),
    gr.Dropdown(text_selection,
                type="value",
                value='English Text (eng)',
                label="Select the Language of the Text (type on your keyboard to find it quickly)"),
    gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
              value=279,
              label="Random Seed for the artificial Voice"),
    gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
    gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
    gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth"),
]

# Output widgets matching the ``((sr, wave), fig)`` pair returned by ``read``.
output_components = [
    gr.Audio(type="numpy", label="Speech"),
    gr.Image(label="Visualization"),
]

iface = gr.Interface(fn=read,
                     inputs=input_components,
                     outputs=output_components,
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)

# Start serving the demo.
iface.launch()
|