Space: Running on T4

Commit: try to figure out how ZeroGPU works

Files changed:
- Architectures/ControllabilityGAN/wgan/wgan_qc.py (+4 -4)
- app.py (+11 -16)
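For context: on ZeroGPU Spaces the GPU is not attached to the process at startup; it is allocated on demand, only while a function decorated with `@spaces.GPU` is running. Both diffs below read as an attempt to adapt to that model: load everything on CPU at import time, and make the GAN wrapper a `torch.nn.Module` so it can be moved to the GPU later in one call.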
Architectures/ControllabilityGAN/wgan/wgan_qc.py CHANGED

@@ -11,12 +11,13 @@ from cvxopt import sparse
 from cvxopt import spmatrix
 from torch.autograd import grad as torch_grad
 from tqdm import tqdm
+import spaces
 
 
-class WassersteinGanQuadraticCost:
+class WassersteinGanQuadraticCost(torch.nn.Module):
 
-    def __init__(self, generator, discriminator, gen_optimizer, dis_optimizer, criterion, epochs, n_max_iterations,
-                 data_dimensions, batch_size, device, gamma=0.1, K=-1, milestones=[150000, 250000], lr_anneal=1.0):
+    def __init__(self, generator, discriminator, gen_optimizer, dis_optimizer, criterion, epochs, n_max_iterations, data_dimensions, batch_size, device, gamma=0.1, K=-1, milestones=[150000, 250000], lr_anneal=1.0, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.G = generator
         self.G_opt = gen_optimizer
         self.D = discriminator
@@ -242,7 +243,6 @@ class WassersteinGanQuadraticCost:
         else:
             latent_samples = self.G.sample_latent(num_samples, self.G.z_dim)
             latent_samples = latent_samples.to(self.device)
-        print(self.device)
         if nograd:
             with torch.no_grad():
                 generated_data = self.G(latent_samples, return_intermediate=return_intermediate)
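Making the wrapper a `torch.nn.Module` is presumably what enables that move: attributes like `self.G` and `self.D` become registered submodules, so a single `.to(device)` relocates the whole GAN between CPU and an on-demand GPU. A toy sketch of the effect (the `Wrapper` class and `Linear` layers are stand-ins, not the project's code):

import torch

class Wrapper(torch.nn.Module):
    # Stand-in for WassersteinGanQuadraticCost after this commit: assigning
    # modules to attributes of an nn.Module registers them as submodules.
    def __init__(self):
        super().__init__()
        self.G = torch.nn.Linear(8, 8)  # generator stand-in
        self.D = torch.nn.Linear(8, 1)  # discriminator stand-in

wrapper = Wrapper()
device = "cuda" if torch.cuda.is_available() else "cpu"
wrapper.to(device)  # moves wrapper.G and wrapper.D together; a plain class would need manual moves
print(next(wrapper.G.parameters()).device)  # cuda:0 when a GPU is attached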
app.py CHANGED

@@ -21,10 +21,10 @@ from Utility.storage_config import MODELS_DIR
 
 class ControllableInterface(torch.nn.Module):
 
-    def __init__(self, gan_wrapper, available_artificial_voices=1000):
+    def __init__(self, available_artificial_voices=1000):
         super().__init__()
         self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
-        self.wgan = gan_wrapper
+        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
         self.generated_speaker_embeds = list()
         self.available_artificial_voices = available_artificial_voices
         self.current_language = ""
@@ -117,9 +117,6 @@ class ControllableInterface(torch.nn.Module):
                              loudness_in_db=loudness_in_db)
         return sr, wav, fig
 
-@spaces.GPU
-def get_gw():
-    return GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda" if torch.cuda.is_available() else "cpu")
 
 title = "Controllable Text-to-Speech for over 7000 Languages"
 article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
@@ -127,8 +124,7 @@ available_artificial_voices = 1000
 path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
 iso_to_name = load_json_from_path(path_to_iso_list)
 text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
-gw = get_gw()
-controllable_ui = ControllableInterface(gan_wrapper=gw, available_artificial_voices=available_artificial_voices)
+controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
 
 
 def read(prompt,
@@ -157,7 +153,6 @@ def read(prompt,
                                                 -24.)
     return (sr, float2pcm(wav)), fig
 
-
 iface = gr.Interface(fn=read,
                      inputs=[gr.Textbox(lines=2,
                                         placeholder="write what you want the synthesis to read here...",
@@ -169,15 +164,15 @@ iface = gr.Interface(fn=read,
                              label="Select the Language of the Text (type on your keyboard to find it quickly)"),
                      gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                value=279,
-                               …
-                     …
-                     …
-                     …
-                     …
-                     …
-                     …
+                               label="Random Seed for the artificial Voice"),
+                     gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
+                     gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
+                     gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
+                     gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
+                     gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
+                     ],
                      outputs=[gr.Audio(type="numpy", label="Speech"),
-                              …
+                              gr.Image(label="Visualization")],
                      title=title,
                      theme="default",
                      allow_flagging="never",
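The app-side change goes with it: the `@spaces.GPU`-decorated `get_gw()` loader is gone and `GanWrapper` is now constructed directly on CPU inside `ControllableInterface`, which matches the documented ZeroGPU pattern of initializing models on CPU at startup and requesting the GPU only inside the calls that need it. A minimal sketch of that pattern, assuming a standard ZeroGPU Space (`infer` and the `Linear` model are hypothetical; `spaces.GPU` is the real decorator):

import spaces
import torch

model = torch.nn.Linear(4, 4)  # hypothetical model, loaded on CPU at startup

@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of this call
def infer(x: torch.Tensor) -> torch.Tensor:
    model.to("cuda")  # CUDA is usable inside the decorated function
    return model(x.to("cuda")).cpu()  # move the result back before returning

Keeping initialization on CPU also means startup never depends on GPU availability, which the removed startup-time loader could not guarantee.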
|