Spaces:

ByteDance
/

MegaTTS3

Running on Zero

App Files Files Community

ZiyueJiang committed on Apr 5

Commit

c90b394

1 Parent(s): f89f703

update gradio cached examples

Browse files

Files changed (2) hide show

.gitignore +2 -1
tts/gradio_api.py +16 -1

.gitignore CHANGED Viewed

	@@ -1 +1,2 @@
1	- checkpoints


1	+ checkpoints
2	+ official_test_case

tts/gradio_api.py CHANGED Viewed

@@ -26,7 +26,7 @@ os.system('huggingface-cli download ByteDance/MegaTTS3 --local-dir ./checkpoints
 CUDA_AVAILABLE = torch.cuda.is_available()
 infer_pipe = MegaTTS3DiTInfer(device='cuda' if CUDA_AVAILABLE else 'cpu')
-@spaces.GPU(duration=120)
 def forward_gpu(file_content, latent_file, inp_text, time_step, p_w, t_w):
     resource_context = infer_pipe.preprocess(file_content, latent_file)
     wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=time_step, p_w=p_w, t_w=t_w)
@@ -36,6 +36,14 @@ def model_worker(input_queue, output_queue, device_id):
     while True:
         task = input_queue.get()
         inp_audio_path, inp_npy_path, inp_text, infer_timestep, p_w, t_w = task
         try:
             convert_to_wav(inp_audio_path)
             wav_path = os.path.splitext(inp_audio_path)[0] + '.wav'
@@ -48,6 +56,7 @@ def model_worker(input_queue, output_queue, device_id):
             traceback.print_exc()
             print(task, str(e))
             output_queue.put(None)
 def main(inp_audio, inp_npy, inp_text, infer_timestep, p_w, t_w, processes, input_queue, output_queue):
@@ -85,6 +94,12 @@ if __name__ == '__main__':
                                         gr.Number(label="Intelligibility Weight", value=1.4),
                                         gr.Number(label="Similarity Weight", value=3.0)], outputs=[gr.Audio(label="Synthesized Audio")],
                                 title="MegaTTS3",
                                 description="Upload a speech clip as a reference for timbre, " +
                                 "upload the pre-extracted latent file, "+
                                 "input the target text, and receive the cloned voice. "+

 CUDA_AVAILABLE = torch.cuda.is_available()
 infer_pipe = MegaTTS3DiTInfer(device='cuda' if CUDA_AVAILABLE else 'cpu')
+@spaces.GPU(duration=60)
 def forward_gpu(file_content, latent_file, inp_text, time_step, p_w, t_w):
     resource_context = infer_pipe.preprocess(file_content, latent_file)
     wav_bytes = infer_pipe.forward(resource_context, inp_text, time_step=time_step, p_w=p_w, t_w=t_w)
     while True:
         task = input_queue.get()
         inp_audio_path, inp_npy_path, inp_text, infer_timestep, p_w, t_w = task
+        if inp_npy_path is None:
+            raise gr.Error("Please provide .npy file")
+        if (inp_audio_path[:-4] != inp_npy_path[:-4]):
+            raise gr.Error(".npy and .wav mismatch")
+        if len(inp_text) > 200:
+            raise gr.Error("input text is too long")
         try:
             convert_to_wav(inp_audio_path)
             wav_path = os.path.splitext(inp_audio_path)[0] + '.wav'
             traceback.print_exc()
             print(task, str(e))
             output_queue.put(None)
+            raise gr.Error("Generation failed")
 def main(inp_audio, inp_npy, inp_text, infer_timestep, p_w, t_w, processes, input_queue, output_queue):
                                         gr.Number(label="Intelligibility Weight", value=1.4),
                                         gr.Number(label="Similarity Weight", value=3.0)], outputs=[gr.Audio(label="Synthesized Audio")],
                                 title="MegaTTS3",
+                                examples=[
+                                    ['./official_test_case/范闲.wav', './official_test_case/范闲.npy', "你好呀，我是范闲。我给你读一段清泉石上流。"]
+                                    ['./official_test_case/周杰伦1.wav', './official_test_case/周杰伦1.npy', "有的时候嘛，我去台湾开演唱会的时候，会很喜欢来一碗卤肉饭的。"]
+                                    ['./official_test_case/keep_app.wav', './official_test_case/keep_app.npy', "Let do some exercise and practice more."]
+                                ],
+                                cache_examples=True,
                                 description="Upload a speech clip as a reference for timbre, " +
                                 "upload the pre-extracted latent file, "+
                                 "input the target text, and receive the cloned voice. "+