Spaces:

akhaliq
/

Real-Time-Voice-Cloning

Runtime error

App Files Files Community

Ahsen Khaliq committed on Sep 30, 2021

Commit

7f7f412

1 Parent(s): 5af8374

Update demo_cli.py

Browse files

Files changed (1) hide show

demo_cli.py +12 -18

demo_cli.py CHANGED Viewed

@@ -20,26 +20,21 @@ if __name__ == '__main__':
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument("-e", "--enc_model_fpath", type=Path,
-                        default="encoder/saved_models/pretrained.pt",
                         help="Path to a saved encoder")
     parser.add_argument("-s", "--syn_model_fpath", type=Path,
-                        default="synthesizer/saved_models/pretrained/pretrained.pt",
                         help="Path to a saved synthesizer")
     parser.add_argument("-v", "--voc_model_fpath", type=Path,
-                        default="vocoder/saved_models/pretrained/pretrained.pt",
                         help="Path to a saved vocoder")
-    parser.add_argument("--cpu", action="store_true", help=\
-        "If True, processing is done on CPU, even when a GPU is available.")
-    parser.add_argument("--no_sound", action="store_true", help=\
-        "If True, audio won't be played.")
-    parser.add_argument("--seed", type=int, default=None, help=\
-        "Optional random number seed value to make toolbox deterministic.")
-    parser.add_argument("--no_mp3_support", action="store_true", help=\
-        "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
     parser.add_argument("-audio", "--audio_path", type=Path, required = True,
                         help="Path to a audio file")
-    parser.add_argument("--text", type=str, required = True, help=\
-        "Text Input")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
@@ -95,7 +90,7 @@ if __name__ == '__main__':
     # The sampling rate is the number of values (samples) recorded per second, it is set to
     # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
     # to an audio of 1 second.
-    print("\tTesting the encoder...")
     encoder.embed_utterance(np.zeros(encoder.sampling_rate))
     # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
@@ -109,7 +104,7 @@ if __name__ == '__main__':
     # illustrate that
     embeds = [embed, np.zeros(speaker_embedding_size)]
     texts = ["test 1", "test 2"]
-    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
     mels = synthesizer.synthesize_spectrograms(texts, embeds)
     # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
@@ -118,7 +113,7 @@ if __name__ == '__main__':
     # The vocoder can take a callback function to display the generation. More on that later. For
     # now we'll simply hide it like this:
     no_action = lambda *args: None
-    print("\tTesting the vocoder...")
     # For the sake of making this test short, we'll pass a short target length. The target length
     # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
     # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
@@ -139,8 +134,7 @@ if __name__ == '__main__':
     # while True:
     try:
         # Get the reference audio filepath
-        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
-                    "wav, m4a, flac, ...):\n"
         in_fpath = args.audio_path
         if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:

         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument("-e", "--enc_model_fpath", type=Path,
+                        default="encpretrained.pt",
                         help="Path to a saved encoder")
     parser.add_argument("-s", "--syn_model_fpath", type=Path,
+                        default="synpretrained.pt",
                         help="Path to a saved synthesizer")
     parser.add_argument("-v", "--voc_model_fpath", type=Path,
+                        default="vocpretrained.pt",
                         help="Path to a saved vocoder")
+    parser.add_argument("--cpu", action="store_true", help=\
+        "If True, processing is done on CPU, even when a GPU is available.")
+    parser.add_argument("--no_sound", action="store_true", help=\
+        "If True, audio won't be played.")
+    parser.add_argument("--seed", type=int, default=None, help=\
+        "Optional random number seed value to make toolbox deterministic.")
+    parser.add_argument("--no_mp3_support", action="store_true", help=\
+        "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.")
     parser.add_argument("-audio", "--audio_path", type=Path, required = True,
                         help="Path to a audio file")
+    parser.add_argument("--text", type=str, required = True, help=\
+        "Text Input")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
     # The sampling rate is the number of values (samples) recorded per second, it is set to
     # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
     # to an audio of 1 second.
+    print("\tTesting the encoder...")
     encoder.embed_utterance(np.zeros(encoder.sampling_rate))
     # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
     # illustrate that
     embeds = [embed, np.zeros(speaker_embedding_size)]
     texts = ["test 1", "test 2"]
+    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
     mels = synthesizer.synthesize_spectrograms(texts, embeds)
     # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
     # The vocoder can take a callback function to display the generation. More on that later. For
     # now we'll simply hide it like this:
     no_action = lambda *args: None
+    print("\tTesting the vocoder...")
     # For the sake of making this test short, we'll pass a short target length. The target length
     # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
     # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
     # while True:
     try:
         # Get the reference audio filepath
+        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
+                    "wav, m4a, flac, ...):\n"
         in_fpath = args.audio_path
         if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support: