Spaces:

marioboy
/

neil-breen

Runtime error

App Files Files Community

marioboy committed on Aug 18, 2022

Commit

d2a588b

1 Parent(s): 229302e

feat: add universal approach for multiple models

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +14 -9
demo_cli.py +4 -48

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ README.md

app.py CHANGED Viewed

@@ -3,18 +3,23 @@ import os
 import shlex
 import random
-os.system("megadl https://mega.nz/folder/7d4xUIIa#TnvmAWa5Av7QGo6gAuQj7g")
 os.system("ls")
 def inference(text):
     os.system("python demo_cli.py --no_sound --cpu --text " + shlex.quote(text.strip()))
-    image_number = random.randint(2, len(os.listdir("pat_gifs/")))
-    return [f"pat_gifs/{image_number}.gif", "demo_output_1.wav"]
-title = "Pat NES Punk's Voice"
-description = "<center> Text-to-speech engine with Pat Contri's voice. </center>"
 article = "<p style='text-align: center'>Based on <a href='https://matheo.uliege.be/handle/2268.2/6801' target='_blank'>Real-Time Voice Cloning</a> | <a href='https://github.com/CorentinJ/Real-Time-Voice-Cloning' target='_blank'>Github Repo</a></p>"
 examples = [
@@ -25,7 +30,7 @@ examples = [
         "My name is Samantha Morris. I'm the editor of an internet news magazine exploring news most media shy away from."
     ],
     [
-        'I have a morning ritual that I need to share. I call it "the terminator". First I crouch down in the shower in the classic "naked terminator traveling through time" pose.'
     ],
     [
         'With my eyes closed I crouch there for a minute, visualizing either Arnold or the guy from the second movie (not the chick in the third one because that one sucked) and I start to hum the terminator theme.'
@@ -44,12 +49,12 @@ gr.Interface(
     inference,
     inputs=["text"],
     outputs=[
-        gr.Image(show_label=False, shape=(20, 20), value="pat_gifs/1.gif"),
         gr.outputs.Audio(type="file", label="Speech"),
     ],
     enable_queue=True,
-    title=title,
-    description=description,
     article=article,
     examples=examples
 ).launch()

 import shlex
 import random
+LINK = os.environ.get('link')
+ALIAS = os.environ.get('alias')
+TITLE = os.environ.get('title')
+DESCRIPTION = os.environ.get('description')
+os.system(f"megadl {LINK}")
 os.system("ls")
 def inference(text):
     os.system("python demo_cli.py --no_sound --cpu --text " + shlex.quote(text.strip()))
+    image_number = random.randint(2, len(os.listdir(f"images/{ALIAS}/")))
+    return [f"images/{ALIAS}/{image_number}.gif", "demo_output_1.wav"]
 article = "<p style='text-align: center'>Based on <a href='https://matheo.uliege.be/handle/2268.2/6801' target='_blank'>Real-Time Voice Cloning</a> | <a href='https://github.com/CorentinJ/Real-Time-Voice-Cloning' target='_blank'>Github Repo</a></p>"
 examples = [
         "My name is Samantha Morris. I'm the editor of an internet news magazine exploring news most media shy away from."
     ],
     [
+        'I have a morning ritual that I need to share. I call it - the terminator. First I crouch down in the shower in the classic naked terminator traveling through time pose.'
     ],
     [
         'With my eyes closed I crouch there for a minute, visualizing either Arnold or the guy from the second movie (not the chick in the third one because that one sucked) and I start to hum the terminator theme.'
     inference,
     inputs=["text"],
     outputs=[
+        gr.Image(show_label=False, shape=(20, 20), value=f"images/{ALIAS}/1.gif"),
         gr.outputs.Audio(type="file", label="Speech"),
     ],
     enable_queue=True,
+    title=TITLE,
+    description=DESCRIPTION,
     article=article,
     examples=examples
 ).launch()

demo_cli.py CHANGED Viewed

@@ -15,6 +15,8 @@ import os
 from audioread.exceptions import NoBackendError
 import pickle
 if __name__ == '__main__':
     ## Info & args
     parser = argparse.ArgumentParser(
@@ -78,51 +80,7 @@ if __name__ == '__main__':
     encoder.load_model(args.enc_model_fpath)
     synthesizer = Synthesizer(args.syn_model_fpath)
     vocoder.load_model(args.voc_model_fpath)
-    ## Run a test
-    # print("Testing your configuration with small inputs.")
-    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # # sampling rate, which may differ.
-    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # # to an audio of 1 second.
-    # print("	Testing the encoder...")
-    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
-    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # # possible.
-    # embed = np.random.rand(speaker_embedding_size)
-    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # # embeddings it will be).
-    # embed /= np.linalg.norm(embed)
-    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # # illustrate that
-    # embeds = [embed, np.zeros(speaker_embedding_size)]
-    # texts = ["test 1", "test 2"]
-    # print("	Testing the synthesizer... (loading the model will output a lot of text)")
-    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
-    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # # can concatenate the mel spectrograms to a single one.
-    # mel = np.concatenate(mels, axis=1)
-    # # The vocoder can take a callback function to display the generation. More on that later. For
-    # # now we'll simply hide it like this:
-    # no_action = lambda *args: None
-    # print("	Testing the vocoder...")
-    # # For the sake of making this test short, we'll pass a short target length. The target length
-    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # # that has a detrimental effect on the quality of the audio. The default parameters are
-    # # recommended in general.
-    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
-    print("All test passed! You can now synthesize speech.\n\n")
     ## Interactive speech generation
     print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
@@ -142,9 +100,8 @@ if __name__ == '__main__':
     # The following two methods are equivalent:
     # - Directly load from the filepath:
-    with open('pat.pickle', 'rb') as handle:
         preprocessed_wav = pickle.load(handle)
-    # - If the wav is already loaded:
     print("Loaded file succesfully")
@@ -198,4 +155,3 @@ if __name__ == '__main__':
     print(generated_wav.dtype)
     sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
     print("\nSaved output as %s\n\n" % filename)
-    print(os.environ)

 from audioread.exceptions import NoBackendError
 import pickle
+ALIAS = os.environ.get('alias', 'breen')
 if __name__ == '__main__':
     ## Info & args
     parser = argparse.ArgumentParser(
     encoder.load_model(args.enc_model_fpath)
     synthesizer = Synthesizer(args.syn_model_fpath)
     vocoder.load_model(args.voc_model_fpath)
     ## Interactive speech generation
     print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
     # The following two methods are equivalent:
     # - Directly load from the filepath:
+    with open(f'pickles/{ALIAS}.pickle', 'rb') as handle:
         preprocessed_wav = pickle.load(handle)
     print("Loaded file succesfully")
     print(generated_wav.dtype)
     sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
     print("\nSaved output as %s\n\n" % filename)