Seed
app.py
CHANGED
@@ -10,6 +10,8 @@ from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder import AutoencoderKL
 from pydub import AudioSegment
 
+max_64_bit_int = 2**63 - 1
+
 # Automatic device detection
 if torch.cuda.is_available():
     device_type = "cuda"
@@ -81,11 +83,18 @@ tango.vae.to(device_type)
 tango.stft.to(device_type)
 tango.model.to(device_type)
 
+def update_seed(is_randomize_seed, seed):
+    if is_randomize_seed:
+        return random.randint(0, max_64_bit_int)
+    return seed
+
 def check(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     if prompt is None or prompt == "":
         raise gr.Error("Please provide a prompt input.")
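The new update_seed helper above is a pure function: when the checkbox is ticked it draws a fresh seed, otherwise it passes the user's value through. It is wired up later so its return value is written back into the seed slider, which means the UI always displays the seed that was actually used. A minimal standalone sketch of the same pattern (the print harness is illustrative, not part of the Space):

import random

max_64_bit_int = 2**63 - 1  # largest signed 64-bit integer, same bound as the diff

def update_seed(is_randomize_seed, seed):
    # Draw a fresh seed when randomization is on; otherwise keep the given one.
    if is_randomize_seed:
        return random.randint(0, max_64_bit_int)
    return seed

print(update_seed(False, 123))  # -> 123 (seed kept)
print(update_seed(True, 123))   # -> a fresh value in [0, 2**63 - 1]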
@@ -104,9 +113,18 @@ def text2audio(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     start = time.time()
+
+    if seed is None:
+        seed = random.randint(0, max_64_bit_int)
+
+    random.seed(seed)
+    torch.manual_seed(seed)
+
     output_wave = tango.generate(prompt, steps, guidance, output_number)
 
     output_wave_1 = gr.make_waveform((16000, output_wave[0]))
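Seeding both Python's random module and PyTorch, as the hunk above does before calling tango.generate, is what makes a fixed seed reproduce a generation. A self-contained sketch of that property, with plain torch.randn standing in for the diffusion sampler (an assumption for illustration; torch.manual_seed also seeds the CUDA generators in current PyTorch):

import random
import torch

def sample(seed):
    # Seed both RNGs the way text2audio now does before generating.
    random.seed(seed)
    torch.manual_seed(seed)
    return torch.randn(4)  # stand-in for the model's stochastic sampling

assert torch.equal(sample(123), sample(123))  # same seed, same draw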
@@ -162,6 +180,8 @@ with gr.Blocks() as interface:
         output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
         denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 10, maximum = 200, value = 10, step = 1, interactive = True)
         guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
+        randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
+        seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
 
         submit = gr.Button("🚀 Generate", variant = "primary")
 
@@ -170,11 +190,18 @@ with gr.Blocks() as interface:
         output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
         information = gr.Label(label = "Information")
 
-        submit.click(fn = check, inputs = [
+        submit.click(fn = update_seed, inputs = [
+            randomize_seed,
+            seed
+        ], outputs = [
+            seed
+        ], queue = False, show_progress = False).then(fn = check, inputs = [
             input_text,
             output_number,
             denoising_steps,
-            guidance_scale
+            guidance_scale,
+            randomize_seed,
+            seed
         ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
             output_format,
             output_number
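The rewired handler leans on Gradio's event chaining: .click() returns an event whose .then() step always runs next, while .success() fires only if the previous step finished without raising gr.Error. That is why update_seed can refresh the seed slider before check validates the inputs and the generation runs. A minimal sketch of the same chain with toy functions (names here are hypothetical, not the Space's):

import gradio as gr

def bump(x):
    return x + 1

def validate(x):
    if x < 0:
        raise gr.Error("negative input")  # aborts the chain; .success() won't fire

def finish(x):
    return f"done with {x}"

with gr.Blocks() as demo:
    num = gr.Number(value = 0)
    out = gr.Textbox()
    btn = gr.Button("Run")
    # Same shape as the diff: click -> then -> success
    btn.click(fn = bump, inputs = [num], outputs = [num], queue = False
    ).then(fn = validate, inputs = [num], outputs = [], queue = False
    ).success(fn = finish, inputs = [num], outputs = [out])

demo.launch()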
@@ -187,7 +214,9 @@ with gr.Blocks() as interface:
             input_text,
             output_number,
             denoising_steps,
-            guidance_scale
+            guidance_scale,
+            randomize_seed,
+            seed
         ], outputs = [
             output_audio_1,
             output_audio_2,
@@ -209,7 +238,9 @@ with gr.Blocks() as interface:
             input_text,
             output_number,
             denoising_steps,
-            guidance_scale
+            guidance_scale,
+            randomize_seed,
+            seed
         ],
         outputs = [
             output_audio_1,
@@ -218,11 +249,11 @@ with gr.Blocks() as interface:
             information
         ],
         examples = [
-            ["A hammer is hitting a wooden surface", 3, 100, 3],
-            ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3],
-            ["A man is speaking in a small room.", 2, 100, 3],
-            ["A female is speaking followed by footstep sound", 1, 100, 3],
-            ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3],
+            ["A hammer is hitting a wooden surface", 3, 100, 3, False, 123],
+            ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3, False, 123],
+            ["A man is speaking in a small room.", 2, 100, 3, False, 123],
+            ["A female is speaking followed by footstep sound", 1, 100, 3, False, 123],
+            ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3, False, 123],
         ],
         cache_examples = "lazy",
     )