Spaces:
Sleeping
Sleeping
Commit
·
ee3a553
1
Parent(s):
5b58cc8
batch
Browse files
app.py
CHANGED
|
@@ -218,32 +218,52 @@ def generate_base(subject, setting, ):
|
|
| 218 |
play_steps = int(frame_rate * play_steps_in_s)
|
| 219 |
|
| 220 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
for i
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
|
| 249 |
with gr.Blocks() as block:
|
|
|
|
| 218 |
play_steps = int(frame_rate * play_steps_in_s)
|
| 219 |
|
| 220 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
| 221 |
+
description = [description for _ in range(len(model_input))]
|
| 222 |
+
description_tokens = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
| 223 |
+
|
| 224 |
+
# for i in range(0, len(model_input), BATCH_SIZE):
|
| 225 |
+
# inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
|
| 226 |
+
|
| 227 |
+
# if len(inputs) != 0:
|
| 228 |
+
# input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
| 229 |
+
story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
|
| 230 |
+
|
| 231 |
+
speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
|
| 232 |
+
|
| 233 |
+
speech_output = [output.cpu().numpy() for output in speech_output]
|
| 234 |
+
|
| 235 |
+
for i, new_audio in enumerate(speech_output):
|
| 236 |
+
if i == 0:
|
| 237 |
+
gr.Info("Reading story", duration=3)
|
| 238 |
+
print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
| 239 |
+
yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
| 240 |
+
|
| 241 |
+
# print(f"{i}-th part generated")
|
| 242 |
+
# pieces += [*speech_output, silence.copy()]
|
| 243 |
+
|
| 244 |
+
# for i, sentence in enumerate(model_input):
|
| 245 |
+
# streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
|
| 246 |
+
|
| 247 |
+
# prompt = tokenizer(sentence, return_tensors="pt").to(device)
|
| 248 |
+
|
| 249 |
+
# generation_kwargs = dict(
|
| 250 |
+
# input_ids=inputs.input_ids,
|
| 251 |
+
# prompt_input_ids=prompt.input_ids,
|
| 252 |
+
# streamer=streamer,
|
| 253 |
+
# do_sample=True,
|
| 254 |
+
# temperature=1.0,
|
| 255 |
+
# min_new_tokens=10,
|
| 256 |
+
# )
|
| 257 |
+
|
| 258 |
+
# set_seed(SEED)
|
| 259 |
+
# thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
| 260 |
+
# thread.start()
|
| 261 |
+
|
| 262 |
+
# for new_audio in streamer:
|
| 263 |
+
# if i == 0:
|
| 264 |
+
# gr.Info("Reading story", duration=3)
|
| 265 |
+
# print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
|
| 266 |
+
# yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
|
| 267 |
|
| 268 |
|
| 269 |
with gr.Blocks() as block:
|