Update app.py
app.py (CHANGED): drops the ParlerTTSStreamer class, the Queue / Thread / BaseStreamer imports it needed, a stray "import datetime", and several blocks of commented-out batching and streaming code in stream_audio.
```diff
@@ -1,7 +1,5 @@
 import io
 import math
-from queue import Queue
-from threading import Thread
 from typing import Optional
 
 import numpy as np
@@ -12,7 +10,6 @@ import torch
 from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
-from transformers.generation.streamers import BaseStreamer
 from huggingface_hub import InferenceClient
 import nltk
 nltk.download('punkt')
```
```diff
@@ -38,135 +35,6 @@ SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 
 
-class ParlerTTSStreamer(BaseStreamer):
-    def __init__(
-        self,
-        model: ParlerTTSForConditionalGeneration,
-        device: Optional[str] = None,
-        play_steps: Optional[int] = 10,
-        stride: Optional[int] = None,
-        timeout: Optional[float] = None,
-    ):
-        """
-        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
-        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
-        Gradio demo).
-        Parameters:
-            model (`ParlerTTSForConditionalGeneration`):
-                The Parler-TTS model used to generate the audio waveform.
-            device (`str`, *optional*):
-                The torch device on which to run the computation. If `None`, will default to the device of the model.
-            play_steps (`int`, *optional*, defaults to 10):
-                The number of generation steps with which to return the generated audio array. Using fewer steps will
-                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
-                should be tuned to your device and latency requirements.
-            stride (`int`, *optional*):
-                The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
-                the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
-                play_steps // 6 in the audio space.
-            timeout (`int`, *optional*):
-                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
-                in `.generate()`, when it is called in a separate thread.
-        """
-        self.decoder = model.decoder
-        self.audio_encoder = model.audio_encoder
-        self.generation_config = model.generation_config
-        self.device = device if device is not None else model.device
-
-        # variables used in the streaming process
-        self.play_steps = play_steps
-        if stride is not None:
-            self.stride = stride
-        else:
-            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
-            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
-        self.token_cache = None
-        self.to_yield = 0
-
-        # varibles used in the thread process
-        self.audio_queue = Queue()
-        self.stop_signal = None
-        self.timeout = timeout
-
-    def apply_delay_pattern_mask(self, input_ids):
-        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
-        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
-            input_ids[:, :1],
-            bos_token_id=self.generation_config.bos_token_id,
-            pad_token_id=self.generation_config.decoder_start_token_id,
-            max_length=input_ids.shape[-1],
-        )
-        # apply the pattern mask to the input ids
-        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
-
-        # revert the pattern delay mask by filtering the pad token id
-        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
-        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
-        # append the frame dimension back to the audio codes
-        input_ids = input_ids[None, ...]
-
-        # send the input_ids to the correct device
-        input_ids = input_ids.to(self.audio_encoder.device)
-
-        decode_sequentially = (
-            self.generation_config.bos_token_id in input_ids
-            or self.generation_config.pad_token_id in input_ids
-            or self.generation_config.eos_token_id in input_ids
-        )
-        if not decode_sequentially:
-            output_values = self.audio_encoder.decode(
-                input_ids,
-                audio_scales=[None],
-            )
-        else:
-            sample = input_ids[:, 0]
-            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
-            sample = sample[:, :, sample_mask]
-            output_values = self.audio_encoder.decode(sample[None, ...], [None])
-
-        audio_values = output_values.audio_values[0, 0]
-        return audio_values.cpu().float().numpy()
-
-    def put(self, value):
-        batch_size = value.shape[0] // self.decoder.num_codebooks
-        if batch_size > 1:
-            raise ValueError("ParlerTTSStreamer only supports batch size 1")
-
-        if self.token_cache is None:
-            self.token_cache = value
-        else:
-            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
-
-        if self.token_cache.shape[-1] % self.play_steps == 0:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
-            self.to_yield += len(audio_values) - self.to_yield - self.stride
-
-    def end(self):
-        """Flushes any remaining cache and appends the stop symbol."""
-        if self.token_cache is not None:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-        else:
-            audio_values = np.zeros(self.to_yield)
-
-        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
-
-    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
-        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
-        self.audio_queue.put(audio, timeout=self.timeout)
-        if stream_end:
-            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        value = self.audio_queue.get(timeout=self.timeout)
-        if not isinstance(value, np.ndarray) and value == self.stop_signal:
-            raise StopIteration()
-        else:
-            return value
-
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
```
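The trailing context above shows only the first lines of numpy_to_mp3; the diff elides the rest of its body. For orientation, a minimal implementation consistent with the visible signature, the float-normalization comment, and the io / pydub imports could look like the sketch below. This is an illustrative assumption, not the actual body in app.py.

```python
import io

import numpy as np
from pydub import AudioSegment


def numpy_to_mp3(audio_array, sampling_rate):
    # Illustrative sketch only; the real body in app.py is not shown in this diff.
    # Normalize floating-point audio into the 16-bit PCM range pydub expects.
    if np.issubdtype(audio_array.dtype, np.floating):
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array = audio_array / peak
        audio_array = (audio_array * 32767).astype(np.int16)

    segment = AudioSegment(
        data=audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,  # 2 bytes for int16
        channels=1,
    )

    # Encode to MP3 in memory (requires ffmpeg) and return the raw bytes.
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3")
    return buffer.getvalue()
```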
```diff
@@ -195,8 +63,7 @@ def numpy_to_mp3(audio_array, sampling_rate):
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
-
-import datetime
+
 
 @spaces.GPU
 def generate_base(subject, setting):
```
```diff
@@ -234,67 +101,10 @@ def stream_audio(state):
 
     story = ""
     for sentence, new_audio in zip(sentences, speech_output):
-        # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
         story += f"{sentence}\n"
         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
-    # BATCH_SIZE = 4
-    # for i in range(0, len(model_input), BATCH_SIZE):
-    #     inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
-    #     story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
-    #     description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
-    #     speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
-
-    #     speech_output = [output.cpu().numpy() for output in speech_output]
-    #     for j, new_audio in enumerate(speech_output):
-    #         if i + j == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
-    # if len(inputs) != 0:
-    #     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    #     story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
-
-    #     speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
-
-    #     speech_output = [output.cpu().numpy() for output in speech_output]
-
-    #     for i, new_audio in enumerate(speech_output):
-    #         if i == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
-    #     print(f"{i}-th part generated")
-    #     pieces += [*speech_output, silence.copy()]
-
-    # for i, sentence in enumerate(model_input):
-    #     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
-
-    #     prompt = tokenizer(sentence, return_tensors="pt").to(device)
-
-    #     generation_kwargs = dict(
-    #         input_ids=inputs.input_ids,
-    #         prompt_input_ids=prompt.input_ids,
-    #         streamer=streamer,
-    #         do_sample=True,
-    #         temperature=1.0,
-    #         min_new_tokens=10,
-    #     )
-
-    #     set_seed(SEED)
-    #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    #     thread.start()
-
-    #     for new_audio in streamer:
-    #         if i == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
 
 with gr.Blocks() as block:
     gr.HTML(
```
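For reference, the commented-out code deleted above documents how the (also deleted) ParlerTTSStreamer was meant to be driven: model.generate runs in a worker thread with the streamer attached, while the caller iterates the streamer, which blocks on its queue until each decoded chunk arrives. Below is a sketch assembled from those fragments; it assumes the module-level model, tokenizer, device, SEED, and SAMPLE_RATE defined earlier in app.py, and the description/prompt strings are placeholders, not values from the Space.

```python
from threading import Thread

# Sketch reconstructed from the deleted commented-out code; placeholder strings.
streamer = ParlerTTSStreamer(model, device=device, play_steps=10)

description_ids = tokenizer("A calm narrator voice.", return_tensors="pt").input_ids.to(device)
prompt_ids = tokenizer("Once upon a time...", return_tensors="pt").input_ids.to(device)

generation_kwargs = dict(
    input_ids=description_ids,    # Parler-TTS conditions the voice on the description
    prompt_input_ids=prompt_ids,  # the text to be spoken
    streamer=streamer,
    do_sample=True,
    temperature=1.0,
    min_new_tokens=10,
)

set_seed(SEED)
Thread(target=model.generate, kwargs=generation_kwargs).start()

# Iterating the streamer blocks until the next chunk (or the stop signal) is queued.
for new_audio in streamer:
    print(f"Sample of length: {round(new_audio.shape[0] / SAMPLE_RATE, 2)} seconds")
```

Since this commit removes the class and its Queue/Thread/BaseStreamer imports, the sketch applies only to the pre-change revision of the Space.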