Update app.py
app.py (CHANGED): drops the ParlerTTSStreamer class, the Queue / Thread / BaseStreamer imports it needed, a stray "import datetime", and several blocks of commented-out batching and streaming code in stream_audio.
```diff
@@ -1,7 +1,5 @@
 import io
 import math
-from queue import Queue
-from threading import Thread
 from typing import Optional
 
 import numpy as np
@@ -12,7 +10,6 @@ import torch
 from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
-from transformers.generation.streamers import BaseStreamer
 from huggingface_hub import InferenceClient
 import nltk
 nltk.download('punkt')
```
```diff
@@ -38,135 +35,6 @@ SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 
 
-class ParlerTTSStreamer(BaseStreamer):
-    def __init__(
-        self,
-        model: ParlerTTSForConditionalGeneration,
-        device: Optional[str] = None,
-        play_steps: Optional[int] = 10,
-        stride: Optional[int] = None,
-        timeout: Optional[float] = None,
-    ):
-        """
-        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
-        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
-        Gradio demo).
-        Parameters:
-            model (`ParlerTTSForConditionalGeneration`):
-                The Parler-TTS model used to generate the audio waveform.
-            device (`str`, *optional*):
-                The torch device on which to run the computation. If `None`, will default to the device of the model.
-            play_steps (`int`, *optional*, defaults to 10):
-                The number of generation steps with which to return the generated audio array. Using fewer steps will
-                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
-                should be tuned to your device and latency requirements.
-            stride (`int`, *optional*):
-                The window (stride) between adjacent audio samples. Using a stride between adjacent audio samples reduces
-                the hard boundary between them, giving smoother playback. If `None`, will default to a value equivalent to
-                play_steps // 6 in the audio space.
-            timeout (`int`, *optional*):
-                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
-                in `.generate()`, when it is called in a separate thread.
-        """
-        self.decoder = model.decoder
-        self.audio_encoder = model.audio_encoder
-        self.generation_config = model.generation_config
-        self.device = device if device is not None else model.device
-
-        # variables used in the streaming process
-        self.play_steps = play_steps
-        if stride is not None:
-            self.stride = stride
-        else:
-            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
-            self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
-        self.token_cache = None
-        self.to_yield = 0
-
-        # varibles used in the thread process
-        self.audio_queue = Queue()
-        self.stop_signal = None
-        self.timeout = timeout
-
-    def apply_delay_pattern_mask(self, input_ids):
-        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
-        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
-            input_ids[:, :1],
-            bos_token_id=self.generation_config.bos_token_id,
-            pad_token_id=self.generation_config.decoder_start_token_id,
-            max_length=input_ids.shape[-1],
-        )
-        # apply the pattern mask to the input ids
-        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
-
-        # revert the pattern delay mask by filtering the pad token id
-        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
-        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
-        # append the frame dimension back to the audio codes
-        input_ids = input_ids[None, ...]
-
-        # send the input_ids to the correct device
-        input_ids = input_ids.to(self.audio_encoder.device)
-
-        decode_sequentially = (
-            self.generation_config.bos_token_id in input_ids
-            or self.generation_config.pad_token_id in input_ids
-            or self.generation_config.eos_token_id in input_ids
-        )
-        if not decode_sequentially:
-            output_values = self.audio_encoder.decode(
-                input_ids,
-                audio_scales=[None],
-            )
-        else:
-            sample = input_ids[:, 0]
-            sample_mask = (sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0
-            sample = sample[:, :, sample_mask]
-            output_values = self.audio_encoder.decode(sample[None, ...], [None])
-
-        audio_values = output_values.audio_values[0, 0]
-        return audio_values.cpu().float().numpy()
-
-    def put(self, value):
-        batch_size = value.shape[0] // self.decoder.num_codebooks
-        if batch_size > 1:
-            raise ValueError("ParlerTTSStreamer only supports batch size 1")
-
-        if self.token_cache is None:
-            self.token_cache = value
-        else:
-            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
-
-        if self.token_cache.shape[-1] % self.play_steps == 0:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
-            self.to_yield += len(audio_values) - self.to_yield - self.stride
-
-    def end(self):
-        """Flushes any remaining cache and appends the stop symbol."""
-        if self.token_cache is not None:
-            audio_values = self.apply_delay_pattern_mask(self.token_cache)
-        else:
-            audio_values = np.zeros(self.to_yield)
-
-        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
-
-    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
-        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
-        self.audio_queue.put(audio, timeout=self.timeout)
-        if stream_end:
-            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        value = self.audio_queue.get(timeout=self.timeout)
-        if not isinstance(value, np.ndarray) and value == self.stop_signal:
-            raise StopIteration()
-        else:
-            return value
-
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
```
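The trailing context above shows only the first lines of numpy_to_mp3; the diff elides the rest of its body. For orientation, a minimal implementation consistent with the visible signature, the float-normalization comment, and the io / pydub imports could look like the sketch below. This is an illustrative assumption, not the actual body in app.py.

```python
import io

import numpy as np
from pydub import AudioSegment


def numpy_to_mp3(audio_array, sampling_rate):
    # Illustrative sketch only; the real body in app.py is not shown in this diff.
    # Normalize floating-point audio into the 16-bit PCM range pydub expects.
    if np.issubdtype(audio_array.dtype, np.floating):
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array = audio_array / peak
        audio_array = (audio_array * 32767).astype(np.int16)

    segment = AudioSegment(
        data=audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,  # 2 bytes for int16
        channels=1,
    )

    # Encode to MP3 in memory (requires ffmpeg) and return the raw bytes.
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3")
    return buffer.getvalue()
```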
```diff
@@ -195,8 +63,7 @@ def numpy_to_mp3(audio_array, sampling_rate):
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
-
-import datetime
+
 
 @spaces.GPU
 def generate_base(subject, setting):
```
```diff
@@ -234,67 +101,10 @@ def stream_audio(state):
 
     story = ""
     for sentence, new_audio in zip(sentences, speech_output):
-        # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
         story += f"{sentence}\n"
         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
-    # BATCH_SIZE = 4
-    # for i in range(0, len(model_input), BATCH_SIZE):
-    #     inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
-    #     story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
-    #     description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
-    #     speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
-
-    #     speech_output = [output.cpu().numpy() for output in speech_output]
-    #     for j, new_audio in enumerate(speech_output):
-    #         if i + j == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
-    # if len(inputs) != 0:
-    #     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    #     story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
-
-    #     speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
-
-    #     speech_output = [output.cpu().numpy() for output in speech_output]
-
-    #     for i, new_audio in enumerate(speech_output):
-    #         if i == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
-    #     print(f"{i}-th part generated")
-    #     pieces += [*speech_output, silence.copy()]
-
-    # for i, sentence in enumerate(model_input):
-    #     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
-
-    #     prompt = tokenizer(sentence, return_tensors="pt").to(device)
-
-    #     generation_kwargs = dict(
-    #         input_ids=inputs.input_ids,
-    #         prompt_input_ids=prompt.input_ids,
-    #         streamer=streamer,
-    #         do_sample=True,
-    #         temperature=1.0,
-    #         min_new_tokens=10,
-    #     )
-
-    #     set_seed(SEED)
-    #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    #     thread.start()
-
-    #     for new_audio in streamer:
-    #         if i == 0:
-    #             gr.Info("Reading story", duration=3)
-    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-
 
 with gr.Blocks() as block:
     gr.HTML(
```
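For reference, the commented-out code deleted above documents how the (also deleted) ParlerTTSStreamer was meant to be driven: model.generate runs in a worker thread with the streamer attached, while the caller iterates the streamer, which blocks on its queue until each decoded chunk arrives. Below is a sketch assembled from those fragments; it assumes the module-level model, tokenizer, device, SEED, and SAMPLE_RATE defined earlier in app.py, and the description/prompt strings are placeholders, not values from the Space.

```python
from threading import Thread

# Sketch reconstructed from the deleted commented-out code; placeholder strings.
streamer = ParlerTTSStreamer(model, device=device, play_steps=10)

description_ids = tokenizer("A calm narrator voice.", return_tensors="pt").input_ids.to(device)
prompt_ids = tokenizer("Once upon a time...", return_tensors="pt").input_ids.to(device)

generation_kwargs = dict(
    input_ids=description_ids,    # Parler-TTS conditions the voice on the description
    prompt_input_ids=prompt_ids,  # the text to be spoken
    streamer=streamer,
    do_sample=True,
    temperature=1.0,
    min_new_tokens=10,
)

set_seed(SEED)
Thread(target=model.generate, kwargs=generation_kwargs).start()

# Iterating the streamer blocks until the next chunk (or the stop signal) is queued.
for new_audio in streamer:
    print(f"Sample of length: {round(new_audio.shape[0] / SAMPLE_RATE, 2)} seconds")
```

Since this commit removes the class and its Queue/Thread/BaseStreamer imports, the sketch applies only to the pre-change revision of the Space.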