Pijush2023 committed
Commit 637dfac · verified · 1 Parent(s): 20d73f5

Update app.py

Files changed (1): app.py (+44 −63)
app.py CHANGED
@@ -746,9 +746,9 @@ def generate_audio_elevenlabs(text):
     return None
 
 
-from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-from transformers import AutoTokenizer
-from threading import Thread
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+# from transformers import AutoTokenizer
+# from threading import Thread
 
 # repo_id = "parler-tts/parler-tts-mini-v1"
 
@@ -795,12 +795,11 @@ from threading import Thread
 #     audio_segments = []
 #     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
 #         audio_segments.append(audio_chunk)
-#         # Here, you can save the chunk to a file or send it to a frontend
-#         # For example, you could write the chunk to a file immediately:
+
 #         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
 #         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
 #         logging.debug(f"Saved chunk to {temp_audio_path}")
-#         # You could also send the chunk to a web client if this was a web application
+
 
 #     # Combine all the audio chunks into one audio file
 #     combined_audio = np.concatenate(audio_segments)
@@ -811,61 +810,37 @@ from threading import Thread
 #     logging.debug(f"Combined audio saved to {combined_audio_path}")
 #     return combined_audio_path
 
+
 import concurrent.futures
 import tempfile
 import os
 import numpy as np
+import logging
+from queue import Queue
 from threading import Thread
-from transformers import AutoTokenizer
-from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
 from scipy.io.wavfile import write as write_wav
-import logging
-import torch
-
-# Set up device and dtype
-torch_device = "cuda:0"  # Use "mps" for Mac or "cpu" if CUDA is unavailable
-torch_dtype = torch.bfloat16
-
-# Set model name and other configurations
-model_name = "parler-tts/parler-tts-mini-v1"
-attn_implementation = "eager"  # Options: "eager", "sdpa", "flash_attention_2"
-compile_mode = "default"  # Options: "default", "reduce-overhead"
-max_length = 50  # Set padding max length
-
-# Load the model with efficient attention and compile optimizations
-parler_tokenizer = AutoTokenizer.from_pretrained(model_name)
-parler_model = ParlerTTSForConditionalGeneration.from_pretrained(
-    model_name,
-    attn_implementation=attn_implementation
-).to(torch_device, dtype=torch_dtype)
-
-# Compile the forward pass for faster generation
-parler_model.generation_config.cache_implementation = "static"
-parler_model.forward = torch.compile(parler_model.forward, mode=compile_mode)
-
-# Warmup to optimize the model after compilation
-inputs = parler_tokenizer("This is for compilation", return_tensors="pt", padding="max_length", max_length=max_length).to(torch_device)
-model_kwargs = {**inputs, "prompt_input_ids": inputs.input_ids, "prompt_attention_mask": inputs.attention_mask}
-
-n_steps = 1 if compile_mode == "default" else 2
-for _ in range(n_steps):
-    _ = parler_model.generate(**model_kwargs)
+from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from transformers import AutoTokenizer
 
+repo_id = "parler-tts/parler-tts-mini-v1"
+device = "cuda:0"  # or "cpu" if CUDA is not available
 
 def generate_audio_parler_tts(text):
     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-
-    chunk_size_in_s = 0.3  # Smaller chunk size for lower latency
+    chunk_size_in_s = 0.5
+
+    # Initialize the tokenizer and model
+    parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
     sampling_rate = parler_model.audio_encoder.config.sampling_rate
     frame_rate = parler_model.audio_encoder.config.frame_rate
 
-    play_steps = int(frame_rate * chunk_size_in_s)
-
-    def generate_chunks(text, description):
-        streamer = ParlerTTSStreamer(parler_model, device=torch_device, play_steps=play_steps)
+    def generate(text, description, play_steps_in_s=0.5):
+        play_steps = int(frame_rate * play_steps_in_s)
+        streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
 
-        inputs = parler_tokenizer(description, return_tensors="pt").to(torch_device)
-        prompt = parler_tokenizer(text, return_tensors="pt").to(torch_device)
+        inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+        prompt = parler_tokenizer(text, return_tensors="pt").to(device)
 
         generation_kwargs = dict(
             input_ids=inputs.input_ids,
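
Note: the rewritten function now calls `AutoTokenizer.from_pretrained` and `ParlerTTSForConditionalGeneration.from_pretrained` on every invocation, trading the removed module-level setup (and its `torch.compile` warmup) for simpler code at the cost of a full model load per request. If that reload becomes a bottleneck, here is a minimal sketch of a lazy module-level cache the function could call instead; the `load_parler` helper and `_parler_cache` dict are hypothetical names, not part of this commit:

```python
# Sketch only: cache the Parler-TTS weights across calls instead of
# reloading them inside generate_audio_parler_tts on every request.
# load_parler and _parler_cache are hypothetical helpers.
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

repo_id = "parler-tts/parler-tts-mini-v1"
device = "cuda:0"  # or "cpu" if CUDA is not available

_parler_cache = {}

def load_parler():
    # Load tokenizer and model once; later calls reuse the cached objects.
    if "model" not in _parler_cache:
        _parler_cache["tokenizer"] = AutoTokenizer.from_pretrained(repo_id)
        _parler_cache["model"] = (
            ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
        )
    return _parler_cache["tokenizer"], _parler_cache["model"]
```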
@@ -874,7 +849,7 @@ def generate_audio_parler_tts(text):
             prompt_attention_mask=prompt.attention_mask,
             streamer=streamer,
             do_sample=True,
-            temperature=0.7,  # Lower temperature for faster generation
+            temperature=1.0,
             min_new_tokens=10,
         )
 
@@ -886,31 +861,37 @@ def generate_audio_parler_tts(text):
                 break
             yield sampling_rate, new_audio
 
-    def process_audio_chunks(chunks):
-        audio_segments = []
-        for sampling_rate, audio_chunk in chunks:
-            audio_segments.append(audio_chunk)
-            temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+    # Queue to hold the audio chunks
+    audio_queue = Queue()
+    combined_audio = []
+
+    def process_chunks():
+        while True:
+            sampling_rate, audio_chunk = audio_queue.get()
+            if audio_chunk is None:  # Stop processing when a None is received
+                break
+            combined_audio.append(audio_chunk)
+            temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(combined_audio)}.wav")
             write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
             logging.debug(f"Saved chunk to {temp_audio_path}")
-            # Optionally, send this chunk to the client in real-time
-        return audio_segments
+            # Start playing or buffering the audio chunk here if required
+            # (e.g., send to a player or a frontend for immediate playback)
 
+    # Start the chunk processing in a separate thread
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        # Start processing audio chunks in a separate thread
-        future_chunks = executor.submit(process_audio_chunks, generate_chunks(text, description))
+        executor.submit(process_chunks)
 
-        # Continue with other tasks in parallel
-        # (e.g., you can update the chatbot interface, handle other requests, etc.)
+        for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+            audio_queue.put((sampling_rate, audio_chunk))
 
-        # Wait for audio processing to complete and get the result
-        audio_segments = future_chunks.result()
+        # Signal the end of processing
+        audio_queue.put((None, None))
 
     # Combine all the audio chunks into one audio file
-    combined_audio = np.concatenate(audio_segments)
+    combined_audio_np = np.concatenate(combined_audio)
     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
 
-    write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+    write_wav(combined_audio_path, sampling_rate, combined_audio_np.astype(np.float32))
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
     return combined_audio_path
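
Note: the new chunk handling is a single-producer/single-consumer pipeline: the generation loop puts `(sampling_rate, audio_chunk)` tuples on a `Queue`, a worker thread drains it, and a `(None, None)` sentinel stops the worker. Leaving the `with ThreadPoolExecutor` block waits for the worker to finish, which is why `np.concatenate(combined_audio)` is safe afterwards. A self-contained sketch of the same pattern, with dummy NumPy arrays standing in for Parler-TTS output:

```python
# Self-contained sketch of the queue-based chunk pipeline used above;
# dummy NumPy arrays stand in for Parler-TTS audio chunks.
import concurrent.futures
import numpy as np
from queue import Queue

audio_queue = Queue()
combined_audio = []

def process_chunks():
    while True:
        sampling_rate, audio_chunk = audio_queue.get()
        if audio_chunk is None:  # sentinel: no more chunks
            break
        combined_audio.append(audio_chunk)

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.submit(process_chunks)
    for _ in range(5):  # producer: five fake 0.5 s chunks at 44.1 kHz
        audio_queue.put((44100, np.zeros(22050, dtype=np.float32)))
    audio_queue.put((None, None))  # tell the consumer to stop
# Exiting the with-block waits for process_chunks to return,
# so combined_audio is complete at this point.
print(np.concatenate(combined_audio).shape)  # (110250,)
```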
 