Pijush2023 committed on
Commit
1472828
·
verified ·
1 Parent(s): a17fdc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -4
app.py CHANGED
@@ -746,14 +746,87 @@ def generate_audio_elevenlabs(text):
746
  return None
747
 
748
 
749
- from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
750
- from transformers import AutoTokenizer
751
- from threading import Thread
752
 
753
- repo_id = "parler-tts/parler-tts-mini-v1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
 
 
 
 
755
 
 
756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
  def generate_audio_parler_tts(text):
759
  description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
@@ -817,6 +890,7 @@ def generate_audio_parler_tts(text):
817
 
818
 
819
 
 
820
  def fetch_local_events():
821
  api_key = os.environ['SERP_API']
822
  url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'
 
746
  return None
747
 
748
 
749
+ # from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
750
+ # from transformers import AutoTokenizer
751
+ # from threading import Thread
752
 
753
+ # repo_id = "parler-tts/parler-tts-mini-v1"
754
+
755
+
756
+
757
+
758
+ # def generate_audio_parler_tts(text):
759
+ # description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
760
+ # chunk_size_in_s = 0.5
761
+
762
+ # # Initialize the tokenizer and model
763
+ # parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
764
+ # parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
765
+ # sampling_rate = parler_model.audio_encoder.config.sampling_rate
766
+ # frame_rate = parler_model.audio_encoder.config.frame_rate
767
+
768
+ # def generate(text, description, play_steps_in_s=0.5):
769
+ # play_steps = int(frame_rate * play_steps_in_s)
770
+ # streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
771
+
772
+ # inputs = parler_tokenizer(description, return_tensors="pt").to(device)
773
+ # prompt = parler_tokenizer(text, return_tensors="pt").to(device)
774
+
775
+ # generation_kwargs = dict(
776
+ # input_ids=inputs.input_ids,
777
+ # prompt_input_ids=prompt.input_ids,
778
+ # attention_mask=inputs.attention_mask,
779
+ # prompt_attention_mask=prompt.attention_mask,
780
+ # streamer=streamer,
781
+ # do_sample=True,
782
+ # temperature=1.0,
783
+ # min_new_tokens=10,
784
+ # )
785
+
786
+ # thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
787
+ # thread.start()
788
+
789
+ # for new_audio in streamer:
790
+ # if new_audio.shape[0] == 0:
791
+ # break
792
+ # # Save or process each audio chunk as it is generated
793
+ # yield sampling_rate, new_audio
794
+
795
+ # audio_segments = []
796
+ # for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
797
+ # audio_segments.append(audio_chunk)
798
+
799
+ # temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
800
+ # write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
801
+ # logging.debug(f"Saved chunk to {temp_audio_path}")
802
+
803
 
804
+ # # Combine all the audio chunks into one audio file
805
+ # combined_audio = np.concatenate(audio_segments)
806
+ # combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
807
 
808
+ # write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
809
 
810
+ # logging.debug(f"Combined audio saved to {combined_audio_path}")
811
+ # return combined_audio_path
812
+
813
+
814
+
815
+ import concurrent.futures
816
+ import tempfile
817
+ import os
818
+ import numpy as np
819
+ import logging
820
+ from queue import Queue
821
+ from threading import Thread
822
+ from scipy.io.wavfile import write as write_wav
823
+ from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
824
+ from transformers import AutoTokenizer
825
+
826
+ # Use the first CUDA GPU when available; otherwise fall back to the CPU
827
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
828
+
829
+ repo_id = "parler-tts/parler-tts-mini-v1"
830
 
831
  def generate_audio_parler_tts(text):
832
  description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
 
890
 
891
 
892
 
893
+
894
  def fetch_local_events():
895
  api_key = os.environ['SERP_API']
896
  url = f'https://serpapi.com/search.json?engine=google_events&q=Events+in+Birmingham&hl=en&gl=us&api_key={api_key}'