Pijush2023 committed
Commit 7e81093 · verified · 1 Parent(s): 3c82a74

Update app.py

Files changed (1)
  1. app.py +60 -60
app.py CHANGED
@@ -949,76 +949,76 @@ def generate_audio_elevenlabs(text):
 
 # chunking audio and then Process
 
-import concurrent.futures
-import tempfile
-import os
-import numpy as np
-import logging
-from queue import Queue
-from threading import Thread
-from scipy.io.wavfile import write as write_wav
-from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-from transformers import AutoTokenizer
-
-# Ensure your device is set to CUDA
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-repo_id = "parler-tts/parler-tts-mini-v1"
-
-def generate_audio_parler_tts(text):
-    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 0.5
-
-    # Initialize the tokenizer and model
-    parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-    parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-    sampling_rate = parler_model.audio_encoder.config.sampling_rate
-    frame_rate = parler_model.audio_encoder.config.frame_rate
-
-    def generate(text, description, play_steps_in_s=0.5):
-        play_steps = int(frame_rate * play_steps_in_s)
-        streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-        inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-        prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-        generation_kwargs = dict(
-            input_ids=inputs.input_ids,
-            prompt_input_ids=prompt.input_ids,
-            attention_mask=inputs.attention_mask,
-            prompt_attention_mask=prompt.attention_mask,
-            streamer=streamer,
-            do_sample=True,
-            temperature=1.0,
-            min_new_tokens=10,
-        )
-
-        thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        for new_audio in streamer:
-            if new_audio.shape[0] == 0:
-                break
-            # Save or process each audio chunk as it is generated
-            yield sampling_rate, new_audio
-
-    audio_segments = []
-    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-        audio_segments.append(audio_chunk)
-
-        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-        logging.debug(f"Saved chunk to {temp_audio_path}")
-
-    # Combine all the audio chunks into one audio file
-    combined_audio = np.concatenate(audio_segments)
-    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
-    write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
-    logging.debug(f"Combined audio saved to {combined_audio_path}")
-    return combined_audio_path
+# import concurrent.futures
+# import tempfile
+# import os
+# import numpy as np
+# import logging
+# from queue import Queue
+# from threading import Thread
+# from scipy.io.wavfile import write as write_wav
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+# from transformers import AutoTokenizer
+
+# # Ensure your device is set to CUDA
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# repo_id = "parler-tts/parler-tts-mini-v1"
+
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )
+
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save or process each audio chunk as it is generated
+#             yield sampling_rate, new_audio
+
+#     audio_segments = []
+#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk)
+
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+#         logging.debug(f"Saved chunk to {temp_audio_path}")
+
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
+#     return combined_audio_path
 
 
 def fetch_local_events():
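
If this block is ever re-enabled, the removed function can be smoke-tested in isolation. A minimal sketch, assuming parler_tts, transformers, scipy, numpy, and a CUDA-capable torch are installed and the commented-out definitions above are restored; the __main__ guard and sample sentence below are illustrative, not part of the commit:

    import logging

    logging.basicConfig(level=logging.DEBUG)  # surfaces the "Saved chunk to ..." debug messages

    if __name__ == "__main__":
        # generate_audio_parler_tts streams ~0.5 s chunks via ParlerTTSStreamer,
        # writes each chunk to a temp WAV, then returns the combined file's path.
        audio_path = generate_audio_parler_tts("Hello, this is a Parler-TTS streaming test.")
        print(f"Combined audio written to: {audio_path}")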