Pijush2023 committed
Commit 8a9dfcb · verified · 1 Parent(s): 11d8a01

Update app.py

Files changed (1): app.py (+89, -81)
app.py CHANGED
@@ -337,7 +337,7 @@ chain_neo4j = (
 
 
 
-
+#perfect
 
 
 def bot(history, choice, tts_choice, retrieval_mode, model_choice):
@@ -369,6 +369,8 @@ def bot(history, choice, tts_choice, retrieval_mode, model_choice):
     history.append([response, None])
 
 
+#Bot response withstream and flicker with time
+
 # def bot(history, choice, tts_choice, retrieval_mode, model_choice):
 #     if not history:
 #         return history
@@ -771,6 +773,8 @@ def generate_audio_elevenlabs(text):
     return None
 
 
+#Normal cases
+
 # from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
 # from transformers import AutoTokenizer
 # from threading import Thread
@@ -836,78 +840,7 @@ def generate_audio_elevenlabs(text):
 # return combined_audio_path
 
 
-
-# import concurrent.futures
-# import tempfile
-# import os
-# import numpy as np
-# import logging
-# from queue import Queue
-# from threading import Thread
-# from scipy.io.wavfile import write as write_wav
-# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-# from transformers import AutoTokenizer
-
-# # Ensure your device is set to CUDA
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# repo_id = "parler-tts/parler-tts-mini-v1"
-
-# def generate_audio_parler_tts(text):
-#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-#     chunk_size_in_s = 0.5
-
-#     # Initialize the tokenizer and model
-#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
-#     frame_rate = parler_model.audio_encoder.config.frame_rate
-
-#     def generate(text, description, play_steps_in_s=0.5):
-#         play_steps = int(frame_rate * play_steps_in_s)
-#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-#         generation_kwargs = dict(
-#             input_ids=inputs.input_ids,
-#             prompt_input_ids=prompt.input_ids,
-#             attention_mask=inputs.attention_mask,
-#             prompt_attention_mask=prompt.attention_mask,
-#             streamer=streamer,
-#             do_sample=True,
-#             temperature=1.0,
-#             min_new_tokens=10,
-#         )
-
-#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-#         thread.start()
-
-#         for new_audio in streamer:
-#             if new_audio.shape[0] == 0:
-#                 break
-#             # Save or process each audio chunk as it is generated
-#             yield sampling_rate, new_audio
-
-#     audio_segments = []
-#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-#         audio_segments.append(audio_chunk)
-
-#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-#         logging.debug(f"Saved chunk to {temp_audio_path}")
-
-
-#     # Combine all the audio chunks into one audio file
-#     combined_audio = np.concatenate(audio_segments)
-#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
-#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
-#     logging.debug(f"Combined audio saved to {combined_audio_path}")
-#     return combined_audio_path
-
+# chunking audio and then Process
 
 import concurrent.futures
 import tempfile
@@ -917,8 +850,8 @@ import logging
 from queue import Queue
 from threading import Thread
 from scipy.io.wavfile import write as write_wav
-from transformers import AutoTokenizer
 from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from transformers import AutoTokenizer
 
 # Ensure your device is set to CUDA
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -959,22 +892,97 @@ def generate_audio_parler_tts(text):
         for new_audio in streamer:
             if new_audio.shape[0] == 0:
                 break
-            # Save each audio chunk as it is generated
-            temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-            write_wav(temp_audio_path, sampling_rate, new_audio.astype(np.float32))
-            yield temp_audio_path
+            # Save or process each audio chunk as it is generated
+            yield sampling_rate, new_audio
 
     audio_segments = []
-    for audio_chunk_path in generate(text, description, chunk_size_in_s):
-        audio_segments.append(audio_chunk_path)
-        yield audio_chunk_path  # Yield each audio chunk path immediately
+    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+        audio_segments.append(audio_chunk)
+
+        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+        logging.debug(f"Saved chunk to {temp_audio_path}")
+
 
     # Combine all the audio chunks into one audio file
     combined_audio = np.concatenate(audio_segments)
    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
 
     logging.debug(f"Combined audio saved to {combined_audio_path}")
+    return combined_audio_path
+
+
+# #streaming code in chunk
+
+
+# import concurrent.futures
+# import tempfile
+# import os
+# import numpy as np
+# import logging
+# from queue import Queue
+# from threading import Thread
+# from scipy.io.wavfile import write as write_wav
+# from transformers import AutoTokenizer
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+
+# # Ensure your device is set to CUDA
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# repo_id = "parler-tts/parler-tts-mini-v1"
+
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )
+
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save each audio chunk as it is generated
+#             temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#             write_wav(temp_audio_path, sampling_rate, new_audio.astype(np.float32))
+#             yield temp_audio_path
+
+#     audio_segments = []
+#     for audio_chunk_path in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk_path)
+#         yield audio_chunk_path  # Yield each audio chunk path immediately
+
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
 
 
 
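The net effect of the last two hunks is that generate_audio_parler_tts no longer yields per-chunk WAV paths: it consumes the streamed (sampling_rate, chunk) pairs internally, logs each chunk to a temp file, and returns the path of one combined WAV. A minimal consumer sketch under that assumption (the import path and prompt string are hypothetical; parler_tts and a CUDA or CPU device are assumed available, as in the diff):

    # Hypothetical caller of the updated function (not part of the commit).
    # generate_audio_parler_tts now returns a single combined WAV path
    # instead of yielding chunk paths while it generates.
    from app import generate_audio_parler_tts  # assumed module name

    combined_path = generate_audio_parler_tts("Hello from Parler-TTS.")
    print(f"Combined audio written to: {combined_path}")

Any caller that iterated over the old generator (the yield-per-chunk behavior removed in the @@ -959,22 +892,97 @@ hunk) would need to adopt this single-return contract.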