Pijush2023 committed · verified
Commit a4a37b4 · 1 Parent(s): b361b0f

Update app.py

Files changed (1)
  1. app.py +87 -87
app.py CHANGED
@@ -885,41 +885,41 @@ def fetch_local_news():
     else:
         return "<p>Failed to fetch local news</p>"

-import numpy as np
-import torch
-from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+# import numpy as np
+# import torch
+# from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

-model_id = 'openai/whisper-large-v3'
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
-processor = AutoProcessor.from_pretrained(model_id)
+# model_id = 'openai/whisper-large-v3'
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+# processor = AutoProcessor.from_pretrained(model_id)

-pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
+# pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)

-base_audio_drive = "/data/audio"
+# base_audio_drive = "/data/audio"

-#Normal Code with sample rate is 44100 Hz
+# #Normal Code with sample rate is 44100 Hz

-def transcribe_function(stream, new_chunk):
-    try:
-        sr, y = new_chunk[0], new_chunk[1]
-    except TypeError:
-        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
-        return stream, "", None
+# def transcribe_function(stream, new_chunk):
+#     try:
+#         sr, y = new_chunk[0], new_chunk[1]
+#     except TypeError:
+#         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+#         return stream, "", None

-    y = y.astype(np.float32) / np.max(np.abs(y))
+#     y = y.astype(np.float32) / np.max(np.abs(y))

-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
+#     if stream is not None:
+#         stream = np.concatenate([stream, y])
+#     else:
+#         stream = y

-    result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+#     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)

-    full_text = result.get("text","")
+#     full_text = result.get("text","")

-    return stream, full_text, result
+#     return stream, full_text, result



@@ -994,76 +994,76 @@ def generate_audio_elevenlabs(text):

 # chunking audio and then Process

-import concurrent.futures
-import tempfile
-import os
-import numpy as np
-import logging
-from queue import Queue
-from threading import Thread
-from scipy.io.wavfile import write as write_wav
-from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-from transformers import AutoTokenizer
-
-# Ensure your device is set to CUDA
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-repo_id = "parler-tts/parler-tts-mini-v1"
-
-def generate_audio_parler_tts(text):
-    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-    chunk_size_in_s = 0.5
-
-    # Initialize the tokenizer and model
-    parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-    parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-    sampling_rate = parler_model.audio_encoder.config.sampling_rate
-    frame_rate = parler_model.audio_encoder.config.frame_rate
-
-    def generate(text, description, play_steps_in_s=0.5):
-        play_steps = int(frame_rate * play_steps_in_s)
-        streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-        inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-        prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-        generation_kwargs = dict(
-            input_ids=inputs.input_ids,
-            prompt_input_ids=prompt.input_ids,
-            attention_mask=inputs.attention_mask,
-            prompt_attention_mask=prompt.attention_mask,
-            streamer=streamer,
-            do_sample=True,
-            temperature=1.0,
-            min_new_tokens=10,
-        )
+# import concurrent.futures
+# import tempfile
+# import os
+# import numpy as np
+# import logging
+# from queue import Queue
+# from threading import Thread
+# from scipy.io.wavfile import write as write_wav
+# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+# from transformers import AutoTokenizer
+
+# # Ensure your device is set to CUDA
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# repo_id = "parler-tts/parler-tts-mini-v1"
+
+# def generate_audio_parler_tts(text):
+#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+#     chunk_size_in_s = 0.5
+
+#     # Initialize the tokenizer and model
+#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+#     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+#     def generate(text, description, play_steps_in_s=0.5):
+#         play_steps = int(frame_rate * play_steps_in_s)
+#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+#         generation_kwargs = dict(
+#             input_ids=inputs.input_ids,
+#             prompt_input_ids=prompt.input_ids,
+#             attention_mask=inputs.attention_mask,
+#             prompt_attention_mask=prompt.attention_mask,
+#             streamer=streamer,
+#             do_sample=True,
+#             temperature=1.0,
+#             min_new_tokens=10,
+#         )

-        thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-        thread.start()
+#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+#         thread.start()

-        for new_audio in streamer:
-            if new_audio.shape[0] == 0:
-                break
-            # Save or process each audio chunk as it is generated
-            yield sampling_rate, new_audio
+#         for new_audio in streamer:
+#             if new_audio.shape[0] == 0:
+#                 break
+#             # Save or process each audio chunk as it is generated
+#             yield sampling_rate, new_audio

-    audio_segments = []
-    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-        audio_segments.append(audio_chunk)
+#     audio_segments = []
+#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+#         audio_segments.append(audio_chunk)

-        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-        logging.debug(f"Saved chunk to {temp_audio_path}")
+#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+#         logging.debug(f"Saved chunk to {temp_audio_path}")


-    # Combine all the audio chunks into one audio file
-    combined_audio = np.concatenate(audio_segments)
-    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+#     # Combine all the audio chunks into one audio file
+#     combined_audio = np.concatenate(audio_segments)
+#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")

-    write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))

-    logging.debug(f"Combined audio saved to {combined_audio_path}")
-    return combined_audio_path
+#     logging.debug(f"Combined audio saved to {combined_audio_path}")
+#     return combined_audio_path


 def fetch_local_events():
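
Note on the first hunk: the transcribe_function being commented out implements an accumulate-and-retranscribe loop: each incoming (sample_rate, samples) chunk is normalized, appended to the running stream buffer, and the whole buffer is re-run through the Whisper pipeline. Below is a minimal self-contained sketch of that pattern; the synthetic sine-wave chunks and the run_asr stub are illustrative assumptions, not code from app.py.

import numpy as np

def run_asr(stream, sr):
    # Stub standing in for pipe_asr({"array": stream, "sampling_rate": sr});
    # the real pipeline returns a dict with a "text" key.
    return {"text": f"<{len(stream) / sr:.1f}s of audio transcribed>"}

def transcribe_step(stream, new_chunk):
    # Mirrors the commented-out transcribe_function: normalize the new chunk,
    # append it to the running buffer, and re-transcribe the whole buffer.
    sr, y = new_chunk
    y = y.astype(np.float32) / np.max(np.abs(y))
    stream = y if stream is None else np.concatenate([stream, y])
    result = run_asr(stream, sr)
    return stream, result.get("text", ""), result

sr = 16000  # assumed sample rate for the synthetic chunks
t = np.linspace(0, 0.5, sr // 2, endpoint=False)
chunk = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

stream, text = None, ""
for _ in range(3):  # feed three half-second chunks
    stream, text, _ = transcribe_step(stream, (sr, chunk))
print(text)  # "<1.5s of audio transcribed>"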
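
Note on the second hunk: the generate_audio_parler_tts function being commented out follows a producer/consumer streaming pattern: parler_model.generate runs in a background Thread and pushes audio chunks through a ParlerTTSStreamer, while the consumer drains the chunks, saves each one, and concatenates them into a single WAV. The sketch below reproduces that flow with a queue-backed stub in place of the model, so it runs without downloading parler-tts/parler-tts-mini-v1; the stub generator and the 22050 Hz placeholder rate are assumptions.

import os
import tempfile
import numpy as np
from queue import Queue
from threading import Thread
from scipy.io.wavfile import write as write_wav

SAMPLING_RATE = 22050  # placeholder; the real value comes from parler_model.audio_encoder.config

def fake_generate(out_queue, n_chunks=3):
    # Stub standing in for parler_model.generate(..., streamer=streamer):
    # it emits a few audio chunks, then an empty array as the end-of-stream marker.
    t = np.linspace(0, 0.5, SAMPLING_RATE // 2, endpoint=False)
    for i in range(n_chunks):
        out_queue.put((0.1 * np.sin(2 * np.pi * 220 * (i + 1) * t)).astype(np.float32))
    out_queue.put(np.empty(0, dtype=np.float32))

chunk_queue = Queue()
thread = Thread(target=fake_generate, args=(chunk_queue,))
thread.start()

audio_segments = []
while True:
    new_audio = chunk_queue.get()
    if new_audio.shape[0] == 0:  # same end-of-stream check as the removed code
        break
    audio_segments.append(new_audio)
thread.join()

combined_audio = np.concatenate(audio_segments)
combined_audio_path = os.path.join(tempfile.gettempdir(), "combined_audio_stream.wav")
write_wav(combined_audio_path, SAMPLING_RATE, combined_audio)  # float32 data -> 32-bit float WAV
print(f"Combined audio saved to {combined_audio_path}")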