Tonic committed on
Commit
a175fb2
·
verified ·
1 Parent(s): ce9c685

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -28
app.py CHANGED
@@ -38,9 +38,6 @@ def parse_multilingual_text(input_text):
38
  def generate_segment_audio(text, lang, speaker_url, pipe):
39
  if not isinstance(text, str):
40
  text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
41
-
42
- # Generating stoks (tokens<pl>) from text
43
- # stoks = pipe.t2s.generate([text], lang=[lang])
44
  audio_data = pipe.generate(text, speaker_url, lang)
45
  resample_audio = resampler(newsr=24000)
46
  audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
@@ -48,33 +45,10 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
48
  print("Shape after resampling:", audio_np.shape) # Debug statement
49
  return audio_np
50
 
51
- # Function to append and concatenate audio segments with padding
52
  def concatenate_audio_segments(segments):
53
- # # Determine the length of the longest segment
54
- # max_length = max(seg.shape[0] for seg in segments)
55
- # print("Max length of segments:", max_length) # Debug statement
56
- # # Pad each segment to the length of the longest segment and stack them
57
- # padded_segments = []
58
- # for seg in segments:
59
- # # Check if the segment is stereo; if not, convert it to stereo
60
- # if seg.ndim == 1 or seg.shape[1] == 1:
61
- # stereo_segment = np.stack((seg, seg), axis=-1)
62
- # else:
63
- # stereo_segment = seg
64
-
65
- # Pad the segment to the max length
66
- # padding_length = max_length - stereo_segment.shape[0]
67
- # padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
68
- # print("Padded segment shape:", padded_segment.shape) # Debug statement
69
- # padded_segments.append(padded_segment)
70
-
71
  concatenated_audio = np.concatenate(segments , axis=1)
72
-
73
- print("Concatenated audio shape:", concatenated_audio.shape) # Debug statement
74
- # concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
75
  return concatenated_audio
76
 
77
- # The rest of the code in app.py remains the same
78
 
79
  @spaces.GPU
80
  def whisper_speech_demo(multilingual_text, speaker_audio):
@@ -94,10 +68,8 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
94
 
95
  concatenated_audio = concatenate_audio_segments(audio_segments)
96
  print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
97
- # Normalize the concatenated audio
98
  concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
99
 
100
- # Write the audio data to a temporary file and return the file path
101
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
102
  sf.write(tmp_file.name, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
103
  return tmp_file.name
 
38
  def generate_segment_audio(text, lang, speaker_url, pipe):
39
  if not isinstance(text, str):
40
  text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
 
 
 
41
  audio_data = pipe.generate(text, speaker_url, lang)
42
  resample_audio = resampler(newsr=24000)
43
  audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
 
45
  print("Shape after resampling:", audio_np.shape) # Debug statement
46
  return audio_np
47
 
 
48
  def concatenate_audio_segments(segments):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  concatenated_audio = np.concatenate(segments , axis=1)
 
 
 
50
  return concatenated_audio
51
 
 
52
 
53
  @spaces.GPU
54
  def whisper_speech_demo(multilingual_text, speaker_audio):
 
68
 
69
  concatenated_audio = concatenate_audio_segments(audio_segments)
70
  print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
 
71
  concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
72
 
 
73
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
74
  sf.write(tmp_file.name, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
75
  return tmp_file.name