Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -38,9 +38,6 @@ def parse_multilingual_text(input_text):
 def generate_segment_audio(text, lang, speaker_url, pipe):
     if not isinstance(text, str):
         text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
-
-    # Generating stoks (tokens<pl>) from text
-    # stoks = pipe.t2s.generate([text], lang=[lang])
     audio_data = pipe.generate(text, speaker_url, lang)
     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
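Note that the committed function computes audio_data_resampled but then prints and returns audio_np, which is never assigned in the lines shown, so the call would raise a NameError unless audio_np is defined elsewhere in app.py. The sketch below shows what the helper appears to intend — return the pipeline output as a float32 NumPy array — under the assumption that pipe.generate() already produces samples at 24 kHz (the diff feeds the resampler an input sample_rate of 24000, which makes that step a pass-through). The name generate_segment_audio_sketch and the squeeze step are illustrative, not part of the Space's code.

    import numpy as np

    def generate_segment_audio_sketch(text, lang, speaker_url, pipe):
        # Minimal sketch, not the Space's implementation: assumes pipe.generate()
        # returns a torch tensor of samples already at 24 kHz.
        if not isinstance(text, str):
            text = text.decode("utf-8") if isinstance(text, bytes) else str(text)

        audio_data = pipe.generate(text, speaker_url, lang)
        # Convert to float32 NumPy and drop a leading channel axis such as (1, N)
        # so later concatenation and normalisation see a plain 1-D signal.
        audio_np = np.squeeze(audio_data.detach().cpu().numpy()).astype(np.float32)
        print("Segment shape:", audio_np.shape)  # Debug statement
        return audio_np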
@@ -48,33 +45,10 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
     print("Shape after resampling:", audio_np.shape) # Debug statement
     return audio_np
 
-# Function to append and concatenate audio segments with padding
 def concatenate_audio_segments(segments):
-    # # Determine the length of the longest segment
-    # max_length = max(seg.shape[0] for seg in segments)
-    # print("Max length of segments:", max_length) # Debug statement
-    # # Pad each segment to the length of the longest segment and stack them
-    # padded_segments = []
-    # for seg in segments:
-    #     # Check if the segment is stereo; if not, convert it to stereo
-    #     if seg.ndim == 1 or seg.shape[1] == 1:
-    #         stereo_segment = np.stack((seg, seg), axis=-1)
-    #     else:
-    #         stereo_segment = seg
-
-    #     # Pad the segment to the max length
-    #     padding_length = max_length - stereo_segment.shape[0]
-    #     padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
-    #     print("Padded segment shape:", padded_segment.shape) # Debug statement
-    #     padded_segments.append(padded_segment)
-
     concatenated_audio = np.concatenate(segments , axis=1)
-
-    print("Concatenated audio shape:", concatenated_audio.shape) # Debug statement
-    # concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
     return concatenated_audio
 
-# The rest of the code in app.py remains the same
 
 @spaces.GPU
 def whisper_speech_demo(multilingual_text, speaker_audio):
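The surviving one-liner joins segments along axis=1, which requires every segment to be at least two-dimensional; 1-D mono arrays would make np.concatenate raise an axis-out-of-bounds error. A hedged alternative that coerces each segment to a (channels, samples) layout before joining along the time axis (the helper name is illustrative and not in app.py):

    import numpy as np

    def concatenate_audio_segments_sketch(segments):
        # Defensive variant of the committed one-liner: normalise every segment
        # to shape (channels, samples) before concatenating along the time axis.
        prepared = []
        for seg in segments:
            seg = np.asarray(seg)
            if seg.ndim == 1:               # mono vector -> (1, samples)
                seg = seg[np.newaxis, :]
            # 2-D inputs are assumed to already be (channels, samples), matching
            # how the commit treats axis=1 as time.
            prepared.append(seg)
        return np.concatenate(prepared, axis=1)

With a (channels, samples) result, the concatenated_audio.T passed to sf.write further down ends up in the (frames, channels) layout soundfile expects.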
@@ -94,10 +68,8 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
 
     concatenated_audio = concatenate_audio_segments(audio_segments)
     print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
-    # Normalize the concatenated audio
     concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
 
-    # Write the audio data to a temporary file and return the file path
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         sf.write(tmp_file.name, concatenated_audio.T, 24000, format='WAV', subtype='PCM_16')
         return tmp_file.name
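The commit keeps the peak normalisation and temporary-WAV export in whisper_speech_demo but drops their comments. Below is a small sketch of that step with a guard for an all-silent signal, since the committed line divides by np.max(np.abs(...)) unconditionally and would produce NaNs when the peak is zero; write_normalized_wav is an illustrative name, not a function in app.py.

    import tempfile

    import numpy as np
    import soundfile as sf

    def write_normalized_wav(concatenated_audio, sample_rate=24000):
        # Sketch of the normalise-and-save step, assuming concatenated_audio has
        # shape (channels, samples); .T gives soundfile the (frames, channels)
        # layout it expects.
        peak = np.max(np.abs(concatenated_audio))
        if peak > 0:                      # avoid dividing by zero on silence
            concatenated_audio = concatenated_audio / peak
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, concatenated_audio.T, sample_rate,
                     format='WAV', subtype='PCM_16')
            return tmp_file.name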
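For orientation, a purely illustrative sketch of how these pieces could fit together end to end. The commit shows only fragments of app.py, so the return format of parse_multilingual_text (assumed here to yield (lang, text) pairs) and the use of speaker_audio as the speaker reference passed to the pipeline are assumptions.

    def whisper_speech_demo_sketch(multilingual_text, speaker_audio, pipe):
        # Assumes parse_multilingual_text() yields (lang, text) pairs and that
        # speaker_audio can be passed straight through as the speaker reference.
        segments = [
            generate_segment_audio_sketch(text, lang, speaker_audio, pipe)
            for lang, text in parse_multilingual_text(multilingual_text)
        ]
        concatenated = concatenate_audio_segments_sketch(segments)
        return write_normalized_wav(concatenated)  # path to a 24 kHz PCM_16 WAV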