kokoro-onnx-api-test

Running

App Files Files Community

bcci committed on Feb 8

Commit

c6817ce

verified ·

1 Parent(s): f5d0fca

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -10

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ from onnxruntime import InferenceSession
 from huggingface_hub import snapshot_download
 import json
 # Load the configuration file
 config_file_path = 'config.json'  # Update this with the path to your config file
@@ -162,11 +163,13 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
 g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
 def tokenizer(text):
     """Convert text into a list of vocabulary token IDs via grapheme-to-phoneme conversion."""
     # g2p returns a pair; only the first element (the phoneme string) is used here.
     phonemes_string, _ = g2p(text)
     # Walk the phoneme string one character at a time and look each one up in
     # phoneme_vocab. Characters missing from the vocabulary are silently
     # dropped, so the result may be shorter than the phoneme string.
     return [
         phoneme_vocab[symbol]
         for symbol in phonemes_string
         if symbol in phoneme_vocab
     ]
@@ -240,8 +243,6 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
     final_token = [[0, *tokens, 0]]
-    print(final_token)
     full_audio = sess.run(None, dict(
         input_ids=final_token,
         style=ref_s,
@@ -250,17 +251,17 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
     # Write the concatenated audio to an in-memory WAV or Opus file.
     sample_rate = 24000
-    num_channels = 1
-    sample_width = 2  # 16-bit PCM -> 2 bytes per sample
     if format.lower() == "wav":
         wav_io = io.BytesIO()
-        with wave.open(wav_io, "wb") as wav_file:
-            wav_file.setnchannels(num_channels)
-            wav_file.setsampwidth(sample_width)
-            wav_file.setframerate(sample_rate)
-            full_audio_int16 = np.int16(full_audio * 32767)
-            wav_file.writeframes(full_audio_int16.tobytes())
         wav_io.seek(0)
         return Response(content=wav_io.read(), media_type="audio/wav")
     elif format.lower() == "opus":
         opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)

 from huggingface_hub import snapshot_download
 import json
+from scipy.io.wavfile import write as write_wav
 # Load the configuration file
 config_file_path = 'config.json'  # Update this with the path to your config file
 g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
 def tokenizer(text):
+    print("Text:", text)
     phonemes_string, _ = g2p(text)
     phonemes = []
     for i in phonemes_string:
         phonemes.append(i)
     tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
+    print(tokens)
     return tokens
     final_token = [[0, *tokens, 0]]
     full_audio = sess.run(None, dict(
         input_ids=final_token,
         style=ref_s,
     # Write the concatenated audio to an in-memory WAV or Opus file.
     sample_rate = 24000
     if format.lower() == "wav":
+        # Create an in-memory buffer
         wav_io = io.BytesIO()
+        # Write the audio data to the buffer in WAV format
+        write_wav(wav_io, sample_rate, audio)
+        # Seek to the beginning of the buffer
         wav_io.seek(0)
         return Response(content=wav_io.read(), media_type="audio/wav")
     elif format.lower() == "opus":
         opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)