bcci committed on
Commit
c6817ce
·
verified ·
1 Parent(s): f5d0fca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -18,6 +18,7 @@ from onnxruntime import InferenceSession
18
  from huggingface_hub import snapshot_download
19
 
20
  import json
 
21
 
22
  # Load the configuration file
23
  config_file_path = 'config.json' # Update this with the path to your config file
@@ -162,11 +163,13 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
162
  # Module-level G2P converter instance (en.G2P); tokenizer() calls it to obtain the phoneme string for a text.
  g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
163
 
164
  def tokenizer(text):
 
165
  phonemes_string, _ = g2p(text)
166
  phonemes = []
167
  for i in phonemes_string:
168
  phonemes.append(i)
169
  tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
 
170
  return tokens
171
 
172
 
@@ -240,8 +243,6 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
240
 
241
  final_token = [[0, *tokens, 0]]
242
 
243
- print(final_token)
244
-
245
  full_audio = sess.run(None, dict(
246
  input_ids=final_token,
247
  style=ref_s,
@@ -250,17 +251,17 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
250
 
251
  # Write the concatenated audio to an in-memory WAV or Opus file.
252
  sample_rate = 24000
253
- num_channels = 1
254
- sample_width = 2 # 16-bit PCM -> 2 bytes per sample
255
  if format.lower() == "wav":
 
 
256
  wav_io = io.BytesIO()
257
- with wave.open(wav_io, "wb") as wav_file:
258
- wav_file.setnchannels(num_channels)
259
- wav_file.setsampwidth(sample_width)
260
- wav_file.setframerate(sample_rate)
261
- full_audio_int16 = np.int16(full_audio * 32767)
262
- wav_file.writeframes(full_audio_int16.tobytes())
263
  wav_io.seek(0)
 
264
  return Response(content=wav_io.read(), media_type="audio/wav")
265
  elif format.lower() == "opus":
266
  opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
 
18
  from huggingface_hub import snapshot_download
19
 
20
  import json
21
+ from scipy.io.wavfile import write as write_wav
22
 
23
  # Load the configuration file
24
  config_file_path = 'config.json' # Update this with the path to your config file
 
163
  # Module-level G2P converter instance (en.G2P); tokenizer() calls it to obtain the phoneme string for a text.
  g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
164
 
165
  def tokenizer(text):
166
+ print("Text": text)
167
  phonemes_string, _ = g2p(text)
168
  phonemes = []
169
  for i in phonemes_string:
170
  phonemes.append(i)
171
  tokens = [phoneme_vocab[phoneme] for phoneme in phonemes if phoneme in phoneme_vocab]
172
+ print(tokens)
173
  return tokens
174
 
175
 
 
243
 
244
  final_token = [[0, *tokens, 0]]
245
 
 
 
246
  full_audio = sess.run(None, dict(
247
  input_ids=final_token,
248
  style=ref_s,
 
251
 
252
  # Write the concatenated audio to an in-memory WAV or Opus file.
253
  sample_rate = 24000
 
 
254
  if format.lower() == "wav":
255
+
256
+ # Create an in-memory buffer
257
  wav_io = io.BytesIO()
258
+
259
+ # Write the audio data to the buffer in WAV format
260
+ write_wav(wav_io, sample_rate, audio)
261
+
262
+ # Seek to the beginning of the buffer
 
263
  wav_io.seek(0)
264
+
265
  return Response(content=wav_io.read(), media_type="audio/wav")
266
  elif format.lower() == "opus":
267
  opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)