Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -18,6 +18,7 @@ from onnxruntime import InferenceSession
|
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
20 |
import json
|
|
|
21 |
|
22 |
# Load the configuration file
|
23 |
config_file_path = 'config.json' # Update this with the path to your config file
|
@@ -162,11 +163,13 @@ def audio_tensor_to_opus_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24
|
|
162 |
g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
|
163 |
|
164 |
def tokenizer(text):
    """Convert ``text`` into a list of phoneme-vocabulary token ids.

    The text is run through the module-level ``g2p`` converter, which
    returns a pair whose first item is the phoneme string. Each symbol of
    that string is looked up in ``phoneme_vocab``; symbols that are not in
    the vocabulary are dropped.

    Args:
        text: Input text to convert.

    Returns:
        A list with one ``phoneme_vocab`` value per recognised symbol, in
        the order the symbols appear in the phoneme string.
    """
    phoneme_text, _unused = g2p(text)
    token_ids = []
    for symbol in phoneme_text:
        # Unknown symbols are skipped rather than raising KeyError.
        if symbol in phoneme_vocab:
            token_ids.append(phoneme_vocab[symbol])
    return token_ids
|
171 |
|
172 |
|
@@ -240,8 +243,6 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
|
|
240 |
|
241 |
final_token = [[0, *tokens, 0]]
|
242 |
|
243 |
-
print(final_token)
|
244 |
-
|
245 |
full_audio = sess.run(None, dict(
|
246 |
input_ids=final_token,
|
247 |
style=ref_s,
|
@@ -250,17 +251,17 @@ def tts_full(text: str, voice: str = "af_heart", speed: float = 1.0, format: str
|
|
250 |
|
251 |
# Write the concatenated audio to an in-memory WAV or Opus file.
|
252 |
sample_rate = 24000
|
253 |
-
num_channels = 1
|
254 |
-
sample_width = 2 # 16-bit PCM -> 2 bytes per sample
|
255 |
if format.lower() == "wav":
|
|
|
|
|
256 |
wav_io = io.BytesIO()
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
wav_file.writeframes(full_audio_int16.tobytes())
|
263 |
wav_io.seek(0)
|
|
|
264 |
return Response(content=wav_io.read(), media_type="audio/wav")
|
265 |
elif format.lower() == "opus":
|
266 |
opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
|
|
|
18 |
from huggingface_hub import snapshot_download
|
19 |
|
20 |
import json
|
21 |
+
from scipy.io.wavfile import write as write_wav
|
22 |
|
23 |
# Load the configuration file
|
24 |
config_file_path = 'config.json' # Update this with the path to your config file
|
|
|
163 |
g2p = en.G2P(trf=False, british=False, fallback=None) # no transformer, American English
|
164 |
|
165 |
def tokenizer(text):
    """Convert ``text`` into a list of phoneme-vocabulary token ids.

    The text is run through the module-level ``g2p`` converter, which
    returns a pair whose first item is the phoneme string. Each symbol of
    that string is looked up in ``phoneme_vocab``; symbols that are not in
    the vocabulary are dropped. The input text and the resulting tokens
    are printed to stdout for debugging.

    Args:
        text: Input text to convert.

    Returns:
        A list with one ``phoneme_vocab`` value per recognised symbol, in
        the order the symbols appear in the phoneme string.
    """
    # The original `print("Text": text)` was a SyntaxError that prevented
    # the whole module from importing; a comma separates print arguments.
    print("Text:", text)
    phonemes_string, _ = g2p(text)
    # Iterate the phoneme string directly; copying it into a list first
    # is unnecessary. Unknown symbols are skipped rather than raising.
    tokens = [
        phoneme_vocab[phoneme]
        for phoneme in phonemes_string
        if phoneme in phoneme_vocab
    ]
    print(tokens)
    return tokens
|
174 |
|
175 |
|
|
|
243 |
|
244 |
final_token = [[0, *tokens, 0]]
|
245 |
|
|
|
|
|
246 |
full_audio = sess.run(None, dict(
|
247 |
input_ids=final_token,
|
248 |
style=ref_s,
|
|
|
251 |
|
252 |
# Write the concatenated audio to an in-memory WAV or Opus file.
|
253 |
sample_rate = 24000
|
|
|
|
|
254 |
if format.lower() == "wav":
|
255 |
+
|
256 |
+
# Create an in-memory buffer
|
257 |
wav_io = io.BytesIO()
|
258 |
+
|
259 |
+
# Write the audio data to the buffer in WAV format
|
260 |
+
write_wav(wav_io, sample_rate, audio)
|
261 |
+
|
262 |
+
# Seek to the beginning of the buffer
|
|
|
263 |
wav_io.seek(0)
|
264 |
+
|
265 |
return Response(content=wav_io.read(), media_type="audio/wav")
|
266 |
elif format.lower() == "opus":
|
267 |
opus_data = audio_tensor_to_opus_bytes(torch.from_numpy(full_audio), sample_rate=sample_rate)
|