AnhP's picture
Upload 170 files
1e4a2ab verified
raw
history blame
4.16 kB
import os
import sys
import pysrt
import codecs
import librosa
import asyncio
import requests
import tempfile
import numpy as np
import soundfile as sf
from edge_tts import Communicate
sys.path.append(os.getcwd())
from main.app.variables import translations
from main.app.core.ui import gr_info, gr_warning, gr_error
def synthesize_tts(prompt, voice, speed, output, pitch, google):
    """Synthesize speech for *prompt* into the audio file *output*.

    Args:
        prompt: Text to speak.
        voice: edge-tts voice id, or a language code for the Google backend.
        speed: Speaking-rate delta in percent (0 = normal speed).
        output: Destination audio file path.
        pitch: Pitch delta in Hz (0 = normal pitch).
        google: If True use Google Translate TTS, otherwise edge-tts.
    """
    if not google:
        # edge-tts expects signed percent / Hz strings such as "+10%" or "-2Hz".
        asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output))
    else:
        # URL is rot13-obfuscated: decodes to the Google Translate TTS endpoint.
        response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"})
        if response.status_code == 200:
            with open(output, "wb") as f:
                f.write(response.content)
            # Google TTS does not apply pitch and handles speed differently, so
            # post-process the downloaded audio when either adjustment was requested.
            if pitch != 0 or speed != 0:
                y, sr = librosa.load(output, sr=None)
                # NOTE(review): pitch is given in Hz (see the edge-tts branch) while
                # pitch_shift expects semitone steps — kept as-is to preserve behavior;
                # confirm the intended unit with callers.
                if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
                # BUGFIX: speed is a percent delta, but time_stretch expects a
                # multiplicative rate (1.0 = unchanged). The old code passed the raw
                # percent (e.g. 10 -> 10x faster) and crashed on negative values.
                if speed != 0: y = librosa.effects.time_stretch(y, rate=1.0 + speed / 100.0)
                sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
        else:
            gr_error(f"{response.status_code}, {response.text}")
def time_stretch(y, sr, target_duration):
    """Stretch or compress audio *y* (sampled at *sr*) to last *target_duration* seconds.

    The result is padded with trailing zeros or truncated so its length is exactly
    round(target_duration * sr) samples.
    """
    source_duration = len(y) / sr
    rate = source_duration / target_duration
    # Only invoke librosa when an actual stretch is needed.
    if rate != 1.0:
        y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)
    wanted = int(round(target_duration * sr))
    if len(y) < wanted:
        return np.pad(y, (0, wanted - len(y)))
    return y[:wanted]
def pysrttime_to_seconds(t):
    """Convert a pysrt SubRipTime-like value (hours/minutes/seconds/milliseconds) to float seconds."""
    total_minutes = t.hours * 60 + t.minutes
    return total_minutes * 60 + t.seconds + t.milliseconds / 1000
def srt_tts(srt_file, out_file, voice, rate=0, sr=24000, google=False):
    """Render every cue of an SRT file as speech and mix it into one audio track.

    Each subtitle is synthesized to a temp wav, resampled to *sr* if needed,
    time-stretched to the cue's duration, and added at the cue's start offset.
    The mixed track is written to *out_file*.
    """
    subs = pysrt.open(srt_file)
    if not subs:
        raise ValueError(translations["srt"])
    # The output buffer spans from t=0 up to the end of the final cue.
    total_samples = int(round(pysrttime_to_seconds(subs[-1].end) * sr))
    mix = np.zeros(total_samples, dtype=np.float32)
    with tempfile.TemporaryDirectory() as tempdir:
        for idx, seg in enumerate(subs):
            seg_path = os.path.join(tempdir, f"seg_{idx}.wav")
            synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, seg_path, rate, google)
            audio, file_sr = sf.read(seg_path, dtype=np.float32)
            if file_sr != sr:
                # Crude linear-interpolation resample to the target rate.
                new_len = int(len(audio) * sr / file_sr)
                audio = np.interp(np.linspace(0, len(audio) - 1, new_len), np.arange(len(audio)), audio)
            fitted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))
            start = int(round(pysrttime_to_seconds(seg.start) * sr))
            stop = start + fitted.shape[0]
            if stop > total_samples:
                # Clip anything that would run past the end of the buffer.
                fitted = fitted[: total_samples - start]
                stop = total_samples
            mix[start:stop] += fitted
    sf.write(out_file, mix, sr)
def TTS(prompt, voice, speed, output, pitch, google, srt_input):
    """UI entry point: synthesize *prompt* (or an entire SRT file) to *output*.

    Validates inputs, prepares the output path, then dispatches to srt_tts for
    ".srt" inputs or synthesize_tts for plain text.

    Returns:
        The output file path on success, or None when validation fails.
    """
    if not srt_input:
        srt_input = ""
    if not prompt and not srt_input.endswith(".srt"):
        gr_warning(translations["enter_the_text"])
        return None
    if not voice:
        gr_warning(translations["choose_voice"])
        return None
    if not output:
        gr_warning(translations["output_not_valid"])
        return None
    # A directory target gets a default filename appended.
    if os.path.isdir(output):
        output = os.path.join(output, "tts.wav")
    gr_info(translations["convert"].format(name=translations["text"]))
    # BUGFIX: the old code used `os.path.dirname(output) or output`, so a bare
    # filename (empty dirname) created a DIRECTORY named after the output file,
    # making the subsequent audio write fail. Only create a real parent directory.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    if srt_input.endswith(".srt"):
        # SRT mode: timing comes from the cues; speed/pitch are not applied here.
        srt_tts(srt_input, output, voice, 0, 24000, google)
    else:
        synthesize_tts(prompt, voice, speed, output, pitch, google)
    gr_info(translations["success"])
    return output