import os
import sys
import pysrt
import codecs
import librosa
import asyncio
import requests
import tempfile

import numpy as np
import soundfile as sf

from edge_tts import Communicate

sys.path.append(os.getcwd())

from main.app.variables import translations
from main.app.core.ui import gr_info, gr_warning, gr_error


def synthesize_tts(prompt, voice, speed, output, pitch, google):
    """Synthesize ``prompt`` to the audio file ``output``.

    Args:
        prompt: Text to speak.
        voice: Voice name for edge-tts, or the ``tl`` query value for the
            Google endpoint.
        speed: Sent to edge-tts as a signed percentage (``"+N%"``/``"N%"``).
            On the Google path it is sent as ``ttsspeed`` and, when non-zero,
            also used as the librosa time-stretch rate.
        output: Destination file path.
        pitch: Sent to edge-tts as signed Hz; on the Google path it is used
            as ``n_steps`` for ``librosa.effects.pitch_shift``.
        google: When truthy use the Google endpoint, otherwise edge-tts.

    On a non-200 response from the Google endpoint nothing is written and
    the status code and body are passed to ``gr_error``.
    """
    if not google:
        # edge-tts expects explicitly signed strings for rate and pitch.
        rate_arg = f"+{speed}%" if speed >= 0 else f"{speed}%"
        pitch_arg = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
        asyncio.run(
            Communicate(text=prompt, voice=voice, rate=rate_arg, pitch=pitch_arg).save(output)
        )
        return

    response = requests.get(
        # The endpoint URL is stored ROT13-encoded and decoded at call time.
        codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"),
        params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"},
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"},
    )

    if response.status_code != 200:
        gr_error(f"{response.status_code}, {response.text}")
        return

    with open(output, "wb") as f:
        f.write(response.content)

    if pitch != 0 or speed != 0:
        y, sr = librosa.load(output, sr=None)

        if pitch != 0:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
        # NOTE(review): `speed` is used directly as the stretch rate here;
        # librosa presumably rejects non-positive rates - confirm the value
        # range callers pass on the Google path.
        if speed != 0:
            y = librosa.effects.time_stretch(y, rate=speed)

        # The container format is taken from the output file extension.
        sf.write(
            file=output,
            data=y,
            samplerate=sr,
            format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''),
        )


def time_stretch(y, sr, target_duration):
    """Stretch or compress ``y`` so it lasts exactly ``target_duration`` seconds.

    Args:
        y: 1-D array of audio samples.
        sr: Sample rate of ``y`` in Hz.
        target_duration: Desired length in seconds.

    Returns:
        An array of exactly ``round(target_duration * sr)`` samples; the
        stretched signal is zero-padded or truncated to that length. An
        empty input or a non-positive target yields silence of the target
        length instead of raising.
    """
    n_target = max(int(round(target_duration * sr)), 0)

    # A zero target would divide by zero below, and an empty clip would give
    # a stretch rate of 0; neither has anything to stretch.
    if n_target == 0 or len(y) == 0:
        return np.zeros(n_target, dtype=np.float32)

    rate = (len(y) / sr) / target_duration
    if rate != 1.0:
        y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)

    # The stretched length is only approximate; force the exact sample count.
    if len(y) < n_target:
        return np.pad(y, (0, n_target - len(y)))
    return y[:n_target]


def pysrttime_to_seconds(t):
    """Convert an object with hours/minutes/seconds/milliseconds to seconds."""
    return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000


def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
    """Render every subtitle of ``srt_file`` as speech placed at its timestamp.

    Each subtitle is synthesized on its own, fitted to the subtitle's
    duration, and mixed into one buffer that is written to ``out_file``.

    Args:
        srt_file: Path of the subtitle file opened with ``pysrt.open``.
        out_file: Path of the audio file to write.
        voice: Voice passed through to ``synthesize_tts``.
        rate: Forwarded to ``synthesize_tts`` (see the note in the loop).
        sr: Sample rate of the output buffer in Hz.
        google: Passed through to ``synthesize_tts``.

    Raises:
        ValueError: If the subtitle file contains no entries.
    """
    subs = pysrt.open(srt_file)
    if not subs:
        raise ValueError(translations["srt"])

    # Size the buffer from the latest end time rather than the last entry so
    # that out-of-order subtitles still fit.
    total_seconds = max(pysrttime_to_seconds(sub.end) for sub in subs)
    final_audio = np.zeros(int(round(total_seconds * sr)), dtype=np.float32)

    with tempfile.TemporaryDirectory() as tempdir:
        for idx, seg in enumerate(subs):
            wav_path = os.path.join(tempdir, f"seg_{idx}.wav")

            # NOTE(review): `rate` lands in synthesize_tts's `pitch` position
            # and speed is fixed at 0; the duration is enforced afterwards by
            # time_stretch. Confirm that forwarding `rate` as pitch is intended.
            synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)

            audio, file_sr = sf.read(wav_path, dtype=np.float32)
            if file_sr != sr:
                # Linear-interpolation resample to the output rate.
                # Assumes mono (1-D) audio - TODO confirm for all voices.
                audio = np.interp(
                    np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)),
                    np.arange(len(audio)),
                    audio,
                )

            adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))
            start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))

            # Clip to the space left in the buffer; skip segments with none.
            room = final_audio.shape[0] - start_sample
            if room <= 0:
                continue
            adjusted = adjusted[:room]

            final_audio[start_sample:start_sample + adjusted.shape[0]] += adjusted

    sf.write(out_file, final_audio, sr)


def TTS(prompt, voice, speed, output, pitch, google, srt_input):
    """Validate the inputs and run text or subtitle speech synthesis.

    Args:
        prompt: Text to speak; ignored when ``srt_input`` ends with ".srt".
        voice: Voice identifier passed to the synthesizer.
        speed: Speed value for plain-text synthesis.
        output: Output file path, or an existing directory in which
            ``tts.wav`` is created.
        pitch: Pitch value for plain-text synthesis.
        google: When truthy use the Google endpoint instead of edge-tts.
        srt_input: Optional path to a ".srt" file; may be ``None``.

    Returns:
        The output file path, or ``None`` when validation fails (a warning
        is reported through ``gr_warning`` in that case).
    """
    if not srt_input:
        srt_input = ""
    use_srt = srt_input.endswith(".srt")

    if not prompt and not use_srt:
        gr_warning(translations["enter_the_text"])
        return None

    if not voice:
        gr_warning(translations["choose_voice"])
        return None

    if not output:
        gr_warning(translations["output_not_valid"])
        return None

    if os.path.isdir(output):
        output = os.path.join(output, "tts.wav")

    gr_info(translations["convert"].format(name=translations["text"]))

    # Only create the parent directory when the path has one; a bare file
    # name must not be turned into a directory of the same name.
    output_dir = os.path.dirname(output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    if use_srt:
        srt_tts(srt_input, output, voice, 0, 24000, google)
    else:
        synthesize_tts(prompt, voice, speed, output, pitch, google)

    gr_info(translations["success"])
    return output