File size: 4,156 Bytes
1e4a2ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import sys
import pysrt
import codecs
import librosa
import asyncio
import requests
import tempfile

import numpy as np
import soundfile as sf

from edge_tts import Communicate

sys.path.append(os.getcwd())

from main.app.variables import translations
from main.app.core.ui import gr_info, gr_warning, gr_error

def synthesize_tts(prompt, voice, speed, output, pitch, google):
    """Synthesize speech for *prompt* and write it to the *output* audio file.

    Args:
        prompt: Text to speak.
        voice: edge-tts voice id, or a language code on the Google path.
        speed: Speed delta in percent (e.g. 10 = +10%, -20 = -20%).
        output: Destination audio file path.
        pitch: Pitch delta in Hz on the edge-tts path; on the Google path it
            is fed to librosa's pitch shifter (see note below).
        google: When True, fetch audio from the Google Translate TTS endpoint
            instead of edge-tts.
    """
    if not google:
        # edge-tts wants signed "rate"/"pitch" strings such as "+10%" / "-5Hz".
        asyncio.run(
            Communicate(
                text=prompt,
                voice=voice,
                rate=f"+{speed}%" if speed >= 0 else f"{speed}%",
                pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz",
            ).save(output)
        )
        return

    # The URL is rot13-obfuscated; it decodes to the Google Translate TTS endpoint.
    response = requests.get(
        codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"),
        params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"},
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"},
    )

    if response.status_code != 200:
        gr_error(f"{response.status_code}, {response.text}")
        return

    with open(output, "wb") as f:
        f.write(response.content)

    # Google returns audio at fixed pitch/speed, so post-process if requested.
    if pitch != 0 or speed != 0:
        y, sr = librosa.load(output, sr=None)

        # NOTE(review): pitch is expressed in Hz by the edge-tts path, but
        # librosa's n_steps is in semitones — kept as-is to preserve existing
        # behaviour; confirm the intended unit.
        if pitch != 0:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)

        if speed != 0:
            # BUGFIX: speed is a percent delta, but the old code passed it
            # directly as the stretch rate (speed=50 meant 50x playback).
            # Convert percent -> multiplier and only stretch for valid rates.
            stretch_rate = 1.0 + speed / 100.0
            if stretch_rate > 0:
                y = librosa.effects.time_stretch(y, rate=stretch_rate)

        sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))

def time_stretch(y, sr, target_duration):
    """Stretch audio *y* (1-D array at sample rate *sr*) to *target_duration* seconds.

    Time-stretches via librosa so the duration approximately matches, then
    pads with trailing silence or truncates so the result is exactly
    ``round(target_duration * sr)`` samples long.

    Returns:
        A float32-compatible 1-D array of exactly the target sample count.
    """
    n_target = int(round(target_duration * sr))

    # Guard: empty input (or a non-positive target) would make the stretch
    # rate <= 0 and crash librosa.effects.time_stretch — return silence.
    if len(y) == 0 or n_target <= 0:
        return np.zeros(max(n_target, 0), dtype=np.float32)

    rate = (len(y) / sr) / target_duration
    if rate != 1.0:
        y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)

    # Force the exact target length: zero-pad a short result, clip a long one.
    return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target]

def pysrttime_to_seconds(t):
    """Convert a pysrt SubRipTime-like value (hours/minutes/seconds/milliseconds) to float seconds."""
    total_minutes = t.hours * 60 + t.minutes
    whole_seconds = total_minutes * 60 + t.seconds
    return whole_seconds + t.milliseconds / 1000

def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
    """Render an SRT subtitle file as a single timed audio track.

    Each subtitle segment is synthesized individually, resampled to *sr*,
    stretched to the segment's duration, and mixed into a silent buffer at
    the segment's start time. The mixed track is written to *out_file*.

    Args:
        srt_file: Path to the input .srt file.
        out_file: Path of the audio file to write.
        voice: Voice passed through to synthesize_tts.
        rate: Forwarded to synthesize_tts as its *pitch* argument
            (NOTE(review): the name suggests speed, but positionally it lands
            on pitch while speed is fixed at 0 — confirm intent).
        sr: Output sample rate in Hz.
        google: Forwarded to synthesize_tts (use Google TTS when True).

    Raises:
        ValueError: If the SRT file contains no subtitles.
    """
    subs = pysrt.open(srt_file)
    if not subs: raise ValueError(translations["srt"])

    # Silent canvas long enough to hold the last subtitle's end time.
    final_audio = np.zeros(int(round(pysrttime_to_seconds(subs[-1].end) * sr)), dtype=np.float32)

    with tempfile.TemporaryDirectory() as tempdir:
        for idx, seg in enumerate(subs):
            wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
            # Collapse multi-line subtitle text into one line before synthesis.
            synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)

            audio, file_sr = sf.read(wav_path, dtype=np.float32)
            # Crude linear-interpolation resample to the target rate.
            if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)
            # Fit the synthesized clip exactly into the subtitle's time slot.
            adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))

            start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
            end_sample = start_sample + adjusted.shape[0]

            # Clip anything that would run past the end of the canvas.
            if end_sample > final_audio.shape[0]:
                adjusted = adjusted[: final_audio.shape[0] - start_sample]
                end_sample = final_audio.shape[0]

            # Mix (sum) into place so overlapping segments are both audible.
            final_audio[start_sample:end_sample] += adjusted

    sf.write(out_file, final_audio, sr)

def TTS(prompt, voice, speed, output, pitch, google, srt_input):
    """UI entry point: validate inputs, then synthesize either plain text or an SRT file.

    Args:
        prompt: Text to speak (ignored when *srt_input* is an .srt path).
        voice: Voice id / language code.
        speed: Speed delta in percent.
        output: Output file path, or a directory (a default name is appended).
        pitch: Pitch delta.
        google: Use Google TTS instead of edge-tts.
        srt_input: Optional path to an .srt file; triggers timed synthesis.

    Returns:
        The final output path, or None when validation fails (a warning is shown).
    """
    if not srt_input:
        srt_input = ""

    if not prompt and not srt_input.endswith(".srt"):
        gr_warning(translations["enter_the_text"])
        return None

    if not voice:
        gr_warning(translations["choose_voice"])
        return None

    if not output:
        gr_warning(translations["output_not_valid"])
        return None

    # Allow a directory as the destination; pick a default file name inside it.
    if os.path.isdir(output):
        output = os.path.join(output, "tts.wav")

    gr_info(translations["convert"].format(name=translations["text"]))

    # BUGFIX: the old fallback `os.path.dirname(output) or output` created a
    # *directory* named like the output file whenever the path had no
    # directory component, which then broke the subsequent file write.
    # A bare file name needs no directory creation at all.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if srt_input.endswith(".srt"):
        srt_tts(srt_input, output, voice, 0, 24000, google)
    else:
        synthesize_tts(prompt, voice, speed, output, pitch, google)

    gr_info(translations["success"])
    return output