Katock committed
Commit 6052830 · 1 Parent(s): b8a3545
Files changed (5):
  1. LICENSE +0 -21
  2. app-slice.py +0 -135
  3. app.py +0 -1
  4. data_utils.py +0 -184
  5. utils.py +6 -11
LICENSE DELETED
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 Jingyi Li
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
app-slice.py DELETED
@@ -1,135 +0,0 @@
-import os
-import gradio as gr
-import edge_tts
-from pathlib import Path
-import inference.infer_tool as infer_tool
-import utils
-from inference.infer_tool import Svc
-import logging
-import webbrowser
-import argparse
-import asyncio
-import librosa
-import soundfile
-import gradio.processing_utils as gr_processing_utils
-logging.getLogger('numba').setLevel(logging.WARNING)
-logging.getLogger('markdown_it').setLevel(logging.WARNING)
-logging.getLogger('urllib3').setLevel(logging.WARNING)
-logging.getLogger('matplotlib').setLevel(logging.WARNING)
-
-limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
-
-audio_postprocess_ori = gr.Audio.postprocess
-
-def audio_postprocess(self, y):
-    data = audio_postprocess_ori(self, y)
-    if data is None:
-        return None
-    return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
-
-
-gr.Audio.postprocess = audio_postprocess
-def create_vc_fn(model, sid):
-    def vc_fn(input_audio, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode):
-        if tts_mode:
-            if len(tts_text) > 100 and limitation:
-                return "Text is too long", None
-            if tts_text is None or tts_voice is None:
-                return "You need to enter text and select a voice", None
-            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-            audio, sr = librosa.load("tts.mp3")
-            soundfile.write("tts.wav", audio, 24000, format="wav")
-            wav_path = "tts.wav"
-        else:
-            if input_audio is None:
-                return "You need to select an audio", None
-            raw_audio_path = f"raw/{input_audio}"
-            if "." not in raw_audio_path:
-                raw_audio_path += ".wav"
-            infer_tool.format_wav(raw_audio_path)
-            wav_path = Path(raw_audio_path).with_suffix('.wav')
-        _audio = model.slice_inference(
-            wav_path, sid, vc_transform, slice_db,
-            cluster_infer_ratio=0,
-            auto_predict_f0=auto_f0,
-            noice_scale=noise_scale,
-            pad_seconds=pad_seconds)
-        model.clear_empty()
-        return "Success", (44100, _audio)
-    return vc_fn
-
-def refresh_raw_wav():
-    return gr.Dropdown.update(choices=os.listdir("raw"))
-
-def change_to_tts_mode(tts_mode):
-    if tts_mode:
-        return gr.Audio.update(visible=False), gr.Button.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
-    else:
-        return gr.Audio.update(visible=True), gr.Button.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--device', type=str, default='cpu')
-    parser.add_argument('--api', action="store_true", default=False)
-    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
-    parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
-    args = parser.parse_args()
-    hubert_model = utils.get_hubert_model().to(args.device)
-    models = []
-    voices = []
-    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
-    for r in tts_voice_list:
-        voices.append(f"{r['ShortName']}-{r['Gender']}")
-    raw = os.listdir("raw")
-    for f in os.listdir("models"):
-        name = f
-        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device)
-        cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
-        models.append((name, cover, create_vc_fn(model, name)))
-    with gr.Blocks() as app:
-        gr.Markdown(
-            "# <center> Sovits Models\n"
-            "## <center> The input audio should be clean and pure voice without background music.\n"
-            "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
-            "[Open In Colab](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)"
-            " without queue and length limitation.\n\n"
-            "[Original Repo](https://github.com/svc-develop-team/so-vits-svc)\n\n"
-            "Other models:\n"
-            "[rudolf](https://huggingface.co/spaces/sayashi/sovits-rudolf)\n"
-            "[teio](https://huggingface.co/spaces/sayashi/sovits-teio)\n"
-            "[goldship](https://huggingface.co/spaces/sayashi/sovits-goldship)\n"
-            "[tannhauser](https://huggingface.co/spaces/sayashi/sovits-tannhauser)\n"
-
-        )
-        with gr.Tabs():
-            for (name, cover, vc_fn) in models:
-                with gr.TabItem(name):
-                    with gr.Row():
-                        gr.Markdown(
-                            '<div align="center">'
-                            f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
-                            '</div>'
-                        )
-                    with gr.Row():
-                        with gr.Column():
-                            with gr.Row():
-                                vc_input = gr.Dropdown(label="Input audio", choices=raw)
-                                vc_refresh = gr.Button("🔁", variant="primary")
-                            vc_transform = gr.Number(label="vc_transform", value=0)
-                            slice_db = gr.Number(label="slice_db", value=-40)
-                            noise_scale = gr.Number(label="noise_scale", value=0.4)
-                            pad_seconds = gr.Number(label="pad_seconds", value=0.5)
-                            auto_f0 = gr.Checkbox(label="auto_f0", value=False)
-                            tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
-                            tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
-                            tts_voice = gr.Dropdown(choices=voices, visible=False)
-                            vc_submit = gr.Button("Generate", variant="primary")
-                        with gr.Column():
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                    vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
-                    vc_refresh.click(refresh_raw_wav, [], [vc_input])
-                    tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, vc_refresh, tts_text, tts_voice])
    if args.colab:
        webbrowser.open("http://127.0.0.1:7860")
    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
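
Note on the deleted TTS path: app-slice.py synthesized speech with the edge-tts package, re-encoded it to wav, and passed the result to Svc.slice_inference. Below is a minimal standalone sketch of that input step, assuming only the edge-tts, librosa, and soundfile packages; the voice name and file paths are illustrative. Unlike the deleted code, it decodes at an explicit sample rate so the wav header matches the audio data (the original loaded at librosa's default rate but wrote a 24000 Hz header).

import asyncio

import edge_tts
import librosa
import soundfile

async def tts_to_wav(text: str, voice: str = "en-US-AriaNeural") -> str:
    # Synthesize with Microsoft Edge TTS and save the mp3 to disk.
    await edge_tts.Communicate(text, voice).save("tts.mp3")
    # Decode at an explicit rate so the written wav header agrees with the data.
    audio, sr = librosa.load("tts.mp3", sr=24000)
    soundfile.write("tts.wav", audio, sr, format="wav")
    return "tts.wav"

if __name__ == "__main__":
    print(asyncio.run(tts_to_wav("Hello world")))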
app.py CHANGED
@@ -17,7 +17,6 @@ logging.getLogger('markdown_it').setLevel(logging.WARNING)
 logging.getLogger('urllib3').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
-limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
 sampling_rate = 44100
 
 
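The removed line gated Hugging Face Spaces behavior: the platform sets the SYSTEM environment variable to "spaces", and the flag was used to cap input length there. A minimal sketch of that detection pattern follows; the 20-second cap is a hypothetical illustration, not a value from this repo.

import os

# Hugging Face Spaces sets SYSTEM=spaces; locally the variable is normally unset.
limitation = os.getenv("SYSTEM") == "spaces"

def check_duration(duration_s: float, max_s: float = 20.0) -> None:
    # Reject long inputs only when running on Spaces.
    if limitation and duration_s > max_s:
        raise ValueError(f"audio longer than {max_s}s is not allowed on Spaces")
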
data_utils.py DELETED
@@ -1,184 +0,0 @@
-import time
-import os
-import random
-import numpy as np
-import torch
-import torch.utils.data
-
-import modules.commons as commons
-import utils
-from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
-from utils import load_wav_to_torch, load_filepaths_and_text
-
-# import h5py
-
-
-"""Multi speaker version"""
-
-
-class TextAudioSpeakerLoader(torch.utils.data.Dataset):
-    """
-    1) loads audio, speaker_id, text pairs
-    2) normalizes text and converts them to sequences of integers
-    3) computes spectrograms from audio files.
-    """
-
-    def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
-        self.audiopaths = load_filepaths_and_text(audiopaths)
-        self.hparams = hparams
-        self.max_wav_value = hparams.data.max_wav_value
-        self.sampling_rate = hparams.data.sampling_rate
-        self.filter_length = hparams.data.filter_length
-        self.hop_length = hparams.data.hop_length
-        self.win_length = hparams.data.win_length
-        self.sampling_rate = hparams.data.sampling_rate
-        self.use_sr = hparams.train.use_sr
-        self.spec_len = hparams.train.max_speclen
-        self.spk_map = hparams.spk
-        self.vol_emb = hparams.model.vol_embedding
-        self.vol_aug = hparams.train.vol_aug and vol_aug
-        random.seed(1234)
-        random.shuffle(self.audiopaths)
-
-        self.all_in_mem = all_in_mem
-        if self.all_in_mem:
-            self.cache = [self.get_audio(p[0]) for p in self.audiopaths]
-
-    def get_audio(self, filename):
-        filename = filename.replace("\\", "/")
-        audio, sampling_rate = load_wav_to_torch(filename)
-        if sampling_rate != self.sampling_rate:
-            raise ValueError("{} SR doesn't match target {} SR".format(
-                sampling_rate, self.sampling_rate))
-        audio_norm = audio / self.max_wav_value
-        audio_norm = audio_norm.unsqueeze(0)
-        spec_filename = filename.replace(".wav", ".spec.pt")
-
-        # Ideally, all data generated after Mar 25 should have .spec.pt
-        if os.path.exists(spec_filename):
-            spec = torch.load(spec_filename)
-        else:
-            spec = spectrogram_torch(audio_norm, self.filter_length,
-                                     self.sampling_rate, self.hop_length, self.win_length,
-                                     center=False)
-            spec = torch.squeeze(spec, 0)
-            torch.save(spec, spec_filename)
-
-        spk = filename.split("/")[-2]
-        spk = torch.LongTensor([self.spk_map[spk]])
-
-        f0, uv = np.load(filename + ".f0.npy", allow_pickle=True)
-
-        f0 = torch.FloatTensor(np.array(f0, dtype=float))
-        uv = torch.FloatTensor(np.array(uv, dtype=float))
-
-        c = torch.load(filename + ".soft.pt")
-        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
-        if self.vol_emb:
-            volume_path = filename + ".vol.npy"
-            volume = np.load(volume_path)
-            volume = torch.from_numpy(volume).float()
-        else:
-            volume = None
-
-        lmin = min(c.size(-1), spec.size(-1))
-        assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
-        assert abs(audio_norm.shape[1] - lmin * self.hop_length) < 3 * self.hop_length
-        spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
-        audio_norm = audio_norm[:, :lmin * self.hop_length]
-        if volume != None:
-            volume = volume[:lmin]
-        return c, f0, spec, audio_norm, spk, uv, volume
-
-    def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
-        # if spec.shape[1] < 30:
-        #     print("skip too short audio:", filename)
-        #     return None
-
-        if random.choice([True, False]) and self.vol_aug and volume != None:
-            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
-            max_shift = min(1, np.log10(1 / max_amp))
-            log10_vol_shift = random.uniform(-1, max_shift)
-            audio_norm = audio_norm * (10 ** log10_vol_shift)
-            volume = volume * (10 ** log10_vol_shift)
-            spec = spectrogram_torch(audio_norm,
-                                     self.hparams.data.filter_length,
-                                     self.hparams.data.sampling_rate,
-                                     self.hparams.data.hop_length,
-                                     self.hparams.data.win_length,
-                                     center=False)[0]
-
-        if spec.shape[1] > 800:
-            start = random.randint(0, spec.shape[1] - 800)
-            end = start + 790
-            spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
-            audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
-            if volume != None:
-                volume = volume[start:end]
-        return c, f0, spec, audio_norm, spk, uv, volume
-
-    def __getitem__(self, index):
-        if self.all_in_mem:
-            return self.random_slice(*self.cache[index])
-        else:
-            return self.random_slice(*self.get_audio(self.audiopaths[index][0]))
-
-    def __len__(self):
-        return len(self.audiopaths)
-
-
-class TextAudioCollate:
-
-    def __call__(self, batch):
-        batch = [b for b in batch if b is not None]
-
-        input_lengths, ids_sorted_decreasing = torch.sort(
-            torch.LongTensor([x[0].shape[1] for x in batch]),
-            dim=0, descending=True)
-
-        max_c_len = max([x[0].size(1) for x in batch])
-        max_wav_len = max([x[3].size(1) for x in batch])
-
-        lengths = torch.LongTensor(len(batch))
-
-        c_padded = torch.FloatTensor(len(batch), batch[0][0].shape[0], max_c_len)
-        f0_padded = torch.FloatTensor(len(batch), max_c_len)
-        spec_padded = torch.FloatTensor(len(batch), batch[0][2].shape[0], max_c_len)
-        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
-        spkids = torch.LongTensor(len(batch), 1)
-        uv_padded = torch.FloatTensor(len(batch), max_c_len)
-        volume_padded = torch.FloatTensor(len(batch), max_c_len)
-
-        c_padded.zero_()
-        spec_padded.zero_()
-        f0_padded.zero_()
-        wav_padded.zero_()
-        uv_padded.zero_()
-        volume_padded.zero_()
-
-        for i in range(len(ids_sorted_decreasing)):
-            row = batch[ids_sorted_decreasing[i]]
-
-            c = row[0]
-            c_padded[i, :, :c.size(1)] = c
-            lengths[i] = c.size(1)
-
-            f0 = row[1]
-            f0_padded[i, :f0.size(0)] = f0
-
-            spec = row[2]
-            spec_padded[i, :, :spec.size(1)] = spec
-
-            wav = row[3]
-            wav_padded[i, :, :wav.size(1)] = wav
-
-            spkids[i, 0] = row[4]
-
-            uv = row[5]
-            uv_padded[i, :uv.size(0)] = uv
-            volume = row[6]
-            if volume != None:
-                volume_padded[i, :volume.size(0)] = volume
-            else:
-                volume_padded = None
-        return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
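
For reference, a minimal sketch of how the deleted loader and collator were typically wired into training. It assumes utils.get_hparams_from_file exists as in upstream so-vits-svc; the config and filelist paths are illustrative.

import torch.utils.data

import utils  # assumed to provide get_hparams_from_file, as in upstream so-vits-svc
from data_utils import TextAudioSpeakerLoader, TextAudioCollate

# Illustrative paths: the filelist names one preprocessed training wav per line,
# with .soft.pt / .f0.npy / .vol.npy sidecar files next to each wav.
hps = utils.get_hparams_from_file("configs/config.json")
train_set = TextAudioSpeakerLoader("filelists/train.txt", hps, all_in_mem=False)
loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=6,
    shuffle=True,
    collate_fn=TextAudioCollate(),  # zero-pads every field to the batch max length
)

# Unpack in the order TextAudioCollate returns its tensors.
for c, f0, spec, wav, spkids, lengths, uv, volume in loader:
    break  # one batch is enough for the sketch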
utils.py CHANGED
@@ -1,21 +1,16 @@
-import os
 import glob
-import re
-import sys
-import argparse
-import logging
 import json
+import logging
+import os
+import re
 import subprocess
-import warnings
-import random
-import functools
+import sys
+
 import librosa
 import numpy as np
-from scipy.io.wavfile import read
 import torch
+from scipy.io.wavfile import read
 from torch.nn import functional as F
-from modules.commons import sequence_mask
-import tqdm
 
 MATPLOTLIB_FLAG = False
 