Plachta committed (verified)
Commit 63edc9f · Parent(s): 304a108

Update app_v1v2.py

Files changed (1)
  1. app_v1v2.py +242 -174
app_v1v2.py CHANGED
@@ -1,175 +1,243 @@
- import spaces
- import gradio as gr
- import torch
- import yaml
- import argparse
- from seed_vc_wrapper import SeedVCWrapper
-
- # Set up device and torch configurations
- if torch.cuda.is_available():
-     device = torch.device("cuda")
- elif torch.backends.mps.is_available():
-     device = torch.device("mps")
- else:
-     device = torch.device("cpu")
-
- torch._inductor.config.coordinate_descent_tuning = True
- torch._inductor.config.triton.unique_kernel_names = True
-
- if hasattr(torch._inductor.config, "fx_graph_cache"):
-     # Experimental feature to reduce compilation times, will be on by default in future
-     torch._inductor.config.fx_graph_cache = True
-
- dtype = torch.float16
-
- def load_v2_models(args):
-     from hydra.utils import instantiate
-     from omegaconf import DictConfig
-     cfg = DictConfig(yaml.safe_load(open("configs/v2/vc_wrapper.yaml", "r")))
-     vc_wrapper = instantiate(cfg)
-     vc_wrapper.load_checkpoints()
-     vc_wrapper.to(device)
-     vc_wrapper.eval()
-
-     vc_wrapper.setup_ar_caches(max_batch_size=1, max_seq_len=4096, dtype=dtype, device=device)
-
-     if args.compile:
-         vc_wrapper.compile_ar()
-         # vc_wrapper.compile_cfm()
-
-     return vc_wrapper
-
- def create_v1_interface():
-     # Initialize the V1 wrapper
-     vc_wrapper = SeedVCWrapper()
-
-     # Set up Gradio interface
-     description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
-                    "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
-                    "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
-                    "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
-                    "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
-
-     inputs = [
-         gr.Audio(type="filepath", label="Source Audio / 源音频"),
-         gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
-         gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数",
-                   info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
-         gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
-                   info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
-         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate",
-                   info="has subtle influence / 有微小影响"),
-         gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False,
-                     info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
-         gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
-                     info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
-         gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0,
-                   info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
-     ]
-
-     examples = [
-         ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
-         ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
-         ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
-          "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
-         ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
-          "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
-     ]
-
-     outputs = [
-         gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
-         gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
-     ]
-
-     return gr.Interface(
-         fn=vc_wrapper.convert_voice,
-         description=description,
-         inputs=inputs,
-         outputs=outputs,
-         title="Seed Voice Conversion V1 (Voice & Singing Voice Conversion)",
-         examples=examples,
-         cache_examples=False,
-     )
-
- def create_v2_interface(vc_wrapper):
-     # Set up Gradio interface
-     description = ("Zero-shot voice/style conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
-                    "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
-                    "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
-                    "Please click the 'convert style/emotion/accent' checkbox to convert the style, emotion, or accent of the source audio, or else only timbre conversion will be performed.<br> "
-                    "Click the 'anonymization only' checkbox will ignore reference audio but convert source to an 'average voice' determined by model itself.<br> "
-                    "无需训练的 zero-shot 语音/口音转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
-                    "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。"
-                    "<br>请勾选 'convert style/emotion/accent' 以转换源音频的风格、情感或口音,否则仅执行音色转换。<br>"
-                    "勾选 'anonymization only' 会无视参考音频而将源音频转换为某种由模型自身决定的 '平均音色'。<br>"
-
-                    "Credits to [Vevo](https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo)"
-     )
-     inputs = [
-         gr.Audio(type="filepath", label="Source Audio / 源音频"),
-         gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
-         gr.Slider(minimum=1, maximum=200, value=30, step=1, label="Diffusion Steps / 扩散步数",
-                   info="30 by default, 50~100 for best quality / 默认为 30,50~100 为最佳质量"),
-         gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
-                   info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
-         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.0, label="Intelligibility CFG Rate",
-                   info="controls pronunciation intelligibility / 控制发音清晰度"),
-         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Similarity CFG Rate",
-                   info="controls similarity to reference audio / 控制与参考音频的相似度"),
-         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.9, label="Top-p",
-                   info="AR model sampling top P"),
-         gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature",
-                   info="AR model sampling temperature"),
-         gr.Slider(minimum=1.0, maximum=3.0, step=0.1, value=1.0, label="Repetition Penalty",
-                   info="AR model sampling repetition penalty"),
-         gr.Checkbox(label="convert style/emotion/accent", value=False),
-         gr.Checkbox(label="anonymization only", value=False),
-     ]
-
-     examples = [
-         ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 50, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False, False],
-         ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 50, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False, False],
-     ]
-
-     outputs = [
-         gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
-         gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
-     ]
-
-     return gr.Interface(
-         fn=vc_wrapper.convert_voice_with_streaming,
-         description=description,
-         inputs=inputs,
-         outputs=outputs,
-         title="Seed Voice Conversion V2 (Voice & Style Conversion)",
-         examples=examples,
-         cache_examples=False,
-     )
-
- def main(args):
-     # Load V2 models
-     vc_wrapper_v2 = load_v2_models(args)
-
-     # Create interfaces
-     v1_interface = create_v1_interface()
-     v2_interface = create_v2_interface(vc_wrapper_v2)
-
-     # Create tabs
-     with gr.Blocks(title="Seed Voice Conversion") as demo:
-         gr.Markdown("# Seed Voice Conversion")
-         gr.Markdown("Choose between V1 (Voice & Singing Voice Conversion) or V2 (Voice & Style Conversion)")
-
-         with gr.Tabs():
-             with gr.TabItem("V2 - Voice & Style Conversion"):
-                 v2_interface.render()
-             with gr.TabItem("V1 - Voice & Singing Voice Conversion"):
-                 v1_interface.render()
-
-     # Launch the combined interface
-     demo.launch()
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--compile", type=bool, default=True)
-     args = parser.parse_args()
+ import spaces
+ import gradio as gr
+ import torch
+ import yaml
+ import argparse
+ from seed_vc_wrapper import SeedVCWrapper
+ from modules.v2.vc_wrapper import VoiceConversionWrapper
+
+ # Set up device and torch configurations
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+ elif torch.backends.mps.is_available():
+     device = torch.device("mps")
+ else:
+     device = torch.device("cpu")
+
+ torch._inductor.config.coordinate_descent_tuning = True
+ torch._inductor.config.triton.unique_kernel_names = True
+
+ if hasattr(torch._inductor.config, "fx_graph_cache"):
+     # Experimental feature to reduce compilation times, will be on by default in future
+     torch._inductor.config.fx_graph_cache = True
+
+ dtype = torch.float16
+
+ # Global variables to store model instances
+ vc_wrapper_v1 = None
+ vc_wrapper_v2 = None
+
+
+ def load_v2_models(args):
+     from hydra.utils import instantiate
+     from omegaconf import DictConfig
+     cfg = DictConfig(yaml.safe_load(open("configs/v2/vc_wrapper.yaml", "r")))
+     vc_wrapper = instantiate(cfg)
+     vc_wrapper.load_checkpoints()
+     vc_wrapper.to(device)
+     vc_wrapper.eval()
+
+     vc_wrapper.setup_ar_caches(max_batch_size=1, max_seq_len=4096, dtype=dtype, device=device)
+
+     if args.compile:
+         vc_wrapper.compile_ar()
+         # vc_wrapper.compile_cfm()
+
+     return vc_wrapper
+
+ @spaces.GPU
+ def convert_voice_v1_wrapper(source_audio_path, target_audio_path, diffusion_steps=10,
+                              length_adjust=1.0, inference_cfg_rate=0.7, f0_condition=False,
+                              auto_f0_adjust=True, pitch_shift=0, stream_output=True):
+     """
+     Wrapper function for vc_wrapper.convert_voice that can be decorated with @spaces.GPU
+     """
+     global vc_wrapper_v1
+     if vc_wrapper_v1 is None:
+         vc_wrapper_v1 = SeedVCWrapper()
+
+     # Use yield from to properly handle the generator
+     yield from vc_wrapper_v1.convert_voice(
+         source=source_audio_path,
+         target=target_audio_path,
+         diffusion_steps=diffusion_steps,
+         length_adjust=length_adjust,
+         inference_cfg_rate=inference_cfg_rate,
+         f0_condition=f0_condition,
+         auto_f0_adjust=auto_f0_adjust,
+         pitch_shift=pitch_shift,
+         stream_output=stream_output
+     )
+
+ @spaces.GPU
+ def convert_voice_v2_wrapper(source_audio_path, target_audio_path, diffusion_steps=30,
+                              length_adjust=1.0, intelligebility_cfg_rate=0.7, similarity_cfg_rate=0.7,
+                              top_p=0.7, temperature=0.7, repetition_penalty=1.5,
+                              convert_style=False, anonymization_only=False, stream_output=True):
+     """
+     Wrapper function for vc_wrapper.convert_voice_with_streaming that can be decorated with @spaces.GPU
+     """
+     global vc_wrapper_v2
+     if vc_wrapper_v2 is None:
+         # Initialize with default arguments
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--compile", type=bool, default=True)
+         args = parser.parse_args([])
+         vc_wrapper_v2 = load_v2_models(args)
+
+     # Use yield from to properly handle the generator
+     yield from vc_wrapper_v2.convert_voice_with_streaming(
+         source_audio_path=source_audio_path,
+         target_audio_path=target_audio_path,
+         diffusion_steps=diffusion_steps,
+         length_adjust=length_adjust,
+         intelligebility_cfg_rate=intelligebility_cfg_rate,
+         similarity_cfg_rate=similarity_cfg_rate,
+         top_p=top_p,
+         temperature=temperature,
+         repetition_penalty=repetition_penalty,
+         convert_style=convert_style,
+         anonymization_only=anonymization_only,
+         device=device,
+         dtype=dtype,
+         stream_output=stream_output
+     )
+
+
+ def create_v1_interface():
+     # Set up Gradio interface
+     description = (
+         "Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
+         "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
+         "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
+         "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
+         "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
+
+     inputs = [
+         gr.Audio(type="filepath", label="Source Audio / 源音频"),
+         gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
+         gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数",
+                   info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
+         gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
+                   info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
+         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate",
+                   info="has subtle influence / 有微小影响"),
+         gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False,
+                     info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
+         gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
+                     info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
+         gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0,
+                   info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
+     ]
+
+     examples = [
+         ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
+         ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
+         ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
+          "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
+         ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
+          "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
+     ]
+
+     outputs = [
+         gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
+         gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
+     ]
+
+     return gr.Interface(
+         fn=convert_voice_v1_wrapper,
+         description=description,
+         inputs=inputs,
+         outputs=outputs,
+         title="Seed Voice Conversion V1 (Voice & Singing Voice Conversion)",
+         examples=examples,
+         cache_examples=False,
+     )
+
+
+ def create_v2_interface(vc_wrapper):
+     # Set up Gradio interface
+     description = (
+         "Zero-shot voice/style conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
+         "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
+         "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
+         "Please click the 'convert style/emotion/accent' checkbox to convert the style, emotion, or accent of the source audio, or else only timbre conversion will be performed.<br> "
+         "Click the 'anonymization only' checkbox will ignore reference audio but convert source to an 'average voice' determined by model itself.<br> "
+         "无需训练的 zero-shot 语音/口音转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
+         "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。"
+         "<br>请勾选 'convert style/emotion/accent' 以转换源音频的风格、情感或口音,否则仅执行音色转换。<br>"
+         "勾选 'anonymization only' 会无视参考音频而将源音频转换为某种由模型自身决定的 '平均音色'。<br>"
+
+         "Credits to [Vevo](https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo)"
+     )
+     inputs = [
+         gr.Audio(type="filepath", label="Source Audio / 源音频"),
+         gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
+         gr.Slider(minimum=1, maximum=200, value=30, step=1, label="Diffusion Steps / 扩散步数",
+                   info="30 by default, 50~100 for best quality / 默认为 30,50~100 为最佳质量"),
+         gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
+                   info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
+         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.0, label="Intelligibility CFG Rate",
+                   info="controls pronunciation intelligibility / 控制发音清晰度"),
+         gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Similarity CFG Rate",
+                   info="controls similarity to reference audio / 控制与参考音频的相似度"),
+         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.9, label="Top-p",
+                   info="AR model sampling top P"),
+         gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature",
+                   info="AR model sampling temperature"),
+         gr.Slider(minimum=1.0, maximum=3.0, step=0.1, value=1.0, label="Repetition Penalty",
+                   info="AR model sampling repetition penalty"),
+         gr.Checkbox(label="convert style/emotion/accent", value=False),
+         gr.Checkbox(label="anonymization only", value=False),
+     ]
+
+     examples = [
+         ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False,
+          False],
+         ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False, False],
+     ]
+
+     outputs = [
+         gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
+         gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
+     ]
+
+     return gr.Interface(
+         fn=convert_voice_v2_wrapper,
+         description=description,
+         inputs=inputs,
+         outputs=outputs,
+         title="Seed Voice Conversion V2 (Voice & Style Conversion)",
+         examples=examples,
+         cache_examples=False,
+     )
+
+
+ def main(args):
+     # Load V2 models
+     vc_wrapper_v2 = load_v2_models(args)
+
+     # Create interfaces
+     v1_interface = create_v1_interface()
+     v2_interface = create_v2_interface(vc_wrapper_v2)
+
+     # Create tabs
+     with gr.Blocks(title="Seed Voice Conversion") as demo:
+         gr.Markdown("# Seed Voice Conversion")
+         gr.Markdown("Choose between V1 (Voice & Singing Voice Conversion) or V2 (Voice & Style Conversion)")
+
+         with gr.Tabs():
+             with gr.TabItem("V2 - Voice & Style Conversion"):
+                 v2_interface.render()
+             with gr.TabItem("V1 - Voice & Singing Voice Conversion"):
+                 v1_interface.render()
+
+     # Launch the combined interface
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--compile", type=bool, default=True)
+     args = parser.parse_args()
  main(args)
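
The net change in this commit is that model construction moves out of the interface builders and into the module-level generator functions convert_voice_v1_wrapper and convert_voice_v2_wrapper, which are decorated with @spaces.GPU and lazily create the global vc_wrapper_v1 / vc_wrapper_v2 instances on first call; the Gradio interfaces now bind to these wrappers instead of to methods on the wrapper objects. The sketch below drives the V2 wrapper directly from Python rather than through the Gradio UI. It is a minimal illustration, assuming the repository's example audio files are present and that each yielded item corresponds to the two Gradio outputs (streamed chunk, full audio); neither assumption is verified here.

    from app_v1v2 import convert_voice_v2_wrapper

    # Iterate the generator; @spaces.GPU should act as a pass-through when not
    # running on a ZeroGPU Space, so this can also be run locally.
    last = None
    for item in convert_voice_v2_wrapper(
            source_audio_path="examples/source/jay_0.wav",       # example file from the repo
            target_audio_path="examples/reference/azuma_0.wav",  # example file from the repo
            diffusion_steps=30,
            convert_style=False,
            anonymization_only=False,
            stream_output=True,
    ):
        last = item  # keep the most recent yield; the final one should hold the complete result

    # Assumption: each yield mirrors the two Gradio outputs (stream chunk, full audio).
    stream_chunk, full_audio = last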