File size: 5,029 Bytes
69c7b60
06441c0
 
 
 
 
 
 
 
 
 
69c7b60
06441c0
 
 
 
 
 
 
69c7b60
06441c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d7a6b8
06441c0
 
 
 
 
 
 
 
 
 
f7c1f1e
 
06441c0
f7c1f1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c7f733
 
1ca47ff
f7c1f1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c7f733
 
4149c50
24f022c
06441c0
 
1ca47ff
06441c0
 
4c7f733
eed69cb
f7c1f1e
 
 
 
 
 
 
1ca47ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
# import matplotlib.pyplot as plt
import logging
# logger = logging.getLogger(__name__)
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import time

def get_text(text, hps):
    # text_norm = requests.post("http://121.5.171.42:39001/texttosequence?text="+text).json()["text_norm"]
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    # print(hps.data.text_cleaners)
    # print(text_norm)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

def load_model(config_path, pth_path):
    global dev, hps_ms, net_g
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    hps_ms = utils.get_hparams_from_file(config_path)

    net_g = SynthesizerTrn(
        len(symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        **hps_ms.model).to(dev)
    _ = net_g.eval()
    _ = utils.load_checkpoint(pth_path, net_g)

    print(f"{pth_path}加载成功!")
    
def infer(c_id, text):
    stn_tst = get_text(text, hps_ms)
    with torch.no_grad():
        x_tst = stn_tst.to(dev).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
        sid = torch.LongTensor([c_id]).to(dev)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

        return audio

pth_path = "model/G_70000.pth"
config_path = "configs/config.json"
character_dict = {
    "十香": 1,
    "折纸": 2,
    "狂三": 3,
    "四糸乃": 4,
    "琴里": 5,
    "夕弦": 6,
    "耶俱矢": 7,
    "美九": 8,
    "凛祢": 9,
    "凛绪": 10,
    "鞠亚": 11,
    "鞠奈": 12,
    "真那": 13,
}

load_model(config_path, pth_path)

'''app = gr.Blocks()
with app:
    gr.HTML("""
<div
    style="width: 100%;padding-top:116px;background-image: url('https://huggingface.co/spaces/tumuyan/vits-miki/resolve/main/bg.webp');;background-size:cover">
    <div>
                <div>
                    <h4 class="h-sign" style="font-size: 12px;">
                        这是一个使用<a href="https://github.com/thesupersonic16/DALTools" target="_blank">thesupersonic16/DALTools</a>提供的解包音频作为数据集,
                        使用<a href="https://github.com/jaywalnut310/vits" target="_blank">VITS</a>技术训练的语音合成demo。
                    </h4>
                </div>
            </div>
</div>
    """)
    tmp = gr.Markdown("")
    with gr.Tabs():
        with gr.TabItem("Basic"):
           # with gr.Row():
           #     model_submit = gr.Button("加载/重载模型", variant="primary")

            with gr.Row():
                tts_input1 = gr.TextArea(
                    label="请输入文本(仅支持日语)", value="你好,世界!")
                tts_input2 = gr.Dropdown([character_dict.keys], type="value",label="选择角色")
                tts_submit = gr.Button("用文本合成", variant="primary")
                tts_output2 = gr.Audio(label="Output")
        # model_submit.click(load_model, [config_path, pth_path])
        tts_submit.click(infer, [character_dict[tts_input2], tts_input1], [tts_output2])
    gr.HTML("""
<div style="text-align:center"> 
    仅供学习交流,不可用于商业或非法用途
    <br/>
    使用本项目模型直接或间接生成的音频,必须声明由AI技术或VITS技术合成
</div>
    """)
    app.launch()'''

gr.HTML("""
<div
    style="width: 100%;padding-top:116px;background-image: url('https://huggingface.co/spaces/tumuyan/vits-miki/resolve/main/bg.webp');;background-size:cover">
    <div>
                <div>
                    <h4 class="h-sign" style="font-size: 12px;">
                        这是一个使用<a href="https://github.com/thesupersonic16/DALTools" target="_blank">thesupersonic16/DALTools</a>提供的解包音频作为数据集,
                        使用<a href="https://github.com/jaywalnut310/vits" target="_blank">VITS</a>技术训练的语音合成demo。
                    </h4>
                </div>
            </div>
</div>
    """)

demo = gr.Interface(
    fn=infer,
    inputs=[gr.Dropdown([character_dict.keys], type="value",label="选择角色"), gr.TextArea(label="请输入文本(仅支持日语)", value="你好,世界!")],
    outputs=[gr.Audio(label="Output")],
)
demo.launch()

gr.HTML("""
<div style="text-align:center"> 
    仅供学习交流,不可用于商业或非法用途
    <br/>
    使用本项目模型直接或间接生成的音频,必须声明由AI技术或VITS技术合成
</div>
    """)