File size: 7,010 Bytes
bcf08c1
 
 
 
 
 
 
 
 
 
 
 
 
 
5a26d97
 
bcf08c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a26d97
671d69d
bcf08c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498736f
671d69d
bcf08c1
 
 
 
 
 
 
d18f59c
671d69d
 
 
bcf08c1
671d69d
bcf08c1
 
 
 
 
 
 
 
5a26d97
671d69d
bcf08c1
671d69d
bcf08c1
671d69d
bcf08c1
 
 
 
 
 
 
671d69d
 
 
 
bcf08c1
 
671d69d
 
 
 
 
 
 
 
bcf08c1
 
 
 
 
 
 
 
 
671d69d
bcf08c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671d69d
bcf08c1
 
 
 
671d69d
bcf08c1
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Copyright (c) 2025 MediaTek Research Inc (authors: Chan-Jan Hsu)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spaces

import os
import sys
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import argparse
import gradio as gr
import numpy as np
import torch
torch.set_num_threads(1)
import torchaudio
import random
import librosa
from transformers import pipeline
import subprocess
from scipy.signal import resample

import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, speed_change

#logging.basicConfig(level=logging.DEBUG,
#                    format='%(asctime)s %(levelname)s %(message)s')

def generate_seed():
    """Draw a new random seed and wrap it in an update dictionary.

    Returns:
        dict: ``{"__type__": "update", "value": seed}`` where ``seed`` is an
        integer drawn with ``random.randint(1, 100000000)`` (both bounds
        inclusive).
    """
    new_seed = random.randint(1, 100000000)
    update_payload = dict(__type__="update", value=new_seed)
    return update_payload

def set_all_random_seed(seed):
    """Seed every random number generator used by this module.

    The same value is applied to Python's ``random`` module, NumPy's global
    generator, and PyTorch (CPU generator plus all CUDA devices).

    Args:
        seed: Seed value. It is converted with ``int()`` before use, so an
            integral float such as ``42.0`` (as a numeric input field may
            deliver) is accepted; any fractional part is truncated.

    Raises:
        TypeError: If ``seed`` cannot be converted by ``int()`` (e.g. ``None``).
        ValueError: If ``seed`` is a string that is not an integer literal, or
            if NumPy rejects the resulting integer as out of range.
    """
    # np.random.seed() refuses float input, while the other generators accept
    # it; normalise once so every generator receives the same integer.
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Ceiling for the absolute peak amplitude enforced by postprocess().
max_val = 0.8
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit and zero-pad a waveform.

    The waveform is passed through ``librosa.effects.trim``, scaled down when
    its absolute peak exceeds ``max_val``, and finally extended with
    ``int(target_sr * 0.2)`` zero samples.

    Args:
        speech: Waveform tensor. It is concatenated along ``dim=1`` with a
            ``(1, N)`` block of zeros, so a ``(1, num_samples)`` layout is
            expected.
        top_db: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Forwarded to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed, peak-limited waveform with the zero tail appended.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )

    # Only attenuate: signals already at or below the ceiling are left as-is.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val

    # Zero tail whose length is derived from the module-level target_sr.
    zero_tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, zero_tail], dim=1)

@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav, seed, speed_factor=1.0):
    """Synthesize ``tts_text`` using the prompt recording as the voice sample.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcript of the prompt recording.
        prompt_wav: File path of the prompt recording; it is loaded at
            ``prompt_sr`` and cleaned up with ``postprocess``.
        seed: Random seed applied through ``set_all_random_seed`` right before
            inference.
        speed_factor: Speed multiplier, must be greater than zero. ``1.0``
            (the default) returns the model output unchanged. Any other value
            resamples the waveform to ``len / speed_factor`` samples; because
            the result is still returned at ``target_sr`` this changes speed
            and pitch together.

    Returns:
        tuple: ``(target_sr, audio_data)`` where ``audio_data`` is a flat
        NumPy array of samples.

    Raises:
        gr.Error: If no prompt recording was supplied.
        ValueError: If ``speed_factor`` is not greater than zero.
    """
    if not prompt_wav:
        # Surface a readable message instead of an opaque audio-loader failure.
        raise gr.Error("No prompt audio provided.")
    if speed_factor <= 0:
        raise ValueError("speed_factor must be greater than zero")

    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)

    # Flatten before measuring: len() on the un-flattened model output would
    # report its leading dimension rather than the number of samples.
    audio_data = output['tts_speech'].numpy().flatten()
    if speed_factor != 1.0:
        new_length = max(1, int(len(audio_data) / speed_factor))
        audio_data = resample(audio_data, new_length)

    return (target_sr, audio_data)


@spaces.GPU
def generate_text(prompt_wav):
    """Transcribe the prompt recording with the module-level ASR pipeline.

    Args:
        prompt_wav: Value handed to ``asr_pipeline``; when it is falsy
            (``None`` or empty) no transcription is attempted.

    Returns:
        str: The ``'text'`` entry of the pipeline result, or the fixed message
        ``"No valid input detected."`` when ``prompt_wav`` is falsy.
    """
    if not prompt_wav:
        return "No valid input detected."

    transcription = asr_pipeline(prompt_wav)
    return transcription['text']

def main():
    """Build the Gradio interface and launch it.

    Everything is laid out in a single column:

    1. prompt audio input plus its transcript box (the transcript is filled
       in by ``generate_text`` and can be corrected by hand),
    2. the text to synthesize,
    3. seed controls, the generate button and the audio output.

    The callbacks use the module-level functions ``generate_text``,
    ``generate_seed`` and ``generate_audio``.
    """
    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
        # Page header shown above the controls.
        gr.Markdown(
            """# BreezyVoice 語音合成系統
            
            #### Runs on Huggingface Zero GPU (A100)

            為了加快推理速度,g2pw 注音標註並未被啟動。"""
        )

        # All content arranged in a single column
        with gr.Column():
            # Configuration Section

            # Grouping prompt audio inputs and auto speech recognition in one block using Markdown
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
            # type='filepath' means callbacks receive the recording as a file path.
            prompt_wav = gr.Audio(
                type='filepath',
                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz)或錄製 prompt 音訊'
            )

            with gr.Blocks():
                # Transcript of the prompt audio; editable so the user can fix ASR mistakes.
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
                    lines=2,
                    placeholder="音訊樣本文本"
                )

            # Run speech recognition on the prompt audio and write the result
            # into the transcript box whenever the audio input fires its
            # `input` event.
            prompt_wav.input(
                fn=generate_text,
                inputs=[prompt_wav],
                outputs=prompt_text
            )

            # Ready-made (audio file, transcript) pairs that fill both prompt fields.
            gr.Examples(
                examples=[
                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
                ],
                inputs=[prompt_wav, prompt_text],
                label="範例"
            )

            # Input Section: Synthesis Text

            gr.Markdown("### 步驟 2.合成文本輸入")
            tts_text = gr.Textbox(
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )


            # Output Section
            gr.Markdown("### 步驟 3. 合成音訊")
            # Generation button for audio synthesis (triggered manually)

            # Advanced settings, collapsed by default.
            with gr.Accordion("進階設定", open=False):
                seed = gr.Number(value=0, label="隨機推理種子")
                #seed_button = gr.Button("隨機")
                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")
                # Speed control is currently disabled: the slider below is
                # commented out and this local is not passed to any callback.
                speed_factor = 1
                # speed_factor = gr.Slider(
                #     minimum=0.25,
                #     maximum=4,
                #     step=0.05,
                #     label="語速",
                #     value=1.0,
                #     interactive=True
                # )

            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")

            # Set up callbacks for seed generation and audio synthesis
            seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
            generate_button.click(
                fn=generate_audio,
                inputs=[tts_text, prompt_text, prompt_wav, seed],
                outputs=audio_output
            )

        # Start serving the interface (called while still inside the Blocks context).
        demo.launch()

if __name__ == '__main__':
    # Sample rates shared by the callbacks: prompt recordings are loaded at
    # prompt_sr, synthesized audio is returned together with target_sr.
    prompt_sr = 16000
    target_sr = 22050
    default_data = np.zeros(target_sr)

    # Text-to-speech model and its list of available speakers.
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    sft_spk = cosyvoice.list_avaliable_spks()

    # Speech recognizer used to pre-fill the prompt transcript.
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        tokenizer="openai/whisper-tiny",
        device=0,  # Requests GPU index 0; set to -1 to run on CPU instead
    )

    main()