Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,010 Bytes
bcf08c1 5a26d97 bcf08c1 5a26d97 671d69d bcf08c1 498736f 671d69d bcf08c1 d18f59c 671d69d bcf08c1 671d69d bcf08c1 5a26d97 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 671d69d bcf08c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# Copyright (c) 2025 MediaTek Research Inc (authors: Chan-Jan Hsu)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spaces
import os
import sys
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
import argparse
import gradio as gr
import numpy as np
import torch
torch.set_num_threads(1)
import torchaudio
import random
import librosa
from transformers import pipeline
import subprocess
from scipy.signal import resample
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, speed_change
#logging.basicConfig(level=logging.DEBUG,
# format='%(asctime)s %(levelname)s %(message)s')
def generate_seed():
    """Draw a fresh random inference seed.

    Returns:
        dict: An update payload whose ``"__type__"`` is ``"update"`` and
        whose ``"value"`` is a random integer in ``[1, 100000000]``.
    """
    payload = dict(__type__="update")
    payload["value"] = random.randint(1, 100000000)
    return payload
def set_all_random_seed(seed):
    """Seed every random number generator used by this module.

    Applies the same seed to Python's ``random``, NumPy's global generator,
    and PyTorch (default generator plus all CUDA devices).

    Args:
        seed: Seed value. Any value accepted by ``int()``; non-integer
            numbers (e.g. a float coming from a numeric input field) are
            truncated to an integer before use.
    """
    # np.random.seed refuses floats (no safe cast to int64), so normalize
    # once and hand the same integer to every generator.
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
# Peak amplitude ceiling applied to prompt audio in postprocess().
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit, and tail-pad a prompt waveform.

    Args:
        speech: Waveform tensor.
            NOTE(review): presumably shaped (1, num_samples), since it is
            concatenated along dim=1 with a (1, N) zero tensor -- confirm
            against ``load_wav``.
        top_db: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Forwarded to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed waveform, rescaled so its absolute peak does not exceed
        ``max_val``, with ``int(target_sr * 0.2)`` zero samples appended.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        # Scale down so the loudest sample sits exactly at max_val.
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)
@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav, seed):
    """Synthesize ``tts_text`` using the prompt recording as the voice sample.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcript of the prompt recording.
        prompt_wav: Path to the prompt audio file.
        seed: Random seed applied to all generators before inference.

    Returns:
        tuple: ``(target_sr, audio_data)`` where ``audio_data`` is the
        synthesized waveform flattened to a 1-D NumPy array.

    Raises:
        gr.Error: If no prompt audio was supplied.
    """
    # Fail with a readable UI message instead of a traceback from load_wav
    # when the user has not uploaded or recorded a prompt yet.
    if not prompt_wav:
        raise gr.Error("請先提供 prompt 音訊樣本")

    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)
@spaces.GPU
def generate_text(prompt_wav):
    """Transcribe the prompt recording with the ASR pipeline.

    Args:
        prompt_wav: Path to the prompt audio file; may be empty or ``None``.

    Returns:
        str: The ``'text'`` field of the ASR result, or a fixed fallback
        message when ``prompt_wav`` is falsy.
    """
    if not prompt_wav:
        return "No valid input detected."
    transcription = asr_pipeline(prompt_wav)
    return transcription['text']
def main():
    """Build the BreezyVoice Gradio interface and launch it.

    The page is a single column with three steps: the prompt audio and its
    transcript, the text to synthesize, and the synthesized audio output
    (with an accordion holding the seed controls).
    """
    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
        gr.Markdown(
            "# BreezyVoice 語音合成系統\n"
            "#### Runs on Huggingface Zero GPU (A100)\n"
            "為了加快推理速度,g2pw 注音標註並未被啟動。"
        )

        # All content arranged in a single column
        with gr.Column():
            # Step 1: prompt audio and its transcript
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
            prompt_wav = gr.Audio(
                type='filepath',
                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz)或錄製 prompt 音訊'
            )

            with gr.Blocks():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
                    lines=2,
                    placeholder="音訊樣本文本"
                )

            # Run speech recognition whenever the user supplies prompt audio
            # and place the transcript in the prompt text box.
            prompt_wav.input(
                fn=generate_text,
                inputs=[prompt_wav],
                outputs=prompt_text
            )

            gr.Examples(
                examples=[
                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
                ],
                inputs=[prompt_wav, prompt_text],
                label="範例"
            )

            # Step 2: text to synthesize
            gr.Markdown("### 步驟 2.合成文本輸入")
            tts_text = gr.Textbox(
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )

            # Step 3: synthesis output
            gr.Markdown("### 步驟 3. 合成音訊")
            with gr.Accordion("進階設定", open=False):
                # precision=0 makes the field deliver an integer; NumPy's
                # seeding rejects non-integer values.
                seed = gr.Number(value=0, label="隨機推理種子", precision=0)
                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")

            # Synthesis is triggered manually by this button.
            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")

            # Set up callbacks for seed generation and audio synthesis
            seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
            generate_button.click(
                fn=generate_audio,
                inputs=[tts_text, prompt_text, prompt_wav, seed],
                outputs=audio_output
            )

    demo.launch()
if __name__ == '__main__':
    # Sample rates: prompt audio is loaded at prompt_sr, synthesized audio
    # is returned to the UI at target_sr.
    prompt_sr = 16000
    target_sr = 22050
    default_data = np.zeros(target_sr)

    # Speech synthesis model, read as a module-level global by generate_audio.
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    sft_spk = cosyvoice.list_avaliable_spks()

    # Speech recognition pipeline, read as a module-level global by
    # generate_text. Device 0 uses the GPU (if available); -1 would select CPU.
    asr_model_id = "openai/whisper-tiny"
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=asr_model_id,
        tokenizer=asr_model_id,
        device=0,
    )

    main()
|