Spaces:
Build error
Build error
0220-152614-some_fix
Browse files- app.py +38 -31
- text/chinese.py +2 -2
- text/english.py +4 -4
app.py
CHANGED
|
@@ -1,13 +1,3 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
| 3 |
-
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
| 4 |
-
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
| 5 |
-
logging.getLogger("httpx").setLevel(logging.ERROR)
|
| 6 |
-
logging.getLogger("asyncio").setLevel(logging.ERROR)
|
| 7 |
-
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
|
| 8 |
-
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
| 9 |
-
logging.getLogger("multipart").setLevel(logging.WARNING)
|
| 10 |
-
|
| 11 |
import gradio as gr
|
| 12 |
import numpy as np
|
| 13 |
import soundfile as sf
|
|
@@ -26,6 +16,18 @@ from transformers.pipelines.audio_utils import ffmpeg_read
|
|
| 26 |
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
| 27 |
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
| 30 |
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
| 31 |
tz = pytz.timezone('Asia/Singapore')
|
|
@@ -365,9 +367,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
| 365 |
startTime=timer()
|
| 366 |
text=trim_text(text,text_language)
|
| 367 |
change_sovits_weights(sovits_path)
|
| 368 |
-
tprint(f'
|
| 369 |
change_gpt_weights(gpt_path)
|
| 370 |
-
tprint(f'
|
| 371 |
|
| 372 |
prompt_language = dict_language[prompt_language]
|
| 373 |
text_language = dict_language[text_language]
|
|
@@ -375,8 +377,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
| 375 |
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
|
| 376 |
text = text.strip("\n")
|
| 377 |
if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
|
| 378 |
-
print(("实际输入的参考文本:"), prompt_text)
|
| 379 |
-
print(("📝实际输入的目标文本:"), text)
|
| 380 |
zero_wav = np.zeros(
|
| 381 |
int(hps.data.sampling_rate * 0.3),
|
| 382 |
dtype=np.float16 if is_half == True else np.float32,
|
|
@@ -418,7 +420,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
| 418 |
text = cut5(text)
|
| 419 |
while "\n\n" in text:
|
| 420 |
text = text.replace("\n\n", "\n")
|
| 421 |
-
print(
|
| 422 |
texts = text.split("\n")
|
| 423 |
texts = merge_short_text_in_array(texts, 5)
|
| 424 |
audio_opt = []
|
|
@@ -428,7 +430,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
| 428 |
if (len(text.strip()) == 0):
|
| 429 |
continue
|
| 430 |
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
| 431 |
-
print(("
|
| 432 |
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
| 433 |
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
| 434 |
bert = torch.cat([bert1, bert2], 1)
|
|
@@ -561,13 +563,16 @@ def cut5(inp):
|
|
| 561 |
# if not re.search(r'[^\w\s]', inp[-1]):
|
| 562 |
# inp += '。'
|
| 563 |
inp = inp.strip("\n")
|
| 564 |
-
punds = r'[
|
| 565 |
items = re.split(f'({punds})', inp)
|
| 566 |
-
|
| 567 |
-
|
|
|
|
|
|
|
| 568 |
return opt
|
| 569 |
|
| 570 |
|
|
|
|
| 571 |
def custom_sort_key(s):
|
| 572 |
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
| 573 |
parts = re.split('(\d+)', s)
|
|
@@ -580,7 +585,7 @@ def tprint(text):
|
|
| 580 |
print(f'UTC+8 - {now} - {text}')
|
| 581 |
|
| 582 |
def wprint(text):
|
| 583 |
-
|
| 584 |
gr.Warning(text)
|
| 585 |
|
| 586 |
#裁切文本
|
|
@@ -589,11 +594,13 @@ def trim_text(text,language):
|
|
| 589 |
limit_en = 60 #words
|
| 590 |
search_limit_cj = limit_cj+30
|
| 591 |
search_limit_en = limit_en +30
|
|
|
|
|
|
|
| 592 |
if language =='English':
|
| 593 |
words = text.split()
|
| 594 |
if len(words) <= limit_en:
|
| 595 |
return text
|
| 596 |
-
#
|
| 597 |
for i in range(limit_en, -1, -1):
|
| 598 |
if any(punct in words[i] for punct in splits):
|
| 599 |
return ' '.join(words[:i+1])
|
|
@@ -605,13 +612,13 @@ def trim_text(text,language):
|
|
| 605 |
else:#中文日文
|
| 606 |
if len(text) <= limit_cj:
|
| 607 |
return text
|
| 608 |
-
for i in range(limit_cj, -1, -1):
|
| 609 |
if text[i] in splits:
|
| 610 |
return text[:i+1]
|
| 611 |
-
for i in range(limit_cj, min(len(text), search_limit_cj)):
|
| 612 |
if text[i] in splits:
|
| 613 |
return text[:i+1]
|
| 614 |
-
return text[:limit_cj]
|
| 615 |
|
| 616 |
def duration(audio_file_path):
|
| 617 |
try:
|
|
@@ -670,7 +677,7 @@ def transcribe(voice):
|
|
| 670 |
|
| 671 |
time2=timer()
|
| 672 |
tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
|
| 673 |
-
tprint(f'\n
|
| 674 |
return text,language
|
| 675 |
|
| 676 |
def clone_voice(user_voice,user_text,user_lang):
|
|
@@ -679,7 +686,7 @@ def clone_voice(user_voice,user_text,user_lang):
|
|
| 679 |
if user_text == '':
|
| 680 |
wprint("Please enter text to generate/请输入生成文字")
|
| 681 |
return None
|
| 682 |
-
tprint('⚡Start clone')
|
| 683 |
user_text=trim_text(user_text,user_lang)
|
| 684 |
time1=timer()
|
| 685 |
global gpt_path, sovits_path
|
|
@@ -736,9 +743,9 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
| 736 |
chinese_models = [name for name, _ in models_by_language["中文"]]
|
| 737 |
japanese_models = [name for name, _ in models_by_language["日本語"]]
|
| 738 |
with gr.Row():
|
| 739 |
-
english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump")
|
| 740 |
-
chinese_choice = gr.Radio(chinese_models, label="CN|中文模型")
|
| 741 |
-
japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル")
|
| 742 |
|
| 743 |
plsh='Text must match the selected language option to prevent errors, for example, if English is input but Chinese is selected for generation.\n文字一定要和语言选项匹配,不然要报错,比如输入的是英文,生成语言选中文'
|
| 744 |
limit='Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
|
|
@@ -784,7 +791,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
| 784 |
interactive=True,
|
| 785 |
info='A suitable splitting method can achieve better generation results'
|
| 786 |
)
|
| 787 |
-
volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume')
|
| 788 |
|
| 789 |
|
| 790 |
|
|
@@ -809,7 +816,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
| 809 |
placeholder=plsh,info=limit)
|
| 810 |
|
| 811 |
user_button = gr.Button("✨Clone Voice", variant="primary")
|
| 812 |
-
user_output = gr.Audio(label="💾
|
| 813 |
|
| 814 |
gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
|
| 815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
import soundfile as sf
|
|
|
|
| 16 |
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
| 17 |
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
| 18 |
|
| 19 |
+
|
| 20 |
+
import logging
|
| 21 |
+
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
| 22 |
+
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
| 23 |
+
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
| 24 |
+
logging.getLogger("httpx").setLevel(logging.ERROR)
|
| 25 |
+
logging.getLogger("asyncio").setLevel(logging.ERROR)
|
| 26 |
+
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
|
| 27 |
+
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
| 28 |
+
logging.getLogger("multipart").setLevel(logging.WARNING)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
| 32 |
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
| 33 |
tz = pytz.timezone('Asia/Singapore')
|
|
|
|
| 367 |
startTime=timer()
|
| 368 |
text=trim_text(text,text_language)
|
| 369 |
change_sovits_weights(sovits_path)
|
| 370 |
+
tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
|
| 371 |
change_gpt_weights(gpt_path)
|
| 372 |
+
tprint(f'🏕️LOADED GPT Model: {gpt_path}')
|
| 373 |
|
| 374 |
prompt_language = dict_language[prompt_language]
|
| 375 |
text_language = dict_language[text_language]
|
|
|
|
| 377 |
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
|
| 378 |
text = text.strip("\n")
|
| 379 |
if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
|
| 380 |
+
#print(("实际输入的参考文本:"), prompt_text)
|
| 381 |
+
#print(("📝实际输入的目标文本:"), text)
|
| 382 |
zero_wav = np.zeros(
|
| 383 |
int(hps.data.sampling_rate * 0.3),
|
| 384 |
dtype=np.float16 if is_half == True else np.float32,
|
|
|
|
| 420 |
text = cut5(text)
|
| 421 |
while "\n\n" in text:
|
| 422 |
text = text.replace("\n\n", "\n")
|
| 423 |
+
print(f"🧨实际输入的目标文本(切句后):{text}\n")
|
| 424 |
texts = text.split("\n")
|
| 425 |
texts = merge_short_text_in_array(texts, 5)
|
| 426 |
audio_opt = []
|
|
|
|
| 430 |
if (len(text.strip()) == 0):
|
| 431 |
continue
|
| 432 |
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
| 433 |
+
print(("\n🎈实际输入的目标文本(每句):"), text)
|
| 434 |
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
| 435 |
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
| 436 |
bert = torch.cat([bert1, bert2], 1)
|
|
|
|
| 563 |
# if not re.search(r'[^\w\s]', inp[-1]):
|
| 564 |
# inp += '。'
|
| 565 |
inp = inp.strip("\n")
|
| 566 |
+
punds = r'[,.;?!、,。?!;:…]'
|
| 567 |
items = re.split(f'({punds})', inp)
|
| 568 |
+
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
|
| 569 |
+
if len(items)%2 == 1:
|
| 570 |
+
mergeitems.append(items[-1])
|
| 571 |
+
opt = "\n".join(mergeitems)
|
| 572 |
return opt
|
| 573 |
|
| 574 |
|
| 575 |
+
|
| 576 |
def custom_sort_key(s):
|
| 577 |
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
| 578 |
parts = re.split('(\d+)', s)
|
|
|
|
| 585 |
print(f'UTC+8 - {now} - {text}')
|
| 586 |
|
| 587 |
def wprint(text):
|
| 588 |
+
tprint(text)
|
| 589 |
gr.Warning(text)
|
| 590 |
|
| 591 |
#裁切文本
|
|
|
|
| 594 |
limit_en = 60 #words
|
| 595 |
search_limit_cj = limit_cj+30
|
| 596 |
search_limit_en = limit_en +30
|
| 597 |
+
text = text.replace('\n', '').strip()
|
| 598 |
+
|
| 599 |
if language =='English':
|
| 600 |
words = text.split()
|
| 601 |
if len(words) <= limit_en:
|
| 602 |
return text
|
| 603 |
+
# English
|
| 604 |
for i in range(limit_en, -1, -1):
|
| 605 |
if any(punct in words[i] for punct in splits):
|
| 606 |
return ' '.join(words[:i+1])
|
|
|
|
| 612 |
else:#中文日文
|
| 613 |
if len(text) <= limit_cj:
|
| 614 |
return text
|
| 615 |
+
for i in range(limit_cj, -1, -1):
|
| 616 |
if text[i] in splits:
|
| 617 |
return text[:i+1]
|
| 618 |
+
for i in range(limit_cj, min(len(text), search_limit_cj)):
|
| 619 |
if text[i] in splits:
|
| 620 |
return text[:i+1]
|
| 621 |
+
return text[:limit_cj]
|
| 622 |
|
| 623 |
def duration(audio_file_path):
|
| 624 |
try:
|
|
|
|
| 677 |
|
| 678 |
time2=timer()
|
| 679 |
tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
|
| 680 |
+
tprint(f'\n🔣转录结果:\n 🔣Language:{language} \n 🔣Text:{text}' )
|
| 681 |
return text,language
|
| 682 |
|
| 683 |
def clone_voice(user_voice,user_text,user_lang):
|
|
|
|
| 686 |
if user_text == '':
|
| 687 |
wprint("Please enter text to generate/请输入生成文字")
|
| 688 |
return None
|
| 689 |
+
#tprint('⚡Start clone')
|
| 690 |
user_text=trim_text(user_text,user_lang)
|
| 691 |
time1=timer()
|
| 692 |
global gpt_path, sovits_path
|
|
|
|
| 743 |
chinese_models = [name for name, _ in models_by_language["中文"]]
|
| 744 |
japanese_models = [name for name, _ in models_by_language["日本語"]]
|
| 745 |
with gr.Row():
|
| 746 |
+
english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump",scale=3)
|
| 747 |
+
chinese_choice = gr.Radio(chinese_models, label="CN|中文模型",scale=2)
|
| 748 |
+
japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル",scale=4)
|
| 749 |
|
| 750 |
plsh='Text must match the selected language option to prevent errors, for example, if English is input but Chinese is selected for generation.\n文字一定要和语言选项匹配,不然要报错,比如输入的是英文,生成语言选中文'
|
| 751 |
limit='Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
|
|
|
|
| 791 |
interactive=True,
|
| 792 |
info='A suitable splitting method can achieve better generation results'
|
| 793 |
)
|
| 794 |
+
volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume/音量')
|
| 795 |
|
| 796 |
|
| 797 |
|
|
|
|
| 816 |
placeholder=plsh,info=limit)
|
| 817 |
|
| 818 |
user_button = gr.Button("✨Clone Voice", variant="primary")
|
| 819 |
+
user_output = gr.Audio(label="💾Download it by clicking ⬇️")
|
| 820 |
|
| 821 |
gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
|
| 822 |
|
text/chinese.py
CHANGED
|
@@ -30,7 +30,7 @@ rep_map = {
|
|
| 30 |
"\n": ".",
|
| 31 |
"·": ",",
|
| 32 |
"、": ",",
|
| 33 |
-
"...": "…",
|
| 34 |
"$": ".",
|
| 35 |
"/": ",",
|
| 36 |
"—": "-",
|
|
@@ -169,4 +169,4 @@ if __name__ == "__main__":
|
|
| 169 |
|
| 170 |
# # 示例用法
|
| 171 |
# text = "这是一个示例文本:,你好!这是一个测试..."
|
| 172 |
-
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
|
|
|
| 30 |
"\n": ".",
|
| 31 |
"·": ",",
|
| 32 |
"、": ",",
|
| 33 |
+
# "...": "…",
|
| 34 |
"$": ".",
|
| 35 |
"/": ",",
|
| 36 |
"—": "-",
|
|
|
|
| 169 |
|
| 170 |
# # 示例用法
|
| 171 |
# text = "这是一个示例文本:,你好!这是一个测试..."
|
| 172 |
+
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
text/english.py
CHANGED
|
@@ -169,9 +169,9 @@ def read_dict_new():
|
|
| 169 |
line = line.strip()
|
| 170 |
word_split = line.split(" ")
|
| 171 |
word = word_split[0]
|
| 172 |
-
if word not in g2p_dict:
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
line_index = line_index + 1
|
| 177 |
line = f.readline()
|
|
@@ -231,4 +231,4 @@ if __name__ == "__main__":
|
|
| 231 |
# for group in syllables:
|
| 232 |
# for ph in group:
|
| 233 |
# all_phones.add(ph)
|
| 234 |
-
# print(all_phones)
|
|
|
|
| 169 |
line = line.strip()
|
| 170 |
word_split = line.split(" ")
|
| 171 |
word = word_split[0]
|
| 172 |
+
#if word not in g2p_dict:
|
| 173 |
+
g2p_dict[word] = []
|
| 174 |
+
g2p_dict[word].append(word_split[1:])
|
| 175 |
|
| 176 |
line_index = line_index + 1
|
| 177 |
line = f.readline()
|
|
|
|
| 231 |
# for group in syllables:
|
| 232 |
# for ph in group:
|
| 233 |
# all_phones.add(ph)
|
| 234 |
+
# print(all_phones)
|