Mahiruoshi committed on
Commit
9e07225
·
verified ·
1 Parent(s): 014b277

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -143
app.py CHANGED
@@ -3,7 +3,6 @@ import os
3
  from pathlib import Path
4
 
5
  import logging
6
- import uuid
7
  import re_matching
8
 
9
  logging.getLogger("numba").setLevel(logging.WARNING)
@@ -59,9 +58,16 @@ from bs4 import BeautifulSoup
59
  import jieba
60
  import romajitable
61
 
62
- from flask import Flask, request, jsonify, render_template_string, send_file
63
- from flask_cors import CORS
64
- from scipy.io.wavfile import write
 
 
 
 
 
 
 
65
  net_g = None
66
 
67
  device = (
@@ -91,18 +97,7 @@ BandList = {
91
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
92
  }
93
 
94
- webBase = 'https://mahiruoshi-bangdream-bert-vits2.hf.space/'
95
-
96
- port = 7860
97
-
98
- languages = [ "Auto", "ZH", "JP"]
99
- modelPaths = []
100
- modes = ['pyopenjtalk-V2.3-Katakana','fugashi-V2.3-Katakana','pyopenjtalk-V2.3-Katakana-Katakana','fugashi-V2.3-Katakana-Katakana','onnx-V2.3']
101
- sentence_modes = ['sentence','paragraph']
102
- for dirpath, dirnames, filenames in os.walk('Data/BangDream/models/'):
103
- for filename in filenames:
104
- modelPaths.append(os.path.join(dirpath, filename))
105
- hps = utils.get_hparams_from_file('Data/BangDream/config.json')
106
 
107
  def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
108
  """
@@ -508,14 +503,14 @@ def infer(
508
  style_text=None,
509
  style_weight=0.7,
510
  language = "Auto",
511
- mode = 'pyopenjtalk-V2.3-Katakana',
512
  skip_start=False,
513
  skip_end=False,
514
  ):
515
  if style_text == None:
516
  style_text = ""
517
  style_weight=0,
518
- if mode == 'fugashi-V2.3-Katakana':
519
  text = kanji_to_hiragana(text) if is_japanese(text) else text
520
  if language == "JP":
521
  text = translate(text,"jp")
@@ -635,7 +630,7 @@ def generate_audio_and_srt_for_group(
635
  """
636
 
637
  for sentence in group:
638
-
639
  if len(sentence) > 1:
640
  FakeSpeaker = sentence.split("|")[0]
641
  print(FakeSpeaker)
@@ -647,7 +642,7 @@ def generate_audio_and_srt_for_group(
647
  speaker = i.split("|")[0]
648
  if sentence != '\n':
649
  text = (remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。")
650
- if mode == 'pyopenjtalk-V2.3-Katakana' or mode == 'fugashi-V2.3-Katakana':
651
  #print(f'{text}:{sdp_ratio}:{noise_scale}:{noise_scale_w}:{length_scale}:{length_scale}:{speaker}:{language}:{mode}:{skip_start}:{skip_end}')
652
  audio = infer(
653
  text,
@@ -672,7 +667,8 @@ def generate_audio_and_srt_for_group(
672
  end_time = start_time + duration + silenceTime
673
  ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
674
  start_time = end_time
675
-
 
676
  wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
677
  ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
678
  write(wav_filename, sampling_rate, gr.processing_utils.convert_to_16_bit_wav(np.concatenate(audio_fin)))
@@ -683,7 +679,7 @@ def generate_audio_and_srt_for_group(
683
 
684
  def generate_audio(
685
  inputFile,
686
- groupsize,
687
  filepath,
688
  silenceTime,
689
  speakerList,
@@ -696,12 +692,15 @@ def generate_audio(
696
  style_text=None,
697
  style_weight=0.7,
698
  language = "Auto",
699
- mode = 'pyopenjtalk-V2.3-Katakana',
700
  sentence_mode = 'sentence',
701
  skip_start=False,
702
  skip_end=False,
703
  ):
704
- if mode == 'pyopenjtalk-V2.3-Katakana' or mode == 'fugashi-V2.3-Katakana':
 
 
 
705
  if sentence_mode == 'sentence':
706
  audio = infer(
707
  text,
@@ -719,13 +718,11 @@ def generate_audio(
719
  )
720
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
721
  if sentence_mode == 'paragraph':
722
- GROUP_SIZE = groupsize
723
  directory_path = filepath if torch.cuda.is_available() else "books"
724
  if os.path.exists(directory_path):
725
  shutil.rmtree(directory_path)
726
  os.makedirs(directory_path)
727
- if inputFile:
728
- text = extract_text_from_file(inputFile.name)
729
  if language == 'Auto':
730
  sentences = extrac(extract_and_convert(text))
731
  else:
@@ -756,129 +753,167 @@ def generate_audio(
756
  if not torch.cuda.is_available():
757
  return result
758
  return result
759
-
760
- Flaskapp = Flask(__name__)
761
- CORS(Flaskapp)
762
- @Flaskapp.route('/', methods=['GET', 'POST'])
763
-
764
- def tts():
765
- if request.method == 'POST':
766
- input = request.json
767
- inputFile = None
768
- filepath = input['filepath']
769
- groupSize = input['groupSize']
770
- text = input['text']
771
- sdp_ratio = input['sdp_ratio']
772
- noise_scale = input['noise_scale']
773
- noise_scale_w = input['noise_scale_w']
774
- length_scale = input['length_scale']
775
- sid = input['speaker']
776
- style_text = input['style_text']
777
- style_weight = input['style_weight']
778
- language = input['language']
779
- mode = input['mode']
780
- sentence_mode = input['sentence_mode']
781
- skip_start = input['skip_start']
782
- skip_end = input['skip_end']
783
- speakerList = input['speakerList']
784
- silenceTime = input['silenceTime']
785
- samplerate, audio = generate_audio(
786
- inputFile,
787
- groupSize,
788
- filepath,
789
- silenceTime,
790
- speakerList,
791
- text,
792
- sdp_ratio,
793
- noise_scale,
794
- noise_scale_w,
795
- length_scale,
796
- sid,
797
- style_text,
798
- style_weight,
799
- language,
800
- mode,
801
- sentence_mode,
802
- skip_start,
803
- skip_end,
804
- )
805
- unique_filename = f"temp{uuid.uuid4()}.wav"
806
- write(unique_filename, samplerate, audio)
807
- with open(unique_filename ,'rb') as bit:
808
- wav_bytes = bit.read()
809
- os.remove(unique_filename)
810
- headers = {
811
- 'Content-Type': 'audio/wav',
812
- 'Text': unique_filename .encode('utf-8')}
813
- return wav_bytes, 200, headers
814
- groupSize = request.args.get('groupSize', default = 50, type = int)
815
- text = request.args.get('text', default = '', type = str)
816
- sdp_ratio = request.args.get('sdp_ratio', default = 0.5, type = float)
817
- noise_scale = request.args.get('noise_scale', default = 0.6, type = float)
818
- noise_scale_w = request.args.get('noise_scale_w', default = 0.667, type = float)
819
- length_scale = request.args.get('length_scale', default = 1, type = float)
820
- sid = request.args.get('speaker', default = '八千代', type = str)
821
- style_text = request.args.get('style_text', default = '', type = str)
822
- style_weight = request.args.get('style_weight', default = 0.7, type = float)
823
- language = request.args.get('language', default = 'Auto', type = str)
824
- mode = request.args.get('mode', default = 'pyopenjtalk-V2.3-Katakana', type = str)
825
- sentence_mode = request.args.get('sentence_mode', default = 'sentence', type = str)
826
- skip_start = request.args.get('skip_start', default = False, type = bool)
827
- skip_end = request.args.get('skip_end', default = False, type = bool)
828
- speakerList = request.args.get('speakerList', default = '', type = str)
829
- silenceTime = request.args.get('silenceTime', default = 0.1, type = float)
830
- inputFile = None
831
- if not sid or not text:
832
- return render_template_string(f"""
833
- <!DOCTYPE html>
834
- <html>
835
- <head>
836
- <title>TTS API Documentation</title>
837
- </head>
838
- <body>
839
- <iframe src={webBase} style="width:100%; height:100vh; border:none;"></iframe>
840
- </body>
841
- </html>
842
- """)
843
- samplerate, audio = generate_audio(
844
- inputFile,
845
- groupSize,
846
- None,
847
- silenceTime,
848
- speakerList,
849
- text,
850
- sdp_ratio,
851
- noise_scale,
852
- noise_scale_w,
853
- length_scale,
854
- sid,
855
- style_text,
856
- style_weight,
857
- language,
858
- mode,
859
- sentence_mode,
860
- skip_start,
861
- skip_end,
862
- )
863
- unique_filename = f"temp{uuid.uuid4()}.wav"
864
- write(unique_filename, samplerate, audio)
865
- with open(unique_filename ,'rb') as bit:
866
- wav_bytes = bit.read()
867
- os.remove(unique_filename)
868
- headers = {
869
- 'Content-Type': 'audio/wav',
870
- 'Text': unique_filename .encode('utf-8')}
871
- return wav_bytes, 200, headers
872
-
873
 
874
  if __name__ == "__main__":
875
  download_unidic()
876
  tagger = Tagger()
 
 
 
 
877
  net_g = get_net_g(
878
  model_path=modelPaths[-1], device=device, hps=hps
879
  )
880
  speaker_ids = hps.data.spk2id
881
  speakers = list(speaker_ids.keys())
882
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
883
  print("推理页面已开启!")
884
- Flaskapp.run(host="0.0.0.0", port=port,debug=True)
 
3
  from pathlib import Path
4
 
5
  import logging
 
6
  import re_matching
7
 
8
  logging.getLogger("numba").setLevel(logging.WARNING)
 
58
  import jieba
59
  import romajitable
60
 
61
+ webBase = {
62
+ 'pyopenjtalk-V2.3-Katakana': 'https://mahiruoshi-mygo-vits-bert.hf.space/',
63
+ 'fugashi-V2.3-Katakana': 'https://mahiruoshi-mygo-vits-bert.hf.space/',
64
+ }
65
+
66
+ languages = [ "Auto", "ZH", "JP"]
67
+ modelPaths = []
68
+ modes = ['pyopenjtalk-V2.3','fugashi-V2.3']
69
+ sentence_modes = ['sentence','paragraph']
70
+
71
  net_g = None
72
 
73
  device = (
 
97
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
98
  }
99
 
100
+ #翻译
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
103
  """
 
503
  style_text=None,
504
  style_weight=0.7,
505
  language = "Auto",
506
+ mode = 'pyopenjtalk-V2.3',
507
  skip_start=False,
508
  skip_end=False,
509
  ):
510
  if style_text == None:
511
  style_text = ""
512
  style_weight=0,
513
+ if mode == 'fugashi-V2.3':
514
  text = kanji_to_hiragana(text) if is_japanese(text) else text
515
  if language == "JP":
516
  text = translate(text,"jp")
 
630
  """
631
 
632
  for sentence in group:
633
+ try:
634
  if len(sentence) > 1:
635
  FakeSpeaker = sentence.split("|")[0]
636
  print(FakeSpeaker)
 
642
  speaker = i.split("|")[0]
643
  if sentence != '\n':
644
  text = (remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。")
645
+ if mode == 'pyopenjtalk-V2.3' or mode == 'fugashi-V2.3':
646
  #print(f'{text}:{sdp_ratio}:{noise_scale}:{noise_scale_w}:{length_scale}:{length_scale}:{speaker}:{language}:{mode}:{skip_start}:{skip_end}')
647
  audio = infer(
648
  text,
 
667
  end_time = start_time + duration + silenceTime
668
  ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
669
  start_time = end_time
670
+ except:
671
+ pass
672
  wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
673
  ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
674
  write(wav_filename, sampling_rate, gr.processing_utils.convert_to_16_bit_wav(np.concatenate(audio_fin)))
 
679
 
680
  def generate_audio(
681
  inputFile,
682
+ groupSize,
683
  filepath,
684
  silenceTime,
685
  speakerList,
 
692
  style_text=None,
693
  style_weight=0.7,
694
  language = "Auto",
695
+ mode = 'pyopenjtalk-V2.3',
696
  sentence_mode = 'sentence',
697
  skip_start=False,
698
  skip_end=False,
699
  ):
700
+ if inputFile:
701
+ text = extract_text_from_file(inputFile.name)
702
+ sentence_mode = 'paragraph'
703
+ if mode == 'pyopenjtalk-V2.3' or mode == 'fugashi-V2.3':
704
  if sentence_mode == 'sentence':
705
  audio = infer(
706
  text,
 
718
  )
719
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
720
  if sentence_mode == 'paragraph':
721
+ GROUP_SIZE = groupSize
722
  directory_path = filepath if torch.cuda.is_available() else "books"
723
  if os.path.exists(directory_path):
724
  shutil.rmtree(directory_path)
725
  os.makedirs(directory_path)
 
 
726
  if language == 'Auto':
727
  sentences = extrac(extract_and_convert(text))
728
  else:
 
753
  if not torch.cuda.is_available():
754
  return result
755
  return result
756
+ #url = f'{webBase[mode]}?text={text}&speaker={sid}&sdp_ratio={sdp_ratio}&noise_scale={noise_scale}&noise_scale_w={noise_scale_w}&length_scale={length_scale}&language={language}&skip_start={skip_start}&skip_end={skip_end}'
757
+ #print(url)
758
+ #res = requests.get(url)
759
+ #改用post
760
+ res = requests.post(webBase[mode], json = {
761
+ "groupSize": groupSize,
762
+ "filepath": filepath,
763
+ "silenceTime": silenceTime,
764
+ "speakerList": speakerList,
765
+ "text": text,
766
+ "speaker": sid,
767
+ "sdp_ratio": sdp_ratio,
768
+ "noise_scale": noise_scale,
769
+ "noise_scale_w": noise_scale_w,
770
+ "length_scale": length_scale,
771
+ "language": language,
772
+ "skip_start": skip_start,
773
+ "skip_end": skip_end,
774
+ "mode": mode,
775
+ "sentence_mode": sentence_mode,
776
+ "style_text": style_text,
777
+ "style_weight": style_weight
778
+ })
779
+ audio = res.content
780
+ with open('output.wav', 'wb') as code:
781
+ code.write(audio)
782
+ file_path = "output.wav"
783
+ return file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
785
  if __name__ == "__main__":
786
  download_unidic()
787
  tagger = Tagger()
788
+ for dirpath, dirnames, filenames in os.walk('Data/BangDream/models/'):
789
+ for filename in filenames:
790
+ modelPaths.append(os.path.join(dirpath, filename))
791
+ hps = utils.get_hparams_from_file('Data/BangDream/config.json')
792
  net_g = get_net_g(
793
  model_path=modelPaths[-1], device=device, hps=hps
794
  )
795
  speaker_ids = hps.data.spk2id
796
  speakers = list(speaker_ids.keys())
797
+ with gr.Blocks() as app:
798
+ gr.Markdown(value="""
799
+ ([Bert-Vits2](https://github.com/Stardust-minus/Bert-VITS2) V2.3)少歌邦邦全员在线语音合成\n
800
+ [好玩的](http://love.soyorin.top/)\n
801
+ 该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
802
+ API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
803
+ 调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text={{speakText}}&speaker=chosen_speaker\n
804
+ 推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
805
+ 二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
806
+ 训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
807
+ BangDream数据集下载[链接](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/%E7%88%AC%E8%99%AB/SortPathUrl.txt)\n
808
+ !!!注意:huggingface容器仅用作展示,建议在右上角更多选项中克隆本项目或Docker运行app.py/server.py,环境参考requirements.txt\n""")
809
+ for band in BandList:
810
+ with gr.TabItem(band):
811
+ for name in BandList[band]:
812
+ with gr.TabItem(name):
813
+ with gr.Row():
814
+ with gr.Column():
815
+ with gr.Row():
816
+ gr.Markdown(
817
+ '<div align="center">'
818
+ f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
819
+ '</div>'
820
+ )
821
+ with gr.Accordion(label="参数设定", open=False):
822
+ sdp_ratio = gr.Slider(
823
+ minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
824
+ )
825
+ noise_scale = gr.Slider(
826
+ minimum=0.1, maximum=2, value=0.6, step=0.01, label="Noise:感情调节"
827
+ )
828
+ noise_scale_w = gr.Slider(
829
+ minimum=0.1, maximum=2, value=0.667, step=0.01, label="Noise_W:音素长度"
830
+ )
831
+ skip_start = gr.Checkbox(label="skip_start")
832
+ skip_end = gr.Checkbox(label="skip_end")
833
+ speaker = gr.Dropdown(
834
+ choices=speakers, value=name, label="说话人"
835
+ )
836
+ length_scale = gr.Slider(
837
+ minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
838
+ )
839
+ language = gr.Dropdown(
840
+ choices=languages, value="Auto", label="语言选择,若不选自动则会将输入语言翻译为日语或中文"
841
+ )
842
+ mode = gr.Dropdown(
843
+ choices=modes, value="pyopenjtalk-V2.3", label="TTS模式,合成少歌角色需要切换成 pyopenjtalk-V2.3-Katakana "
844
+ )
845
+ sentence_mode = gr.Dropdown(
846
+ choices=sentence_modes, value="sentence", label="文本合成模式"
847
+ )
848
+ with gr.Accordion(label="扩展选项", open=False):
849
+ inputFile = gr.UploadButton(label="txt文件输入")
850
+ speakerList = gr.TextArea(
851
+ label="角色对应表,如果你记不住角色名可以这样,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
852
+ value = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
853
+ )
854
+ groupSize = gr.Slider(
855
+ minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大句子数"
856
+ )
857
+ filepath = gr.TextArea(
858
+ label="本地合成时的音频存储文件夹(会清空文件夹,别把C盘删了)",
859
+ value = "D:/audiobook/book1",
860
+ )
861
+ silenceTime = gr.Slider(
862
+ minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
863
+ )
864
+ modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
865
+ btnMod = gr.Button("载入模型")
866
+ statusa = gr.TextArea(label = "模型加载状态")
867
+ btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
868
+ with gr.Column():
869
+ text = gr.TextArea(
870
+ label="文本输入,可用'|'分割说话人和文本,注意换行",
871
+ info="输入纯日语或者中文",
872
+ #placeholder=f"{name}|你觉得你是职业歌手吗\n真白|我觉得我是",
873
+ value=f"{name}|你觉得你是职业歌手吗\n真白|我觉得我是"
874
+ )
875
+ style_text = gr.Textbox(
876
+ label="情感辅助文本",
877
+ info="语言保持跟主文本一致,文本可以参考训练集:https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/filelists/Mygo.list)",
878
+ placeholder="使用辅助文本的语意来辅助生成对话(语言保持与主文本相同)\n\n"
879
+ "**注意**:不要使用**指令式文本**(如:开心),要使用**带有强烈情感的文本**(如:我好快乐!!!)"
880
+ )
881
+ style_weight = gr.Slider(
882
+ minimum=0,
883
+ maximum=1,
884
+ value=0.7,
885
+ step=0.1,
886
+ label="Weight",
887
+ info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
888
+ )
889
+ btn = gr.Button("点击生成", variant="primary")
890
+ audio_output = gr.Audio(label="Output Audio")
891
+ btntran = gr.Button("快速中翻日")
892
+ translateResult = gr.TextArea(label="使用百度翻译",placeholder="从这里复制翻译后的文本")
893
+ btntran.click(translate, inputs=[text], outputs = [translateResult])
894
+ btn.click(
895
+ generate_audio,
896
+ inputs=[
897
+ inputFile,
898
+ groupSize,
899
+ filepath,
900
+ silenceTime,
901
+ speakerList,
902
+ text,
903
+ sdp_ratio,
904
+ noise_scale,
905
+ noise_scale_w,
906
+ length_scale,
907
+ speaker,
908
+ style_text,
909
+ style_weight,
910
+ language,
911
+ mode,
912
+ sentence_mode,
913
+ skip_start,
914
+ skip_end
915
+ ],
916
+ outputs=[audio_output],
917
+ )
918
  print("推理页面已开启!")
919
+ app.launch()