Katock committed on
Commit 9e75f13 · 1 Parent(s): 185cd8d
Files changed (2)
  1. app.py +16 -18
  2. inference/slicer.py +1 -1
app.py CHANGED
@@ -6,8 +6,6 @@ import gradio as gr
 import gradio.processing_utils as gr_processing_utils
 import librosa
 import numpy as np
-import soundfile
-import torch
 
 from inference.infer_tool import Svc
 
@@ -18,17 +16,17 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 limitation = os.getenv("SYSTEM") == "spaces"  # limit audio length in huggingface spaces
 
-audio_postprocess_ori = gr.Audio.postprocess
+# audio_postprocess_ori = gr.Audio.postprocess
 
 
-def audio_postprocess(self, y):
-    data = audio_postprocess_ori(self, y)
-    if data is None:
-        return None
-    return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
-
-
-gr.Audio.postprocess = audio_postprocess
+# def audio_postprocess(self, y):
+#     data = audio_postprocess_ori(self, y)
+#     if data is None:
+#         return None
+#     return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
+#
+#
+# gr.Audio.postprocess = audio_postprocess
 
 
 def create_vc_fn(model, spk):
@@ -40,13 +38,13 @@ def create_vc_fn(model, spk):
         if duration > 20 and limitation:
             return "请上传小于20秒的音频,或点击右上角裁剪", None
         print("audio1: ", audio)
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        if sampling_rate != 16000:
-            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-        print("audio2: ", audio)
-        input_audio = sampling_rate, audio
+        # audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        # if len(audio.shape) > 1:
+        #     audio = librosa.to_mono(audio.transpose(1, 0))
+        # if sampling_rate != 16000:
+        #     audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+        # print("audio2: ", audio)
+        # input_audio = sampling_rate, audio
 
         # raw_path = io.BytesIO()
         # soundfile.write(raw_path, audio, sampling_rate, format="wav")
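The preprocessing commented out in the last hunk scales Gradio's integer PCM output to float32, downmixes to mono, and resamples to 16 kHz, which suggests the raw (sampling_rate, audio) tuple is now passed on to the slicer and Svc wrapper as-is. Below is a minimal standalone sketch of that same pipeline for reference; the helper name preprocess_for_svc is hypothetical and not part of this repository, and int16 input from gr.Audio is assumed.

import librosa
import numpy as np


def preprocess_for_svc(sampling_rate, audio):
    # Hypothetical helper mirroring the preprocessing commented out above; not in this repo.
    # Scale integer PCM (typically int16 from gr.Audio) to float32 in [-1.0, 1.0].
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Downmix multi-channel input to mono; librosa.to_mono expects (channels, samples).
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Resample to the 16 kHz rate the original code targeted.
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    return 16000, audio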
inference/slicer.py CHANGED
@@ -134,7 +134,7 @@ def chunks2audio(input_audio, chunks):
     sr, audio = input_audio
     if len(audio.shape) == 2 and audio.shape[1] >= 2:
         audio = torch.mean(audio, dim=0).unsqueeze(0)
-    # audio = audio.cpu().numpy()[0]
+    audio = audio.cpu().numpy()[0]
     result = []
     for k, v in chunks.items():
         tag = v["split_time"].split(",")
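Restoring audio = audio.cpu().numpy()[0] means the per-chunk slicing below operates on a plain 1-D NumPy array on the host rather than on the (1, N) tensor produced by the downmix. A minimal, self-contained illustration of that conversion, using a synthetic tensor rather than data from the repository:

import numpy as np
import torch

# Synthetic (1, N) tensor standing in for the downmixed audio inside chunks2audio.
audio_tensor = torch.zeros(1, 16000)
# Move to host memory and drop the leading axis, matching audio.cpu().numpy()[0].
audio_np = audio_tensor.cpu().numpy()[0]
assert isinstance(audio_np, np.ndarray) and audio_np.ndim == 1
# Sample-index slicing, as done per chunk from the "split_time" boundaries, now yields NumPy segments.
start, end = 0, 8000
segment = audio_np[start:end]
print(segment.shape)  # (8000,)

Without this conversion the segments collected in result would be torch tensor views, which NumPy-oriented consumers of chunks2audio's output would then have to convert themselves.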