Spaces:
Running
on
Zero
Running
on
Zero
Migrated from GitHub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +65 -0
- DataPreProcess/preprocess_lrs2_audio.py +71 -0
- DataPreProcess/process_echoset.py +75 -0
- DataPreProcess/process_librimix.py +57 -0
- ORIGINAL_README.md +95 -0
- assets/TIGER.png +3 -0
- assets/dnr-demo/sample1/dialog.mp4 +3 -0
- assets/dnr-demo/sample1/effect.mp4 +3 -0
- assets/dnr-demo/sample1/mixture.mp4 +3 -0
- assets/dnr-demo/sample1/music.mp4 +3 -0
- assets/dnr-demo/sample2/dialog.mp4 +3 -0
- assets/dnr-demo/sample2/effect.mp4 +3 -0
- assets/dnr-demo/sample2/mixture.mp4 +3 -0
- assets/dnr-demo/sample2/music.mp4 +3 -0
- assets/dnr-demo/sample3/dialog.mp4 +3 -0
- assets/dnr-demo/sample3/effect.mp4 +3 -0
- assets/dnr-demo/sample3/mixture.mp4 +3 -0
- assets/dnr-demo/sample3/music.mp4 +3 -0
- assets/dnr.png +3 -0
- assets/efficiency.png +3 -0
- assets/logo.png +3 -0
- assets/result.png +3 -0
- assets/sample1/GroundTruth/mix.wav +3 -0
- assets/sample1/GroundTruth/s1.wav +3 -0
- assets/sample1/GroundTruth/s2.wav +3 -0
- assets/sample1/TFGNet/s1.wav +3 -0
- assets/sample1/TFGNet/s2.wav +3 -0
- assets/sample1/TIGER/s1.wav +3 -0
- assets/sample1/TIGER/s2.wav +3 -0
- assets/sample1/spec/TFGNet_s1.png +3 -0
- assets/sample1/spec/TFGNet_s2.png +3 -0
- assets/sample1/spec/TIGER_s1.png +3 -0
- assets/sample1/spec/TIGER_s2.png +3 -0
- assets/sample1/spec/ground_truth_s1.png +3 -0
- assets/sample1/spec/ground_truth_s2.png +3 -0
- assets/sample2/GroundTruth/mix.wav +3 -0
- assets/sample2/GroundTruth/s1.wav +3 -0
- assets/sample2/GroundTruth/s2.wav +3 -0
- assets/sample2/TFGNet/s1.wav +3 -0
- assets/sample2/TFGNet/s2.wav +3 -0
- assets/sample2/TIGER/s1.wav +3 -0
- assets/sample2/TIGER/s2.wav +3 -0
- assets/sample2/spec/TFGNet_s1.png +3 -0
- assets/sample2/spec/TFGNet_s2.png +3 -0
- assets/sample2/spec/TIGER_s1.png +3 -0
- assets/sample2/spec/TIGER_s2.png +3 -0
- assets/sample2/spec/ground_truth_s1.png +3 -0
- assets/sample2/spec/ground_truth_s2.png +3 -0
- assets/sample3/GroundTruth/mix.wav +3 -0
- assets/sample3/GroundTruth/s1.wav +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,68 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
assets/TIGER.png filter=lfs diff=lfs merge=lfs -text
|
37 |
+
assets/dnr-demo/sample1/dialog.mp4 filter=lfs diff=lfs merge=lfs -text
|
38 |
+
assets/dnr-demo/sample1/effect.mp4 filter=lfs diff=lfs merge=lfs -text
|
39 |
+
assets/dnr-demo/sample1/mixture.mp4 filter=lfs diff=lfs merge=lfs -text
|
40 |
+
assets/dnr-demo/sample1/music.mp4 filter=lfs diff=lfs merge=lfs -text
|
41 |
+
assets/dnr-demo/sample2/dialog.mp4 filter=lfs diff=lfs merge=lfs -text
|
42 |
+
assets/dnr-demo/sample2/effect.mp4 filter=lfs diff=lfs merge=lfs -text
|
43 |
+
assets/dnr-demo/sample2/mixture.mp4 filter=lfs diff=lfs merge=lfs -text
|
44 |
+
assets/dnr-demo/sample2/music.mp4 filter=lfs diff=lfs merge=lfs -text
|
45 |
+
assets/dnr-demo/sample3/dialog.mp4 filter=lfs diff=lfs merge=lfs -text
|
46 |
+
assets/dnr-demo/sample3/effect.mp4 filter=lfs diff=lfs merge=lfs -text
|
47 |
+
assets/dnr-demo/sample3/mixture.mp4 filter=lfs diff=lfs merge=lfs -text
|
48 |
+
assets/dnr-demo/sample3/music.mp4 filter=lfs diff=lfs merge=lfs -text
|
49 |
+
assets/dnr.png filter=lfs diff=lfs merge=lfs -text
|
50 |
+
assets/efficiency.png filter=lfs diff=lfs merge=lfs -text
|
51 |
+
assets/logo.png filter=lfs diff=lfs merge=lfs -text
|
52 |
+
assets/result.png filter=lfs diff=lfs merge=lfs -text
|
53 |
+
assets/sample1/GroundTruth/mix.wav filter=lfs diff=lfs merge=lfs -text
|
54 |
+
assets/sample1/GroundTruth/s1.wav filter=lfs diff=lfs merge=lfs -text
|
55 |
+
assets/sample1/GroundTruth/s2.wav filter=lfs diff=lfs merge=lfs -text
|
56 |
+
assets/sample1/TFGNet/s1.wav filter=lfs diff=lfs merge=lfs -text
|
57 |
+
assets/sample1/TFGNet/s2.wav filter=lfs diff=lfs merge=lfs -text
|
58 |
+
assets/sample1/TIGER/s1.wav filter=lfs diff=lfs merge=lfs -text
|
59 |
+
assets/sample1/TIGER/s2.wav filter=lfs diff=lfs merge=lfs -text
|
60 |
+
assets/sample1/spec/TFGNet_s1.png filter=lfs diff=lfs merge=lfs -text
|
61 |
+
assets/sample1/spec/TFGNet_s2.png filter=lfs diff=lfs merge=lfs -text
|
62 |
+
assets/sample1/spec/TIGER_s1.png filter=lfs diff=lfs merge=lfs -text
|
63 |
+
assets/sample1/spec/TIGER_s2.png filter=lfs diff=lfs merge=lfs -text
|
64 |
+
assets/sample1/spec/ground_truth_s1.png filter=lfs diff=lfs merge=lfs -text
|
65 |
+
assets/sample1/spec/ground_truth_s2.png filter=lfs diff=lfs merge=lfs -text
|
66 |
+
assets/sample2/GroundTruth/mix.wav filter=lfs diff=lfs merge=lfs -text
|
67 |
+
assets/sample2/GroundTruth/s1.wav filter=lfs diff=lfs merge=lfs -text
|
68 |
+
assets/sample2/GroundTruth/s2.wav filter=lfs diff=lfs merge=lfs -text
|
69 |
+
assets/sample2/TFGNet/s1.wav filter=lfs diff=lfs merge=lfs -text
|
70 |
+
assets/sample2/TFGNet/s2.wav filter=lfs diff=lfs merge=lfs -text
|
71 |
+
assets/sample2/TIGER/s1.wav filter=lfs diff=lfs merge=lfs -text
|
72 |
+
assets/sample2/TIGER/s2.wav filter=lfs diff=lfs merge=lfs -text
|
73 |
+
assets/sample2/spec/TFGNet_s1.png filter=lfs diff=lfs merge=lfs -text
|
74 |
+
assets/sample2/spec/TFGNet_s2.png filter=lfs diff=lfs merge=lfs -text
|
75 |
+
assets/sample2/spec/TIGER_s1.png filter=lfs diff=lfs merge=lfs -text
|
76 |
+
assets/sample2/spec/TIGER_s2.png filter=lfs diff=lfs merge=lfs -text
|
77 |
+
assets/sample2/spec/ground_truth_s1.png filter=lfs diff=lfs merge=lfs -text
|
78 |
+
assets/sample2/spec/ground_truth_s2.png filter=lfs diff=lfs merge=lfs -text
|
79 |
+
assets/sample3/GroundTruth/mix.wav filter=lfs diff=lfs merge=lfs -text
|
80 |
+
assets/sample3/GroundTruth/s1.wav filter=lfs diff=lfs merge=lfs -text
|
81 |
+
assets/sample3/GroundTruth/s2.wav filter=lfs diff=lfs merge=lfs -text
|
82 |
+
assets/sample3/TFGNet/s1.wav filter=lfs diff=lfs merge=lfs -text
|
83 |
+
assets/sample3/TFGNet/s2.wav filter=lfs diff=lfs merge=lfs -text
|
84 |
+
assets/sample3/TIGER/s1.wav filter=lfs diff=lfs merge=lfs -text
|
85 |
+
assets/sample3/TIGER/s2.wav filter=lfs diff=lfs merge=lfs -text
|
86 |
+
assets/sample3/spec/TFGNet_s1.png filter=lfs diff=lfs merge=lfs -text
|
87 |
+
assets/sample3/spec/TFGNet_s2.png filter=lfs diff=lfs merge=lfs -text
|
88 |
+
assets/sample3/spec/TIGER_s1.png filter=lfs diff=lfs merge=lfs -text
|
89 |
+
assets/sample3/spec/TIGER_s2.png filter=lfs diff=lfs merge=lfs -text
|
90 |
+
assets/sample3/spec/ground_truth_s1.png filter=lfs diff=lfs merge=lfs -text
|
91 |
+
assets/sample3/spec/ground_truth_s2.png filter=lfs diff=lfs merge=lfs -text
|
92 |
+
test/mix.wav filter=lfs diff=lfs merge=lfs -text
|
93 |
+
test/s1.wav filter=lfs diff=lfs merge=lfs -text
|
94 |
+
test/s2.wav filter=lfs diff=lfs merge=lfs -text
|
95 |
+
test/spk1.wav filter=lfs diff=lfs merge=lfs -text
|
96 |
+
test/spk2.wav filter=lfs diff=lfs merge=lfs -text
|
97 |
+
test/test_mixture_466.wav filter=lfs diff=lfs merge=lfs -text
|
98 |
+
test/test_target_dialog_466.wav filter=lfs diff=lfs merge=lfs -text
|
99 |
+
test/test_target_effect_466.wav filter=lfs diff=lfs merge=lfs -text
|
100 |
+
test/test_target_music_466.wav filter=lfs diff=lfs merge=lfs -text
|
DataPreProcess/preprocess_lrs2_audio.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import soundfile as sf
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
def get_mouth_path(in_mouth_dir, wav_file, spk, data_type):
    """Return the path of the mouth ``.npz`` file for one speaker of a mixture.

    The mixture file name (without its ``.wav`` extension) is split on ``_``.
    Fields 0 and 1 name the file used for ``s1``; fields 3 and 4 name the file
    used for any other ``spk`` value.
    NOTE(review): presumably names look like
    ``<id1>_<clip1>_<gain1>_<id2>_<clip2>_<gain2>.wav`` -- confirm against the
    dataset.

    Args:
        in_mouth_dir: Directory that holds the ``.npz`` files.
        wav_file: File name of the mixture wav.
        spk: ``"s1"`` selects the first speaker; anything else the second.
        data_type: Unused; kept for interface compatibility.

    Returns:
        ``in_mouth_dir`` joined with ``"<field>_<field>.npz"``.

    Raises:
        IndexError: If the name has too few ``_``-separated fields.
    """
    # Strip only a trailing extension; str.replace would also remove ".wav"
    # occurring in the middle of the name.
    stem = wav_file[: -len(".wav")] if wav_file.endswith(".wav") else wav_file
    fields = stem.split("_")
    if spk == "s1":
        first, second = fields[0], fields[1]
    else:
        first, second = fields[3], fields[4]
    return os.path.join(in_mouth_dir, "{}_{}.npz".format(first, second))
|
19 |
+
|
20 |
+
|
21 |
+
def preprocess_one_dir(in_data_dir, out_dir, data_type, spk):
    """Create the .json index file for one condition.

    Lists ``<in_data_dir>/<data_type>/<spk>`` (sorted by name), keeps the
    ``.wav`` files, and writes ``<out_dir>/<data_type>/<spk>.json`` containing
    one ``[absolute wav path, number of frames]`` entry per file.

    Args:
        in_data_dir: Root directory of the audio data.
        out_dir: Root directory for the generated .json files.
        data_type: Split sub-directory name.
        spk: Source sub-directory name; also used as the .json file name.
    """
    in_dir = os.path.abspath(os.path.join(in_data_dir, data_type, spk))
    file_infos = []
    for wav_file in tqdm(sorted(os.listdir(in_dir))):
        if not wav_file.endswith(".wav"):
            continue
        wav_path = os.path.join(in_dir, wav_file)
        # Close each handle as soon as the frame count is read; leaving them
        # open keeps one file descriptor per wav until garbage collection.
        with sf.SoundFile(wav_path) as audio:
            num_frames = audio.frames
        # NOTE: "mix" and speaker entries share the same layout; the mouth
        # path that speaker entries could carry is not emitted.
        file_infos.append((wav_path, num_frames))
    target_dir = os.path.join(out_dir, data_type)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, spk + ".json"), "w") as f:
        json.dump(file_infos, f, indent=4)
|
46 |
+
|
47 |
+
|
48 |
+
def preprocess_lrs2_audio(inp_args):
    """Generate the .json index for every split/source pair.

    Visits the splits ``tr``, ``cv``, ``tt`` and, inside each, the sources
    ``mix``, ``s1``, ``s2``, calling ``preprocess_one_dir`` with
    ``inp_args.in_dir`` and ``inp_args.out_dir`` for each combination.
    """
    splits = ["tr", "cv", "tt"]
    sources = ["mix", "s1", "s2"]
    for split in splits:
        for source in sources:
            preprocess_one_dir(inp_args.in_dir, inp_args.out_dir, split, source)
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the input/output roots and build the
    # .json index files for all splits.
    parser = argparse.ArgumentParser("LRS2 audio data preprocessing")
    # Both paths are required: with the former default of None the script
    # failed later with a TypeError inside os.path.join.
    parser.add_argument(
        "--in_dir",
        type=str,
        required=True,
        help="Directory path of audio including tr, cv and tt",
    )
    parser.add_argument(
        "--out_dir", type=str, required=True, help="Directory path to put output files"
    )
    args = parser.parse_args()
    print(args)
    preprocess_lrs2_audio(args)
|
DataPreProcess/process_echoset.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import soundfile as sf
|
5 |
+
from tqdm import tqdm
|
6 |
+
from rich import print
|
7 |
+
|
8 |
+
|
9 |
+
def preprocess_one_dir(in_data_dir, out_dir, data_type):
    """Create the mix/s1/s2 .json index files for one split.

    Walks ``<in_data_dir>/<data_type>`` recursively. For every file whose name
    starts with ``mix`` and ends with ``.wav`` it records the mixture and the
    two sibling files obtained by swapping the leading ``mix`` of the file
    name for ``spk1_reverb`` and ``spk2_reverb``. Each record is
    ``[path, number of frames]``; the three lists are written to
    ``mix.json``, ``s1.json`` and ``s2.json`` under ``<out_dir>/<data_type>``.

    Args:
        in_data_dir: Root directory of the audio data.
        out_dir: Root directory for the generated .json files.
        data_type: Split sub-directory name.
    """
    mix_infos = []
    s1_infos = []
    s2_infos = []
    in_dir = os.path.abspath(os.path.join(in_data_dir, data_type))
    print("Process {} set...".format(data_type))
    for root, dirs, files in os.walk(in_dir):
        # Sort in place so the traversal, and hence the .json order, is
        # reproducible across file systems.
        dirs.sort()
        for file in sorted(files):
            if not (file.endswith(".wav") and file.startswith("mix")):
                continue
            # Swap only the file-name prefix. Replacing "mix" in the full
            # path would also rewrite any directory whose name contains it.
            tail = file[len("mix"):]
            targets = (
                (mix_infos, os.path.join(root, file)),
                (s1_infos, os.path.join(root, "spk1_reverb" + tail)),
                (s2_infos, os.path.join(root, "spk2_reverb" + tail)),
            )
            for infos, file_path in targets:
                # sf.info returns the frame count without loading the
                # samples into memory.
                infos.append((file_path, sf.info(file_path).frames))
            print("Process num: {}".format(len(mix_infos)), end="\r")

    target_dir = os.path.join(out_dir, data_type)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)
    for json_name, infos in (
        ("mix.json", mix_infos),
        ("s1.json", s1_infos),
        ("s2.json", s2_infos),
    ):
        with open(os.path.join(target_dir, json_name), "w") as f:
            json.dump(infos, f, indent=4)
|
51 |
+
|
52 |
+
|
53 |
+
def preprocess_lrs2_audio(inp_args):
    """Generate the .json index files for the train, val and test splits.

    Calls ``preprocess_one_dir`` once per split with ``inp_args.in_dir`` and
    ``inp_args.out_dir``.
    NOTE(review): the function name mentions LRS2 although this script lives
    in process_echoset.py; the name is kept so existing callers still work.
    """
    splits = ["train", "val", "test"]
    for split in splits:
        preprocess_one_dir(inp_args.in_dir, inp_args.out_dir, split)
|
59 |
+
|
60 |
+
|
61 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the input/output roots and build the
    # .json index files for all splits.
    # The description and help text previously named LRS2 and tr/cv/tt,
    # copied from another script; this one processes train/val/test.
    parser = argparse.ArgumentParser("EchoSet audio data preprocessing")
    # Both paths are required: with the former default of None the script
    # failed later with a TypeError inside os.path.join.
    parser.add_argument(
        "--in_dir",
        type=str,
        required=True,
        help="Directory path of audio including train, val and test",
    )
    parser.add_argument(
        "--out_dir", type=str, required=True, help="Directory path to put output files"
    )
    args = parser.parse_args()
    print(args)
    preprocess_lrs2_audio(args)
|
75 |
+
|
DataPreProcess/process_librimix.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import soundfile as sf
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
def preprocess_one_dir(in_data_dir, out_dir, data_type, spk):
    """Create the .json index file for one condition.

    Lists ``<in_data_dir>/<data_type>/<spk>`` (sorted by name), keeps the
    ``.wav`` files, and writes ``<out_dir>/<data_type>/<spk>.json`` containing
    one ``[absolute wav path, number of frames]`` entry per file.

    Args:
        in_data_dir: Root directory of the audio data.
        out_dir: Root directory for the generated .json files.
        data_type: Split sub-directory name.
        spk: Source sub-directory name; also used as the .json file name.
    """
    in_dir = os.path.abspath(os.path.join(in_data_dir, data_type, spk))
    file_infos = []
    for wav_file in tqdm(sorted(os.listdir(in_dir))):
        if not wav_file.endswith(".wav"):
            continue
        wav_path = os.path.join(in_dir, wav_file)
        # Close each handle as soon as the frame count is read; leaving them
        # open keeps one file descriptor per wav until garbage collection.
        with sf.SoundFile(wav_path) as audio:
            num_frames = audio.frames
        # The former `spk == "mix"` branch produced the same entry as the
        # other branch, so a single append covers both.
        file_infos.append((wav_path, num_frames))
    target_dir = os.path.join(out_dir, data_type)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, spk + ".json"), "w") as f:
        json.dump(file_infos, f, indent=4)
|
32 |
+
|
33 |
+
|
34 |
+
def preprocess_librimix_audio(inp_args):
    """Generate the .json index for every split/source pair.

    Visits the splits ``train-100``, ``dev``, ``test`` and, inside each, the
    sources ``mix_both``, ``s1``, ``s2``, calling ``preprocess_one_dir`` with
    ``inp_args.in_dir`` and ``inp_args.out_dir`` for each combination.
    """
    splits = ["train-100", "dev", "test"]
    sources = ["mix_both", "s1", "s2"]
    for split in splits:
        for source in sources:
            preprocess_one_dir(inp_args.in_dir, inp_args.out_dir, split, source)
|
42 |
+
|
43 |
+
|
44 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the input/output roots and build the
    # .json index files for all splits.
    parser = argparse.ArgumentParser("Librimix audio data preprocessing")
    # Both paths are required: with the former default of None the script
    # failed later with a TypeError inside os.path.join.
    # The help text previously named tr/cv/tt; this script processes the
    # train-100, dev and test sub-directories.
    parser.add_argument(
        "--in_dir",
        type=str,
        required=True,
        help="Directory path of audio including train-100, dev and test",
    )
    parser.add_argument(
        "--out_dir", type=str, required=True, help="Directory path to put output files"
    )
    args = parser.parse_args()
    print(args)
    preprocess_librimix_audio(args)
|
ORIGINAL_README.md
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<p align="center">
|
2 |
+
<img src="assets/logo.png" alt="Logo" width="150"/>
|
3 |
+
</p>
|
4 |
+
<h3 align="center">TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation</h3>
|
5 |
+
<p align="center">
|
6 |
+
<strong>Mohan Xu<sup>*</sup>, Kai Li<sup>*</sup>, Guo Chen, Xiaolin Hu</strong><br>
|
7 |
+
<strong>Tsinghua University, Beijing, China</strong><br>
|
8 |
+
<strong><sup>*</sup>Equal contribution</strong><br>
|
9 |
+
<a href="https://arxiv.org/abs/2410.01469">📜 ICLR 2025</a> | <a href="https://cslikai.cn/TIGER/">🎶 Demo</a> | <a href="https://huggingface.co/datasets/JusperLee/EchoSet">🤗 Dataset</a>
</p>
|
10 |
+
|
11 |
+
<p align="center">
|
12 |
+
<img src="https://visitor-badge.laobi.icu/badge?page_id=JusperLee.TIGER" alt="访客统计" />
|
13 |
+
<img src="https://img.shields.io/github/stars/JusperLee/TIGER?style=social" alt="GitHub stars" />
|
14 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/license-Apache%202.0-blue.svg" />
|
15 |
+
</p>
|
16 |
+
|
17 |
+
<p align="center">
|
18 |
+
|
19 |
+
> TIGER is a lightweight model for speech separation which effectively extracts key acoustic features through frequency band-split, multi-scale and full-frequency-frame modeling.
|
20 |
+
|
21 |
+
## 💥 News
|
22 |
+
|
23 |
+
- **[2025-01-23]** We release the code and pre-trained model of TIGER! 🚀
|
24 |
+
- **[2025-01-23]** We release the TIGER model and the EchoSet dataset! 🚀
|
25 |
+
|
26 |
+
## 📜 Abstract
|
27 |
+
|
28 |
+
In this paper, we propose a speech separation model with significantly reduced parameter size and computational cost: Time-Frequency Interleaved Gain Extraction and Reconstruction Network (TIGER). TIGER leverages prior knowledge to divide frequency bands and applies compression on frequency information. We employ a multi-scale selective attention (MSA) module to extract contextual features, while introducing a full-frequency-frame attention (F^3A) module to capture both temporal and frequency contextual information. Additionally, to more realistically evaluate the performance of speech separation models in complex acoustic environments, we introduce a novel dataset called EchoSet. This dataset includes noise and more realistic reverberation (e.g., considering object occlusions and material properties), with speech from two speakers overlapping at random proportions. Experimental results demonstrated that TIGER significantly outperformed the state-of-the-art (SOTA) model TF-GridNet on the EchoSet dataset in both inference speed and separation quality, while reducing the number of parameters by 94.3% and the MACs by 95.3%. These results indicate that by utilizing frequency band-split and interleaved modeling structures, TIGER achieves a substantial reduction in parameters and computational costs while maintaining high performance. Notably, TIGER is the first speech separation model with fewer than 1 million parameters that achieves performance close to the SOTA model.
|
29 |
+
|
30 |
+
## TIGER
|
31 |
+
|
32 |
+
Overall pipeline of the model architecture of TIGER and its modules.
|
33 |
+
|
34 |
+

|
35 |
+
|
36 |
+
## Results
|
37 |
+
|
38 |
+
Performance comparisons of TIGER and other existing separation models on ***Libri2Mix, LRS2-2Mix, and EchoSet***. Bold indicates optimal performance, and italics indicate suboptimal performance.
|
39 |
+
|
40 |
+

|
41 |
+
|
42 |
+
Efficiency comparisons of TIGER and other models.
|
43 |
+
|
44 |
+

|
45 |
+
|
46 |
+
Comparison of performance and efficiency of cinematic sound separation models on DnR. '*' means the result comes from the original paper of DnR.
|
47 |
+
|
48 |
+

|
49 |
+
|
50 |
+
## 📦 Installation
|
51 |
+
|
52 |
+
```bash
|
53 |
+
git clone https://github.com/JusperLee/TIGER.git
|
54 |
+
cd TIGER
|
55 |
+
pip install -r requirements.txt
|
56 |
+
```
|
57 |
+
|
58 |
+
## 🚀 Quick Start
|
59 |
+
|
60 |
+
### Test with Pre-trained Model
|
61 |
+
|
62 |
+
```bash
|
63 |
+
# Test using speech
|
64 |
+
python inference_speech.py --audio_path test/mix.wav
|
65 |
+
|
66 |
+
# Test using DnR
|
67 |
+
python inference_dnr.py --audio_path test/test_mixture_466.wav
|
68 |
+
```
|
69 |
+
|
70 |
+
### Train with EchoSet
|
71 |
+
|
72 |
+
```bash
|
73 |
+
python audio_train.py --conf_dir configs/tiger.yml
|
74 |
+
```
|
75 |
+
|
76 |
+
### Evaluate with EchoSet
|
77 |
+
|
78 |
+
```bash
|
79 |
+
python audio_test.py --conf_dir configs/tiger.yml
|
80 |
+
```
|
81 |
+
|
82 |
+
## 📖 Citation
|
83 |
+
|
84 |
+
```bibtex
|
85 |
+
@article{xu2024tiger,
|
86 |
+
title={TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation},
|
87 |
+
author={Xu, Mohan and Li, Kai and Chen, Guo and Hu, Xiaolin},
|
88 |
+
journal={arXiv preprint arXiv:2410.01469},
|
89 |
+
year={2024}
|
90 |
+
}
|
91 |
+
```
|
92 |
+
|
93 |
+
## 📧 Contact
|
94 |
+
|
95 |
+
If you have any questions, please feel free to contact us via `[email protected]`.
|
assets/TIGER.png
ADDED
![]() |
Git LFS Details
|
assets/dnr-demo/sample1/dialog.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f01fa3aeddcca598793810ead2ecf0140f82ebb43e96800578ec52af71f6e35
|
3 |
+
size 2810085
|
assets/dnr-demo/sample1/effect.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7cbf696ebb01eda225c9ebf77856677afe3ccd983b0e2bc45547ee96cd074fb
|
3 |
+
size 2939960
|
assets/dnr-demo/sample1/mixture.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a66b7b61dc5a1e4639321b6255a23cc0e504a1492b9396c7e287d28d39627f35
|
3 |
+
size 2918024
|
assets/dnr-demo/sample1/music.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:413d85706158662a0c8852612cb4536be9b62fb4717cd4cbbc7c6a868194a0ca
|
3 |
+
size 2884375
|
assets/dnr-demo/sample2/dialog.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2677b02f921915a8bafcd9149a297f54657cafbc3464bec833743a6c7ad7779c
|
3 |
+
size 1718587
|
assets/dnr-demo/sample2/effect.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:784a90eadcb0cae93d83ce739d95c06163423d8742448b7ffd373676a2f51a01
|
3 |
+
size 1752110
|
assets/dnr-demo/sample2/mixture.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ab05854c7d9f4bbee331c9e8655b4bdadb4723050332baba027a06b33391fe36
|
3 |
+
size 1781103
|
assets/dnr-demo/sample2/music.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0b92919ad64ee5e11b14b8a8ab58361fc05c36ba4fe0873d6616f0d9eeead39
|
3 |
+
size 1778868
|
assets/dnr-demo/sample3/dialog.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc1e9c2336a61630f7f48561d21fc51c5f68731cf2a247e640b2ca8a5a576da2
|
3 |
+
size 3988060
|
assets/dnr-demo/sample3/effect.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efaf9714d91b8caf53b7f7b4f3e39a811d6e20393fc48bcbc5d23930ec66742a
|
3 |
+
size 4035872
|
assets/dnr-demo/sample3/mixture.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffb8dee7b106e5c7d1da78608b1992b93bb764825bdc9c22c286f6c04b020fa5
|
3 |
+
size 4053761
|
assets/dnr-demo/sample3/music.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90b24a2e5aaf1a4c880c83f68d25179ebb119905658a8f28e8cdc3d33cd24f10
|
3 |
+
size 4062344
|
assets/dnr.png
ADDED
![]() |
Git LFS Details
|
assets/efficiency.png
ADDED
![]() |
Git LFS Details
|
assets/logo.png
ADDED
![]() |
Git LFS Details
|
assets/result.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/GroundTruth/mix.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96096869c977120503b2ef4b53429a4bce510b344d6a274c3371457be854f38e
|
3 |
+
size 192044
|
assets/sample1/GroundTruth/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6128c53d4d57c61eae0b1d7030a155b91db64fb6ca5ec72c982c4f3b6ac3e071
|
3 |
+
size 192044
|
assets/sample1/GroundTruth/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:342f9c15fc0b0445ff82b1026551b33fbafe3ee89c060990ed402ad4ad2a62ee
|
3 |
+
size 192044
|
assets/sample1/TFGNet/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8676fdb6412ed3123d025e145abdd86bbaf2956e55cbc3be5034857ee03b562f
|
3 |
+
size 384080
|
assets/sample1/TFGNet/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac533785786cac0cacc9f5dde2a32c90c7b057da759be4837b44aa7ddf586b94
|
3 |
+
size 384080
|
assets/sample1/TIGER/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8ec094c7c7d9262f734b651735ae209eeaf70f1f7d0e18085ab2db330cdc53f
|
3 |
+
size 384080
|
assets/sample1/TIGER/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92837bf98498e4f7f23d5f918cf042f60f05c534eb1cc4fd27fadce170e38132
|
3 |
+
size 384080
|
assets/sample1/spec/TFGNet_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/spec/TFGNet_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/spec/TIGER_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/spec/TIGER_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/spec/ground_truth_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample1/spec/ground_truth_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/GroundTruth/mix.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:26f24e81d600dab059662afec2d0ef6197875ce8e233e554c423d435527560a8
|
3 |
+
size 192044
|
assets/sample2/GroundTruth/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbfcb21a93c1c1896800a5d057f8e9c497c27c92a36233c90920c6a2e1831ee2
|
3 |
+
size 192044
|
assets/sample2/GroundTruth/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad6579a1ee21ac0d9fb416f9d79a10d65f52f7f5a51759355d259141e966e33e
|
3 |
+
size 192044
|
assets/sample2/TFGNet/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:401a5efe1d3feda8a1c34a4f887bd353114166561d6aa81e28097fc68933004b
|
3 |
+
size 384080
|
assets/sample2/TFGNet/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59c8a509d19b47254ff37a7f15b0194a49bd2b154ad2725ac7d07efe9ff174bd
|
3 |
+
size 384080
|
assets/sample2/TIGER/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e804f6d6c6d63f3756427e8c6832a9a28a0bbbcb808e2f7faf14acdc5adc5d8
|
3 |
+
size 384080
|
assets/sample2/TIGER/s2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45f92e18ce5371fcc3d07a7010e406efad0092090a3003e9252f901e12e353ba
|
3 |
+
size 384080
|
assets/sample2/spec/TFGNet_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/spec/TFGNet_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/spec/TIGER_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/spec/TIGER_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/spec/ground_truth_s1.png
ADDED
![]() |
Git LFS Details
|
assets/sample2/spec/ground_truth_s2.png
ADDED
![]() |
Git LFS Details
|
assets/sample3/GroundTruth/mix.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92be7d03f4ca97e3ee88272490838fe77d64207fb0d326cd70391da061498a8a
|
3 |
+
size 192044
|
assets/sample3/GroundTruth/s1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b4071c7f390ce9e9a3ccfcf58afaec1e5b237aa150c3c87cf46a3c5a361e2f7
|
3 |
+
size 192044
|