fffiloni committed
Commit c4c4663 · verified · Parent: 0d0ce41

Update gradio_app.py

Files changed (1):
  1. gradio_app.py +99 -26
gradio_app.py CHANGED
@@ -1,47 +1,120 @@
  import os
+ import uuid
  import torch
  import torchaudio
+ import torchaudio.transforms as T
  import gradio as gr
  import look2hear.models

- # Setup environment and model
+ # Setup device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache")
- model.to(device)
- model.eval()
+ # Load models
+ dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache")
+ dnr_model.to(device).eval()

- # Processing function
- def separate_audio(audio_file):
+ sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache")
+ sep_model.to(device).eval()
+
+ TARGET_SR = 16000
+ MAX_SPEAKERS = 4
+
+ # --- DnR Function ---
+ def separate_dnr(audio_file):
      audio, sr = torchaudio.load(audio_file)
      audio = audio.to(device)

      with torch.no_grad():
-         all_target_dialog, all_target_effect, all_target_music = model(audio[None])
+         dialog, effect, music = dnr_model(audio[None])
+
+     # Unique output folder
+     session_id = uuid.uuid4().hex[:8]
+     output_dir = os.path.join("output_dnr", session_id)
+     os.makedirs(output_dir, exist_ok=True)

-     # Save outputs
-     dialog_path = "dialog_output.wav"
-     effect_path = "effect_output.wav"
-     music_path = "music_output.wav"
+     dialog_path = os.path.join(output_dir, "dialog.wav")
+     effect_path = os.path.join(output_dir, "effect.wav")
+     music_path = os.path.join(output_dir, "music.wav")

-     torchaudio.save(dialog_path, all_target_dialog.cpu(), sr)
-     torchaudio.save(effect_path, all_target_effect.cpu(), sr)
-     torchaudio.save(music_path, all_target_music.cpu(), sr)
+     torchaudio.save(dialog_path, dialog.cpu(), sr)
+     torchaudio.save(effect_path, effect.cpu(), sr)
+     torchaudio.save(music_path, music.cpu(), sr)

      return dialog_path, effect_path, music_path

- # Gradio UI
- demo = gr.Interface(
-     fn=separate_audio,
-     inputs=gr.Audio(type="filepath", label="Upload Audio File"),
-     outputs=[
-         gr.Audio(label="Dialog", type="filepath"),
-         gr.Audio(label="Effects", type="filepath"),
-         gr.Audio(label="Music", type="filepath")
-     ],
-     title="TIGER-DnR Audio Separator",
-     description="Upload a mixed audio file to separate it into dialog, effects, and music using the TIGER-DnR model."
- )
+ # --- Speaker Separation Function ---
+ def separate_speakers(audio_path):
+     waveform, original_sr = torchaudio.load(audio_path)
+     if original_sr != TARGET_SR:
+         waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
+
+     if waveform.dim() == 1:
+         waveform = waveform.unsqueeze(0)
+     audio_input = waveform.unsqueeze(0).to(device)
+
+     with torch.no_grad():
+         ests_speech = sep_model(audio_input)
+
+     ests_speech = ests_speech.squeeze(0)
+
+     # Unique output folder
+     session_id = uuid.uuid4().hex[:8]
+     output_dir = os.path.join("output_sep", session_id)
+     os.makedirs(output_dir, exist_ok=True)
+
+     output_files = []
+     for i in range(ests_speech.shape[0]):
+         path = os.path.join(output_dir, f"speaker_{i+1}.wav")
+         torchaudio.save(path, ests_speech[i].unsqueeze(0).cpu(), TARGET_SR)
+         output_files.append(path)
+
+     updates = []
+     for i in range(MAX_SPEAKERS):
+         if i < len(output_files):
+             updates.append(gr.update(value=output_files[i], visible=True, label=f"Speaker {i+1}"))
+         else:
+             updates.append(gr.update(value=None, visible=False))
+     return updates
+
+ # --- Gradio App ---
+ with gr.Blocks() as demo:
+     gr.Markdown("# Look2Hear Audio Processing Toolkit")
+
+     with gr.Tabs():
+         # --- Tab 1: DnR ---
+         with gr.Tab("Dialog/Effects/Music Separation (DnR)"):
+             gr.Markdown("### Separate Dialog, Effects, and Music from Mixed Audio")
+
+             dnr_input = gr.Audio(type="filepath", label="Upload Audio File")
+             dnr_button = gr.Button("Separate Audio")
+
+             dnr_output_dialog = gr.Audio(label="Dialog", type="filepath")
+             dnr_output_effect = gr.Audio(label="Effects", type="filepath")
+             dnr_output_music = gr.Audio(label="Music", type="filepath")
+
+             dnr_button.click(
+                 fn=separate_dnr,
+                 inputs=dnr_input,
+                 outputs=[dnr_output_dialog, dnr_output_effect, dnr_output_music]
+             )
+
+         # --- Tab 2: Speaker Separation ---
+         with gr.Tab("Speaker Separation"):
+             gr.Markdown("### Separate Individual Speakers from Mixed Speech")
+
+             sep_input = gr.Audio(type="filepath", label="Upload Speech Audio")
+             sep_button = gr.Button("Separate Speakers")
+
+             gr.Markdown("#### Separated Speakers")
+             sep_outputs = []
+             for i in range(MAX_SPEAKERS):
+                 sep_outputs.append(gr.Audio(label=f"Speaker {i+1}", visible=(i == 0), interactive=False))
+
+             sep_button.click(
+                 fn=separate_speakers,
+                 inputs=sep_input,
+                 outputs=sep_outputs
+             )

  if __name__ == "__main__":
      demo.launch()
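
Both callbacks are plain functions, so the commit can be exercised without launching the UI. A minimal smoke-test sketch, assuming look2hear is installed, the checkpoints download into cache/ as above, and that mix.wav and speech_mix.wav are hypothetical local files; importing gradio_app builds the Blocks layout and loads both models but does not start the server, since launch() is guarded by __main__.

# Hypothetical smoke test; the two .wav paths below are placeholders.
from gradio_app import separate_dnr, separate_speakers

# DnR path: returns three saved file paths.
dialog, effects, music = separate_dnr("mix.wav")
print("DnR outputs:", dialog, effects, music)

# Speaker path: returns one gr.update() per slot (MAX_SPEAKERS total);
# in recent Gradio releases these are plain dicts, and only the visible
# slots carry saved speaker files.
for upd in separate_speakers("speech_mix.wav"):
    if upd.get("visible"):
        print("speaker file:", upd.get("value"))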
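Design note on the speaker tab: Blocks fixes the set of output components when the UI is built, so the app pre-allocates MAX_SPEAKERS audio slots and lets the callback toggle their visibility with gr.update(). A distilled, self-contained sketch of just that pattern, with toy strings standing in for model output:

import gradio as gr

MAX_SLOTS = 4

def fill_slots(n):
    # Show the first n slots with dummy values, hide the rest.
    updates = []
    for i in range(MAX_SLOTS):
        if i < int(n):
            updates.append(gr.update(value=f"result {i + 1}", visible=True))
        else:
            updates.append(gr.update(value=None, visible=False))
    return updates

with gr.Blocks() as toy:
    count = gr.Slider(1, MAX_SLOTS, value=1, step=1, label="Number of results")
    run = gr.Button("Run")
    # Pre-allocate a fixed number of slots; only the first is visible at start.
    slots = [gr.Textbox(label=f"Result {i + 1}", visible=(i == 0)) for i in range(MAX_SLOTS)]
    run.click(fn=fill_slots, inputs=count, outputs=slots)

if __name__ == "__main__":
    toy.launch()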