ylacombe committed
Commit 8a2eb00 · 1 Parent(s): 6acebe0

Update app.py

Files changed (1):
  1. app.py +115 -64
app.py CHANGED
@@ -1,3 +1,6 @@
+
+
+
 import torch
 
 import gradio as gr
@@ -15,6 +18,8 @@ import time
 import demucs.api
 
 
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
+
 
 MODEL_NAME = "openai/whisper-large-v3"
 DEMUCS_MODEL_NAME = "htdemucs_ft"
@@ -39,30 +44,36 @@ def separate_vocal(path):
     return path
 
 
-
-# def separate_vocal(path, track_name, output_folder, demucs_model_name = "htdemucs_ft"):
-#
-#     os.system(f"python3 -m demucs.separate --two-stems=vocals -n {demucs_model_name} {path} -o {output_folder}")
-#
-#     return os.path.join(output_folder, demucs_model_name, track_name, "vocals.wav")
-
-
-def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None):
+def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
     if inputs_path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    if dataset_name is None:
+        raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
+
+    total_step = 4
+    current_step = 0
 
+    current_step += 1
+    progress((current_step, total_step), desc="Transcribe using Whisper.")
+
     sampling_rate, inputs = wavfile.read(inputs_path)
 
     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
+    current_step += 1
+    progress((current_step, total_step), desc="Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
 
+    current_step += 1
+    progress((current_step, total_step), desc="Create dataset.")
+
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in enumerate(chunks):
+        for i,chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for)")):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
@@ -79,10 +90,11 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut
 
         dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
-
-        dataset.push_to_hub(dataset_name, token=oauth_token.token)
+        current_step += 1
+        progress((current_step, total_step), desc="Push dataset.")
+        dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
-    return text
+    return [[transcript] for transcript in transcripts], text
 
 
 def _return_yt_html_embed(yt_url):
@@ -125,11 +137,24 @@ def download_yt_audio(yt_url, filename)
         raise gr.Error(str(err))
 
 
-def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000):
+def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000,
+                  progress=gr.Progress()):
+
+    if yt_url is None:
+        raise gr.Error("No youtube link submitted! Please put a working link.")
+    if dataset_name is None:
+        raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
+
+    total_step = 5
+    current_step = 0
+
+    current_step += 1
+    progress((current_step, total_step), desc="Load video.")
     html_embed_str = _return_yt_html_embed(yt_url)
 
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
+
         download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
             inputs_path = f.read()
@@ -137,18 +162,25 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
     inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
+    current_step += 1
+    progress((current_step, total_step), desc="Transcribe using Whisper.")
    out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
 
+    current_step += 1
+    progress((current_step, total_step), desc="Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)
 
+    current_step += 1
+    progress((current_step, total_step), desc="Create dataset.")
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in enumerate(chunks):
+        for i,chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for).")):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
@@ -165,23 +197,25 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
 
         dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
-
-        dataset.push_to_hub(dataset_name, token=oauth_token.token)
+        current_step += 1
+        progress((current_step, total_step), desc="Push dataset.")
+        dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
 
-    return html_embed_str, text
+    return html_embed_str, [[transcript] for transcript in transcripts], text
 
 
 def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars = ".!:;?", min_duration = 5):
     # merge chunks as long as merged audio duration is lower than min_duration and that a stop character is not met
     # return list of dictionnaries (text, audio)
     # min duration is in seconds
-
     min_duration = int(min_duration * sampling_rate)
+
 
     new_chunks = []
     while chunks:
         current_chunk = chunks.pop(0)
+
         begin, end = current_chunk["timestamp"]
         begin, end = int(begin*sampling_rate), int(end*sampling_rate)
 
@@ -193,7 +227,6 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
         chunk_to_concat = [audio_array[begin:end]]
         while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
             ch = chunks.pop(0)
-
             begin, end = ch["timestamp"]
             begin, end = int(begin*sampling_rate), int(end*sampling_rate)
             current_dur += end-begin
@@ -209,53 +242,71 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
             "audio": np.concatenate(chunk_to_concat),
         })
         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
-
+
     return new_chunks
 
 
-
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
-        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
-    ],
-    outputs="text",
-    theme="huggingface",
-    title="Create your own TTS dataset using your own recordings",
-    description=(
-        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-        " of arbitrary length. It then merge chunks of audio and push it to the hub."
-    ),
-    allow_flagging="never",
-)
 
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
-        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
-    ],
-    outputs=["html", "text"],
-    theme="huggingface",
-    title="Create your own TTS dataset using Youtube",
-    description=(
-        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-        " of arbitrary length. It then merge chunks of audio and push it to the hub."
-    ),
-    allow_flagging="never",
-)
-
-with gr.Blocks() as demo:
+css = """
+#intro{
+    max-width: 100%;
+    text-align: center;
+    margin: 0 auto;
+}
+"""
+with gr.Blocks(css=css) as demo:
     with gr.Row():
-        gr.LoginButton().activate()
+        gr.LoginButton()
         gr.LogoutButton()
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Microphone or Audio file", "YouTube"])
-
-demo.launch(debug=True)
+
+    with gr.Tab("YouTube"):
+        gr.Markdown("Create your own TTS dataset using Youtube", elem_id="intro")
+        gr.Markdown(
+            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+            " of arbitrary length. It then merge chunks of audio and push it to the hub."
+        )
+        with gr.Row():
+            with gr.Column():
+                audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
+                task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+                textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
+
+                with gr.Row():
+                    clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
+                    submit_youtube = gr.Button("Submit")
+
+            with gr.Column():
+                html_youtube = gr.HTML()
+                dataset_youtube = gr.Dataset(label="Transcribed samples.",components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
+                transcript_youtube = gr.Textbox(label="Transcription")
+
+    with gr.Tab("Microphone or Audio file"):
+        gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
+        gr.Markdown(
+            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+            " of arbitrary length. It then merge chunks of audio and push it to the hub."
+        )
+        with gr.Row():
+            with gr.Column():
+                audio_file = gr.Audio(type="filepath")
+                task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+                textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
+
+                with gr.Row():
+                    clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
+                    submit_file = gr.Button("Submit")
+
+            with gr.Column():
+                dataset_file = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
+                transcript_file = gr.Textbox(label="Transcription")
+
+
+
+    submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[dataset_file, transcript_file])
+    submit_youtube.click(yt_transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[html_youtube, dataset_youtube, transcript_youtube])
+
+demo.launch(debug=True)
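
Note on the progress pattern this commit introduces: both transcribe() and yt_transcribe() now accept progress=gr.Progress(), report coarse pipeline stages by calling the tracker with a (current_step, total_step) tuple, and wrap the chunk loop in progress.tqdm for a per-item bar. Below is a minimal, self-contained sketch of that same pattern in isolation; the process() handler and the slider/button wiring are illustrative placeholders (not from this commit), and it assumes a Gradio release where gr.Progress accepts a (step, total) tuple and exposes .tqdm.

import time
import gradio as gr

def process(n_items, progress=gr.Progress()):
    # Hypothetical handler mirroring the commit's step counter + per-item bar.
    total_step = 3
    current_step = 0

    current_step += 1
    progress((current_step, total_step), desc="Load data.")  # shown as step 1/3
    time.sleep(0.5)

    current_step += 1
    progress((current_step, total_step), desc="Process items.")
    # progress.tqdm wraps an iterable, like the commit's loop over merged chunks
    for _ in progress.tqdm(range(int(n_items)), desc="Processing items"):
        time.sleep(0.1)

    current_step += 1
    progress((current_step, total_step), desc="Done.")
    return f"Processed {int(n_items)} items."

with gr.Blocks() as demo:
    n = gr.Slider(1, 20, value=5, step=1, label="Items")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(process, inputs=n, outputs=out)

demo.launch()

On the push side, both functions now guard the token the same way: dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token) passes the OAuth token when the user is logged in and otherwise passes None, in which case push_to_hub falls back to whatever Hub token is cached locally.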