added more curse words
app.py
CHANGED
@@ -13,7 +13,15 @@ import shutil
 from fsp import analyze_audio, apply_censoring, default_curse_words, seconds_to_minutes
 from datetime import datetime
 
-
+
+###### Ideas ########
+# - Javascript for toggling individual words to mute --> playright
+# - Use LLM to determine what is "explicit" in the ouputs --> structured output?
+# - Mute explicit nonvocal sounds: e.g., gun shots, sex scenes, etc.
+# - Additional words to censor at the beginning screen ?
+
+
+# Print the start time
 print(f"Executing {os.path.basename(__file__)} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
 ################ Load models
@@ -33,6 +41,7 @@ tox_pipe = pipeline("text-classification", model=tox_model, tokenizer=tox_tokeni
 ## 2. Create our Whisper model from the LoRA weights
 ## Whisper_timestamped requires the entire model to be saved, this saves static storage space by only saving the lora config
 def load_whisper_model(model_path, lora_config, base_model_name="openai/whisper-medium.en"):
+    # If the model exists already we're good to go
     if os.path.exists('./whisper-medium-ft/model.safetensors'):
         print(f'Fine tuned model at {model_path} already exists')
         return
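The hunk above only shows the early-return check; the rest of load_whisper_model (not part of this diff) has to rebuild a full checkpoint from the stored LoRA adapter, since whisper_timestamped needs the whole model on disk. A minimal sketch of what that merge step could look like, assuming the adapter in ./lora_config was saved with PEFT (the helper name merge_lora_into_whisper is illustrative, not taken from the repo):

# Hypothetical sketch -- not the repo's code: rebuild a full Whisper checkpoint
# from a PEFT/LoRA adapter so whisper_timestamped can load it from disk.
from transformers import WhisperForConditionalGeneration
from peft import PeftModel

def merge_lora_into_whisper(lora_config, model_path, base_model_name="openai/whisper-medium.en"):
    base = WhisperForConditionalGeneration.from_pretrained(base_model_name)
    peft_model = PeftModel.from_pretrained(base, lora_config)  # attach the LoRA adapter
    merged = peft_model.merge_and_unload()                     # fold the adapter into the base weights
    merged.save_pretrained(model_path)                         # writes model.safetensors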
@@ -52,11 +61,11 @@ def load_whisper_model(model_path, lora_config, base_model_name="openai/whisper-
 model_path = 'whisper-medium-ft'
 lora_config = './lora_config'
 
-# Uncheck when uploaded to hf
 load_whisper_model(model_path=model_path, lora_config=lora_config)
 
 ###### Helper functions #######
 
+# Metadata display for the full transcriptions. Includes genius link if possible
 def format_metadata_header(filename, metadata, explicit_word_count):
     title, artist, album, year = metadata.get('title', 'N/A'), metadata.get('artist', 'N/A'), metadata.get('album', 'N/A'), metadata.get('year', 'N/A')
     genius_url, wer_score = metadata.get('genius_url'), metadata.get('wer_score')
@@ -69,6 +78,7 @@ def format_metadata_header(filename, metadata, explicit_word_count):
 
     return f"### Details for: *{filename}*\n**Artist:** {artist} | **Song:** {title} | **Album:** {album} ({year}) {genius_link} {wer_display}{status_message}"
 
+# Creates the table of the transcription
 def generate_static_transcript(transcript_data, initial_times):
     initial_times_set = {f"{t['start']}-{t['end']}" for t in initial_times}
     table_header = "<table><thead><tr><th style='width: 125px;'>Time</th><th>Line transcript</th><th>Explicit flag(s)</th></tr></thead><tbody>"
@@ -124,6 +134,7 @@ def generate_static_transcript(transcript_data, initial_times):
 
     return table_header + "".join(table_rows) + "</tbody></table>"
 
+# Execute the whisper model for transcription
 def handle_batch_analysis(files, progress=gr.Progress()):
     if not files:
         raise gr.Error("Please upload one or more audio files.")
@@ -143,7 +154,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
         analysis_state = analyze_audio(audio_file.name, model, device, fine_tuned, progress=None)
         all_results[filename] = analysis_state
         # MODIFIED: Print filename to console after transcription
-        print(f"Transcription complete for: {filename}")
+        print(f"Transcription complete for: {filename} (file {i+1} of {num_files})")
 
     file_list = list(all_results.keys())
     first_file_results = all_results[file_list[0]]
@@ -152,6 +163,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
     transcript_html = generate_static_transcript(first_file_results['transcript'], first_file_results['initial_explicit_times'])
 
     # Check if ANY file has explicit content to determine if the apply button should be active
+    # If not, display no edits to make
     any_explicit_content = any(len(res['initial_explicit_times']) > 0 for res in all_results.values())
     if any_explicit_content:
         apply_button_update = gr.update(interactive=True, value="Apply all edits")
@@ -169,6 +181,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
         apply_button_update
     )
 
+# Selecting between different transcripts
 def update_details_view(selected_filename, all_results):
     if not selected_filename or not all_results:
         return "", ""
@@ -179,6 +192,7 @@ def update_details_view(selected_filename, all_results):
     transcript_html = generate_static_transcript(file_results['transcript'], file_results['initial_explicit_times'])
     return header, transcript_html
 
+# Apply the edits to all songs
 def handle_batch_finalization(all_results, progress=gr.Progress()):
     if not all_results:
         raise gr.Error("No active analysis session. Please process files first.")
@@ -204,6 +218,7 @@ def handle_batch_finalization(all_results, progress=gr.Progress()):
         gr.update(visible=False)
     )
 
+# Clear temp files and return to start
 def return_to_start(all_results):
     """Cleans up all temporary directories and resets the UI to its initial state."""
     if all_results:
@@ -230,7 +245,9 @@ def return_to_start(all_results):
     )
 
 
-###### Gradio UI
+###### Gradio UI ########
+
+## CSS for formatting
 css = """
 #main-container { max-width: 1250px; margin: auto; }
 #main-container .prose { font-size: 15px !important; }
@@ -244,11 +261,13 @@ s { color: #d32f2f; text-decoration: line-through; }
 with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
     analysis_results_state = gr.State(None)
 
+    # Main header. Persistent over all pages
     with gr.Column(elem_id="main-container"):
         gr.Markdown("# FSP Finder - AI-powered explicit content detector")
         gr.Markdown("Detects and automatically censors explicit content in music files. For source code and more details, visit our [github page](https://github.com/dclark202/auto-censoring).")
         gr.Markdown("---")
 
+        # Upload page
         with gr.Column(visible=True) as upload_view:
             gr.Markdown("### How to use")
             gr.Markdown('- Upload one or more audio files using the box below. Most common audio formats are accepted (e.g., `.mp3`, `.wav`, etc.).')
@@ -261,6 +280,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
             gr.Markdown('### How it works')
             gr.Markdown("This app uses a fine-tuned version of OpenAI's automatic speech recognition model [Whisper](https://github.com/openai/whisper) to create a lyrics transcript of the uploaded music files. Explicit content (e.g., curse words) are then searched for in the lyrics transcript and highlighted. The vocals stem of the track is split off from the song using [demucs](https://github.com/facebookresearch/demucs) and muted at the appropriate times to create a high-quality edited version of the song.")
 
+        # Results page
         with gr.Column(visible=False) as review_view:
             gr.Markdown("### Review transcript(s) and apply edits")
             gr.Markdown(f'Words to be censored will appear in <caption>{html.escape("red strikethrough")}</s> text in the transcript below. Apply edits by clicking **Apply all edits** below.')
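The "How it works" text above describes splitting off the vocals stem with demucs before muting. As a rough illustration of that separation step (the repo's actual demucs invocation is not shown in this diff; the helper name split_vocals and the paths are assumptions, and the output layout is the demucs default):

# Illustrative only: one way to split a track into vocals / accompaniment with the demucs CLI.
import subprocess

def split_vocals(song_path, out_dir="separated"):
    # --two-stems=vocals writes vocals.wav and no_vocals.wav under out_dir/<model>/<track>/
    subprocess.run(["demucs", "--two-stems=vocals", "-o", out_dir, song_path], check=True)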
@@ -288,28 +308,35 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
             with gr.Accordion("Full audio transcript", open=True):
                 transcript_output = gr.HTML()
 
+        # Processing page. I want this to display more information about what is happening behind the scenes
+        # e.g., to inform the user that the program has not just crashed
         with gr.Column(visible=False, elem_id="loading-view") as loading_view:
             gr.Markdown("## ⏳ Processing... please wait")
 
-    #
+    # Buttons
+
+    # Process all inputs
     process_button.click(
         fn=handle_batch_analysis,
         inputs=[files_input],
         outputs=[upload_view, review_view, loading_view, analysis_results_state, processed_files_selector, details_header, transcript_output, apply_button]
     )
 
+    # Select between multiple files
     processed_files_selector.change(
         fn=update_details_view,
         inputs=[processed_files_selector, analysis_results_state],
         outputs=[details_header, transcript_output]
     )
 
+    # Apply edits
     apply_button.click(
         fn=handle_batch_finalization,
         inputs=[analysis_results_state],
         outputs=[review_view, loading_view, final_view, final_status_output, edited_files_output, processed_files_selector, apply_button]
     )
 
+    # Go back to start. The JS for the confirmation is not working!
     return_to_start_button.click(
         fn=return_to_start,
         inputs=[analysis_results_state],
@@ -329,4 +356,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
         js="() => { if (confirm('Are you sure you want to return to the start? All current analysis will be lost.')) { return true; } else { return false; } }"
     )
 
+# Made a little favicon :)
 demo.launch(share=True, favicon_path='fav.png')
fsp.py
CHANGED
@@ -11,16 +11,45 @@ import jiwer
 import shutil
 import tempfile
 
-
+
+
+## Get a genius API key at https://genius.com/api-clients
+## put your key in system environment at GENIUS_API_TOKEN or set it manually here
+GENIUS_API_TOKEN = os.getenv("GENIUS_API_TOKEN")
 genius = lyricsgenius.Genius(GENIUS_API_TOKEN, verbose=False, remove_section_headers=True)
 
-default_curse_words = {'fuck', 'shit', 'piss', 'bitch', 'nigg', 'cock', 'faggot', 'cunt', 'clint', 'tits', 'pussy', 'dick', 'asshole', 'whore', 'goddam'}
 
-
+#############################################################################
+### just a heads up there's a bunch of curse words and racial slurs below ###
+#############################################################################
+
+
+# List of words to search for to be muted:
+# The way this works currently is that we look for these words as **substrings** of each transcribed word
+# this means that 'fuck' handles all versions 'fucking', 'motherfucker', 'fucked', etc.
+# This method is a bit crude as it can lead to some false positive, ex. 'Dickens' would be censored.
+# Consider using an LLM on the output for classification?
+default_curse_words = {
+    'fuck', 'shit', 'piss', 'bitch', 'nigg', 'dyke', 'cock', 'faggot',
+    'cunt', 'tits', 'pussy', 'dick', 'asshole', 'whore', 'goddam',
+    'douche', 'chink', 'tranny', 'slut', 'jizz', 'kike', 'gook'
+}
+
+# Words for which the substring method will absolutely not work
+singular_curse_words = {
+    'fag', 'cum', 'hell', 'spic', 'clit', 'wank', 'ass'
+}
+
+######################################################
+# Helper functions required for the gradio interface #
+######################################################
+
+# Removes all punctuation and returns lower case only words
 def remove_punctuation(s):
     s = re.sub(r'[^a-zA-Z0-9\s]', '', s)
     return s.lower()
 
+# For silencing the audio tracks at the indicated times
 def silence_audio_segment(input_audio_path, output_audio_path, times):
     audio = AudioSegment.from_file(input_audio_path)
     for (start_ms, end_ms) in times:
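The comments added above describe the matching rule: substring matches against default_curse_words, exact matches for the short words in singular_curse_words. The check that actually produces is_explicit lives outside this hunk, so the following is only a sketch of that rule (the helper name looks_explicit is hypothetical):

# Sketch of the matching rule described above; the real check used in analyze_audio
# is outside this diff.
def looks_explicit(cleaned_word):
    if cleaned_word in singular_curse_words:   # short words need an exact match ('ass', not 'class')
        return True
    return any(curse in cleaned_word for curse in default_curse_words)  # substring: 'fuck' catches 'fucking'

# e.g. looks_explicit('dickens') is True, the false-positive case the comment warns about.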
@@ -30,12 +59,14 @@ def silence_audio_segment(input_audio_path, output_audio_path, times):
         audio = before_segment + target_segment + after_segment
     audio.export(output_audio_path, format='wav')
 
+# For combining the vocals and instrument stems once the censoring has been applied
 def combine_audio(path1, path2, outpath):
     audio1 = AudioSegment.from_file(path1, format='wav')
     audio2 = AudioSegment.from_file(path2, format='wav')
     combined_audio = audio1.overlay(audio2)
     combined_audio.export(outpath, format="mp3")
 
+# Extracts metadata from the original song
 def get_metadata(original_audio_path):
     try:
         audio_orig = EasyID3(original_audio_path)
@@ -44,6 +75,7 @@ def get_metadata(original_audio_path):
         metadata = {'title': 'N/A', 'artist': 'N/A', 'album': 'N/A', 'year': 'N/A'}
     return metadata
 
+# Transfers metadata between two songs
 def transfer_metadata(original_audio_path, edited_audio_path):
     try:
         audio_orig = EasyID3(original_audio_path)
@@ -54,6 +86,7 @@ def transfer_metadata(original_audio_path, edited_audio_path):
     except Exception as e:
         print(f"Could not transfer metadata: {e}")
 
+# Probably overcomplicated function to convert time in seconds to mm:ss format
 def seconds_to_minutes(time):
     mins = int(time // 60)
     secs = int(time % 60)
@@ -67,6 +100,7 @@ def seconds_to_minutes(time):
     else:
         return f"{mins}:{secs}"
 
+# Lookup url on genius of lyrics for given song
 def get_genius_url(artist, song_title):
     if not artist or not song_title or artist == 'N/A' or song_title == 'N/A': return None
     try:
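Since the new comment calls seconds_to_minutes "probably overcomplicated", a possible shorter equivalent using divmod and zero padding is shown below (an illustrative rewrite, assuming the original's branching only handles zero-padding of the seconds; it is not part of the commit):

# Possible shorter equivalent (illustration only):
def seconds_to_minutes_simple(time):
    mins, secs = divmod(int(time), 60)
    return f"{mins}:{secs:02d}"

# seconds_to_minutes_simple(75) -> '1:15'; seconds_to_minutes_simple(61) -> '1:01'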
@@ -74,6 +108,7 @@ def get_genius_url(artist, song_title):
         return song.url if song else None
     except Exception: return None
 
+# It's called calculate_wer but I'm actually using *mer*
 def calculate_wer(ground_truth, hypothesis):
     if not ground_truth or not hypothesis or "not available" in ground_truth.lower(): return None
     try:
@@ -82,6 +117,7 @@ def calculate_wer(ground_truth, hypothesis):
         return f"{error:.3f}"
     except Exception: return "Error"
 
+# Gets the lyrics from genius for a given song
 def get_genius_lyrics(artist, song_title):
     if not artist or not song_title or artist == 'N/A' or song_title == 'N/A': return "Lyrics not available (missing metadata)."
     try:
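The comment added above notes that calculate_wer actually reports MER. jiwer exposes both metrics; a minimal comparison (the reference/hypothesis strings are made up for illustration):

import jiwer

reference  = "hello darkness my old friend"
hypothesis = "hello dark ness my old friend"
print(jiwer.wer(reference, hypothesis))  # word error rate
print(jiwer.mer(reference, hypothesis))  # match error rate, the metric the comment says is used here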
@@ -92,6 +128,8 @@ def get_genius_lyrics(artist, song_title):
 ##########################################################
 # STEP 1: Analyze Audio, Separate Tracks, and Transcribe #
 ##########################################################
+
+# Obtain transcript from song using Whisper. Whisper_timestamps handles all the splitting of the segments
 def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
     """
     Performs audio separation and transcription. Does NOT apply any edits.
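The transcription call that produces `result` is outside the changed lines. For orientation, a typical whisper_timestamped invocation looks roughly like the sketch below (the model name, device, and input file are placeholders; the app loads its fine-tuned checkpoint instead):

# Rough shape of the whisper_timestamped call assumed to produce `result` below.
import whisper_timestamped as whisper

audio = whisper.load_audio("vocals.wav")
model = whisper.load_model("medium.en", device="cuda")
result = whisper.transcribe(model, audio)
# result["segments"] is a list of segments; each segment["words"] carries per-word
# "text", "start", "end" and "confidence" fields (the kind of keys read below via word_key / prob_key).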
@@ -128,12 +166,14 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
     full_transcript = []
     initial_explicit_times = []
 
+    # Certain phrases can run two words, we need a previous word catcher
+    prev_word = ''
+    prev_start, prev_end = 0.0, 0.0
+
     for segment in result["segments"]:
         segment_words = []
-
-
-
-        for i, word_info in enumerate(seg):
+
+        for word_info in segment.get('words', []):
             word_text = word_info.get(word_key, '').strip()
             if not word_text: continue
 
@@ -145,18 +185,23 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
 
             word_data = {'text': word_text, 'start': start_time, 'end': end_time, 'prob': word_info[prob_key]}
             segment_words.append(word_data)
-
-
+
+            # Short words that can be substrings of nonsensitive words
+            if cleaned_word in singular_curse_words:
                 initial_explicit_times.append({'start': start_time, 'end': end_time})
-
-            # Handle two word cluster "god
-
-
-
-            initial_explicit_times.append({'start':
+
+            # Handle two word cluster "god dam*", "mother fuck*".
+            # Other ones: jerk off, cock sucker, ... ?
+            elif ('dam' in cleaned_word and prev_word == 'god') or ('fuck' in cleaned_word and prev_word == 'mother') or (cleaned_word == 'off' and prev_word == 'jerk'):
+                initial_explicit_times.append({'start': prev_start, 'end': prev_end})
+                initial_explicit_times.append({'start': start_time, 'end': end_time})
+
+            # The majority of censored words will come from here
+            elif is_explicit:
                 initial_explicit_times.append({'start': start_time, 'end': end_time})
 
             prev_word = cleaned_word
+            prev_start, prev_end = start_time, end_time
 
         full_transcript.append({'line_words': segment_words, 'start': segment['start'], 'end': segment['end']})
 
@@ -180,6 +225,7 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
 # STEP 2: Apply Censoring and Finalize Audio #
 ##############################################
 
+# Applies the censoring at the indicated times
 def apply_censoring(analysis_state, times_to_censor, progress=None):
     """
     Takes the state from analyze_audio and a final list of timestamps,