Spaces:
Sleeping
Sleeping
File size: 17,076 Bytes
ad16150 9e4f7db ad16150 9e4f7db ad16150 8c9536c ab759b9 ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 9e4f7db ad16150 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 |
import gradio as gr
import os
import torch
import whisper_timestamped as whisper_t
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import WhisperForConditionalGeneration
from peft import PeftModel
import time
import re
import html
import json
import shutil
from fsp import analyze_audio, apply_censoring, default_curse_words, seconds_to_minutes
from datetime import datetime
###### Ideas ########
# - Javascript for toggling individual words to mute --> playright
# - Use LLM to determine what is "explicit" in the outputs --> structured output?
# - Mute explicit nonvocal sounds: e.g., gun shots, sex scenes, etc.
# - Additional words to censor at the beginning screen ?
# Announce which script is running and when it started
_script_name = os.path.basename(__file__)
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Executing {_script_name} at {_started_at}")
################ Load models
## 1. Toxicity filter (multi-label sensitive-content classifier)
print('Loading toxicity classifier...')

# Run on the GPU when one is available; this device is also reused for Whisper
device = 'cuda' if torch.cuda.is_available() else 'cpu'

_tox_checkpoint = "cardiffnlp/twitter-roberta-large-sensitive-multilabel"
tox_tokenizer = AutoTokenizer.from_pretrained(_tox_checkpoint)
tox_model = AutoModelForSequenceClassification.from_pretrained(_tox_checkpoint)
tox_model.to(device)

# Text-classification pipeline asked for the top 2 labels per input line
tox_pipe = pipeline(
    "text-classification",
    model=tox_model,
    tokenizer=tox_tokenizer,
    device=device,
    top_k=2,
)
## 2. Create our Whisper model from the LoRA weights
## whisper_timestamped requires the entire model to be saved on disk. Storing only the LoRA config and rebuilding the full model at startup saves static storage space.
def load_whisper_model(model_path, lora_config, base_model_name="openai/whisper-medium.en"):
    """Create the merged fine-tuned Whisper model on disk if it is missing.

    Loads ``base_model_name``, applies the LoRA adapter found at
    ``lora_config``, merges the adapter into the base weights and saves the
    result under ``model_path``. Nothing is loaded or written when
    ``model_path`` already contains a ``model.safetensors`` file.

    Args:
        model_path: Directory that holds (or will hold) the merged model.
        lora_config: Adapter location passed to ``PeftModel.from_pretrained``.
        base_model_name: Base Whisper checkpoint the adapter is applied to.

    Returns:
        None.
    """
    # Look inside the directory we were asked about instead of a hard-coded
    # path, so the check stays correct when `model_path` changes.
    weights_file = os.path.join(model_path, 'model.safetensors')
    if os.path.exists(weights_file):
        print(f'Fine tuned model at {model_path} already exists')
        return

    print(f'Fine-tuned model not found. Creating model from LoRA configuration at {lora_config}')
    model = WhisperForConditionalGeneration.from_pretrained(base_model_name)
    model = PeftModel.from_pretrained(model, lora_config)
    model = model.merge_and_unload()

    # The previous call passed `save_serialization=False`, which is not a
    # `save_pretrained` parameter (the real one is `safe_serialization`).
    # NOTE(review): the misspelled keyword was presumably ignored; safetensors
    # output is requested explicitly so the file written here is the one the
    # existence check above looks for — confirm.
    model.save_pretrained(model_path, safe_serialization=True)
    print(f'Whisper model from {lora_config} saved at {model_path}')
    return
# fsp.py expects to find the fine-tuned model in this directory, so make sure
# it exists (building it from the LoRA adapter if needed) before the UI starts.
model_path = 'whisper-medium-ft'
lora_config = './lora_config'
load_whisper_model(model_path, lora_config)
###### Helper functions #######
# Metadata display for the full transcriptions. Includes genius link if possible
def format_metadata_header(filename, metadata, explicit_word_count):
    """Build the Markdown header displayed above a track's transcript.

    Args:
        filename: Name of the audio file the details belong to.
        metadata: Mapping read for the optional keys ``title``, ``artist``,
            ``album``, ``year``, ``genius_url`` and ``wer_score``.
        explicit_word_count: Number of explicit words found in the track; a
            "no explicit content" notice is appended when it is 0.

    Returns:
        A Markdown string with the track details, an optional Genius link,
        an optional similarity score and the optional notice.
    """
    title = metadata.get('title', 'N/A')
    artist = metadata.get('artist', 'N/A')
    album = metadata.get('album', 'N/A')
    year = metadata.get('year', 'N/A')
    genius_url = metadata.get('genius_url')
    wer_score = metadata.get('wer_score')

    genius_link = f"|| **[View lyrics on Genius]({genius_url})**" if genius_url else ""

    # Compare against None rather than truthiness: a score of 0 is the best
    # possible value ("lower is better") and must not be hidden.
    if wer_score is not None and genius_url:
        wer_display = f"| similarity score = {wer_score} (lower is better)"
    else:
        wer_display = ""

    status_message = ""
    if explicit_word_count == 0:
        # \u2705 is the check-mark emoji; written as an escape because the
        # literal character had been mangled and split this string in two.
        status_message = "\n\n**\u2705 No explicit content found in this track.**"

    return (
        f"### Details for: *{filename}*\n"
        f"**Artist:** {artist} | **Song:** {title} | **Album:** {album} ({year}) "
        f"{genius_link} {wer_display}{status_message}"
    )
# Creates the table of the transcription
# Classifier label -> coarse category shown in the "Explicit flag(s)" column.
# Labels that are not listed here are not flagged.
_TOX_LABEL_TO_FLAG = {
    'conflictual': 'violence',
    'selfharm': 'violence',
    'profanity': 'curse',
    'drugs': 'drugs',
    'sex': 'sex',
}

# NOTE(review): these glyphs look like mis-encoded emoji. They are kept exactly
# as found so they stay in sync with the legend in the review page — confirm
# the intended characters and fix both places together.
_FLAG_ICONS = {
    'violence': 'π₯',
    'curse': 'π€¬',
    'drugs': 'π¬',
    'sex': 'π',
}

# Predictions scoring below this value are ignored
_MIN_FLAG_SCORE = 0.5


def _flags_for_line(line_result):
    """Return the distinct flag categories for one line, in classifier order."""
    flags = []
    for prediction in line_result:
        if prediction['score'] < _MIN_FLAG_SCORE:
            continue
        flag = _TOX_LABEL_TO_FLAG.get(prediction['label'])
        # Two labels can map to the same category; report it only once
        if flag is not None and flag not in flags:
            flags.append(flag)
    return flags


def generate_static_transcript(transcript_data, initial_times):
    """Render a transcript as an HTML table with per-line explicit flags.

    Args:
        transcript_data: Sequence of segment dicts. Each segment is read for
            ``start``, ``end`` and ``line_words``; each word dict is read for
            ``text``, ``start`` and ``end``.
        initial_times: Iterable of dicts with ``start`` and ``end`` keys
            identifying the words that will be censored.

    Returns:
        An HTML ``<table>`` string with one row per segment: its time range,
        its words (censored words wrapped in ``<s>``) and its flag icons.
    """
    censored_ids = {f"{t['start']}-{t['end']}" for t in initial_times}
    table_header = "<table><thead><tr><th style='width: 125px;'>Time</th><th>Line transcript</th><th>Explicit flag(s)</th></tr></thead><tbody>"

    lines = [
        " ".join(word['text'] for word in segment.get('line_words', []))
        for segment in transcript_data
    ]
    # Classify every line in one batched call; skip it for an empty transcript
    flags_per_line = [_flags_for_line(result) for result in tox_pipe(lines)] if lines else []

    table_rows = []
    for segment, flags in zip(transcript_data, flags_per_line):
        start_time_str = seconds_to_minutes(segment.get('start'))
        end_time_str = seconds_to_minutes(segment.get('end'))
        explicit_flag = "".join(_FLAG_ICONS[flag] for flag in flags)

        formatted_words = []
        for word in segment.get('line_words', []):
            escaped = html.escape(word['text'])
            # Words whose time span matches a censored span get struck through
            if f"{word['start']}-{word['end']}" in censored_ids:
                escaped = f"<s>{escaped}</s>"
            formatted_words.append(escaped)
        formatted_line = " ".join(formatted_words)

        table_rows.append(f"<tr><td>{start_time_str} - {end_time_str}</td><td>{formatted_line}</td><td style='text-align:center'>{explicit_flag}</td></tr>")

    return table_header + "".join(table_rows) + "</tbody></table>"
# Execute the whisper model for transcription
def handle_batch_analysis(files, progress=gr.Progress()):
    """Transcribe every uploaded file and populate the review page.

    This is a generator: it first yields an update that swaps the upload
    page for the loading page, then, once all files are analyzed, yields
    the updates that reveal the review page for the first file.

    Args:
        files: Uploaded file objects; each is read through its ``name``.
        progress: Gradio progress tracker, updated once per file.

    Yields:
        8-tuples matching the outputs wired to the "Process audio" button.

    Raises:
        gr.Error: If no files were uploaded or the Whisper model fails to load.
    """
    if not files:
        raise gr.Error("Please upload one or more audio files.")

    # Show the loading page while the heavy work runs
    yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), None, None, None, None, None

    try:
        model = whisper_t.load_model(model_path, device=device)
        fine_tuned = True
    except Exception as e:
        raise gr.Error(f"Error loading fine-tuned Whisper model: {e}")

    num_files = len(files)
    all_results = {}
    for position, audio_file in enumerate(files, start=1):
        progress(position / num_files, desc=f"Analyzing File {position} of {num_files}")
        filename = os.path.basename(audio_file.name)
        all_results[filename] = analyze_audio(audio_file.name, model, device, fine_tuned, progress=None)
        # Print filename to console after transcription
        print(f"Transcription complete for: {filename} (file {position} of {num_files})")

    # The review page opens on the first processed file
    file_list = list(all_results.keys())
    first_name = file_list[0]
    first_file_results = all_results[first_name]
    first_explicit_times = first_file_results['initial_explicit_times']
    header = format_metadata_header(first_name, first_file_results['metadata'], len(first_explicit_times))
    transcript_html = generate_static_transcript(first_file_results['transcript'], first_explicit_times)

    # The apply button is only active when at least one file has something to censor
    any_explicit_content = any(
        len(res['initial_explicit_times']) > 0 for res in all_results.values()
    )
    if not any_explicit_content:
        apply_button_update = gr.update(interactive=False, value="No edits to make")
    else:
        apply_button_update = gr.update(interactive=True, value="Apply all edits")

    selector_update = gr.update(choices=file_list, value=first_name)
    yield (
        gr.update(visible=False),
        gr.update(visible=True),
        gr.update(visible=False),
        all_results,
        selector_update,
        header,
        transcript_html,
        apply_button_update,
    )
# Selecting between different transcripts
def update_details_view(selected_filename, all_results):
    """Return the header and transcript HTML for the selected file.

    Args:
        selected_filename: Key of the file chosen in the selector.
        all_results: Mapping of filename to that file's analysis results.

    Returns:
        A ``(header, transcript_html)`` tuple; two empty strings when nothing
        is selected or no results are available.
    """
    if selected_filename and all_results:
        file_results = all_results[selected_filename]
        explicit_times = file_results['initial_explicit_times']
        header = format_metadata_header(
            selected_filename, file_results['metadata'], len(explicit_times)
        )
        transcript_html = generate_static_transcript(
            file_results['transcript'], file_results['initial_explicit_times']
        )
        return header, transcript_html
    return "", ""
# Apply the edits to all songs
def handle_batch_finalization(all_results, progress=gr.Progress()):
    """Apply the censoring edits to every analyzed file.

    This is a generator that yields a single update once all files have been
    processed.

    Args:
        all_results: Mapping of filename to that file's analysis state.
        progress: Gradio progress tracker, updated once per file.

    Yields:
        A 7-tuple matching the outputs wired to the apply button.

    Raises:
        gr.Error: If there are no analysis results to work on.
    """
    if not all_results:
        raise gr.Error("No active analysis session. Please process files first.")

    output_paths = []
    num_files = len(all_results)
    for position, analysis_state in enumerate(all_results.values(), start=1):
        progress(position / num_files, desc=f"Applying edits {position} of {num_files}")
        times_to_censor = analysis_state.get('initial_explicit_times', [])
        output_path = apply_censoring(analysis_state, times_to_censor, progress=None)
        # Only files for which a path came back are offered for download
        if output_path:
            output_paths.append(output_path)

    # \u2705 is the check-mark emoji; written as an escape because the literal
    # character had been mangled and split this string across two lines.
    status_message = f"\u2705 **Success!** {len(output_paths)} of {len(all_results)} files have been censored."

    yield (
        gr.update(visible=True),   # review_view
        gr.update(visible=False),  # loading_view
        gr.update(visible=True),   # final_view
        status_message,            # final_status_output
        output_paths,              # edited_files_output
        gr.update(visible=True),   # processed_files_selector
        gr.update(visible=False),  # apply_button
    )
# Clear temp files and return to start
def return_to_start(all_results):
    """Delete each file's temporary directory and reset the UI to the upload page.

    Args:
        all_results: Mapping of filename to analysis state, or a falsy value
            when nothing has been processed. Each state is read for ``temp_dir``.

    Returns:
        An 11-tuple of updates matching the outputs wired to the
        "Return to start" button.
    """
    for analysis_state in (all_results or {}).values():
        temp_dir_path = analysis_state.get('temp_dir')
        if not temp_dir_path or not os.path.exists(temp_dir_path):
            continue
        # Best effort: a failed removal is reported but does not block the reset
        try:
            shutil.rmtree(temp_dir_path)
        except Exception as e:
            print(f"Error removing temporary directory {temp_dir_path}: {e}")

    upload_view_update = gr.update(visible=True)
    review_view_update = gr.update(visible=False)
    final_view_update = gr.update(visible=False)
    apply_button_update = gr.update(visible=True, interactive=True)
    selector_update = gr.update(choices=[], value=None, visible=True)
    return (
        upload_view_update,
        review_view_update,
        final_view_update,
        apply_button_update,
        selector_update,
        None,  # analysis_results_state
        "",    # details_header
        "",    # transcript_output
        "",    # final_status_output
        None,  # edited_files_output
        None,  # files_input (to clear it)
    )
###### Gradio UI ########
## CSS for formatting
# Custom stylesheet handed to gr.Blocks(css=...). The '#...' selectors match
# the elem_id values given to components in the layout; the bare `s` rule
# styles the <s> tags that generate_static_transcript wraps around words
# that will be censored (red, struck through).
css = """
#main-container { max-width: 1250px; margin: auto; }
#main-container .prose { font-size: 15px !important; }
#upload-view { max-width: 60%; margin: 0 auto; }
#loading-view { min-height: 500px; display: flex; justify-content: center; align-items: center; }
#apply-button { background-color: #3d9c3e !important; color: white !important; }
#processed-files-radio { min-height: 300px; }
s { color: #d32f2f; text-decoration: line-through; }
"""
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the component order and from the visibility updates the handlers
# return — confirm it against the intended page layout.
with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
    # Per-session store of the analysis results, keyed by filename
    analysis_results_state = gr.State(None)

    # Main header. Persistent over all pages
    with gr.Column(elem_id="main-container"):
        gr.Markdown("# FSP Finder - AI-powered explicit content detector")
        gr.Markdown("Detects and automatically censors explicit content in music files. For source code and more details, visit our [github page](https://github.com/dclark202/auto-censoring).")
        gr.Markdown("---")

        # Upload page
        with gr.Column(visible=True) as upload_view:
            gr.Markdown("### How to use")
            gr.Markdown('- Upload one or more audio files using the box below. Most common audio formats are accepted (e.g., `.mp3`, `.wav`, etc.).')
            gr.Markdown('- Click the **Process audio** button to create the transcriptions of the uploaded track(s). You will have a chance to review the edits before applying the censoring.')
            files_input = gr.File(label="Upload audio files", file_count="multiple", elem_id="upload-view", file_types=["audio"])
            process_button = gr.Button("Process audio", elem_id="upload-view")
            gr.Markdown('---')
            gr.Markdown('### How it works')
            gr.Markdown("This app uses a fine-tuned version of OpenAI's automatic speech recognition model [Whisper](https://github.com/openai/whisper) to create a lyrics transcript of the uploaded music files. Explicit content (e.g., curse words) are then searched for in the lyrics transcript and highlighted. The vocals stem of the track is split off from the song using [demucs](https://github.com/facebookresearch/demucs) and muted at the appropriate times to create a high-quality edited version of the song.")

        # Results page
        with gr.Column(visible=False) as review_view:
            gr.Markdown("### Review transcript(s) and apply edits")
            # The sample text is wrapped in <s>...</s> so it picks up the same
            # red strikethrough style as censored words in the transcript
            # (the opening tag used to be a mismatched <caption>).
            gr.Markdown(f'Words to be censored will appear in <s>{html.escape("red strikethrough")}</s> text in the transcript below. Apply edits by clicking **Apply all edits** below.')
            # NOTE(review): the legend glyphs look like mis-encoded emoji; they
            # are kept as found so they match the ones emitted in the
            # transcript table — confirm and fix both places together.
            gr.Markdown(
                "Entries in the **Explicit flag** column are determined by running the corresponding line through a [toxicity filter](https://huggingface.co/cardiffnlp/twitter-roberta-large-sensitive-multilabel).\n"
                "- π₯ = violence or self harm\n"
                "- π€¬ = curse words\n"
                "- π¬ = drugs\n"
                "- π = sexual content\n"
                "We are currently working on allowing users to select additional words to censor from the full transcript, this flag should guide users towards identifying additional potentially explicit lines."
            )
            gr.Markdown("**Note**: Whisper's processing is not deterministic and it can sometimes get confused and hallucinate with audio. If your transcription seems inaccurate (e.g., a line contains the same word repeated *many* times, or a line contains a significant amount of transcribed text not present in the song), please try running the program again on that song.")
            with gr.Row(variant="panel"):
                # Left column: file picker, actions and (after applying) downloads
                with gr.Column(scale=1):
                    processed_files_selector = gr.Radio(label="Select a file to view its transcript", interactive=True, elem_id="processed-files-radio")
                    apply_button = gr.Button("Apply all edits", elem_id="apply-button", interactive=False)
                    return_to_start_button = gr.Button("Return to start")
                    with gr.Column(visible=False) as final_view:
                        final_status_output = gr.Markdown()
                        edited_files_output = gr.File(label="Download your edited files", file_count="multiple")
                # Right column: details and transcript of the selected file
                with gr.Column(scale=3):
                    details_header = gr.Markdown()
                    with gr.Accordion("Full audio transcript", open=True):
                        transcript_output = gr.HTML()

        # Processing page. I want this to display more information about what is happening behind the scenes
        # e.g., to inform the user that the program has not just crashed
        with gr.Column(visible=False, elem_id="loading-view") as loading_view:
            # \u23f3 is the hourglass emoji, written as an escape because the
            # literal character had been mangled.
            gr.Markdown("## \u23f3 Processing... please wait")

    # Buttons
    # Process all inputs
    process_button.click(
        fn=handle_batch_analysis,
        inputs=[files_input],
        outputs=[upload_view, review_view, loading_view, analysis_results_state, processed_files_selector, details_header, transcript_output, apply_button]
    )

    # Select between multiple files
    processed_files_selector.change(
        fn=update_details_view,
        inputs=[processed_files_selector, analysis_results_state],
        outputs=[details_header, transcript_output]
    )

    # Apply edits
    apply_button.click(
        fn=handle_batch_finalization,
        inputs=[analysis_results_state],
        outputs=[review_view, loading_view, final_view, final_status_output, edited_files_output, processed_files_selector, apply_button]
    )

    # Go back to start. The JS for the confirmation is not working!
    return_to_start_button.click(
        fn=return_to_start,
        inputs=[analysis_results_state],
        outputs=[
            upload_view,
            review_view,
            final_view,
            apply_button,
            processed_files_selector,
            analysis_results_state,
            details_header,
            transcript_output,
            final_status_output,
            edited_files_output,
            files_input
        ],
        js="() => { if (confirm('Are you sure you want to return to the start? All current analysis will be lost.')) { return true; } else { return false; } }"
    )
# Made a little favicon :)
# Start the app with a public share link (a stray trailing "|" that made this
# line a syntax error has been removed).
demo.launch(share=True, favicon_path='fav.png')