added more curse words
app.py
CHANGED
@@ -13,7 +13,15 @@ import shutil
 from fsp import analyze_audio, apply_censoring, default_curse_words, seconds_to_minutes
 from datetime import datetime
 
-
+
+###### Ideas ########
+# - Javascript for toggling individual words to mute --> playright
+# - Use LLM to determine what is "explicit" in the ouputs --> structured output?
+# - Mute explicit nonvocal sounds: e.g., gun shots, sex scenes, etc.
+# - Additional words to censor at the beginning screen ?
+
+
+# Print the start time
 print(f"Executing {os.path.basename(__file__)} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
 ################ Load models
@@ -33,6 +41,7 @@ tox_pipe = pipeline("text-classification", model=tox_model, tokenizer=tox_tokeni
 ## 2. Create our Whisper model from the LoRA weights
 ## Whisper_timestamped requires the entire model to be saved, this saves static storage space by only saving the lora config
 def load_whisper_model(model_path, lora_config, base_model_name="openai/whisper-medium.en"):
+    # If the model exists already we're good to go
     if os.path.exists('./whisper-medium-ft/model.safetensors'):
         print(f'Fine tuned model at {model_path} already exists')
         return
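The hunk above only shows the early-return check; the rest of load_whisper_model (not part of this diff) has to rebuild a full checkpoint from the stored LoRA adapter, since whisper_timestamped needs the whole model on disk. A minimal sketch of what that merge step could look like, assuming the adapter in ./lora_config was saved with PEFT (the helper name merge_lora_into_whisper is illustrative, not taken from the repo):

# Hypothetical sketch -- not the repo's code: rebuild a full Whisper checkpoint
# from a PEFT/LoRA adapter so whisper_timestamped can load it from disk.
from transformers import WhisperForConditionalGeneration
from peft import PeftModel

def merge_lora_into_whisper(lora_config, model_path, base_model_name="openai/whisper-medium.en"):
    base = WhisperForConditionalGeneration.from_pretrained(base_model_name)
    peft_model = PeftModel.from_pretrained(base, lora_config)  # attach the LoRA adapter
    merged = peft_model.merge_and_unload()                     # fold the adapter into the base weights
    merged.save_pretrained(model_path)                         # writes model.safetensors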
@@ -52,11 +61,11 @@ def load_whisper_model(model_path, lora_config, base_model_name="openai/whisper-
 model_path = 'whisper-medium-ft'
 lora_config = './lora_config'
 
-# Uncheck when uploaded to hf
 load_whisper_model(model_path=model_path, lora_config=lora_config)
 
 ###### Helper functions #######
 
+# Metadata display for the full transcriptions. Includes genius link if possible
 def format_metadata_header(filename, metadata, explicit_word_count):
     title, artist, album, year = metadata.get('title', 'N/A'), metadata.get('artist', 'N/A'), metadata.get('album', 'N/A'), metadata.get('year', 'N/A')
     genius_url, wer_score = metadata.get('genius_url'), metadata.get('wer_score')
@@ -69,6 +78,7 @@ def format_metadata_header(filename, metadata, explicit_word_count):
 
     return f"### Details for: *{filename}*\n**Artist:** {artist} | **Song:** {title} | **Album:** {album} ({year}) {genius_link} {wer_display}{status_message}"
 
+# Creates the table of the transcription
 def generate_static_transcript(transcript_data, initial_times):
     initial_times_set = {f"{t['start']}-{t['end']}" for t in initial_times}
     table_header = "<table><thead><tr><th style='width: 125px;'>Time</th><th>Line transcript</th><th>Explicit flag(s)</th></tr></thead><tbody>"
@@ -124,6 +134,7 @@ def generate_static_transcript(transcript_data, initial_times):
 
     return table_header + "".join(table_rows) + "</tbody></table>"
 
+# Execute the whisper model for transcription
 def handle_batch_analysis(files, progress=gr.Progress()):
     if not files:
         raise gr.Error("Please upload one or more audio files.")
@@ -143,7 +154,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
         analysis_state = analyze_audio(audio_file.name, model, device, fine_tuned, progress=None)
         all_results[filename] = analysis_state
         # MODIFIED: Print filename to console after transcription
-        print(f"Transcription complete for: {filename}")
+        print(f"Transcription complete for: {filename} (file {i+1} of {num_files})")
 
     file_list = list(all_results.keys())
     first_file_results = all_results[file_list[0]]
@@ -152,6 +163,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
     transcript_html = generate_static_transcript(first_file_results['transcript'], first_file_results['initial_explicit_times'])
 
     # Check if ANY file has explicit content to determine if the apply button should be active
+    # If not, display no edits to make
     any_explicit_content = any(len(res['initial_explicit_times']) > 0 for res in all_results.values())
     if any_explicit_content:
         apply_button_update = gr.update(interactive=True, value="Apply all edits")
@@ -169,6 +181,7 @@ def handle_batch_analysis(files, progress=gr.Progress()):
         apply_button_update
     )
 
+# Selecting between different transcripts
 def update_details_view(selected_filename, all_results):
     if not selected_filename or not all_results:
         return "", ""
@@ -179,6 +192,7 @@ def update_details_view(selected_filename, all_results):
     transcript_html = generate_static_transcript(file_results['transcript'], file_results['initial_explicit_times'])
     return header, transcript_html
 
+# Apply the edits to all songs
 def handle_batch_finalization(all_results, progress=gr.Progress()):
     if not all_results:
         raise gr.Error("No active analysis session. Please process files first.")
@@ -204,6 +218,7 @@ def handle_batch_finalization(all_results, progress=gr.Progress()):
         gr.update(visible=False)
     )
 
+# Clear temp files and return to start
 def return_to_start(all_results):
     """Cleans up all temporary directories and resets the UI to its initial state."""
     if all_results:
@@ -230,7 +245,9 @@ def return_to_start(all_results):
     )
 
 
-###### Gradio UI
+###### Gradio UI ########
+
+## CSS for formatting
 css = """
 #main-container { max-width: 1250px; margin: auto; }
 #main-container .prose { font-size: 15px !important; }
@@ -244,11 +261,13 @@ s { color: #d32f2f; text-decoration: line-through; }
 with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
     analysis_results_state = gr.State(None)
 
+    # Main header. Persistent over all pages
     with gr.Column(elem_id="main-container"):
         gr.Markdown("# FSP Finder - AI-powered explicit content detector")
         gr.Markdown("Detects and automatically censors explicit content in music files. For source code and more details, visit our [github page](https://github.com/dclark202/auto-censoring).")
         gr.Markdown("---")
 
+        # Upload page
         with gr.Column(visible=True) as upload_view:
             gr.Markdown("### How to use")
             gr.Markdown('- Upload one or more audio files using the box below. Most common audio formats are accepted (e.g., `.mp3`, `.wav`, etc.).')
@@ -261,6 +280,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
             gr.Markdown('### How it works')
             gr.Markdown("This app uses a fine-tuned version of OpenAI's automatic speech recognition model [Whisper](https://github.com/openai/whisper) to create a lyrics transcript of the uploaded music files. Explicit content (e.g., curse words) are then searched for in the lyrics transcript and highlighted. The vocals stem of the track is split off from the song using [demucs](https://github.com/facebookresearch/demucs) and muted at the appropriate times to create a high-quality edited version of the song.")
 
+        # Results page
         with gr.Column(visible=False) as review_view:
             gr.Markdown("### Review transcript(s) and apply edits")
             gr.Markdown(f'Words to be censored will appear in <caption>{html.escape("red strikethrough")}</s> text in the transcript below. Apply edits by clicking **Apply all edits** below.')
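The "How it works" text above describes splitting off the vocals stem with demucs before muting. As a rough illustration of that separation step (the repo's actual demucs invocation is not shown in this diff; the helper name split_vocals and the paths are assumptions, and the output layout is the demucs default):

# Illustrative only: one way to split a track into vocals / accompaniment with the demucs CLI.
import subprocess

def split_vocals(song_path, out_dir="separated"):
    # --two-stems=vocals writes vocals.wav and no_vocals.wav under out_dir/<model>/<track>/
    subprocess.run(["demucs", "--two-stems=vocals", "-o", out_dir, song_path], check=True)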
@@ -288,28 +308,35 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
             with gr.Accordion("Full audio transcript", open=True):
                 transcript_output = gr.HTML()
 
+        # Processing page. I want this to display more information about what is happening behind the scenes
+        # e.g., to inform the user that the program has not just crashed
         with gr.Column(visible=False, elem_id="loading-view") as loading_view:
             gr.Markdown("## ⏳ Processing... please wait")
 
-    #
+    # Buttons
+
+    # Process all inputs
     process_button.click(
         fn=handle_batch_analysis,
         inputs=[files_input],
         outputs=[upload_view, review_view, loading_view, analysis_results_state, processed_files_selector, details_header, transcript_output, apply_button]
     )
 
+    # Select between multiple files
     processed_files_selector.change(
         fn=update_details_view,
         inputs=[processed_files_selector, analysis_results_state],
         outputs=[details_header, transcript_output]
     )
 
+    # Apply edits
     apply_button.click(
         fn=handle_batch_finalization,
         inputs=[analysis_results_state],
         outputs=[review_view, loading_view, final_view, final_status_output, edited_files_output, processed_files_selector, apply_button]
     )
 
+    # Go back to start. The JS for the confirmation is not working!
     return_to_start_button.click(
         fn=return_to_start,
         inputs=[analysis_results_state],
@@ -329,4 +356,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="FSP Finder", css=css) as demo:
         js="() => { if (confirm('Are you sure you want to return to the start? All current analysis will be lost.')) { return true; } else { return false; } }"
     )
 
+# Made a little favicon :)
 demo.launch(share=True, favicon_path='fav.png')
fsp.py
CHANGED
@@ -11,16 +11,45 @@ import jiwer
 import shutil
 import tempfile
 
-
+
+
+## Get a genius API key at https://genius.com/api-clients
+## put your key in system environment at GENIUS_API_TOKEN or set it manually here
+GENIUS_API_TOKEN = os.getenv("GENIUS_API_TOKEN")
 genius = lyricsgenius.Genius(GENIUS_API_TOKEN, verbose=False, remove_section_headers=True)
 
-default_curse_words = {'fuck', 'shit', 'piss', 'bitch', 'nigg', 'cock', 'faggot', 'cunt', 'clint', 'tits', 'pussy', 'dick', 'asshole', 'whore', 'goddam'}
 
-
+#############################################################################
+### just a heads up there's a bunch of curse words and racial slurs below ###
+#############################################################################
+
+
+# List of words to search for to be muted:
+# The way this works currently is that we look for these words as **substrings** of each transcribed word
+# this means that 'fuck' handles all versions 'fucking', 'motherfucker', 'fucked', etc.
+# This method is a bit crude as it can lead to some false positive, ex. 'Dickens' would be censored.
+# Consider using an LLM on the output for classification?
+default_curse_words = {
+    'fuck', 'shit', 'piss', 'bitch', 'nigg', 'dyke', 'cock', 'faggot',
+    'cunt', 'tits', 'pussy', 'dick', 'asshole', 'whore', 'goddam',
+    'douche', 'chink', 'tranny', 'slut', 'jizz', 'kike', 'gook'
+}
+
+# Words for which the substring method will absolutely not work
+singular_curse_words = {
+    'fag', 'cum', 'hell', 'spic', 'clit', 'wank', 'ass'
+}
+
+######################################################
+# Helper functions required for the gradio interface #
+######################################################
+
+# Removes all punctuation and returns lower case only words
 def remove_punctuation(s):
     s = re.sub(r'[^a-zA-Z0-9\s]', '', s)
     return s.lower()
 
+# For silencing the audio tracks at the indicated times
 def silence_audio_segment(input_audio_path, output_audio_path, times):
     audio = AudioSegment.from_file(input_audio_path)
     for (start_ms, end_ms) in times:
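The comments added above describe the matching rule: substring matches against default_curse_words, exact matches for the short words in singular_curse_words. The check that actually produces is_explicit lives outside this hunk, so the following is only a sketch of that rule (the helper name looks_explicit is hypothetical):

# Sketch of the matching rule described above; the real check used in analyze_audio
# is outside this diff.
def looks_explicit(cleaned_word):
    if cleaned_word in singular_curse_words:   # short words need an exact match ('ass', not 'class')
        return True
    return any(curse in cleaned_word for curse in default_curse_words)  # substring: 'fuck' catches 'fucking'

# e.g. looks_explicit('dickens') is True, the false-positive case the comment warns about.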
@@ -30,12 +59,14 @@ def silence_audio_segment(input_audio_path, output_audio_path, times):
         audio = before_segment + target_segment + after_segment
     audio.export(output_audio_path, format='wav')
 
+# For combining the vocals and instrument stems once the censoring has been applied
 def combine_audio(path1, path2, outpath):
     audio1 = AudioSegment.from_file(path1, format='wav')
     audio2 = AudioSegment.from_file(path2, format='wav')
     combined_audio = audio1.overlay(audio2)
     combined_audio.export(outpath, format="mp3")
 
+# Extracts metadata from the original song
 def get_metadata(original_audio_path):
     try:
         audio_orig = EasyID3(original_audio_path)
@@ -44,6 +75,7 @@ def get_metadata(original_audio_path):
         metadata = {'title': 'N/A', 'artist': 'N/A', 'album': 'N/A', 'year': 'N/A'}
     return metadata
 
+# Transfers metadata between two songs
 def transfer_metadata(original_audio_path, edited_audio_path):
     try:
         audio_orig = EasyID3(original_audio_path)
@@ -54,6 +86,7 @@ def transfer_metadata(original_audio_path, edited_audio_path):
     except Exception as e:
         print(f"Could not transfer metadata: {e}")
 
+# Probably overcomplicated function to convert time in seconds to mm:ss format
 def seconds_to_minutes(time):
     mins = int(time // 60)
     secs = int(time % 60)
@@ -67,6 +100,7 @@ def seconds_to_minutes(time):
     else:
         return f"{mins}:{secs}"
 
+# Lookup url on genius of lyrics for given song
 def get_genius_url(artist, song_title):
     if not artist or not song_title or artist == 'N/A' or song_title == 'N/A': return None
     try:
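Since the new comment calls seconds_to_minutes "probably overcomplicated", a possible shorter equivalent using divmod and zero padding is shown below (an illustrative rewrite, assuming the original's branching only handles zero-padding of the seconds; it is not part of the commit):

# Possible shorter equivalent (illustration only):
def seconds_to_minutes_simple(time):
    mins, secs = divmod(int(time), 60)
    return f"{mins}:{secs:02d}"

# seconds_to_minutes_simple(75) -> '1:15'; seconds_to_minutes_simple(61) -> '1:01'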
@@ -74,6 +108,7 @@ def get_genius_url(artist, song_title):
         return song.url if song else None
     except Exception: return None
 
+# It's called calculate_wer but I'm actually using *mer*
 def calculate_wer(ground_truth, hypothesis):
     if not ground_truth or not hypothesis or "not available" in ground_truth.lower(): return None
     try:
@@ -82,6 +117,7 @@ def calculate_wer(ground_truth, hypothesis):
         return f"{error:.3f}"
     except Exception: return "Error"
 
+# Gets the lyrics from genius for a given song
 def get_genius_lyrics(artist, song_title):
     if not artist or not song_title or artist == 'N/A' or song_title == 'N/A': return "Lyrics not available (missing metadata)."
     try:
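The comment added above notes that calculate_wer actually reports MER. jiwer exposes both metrics; a minimal comparison (the reference/hypothesis strings are made up for illustration):

import jiwer

reference  = "hello darkness my old friend"
hypothesis = "hello dark ness my old friend"
print(jiwer.wer(reference, hypothesis))  # word error rate
print(jiwer.mer(reference, hypothesis))  # match error rate, the metric the comment says is used here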
@@ -92,6 +128,8 @@ def get_genius_lyrics(artist, song_title):
 ##########################################################
 # STEP 1: Analyze Audio, Separate Tracks, and Transcribe #
 ##########################################################
+
+# Obtain transcript from song using Whisper. Whisper_timestamps handles all the splitting of the segments
 def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
     """
     Performs audio separation and transcription. Does NOT apply any edits.
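The transcription call that produces `result` is outside the changed lines. For orientation, a typical whisper_timestamped invocation looks roughly like the sketch below (the model name, device, and input file are placeholders; the app loads its fine-tuned checkpoint instead):

# Rough shape of the whisper_timestamped call assumed to produce `result` below.
import whisper_timestamped as whisper

audio = whisper.load_audio("vocals.wav")
model = whisper.load_model("medium.en", device="cuda")
result = whisper.transcribe(model, audio)
# result["segments"] is a list of segments; each segment["words"] carries per-word
# "text", "start", "end" and "confidence" fields (the kind of keys read below via word_key / prob_key).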
@@ -128,12 +166,14 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
     full_transcript = []
     initial_explicit_times = []
 
+    # Certain phrases can run two words, we need a previous word catcher
+    prev_word = ''
+    prev_start, prev_end = 0.0, 0.0
+
     for segment in result["segments"]:
         segment_words = []
-
-
-
-        for i, word_info in enumerate(seg):
+
+        for word_info in segment.get('words', []):
             word_text = word_info.get(word_key, '').strip()
             if not word_text: continue
 
@@ -145,18 +185,23 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
 
             word_data = {'text': word_text, 'start': start_time, 'end': end_time, 'prob': word_info[prob_key]}
             segment_words.append(word_data)
-
-
+
+            # Short words that can be substrings of nonsensitive words
+            if cleaned_word in singular_curse_words:
                 initial_explicit_times.append({'start': start_time, 'end': end_time})
-
-            # Handle two word cluster "god
-
-
-
-            initial_explicit_times.append({'start':
+
+            # Handle two word cluster "god dam*", "mother fuck*".
+            # Other ones: jerk off, cock sucker, ... ?
+            elif ('dam' in cleaned_word and prev_word == 'god') or ('fuck' in cleaned_word and prev_word == 'mother') or (cleaned_word == 'off' and prev_word == 'jerk'):
+                initial_explicit_times.append({'start': prev_start, 'end': prev_end})
+                initial_explicit_times.append({'start': start_time, 'end': end_time})
+
+            # The majority of censored words will come from here
+            elif is_explicit:
                 initial_explicit_times.append({'start': start_time, 'end': end_time})
 
             prev_word = cleaned_word
+            prev_start, prev_end = start_time, end_time
 
         full_transcript.append({'line_words': segment_words, 'start': segment['start'], 'end': segment['end']})
 
@@ -180,6 +225,7 @@ def analyze_audio(audio_path, model, device, fine_tuned=True, progress=None):
 # STEP 2: Apply Censoring and Finalize Audio #
 ##############################################
 
+# Applies the censoring at the indicated times
 def apply_censoring(analysis_state, times_to_censor, progress=None):
     """
     Takes the state from analyze_audio and a final list of timestamps,