import os, sys, re, json import argparse import shutil import warnings import whisper_timestamped as wt from pdb import set_trace as b from pprint import pprint as pp from profanity_check import predict, predict_prob from pydub import AudioSegment from pydub.playback import play from subprocess import Popen, PIPE import gradio as gr def parse_args(): """ """ parser = argparse.ArgumentParser( description=('Tool to mute profanities in a song (source separation -> speech recognition -> profanity detection -> mask profanities -> re-mix)'), usage=('see or run as local web app with streamlit: ') ) parser.add_argument( '-i', '--input', default=None, nargs='?', #required=True, help=("path to a mp3") ) parser.add_argument( '-m', '--model', default='small', nargs='?', help=("model used by whisper for speech recognition: tiny, small (default) or medium") ) parser.add_argument( '-p', '--play', default=False, action='store_true', help=("play output audio at the end") ) parser.add_argument( '-v', '--verbose', default=True, action='store_true', help=("print transcribed text and detected profanities to screen") ) return parser.parse_args() def main(args, input_file=None, model_size=None, verbose=False, play_output=False, skip_ss=False): """ """ if not input_file: input_file = args.input if not model_size: model_size = args.model if not verbose: verbose = args.verbose if not play_output: play_output = args.play # exit if input file not found if len(sys.argv)>1 and not os.path.isfile(input_file): print('Error: --input file not found') raise Exception print(f'\nProcessing input file: {input_file}') if not skip_ss: # split audio into vocals + accompaniment print('Running source separation') stems_dir = source_separation(input_file, use_demucs=False, use_spleeter=True) vocal_stem = os.path.join(stems_dir, 'vocals.wav') #instr_stem = os.path.join(stems_dir, 'no_vocals.wav') # demucs instr_stem = os.path.join(stems_dir, 'accompaniment.wav') # spleeter print(f'Vocal stem written to: {vocal_stem}') else: vocal_stem = input_file instr_stem = None audio = wt.load_audio(vocal_stem) model = wt.load_model(model_size, device='cpu') text = wt.transcribe(model, audio, language='en') if verbose: print('\nTranscribed text:') print(text['text']+'\n') # checking for profanities in text print('Run profanity detection on text') profanities = profanity_detection(text) if not profanities: print(f'No profanities found in {input_file} - exiting') return 'No profanities found', None, None if verbose: print('profanities found in text:') pp(profanities) # masking print('Mask profanities in vocal stem') vocals = mask_profanities(vocal_stem, profanities) # re-mixing print('Merge instrumentals stem and masked vocals stem') if not skip_ss: mix = AudioSegment.from_wav(instr_stem).overlay(vocals) else: mix = vocals # write mix to file outpath = input_file.replace('.mp3', '_masked.mp3').replace('.wav', '_masked.wav') if input_file.endswith('.wav'): mix.export(outpath, format="wav") elif input_file.endswith('.mp3'): mix.export(outpath, format="mp3") print(f'Mixed file written to: {outpath}') # play output if play_output: print('\nPlaying output...') play(mix) return outpath, vocal_stem, instr_stem def source_separation(inpath, use_demucs=False, use_spleeter=True): """ Execute shell command to run demucs and pipe stdout/stderr back to python """ infile = os.path.basename(inpath) if use_demucs: cmd = f'demucs --two-stems=vocals --jobs 8 "{inpath}"' #stems_dir = os.path.join(re.findall('/.*', stdout)[0], infile.replace('.mp3','').replace('.wav','')) elif use_spleeter: outdir = 'audio/separated' cmd = f'spleeter separate {inpath} -p spleeter:2stems -o {outdir}' stems_dir = os.path.join(outdir, os.path.splitext(infile)[0]) stdout, stderr = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash').communicate() stdout = stdout.decode('utf8') # exit if lib error'd out if stderr: stderr = stderr.decode('utf-8').lower() if 'error' in stderr or 'not exist' in stderr: print(stderr.decode('utf8').split('\n')[0]) raise Exception # parse stems directory path from stdout and return it if successful if not os.path.isdir(stems_dir): print(f'Error: output stem directory "{stems_dir}" not found') raise Exception return stems_dir def profanity_detection(text): """ """ # detect profanities in text profs = [] for segment in text['segments']: for word in segment['words']: #if word['confidence']<.25: # print(word) text = word['text'].replace('.','').replace(',','').lower() # skip false positives if text in ['cancer','hell','junk','die','lame','freak','freaky','white','stink','shut','spit','mouth','orders','eat','clouds','ugly','dirty','wet']: continue # assume anything returned by whisper with more than 1 * is profanity e.g n***a if '**' in text: profs.append(word) continue # add true negatives if text in ['bitchy', 'puss']: profs.append(word) continue # run profanity detection - returns 1 (True) or 0 (False) if predict([word['text']])[0]: profs.append(word) return profs def mask_profanities(vocal_stem, profanities): """ """ # load vocal stem and mask profanities vocals = AudioSegment.from_wav(vocal_stem) for prof in profanities: mask = vocals[prof['start']*1000:prof['end']*1000] # pydub works in milliseconds mask -= 50 # reduce lvl by some dB (enough to ~mute it) #mask = mask.silent(len(mask)) #mask = mask.fade_in(100).fade_out(100) # it prepends/appends fades so end up with longer mask start = vocals[:prof['start']*1000] end = vocals[prof['end']*1000:] #print(f"masking {prof['text']} from {prof['start']} to {prof['end']}") vocals = start + mask + end return vocals def process_audio(input_file, model_size): args = parse_args() inpath = os.path.abspath(input_file.name) outpath, vocal_stem, instr_stem = main(args, input_file=inpath, model_size=model_size) return outpath if __name__ == "__main__": args = parse_args() if len(sys.argv)>1: main(args, skip_ss=False) else: iface = gr.Interface( fn=process_audio, inputs=[gr.Audio(source="upload"), gr.Radio(['tiny','small','medium'])], outputs='audio' ) iface.launch()