Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Create app.py
Browse files
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,133 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import streamlit as st
         | 
| 2 | 
            +
            from transformers import WhisperForConditionalGeneration, WhisperProcessor
         | 
| 3 | 
            +
            from transformers import pipeline
         | 
| 4 | 
            +
            import librosa
         | 
| 5 | 
            +
            import torch
         | 
| 6 | 
            +
            from spleeter.separator import Separator
         | 
| 7 | 
            +
            from pydub import AudioSegment
         | 
| 8 | 
            +
            from IPython.display import Audio
         | 
| 9 | 
            +
            import os
         | 
| 10 | 
            +
            import accelerate
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
# preprocess and crop audio file
def audio_preprocess(file_name='/test1/vocals.wav', output_dir='output',
                     start_time=60000, end_time=110000):
    """Separate vocals from music with spleeter and crop a window of audio.

    Parameters
    ----------
    file_name : str
        Path to the input audio file (default '/test1/vocals.wav').
    output_dir : str
        Directory where spleeter writes the separated stems (default 'output').
    start_time, end_time : int
        Crop window in milliseconds; defaults preserve the original
        60 s – 110 s slice (60000 / 110000 ms).

    Returns
    -------
    pydub.AudioSegment
        The cropped audio segment, held in memory (not exported to disk).
    """
    # Separate music and vocal stems.
    # BUG FIX: the original called separate_to_file(input_file, output_file)
    # with two names that are undefined in this scope, so every call raised
    # NameError. Use the function's own arguments instead.
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(file_name, output_dir)

    # Crop the audio to the requested window (pydub slices in milliseconds).
    audio = AudioSegment.from_file(file_name)
    processed_audio = audio[start_time:end_time]
    # .export('cropped_vocals.wav', format='wav') # save vocal audio file
    return processed_audio
| 36 | 
            +
             | 
| 37 | 
            +
             | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
# ASR transcription
def asr_model(processed_audio, model_name="RexChan/ISOM5240-whisper-small-zhhk_1"):
    """Transcribe an audio clip with a fine-tuned Whisper model.

    Parameters
    ----------
    processed_audio : str or file-like
        Audio source accepted by ``librosa.load`` (path or uploaded file).
    model_name : str
        Hugging Face checkpoint to load. The default preserves the original
        hard-coded Cantonese model, so existing callers are unaffected.

    Returns
    -------
    str
        The decoded transcription (first sequence, special tokens stripped).
    """
    # Load the audio, resampled to the 16 kHz rate Whisper expects.
    y, sr = librosa.load(processed_audio, sr=16000)

    # Load processor + model. NOTE(review): this reloads the checkpoint on
    # every call; consider caching (e.g. st.cache_resource) if called often.
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name, low_cpu_mem_usage=True)

    # Disable forced decoder ids / token suppression / KV caching, matching
    # the original configuration.
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    # Run generation and decode the first (only) sequence.
    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True, return_dict_in_generate=True
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")

    return transcription
| 70 | 
            +
             | 
| 71 | 
            +
             | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
# sentiment analysis
def senti_model(transcription,
                model_name="lxyuan/distilbert-base-multilingual-cased-sentiments-student"):
    """Classify the sentiment of transcribed lyrics.

    Parameters
    ----------
    transcription : str
        Text to classify (output of ``asr_model``).
    model_name : str
        Hugging Face text-classification checkpoint. The default preserves
        the original hard-coded model, so existing callers are unaffected.

    Returns
    -------
    str
        Human-readable summary of the top label and its confidence.
    """
    pipe = pipeline("text-classification", model=model_name)
    final_result = pipe(transcription)
    # User-facing message; fixed the grammatical error "Confident level"
    # -> "Confidence level" in the displayed text.
    display = f"Sentiment Analysis shows that this song is {final_result[0]['label']}. Confidence level of this analysis is {final_result[0]['score']*100:.1f}%."
    print(display)
    return display
| 86 | 
            +
             | 
| 87 | 
            +
             | 
| 88 | 
            +
             | 
| 89 | 
            +
             | 
# main
def main(input_file):
    """Run the ASR + sentiment pipeline on an uploaded song and display results.

    Parameters
    ----------
    input_file : file-like
        Uploaded audio file (e.g. Streamlit's UploadedFile).
    """
    # Preprocessing is currently bypassed; the uploaded file is fed to the
    # ASR model directly (original behavior).
    # processed_audio = audio_preprocess(input_file)
    processed_audio = input_file

    transcription = asr_model(processed_audio)
    final_result = senti_model(transcription)
    st.write(final_result)

    # BUG FIX: the original called st.audio(audio_data['audio'], ...) but
    # `audio_data` is undefined anywhere in this file, so clicking the
    # button raised NameError. Play the uploaded file instead.
    if st.button("Play Audio"):
        st.audio(input_file, format="audio/wav", start_time=0)
| 108 | 
            +
             | 
| 109 | 
            +
             | 
| 110 | 
            +
             | 
| 111 | 
            +
             | 
| 112 | 
            +
            if __name__ == '__main__':
         | 
| 113 | 
            +
             | 
| 114 | 
            +
             | 
| 115 | 
            +
               # steamlit setup
         | 
| 116 | 
            +
               st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
         | 
| 117 | 
            +
               st.header("Cantonese Song Sentiment Analyzer")
         | 
| 118 | 
            +
               input_file = st.file_uploader("upload a song in mp3 format", type="mp3") # upload song
         | 
| 119 | 
            +
               if input_file is not None:
         | 
| 120 | 
            +
                   st.write("File uploaded successfully!")
         | 
| 121 | 
            +
                   st.write(input_file)
         | 
| 122 | 
            +
               else:
         | 
| 123 | 
            +
                   st.write("No file uploaded.")
         | 
| 124 | 
            +
               button_click = st.button("Run Analysis", type="primary")
         | 
| 125 | 
            +
             | 
| 126 | 
            +
             | 
| 127 | 
            +
               # load song
         | 
| 128 | 
            +
               #input_file = os.path.isfile("test1.mp3")
         | 
| 129 | 
            +
               # output_file = os.path.isdir("")
         | 
| 130 | 
            +
             | 
| 131 | 
            +
             | 
| 132 | 
            +
               if button_click:
         | 
| 133 | 
            +
                   main(input_file=input_file)
         |