music2emo-youtube-link-ja

Sleeping

App Files Files Community

kjysmu committed on Feb 11

Commit

7aff93b

verified ·

1 Parent(s): f1fa359

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -74

app.py CHANGED Viewed

@@ -38,6 +38,9 @@ from utils.mir_eval_modules import (
 from utils.mert import FeatureExtractorMERT
 from model.linear_mt_attn_ck import FeedforwardModelMTAttnCK
 # Suppress unnecessary warnings and logs
 warnings.filterwarnings("ignore")
 logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
@@ -170,7 +173,20 @@ def split_audio(waveform, sample_rate):
     return segments
 class Music2emo:
@@ -248,20 +264,32 @@ class Music2emo:
         feature_dir = Path("./inference/temp_out")
         output_dir = Path("./inference/output")
-        if feature_dir.exists():
-            shutil.rmtree(str(feature_dir))
-        if output_dir.exists():
-            shutil.rmtree(str(output_dir))
-        feature_dir.mkdir(parents=True)
-        output_dir.mkdir(parents=True)
         warnings.filterwarnings('ignore')
         logger.logging_verbosity(1)
         mert_dir = feature_dir / "mert"
-        mert_dir.mkdir(parents=True)
         waveform, sample_rate = torchaudio.load(audio)
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0).unsqueeze(0)
@@ -381,9 +409,6 @@ class Music2emo:
         midi.instruments.append(instrument)
         midi.write(save_path.replace('.lab', '.midi'))
         try:
             midi_file = converter.parse(save_path.replace('.lab', '.midi'))
             key_signature = str(midi_file.analyze('key'))
@@ -483,101 +508,158 @@ class Music2emo:
         model_input_dic = {k: v.to(self.device) for k, v in model_input_dic.items()}
         classification_output, regression_output = self.music2emo_model(model_input_dic)
-        probs = torch.sigmoid(classification_output)
         tag_list = np.load ( "./inference/data/tag_list.npy")
         tag_list = tag_list[127:]
         mood_list = [t.replace("mood/theme---", "") for t in tag_list]
         threshold = threshold
-        predicted_moods = [mood_list[i] for i, p in enumerate(probs.squeeze().tolist()) if p > threshold]
         valence, arousal = regression_output.squeeze().tolist()
         model_output_dic = {
             "valence": valence,
             "arousal": arousal,
-            "predicted_moods": predicted_moods
         }
         return model_output_dic
-# Initialize Mustango
 if torch.cuda.is_available():
     music2emo = Music2emo()
 else:
     music2emo = Music2emo(device="cpu")
 def format_prediction(model_output_dic):
-    """Format the model output in a more readable and attractive format"""
     valence = model_output_dic["valence"]
     arousal = model_output_dic["arousal"]
-    moods = model_output_dic["predicted_moods"]
-    # Create a formatted string with emojis and proper formatting
-    output_text = """
-🎵 **Music Emotion Recognition Results** 🎵
---------------------------------------------------
-🎭 **Predicted Mood Tags:** {}
-💖 **Valence:** {:.2f} (Scale: 1-9)
-⚡ **Arousal:** {:.2f} (Scale: 1-9)
---------------------------------------------------
-    """.format(
-        ', '.join(moods) if moods else 'None',
-        valence,
-        arousal
-    )
-    return output_text
 title = "Music2Emo: Towards Unified Music Emotion Recognition across Dimensional and Categorical Models"
-description_text = """
-<p>
-Upload an audio file to analyze its emotional characteristics using Music2Emo.
-The model will predict:
-• Mood tags describing the emotional content
-• Valence score (1-9 scale, representing emotional positivity)
-• Arousal score (1-9 scale, representing emotional intensity)
-</p>
-"""
 css = """
 #output-text {
-    font-family: monospace;
     white-space: pre-wrap;
-    font-size: 16px;
-    background-color: #333333;
-    padding: 20px;
-    border-radius: 10px;
-    margin: 10px 0;
 }
 .gradio-container {
     font-family: 'Inter', -apple-system, system-ui, sans-serif;
 }
 .gr-button {
     color: white;
-    background: #1565c0;
-    border-radius: 100vh;
 }
 """
-# Initialize Music2Emo
-if torch.cuda.is_available():
-    music2emo = Music2emo()
-else:
-    music2emo = Music2emo(device="cpu")
 with gr.Blocks(css=css) as demo:
-    gr.HTML(f"<h1><center>{title}</center></h1>")
     gr.Markdown(description_text)
     with gr.Row():
         with gr.Column(scale=1):
             input_audio = gr.Audio(
                 label="Upload Audio File",
-                type="filepath"  # Removed 'source' parameter
             )
             threshold = gr.Slider(
                 minimum=0.0,
@@ -585,29 +667,40 @@ with gr.Blocks(css=css) as demo:
                 value=0.5,
                 step=0.01,
                 label="Mood Detection Threshold",
-                info="Adjust threshold for mood detection (0.0 to 1.0)"
             )
             predict_btn = gr.Button("🎭 Analyze Emotions", variant="primary")
         with gr.Column(scale=1):
-            output_text = gr.Markdown(
-                label="Analysis Results",
-                elem_id="output-text"
-            )
     predict_btn.click(
         fn=lambda audio, thresh: format_prediction(music2emo.predict(audio, thresh)),
         inputs=[input_audio, threshold],
-        outputs=output_text
     )
     gr.Markdown("""
     ### 📝 Notes:
-    - Supported audio formats: MP3, WAV
-    - For best results, use high-quality audio files
-    - Processing may take a few moments depending on file size
     """)
-# Launch the demo
 demo.queue().launch()

 from utils.mert import FeatureExtractorMERT
 from model.linear_mt_attn_ck import FeedforwardModelMTAttnCK
+import matplotlib.pyplot as plt
 # Suppress unnecessary warnings and logs
 warnings.filterwarnings("ignore")
 logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
     return segments
def safe_remove_dir(directory):
    """Best-effort recursive removal of a directory tree.

    Deletes ``directory`` together with everything inside it; the directory
    does not have to be empty. A path that does not exist is a no-op.
    Failures are reported on stdout instead of being raised, so the caller
    can carry on and recreate the directory afterwards.

    Args:
        directory: Path (``str`` or ``pathlib.Path``) of the directory to
            remove.
    """
    directory = Path(directory)
    if not directory.exists():
        return
    try:
        shutil.rmtree(directory)
    except FileNotFoundError:
        # Entries vanished while rmtree was walking the tree.
        print(f"Warning: Some files in {directory} were already deleted.")
    except PermissionError:
        print(f"Warning: Permission issue encountered while deleting {directory}.")
    except Exception as e:
        # Deliberately broad: cleanup must never abort the caller.
        print(f"Unexpected error while deleting {directory}: {e}")
 class Music2emo:
         feature_dir = Path("./inference/temp_out")
         output_dir = Path("./inference/output")
+        # if feature_dir.exists():
+        #     shutil.rmtree(str(feature_dir))
+        # if output_dir.exists():
+        #     shutil.rmtree(str(output_dir))
+        # feature_dir.mkdir(parents=True)
+        # output_dir.mkdir(parents=True)
+        # warnings.filterwarnings('ignore')
+        # logger.logging_verbosity(1)
+        # mert_dir = feature_dir / "mert"
+        # mert_dir.mkdir(parents=True)
+        safe_remove_dir(feature_dir)
+        safe_remove_dir(output_dir)
+        feature_dir.mkdir(parents=True, exist_ok=True)
+        output_dir.mkdir(parents=True, exist_ok=True)
         warnings.filterwarnings('ignore')
         logger.logging_verbosity(1)
         mert_dir = feature_dir / "mert"
+        mert_dir.mkdir(parents=True, exist_ok=True)
         waveform, sample_rate = torchaudio.load(audio)
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0).unsqueeze(0)
         midi.instruments.append(instrument)
         midi.write(save_path.replace('.lab', '.midi'))
         try:
             midi_file = converter.parse(save_path.replace('.lab', '.midi'))
             key_signature = str(midi_file.analyze('key'))
         model_input_dic = {k: v.to(self.device) for k, v in model_input_dic.items()}
         classification_output, regression_output = self.music2emo_model(model_input_dic)
+        # probs = torch.sigmoid(classification_output)
         tag_list = np.load ( "./inference/data/tag_list.npy")
         tag_list = tag_list[127:]
         mood_list = [t.replace("mood/theme---", "") for t in tag_list]
         threshold = threshold
+        # Get probabilities
+        probs = torch.sigmoid(classification_output).squeeze().tolist()
+        # Include both mood names and scores
+        predicted_moods_with_scores = [
+            {"mood": mood_list[i], "score": round(p, 4)}  # Rounded for better readability
+            for i, p in enumerate(probs) if p > threshold
+        ]
+        # Include both mood names and scores
+        predicted_moods_with_scores_all = [
+            {"mood": mood_list[i], "score": round(p, 4)}  # Rounded for better readability
+            for i, p in enumerate(probs)
+        ]
+        # Sort by highest probability
+        predicted_moods_with_scores.sort(key=lambda x: x["score"], reverse=True)
         valence, arousal = regression_output.squeeze().tolist()
         model_output_dic = {
             "valence": valence,
             "arousal": arousal,
+            "predicted_moods": predicted_moods_with_scores,
+            "predicted_moods_all": predicted_moods_with_scores_all
         }
         return model_output_dic
+# Music2Emo Model Initialization
 if torch.cuda.is_available():
     music2emo = Music2emo()
 else:
     music2emo = Music2emo(device="cpu")
+# Plot Functions
def plot_mood_probabilities(predicted_moods_with_scores):
    """Draw the ten highest-scoring moods as a horizontal bar chart.

    Args:
        predicted_moods_with_scores: Sequence of dicts, each with a
            ``"mood"`` name and a numeric ``"score"``.

    Returns:
        The Matplotlib figure, or ``None`` when the input is empty.
    """
    if not predicted_moods_with_scores:
        return None

    scores = [entry["score"] for entry in predicted_moods_with_scores]

    # Indices of the entries ordered from highest to lowest score,
    # truncated to the ten best.
    ranking = np.argsort(scores)[::-1][:10]
    top_moods = [predicted_moods_with_scores[i]["mood"] for i in ranking]
    top_scores = [scores[i] for i in ranking]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(top_moods, top_scores, color="#4CAF50")
    ax.set_xlabel("Probability")
    ax.set_title("Top 10 Predicted Mood Tags")
    # barh stacks bars bottom-up; flip the axis so the best mood is on top.
    ax.invert_yaxis()
    return fig
def plot_valence_arousal(valence, arousal):
    """Plot one valence/arousal prediction as a point on a 2D plane.

    Both axes are fixed to the range 1-9, and dashed guide lines at 5 split
    the plane into four quadrants.

    Args:
        valence: Value plotted on the x axis.
        arousal: Value plotted on the y axis.

    Returns:
        The Matplotlib figure containing the plot.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    # The label gives ax.legend() below an entry to display; with no
    # labelled artist the legend has no entries.
    ax.scatter(valence, arousal, color="red", s=100, label="Prediction")
    ax.set_xlim(1, 9)
    ax.set_ylim(1, 9)
    # Add midpoint lines
    ax.axhline(y=5, color='gray', linestyle='--', linewidth=1)  # Horizontal middle line
    ax.axvline(x=5, color='gray', linestyle='--', linewidth=1)  # Vertical middle line
    # Labels & Grid
    ax.set_xlabel("Valence (Positivity)")
    ax.set_ylabel("Arousal (Intensity)")
    ax.set_title("Valence-Arousal Plot")
    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.6)
    return fig
+# Prediction Formatting
 def format_prediction(model_output_dic):
+    """Format the model output in a structured format"""
     valence = model_output_dic["valence"]
     arousal = model_output_dic["arousal"]
+    predicted_moods_with_scores = model_output_dic["predicted_moods"]
+    predicted_moods_with_scores_all = model_output_dic["predicted_moods_all"]
+    # Generate charts
+    va_chart = plot_valence_arousal(valence, arousal)
+    mood_chart = plot_mood_probabilities(predicted_moods_with_scores_all)
+    # Format mood output with scores
+    if predicted_moods_with_scores:
+        moods_text = ", ".join(
+            [f"**{m['mood']}** ({m['score']:.2f})" for m in predicted_moods_with_scores]
+        )
+    else:
+        moods_text = "No significant moods detected."
+    # Create formatted output
+    output_text = f""" 🎭 Predicted Mood Tags : {moods_text}
+💖 Valence: {valence:.2f} (Scale: 1-9)
+⚡ Arousal: {arousal:.2f} (Scale: 1-9)"""
+    return output_text, va_chart, mood_chart
+# Gradio UI Elements
 title = "Music2Emo: Towards Unified Music Emotion Recognition across Dimensional and Categorical Models"
+description_text = "Upload an audio file to analyze its emotional characteristics using Music2Emo. The model will predict: • Mood tags describing the emotional content • Valence score (1-9 scale, representing emotional positivity) • Arousal score (1-9 scale, representing emotional intensity) "
+# Custom CSS Styling
 css = """
 #output-text {
+    font-family: 'Inter', sans-serif;
     white-space: pre-wrap;
+    font-size: 14px;
+    background-color: #222222;
+    padding: 0spx;
+    border-radius: 8px;
+    border-left: 5px solid #4CAF50;
+    margin: 0px 0;
 }
 .gradio-container {
     font-family: 'Inter', -apple-system, system-ui, sans-serif;
 }
 .gr-button {
     color: white;
+    background: #4CAF50;
+    border-radius: 8px;
+    padding: 10px;
 }
 """
 with gr.Blocks(css=css) as demo:
+    gr.HTML(f"<h1 style='text-align: center;'>{title}</h1>")
     gr.Markdown(description_text)
     with gr.Row():
+        # Left Panel (Input)
         with gr.Column(scale=1):
             input_audio = gr.Audio(
                 label="Upload Audio File",
+                type="filepath"
             )
             threshold = gr.Slider(
                 minimum=0.0,
                 value=0.5,
                 step=0.01,
                 label="Mood Detection Threshold",
+                info="Adjust threshold for mood detection"
             )
             predict_btn = gr.Button("🎭 Analyze Emotions", variant="primary")
+        # Right Panel (Output)
         with gr.Column(scale=1):
+            output_text = gr.Markdown(label="Analysis Results", elem_id="output-text")
+            # ✅ Using `gr.Row(equal_height=True)` ensures both plots stay on the same level
+            with gr.Row(equal_height=True):
+                mood_chart = gr.Plot(label=" ", scale=2)
+                va_chart = gr.Plot(label=" ", scale=1)
+    predict_btn.click(
+        fn=lambda audio, thresh: format_prediction(music2emo.predict(audio, thresh)),
+        inputs=[input_audio, threshold],
+        outputs=[output_text, va_chart, mood_chart]
+    )
+    # Button Click Function
     predict_btn.click(
         fn=lambda audio, thresh: format_prediction(music2emo.predict(audio, thresh)),
         inputs=[input_audio, threshold],
+        outputs=[output_text, va_chart, mood_chart]
     )
+    # Notes Section
     gr.Markdown("""
     ### 📝 Notes:
+    - **Supported audio formats:** MP3, WAV
+    - **Recommended:** High-quality audio files
+    - **Processing time:** A few seconds, depending on file size
     """)
+# Launch the App
 demo.queue().launch()