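"""Gradio demo for SONICS: Synthetic Or Not - Identifying Counterfeit Songs (ICLR 2025).

Loads a SpecTTTra checkpoint from the Hugging Face Hub, classifies the middle
chunk of an uploaded song, and reports Real vs. Fake probabilities.
"""
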
import torch
import librosa
import numpy as np
import gradio as gr

from sonics import HFAudioClassifier

# Model configurations
MODEL_IDS = {
    "SpecTTTra-α (5s)": "awsaf49/sonics-spectttra-alpha-5s",
    "SpecTTTra-β (5s)": "awsaf49/sonics-spectttra-beta-5s",
    "SpecTTTra-γ (5s)": "awsaf49/sonics-spectttra-gamma-5s",
    "SpecTTTra-α (120s)": "awsaf49/sonics-spectttra-alpha-120s",
    "SpecTTTra-β (120s)": "awsaf49/sonics-spectttra-beta-120s",
    "SpecTTTra-γ (120s)": "awsaf49/sonics-spectttra-gamma-120s",
}
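
# Run on GPU when available; fall back to CPU otherwise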
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
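# Cache loaded models so switching models in the UI doesn't reload weights each time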
model_cache = {}


def load_model(model_name):
    """Load model if not already cached"""
    if model_name not in model_cache:
        model_id = MODEL_IDS[model_name]
        model = HFAudioClassifier.from_pretrained(model_id)
        model = model.to(device)
        model.eval()
        model_cache[model_name] = model
    return model_cache[model_name]


def process_audio(audio_path, model_name):
    """Process an audio file and return Real/Fake probabilities."""
    try:
        model = load_model(model_name)
        # Clip length in seconds that the selected checkpoint expects (5 or 120)
        max_time = model.config.audio.max_time

        # Load audio, resampled to the 16 kHz rate the classifier expects
        audio, sr = librosa.load(audio_path, sr=16000)
        chunk_samples = int(max_time * sr)
        total_chunks = len(audio) // chunk_samples
        middle_chunk_idx = total_chunks // 2

        # Extract the middle chunk as a representative segment of the song
        start = middle_chunk_idx * chunk_samples
        end = start + chunk_samples
        chunk = audio[start:end]

        # Zero-pad clips shorter than one full chunk
        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        # Get prediction: the model emits a single logit; sigmoid maps it to P(fake)
        with torch.no_grad():
            chunk = torch.from_numpy(chunk).float().to(device)
            pred = model(chunk.unsqueeze(0))
            prob = torch.sigmoid(pred).cpu().numpy()[0]

        # Convert to plain floats so gr.Label renders the confidences correctly
        return {"Real": float(1 - prob), "Fake": float(prob)}
    except Exception as e:
        return {"Error": str(e)}


def predict(audio_file, model_name):
    """Gradio interface function"""
    if audio_file is None:
        return {"Message": "Please upload an audio file"}
    return process_audio(audio_file, model_name)


# Create Gradio interface
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 1rem;">
            <img src="https://i.postimg.cc/3Jx3yZ5b/real-vs-fake-sonics-w-logo.jpg"
                 style="max-width: 300px; margin: 0 auto;">
            <h1>SONICS: Synthetic Or Not - Identifying Counterfeit Songs</h1>
            <h3>ICLR 2025 [Poster]</h3>
        </div>
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath"
            )
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_IDS.keys()),
                value="SpecTTTra-γ (5s)",
                label="Select Model"
            )
            submit_btn = gr.Button("Analyze Audio")

        with gr.Column():
            output = gr.Label(
                label="Analysis Result",
                num_top_classes=2
            )

    gr.Markdown(
        """
        ### Resources
        - [📄 Paper](https://openreview.net/forum?id=PY7KSh29Z8)
        - [🎵 Dataset](https://huggingface.co/datasets/awsaf49/sonics)
        - [🔬 ArXiv](https://arxiv.org/abs/2408.14080)
        - [💻 GitHub](https://github.com/awsaf49/sonics)
        """
    )

    submit_btn.click(
        fn=predict,
        inputs=[audio_input, model_dropdown],
        outputs=[output]
    )


if __name__ == "__main__":
    demo.launch()
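
# Quick smoke test without the UI, assuming this file is saved as app.py and a
# local file "song.wav" exists (both names are hypothetical):
#   from app import process_audio
#   print(process_audio("song.wav", "SpecTTTra-γ (5s)"))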