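# app.py for a Hugging Face Space: audio emotion detection with Gradio and the
# Hatman/audio-emotion-detection Wav2Vec2 sequence classifier. The commented
# `spaces` import and `@spaces.GPU` decorators are the original ZeroGPU hooks;
# uncomment them when running on ZeroGPU hardware.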
import gradio as gr
#import spaces ## For ZeroGPU
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

def preprocess_audio(audio):
    # Load the clip, downmix to mono, and resample to the 16 kHz rate expected by Wav2Vec2.
    waveform, sampling_rate = torchaudio.load(audio)
    waveform = waveform.mean(dim=0, keepdim=True)  # stereo -> mono
    resampled_waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(waveform)
    return {'speech': resampled_waveform.numpy().flatten(), 'sampling_rate': 16000}

#@spaces.GPU ## For ZeroGPU
def inference(audio):
    """Full inference: return the predicted label plus raw logits and the class id."""
    example = preprocess_audio(audio)
    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Not necessary on ZeroGPU
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return model.config.id2label[predicted_ids.item()], logits, predicted_ids

#@spaces.GPU ## For ZeroGPU
def inference_label(audio):
    """Label-only inference: return just the predicted emotion label."""
    example = preprocess_audio(audio)
    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Not necessary on ZeroGPU
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return model.config.id2label[predicted_ids.item()]

with gr.Blocks() as demo:
    gr.Markdown("# Audio Sentiment Analysis")
    with gr.Tab("Label Only Inference"):
        gr.Interface(
            fn=inference_label,
            inputs=gr.Audio(type="filepath"),
            outputs=gr.Label(label="Predicted Sentiment"),
            title="Audio Sentiment Analysis",
            description="Upload an audio file or record one to get the predicted sentiment label."
        )
    with gr.Tab("Full Inference"):
        gr.Interface(
            fn=inference,
            inputs=gr.Audio(type="filepath"),
            outputs=[gr.Label(label="Predicted Sentiment"), gr.Textbox(label="Logits"), gr.Textbox(label="Predicted IDs")],
            title="Audio Sentiment Analysis (Full)",
            description="Upload an audio file or record one to analyze sentiment and get detailed results."
        )

demo.launch(share=True)
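
# A minimal sketch of calling this Space remotely with gradio_client, kept
# commented out so it never runs inside the Space itself. The Space id, the
# local file path, and api_name are placeholders/assumptions; the exact
# handle_file/api_name usage depends on the installed gradio_client version
# (check client.view_api() for the real endpoint names).
#
# from gradio_client import Client, handle_file
# client = Client("Hatman/audio-emotion-detection")  # or the share URL printed at launch
# label = client.predict(handle_file("path/to/clip.wav"), api_name="/predict")
# print(label)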