Spaces:

Dpngtm
/

Audio-Emotion-Recognition

Running

App Files Community

Audio-Emotion-Recognition / app.py

Dpngtm

Create app.py

e111c36 verified 8 months ago

raw

history blame

1.79 kB

	import gradio as gr
	import torch
	from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
	import torchaudio

	# Hub checkpoint that supplies both the classifier weights and its processor
	model_name = "Dpngtm/wave2vec2-emotion-recognition" # Replace with your model's Hugging Face Hub path

	# Choose the inference device first (GPU when one is available)
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Load the processor, then load the model and place it on the chosen device
	processor = Wav2Vec2Processor.from_pretrained(model_name)
	model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

	# Preprocessing and inference function
	# Emotion labels indexed by the classifier's output id (use the same order as during training)
	_EMOTION_LABELS = ("angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised")
	# Sample rate the audio is resampled to and that is reported to the processor
	_TARGET_SAMPLE_RATE = 16000


	def recognize_emotion(audio):
	    """Predict the emotion expressed in an audio clip.

	    Args:
	        audio: Path to an audio file, as delivered by the Gradio ``Audio``
	            input configured with ``type="filepath"``. ``None`` or an empty
	            string means no clip was supplied.

	    Returns:
	        The predicted emotion label as a string, or a short explanatory
	        message when there is no audio to analyse.
	    """
	    # Gradio passes None when the user submits without recording anything;
	    # answer with a message instead of letting torchaudio.load() raise.
	    if not audio:
	        return "No audio provided. Please record or upload a clip."

	    # Load and resample audio to 16kHz
	    speech_array, sampling_rate = torchaudio.load(audio)
	    if speech_array.numel() == 0:
	        # A zero-length recording gives the model nothing to classify.
	        return "The audio clip is empty. Please record or upload a clip."
	    if sampling_rate != _TARGET_SAMPLE_RATE:
	        resampler = torchaudio.transforms.Resample(sampling_rate, _TARGET_SAMPLE_RATE)
	        speech_array = resampler(speech_array)
	    speech_array = speech_array.mean(dim=0).numpy() # Convert to mono if multi-channel

	    # Process input and make predictions
	    inputs = processor(
	        speech_array,
	        sampling_rate=_TARGET_SAMPLE_RATE,
	        return_tensors="pt",
	        padding=True,
	    )
	    # Move every input tensor to the same device as the model
	    inputs = {k: v.to(device) for k, v in inputs.items()}
	    with torch.no_grad():
	        logits = model(**inputs).logits
	    predicted_id = torch.argmax(logits, dim=-1).item()

	    return _EMOTION_LABELS[predicted_id]

	# Gradio interface
	import inspect

	# Gradio 4.0 replaced Audio's ``source="..."`` keyword with ``sources=[...]``.
	# Inspect the installed signature so the app builds on either API. Where the
	# list form exists, enable upload as well, matching the description below.
	if "sources" in inspect.signature(gr.Audio.__init__).parameters:
	    audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
	else:
	    audio_input = gr.Audio(source="microphone", type="filepath")

	interface = gr.Interface(
	    fn=recognize_emotion,
	    inputs=audio_input,
	    outputs="text",
	    title="Emotion Recognition with Wav2Vec2",
	    description="Upload or record audio, and the model will predict the emotion.",
	)

	# Launch the app
	interface.launch()