import gradio as gr
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

# Load model and processor
model_name = "Dpngtm/wave2vec2-emotion-recognition"  # Replace with your model's Hugging Face Hub path
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Define device (use GPU if available) and put the model in inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Preprocessing and inference function
def recognize_emotion(audio):
    # Load audio and resample to the 16 kHz rate the model expects
    speech_array, sampling_rate = torchaudio.load(audio)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.mean(dim=0).numpy()  # Convert to mono if multi-channel

    # Process input and run the model
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Emotion labels mapped to indices (must match the order used during training)
    emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
    return emotion_labels[predicted_id]

# Gradio interface
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Gradio 4.x; use source="microphone" on Gradio 3.x
    outputs="text",
    title="Emotion Recognition with Wav2Vec2",
    description="Upload or record audio, and the model will predict the emotion.",
)

# Launch the app
interface.launch()
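
# A minimal sketch of calling the inference function directly, without the Gradio UI
# (handy for a quick local check). "path/to/clip.wav" is a hypothetical audio file path,
# not part of the original app; run this in place of interface.launch() if you only
# need a one-off prediction.
#
#   print(recognize_emotion("path/to/clip.wav"))  # prints one of the eight emotion labels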