import gradio as gr
import numpy as np
import librosa
from transformers import pipeline

# Sample rate expected by both models below (AST and wav2vec2 are 16 kHz models).
# librosa's default of 22050 Hz would silently skew both analyses, because the
# pipelines receive a raw numpy array and assume it is at the model's native rate.
TARGET_SR = 16000

# Initialize models once at import time so every request reuses them.
emotion_analyzer = pipeline(
    "audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
)
speech_recognizer = pipeline(
    "automatic-speech-recognition",
    model="kresnik/wav2vec2-large-xlsr-korean",
)


def analyze_voice(audio_file):
    """Analyze a recorded voice clip: emotion, transcription, and energy level.

    Args:
        audio_file: Filesystem path to the recording (Gradio ``type="filepath"``),
            or ``None`` when the user submits without recording anything.

    Returns:
        dict: On success — the top emotion label and its score, the transcribed
        text, the mean RMS energy, and a status message. On failure — an
        ``error`` message and an error status (the UI shows it as JSON).
    """
    # Guard: Gradio passes None if no audio was captured.
    if audio_file is None:
        return {
            "error": "No audio provided",
            "status": "Error occurred",
        }

    try:
        # Load and resample to the models' native 16 kHz rate (see TARGET_SR).
        y, sr = librosa.load(audio_file, sr=TARGET_SR)

        # 1. Voice emotion analysis — take the highest-scoring label.
        emotions = emotion_analyzer(y)
        primary_emotion = emotions[0]

        # 2. Speech to text — pass the sampling rate explicitly so the
        #    pipeline never has to assume it.
        text_result = speech_recognizer({"raw": y, "sampling_rate": sr})

        # 3. Audio feature: overall loudness via mean RMS energy.
        energy = np.mean(librosa.feature.rms(y=y))

        return {
            "emotion": primary_emotion['label'],
            "emotion_probability": f"{primary_emotion['score']:.2f}",
            "transcribed_text": text_result['text'],
            "energy_level": f"{energy:.2f}",
            "status": "Analysis complete",
        }
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the request handler.
        return {
            "error": str(e),
            "status": "Error occurred",
        }


# Create Gradio interface.
# NOTE: `sources=[...]` is the Gradio 4.x spelling; the old `source=` keyword
# was removed and raises TypeError on current versions.
interface = gr.Interface(
    fn=analyze_voice,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Voice Input"),
    outputs=gr.JSON(label="Analysis Results"),
    title="Digital Gut - Voice Emotion Analysis",
    description="Performs emotion analysis and text conversion from voice input.",
    theme=gr.themes.Soft(),
    analytics_enabled=True,
)

# Launch app
if __name__ == "__main__":
    interface.launch()