import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
import json

# Initialize AI models
emotion_analyzer = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2")
speech_recognizer = pipeline("automatic-speech-recognition", model="kresnik/wav2vec2-large-xlsr-korean")

# Global state management
current_stage = "intro"
session_data = {}


def create_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # State management
        state = gr.State(value={"stage": "intro", "session_data": {}})

        # Header
        gr.Markdown("# 디지털 굿판")

        # Navigation tabs
        with gr.Tabs() as tabs:
            # Intro/worldview stage
            with gr.Tab("입장", id="intro"):
                gr.Markdown("""
                # 디지털 굿판에 오신 것을 환영합니다
                온천천의 디지털 치유 공간으로 들어가보세요.
                """)
                intro_next = gr.Button("여정 시작하기")

            # 청신 stage (sound purification)
            with gr.Tab("청신", id="cleansing", visible=False):
                with gr.Row():
                    audio_player = gr.Audio(
                        value="path_to_default_sound.mp3",  # default sound file
                        type="filepath",
                        label="온천천의 소리"
                    )
                    location_info = gr.Textbox(
                        label="현재 위치",
                        value="온천장역",
                        interactive=False
                    )
                cleansing_next = gr.Button("다음 단계로")

            # 기원 stage (voice analysis)
            with gr.Tab("기원", id="voice", visible=False):
                with gr.Row():
                    # Voice input component
                    voice_input = gr.Audio(
                        label="목소리로 전하기",
                        sources=["microphone", "upload"],
                        type="filepath"
                    )
                    # Analysis results
                    with gr.Column():
                        emotion_output = gr.JSON(
                            label="감정 분석 결과",
                            visible=True
                        )
                        text_output = gr.Textbox(
                            label="음성 텍스트",
                            visible=True
                        )
                voice_next = gr.Button("다음 단계로")

            # 송신 stage (sharing)
            with gr.Tab("송신", id="sharing", visible=False):
                with gr.Row():
                    gr.Gallery(
                        label="생성된 이미지",
                        show_label=True,
                        elem_id="gallery"
                    )
                gr.Markdown("## 공동체와 함께 나누기")
                complete_button = gr.Button("완료")

        # Floating navigation menu
        with gr.Row(visible=True) as float_menu:
            gr.Button("🏠", scale=1)
            gr.Button("🎵", scale=1)
            gr.Button("🎤", scale=1)
            gr.Button("🖼️", scale=1)

        # Voice analysis function
        def analyze_voice(audio_file, state):
            try:
                if audio_file is None:
                    return {"error": "No audio input provided"}, state

                # Load audio at 16 kHz, the sampling rate both models expect
                y, sr = librosa.load(audio_file, sr=16000)

                # Emotion analysis
                emotions = emotion_analyzer({"raw": y, "sampling_rate": sr})
                primary_emotion = emotions[0]

                # Speech to text
                text_result = speech_recognizer({"raw": y, "sampling_rate": sr})

                # Update state
                state["voice_analysis"] = {
                    "emotion": primary_emotion["label"],
                    "probability": float(primary_emotion["score"]),
                    "text": text_result["text"]
                }

                return {
                    "emotion": primary_emotion["label"],
                    "emotion_probability": f"{primary_emotion['score']:.2f}",
                    "transcribed_text": text_result["text"],
                    "status": "Analysis complete"
                }, state
            except Exception as e:
                return {"error": str(e), "status": "Error occurred"}, state

        # Event handlers
        voice_input.change(
            fn=analyze_voice,
            inputs=[voice_input, state],
            outputs=[emotion_output, state]
        )

        # Stage navigation: spread the old state first, then set "stage",
        # so the new stage value is not overwritten by the old one
        intro_next.click(
            fn=lambda s: {**s, "stage": "cleansing"},
            inputs=[state],
            outputs=[state],
        )
        cleansing_next.click(
            fn=lambda s: {**s, "stage": "voice"},
            inputs=[state],
            outputs=[state],
        )
        voice_next.click(
            fn=lambda s: {**s, "stage": "sharing"},
            inputs=[state],
            outputs=[state],
        )

    return app


# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch()
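

# --- Optional wiring sketch (not part of the flow above) -------------------
# The click handlers above only update `state`; the UI stays on the current
# tab. This is a minimal sketch of one way to also switch the active tab,
# assuming Gradio 4.x, where returning gr.Tabs(selected=<tab id>) to the
# `tabs` component changes the selection. The helper name `go_to_stage` is
# illustrative only, and the sketch assumes the target tabs are created
# visible (or that their visibility is handled separately).
def go_to_stage(stage_id):
    def _handler(s):
        # Record the new stage in the session state and select the matching tab
        return {**s, "stage": stage_id}, gr.Tabs(selected=stage_id)
    return _handler

# Example usage inside create_interface(), after `tabs` is defined:
#   intro_next.click(
#       fn=go_to_stage("cleansing"),
#       inputs=[state],
#       outputs=[state, tabs],
#   )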