import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
import json

# Initialize AI models
emotion_analyzer = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2")
speech_recognizer = pipeline("automatic-speech-recognition", model="kresnik/wav2vec2-large-xlsr-korean")

# Global state management
current_stage = "intro"
session_data = {}


def create_interface():
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # State management
        state = gr.State(value={"stage": "intro", "session_data": {}})

        # Header
        gr.Markdown("# 디지털 굿판")

        # Navigation tabs
        with gr.Tabs() as tabs:
            # Intro/worldview stage
            with gr.Tab("입장", id="intro"):
                gr.Markdown("""
                # 디지털 굿판에 오신 것을 환영합니다
                온천천의 디지털 치유 공간으로 들어가보세요.
                """)
                intro_next = gr.Button("여정 시작하기")

            # 청신 stage (sound purification)
            with gr.Tab("청신", id="cleansing", visible=False):
                with gr.Row():
                    audio_player = gr.Audio(
                        value="path_to_default_sound.mp3",  # default sound file
                        type="filepath",
                        label="온천천의 소리"
                    )
                    location_info = gr.Textbox(
                        label="현재 위치",
                        value="온천장역",
                        interactive=False
                    )
                cleansing_next = gr.Button("다음 단계로")

            # 기원 stage (voice analysis)
            with gr.Tab("기원", id="voice", visible=False):
                with gr.Row():
                    # Voice input component
                    voice_input = gr.Audio(
                        label="목소리로 전하기",
                        sources=["microphone", "upload"],
                        type="filepath"
                    )
                    # Analysis results
                    with gr.Column():
                        emotion_output = gr.JSON(
                            label="감정 분석 결과",
                            visible=True
                        )
                        text_output = gr.Textbox(
                            label="음성 텍스트",
                            visible=True
                        )
                voice_next = gr.Button("다음 단계로")

            # 송신 stage (sharing)
            with gr.Tab("송신", id="sharing", visible=False):
                with gr.Row():
                    gr.Gallery(
                        label="생성된 이미지",
                        show_label=True,
                        elem_id="gallery"
                    )
                gr.Markdown("## 공동체와 함께 나누기")
                complete_button = gr.Button("완료")

        # Floating navigation menu
        with gr.Row(visible=True) as float_menu:
            gr.Button("🏠", scale=1)
            gr.Button("🎵", scale=1)
            gr.Button("🎤", scale=1)
            gr.Button("🖼️", scale=1)

        # Voice analysis function
        def analyze_voice(audio_file, state):
            try:
                if audio_file is None:
                    return {"error": "No audio input provided"}, state

                # Load audio at 16 kHz, the sampling rate both models expect
                y, sr = librosa.load(audio_file, sr=16000)

                # Emotion analysis
                emotions = emotion_analyzer({"raw": y, "sampling_rate": sr})
                primary_emotion = emotions[0]

                # Speech to text
                text_result = speech_recognizer({"raw": y, "sampling_rate": sr})

                # Update state
                state["voice_analysis"] = {
                    "emotion": primary_emotion["label"],
                    "probability": float(primary_emotion["score"]),
                    "text": text_result["text"]
                }

                return {
                    "emotion": primary_emotion["label"],
                    "emotion_probability": f"{primary_emotion['score']:.2f}",
                    "transcribed_text": text_result["text"],
                    "status": "Analysis complete"
                }, state
            except Exception as e:
                return {"error": str(e), "status": "Error occurred"}, state

        # Event handlers
        voice_input.change(
            fn=analyze_voice,
            inputs=[voice_input, state],
            outputs=[emotion_output, state]
        )

        # Stage navigation: spread the old state first, then set "stage",
        # so the new stage value is not overwritten by the old one
        intro_next.click(
            fn=lambda s: {**s, "stage": "cleansing"},
            inputs=[state],
            outputs=[state],
        )
        cleansing_next.click(
            fn=lambda s: {**s, "stage": "voice"},
            inputs=[state],
            outputs=[state],
        )
        voice_next.click(
            fn=lambda s: {**s, "stage": "sharing"},
            inputs=[state],
            outputs=[state],
        )

    return app


# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch()
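

# --- Optional wiring sketch (not part of the flow above) -------------------
# The click handlers above only update `state`; the UI stays on the current
# tab. This is a minimal sketch of one way to also switch the active tab,
# assuming Gradio 4.x, where returning gr.Tabs(selected=<tab id>) to the
# `tabs` component changes the selection. The helper name `go_to_stage` is
# illustrative only, and the sketch assumes the target tabs are created
# visible (or that their visibility is handled separately).
def go_to_stage(stage_id):
    def _handler(s):
        # Record the new stage in the session state and select the matching tab
        return {**s, "stage": stage_id}, gr.Tabs(selected=stage_id)
    return _handler

# Example usage inside create_interface(), after `tabs` is defined:
#   intro_next.click(
#       fn=go_to_stage("cleansing"),
#       inputs=[state],
#       outputs=[state, tabs],
#   )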