# Source: Hugging Face Spaces - haepada / roots (Space status: Sleeping)
# File size: 17,160 Bytes

# Part 1/3 - Setup and Utilities
import gradio as gr
import numpy as np
import librosa
from transformers import pipeline
from datetime import datetime
import os
import requests

# Read the Hugging Face API token from the environment variable named "roots";
# the module refuses to load without it.
HF_API_TOKEN = os.environ.get("roots")
if HF_API_TOKEN is None or HF_API_TOKEN == "":
    raise ValueError("roots token not found in environment variables")

# Inference API endpoint and authorization header used by the image request.
API_URL = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-xl-base-1.0"
headers = {"Authorization": "Bearer " + HF_API_TOKEN}

# Model pipelines, created once at import time and shared by every handler.
speech_recognizer = pipeline(
    task="automatic-speech-recognition",
    model="kresnik/wav2vec2-large-xlsr-korean",
)
text_analyzer = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

def calculate_baseline_features(audio_path):
    """Extract the reference ("baseline") acoustic features of a recording.

    The file is loaded at 16 kHz and summarised into a few scalar features
    plus averaged MFCCs.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A dict with the keys "energy" (mean RMS), "tempo" (first tempo
        estimate), "pitch" (mean zero-crossing rate, stored under the key
        "pitch"), "volume" (mean absolute amplitude) and "mfcc" (list of 13
        time-averaged MFCC values), or None when no path is given or the
        audio cannot be loaded or analysed.
    """
    if not audio_path:
        print("Error calculating baseline: no audio file provided")
        return None

    try:
        y, sr = librosa.load(audio_path, sr=16000)
        if len(y) == 0:
            raise ValueError("audio contains no samples")

        # Pass the audio and its sample rate by keyword: the estimator has to
        # know the 16 kHz rate used above (its own default differs), and newer
        # librosa releases do not accept the audio positionally.
        tempo = librosa.beat.tempo(y=y, sr=sr)

        return {
            "energy": float(np.mean(librosa.feature.rms(y=y))),
            "tempo": float(tempo[0]),
            "pitch": float(np.mean(librosa.feature.zero_crossing_rate(y))),
            "volume": float(np.mean(np.abs(y))),
            "mfcc": librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1).tolist(),
        }
    except Exception as e:
        # Best-effort: callers treat None as "no baseline available".
        print(f"Error calculating baseline: {str(e)}")
        return None

def map_acoustic_to_emotion(features, baseline_features=None):
    """Map acoustic features to an emotion description.

    Each feature is normalised to a fixed scale (energy 0-100, tempo and
    pitch 0-1). Without a usable baseline value the absolute feature is
    scaled; with one, the ratio to the baseline is used and centred so that
    "same as baseline" lands on the midpoint of the scale (50 for energy,
    0.5 for tempo and pitch). All normalised values are clamped to their
    scale so intensity never exceeds 100 and confidence never exceeds 1.

    Args:
        features: Dict with numeric "energy", "tempo", "pitch" and "volume".
        baseline_features: Optional dict with reference "energy", "tempo"
            and "pitch" values. Missing, zero or negative reference values
            are ignored for that feature (absolute scaling is used instead).

    Returns:
        Dict with "primary" (emotion label), "intensity" (0-100),
        "confidence" (0-1), "secondary" (always empty), "characteristics"
        (list of descriptive strings) and "details" (formatted summary).
    """

    def _relative(key):
        """Return features[key] / baseline[key], or None without a usable baseline."""
        if not baseline_features:
            return None
        reference = baseline_features.get(key)
        if not reference or reference <= 0:
            # Avoid ZeroDivisionError on a silent or failed baseline recording.
            return None
        return features[key] / reference

    def _clamp(value, upper):
        """Limit value to the closed range [0, upper]."""
        return max(0.0, min(float(value), upper))

    energy_ratio = _relative("energy")
    tempo_ratio = _relative("tempo")
    pitch_ratio = _relative("pitch")

    # Normalise each feature; a ratio of 1.0 (identical to baseline) maps to
    # the middle of the scale so the thresholds below stay meaningful.
    energy_norm = _clamp(
        features["energy"] * 100 if energy_ratio is None else energy_ratio * 50,
        100.0,
    )
    tempo_norm = _clamp(
        features["tempo"] / 200 if tempo_ratio is None else tempo_ratio * 0.5,
        1.0,
    )
    pitch_norm = _clamp(
        features["pitch"] * 2 if pitch_ratio is None else pitch_ratio * 0.5,
        1.0,
    )

    emotions = {
        "primary": "",
        "intensity": energy_norm,
        "confidence": 0.0,
        "secondary": "",
        "characteristics": [],
    }

    # Emotion mapping: energy is checked first, then pitch, then low energy.
    if energy_norm > 70:
        if tempo_norm > 0.6:
            emotions["primary"] = "기쁨/열정"
            emotions["characteristics"].append("빠르고 활기찬 말하기 패턴")
        else:
            emotions["primary"] = "분노/강조"
            emotions["characteristics"].append("강한 음성 강도")
        emotions["confidence"] = energy_norm / 100
    elif pitch_norm > 0.6:
        if energy_norm > 50:
            emotions["primary"] = "놀람/흥분"
            emotions["characteristics"].append("높은 음고와 강한 강세")
        else:
            emotions["primary"] = "관심/호기심"
            emotions["characteristics"].append("음고 변화가 큼")
        emotions["confidence"] = pitch_norm
    elif energy_norm < 30:
        if tempo_norm < 0.4:
            emotions["primary"] = "슬픔/우울"
            emotions["characteristics"].append("느리고 약한 음성")
        else:
            emotions["primary"] = "피로/무기력"
            emotions["characteristics"].append("낮은 에너지 레벨")
        emotions["confidence"] = (30 - energy_norm) / 30
    else:
        if tempo_norm > 0.5:
            emotions["primary"] = "평온/안정"
            emotions["characteristics"].append("균형잡힌 말하기 패턴")
        else:
            emotions["primary"] = "차분/진지"
            emotions["characteristics"].append("안정적인 음성 특성")
        emotions["confidence"] = 0.5

    emotions["details"] = {
        "energy_level": f"{energy_norm:.1f}%",
        "speech_rate": f"{'빠름' if tempo_norm > 0.6 else '보통' if tempo_norm > 0.4 else '느림'}",
        "pitch_variation": f"{'높음' if pitch_norm > 0.6 else '보통' if pitch_norm > 0.3 else '낮음'}",
        "voice_volume": f"{'큼' if features['volume'] > 0.7 else '보통' if features['volume'] > 0.3 else '작음'}",
    }

    return emotions

# Part 2/3 - Core Functions
def generate_image_from_prompt(prompt, timeout=120):
    """Request an image for *prompt* from the configured inference endpoint.

    Args:
        prompt: Text prompt sent as the "inputs" field. An empty or None
            prompt short-circuits without any network traffic.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout the request could block the caller forever.

    Returns:
        The raw response body (bytes) when the endpoint answers with HTTP
        200, otherwise None (missing prompt, request failure or non-200
        status). Failures are reported on stdout.
    """
    print(f"Generating image with prompt: {prompt}")
    if not prompt:
        print("No prompt provided")
        return None

    payload = {
        "inputs": prompt,
        "parameters": {
            "negative_prompt": "ugly, blurry, poor quality, distorted",
            "num_inference_steps": 30,
            "guidance_scale": 7.5,
        },
    }

    try:
        response = requests.post(
            API_URL,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Connection errors and timeouts are reported, not raised.
        print(f"Error generating image: {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    print("Image generated successfully")
    return response.content

def _sentiment_strength(text_sentiment):
    """Return the value printed as "<value>/5" for a sentiment result.

    When the "label" starts with an integer word (e.g. "4 stars") that
    integer is returned; otherwise the raw "score" entry is used.
    """
    label_words = str(text_sentiment.get("label", "")).split()
    if label_words and label_words[0].isdigit():
        return int(label_words[0])
    return text_sentiment["score"]


def generate_detailed_prompt(text, emotions, text_sentiment):
    """Build a detailed image prompt from the voice and text analysis.

    Args:
        text: Transcribed utterance, quoted verbatim in the prompt.
        emotions: Dict with "primary" (emotion label), "intensity" (number
            compared against 70 and 40) and "characteristics" (list of str).
        text_sentiment: Dict describing the text sentiment. The strength on
            the "/5" scale is taken from a star-style "label" ("N stars")
            when present, because "score" is then the classifier's
            confidence rather than a 1-5 rating; otherwise "score" is used.

    Returns:
        The prompt string.
    """
    emotion_colors = {
        "기쁨/열정": "밝은 노랑과 따뜻한 주황색",
        "분노/강조": "강렬한 빨강과 짙은 검정",
        "놀람/흥분": "선명한 파랑과 밝은 보라",
        "관심/호기심": "연한 하늘색과 민트색",
        "슬픔/우울": "어두운 파랑과 회색",
        "피로/무기력": "탁한 갈색과 짙은 회색",
        "평온/안정": "부드러운 초록과 베이지",
        "차분/진지": "차분한 남색과 깊은 보라",
    }

    # Stronger voice intensity selects a more dynamic visual style.
    if emotions["intensity"] > 70:
        visual_style = "역동적인 붓질과 강한 대비"
    elif emotions["intensity"] > 40:
        visual_style = "균형잡힌 구도와 중간 톤의 조화"
    else:
        visual_style = "부드러운 그라데이션과 차분한 톤"

    palette = emotion_colors.get(emotions["primary"], "자연스러운 색상")
    traits = ", ".join(emotions["characteristics"])
    strength = _sentiment_strength(text_sentiment)

    sentences = [
        f"한국 전통 민화 스타일의 추상화, {palette} 기반. ",
        f"{visual_style}로 표현된 {emotions['primary']}의 감정. ",
        f"음성의 특징({traits})을 화면의 동적 요소로 표현. ",
        f"발화 내용 '{text}'에서 느껴지는 감정(강도: {strength}/5)을 은유적 이미지로 담아내기.",
    ]
    return "".join(sentences)

def update_final_prompt(state):
    """Combine the stored reflections into one image prompt.

    Args:
        state: Dict whose "reflections" entry is a sequence of
            (time, text, sentiment) triples.

    Returns:
        An empty string when there are no reflections; otherwise a header
        line followed by one "- time: text (sentiment)" line per reflection,
        each terminated by a newline.
    """
    entries = state["reflections"]
    if not entries:
        return ""

    header = "한국 전통 민화 스타일의 추상화, 온천천에서의 다음 감상들을 담아내기:\n"
    bullet_lines = [
        f"- {when}: {note} ({mood})\n"
        for when, note, mood in entries
    ]
    return header + "".join(bullet_lines)

def analyze_voice_with_retry(audio_path, state, max_retries=3):
    """Transcribe a recording and analyse its emotion, retrying on failure.

    Each attempt loads the audio at 16 kHz, extracts acoustic features,
    transcribes the audio, maps the features to an emotion (relative to
    state["baseline_features"] when present) and runs text sentiment
    analysis on the transcription.

    Args:
        audio_path: Path of the audio file to analyse.
        state: Session dict; only its "baseline_features" entry is read.
            None is treated as an empty dict.
        max_retries: Maximum number of attempts.

    Returns:
        Dict with "text" (transcription), "emotions" (emotion mapping) and
        "sentiment" (first sentiment result), or None when max_retries is
        less than 1 and therefore no attempt is made.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    baseline = (state or {}).get("baseline_features")

    for attempt in range(max_retries):
        try:
            y, sr = librosa.load(audio_path, sr=16000)

            # Pass audio and sample rate by keyword so the tempo estimate is
            # computed for the 16 kHz signal; newer librosa releases do not
            # accept the audio positionally.
            tempo = librosa.beat.tempo(y=y, sr=sr)

            acoustic_features = {
                "energy": float(np.mean(librosa.feature.rms(y=y))),
                "tempo": float(tempo[0]),
                "pitch": float(np.mean(librosa.feature.zero_crossing_rate(y))),
                "volume": float(np.mean(np.abs(y))),
            }

            transcription = speech_recognizer(y)
            text = transcription["text"]

            emotions = map_acoustic_to_emotion(acoustic_features, baseline)
            text_sentiment = text_analyzer(text)[0]

            return {
                "text": text,
                "emotions": emotions,
                "sentiment": text_sentiment,
            }
        except Exception:
            if attempt == max_retries - 1:
                # Out of attempts: surface the original error and traceback.
                raise
            print(f"Attempt {attempt + 1} failed, retrying...")

    return None

# Part 3/3 - Interface and Main
def create_interface():
    """Build the Gradio Blocks application.

    The UI is a sequence of tabs (entrance, baseline setup, cleansing,
    prayer, sending). Session data lives in a gr.State dict with the keys
    "user_name", "baseline_features", "reflections", "voice_analysis" and
    "final_prompt".

    Returns:
        The assembled gr.Blocks app (not launched).
    """
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        state = gr.State({
            "user_name": "",
            "baseline_features": None,
            "reflections": [],
            "voice_analysis": None,
            "final_prompt": ""
        })

        # Header and navigation
        header = gr.Markdown("# 디지털 굿판")
        user_display = gr.Markdown("")

        # Every tab carries an explicit id because the handlers below switch
        # tabs with gr.update(selected=...), which matches on the tab id.
        # NOTE(review): all tabs after the first are created with
        # visible=False and nothing here changes that; navigation relies on
        # programmatic selection. There is also no control that selects the
        # "송신" tab - confirm the intended flow for the Gradio version in use.
        with gr.Tabs() as tabs:
            # Entrance
            with gr.Tab("입장", id="입장"):
                gr.Markdown("### 디지털 굿판에 오신 것을 환영합니다")
                name_input = gr.Textbox(label="이름을 알려주세요", placeholder="이름을 입력해주세요")
                start_btn = gr.Button("여정 시작하기")

            # Baseline setup
            with gr.Tab("기준 설정", id="기준 설정", visible=False) as baseline_tab:
                gr.Markdown("""### 축원의 문장을 평온한 마음으로 읽어주세요
                '당신의 건강과 행복이 늘 가득하기를'""")
                baseline_audio = gr.Audio(
                    label="축원 문장 녹음하기",
                    sources=["microphone"],
                    type="filepath"
                )
                set_baseline_btn = gr.Button("기준점 설정 완료")
                baseline_status = gr.Markdown("")

            # Cleansing
            with gr.Tab("청신", id="청신", visible=False) as cleansing_tab:
                gr.Markdown("""## 청신 - 소리로 정화하기
                
                이 앱은 어디서나 실행 가능하지만, 온천천의 사운드스케이프를 기반으로 제작되었습니다.
                온천장역에서 장전역까지의 구간을 걸으며 경험하면, 
                보다 자연과 하나 된 온천천의 신화를 느낄 수 있습니다.
                """)
                play_music_btn = gr.Button("온천천의 소리 듣기")
                with gr.Row():
                    audio = gr.Audio(
                        value=None,
                        type="filepath",
                        label="온천천의 소리",
                        interactive=False,
                        autoplay=False
                    )
                    with gr.Column():
                        reflection_input = gr.Textbox(
                            label="현재 순간의 감상을 적어주세요",
                            lines=3
                        )
                        save_btn = gr.Button("감상 저장하기")
                        reflections_display = gr.Dataframe(
                            headers=["시간", "감상", "감정 분석"],
                            label="기록된 감상들"
                        )
                continue_to_prayer_btn = gr.Button("기원으로 이동하기")

            # Prayer
            with gr.Tab("기원", id="기원", visible=False) as prayer_tab:
                with gr.Row():
                    with gr.Column():
                        voice_input = gr.Audio(
                            label="나누고 싶은 이야기를 들려주세요",
                            sources=["microphone"],
                            type="filepath"
                        )
                        clear_btn = gr.Button("녹음 지우기")
                        analyze_btn = gr.Button("분석하기")

                    with gr.Column():
                        transcribed_text = gr.Textbox(label="인식된 텍스트")
                        voice_emotion = gr.Textbox(label="음성 감정 분석")
                        text_emotion = gr.Textbox(label="텍스트 감정 분석")
                        final_prompt = gr.Textbox(label="생성된 프롬프트", lines=3)

            # Sending
            with gr.Tab("송신", id="송신", visible=False) as sending_tab:
                gr.Markdown("## 송신 - 시각화 결과")
                combined_prompt = gr.Textbox(
                    label="종합 프롬프트",
                    interactive=False,
                    lines=3
                )
                generate_btn = gr.Button("이미지 생성하기")
                with gr.Row():
                    result_image = gr.Image(label="생성된 이미지", type="pil")
                    with gr.Column():
                        # NOTE(review): these two buttons have no handlers.
                        share_btn = gr.Button("이미지 공유하기")
                        download_btn = gr.Button("이미지 저장하기")

        def start_journey(name):
            """Show the welcome text and move to the baseline tab."""
            if not name or not name.strip():
                return "이름을 입력해주세요", gr.update()

            welcome_text = f"""
            # 환영합니다, {name}님의 디지털 굿판

            ## 굿판의 세계관 🌌
            디지털 굿판은 현대 도시 속에서 잊혀진 전통 굿의 정수를 담아낸 **디지털 의례의 공간**입니다. 
            이곳에서는 사람들의 목소리와 감정을 통해 **영적 교감**을 나누고, **자연과 도시의 에너지**를 연결하며, 
            평온함과 치유를 경험하게 됩니다.

            ## 위치 안내 📍
            이 앱은 어디서나 실행 가능하지만, 온천천의 사운드스케이프를 녹음하고 이를 기반으로 제작되어,
            온천천 온천장역에서 장전역까지의 구간을 걸으며 경험하면 
            보다 자연과 하나 된 온천천의 신화를 느낄 수 있습니다.

            ## 여정을 시작하며 🚀
            먼저, 평온한 마음으로 축원의 문장을 읽어주세요. 
            이는 당신의 감정을 더 정확하게 이해하기 위한 기준점이 될 것입니다.
            """
            return welcome_text, gr.update(selected="기준 설정")

        def set_baseline(audio_path, state):
            """Store the baseline features of the recording and move on."""
            if audio_path is None:
                return state, "음성을 먼저 녹음해주세요.", gr.update()

            features = calculate_baseline_features(audio_path)
            if features is None:
                return state, "기준점 설정에 실패했습니다. 다시 녹음해주세요.", gr.update()

            new_state = {**state, "baseline_features": features}
            return new_state, "기준점이 설정되었습니다.", gr.update(selected="청신")

        def play_music():
            """Load the bundled soundscape into the player and start it."""
            audio_path = os.path.abspath(os.path.join("assets", "main_music.mp3"))
            return gr.update(value=audio_path, autoplay=True)

        def save_reflection(text, state):
            """Append a timestamped, sentiment-tagged reflection to the state."""
            existing = list(state.get("reflections") or [])
            if not text or not text.strip():
                return state, existing

            try:
                current_time = datetime.now().strftime("%H:%M:%S")
                sentiment = text_analyzer(text)[0]
                new_reflection = [current_time, text, f"{sentiment['label']} ({sentiment['score']:.2f})"]
            except Exception as e:
                # Keep showing what was already saved instead of blanking the table.
                print(f"Error in save_reflection: {str(e)}")
                return state, existing

            # Build a new list so the previous state object is not mutated
            # through a shared reference.
            new_state = {**state, "reflections": existing + [new_reflection]}
            return new_state, new_state["reflections"]

        def analyze_voice(audio_path, state):
            """Analyse the recorded voice and fill the prayer-tab result fields."""
            if audio_path is None:
                return state, "음성을 먼저 녹음해주세요.", "", "", ""

            try:
                result = analyze_voice_with_retry(audio_path, state)

                voice_result = (
                    f"음성 감정: {result['emotions']['primary']} "
                    f"(강도: {result['emotions']['intensity']:.1f}%, "
                    f"신뢰도: {result['emotions']['confidence']:.2f})\n"
                    f"특징: {', '.join(result['emotions']['characteristics'])}"
                )

                # Parenthesised so both pieces are concatenated. The label
                # already carries the rating; "score" is shown as confidence
                # rather than as a value out of 5.
                text_result = (
                    f"텍스트 감정: {result['sentiment']['label']} "
                    f"(신뢰도: {result['sentiment']['score']:.2f})"
                )

                prompt = generate_detailed_prompt(
                    result['text'],
                    result['emotions'],
                    result['sentiment']
                )

                # Remember the analysis so the sending tab can include it.
                new_state = {**state, "voice_analysis": result, "final_prompt": prompt}
                return new_state, result['text'], voice_result, text_result, prompt
            except Exception as e:
                return state, f"오류 발생: {str(e)}", "", "", ""

        def update_sending_tab(state):
            """Combine the saved reflections and the prayer prompt for the sending tab."""
            combined = "청신과 기원의 여정을 담은 이미지:\n\n"
            combined += update_final_prompt(state)
            voice_prompt = state.get("final_prompt")
            if voice_prompt:
                combined += voice_prompt
            return combined

        def generate_image(prompt):
            """Generate an image and return it in a form gr.Image can display."""
            import tempfile

            image_data = generate_image_from_prompt(prompt)
            if not image_data:
                return None
            if not isinstance(image_data, (bytes, bytearray)):
                return image_data

            # The API helper returns the raw response body; write it to a
            # file and hand the path to the image component.
            suffix = ".png" if bytes(image_data[:4]) == b"\x89PNG" else ".jpg"
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as image_file:
                image_file.write(image_data)
            return image_file.name

        # Event wiring
        start_btn.click(fn=start_journey, inputs=[name_input], outputs=[user_display, tabs])
        set_baseline_btn.click(
            fn=set_baseline,
            inputs=[baseline_audio, state],
            outputs=[state, baseline_status, tabs]
        )
        play_music_btn.click(fn=play_music, outputs=[audio])
        save_btn.click(fn=save_reflection, inputs=[reflection_input, state], outputs=[state, reflections_display])
        clear_btn.click(fn=lambda: None, outputs=[voice_input])
        analyze_btn.click(
            fn=analyze_voice,
            inputs=[voice_input, state],
            outputs=[state, transcribed_text, voice_emotion, text_emotion, final_prompt]
        )
        continue_to_prayer_btn.click(fn=lambda: gr.update(selected="기원"), outputs=[tabs])
        tabs.change(fn=update_sending_tab, inputs=[state], outputs=[combined_prompt])
        generate_btn.click(fn=generate_image, inputs=[combined_prompt], outputs=[result_image])

    return app

if __name__ == "__main__":
    # Script entry point: build the Blocks app, then launch it in debug mode.
    demo = create_interface()
    demo.launch(debug=True)