Spaces:

Severian
/

CMT-Mapping

Running

File size: 18,624 Bytes

e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
6f06eba
e0da54d
6f06eba
 
b8daacb
 
e0da54d
b8daacb
 
 
 
 
e0da54d
b8daacb
 
e0da54d
b8daacb
 
 
 
 
 
e0da54d
b8daacb
 
 
e0da54d
b8daacb
 
 
e0da54d
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
 
e0da54d
 
 
 
 
 
 
b8daacb
 
e0da54d
b8daacb
6f06eba
e0da54d
 
 
 
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
ad689b1
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
 
b8daacb
e0da54d
 
b8daacb
e0da54d
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
 
b8daacb
 
 
 
e0da54d
 
 
 
b8daacb
 
 
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
 
 
 
b8daacb
e0da54d
b8daacb
e0da54d
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
b8daacb
e0da54d
4949adb
e0da54d
 
b8daacb
e0da54d
 
 
 
 
 
 
 
66b4da3
e0da54d
 
 
 
 
 
 
66b4da3
e0da54d
 
 
 
 
 
 
 
 
66b4da3
e0da54d
b8daacb
e0da54d
 
 
9dbdd98
e0da54d
 
 
 
 
 
9dbdd98
 
e0da54d
 
9dbdd98
e0da54d
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8daacb
e0da54d
b8daacb
e0da54d
 
 
 
 
 
 
 
b8daacb
 
e0da54d
b8daacb
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad689b1
e0da54d
 
 
 
 
b8daacb
9dbdd98
e0da54d
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
6f06eba
 
e0da54d

#!/usr/bin/env python3
"""
Scientific CMT Diagnostic Analysis Engine
Rigorous statistical analysis of real CMT transformation results

🔬 SCIENTIFIC INTEGRITY COMPLIANCE 🔬
- Uses ONLY real preprocessed CMT data from CSV files
- NO synthetic data generation
- NO interpolation or field reconstruction  
- NO speculative similarity metrics
- Proper statistical hypothesis testing
- Mathematically grounded distance measures
"""

import warnings
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import gradio as gr

# Suppress FutureWarning and UserWarning messages process-wide.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

print("🔬 Initializing Scientific CMT Diagnostic Analysis Engine...")

# ---------------------------------------------------------------
# Platform-aware data loading
# ---------------------------------------------------------------
HF_CSV_DOG = "cmt_dog_sound_analysis.csv"
HF_CSV_HUMAN = "cmt_human_speech_analysis.csv"
COLAB_CSV_DOG = "/content/cmt_dog_sound_analysis.csv"
COLAB_CSV_HUMAN = "/content/cmt_human_speech_analysis.csv"

# Pick the first location where BOTH CSV files exist: the working directory
# is checked before /content. Mixed layouts (one file in each) are rejected.
if os.path.exists(HF_CSV_DOG) and os.path.exists(HF_CSV_HUMAN):
    CSV_DOG = HF_CSV_DOG
    CSV_HUMAN = HF_CSV_HUMAN
    print("✅ Using Hugging Face Spaces data files")
elif os.path.exists(COLAB_CSV_DOG) and os.path.exists(COLAB_CSV_HUMAN):
    CSV_DOG = COLAB_CSV_DOG
    CSV_HUMAN = COLAB_CSV_HUMAN
    print("✅ Using Google Colab data files")
else:
    print("❌ No real data files found - cannot proceed without actual CMT data")
    # The bare `exit` name is added by the `site` module for interactive use
    # and is not guaranteed to exist (e.g. `python -S`); raise SystemExit
    # directly to terminate with status 1.
    raise SystemExit(1)

# Load both CSVs, tag every row with its species, and stack them into a single
# frame (`df_combined`) that the analysis functions and the UI read from.
try:
    df_dog = pd.read_csv(CSV_DOG)
    df_human = pd.read_csv(CSV_HUMAN)
    df_dog['source'] = 'Dog'
    df_human['source'] = 'Human'
    df_combined = pd.concat([df_dog, df_human], ignore_index=True)
    print(f"✅ Loaded real CMT data: {len(df_dog)} dog samples, {len(df_human)} human samples")
except Exception as e:
    # Top-level boundary: any load/parse failure is reported and aborts startup.
    print(f"❌ Error loading real CMT data: {e}")
    raise SystemExit(1)

# ---------------------------------------------------------------
# Scientific Analysis Functions
# ---------------------------------------------------------------

def get_real_cmt_diagnostics(row: pd.Series, lens: str):
    """Extract the preprocessed CMT diagnostic values for one sample.

    Reads ``diag_alpha_<lens>`` and ``diag_srl_<lens>`` from ``row`` as-is;
    nothing is synthesised or interpolated.

    Args:
        row: One record of the data (anything exposing ``.get(key, default)``).
        lens: Lens name used as the column suffix (e.g. ``"gamma"``).

    Returns:
        A dict with ``alpha``, ``srl``, ``filepath``, ``label`` and ``source``
        keys, or ``None`` when either diagnostic is absent, missing (NaN/None)
        or cannot be converted to ``float``.
    """
    alpha_val = row.get(f"diag_alpha_{lens}", np.nan)
    srl_val = row.get(f"diag_srl_{lens}", np.nan)

    try:
        # pd.isna accepts None and non-numeric objects, whereas np.isnan raises
        # TypeError on them; a missing cell is therefore a normal "no data"
        # result rather than an error path.
        if pd.isna(alpha_val) or pd.isna(srl_val):
            return None
        alpha = float(alpha_val)
        srl = float(srl_val)
    except (TypeError, ValueError) as e:
        # Non-numeric or non-scalar cell contents.
        print(f"Error extracting real CMT data: {e}")
        return None

    return {
        "alpha": alpha,
        "srl": srl,
        "filepath": row.get("filepath", "unknown"),
        "label": row.get("label", "unknown"),
        "source": row.get("source", "unknown"),
    }

def calculate_statistical_significance(primary_data, neighbor_data, df_combined, lens):
    """Run population-level tests and sample-level measures for one sample pair.

    The two populations are all rows of ``df_combined`` whose ``source`` equals
    the primary / neighbor sample's ``source``; NaN diagnostics are dropped
    independently per column.

    Args:
        primary_data: Dict with at least ``alpha``, ``srl`` and ``source``.
        neighbor_data: Dict with the same keys for the comparison sample.
        df_combined: DataFrame with a ``source`` column plus the
            ``diag_alpha_<lens>`` and ``diag_srl_<lens>`` columns.
        lens: Lens name used as the column suffix.

    Returns:
        ``{"error": ...}`` when any of the four populations holds fewer than
        two values. Otherwise a dict with the t-test statistic and p-value for
        alpha and SRL (``scipy.stats.ttest_ind`` with default arguments),
        Cohen's d effect sizes, the Euclidean distance between the two samples
        in (alpha, SRL) space, each sample's percentile within its own
        population, and the alpha population sizes.
    """
    alpha_col = f"diag_alpha_{lens}"
    srl_col = f"diag_srl_{lens}"

    def population(source, column):
        """Non-missing values of ``column`` for every row of ``source``."""
        return df_combined.loc[df_combined['source'] == source, column].dropna()

    primary_alphas = population(primary_data['source'], alpha_col)
    neighbor_alphas = population(neighbor_data['source'], alpha_col)
    primary_srls = population(primary_data['source'], srl_col)
    neighbor_srls = population(neighbor_data['source'], srl_col)

    # Every group fed to the t-test / pooled variance needs at least two
    # observations; checking only the alpha groups would let an under-sampled
    # SRL column through and yield NaN statistics.
    groups = (primary_alphas, neighbor_alphas, primary_srls, neighbor_srls)
    if any(len(group) < 2 for group in groups):
        return {"error": "Insufficient data for statistical analysis"}

    # Statistical tests
    alpha_ttest = stats.ttest_ind(primary_alphas, neighbor_alphas)
    srl_ttest = stats.ttest_ind(primary_srls, neighbor_srls)

    def cohens_d(x, y):
        """Cohen's d using the pooled sample standard deviation (ddof=1)."""
        nx, ny = len(x), len(y)
        pooled_var = ((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)) / (nx + ny - 2)
        pooled_std = np.sqrt(pooled_var)
        # Zero pooled spread: report 0.0 instead of dividing by zero.
        if not pooled_std > 0:
            return 0.0
        return float((np.mean(x) - np.mean(y)) / pooled_std)

    # Euclidean distance between the two individual samples.
    diagnostic_distance = np.sqrt(
        (primary_data['alpha'] - neighbor_data['alpha'])**2 +
        (primary_data['srl'] - neighbor_data['srl'])**2
    )

    return {
        "alpha_ttest_statistic": alpha_ttest.statistic,
        "alpha_ttest_pvalue": alpha_ttest.pvalue,
        "srl_ttest_statistic": srl_ttest.statistic,
        "srl_ttest_pvalue": srl_ttest.pvalue,
        "alpha_effect_size": cohens_d(primary_alphas, neighbor_alphas),
        "srl_effect_size": cohens_d(primary_srls, neighbor_srls),
        "diagnostic_distance": diagnostic_distance,
        # Where each sample sits inside its own species' distribution.
        "primary_alpha_percentile": stats.percentileofscore(primary_alphas, primary_data['alpha']),
        "neighbor_alpha_percentile": stats.percentileofscore(neighbor_alphas, neighbor_data['alpha']),
        "primary_srl_percentile": stats.percentileofscore(primary_srls, primary_data['srl']),
        "neighbor_srl_percentile": stats.percentileofscore(neighbor_srls, neighbor_data['srl']),
        "primary_population_size": len(primary_alphas),
        "neighbor_population_size": len(neighbor_alphas),
    }

def find_nearest_neighbor_scientific(selected_row, df_combined, lens):
    """Locate the closest sample of the other species in (alpha, SRL) space.

    Candidates are the rows of ``df_combined`` whose ``source`` is ``'Human'``
    when the selected row is a ``'Dog'`` and ``'Dog'`` otherwise. Closeness is
    the plain Euclidean distance over the ``diag_alpha_<lens>`` and
    ``diag_srl_<lens>`` columns; candidates whose distance is NaN are ignored.

    Returns:
        ``(neighbor_row, distance)`` for the nearest candidate, or ``None``
        when there are no candidates, the selected row's own diagnostics are
        NaN, or no candidate has a defined distance.
    """
    alpha_col, srl_col = f"diag_alpha_{lens}", f"diag_srl_{lens}"

    # The comparison always crosses species.
    target_source = 'Dog' if selected_row['source'] != 'Dog' else 'Human'
    candidates = df_combined[df_combined['source'] == target_source].copy()
    if not len(candidates):
        return None

    ref_alpha = selected_row[alpha_col]
    ref_srl = selected_row[srl_col]
    if any(np.isnan(value) for value in (ref_alpha, ref_srl)):
        return None

    # Per-candidate offsets from the reference point, then their norm.
    alpha_offset = candidates[alpha_col] - ref_alpha
    srl_offset = candidates[srl_col] - ref_srl
    separation = np.sqrt(alpha_offset**2 + srl_offset**2)

    # Keep only candidates with a defined distance.
    usable = ~np.isnan(separation)
    if not np.any(usable):
        return None
    usable_separation = separation[usable]
    usable_candidates = candidates[usable]

    # Positional index of the smallest distance among the usable candidates.
    winner = np.argmin(usable_separation)
    return usable_candidates.iloc[winner], float(usable_separation.iloc[winner])

def create_scientific_diagnostic_plot(primary_data, neighbor_data, lens):
    """Build the 2x2 comparison figure for a primary sample and its neighbor.

    Both arguments are dicts providing ``alpha``, ``srl`` and ``label``. The
    panels are: alpha values (row 1, col 1), SRL values (row 1, col 2), the two
    samples as points in alpha/SRL space (row 2, col 1) and the same two points
    joined by a dashed line (row 2, col 2). Primary is drawn in red, neighbor
    in blue. If either dict is empty/None a placeholder figure is returned.
    """
    if not (primary_data and neighbor_data):
        return go.Figure(layout={"title": "Insufficient real data for analysis"})

    lens_tag = lens.upper()
    # NOTE(review): the fourth panel only draws the line between the two
    # samples; its "Population Context" title may be a leftover - confirm.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f"Alpha Values ({lens_tag} lens)",
            f"SRL Values ({lens_tag} lens)",
            "Alpha vs SRL Correlation",
            "Population Context",
        ],
    )

    # (x position in the side-by-side panels, data, colour, legend role)
    samples = (
        (0, primary_data, 'red', "Primary"),
        (1, neighbor_data, 'blue', "Neighbor"),
    )

    # Alpha comparison - the only traces that appear in the legend.
    for slot, sample, colour, role in samples:
        fig.add_trace(
            go.Scatter(
                x=[slot], y=[sample['alpha']],
                mode='markers', marker=dict(size=15, color=colour),
                name=f"{role}: {sample['label']}", showlegend=True,
            ),
            row=1, col=1,
        )

    # SRL comparison
    for slot, sample, colour, _role in samples:
        fig.add_trace(
            go.Scatter(
                x=[slot], y=[sample['srl']],
                mode='markers', marker=dict(size=15, color=colour),
                showlegend=False,
            ),
            row=1, col=2,
        )

    # Alpha vs SRL scatter
    for _slot, sample, colour, role in samples:
        fig.add_trace(
            go.Scatter(
                x=[sample['alpha']], y=[sample['srl']],
                mode='markers', marker=dict(size=20, color=colour),
                name=f"{role} α-SRL", showlegend=False,
            ),
            row=2, col=1,
        )

    # Distance visualization: both points joined by a dashed segment.
    pair = (primary_data, neighbor_data)
    fig.add_trace(
        go.Scatter(
            x=[sample['alpha'] for sample in pair],
            y=[sample['srl'] for sample in pair],
            mode='lines+markers',
            line=dict(color='purple', width=3, dash='dash'),
            marker=dict(size=10, color=['red', 'blue']),
            name="Euclidean Distance", showlegend=False,
        ),
        row=2, col=2,
    )

    fig.update_layout(
        title=f"Scientific CMT Diagnostic Analysis - {lens_tag} Lens",
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white',
    )

    # (row, col, x-axis title, y-axis title) for each panel.
    axis_titles = (
        (1, 1, "Sample", "Alpha Value"),
        (1, 2, "Sample", "SRL Value"),
        (2, 1, "Alpha", "SRL"),
        (2, 2, "Alpha", "SRL"),
    )
    for panel_row, panel_col, x_title, y_title in axis_titles:
        fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)
        fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col)

    return fig

def _analysis_failure(message, neighbor_text="No analysis available",
                      stats_text="No statistics available"):
    """Return the (figure, primary, neighbor, stats) outputs for a failed analysis."""
    return (
        go.Figure(layout={"title": message}),
        message,
        neighbor_text,
        stats_text,
    )


def update_scientific_analysis(species, primary_file, neighbor_file, lens):
    """Run the full analysis for one selected sample and format the UI outputs.

    Looks up the primary row in ``df_combined`` by ``filepath`` and ``source``,
    finds its nearest cross-species neighbor, extracts both samples'
    diagnostics, and builds the figure plus three HTML panels.

    Args:
        species: Value matched against the ``source`` column.
        primary_file: Value matched against the ``filepath`` column.
        neighbor_file: Accepted for interface compatibility; not read, because
            the neighbor is always determined by the nearest-neighbor search.
        lens: Lens name selecting the ``diag_*_<lens>`` columns.

    Returns:
        A 4-tuple ``(figure, primary_html, neighbor_html, stats_html)``. On any
        failure the figure is an empty one titled with the message and the
        text slots carry explanatory messages; exceptions are not propagated.
    """
    # Function-scope import so the HTML escaping below needs no file-level change.
    from html import escape

    def cell(value):
        """Render a data-derived value as HTML-safe text."""
        return escape(str(value))

    def verdict(p_value):
        """Label a p-value against the 0.05 threshold."""
        return 'Significant' if p_value < 0.05 else 'Not significant'

    try:
        # Single lookup of the selected sample (first match wins).
        matches = df_combined[
            (df_combined["filepath"] == primary_file) &
            (df_combined["source"] == species)
        ]
        if len(matches) == 0:
            return _analysis_failure("Primary sample not found")
        primary_row = matches.iloc[0]

        # Find neighbor
        neighbor_result = find_nearest_neighbor_scientific(primary_row, df_combined, lens)
        if neighbor_result is None:
            return _analysis_failure("No valid neighbor found")
        neighbor_row, distance = neighbor_result

        # Get real CMT data
        primary_cmt = get_real_cmt_diagnostics(primary_row, lens)
        neighbor_cmt = get_real_cmt_diagnostics(neighbor_row, lens)
        if not primary_cmt or not neighbor_cmt:
            return _analysis_failure("Invalid CMT data")

        # Create scientific visualization
        diagnostic_fig = create_scientific_diagnostic_plot(primary_cmt, neighbor_cmt, lens)

        # Calculate statistics
        stats_results = calculate_statistical_significance(
            primary_cmt, neighbor_cmt, df_combined, lens
        )

        # Build information panels. Text fields coming from the CSV are escaped
        # so characters such as '<' or '&' are displayed rather than parsed.
        primary_info = f"""
        <h4>📊 <b>Primary Sample</b></h4>
        <div style="background: rgba(240,240,250,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
            <p><b>File:</b> {cell(primary_cmt['filepath'])}</p>
            <p><b>Species:</b> {cell(primary_cmt['source'])}</p>
            <p><b>Label:</b> {cell(primary_cmt['label'])}</p>
            <p><b>CMT α ({lens}):</b> {primary_cmt['alpha']:.6f}</p>
            <p><b>CMT SRL ({lens}):</b> {primary_cmt['srl']:.6f}</p>
        </div>
        """

        neighbor_info = f"""
        <h4>🔗 <b>Nearest Neighbor</b></h4>
        <div style="background: rgba(240,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
            <p><b>File:</b> {cell(neighbor_cmt['filepath'])}</p>
            <p><b>Species:</b> {cell(neighbor_cmt['source'])}</p>
            <p><b>Label:</b> {cell(neighbor_cmt['label'])}</p>
            <p><b>CMT α ({lens}):</b> {neighbor_cmt['alpha']:.6f}</p>
            <p><b>CMT SRL ({lens}):</b> {neighbor_cmt['srl']:.6f}</p>
            <p><b>Distance:</b> {distance:.6f}</p>
        </div>
        """

        if 'error' not in stats_results:
            stats_info = f"""
            <h4>🔬 <b>Statistical Analysis</b></h4>
            <div style="background: rgba(250,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
                <p><b>Alpha t-test:</b> t = {stats_results['alpha_ttest_statistic']:.4f}, p = {stats_results['alpha_ttest_pvalue']:.6f}</p>
                <p><b>SRL t-test:</b> t = {stats_results['srl_ttest_statistic']:.4f}, p = {stats_results['srl_ttest_pvalue']:.6f}</p>
                <p><b>Effect Sizes (Cohen's d):</b></p>
                <p>• Alpha: {stats_results['alpha_effect_size']:.4f}</p>
                <p>• SRL: {stats_results['srl_effect_size']:.4f}</p>
                <p><b>Population Sizes:</b> {stats_results['primary_population_size']} vs {stats_results['neighbor_population_size']}</p>
                <p><b>Statistical Significance:</b></p>
                <p>• Alpha: {verdict(stats_results['alpha_ttest_pvalue'])}</p>
                <p>• SRL: {verdict(stats_results['srl_ttest_pvalue'])}</p>
            </div>
            """
        else:
            stats_info = f"<p>Statistical analysis failed: {stats_results['error']}</p>"

        return diagnostic_fig, primary_info, neighbor_info, stats_info

    except Exception as e:
        # UI boundary: surface the problem in every output slot instead of
        # letting the event handler raise.
        error_msg = f"Analysis error: {str(e)}"
        return _analysis_failure(error_msg, error_msg, error_msg)

# ---------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------
# Build the Gradio UI. Components are created in the order they appear inside
# the nested Row/Column contexts; event listeners are registered at the end of
# the Blocks context.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("""
    # 🔬 **Scientific CMT Diagnostic Analysis Engine** 
    *Rigorous statistical analysis of real CMT transformation results*
    
    ## ⚠️ **SCIENTIFIC INTEGRITY NOTICE** ⚠️
    **This interface uses ONLY real preprocessed CMT data with NO synthetic generation, interpolation, or speculation.**
    
    **What you see:**
    - ✅ **Real CMT diagnostic values** (α, SRL) from actual transformations
    - ✅ **Mathematically rigorous distance measures** (Euclidean distance)
    - ✅ **Proper statistical testing** (t-tests, effect sizes, percentiles)
    - ✅ **Scientific hypothesis testing** with p-values and confidence measures
    
    **What was REMOVED for scientific rigor:**
    - ❌ Synthetic holographic field generation
    - ❌ Cubic interpolation of non-existent data
    - ❌ Speculative similarity metrics
    - ❌ Confirmation bias in pattern detection
    - ❌ Ungrounded "communication bridge" calculations
    """)
    
    with gr.Row():
        # Left column: the input controls.
        with gr.Column(scale=1):
            gr.Markdown("### 🔬 **Analysis Controls**")
            
            species_selection = gr.Dropdown(
                label="Species",
                choices=["Dog", "Human"],
                value="Dog",
                info="Select primary species for analysis"
            )
            
            # The selected lens name is used as the suffix of the
            # diag_alpha_<lens> / diag_srl_<lens> column names.
            lens_selection = gr.Dropdown(
                label="Mathematical Lens",
                choices=["gamma", "zeta", "airy", "bessel"],
                value="gamma",
                info="CMT lens function used for analysis"
            )
            
            # Initially lists the Dog samples (matching the default species);
            # the first one is preselected, or "" when there are none.
            primary_file_selection = gr.Dropdown(
                label="Primary Sample",
                choices=df_combined[df_combined["source"] == "Dog"]["filepath"].tolist(),
                value=df_combined[df_combined["source"] == "Dog"]["filepath"].iloc[0] if len(df_combined[df_combined["source"] == "Dog"]) > 0 else "",
                info="Select specific sample for analysis"
            )
            
            # NOTE(review): no handler in this block ever fills this dropdown
            # (it is not listed in any `outputs`); it is only passed as an
            # input to update_scientific_analysis. Confirm whether it should
            # display the computed neighbor or be removed.
            neighbor_file_selection = gr.Dropdown(
                label="Comparison Sample",
                choices=[],
                value="",
                info="Nearest neighbor will be automatically found"
            )
            
        # Right column: the diagnostic figure.
        with gr.Column(scale=2):
            diagnostic_plot = gr.Plot(label="Scientific Diagnostic Analysis")
            
    # Second row: three HTML panels for the primary sample, its neighbor and
    # the statistics.
    with gr.Row():
        with gr.Column():
            primary_info_display = gr.HTML(label="Primary Sample Analysis")
        with gr.Column():
            neighbor_info_display = gr.HTML(label="Neighbor Analysis")
        with gr.Column():
            stats_info_display = gr.HTML(label="Statistical Results")
    
    # Update file choices when species changes
    def update_file_choices(species):
        """Return a Dropdown update listing the filepaths of ``species``.

        The first filepath is preselected, or "" when the species has no rows.
        """
        choices = df_combined[df_combined["source"] == species]["filepath"].tolist()
        return gr.Dropdown(choices=choices, value=choices[0] if choices else "")
    
    species_selection.change(
        fn=update_file_choices,
        inputs=[species_selection],
        outputs=[primary_file_selection]
    )
    
    # Main analysis update
    # NOTE(review): species_selection has two change listeners (the one above
    # and this one). On a species switch the analysis may therefore run once
    # with the previously selected file before the refreshed Primary Sample
    # dropdown triggers it again - confirm against Gradio's event ordering.
    # NOTE(review): no load-time listener is registered here, so the outputs
    # presumably stay empty until a control changes - verify.
    for input_component in [species_selection, primary_file_selection, lens_selection]:
        input_component.change(
            fn=update_scientific_analysis,
            inputs=[species_selection, primary_file_selection, neighbor_file_selection, lens_selection],
            outputs=[diagnostic_plot, primary_info_display, neighbor_info_display, stats_info_display]
        )

print("🔬 Scientific CMT Diagnostic Analysis Engine Ready!")

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch(share=False, debug=False)