File size: 18,624 Bytes
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
6f06eba
e0da54d
6f06eba
 
b8daacb
 
e0da54d
b8daacb
 
 
 
 
e0da54d
b8daacb
 
e0da54d
b8daacb
 
 
 
 
 
e0da54d
b8daacb
 
 
e0da54d
b8daacb
 
 
e0da54d
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
 
e0da54d
 
 
 
 
 
 
b8daacb
 
e0da54d
b8daacb
6f06eba
e0da54d
 
 
 
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
ad689b1
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
 
b8daacb
e0da54d
 
b8daacb
e0da54d
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
b8daacb
e0da54d
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
4949adb
e0da54d
 
 
 
b8daacb
 
 
 
e0da54d
 
 
 
b8daacb
 
 
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
e0da54d
 
 
 
 
 
 
 
 
b8daacb
e0da54d
b8daacb
e0da54d
 
 
 
b8daacb
 
e0da54d
 
 
 
 
 
 
 
 
b8daacb
e0da54d
4949adb
e0da54d
 
b8daacb
e0da54d
 
 
 
 
 
 
 
66b4da3
e0da54d
 
 
 
 
 
 
66b4da3
e0da54d
 
 
 
 
 
 
 
 
66b4da3
e0da54d
b8daacb
e0da54d
 
 
9dbdd98
e0da54d
 
 
 
 
 
9dbdd98
 
e0da54d
 
9dbdd98
e0da54d
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8daacb
e0da54d
b8daacb
e0da54d
 
 
 
 
 
 
 
b8daacb
 
e0da54d
b8daacb
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad689b1
e0da54d
 
 
 
 
b8daacb
9dbdd98
e0da54d
 
 
 
 
 
9dbdd98
e0da54d
 
 
 
 
 
b8daacb
e0da54d
 
 
 
 
b8daacb
9dbdd98
e0da54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dbdd98
e0da54d
6f06eba
 
e0da54d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#!/usr/bin/env python3
"""
Scientific CMT Diagnostic Analysis Engine
Rigorous statistical analysis of real CMT transformation results

🔬 SCIENTIFIC INTEGRITY COMPLIANCE 🔬
- Uses ONLY real preprocessed CMT data from CSV files
- NO synthetic data generation
- NO interpolation or field reconstruction  
- NO speculative similarity metrics
- Proper statistical hypothesis testing
- Mathematically grounded distance measures
"""

import warnings
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import gradio as gr

# Silence FutureWarning and UserWarning process-wide so that library
# deprecation chatter does not flood the application log.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Startup banner (the emoji was previously stored as mis-decoded UTF-8 bytes).
print("🔬 Initializing Scientific CMT Diagnostic Analysis Engine...")

# ---------------------------------------------------------------
# Platform-aware data loading 
# ---------------------------------------------------------------
# Candidate locations of the two preprocessed CMT result files: relative to
# the working directory (Hugging Face Spaces) and under /content (Colab).
HF_CSV_DOG = "cmt_dog_sound_analysis.csv"
HF_CSV_HUMAN = "cmt_human_speech_analysis.csv"
COLAB_CSV_DOG = "/content/cmt_dog_sound_analysis.csv"
COLAB_CSV_HUMAN = "/content/cmt_human_speech_analysis.csv"

# Determine platform and set paths: the first location that holds BOTH files
# wins; a location with only one of the two files is not used.
if os.path.exists(HF_CSV_DOG) and os.path.exists(HF_CSV_HUMAN):
    CSV_DOG = HF_CSV_DOG
    CSV_HUMAN = HF_CSV_HUMAN
    print("✅ Using Hugging Face Spaces data files")
elif os.path.exists(COLAB_CSV_DOG) and os.path.exists(COLAB_CSV_HUMAN):
    CSV_DOG = COLAB_CSV_DOG
    CSV_HUMAN = COLAB_CSV_HUMAN
    print("✅ Using Google Colab data files")
else:
    print("❌ No real data files found - cannot proceed without actual CMT data")
    # `exit` is a convenience injected by the `site` module and is missing
    # under `python -S` or in frozen builds; SystemExit is always available.
    raise SystemExit(1)

# Load real CMT data and tag every row with the species it came from, then
# stack both frames into one table with a fresh 0..n-1 index.
try:
    df_dog = pd.read_csv(CSV_DOG)
    df_human = pd.read_csv(CSV_HUMAN)
    df_dog['source'] = 'Dog'
    df_human['source'] = 'Human'
    df_combined = pd.concat([df_dog, df_human], ignore_index=True)
    print(f"✅ Loaded real CMT data: {len(df_dog)} dog samples, {len(df_human)} human samples")
except Exception as e:
    # Top-level boundary: any read/parse failure is fatal for the app.
    print(f"❌ Error loading real CMT data: {e}")
    raise SystemExit(1)

# ---------------------------------------------------------------
# Scientific Analysis Functions
# ---------------------------------------------------------------

def get_real_cmt_diagnostics(row: pd.Series, lens: str):
    """Pull the stored alpha/SRL diagnostics for one lens out of a data row.

    The values are read with ``row.get`` from the ``diag_alpha_<lens>`` and
    ``diag_srl_<lens>`` entries; a missing entry is treated as NaN. The
    ``filepath``, ``label`` and ``source`` entries are copied alongside,
    each falling back to ``"unknown"`` when absent.

    Returns:
        A dict with the keys ``alpha``, ``srl``, ``filepath``, ``label`` and
        ``source``; or ``None`` when either diagnostic is NaN, or when any
        exception occurs while reading the row (the error is printed).
    """
    try:
        alpha_raw = row.get(f"diag_alpha_{lens}", np.nan)
        srl_raw = row.get(f"diag_srl_{lens}", np.nan)

        # Evaluated lazily and in order: SRL is only inspected once alpha
        # has been found to be a real number.
        if any(np.isnan(value) for value in (alpha_raw, srl_raw)):
            return None

        diagnostics = {"alpha": float(alpha_raw), "srl": float(srl_raw)}
        for field in ("filepath", "label", "source"):
            diagnostics[field] = row.get(field, "unknown")
        return diagnostics
    except Exception as e:
        print(f"Error extracting real CMT data: {e}")
        return None

def calculate_statistical_significance(primary_data, neighbor_data, df_combined, lens):
    """Compare the source populations of two samples on alpha and SRL.

    A sample's population is every row of ``df_combined`` whose ``source``
    equals that sample's ``source``; NaN values are dropped per column.

    Args:
        primary_data: Mapping with ``alpha``, ``srl`` and ``source`` of the
            selected sample.
        neighbor_data: Same mapping for the comparison sample.
        df_combined: DataFrame with a ``source`` column and the
            ``diag_alpha_<lens>`` / ``diag_srl_<lens>`` columns.
        lens: Lens name used to build the two diagnostic column names.

    Returns:
        ``{"error": "Insufficient data for statistical analysis"}`` when any
        of the four value sets (alpha/SRL for each population) has fewer than
        two observations. Otherwise a dict holding the ``scipy.stats.ttest_ind``
        statistic and p-value for alpha and SRL, Cohen's d for both, the
        Euclidean distance between the two samples in (alpha, SRL) space,
        each sample's percentile within its own population, and the number of
        non-NaN alpha values per population.
    """
    alpha_col = f"diag_alpha_{lens}"
    srl_col = f"diag_srl_{lens}"

    def population_values(source, column):
        # Non-NaN values of `column` for every row from `source`.
        return df_combined.loc[df_combined['source'] == source, column].dropna()

    primary_alphas = population_values(primary_data['source'], alpha_col)
    neighbor_alphas = population_values(neighbor_data['source'], alpha_col)
    primary_srls = population_values(primary_data['source'], srl_col)
    neighbor_srls = population_values(neighbor_data['source'], srl_col)

    # The t-test and the pooled standard deviation both need at least two
    # observations per group. SRL is checked as well as alpha: otherwise a
    # sparse SRL column yields NaN statistics that downstream code would
    # display as "Not significant".
    smallest_group = min(
        len(primary_alphas), len(neighbor_alphas),
        len(primary_srls), len(neighbor_srls),
    )
    if smallest_group < 2:
        return {"error": "Insufficient data for statistical analysis"}

    # Statistical tests (scipy defaults are used unchanged)
    alpha_ttest = stats.ttest_ind(primary_alphas, neighbor_alphas)
    srl_ttest = stats.ttest_ind(primary_srls, neighbor_srls)

    # Effect sizes (Cohen's d)
    def cohens_d(x, y):
        nx, ny = len(x), len(y)
        if nx < 2 or ny < 2:
            return np.nan
        pooled_std = np.sqrt(
            ((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1))
            / (nx + ny - 2)
        )
        # Two constant groups have no spread; report zero effect rather than
        # dividing by zero.
        return (np.mean(x) - np.mean(y)) / pooled_std if pooled_std > 0 else 0.0

    # Euclidean distance between the two individual samples
    diagnostic_distance = np.sqrt(
        (primary_data['alpha'] - neighbor_data['alpha'])**2
        + (primary_data['srl'] - neighbor_data['srl'])**2
    )

    return {
        "alpha_ttest_statistic": alpha_ttest.statistic,
        "alpha_ttest_pvalue": alpha_ttest.pvalue,
        "srl_ttest_statistic": srl_ttest.statistic,
        "srl_ttest_pvalue": srl_ttest.pvalue,
        "alpha_effect_size": cohens_d(primary_alphas, neighbor_alphas),
        "srl_effect_size": cohens_d(primary_srls, neighbor_srls),
        "diagnostic_distance": diagnostic_distance,
        # Where each sample sits inside its own population
        "primary_alpha_percentile": stats.percentileofscore(primary_alphas, primary_data['alpha']),
        "neighbor_alpha_percentile": stats.percentileofscore(neighbor_alphas, neighbor_data['alpha']),
        "primary_srl_percentile": stats.percentileofscore(primary_srls, primary_data['srl']),
        "neighbor_srl_percentile": stats.percentileofscore(neighbor_srls, neighbor_data['srl']),
        "primary_population_size": len(primary_alphas),
        "neighbor_population_size": len(neighbor_alphas),
    }

def find_nearest_neighbor_scientific(selected_row, df_combined, lens):
    """Locate the closest sample of the other species in (alpha, SRL) space.

    Closeness is the Euclidean distance between the ``diag_alpha_<lens>`` /
    ``diag_srl_<lens>`` values of ``selected_row`` and those of each row of
    ``df_combined`` belonging to the opposite ``source``.

    Returns:
        ``(row, distance)`` for the closest candidate, or ``None`` when the
        opposite species has no rows, when the selected sample has a NaN
        diagnostic, or when every candidate distance is NaN.
    """
    alpha_col, srl_col = f"diag_alpha_{lens}", f"diag_srl_{lens}"

    # Any source other than 'Dog' is compared against the dog rows.
    target_source = 'Human' if selected_row['source'] == 'Dog' else 'Dog'
    candidates = df_combined[df_combined['source'] == target_source]
    if candidates.empty:
        return None

    ref_alpha = selected_row[alpha_col]
    ref_srl = selected_row[srl_col]
    if np.isnan(ref_alpha) or np.isnan(ref_srl):
        return None

    delta_alpha = candidates[alpha_col] - ref_alpha
    delta_srl = candidates[srl_col] - ref_srl
    distances = np.sqrt(delta_alpha**2 + delta_srl**2)

    # Candidates with a missing diagnostic produce NaN and are excluded.
    usable = ~np.isnan(distances)
    if not usable.any():
        return None

    usable_distances = distances[usable]
    best = np.argmin(usable_distances)  # position within the filtered rows
    return candidates[usable].iloc[best], float(usable_distances.iloc[best])

def create_scientific_diagnostic_plot(primary_data, neighbor_data, lens):
    """Build a 2x2 figure comparing two samples' alpha and SRL diagnostics.

    Args:
        primary_data: Mapping with ``alpha``, ``srl`` and ``label`` for the
            selected sample (drawn in red).
        neighbor_data: Same mapping for the comparison sample (drawn in blue).
        lens: Lens name, shown upper-cased in the titles.

    Returns:
        A ``plotly.graph_objects.Figure``. When either mapping is empty or
        ``None`` the figure carries only an "insufficient data" title.

    Panels: (1,1) alpha of both samples, (1,2) SRL of both samples,
    (2,1) both samples as points in the alpha/SRL plane, (2,2) the same two
    points joined by a dashed line.
    """
    if not primary_data or not neighbor_data:
        return go.Figure(layout={"title": "Insufficient real data for analysis"})

    # NOTE(review): the fourth panel is titled "Population Context" but only
    # draws the line between the two samples - confirm the intended title.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f"Alpha Values ({lens.upper()} lens)",
            f"SRL Values ({lens.upper()} lens)",
            "Alpha vs SRL Correlation",
            "Population Context"
        ]
    )

    # (role, data, colour, x position in the side-by-side panels)
    samples = (
        ("Primary", primary_data, 'red', 0),
        ("Neighbor", neighbor_data, 'blue', 1),
    )

    # Alpha comparison - the only traces that appear in the legend
    for role, data, color, x_pos in samples:
        fig.add_trace(go.Scatter(
            x=[x_pos], y=[data['alpha']],
            mode='markers', marker=dict(size=15, color=color),
            name=f"{role}: {data['label']}", showlegend=True
        ), row=1, col=1)

    # SRL comparison
    for _role, data, color, x_pos in samples:
        fig.add_trace(go.Scatter(
            x=[x_pos], y=[data['srl']],
            mode='markers', marker=dict(size=15, color=color),
            showlegend=False
        ), row=1, col=2)

    # Alpha vs SRL scatter (hover names previously contained a mis-decoded
    # Greek alpha)
    for role, data, color, _x_pos in samples:
        fig.add_trace(go.Scatter(
            x=[data['alpha']], y=[data['srl']],
            mode='markers', marker=dict(size=20, color=color),
            name=f"{role} α-SRL", showlegend=False
        ), row=2, col=1)

    # Distance visualization
    fig.add_trace(go.Scatter(
        x=[primary_data['alpha'], neighbor_data['alpha']],
        y=[primary_data['srl'], neighbor_data['srl']],
        mode='lines+markers',
        line=dict(color='purple', width=3, dash='dash'),
        marker=dict(size=10, color=['red', 'blue']),
        name="Euclidean Distance", showlegend=False
    ), row=2, col=2)

    # Update layout
    fig.update_layout(
        title=f"Scientific CMT Diagnostic Analysis - {lens.upper()} Lens",
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    # Update axes: (row, col, x title, y title)
    axis_titles = (
        (1, 1, "Sample", "Alpha Value"),
        (1, 2, "Sample", "SRL Value"),
        (2, 1, "Alpha", "SRL"),
        (2, 2, "Alpha", "SRL"),
    )
    for row, col, x_title, y_title in axis_titles:
        fig.update_xaxes(title_text=x_title, row=row, col=col)
        fig.update_yaxes(title_text=y_title, row=row, col=col)

    return fig

def _analysis_unavailable(reason):
    """Build the 4-tuple returned when the analysis cannot be carried out."""
    return (
        go.Figure(layout={"title": reason}),
        reason,
        "No analysis available",
        "No statistics available",
    )


def update_scientific_analysis(species, primary_file, neighbor_file, lens):
    """Run the full analysis for one selected sample and render the results.

    Looks up the row of ``df_combined`` matching ``primary_file`` and
    ``species``, finds its nearest neighbour in the other species, and builds
    the figure plus three HTML panels.

    Args:
        species: Value compared against the ``source`` column.
        primary_file: Value compared against the ``filepath`` column.
        neighbor_file: Not read; the neighbour is always searched for. Kept
            so the signature matches the UI wiring.
        lens: Lens name selecting the diagnostic columns.

    Returns:
        ``(figure, primary_html, neighbor_html, stats_html)``. When the
        sample, a neighbour, or valid diagnostics cannot be obtained, the
        figure is an empty titled placeholder and the strings explain why.
        Any unexpected exception is reported the same way instead of being
        raised, so the UI callback never fails.
    """
    try:
        # Get the row from real data (the filter is evaluated once)
        matches = df_combined[
            (df_combined["filepath"] == primary_file)
            & (df_combined["source"] == species)
        ]
        if matches.empty:
            return _analysis_unavailable("Primary sample not found")
        primary_row = matches.iloc[0]

        # Find neighbor
        neighbor_result = find_nearest_neighbor_scientific(primary_row, df_combined, lens)
        if neighbor_result is None:
            return _analysis_unavailable("No valid neighbor found")
        neighbor_row, distance = neighbor_result

        # Get real CMT data
        primary_cmt = get_real_cmt_diagnostics(primary_row, lens)
        neighbor_cmt = get_real_cmt_diagnostics(neighbor_row, lens)
        if not primary_cmt or not neighbor_cmt:
            return _analysis_unavailable("Invalid CMT data")

        # Create scientific visualization
        diagnostic_fig = create_scientific_diagnostic_plot(primary_cmt, neighbor_cmt, lens)

        # Calculate statistics
        stats_results = calculate_statistical_significance(
            primary_cmt, neighbor_cmt, df_combined, lens
        )

        # Build information panels (emoji, alpha and bullet characters were
        # previously stored as mis-decoded UTF-8 bytes)
        primary_info = f"""
        <h4>📊 <b>Primary Sample</b></h4>
        <div style="background: rgba(240,240,250,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
            <p><b>File:</b> {primary_cmt['filepath']}</p>
            <p><b>Species:</b> {primary_cmt['source']}</p>
            <p><b>Label:</b> {primary_cmt['label']}</p>
            <p><b>CMT α ({lens}):</b> {primary_cmt['alpha']:.6f}</p>
            <p><b>CMT SRL ({lens}):</b> {primary_cmt['srl']:.6f}</p>
        </div>
        """

        neighbor_info = f"""
        <h4>🔗 <b>Nearest Neighbor</b></h4>
        <div style="background: rgba(240,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
            <p><b>File:</b> {neighbor_cmt['filepath']}</p>
            <p><b>Species:</b> {neighbor_cmt['source']}</p>
            <p><b>Label:</b> {neighbor_cmt['label']}</p>
            <p><b>CMT α ({lens}):</b> {neighbor_cmt['alpha']:.6f}</p>
            <p><b>CMT SRL ({lens}):</b> {neighbor_cmt['srl']:.6f}</p>
            <p><b>Distance:</b> {distance:.6f}</p>
        </div>
        """

        if 'error' not in stats_results:
            stats_info = f"""
            <h4>🔬 <b>Statistical Analysis</b></h4>
            <div style="background: rgba(250,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
                <p><b>Alpha t-test:</b> t = {stats_results['alpha_ttest_statistic']:.4f}, p = {stats_results['alpha_ttest_pvalue']:.6f}</p>
                <p><b>SRL t-test:</b> t = {stats_results['srl_ttest_statistic']:.4f}, p = {stats_results['srl_ttest_pvalue']:.6f}</p>
                <p><b>Effect Sizes (Cohen's d):</b></p>
                <p>• Alpha: {stats_results['alpha_effect_size']:.4f}</p>
                <p>• SRL: {stats_results['srl_effect_size']:.4f}</p>
                <p><b>Population Sizes:</b> {stats_results['primary_population_size']} vs {stats_results['neighbor_population_size']}</p>
                <p><b>Statistical Significance:</b></p>
                <p>• Alpha: {'Significant' if stats_results['alpha_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
                <p>• SRL: {'Significant' if stats_results['srl_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
            </div>
            """
        else:
            stats_info = f"<p>Statistical analysis failed: {stats_results['error']}</p>"

        return diagnostic_fig, primary_info, neighbor_info, stats_info

    except Exception as e:
        # UI boundary: surface the failure in every output slot.
        error_msg = f"Analysis error: {str(e)}"
        return (
            go.Figure(layout={"title": error_msg}),
            error_msg,
            error_msg,
            error_msg
        )

# ---------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------
# Layout: a controls column next to the plot, then three HTML result panels.
# Event wiring is at the bottom of the block.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")) as demo:
    # Header text (emoji and Greek alpha were previously stored as
    # mis-decoded UTF-8 bytes).
    gr.Markdown("""
    # 🔬 **Scientific CMT Diagnostic Analysis Engine**
    *Rigorous statistical analysis of real CMT transformation results*
    
    ## ⚠️ **SCIENTIFIC INTEGRITY NOTICE** ⚠️
    **This interface uses ONLY real preprocessed CMT data with NO synthetic generation, interpolation, or speculation.**
    
    **What you see:**
    - ✅ **Real CMT diagnostic values** (α, SRL) from actual transformations
    - ✅ **Mathematically rigorous distance measures** (Euclidean distance)
    - ✅ **Proper statistical testing** (t-tests, effect sizes, percentiles)
    - ✅ **Scientific hypothesis testing** with p-values and confidence measures
    
    **What was REMOVED for scientific rigor:**
    - ❌ Synthetic holographic field generation
    - ❌ Cubic interpolation of non-existent data
    - ❌ Speculative similarity metrics
    - ❌ Confirmation bias in pattern detection
    - ❌ Ungrounded "communication bridge" calculations
    """)

    # File list for the default species, computed once and reused below.
    _initial_files = df_combined[df_combined["source"] == "Dog"]["filepath"].tolist()

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔬 **Analysis Controls**")

            species_selection = gr.Dropdown(
                label="Species",
                choices=["Dog", "Human"],
                value="Dog",
                info="Select primary species for analysis"
            )

            lens_selection = gr.Dropdown(
                label="Mathematical Lens",
                choices=["gamma", "zeta", "airy", "bessel"],
                value="gamma",
                info="CMT lens function used for analysis"
            )

            primary_file_selection = gr.Dropdown(
                label="Primary Sample",
                choices=_initial_files,
                value=_initial_files[0] if _initial_files else "",
                info="Select specific sample for analysis"
            )

            # Never populated: the analysis callback ignores this value and
            # searches for the neighbour itself.
            neighbor_file_selection = gr.Dropdown(
                label="Comparison Sample",
                choices=[],
                value="",
                info="Nearest neighbor will be automatically found"
            )

        with gr.Column(scale=2):
            diagnostic_plot = gr.Plot(label="Scientific Diagnostic Analysis")

    with gr.Row():
        with gr.Column():
            primary_info_display = gr.HTML(label="Primary Sample Analysis")
        with gr.Column():
            neighbor_info_display = gr.HTML(label="Neighbor Analysis")
        with gr.Column():
            stats_info_display = gr.HTML(label="Statistical Results")

    # Update file choices when species changes
    def update_file_choices(species):
        """Return a dropdown update listing the files of ``species``."""
        choices = df_combined[df_combined["source"] == species]["filepath"].tolist()
        return gr.Dropdown(choices=choices, value=choices[0] if choices else "")

    analysis_inputs = [
        species_selection, primary_file_selection,
        neighbor_file_selection, lens_selection,
    ]
    analysis_outputs = [
        diagnostic_plot, primary_info_display,
        neighbor_info_display, stats_info_display,
    ]

    # A species change must refresh the file list BEFORE the analysis runs.
    # Firing both listeners at once analysed the new species with the old
    # species' file and reported a spurious "Primary sample not found".
    species_selection.change(
        fn=update_file_choices,
        inputs=[species_selection],
        outputs=[primary_file_selection]
    ).then(
        fn=update_scientific_analysis,
        inputs=analysis_inputs,
        outputs=analysis_outputs
    )

    # Main analysis update for the remaining controls
    for input_component in [primary_file_selection, lens_selection]:
        input_component.change(
            fn=update_scientific_analysis,
            inputs=analysis_inputs,
            outputs=analysis_outputs
        )

# Ready banner, printed on import as well as on direct execution (the emoji
# was previously stored as mis-decoded UTF-8 bytes).
print("🔬 Scientific CMT Diagnostic Analysis Engine Ready!")

if __name__ == "__main__":
    # Start the web UI only when run as a script; no public share link and
    # no debug mode.
    demo.launch(share=False, debug=False)