Spaces:
Running
Running
#!/usr/bin/env python3
"""
Scientific CMT Diagnostic Analysis Engine
Rigorous statistical analysis of real CMT transformation results
🔬 SCIENTIFIC INTEGRITY COMPLIANCE 🔬
- Uses ONLY real preprocessed CMT data from CSV files
- NO synthetic data generation
- NO interpolation or field reconstruction
- NO speculative similarity metrics
- Proper statistical hypothesis testing
- Mathematically grounded distance measures
"""
import warnings | |
import os | |
import numpy as np | |
import pandas as pd | |
import plotly.graph_objects as go | |
from plotly.subplots import make_subplots | |
from scipy import stats | |
import gradio as gr | |
# Suppress FutureWarning and UserWarning process-wide so console output is
# limited to the status messages printed by this script.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
print("π¬ Initializing Scientific CMT Diagnostic Analysis Engine...")

# ---------------------------------------------------------------
# Platform-aware data loading
# ---------------------------------------------------------------
# Candidate CSV locations: relative paths (Hugging Face Spaces layout) and
# absolute /content paths (Google Colab layout).
HF_CSV_DOG = "cmt_dog_sound_analysis.csv"
HF_CSV_HUMAN = "cmt_human_speech_analysis.csv"
COLAB_CSV_DOG = "/content/cmt_dog_sound_analysis.csv"
COLAB_CSV_HUMAN = "/content/cmt_human_speech_analysis.csv"

# Determine platform and set paths. Both files of a pair must exist for that
# pair to be selected; the Hugging Face pair takes precedence.
if os.path.exists(HF_CSV_DOG) and os.path.exists(HF_CSV_HUMAN):
    CSV_DOG = HF_CSV_DOG
    CSV_HUMAN = HF_CSV_HUMAN
    print("β Using Hugging Face Spaces data files")
elif os.path.exists(COLAB_CSV_DOG) and os.path.exists(COLAB_CSV_HUMAN):
    CSV_DOG = COLAB_CSV_DOG
    CSV_HUMAN = COLAB_CSV_HUMAN
    print("β Using Google Colab data files")
else:
    print("β No real data files found - cannot proceed without actual CMT data")
    # `exit` is injected by the `site` module for interactive sessions and is
    # not guaranteed to exist (e.g. `python -S`); raise SystemExit directly.
    raise SystemExit(1)

# Load real CMT data. Each frame is tagged with its species in a `source`
# column before the two are stacked into `df_combined` with a fresh index.
try:
    df_dog = pd.read_csv(CSV_DOG)
    df_human = pd.read_csv(CSV_HUMAN)
    df_dog['source'] = 'Dog'
    df_human['source'] = 'Human'
    df_combined = pd.concat([df_dog, df_human], ignore_index=True)
    print(f"β Loaded real CMT data: {len(df_dog)} dog samples, {len(df_human)} human samples")
except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status,
    # keeping the original error attached as the cause.
    print(f"β Error loading real CMT data: {e}")
    raise SystemExit(1) from e
# --------------------------------------------------------------- | |
# Scientific Analysis Functions | |
# --------------------------------------------------------------- | |
def get_real_cmt_diagnostics(row: pd.Series, lens: str):
    """Pull the stored alpha/SRL diagnostics for one lens out of a data row.

    Args:
        row: Mapping-like record (anything exposing ``.get``) holding the
            ``diag_alpha_<lens>`` and ``diag_srl_<lens>`` values plus the
            optional ``filepath``, ``label`` and ``source`` fields.
        lens: Lens name used to build the two diagnostic column names.

    Returns:
        A dict with ``alpha``, ``srl``, ``filepath``, ``label`` and ``source``
        keys, or ``None`` when either diagnostic is missing/NaN or when
        extraction raises. Missing metadata fields default to ``"unknown"``.
    """
    try:
        alpha_raw = row.get("diag_alpha_" + lens, np.nan)
        srl_raw = row.get("diag_srl_" + lens, np.nan)

        # Guard clause: both diagnostics must be present and non-NaN.
        if np.isnan(alpha_raw) or np.isnan(srl_raw):
            return None

        diagnostics = {"alpha": float(alpha_raw), "srl": float(srl_raw)}
        for field in ("filepath", "label", "source"):
            diagnostics[field] = row.get(field, "unknown")
        return diagnostics
    except Exception as e:
        # Best-effort extraction: report the problem and signal "no data".
        print(f"Error extracting real CMT data: {e}")
        return None
def _cohens_d(x, y):
    """Return Cohen's d for two samples using the pooled standard deviation.

    Yields NaN when either sample has fewer than two values and 0.0 when the
    pooled standard deviation is not positive.
    """
    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2:
        return np.nan
    pooled_var = (
        (nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)
    ) / (nx + ny - 2)
    pooled_std = np.sqrt(pooled_var)
    return (np.mean(x) - np.mean(y)) / pooled_std if pooled_std > 0 else 0.0


def calculate_statistical_significance(primary_data, neighbor_data, df_combined, lens):
    """Compare the two samples' source populations on alpha and SRL.

    Args:
        primary_data: Dict with ``alpha``, ``srl`` and ``source`` for the
            primary sample.
        neighbor_data: Dict with the same keys for the neighbor sample.
        df_combined: DataFrame with a ``source`` column and the
            ``diag_alpha_<lens>`` / ``diag_srl_<lens>`` columns.
        lens: Lens name used to build the diagnostic column names.

    Returns:
        ``{"error": ...}`` when any of the four population series (alpha or
        SRL, primary or neighbor) has fewer than two non-NaN values.
        Otherwise a dict holding the independent-samples t-test statistic and
        p-value for alpha and SRL, Cohen's d for both, the Euclidean distance
        between the two samples in (alpha, SRL) space, each sample's
        percentile within its own population, and the alpha population sizes.
    """
    alpha_col = f"diag_alpha_{lens}"
    srl_col = f"diag_srl_{lens}"

    # Get population data for context
    primary_population = df_combined[df_combined['source'] == primary_data['source']]
    neighbor_population = df_combined[df_combined['source'] == neighbor_data['source']]
    primary_alphas = primary_population[alpha_col].dropna()
    neighbor_alphas = neighbor_population[alpha_col].dropna()
    primary_srls = primary_population[srl_col].dropna()
    neighbor_srls = neighbor_population[srl_col].dropna()

    # Every series fed to the tests needs at least two observations; checking
    # only alpha would let an empty SRL series produce NaN statistics that are
    # then reported as "not significant".
    samples = (primary_alphas, neighbor_alphas, primary_srls, neighbor_srls)
    if any(len(sample) < 2 for sample in samples):
        return {"error": "Insufficient data for statistical analysis"}

    # Statistical tests
    alpha_ttest = stats.ttest_ind(primary_alphas, neighbor_alphas)
    srl_ttest = stats.ttest_ind(primary_srls, neighbor_srls)

    # Effect sizes (Cohen's d)
    alpha_effect_size = _cohens_d(primary_alphas, neighbor_alphas)
    srl_effect_size = _cohens_d(primary_srls, neighbor_srls)

    # Euclidean distance between the two individual samples
    diagnostic_distance = np.sqrt(
        (primary_data['alpha'] - neighbor_data['alpha'])**2 +
        (primary_data['srl'] - neighbor_data['srl'])**2
    )

    # Population percentiles: where each sample sits within its own species
    primary_alpha_percentile = stats.percentileofscore(primary_alphas, primary_data['alpha'])
    neighbor_alpha_percentile = stats.percentileofscore(neighbor_alphas, neighbor_data['alpha'])
    primary_srl_percentile = stats.percentileofscore(primary_srls, primary_data['srl'])
    neighbor_srl_percentile = stats.percentileofscore(neighbor_srls, neighbor_data['srl'])

    return {
        "alpha_ttest_statistic": alpha_ttest.statistic,
        "alpha_ttest_pvalue": alpha_ttest.pvalue,
        "srl_ttest_statistic": srl_ttest.statistic,
        "srl_ttest_pvalue": srl_ttest.pvalue,
        "alpha_effect_size": alpha_effect_size,
        "srl_effect_size": srl_effect_size,
        "diagnostic_distance": diagnostic_distance,
        "primary_alpha_percentile": primary_alpha_percentile,
        "neighbor_alpha_percentile": neighbor_alpha_percentile,
        "primary_srl_percentile": primary_srl_percentile,
        "neighbor_srl_percentile": neighbor_srl_percentile,
        "primary_population_size": len(primary_alphas),
        "neighbor_population_size": len(neighbor_alphas),
    }
def find_nearest_neighbor_scientific(selected_row, df_combined, lens):
    """Locate the closest sample of the other species in (alpha, SRL) space.

    Args:
        selected_row: Row holding ``source`` and the two diagnostic values for
            ``lens``.
        df_combined: DataFrame of all samples with a ``source`` column and the
            ``diag_alpha_<lens>`` / ``diag_srl_<lens>`` columns.
        lens: Lens name used to build the diagnostic column names.

    Returns:
        ``(neighbor_row, distance)`` for the minimum Euclidean distance, or
        ``None`` if there are no rows for the other species, the selected
        row's diagnostics are NaN, or every candidate distance is NaN.
    """
    alpha_key = f"diag_alpha_{lens}"
    srl_key = f"diag_srl_{lens}"

    # Any source other than 'Dog' is compared against the dog population.
    target_species = 'Dog' if selected_row['source'] != 'Dog' else 'Human'
    candidates = df_combined[df_combined['source'] == target_species].copy()
    if not len(candidates):
        return None

    ref_alpha = selected_row[alpha_key]
    ref_srl = selected_row[srl_key]
    if np.isnan(ref_alpha) or np.isnan(ref_srl):
        return None

    # Euclidean distance from the reference point to every candidate.
    delta_alpha = candidates[alpha_key] - ref_alpha
    delta_srl = candidates[srl_key] - ref_srl
    dists = np.sqrt(delta_alpha**2 + delta_srl**2)

    # Candidates with a missing diagnostic produce NaN distances; drop them.
    usable = ~np.isnan(dists)
    if not np.any(usable):
        return None

    usable_dists = dists[usable]
    best_pos = np.argmin(usable_dists)
    best_row = candidates[usable].iloc[best_pos]
    return best_row, float(usable_dists.iloc[best_pos])
def create_scientific_diagnostic_plot(primary_data, neighbor_data, lens):
    """Build a 2x2 figure comparing two samples' alpha and SRL diagnostics.

    Args:
        primary_data: Dict with ``alpha``, ``srl`` and ``label`` for the
            primary sample (drawn in red).
        neighbor_data: Dict with the same keys for the neighbor sample (drawn
            in blue).
        lens: Lens name; shown upper-cased in the titles.

    Returns:
        A Plotly figure. If either input is empty/None, an otherwise empty
        figure titled "Insufficient real data for analysis" is returned.

    Panels: (1,1) alpha values, (1,2) SRL values, (2,1) both samples in the
    alpha-SRL plane, (2,2) the dashed segment joining the two samples. The
    last panel plots only these two points - no population data reaches this
    function - so it is titled as a distance view rather than as population
    context.
    """
    if not primary_data or not neighbor_data:
        return go.Figure(layout={"title": "Insufficient real data for analysis"})

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f"Alpha Values ({lens.upper()} lens)",
            f"SRL Values ({lens.upper()} lens)",
            "Alpha vs SRL Correlation",
            "Euclidean Distance (Alpha-SRL Space)",
        ]
    )

    # Alpha comparison: primary at x=0, neighbor at x=1. Only these two
    # traces appear in the legend.
    fig.add_trace(go.Scatter(
        x=[0], y=[primary_data['alpha']],
        mode='markers', marker=dict(size=15, color='red'),
        name=f"Primary: {primary_data['label']}", showlegend=True
    ), row=1, col=1)
    fig.add_trace(go.Scatter(
        x=[1], y=[neighbor_data['alpha']],
        mode='markers', marker=dict(size=15, color='blue'),
        name=f"Neighbor: {neighbor_data['label']}", showlegend=True
    ), row=1, col=1)

    # SRL comparison, same x positions and colours
    fig.add_trace(go.Scatter(
        x=[0], y=[primary_data['srl']],
        mode='markers', marker=dict(size=15, color='red'),
        showlegend=False
    ), row=1, col=2)
    fig.add_trace(go.Scatter(
        x=[1], y=[neighbor_data['srl']],
        mode='markers', marker=dict(size=15, color='blue'),
        showlegend=False
    ), row=1, col=2)

    # Alpha vs SRL scatter: each sample as one point in the diagnostic plane
    fig.add_trace(go.Scatter(
        x=[primary_data['alpha']], y=[primary_data['srl']],
        mode='markers', marker=dict(size=20, color='red'),
        name="Primary Ξ±-SRL", showlegend=False
    ), row=2, col=1)
    fig.add_trace(go.Scatter(
        x=[neighbor_data['alpha']], y=[neighbor_data['srl']],
        mode='markers', marker=dict(size=20, color='blue'),
        name="Neighbor Ξ±-SRL", showlegend=False
    ), row=2, col=1)

    # Distance visualization: dashed segment between the two samples
    fig.add_trace(go.Scatter(
        x=[primary_data['alpha'], neighbor_data['alpha']],
        y=[primary_data['srl'], neighbor_data['srl']],
        mode='lines+markers',
        line=dict(color='purple', width=3, dash='dash'),
        marker=dict(size=10, color=['red', 'blue']),
        name="Euclidean Distance", showlegend=False
    ), row=2, col=2)

    # Update layout
    fig.update_layout(
        title=f"Scientific CMT Diagnostic Analysis - {lens.upper()} Lens",
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    # Update axes
    fig.update_xaxes(title_text="Sample", row=1, col=1)
    fig.update_yaxes(title_text="Alpha Value", row=1, col=1)
    fig.update_xaxes(title_text="Sample", row=1, col=2)
    fig.update_yaxes(title_text="SRL Value", row=1, col=2)
    fig.update_xaxes(title_text="Alpha", row=2, col=1)
    fig.update_yaxes(title_text="SRL", row=2, col=1)
    fig.update_xaxes(title_text="Alpha", row=2, col=2)
    fig.update_yaxes(title_text="SRL", row=2, col=2)
    return fig
def _failure_outputs(message):
    """Return the four UI outputs shown when the analysis cannot proceed.

    ``message`` is used as both the figure title and the primary-panel text;
    the neighbor and statistics panels receive fixed placeholder text.
    """
    return (
        go.Figure(layout={"title": message}),
        message,
        "No analysis available",
        "No statistics available",
    )


def update_scientific_analysis(species, primary_file, neighbor_file, lens):
    """Run the full analysis for one selected sample and render the results.

    Args:
        species: Value matched against the ``source`` column of the
            module-level ``df_combined``.
        primary_file: Value matched against the ``filepath`` column.
        neighbor_file: Accepted for interface compatibility but not used; the
            neighbor is always computed by nearest-neighbor search.
        lens: Lens name selecting the diagnostic columns.

    Returns:
        A 4-tuple ``(figure, primary_html, neighbor_html, stats_html)``. When
        the sample is missing, no neighbor exists, or the diagnostics are
        invalid, a titled empty figure and placeholder texts are returned
        instead. Any exception is caught and reported in all four outputs.
    """
    from html import escape

    def safe(value):
        # Data-derived text goes into raw HTML panels; escape it so characters
        # such as '<' or '&' in file names or labels cannot break the markup.
        return escape(str(value))

    try:
        # Get rows from real data (selection mask evaluated once)
        matches = df_combined[
            (df_combined["filepath"] == primary_file) &
            (df_combined["source"] == species)
        ]
        if len(matches) == 0:
            return _failure_outputs("Primary sample not found")
        primary_row = matches.iloc[0]

        # Find neighbor
        neighbor_result = find_nearest_neighbor_scientific(primary_row, df_combined, lens)
        if neighbor_result is None:
            return _failure_outputs("No valid neighbor found")
        neighbor_row, distance = neighbor_result

        # Get real CMT data
        primary_cmt = get_real_cmt_diagnostics(primary_row, lens)
        neighbor_cmt = get_real_cmt_diagnostics(neighbor_row, lens)
        if not primary_cmt or not neighbor_cmt:
            return _failure_outputs("Invalid CMT data")

        # Create scientific visualization
        diagnostic_fig = create_scientific_diagnostic_plot(primary_cmt, neighbor_cmt, lens)

        # Calculate statistics
        stats_results = calculate_statistical_significance(
            primary_cmt, neighbor_cmt, df_combined, lens
        )

        # Build information panels
        primary_info = f"""
<h4>π <b>Primary Sample</b></h4>
<div style="background: rgba(240,240,250,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>File:</b> {safe(primary_cmt['filepath'])}</p>
<p><b>Species:</b> {safe(primary_cmt['source'])}</p>
<p><b>Label:</b> {safe(primary_cmt['label'])}</p>
<p><b>CMT Ξ± ({lens}):</b> {primary_cmt['alpha']:.6f}</p>
<p><b>CMT SRL ({lens}):</b> {primary_cmt['srl']:.6f}</p>
</div>
"""
        neighbor_info = f"""
<h4>π <b>Nearest Neighbor</b></h4>
<div style="background: rgba(240,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>File:</b> {safe(neighbor_cmt['filepath'])}</p>
<p><b>Species:</b> {safe(neighbor_cmt['source'])}</p>
<p><b>Label:</b> {safe(neighbor_cmt['label'])}</p>
<p><b>CMT Ξ± ({lens}):</b> {neighbor_cmt['alpha']:.6f}</p>
<p><b>CMT SRL ({lens}):</b> {neighbor_cmt['srl']:.6f}</p>
<p><b>Distance:</b> {distance:.6f}</p>
</div>
"""
        if 'error' not in stats_results:
            # Significance is judged against a fixed 0.05 threshold.
            stats_info = f"""
<h4>π¬ <b>Statistical Analysis</b></h4>
<div style="background: rgba(250,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>Alpha t-test:</b> t = {stats_results['alpha_ttest_statistic']:.4f}, p = {stats_results['alpha_ttest_pvalue']:.6f}</p>
<p><b>SRL t-test:</b> t = {stats_results['srl_ttest_statistic']:.4f}, p = {stats_results['srl_ttest_pvalue']:.6f}</p>
<p><b>Effect Sizes (Cohen's d):</b></p>
<p>β’ Alpha: {stats_results['alpha_effect_size']:.4f}</p>
<p>β’ SRL: {stats_results['srl_effect_size']:.4f}</p>
<p><b>Population Sizes:</b> {stats_results['primary_population_size']} vs {stats_results['neighbor_population_size']}</p>
<p><b>Statistical Significance:</b></p>
<p>β’ Alpha: {'Significant' if stats_results['alpha_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
<p>β’ SRL: {'Significant' if stats_results['srl_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
</div>
"""
        else:
            stats_info = f"<p>Statistical analysis failed: {stats_results['error']}</p>"
        return diagnostic_fig, primary_info, neighbor_info, stats_info
    except Exception as e:
        # UI callback boundary: surface the failure in every output instead
        # of letting the exception propagate to the caller.
        error_msg = f"Analysis error: {str(e)}"
        error_html = escape(error_msg)
        return (
            go.Figure(layout={"title": error_msg}),
            error_html,
            error_html,
            error_html
        )
# --------------------------------------------------------------- | |
# Gradio Interface | |
# --------------------------------------------------------------- | |
# Build the Gradio UI. Components are created in layout order and the event
# listeners are registered inside the Blocks context.
# NOTE(review): the original indentation was lost in this copy of the file;
# the nesting below (in particular whether the three info panels sit in their
# own top-level row or inside the right-hand column) is reconstructed -
# confirm against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")) as demo:
    # Static header describing what the tool does and does not compute.
    gr.Markdown("""
# π¬ **Scientific CMT Diagnostic Analysis Engine**
*Rigorous statistical analysis of real CMT transformation results*
## β οΈ **SCIENTIFIC INTEGRITY NOTICE** β οΈ
**This interface uses ONLY real preprocessed CMT data with NO synthetic generation, interpolation, or speculation.**
**What you see:**
- β **Real CMT diagnostic values** (Ξ±, SRL) from actual transformations
- β **Mathematically rigorous distance measures** (Euclidean distance)
- β **Proper statistical testing** (t-tests, effect sizes, percentiles)
- β **Scientific hypothesis testing** with p-values and confidence measures
**What was REMOVED for scientific rigor:**
- β Synthetic holographic field generation
- β Cubic interpolation of non-existent data
- β Speculative similarity metrics
- β Confirmation bias in pattern detection
- β Ungrounded "communication bridge" calculations
""")
    with gr.Row():
        # Left column: input controls.
        with gr.Column(scale=1):
            gr.Markdown("### π¬ **Analysis Controls**")
            species_selection = gr.Dropdown(
                label="Species",
                choices=["Dog", "Human"],
                value="Dog",
                info="Select primary species for analysis"
            )
            # The lens name is interpolated into the diag_alpha_<lens> /
            # diag_srl_<lens> column names by the analysis functions.
            lens_selection = gr.Dropdown(
                label="Mathematical Lens",
                choices=["gamma", "zeta", "airy", "bessel"],
                value="gamma",
                info="CMT lens function used for analysis"
            )
            # Starts with the dog file paths because the species dropdown
            # defaults to "Dog"; falls back to "" when there are no dog rows.
            primary_file_selection = gr.Dropdown(
                label="Primary Sample",
                choices=df_combined[df_combined["source"] == "Dog"]["filepath"].tolist(),
                value=df_combined[df_combined["source"] == "Dog"]["filepath"].iloc[0] if len(df_combined[df_combined["source"] == "Dog"]) > 0 else "",
                info="Select specific sample for analysis"
            )
            # NOTE(review): nothing in this block populates this dropdown, and
            # update_scientific_analysis ignores the value it is passed - it
            # appears to be a placeholder; confirm before removing.
            neighbor_file_selection = gr.Dropdown(
                label="Comparison Sample",
                choices=[],
                value="",
                info="Nearest neighbor will be automatically found"
            )
        # Right column: the diagnostic figure.
        with gr.Column(scale=2):
            diagnostic_plot = gr.Plot(label="Scientific Diagnostic Analysis")
    # Three HTML panels fed by the analysis callback.
    with gr.Row():
        with gr.Column():
            primary_info_display = gr.HTML(label="Primary Sample Analysis")
        with gr.Column():
            neighbor_info_display = gr.HTML(label="Neighbor Analysis")
        with gr.Column():
            stats_info_display = gr.HTML(label="Statistical Results")

    # Update file choices when species changes
    def update_file_choices(species):
        """Return a Dropdown update listing the file paths for ``species``.

        The first path is preselected; the value is "" when the species has
        no rows in ``df_combined``.
        """
        choices = df_combined[df_combined["source"] == species]["filepath"].tolist()
        return gr.Dropdown(choices=choices, value=choices[0] if choices else "")

    species_selection.change(
        fn=update_file_choices,
        inputs=[species_selection],
        outputs=[primary_file_selection]
    )

    # Main analysis update: re-run the analysis whenever the species, primary
    # sample, or lens changes. The neighbor dropdown is passed as an input
    # but is not itself a trigger.
    # NOTE(review): a species change fires both this listener and
    # update_file_choices; the analysis may briefly run with the previous
    # species' file path - ordering is not established here, confirm.
    for input_component in [species_selection, primary_file_selection, lens_selection]:
        input_component.change(
            fn=update_scientific_analysis,
            inputs=[species_selection, primary_file_selection, neighbor_file_selection, lens_selection],
            outputs=[diagnostic_plot, primary_info_display, neighbor_info_display, stats_info_display]
        )

print("π¬ Scientific CMT Diagnostic Analysis Engine Ready!")

# Launch the app only when executed as a script, without a public share link
# and with debug mode off.
if __name__ == "__main__":
    demo.launch(share=False, debug=False)