CMT-Mapping / app.py
Severian's picture
Update app.py
e0da54d verified
raw
history blame
18.6 kB
#!/usr/bin/env python3
"""
Scientific CMT Diagnostic Analysis Engine
Rigorous statistical analysis of real CMT transformation results
🔬 SCIENTIFIC INTEGRITY COMPLIANCE 🔬
- Uses ONLY real preprocessed CMT data from CSV files
- NO synthetic data generation
- NO interpolation or field reconstruction
- NO speculative similarity metrics
- Proper statistical hypothesis testing
- Mathematically grounded distance measures
"""
import warnings
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import gradio as gr
# Suppress FutureWarning / UserWarning noise so the console only shows the
# status lines printed below.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
print("πŸ”¬ Initializing Scientific CMT Diagnostic Analysis Engine...")

# ---------------------------------------------------------------
# Platform-aware data loading
# ---------------------------------------------------------------
# Candidate CSV locations: relative paths (Hugging Face Spaces working
# directory) and absolute /content paths (Google Colab).
HF_CSV_DOG = "cmt_dog_sound_analysis.csv"
HF_CSV_HUMAN = "cmt_human_speech_analysis.csv"
COLAB_CSV_DOG = "/content/cmt_dog_sound_analysis.csv"
COLAB_CSV_HUMAN = "/content/cmt_human_speech_analysis.csv"

# Determine platform and set paths. Both files of a pair must exist for that
# pair to be selected; the relative paths take precedence.
if os.path.exists(HF_CSV_DOG) and os.path.exists(HF_CSV_HUMAN):
    CSV_DOG = HF_CSV_DOG
    CSV_HUMAN = HF_CSV_HUMAN
    print("βœ… Using Hugging Face Spaces data files")
elif os.path.exists(COLAB_CSV_DOG) and os.path.exists(COLAB_CSV_HUMAN):
    CSV_DOG = COLAB_CSV_DOG
    CSV_HUMAN = COLAB_CSV_HUMAN
    print("βœ… Using Google Colab data files")
else:
    print("❌ No real data files found - cannot proceed without actual CMT data")
    # Raise SystemExit directly: the bare `exit` helper is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. `python -S` or frozen executables).
    raise SystemExit(1)

# Load real CMT data
try:
    df_dog = pd.read_csv(CSV_DOG)
    df_human = pd.read_csv(CSV_HUMAN)
    # Tag every row with its species so the combined frame can be filtered.
    df_dog['source'] = 'Dog'
    df_human['source'] = 'Human'
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    df_combined = pd.concat([df_dog, df_human], ignore_index=True)
    print(f"βœ… Loaded real CMT data: {len(df_dog)} dog samples, {len(df_human)} human samples")
except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status.
    print(f"❌ Error loading real CMT data: {e}")
    raise SystemExit(1)
# ---------------------------------------------------------------
# Scientific Analysis Functions
# ---------------------------------------------------------------
def get_real_cmt_diagnostics(row: pd.Series, lens: str):
    """Read the stored alpha/SRL diagnostics for one lens from a data row.

    Args:
        row: A record exposing ``.get`` (e.g. one row of the combined frame).
        lens: Lens name used to build the ``diag_alpha_<lens>`` and
            ``diag_srl_<lens>`` column names.

    Returns:
        A dict with ``alpha`` and ``srl`` as floats plus ``filepath``,
        ``label`` and ``source`` (each defaulting to ``"unknown"``), or
        ``None`` when either diagnostic is missing/NaN or extraction fails.
    """
    try:
        alpha_raw = row.get(f"diag_alpha_{lens}", np.nan)
        srl_raw = row.get(f"diag_srl_{lens}", np.nan)

        # Only build the record when both diagnostics are real numbers.
        if not (np.isnan(alpha_raw) or np.isnan(srl_raw)):
            record = {"alpha": float(alpha_raw), "srl": float(srl_raw)}
            for field in ("filepath", "label", "source"):
                record[field] = row.get(field, "unknown")
            return record
        return None
    except Exception as e:
        # Best-effort extraction: report the problem and signal "no data".
        print(f"Error extracting real CMT data: {e}")
        return None
def calculate_statistical_significance(primary_data, neighbor_data, df_combined, lens):
    """Compare the two samples' species populations on alpha and SRL.

    Args:
        primary_data: Dict with ``alpha``, ``srl`` and ``source`` for the
            selected sample.
        neighbor_data: Dict with the same keys for the comparison sample.
        df_combined: Frame holding a ``source`` column and the
            ``diag_alpha_<lens>`` / ``diag_srl_<lens>`` columns.
        lens: Lens name used to build the diagnostic column names.

    Returns:
        ``{"error": ...}`` when any of the four population samples (alpha or
        SRL, primary or neighbor) has fewer than two non-NaN values.
        Otherwise a dict with the independent-samples t-test statistic and
        p-value for alpha and SRL, Cohen's d for both, the Euclidean distance
        between the two samples, each sample's percentile within its own
        population, and the two alpha population sizes.
    """
    alpha_col = f"diag_alpha_{lens}"
    srl_col = f"diag_srl_{lens}"

    def cohens_d(x, y):
        """Return Cohen's d using the pooled sample standard deviation."""
        nx, ny = len(x), len(y)
        if nx < 2 or ny < 2:
            return np.nan
        pooled_std = np.sqrt(
            ((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1))
            / (nx + ny - 2)
        )
        # Zero spread in both groups: report no effect rather than divide by 0.
        return (np.mean(x) - np.mean(y)) / pooled_std if pooled_std > 0 else 0.0

    # Get population data for context
    primary_population = df_combined[df_combined['source'] == primary_data['source']]
    neighbor_population = df_combined[df_combined['source'] == neighbor_data['source']]
    primary_alphas = primary_population[alpha_col].dropna()
    neighbor_alphas = neighbor_population[alpha_col].dropna()
    primary_srls = primary_population[srl_col].dropna()
    neighbor_srls = neighbor_population[srl_col].dropna()

    # Every sample fed to a t-test needs at least two observations; otherwise
    # the statistic and p-value come back NaN and would be displayed as a
    # (meaningless) "Not significant" verdict.
    samples = (primary_alphas, neighbor_alphas, primary_srls, neighbor_srls)
    if any(len(sample) < 2 for sample in samples):
        return {"error": "Insufficient data for statistical analysis"}

    # Statistical tests (SciPy's default equal-variance form, which matches
    # the pooled standard deviation used for Cohen's d above).
    alpha_ttest = stats.ttest_ind(primary_alphas, neighbor_alphas)
    srl_ttest = stats.ttest_ind(primary_srls, neighbor_srls)

    # Effect sizes (Cohen's d)
    alpha_effect_size = cohens_d(primary_alphas, neighbor_alphas)
    srl_effect_size = cohens_d(primary_srls, neighbor_srls)

    # Euclidean distance between the two samples in (alpha, SRL) space
    diagnostic_distance = np.sqrt(
        (primary_data['alpha'] - neighbor_data['alpha'])**2 +
        (primary_data['srl'] - neighbor_data['srl'])**2
    )

    # Percentile of each sample within its own species population
    primary_alpha_percentile = stats.percentileofscore(primary_alphas, primary_data['alpha'])
    neighbor_alpha_percentile = stats.percentileofscore(neighbor_alphas, neighbor_data['alpha'])
    primary_srl_percentile = stats.percentileofscore(primary_srls, primary_data['srl'])
    neighbor_srl_percentile = stats.percentileofscore(neighbor_srls, neighbor_data['srl'])

    return {
        "alpha_ttest_statistic": alpha_ttest.statistic,
        "alpha_ttest_pvalue": alpha_ttest.pvalue,
        "srl_ttest_statistic": srl_ttest.statistic,
        "srl_ttest_pvalue": srl_ttest.pvalue,
        "alpha_effect_size": alpha_effect_size,
        "srl_effect_size": srl_effect_size,
        "diagnostic_distance": diagnostic_distance,
        "primary_alpha_percentile": primary_alpha_percentile,
        "neighbor_alpha_percentile": neighbor_alpha_percentile,
        "primary_srl_percentile": primary_srl_percentile,
        "neighbor_srl_percentile": neighbor_srl_percentile,
        "primary_population_size": len(primary_alphas),
        "neighbor_population_size": len(neighbor_alphas),
    }
def find_nearest_neighbor_scientific(selected_row, df_combined, lens):
    """Locate the closest sample of the other species in (alpha, SRL) space.

    Args:
        selected_row: Row holding ``source`` and the two diagnostic columns
            for ``lens``.
        df_combined: Frame with a ``source`` column and the same diagnostic
            columns.
        lens: Lens name used to build the diagnostic column names.

    Returns:
        ``(row, distance)`` for the nearest opposite-species row, where
        ``distance`` is the Euclidean distance as a float; ``None`` when the
        other species has no rows, the selected diagnostics are NaN, or no
        candidate has a computable distance.
    """
    # A 'Dog' selection is matched against 'Human'; anything else against 'Dog'.
    target_source = 'Dog' if selected_row['source'] != 'Dog' else 'Human'
    alpha_col, srl_col = f"diag_alpha_{lens}", f"diag_srl_{lens}"

    candidates = df_combined[df_combined['source'] == target_source].copy()
    if candidates.shape[0] == 0:
        return None

    ref_alpha = selected_row[alpha_col]
    ref_srl = selected_row[srl_col]
    if np.isnan(ref_alpha) or np.isnan(ref_srl):
        return None

    # Euclidean distance from the selected sample to every candidate.
    delta_alpha = candidates[alpha_col] - ref_alpha
    delta_srl = candidates[srl_col] - ref_srl
    distances = np.sqrt(delta_alpha**2 + delta_srl**2)

    # Candidates with a NaN diagnostic yield a NaN distance; discard them.
    usable = np.logical_not(np.isnan(distances))
    if not np.any(usable):
        return None

    kept_distances = distances[usable]
    kept_rows = candidates[usable]
    # Positional index of the smallest distance (first one on ties).
    best = np.argmin(kept_distances)
    return kept_rows.iloc[best], float(kept_distances.iloc[best])
def create_scientific_diagnostic_plot(primary_data, neighbor_data, lens):
    """Build the 2x2 comparison figure for a primary/neighbor sample pair.

    Args:
        primary_data: Dict with ``alpha``, ``srl`` and ``label`` for the
            selected sample (drawn in red).
        neighbor_data: Dict with the same keys for the comparison sample
            (drawn in blue).
        lens: Lens name, shown upper-cased in the titles.

    Returns:
        A plotly figure with four panels: alpha values, SRL values, the two
        samples in (alpha, SRL) space, and a dashed line joining them. If
        either dict is empty/None, an otherwise empty figure carrying an
        explanatory title is returned.
    """
    if not (primary_data and neighbor_data):
        return go.Figure(layout={"title": "Insufficient real data for analysis"})

    lens_tag = lens.upper()
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f"Alpha Values ({lens_tag} lens)",
            f"SRL Values ({lens_tag} lens)",
            "Alpha vs SRL Correlation",
            "Population Context",
        ],
    )

    # (x slot in the side-by-side panels, marker colour, sample, legend role)
    samples = (
        (0, 'red', primary_data, "Primary"),
        (1, 'blue', neighbor_data, "Neighbor"),
    )

    # Top-left: alpha of each sample; these two traces own the legend entries.
    for slot, colour, sample, role in samples:
        fig.add_trace(go.Scatter(
            x=[slot], y=[sample['alpha']],
            mode='markers', marker=dict(size=15, color=colour),
            name=f"{role}: {sample['label']}", showlegend=True,
        ), row=1, col=1)

    # Top-right: SRL of each sample.
    for slot, colour, sample, _role in samples:
        fig.add_trace(go.Scatter(
            x=[slot], y=[sample['srl']],
            mode='markers', marker=dict(size=15, color=colour),
            showlegend=False,
        ), row=1, col=2)

    # Bottom-left: each sample as a point in (alpha, SRL) space.
    for _slot, colour, sample, role in samples:
        fig.add_trace(go.Scatter(
            x=[sample['alpha']], y=[sample['srl']],
            mode='markers', marker=dict(size=20, color=colour),
            name=f"{role} Ξ±-SRL", showlegend=False,
        ), row=2, col=1)

    # Bottom-right: dashed segment joining the two samples, i.e. the distance
    # between them (no population data is passed to this function).
    fig.add_trace(go.Scatter(
        x=[primary_data['alpha'], neighbor_data['alpha']],
        y=[primary_data['srl'], neighbor_data['srl']],
        mode='lines+markers',
        line=dict(color='purple', width=3, dash='dash'),
        marker=dict(size=10, color=['red', 'blue']),
        name="Euclidean Distance", showlegend=False,
    ), row=2, col=2)

    fig.update_layout(
        title=f"Scientific CMT Diagnostic Analysis - {lens_tag} Lens",
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white',
    )

    # Axis titles per panel: (row, col, x title, y title).
    axis_titles = (
        (1, 1, "Sample", "Alpha Value"),
        (1, 2, "Sample", "SRL Value"),
        (2, 1, "Alpha", "SRL"),
        (2, 2, "Alpha", "SRL"),
    )
    for panel_row, panel_col, x_title, y_title in axis_titles:
        fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)
        fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col)

    return fig
def update_scientific_analysis(species, primary_file, neighbor_file, lens):
    """Run the full analysis for one selected sample and format the results.

    Args:
        species: Value matched against the ``source`` column.
        primary_file: Value matched against the ``filepath`` column.
        neighbor_file: Accepted for the UI wiring but not read here; the
            comparison sample is always the computed nearest neighbor.
        lens: Lens name selecting which diagnostic columns are used.

    Returns:
        A 4-tuple ``(figure, primary_html, neighbor_html, stats_html)``.
        When a step cannot proceed, the figure carries the reason as its
        title and the text slots carry fallback messages; any exception is
        reported the same way as ``"Analysis error: ..."`` in all slots.
    """
    def failure(reason):
        # Shared shape of every "cannot analyse" result.
        return (
            go.Figure(layout={"title": reason}),
            reason,
            "No analysis available",
            "No statistics available",
        )

    try:
        # Look up the selected sample; the first matching row is used.
        matches = df_combined[
            (df_combined["filepath"] == primary_file) &
            (df_combined["source"] == species)
        ]
        if len(matches) == 0:
            return failure("Primary sample not found")
        primary_row = matches.iloc[0]

        # Nearest sample of the other species
        neighbor_result = find_nearest_neighbor_scientific(primary_row, df_combined, lens)
        if neighbor_result is None:
            return failure("No valid neighbor found")
        neighbor_row, distance = neighbor_result

        # Stored diagnostics for both samples
        primary_cmt = get_real_cmt_diagnostics(primary_row, lens)
        neighbor_cmt = get_real_cmt_diagnostics(neighbor_row, lens)
        if not (primary_cmt and neighbor_cmt):
            return failure("Invalid CMT data")

        diagnostic_fig = create_scientific_diagnostic_plot(primary_cmt, neighbor_cmt, lens)
        stats_results = calculate_statistical_significance(
            primary_cmt, neighbor_cmt, df_combined, lens
        )

        # Information panels (HTML fragments rendered by the UI)
        primary_info = f"""
<h4>πŸ“Š <b>Primary Sample</b></h4>
<div style="background: rgba(240,240,250,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>File:</b> {primary_cmt['filepath']}</p>
<p><b>Species:</b> {primary_cmt['source']}</p>
<p><b>Label:</b> {primary_cmt['label']}</p>
<p><b>CMT Ξ± ({lens}):</b> {primary_cmt['alpha']:.6f}</p>
<p><b>CMT SRL ({lens}):</b> {primary_cmt['srl']:.6f}</p>
</div>
"""
        neighbor_info = f"""
<h4>πŸ”— <b>Nearest Neighbor</b></h4>
<div style="background: rgba(240,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>File:</b> {neighbor_cmt['filepath']}</p>
<p><b>Species:</b> {neighbor_cmt['source']}</p>
<p><b>Label:</b> {neighbor_cmt['label']}</p>
<p><b>CMT Ξ± ({lens}):</b> {neighbor_cmt['alpha']:.6f}</p>
<p><b>CMT SRL ({lens}):</b> {neighbor_cmt['srl']:.6f}</p>
<p><b>Distance:</b> {distance:.6f}</p>
</div>
"""
        if 'error' in stats_results:
            stats_info = f"<p>Statistical analysis failed: {stats_results['error']}</p>"
        else:
            # Verdicts use the conventional 0.05 threshold on each p-value.
            alpha_verdict = 'Significant' if stats_results['alpha_ttest_pvalue'] < 0.05 else 'Not significant'
            srl_verdict = 'Significant' if stats_results['srl_ttest_pvalue'] < 0.05 else 'Not significant'
            stats_info = f"""
<h4>πŸ”¬ <b>Statistical Analysis</b></h4>
<div style="background: rgba(250,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
<p><b>Alpha t-test:</b> t = {stats_results['alpha_ttest_statistic']:.4f}, p = {stats_results['alpha_ttest_pvalue']:.6f}</p>
<p><b>SRL t-test:</b> t = {stats_results['srl_ttest_statistic']:.4f}, p = {stats_results['srl_ttest_pvalue']:.6f}</p>
<p><b>Effect Sizes (Cohen's d):</b></p>
<p>β€’ Alpha: {stats_results['alpha_effect_size']:.4f}</p>
<p>β€’ SRL: {stats_results['srl_effect_size']:.4f}</p>
<p><b>Population Sizes:</b> {stats_results['primary_population_size']} vs {stats_results['neighbor_population_size']}</p>
<p><b>Statistical Significance:</b></p>
<p>β€’ Alpha: {alpha_verdict}</p>
<p>β€’ SRL: {srl_verdict}</p>
</div>
"""
        return diagnostic_fig, primary_info, neighbor_info, stats_info
    except Exception as e:
        # UI callback boundary: surface the error in every output slot
        # instead of letting the handler raise.
        error_msg = f"Analysis error: {str(e)}"
        return (
            go.Figure(layout={"title": error_msg}),
            error_msg,
            error_msg,
            error_msg,
        )
# ---------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------
# Build the UI: an intro notice, a control column beside the plot, and a row
# of three HTML result panels. Event listeners are registered inside the
# Blocks context.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("""
# πŸ”¬ **Scientific CMT Diagnostic Analysis Engine**
*Rigorous statistical analysis of real CMT transformation results*
## ⚠️ **SCIENTIFIC INTEGRITY NOTICE** ⚠️
**This interface uses ONLY real preprocessed CMT data with NO synthetic generation, interpolation, or speculation.**
**What you see:**
- βœ… **Real CMT diagnostic values** (Ξ±, SRL) from actual transformations
- βœ… **Mathematically rigorous distance measures** (Euclidean distance)
- βœ… **Proper statistical testing** (t-tests, effect sizes, percentiles)
- βœ… **Scientific hypothesis testing** with p-values and confidence measures
**What was REMOVED for scientific rigor:**
- ❌ Synthetic holographic field generation
- ❌ Cubic interpolation of non-existent data
- ❌ Speculative similarity metrics
- ❌ Confirmation bias in pattern detection
- ❌ Ungrounded "communication bridge" calculations
""")

    # Files offered initially: the species dropdown starts on "Dog".
    initial_dog_files = df_combined[df_combined["source"] == "Dog"]["filepath"].tolist()

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ”¬ **Analysis Controls**")
            species_selection = gr.Dropdown(
                label="Species",
                choices=["Dog", "Human"],
                value="Dog",
                info="Select primary species for analysis"
            )
            lens_selection = gr.Dropdown(
                label="Mathematical Lens",
                choices=["gamma", "zeta", "airy", "bessel"],
                value="gamma",
                info="CMT lens function used for analysis"
            )
            primary_file_selection = gr.Dropdown(
                label="Primary Sample",
                choices=initial_dog_files,
                value=initial_dog_files[0] if initial_dog_files else "",
                info="Select specific sample for analysis"
            )
            # Passed to the analysis callback as an input, but never
            # populated by any handler in this block.
            neighbor_file_selection = gr.Dropdown(
                label="Comparison Sample",
                choices=[],
                value="",
                info="Nearest neighbor will be automatically found"
            )
        with gr.Column(scale=2):
            diagnostic_plot = gr.Plot(label="Scientific Diagnostic Analysis")

    with gr.Row():
        with gr.Column():
            primary_info_display = gr.HTML(label="Primary Sample Analysis")
        with gr.Column():
            neighbor_info_display = gr.HTML(label="Neighbor Analysis")
        with gr.Column():
            stats_info_display = gr.HTML(label="Statistical Results")

    # Update file choices when species changes
    def update_file_choices(species):
        """Return the primary-sample dropdown repopulated for ``species``."""
        choices = df_combined[df_combined["source"] == species]["filepath"].tolist()
        return gr.Dropdown(choices=choices, value=choices[0] if choices else "")

    analysis_inputs = [
        species_selection, primary_file_selection,
        neighbor_file_selection, lens_selection,
    ]
    analysis_outputs = [
        diagnostic_plot, primary_info_display,
        neighbor_info_display, stats_info_display,
    ]

    # On a species change, refresh the file list FIRST and only then run the
    # analysis. Registering both as independent listeners on the same event
    # would run the analysis with the new species but the previous species'
    # file, which reports "Primary sample not found".
    species_selection.change(
        fn=update_file_choices,
        inputs=[species_selection],
        outputs=[primary_file_selection]
    ).then(
        fn=update_scientific_analysis,
        inputs=analysis_inputs,
        outputs=analysis_outputs
    )

    # Main analysis update for direct changes to the sample or the lens
    for input_component in [primary_file_selection, lens_selection]:
        input_component.change(
            fn=update_scientific_analysis,
            inputs=analysis_inputs,
            outputs=analysis_outputs
        )

print("πŸ”¬ Scientific CMT Diagnostic Analysis Engine Ready!")

if __name__ == "__main__":
    demo.launch(share=False, debug=False)