Spaces:

Severian
/

CMT-Mapping

Running

App Files Files Community

CMT-Mapping / app.py

Severian

Update app.py

e0da54d verified about 1 month ago

raw

history blame

18.6 kB

	#!/usr/bin/env python3
	"""
	Scientific CMT Diagnostic Analysis Engine
	Rigorous statistical analysis of real CMT transformation results

	🔬 SCIENTIFIC INTEGRITY COMPLIANCE 🔬
	- Uses ONLY real preprocessed CMT data from CSV files
	- NO synthetic data generation
	- NO interpolation or field reconstruction
	- NO speculative similarity metrics
	- Proper statistical hypothesis testing
	- Mathematically grounded distance measures
	"""

	import warnings
	import os
	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from scipy import stats
	import gradio as gr

	# Install process-wide "ignore" filters for the FutureWarning and UserWarning
	# categories.
	# NOTE(review): this hides every UserWarning, including ones raised by the
	# imported libraries - confirm that blanket suppression is intended.
	warnings.filterwarnings("ignore", category=FutureWarning)
	warnings.filterwarnings("ignore", category=UserWarning)

	# Startup banner; runs whenever this module is executed or imported.
	print("🔬 Initializing Scientific CMT Diagnostic Analysis Engine...")

	# ---------------------------------------------------------------
	# Platform-aware data loading
	# ---------------------------------------------------------------
	# Candidate locations of the two preprocessed CMT result files.  The first
	# pair is relative to the current working directory, the second pair is
	# absolute under /content.
	HF_CSV_DOG = "cmt_dog_sound_analysis.csv"
	HF_CSV_HUMAN = "cmt_human_speech_analysis.csv"
	COLAB_CSV_DOG = "/content/cmt_dog_sound_analysis.csv"
	COLAB_CSV_HUMAN = "/content/cmt_human_speech_analysis.csv"

	# Determine platform and set paths: use the first location where BOTH files
	# exist, so the dog and human data always come from the same place.
	if os.path.exists(HF_CSV_DOG) and os.path.exists(HF_CSV_HUMAN):
	    CSV_DOG = HF_CSV_DOG
	    CSV_HUMAN = HF_CSV_HUMAN
	    print("✅ Using Hugging Face Spaces data files")
	elif os.path.exists(COLAB_CSV_DOG) and os.path.exists(COLAB_CSV_HUMAN):
	    CSV_DOG = COLAB_CSV_DOG
	    CSV_HUMAN = COLAB_CSV_HUMAN
	    print("✅ Using Google Colab data files")
	else:
	    print("❌ No real data files found - cannot proceed without actual CMT data")
	    # The bare `exit` name is injected by the `site` module for interactive
	    # use and may be absent (e.g. `python -S`, frozen/embedded interpreters).
	    # Raising SystemExit is the portable equivalent and needs no import.
	    raise SystemExit(1)

	# Load real CMT data.  Only the two reads are guarded: they are the
	# statements that touch the filesystem and parse external content.
	try:
	    df_dog = pd.read_csv(CSV_DOG)
	    df_human = pd.read_csv(CSV_HUMAN)
	except Exception as e:
	    print(f"❌ Error loading real CMT data: {e}")
	    # `exit` comes from the `site` module and is not guaranteed to exist;
	    # SystemExit terminates with the same status code everywhere.
	    raise SystemExit(1) from e

	# Tag every record with its origin so both sets can share one frame.
	df_dog['source'] = 'Dog'
	df_human['source'] = 'Human'
	df_combined = pd.concat([df_dog, df_human], ignore_index=True)
	print(f"✅ Loaded real CMT data: {len(df_dog)} dog samples, {len(df_human)} human samples")

	# ---------------------------------------------------------------
	# Scientific Analysis Functions
	# ---------------------------------------------------------------

	def get_real_cmt_diagnostics(row: pd.Series, lens: str):
	    """Extract ONLY real preprocessed CMT diagnostic values - NO synthesis.

	    Args:
	        row: One sample record.  It is read exclusively through ``row.get``,
	            so a ``pd.Series`` or any mapping with ``get`` works.
	        lens: Lens name; selects the ``diag_alpha_<lens>`` and
	            ``diag_srl_<lens>`` entries of ``row``.

	    Returns:
	        A dict with keys ``alpha``, ``srl`` (floats), ``filepath``, ``label``
	        and ``source`` (each defaulting to ``"unknown"``), or ``None`` when
	        either diagnostic value is absent, missing (NaN/None) or cannot be
	        converted to ``float``.
	    """
	    alpha_val = row.get(f"diag_alpha_{lens}", np.nan)
	    srl_val = row.get(f"diag_srl_{lens}", np.nan)

	    try:
	        # pd.isna recognises None/NaN/NaT alike; np.isnan raises TypeError on
	        # None and on strings, which previously surfaced as a printed error.
	        if pd.isna(alpha_val) or pd.isna(srl_val):
	            return None
	        alpha = float(alpha_val)
	        srl = float(srl_val)
	    except (TypeError, ValueError) as e:
	        print(f"Error extracting real CMT data: {e}")
	        return None

	    # Values such as the string "nan" only become NaN after conversion.
	    if np.isnan(alpha) or np.isnan(srl):
	        return None

	    return {
	        "alpha": alpha,
	        "srl": srl,
	        "filepath": row.get("filepath", "unknown"),
	        "label": row.get("label", "unknown"),
	        "source": row.get("source", "unknown"),
	    }

	def _cohens_d(x, y):
	    """Return Cohen's d of two samples using the pooled sample standard deviation."""
	    nx, ny = len(x), len(y)
	    if nx < 2 or ny < 2:
	        return np.nan
	    pooled_var = (
	        (nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)
	    ) / (nx + ny - 2)
	    pooled_std = np.sqrt(pooled_var)
	    # Two constant samples have no spread; report "no effect" instead of
	    # dividing by zero.
	    return (np.mean(x) - np.mean(y)) / pooled_std if pooled_std > 0 else 0.0


	def calculate_statistical_significance(primary_data, neighbor_data, df_combined, lens):
	    """Rigorous statistical analysis with proper hypothesis testing.

	    The t-tests and effect sizes compare the two *source populations* that
	    the samples belong to (all rows of ``df_combined`` sharing each sample's
	    ``source``), not the two individual samples.  The distance and the
	    percentiles are computed for the individual samples.

	    Args:
	        primary_data: Mapping with ``alpha``, ``srl`` and ``source`` entries
	            for the primary sample.
	        neighbor_data: Same structure for the comparison sample.
	        df_combined: Frame holding a ``source`` column plus the
	            ``diag_alpha_<lens>`` / ``diag_srl_<lens>`` columns.
	        lens: Lens name used to build those column names.

	    Returns:
	        ``{"error": ...}`` when any of the four samples (alpha/SRL for either
	        population) has fewer than two non-missing values; otherwise a dict
	        with the t statistics and p-values, Cohen's d for alpha and SRL, the
	        Euclidean distance between the two samples, each sample's percentile
	        within its own population, and the two population sizes.
	    """
	    alpha_col = f"diag_alpha_{lens}"
	    srl_col = f"diag_srl_{lens}"

	    # Get population data for context
	    primary_population = df_combined[df_combined['source'] == primary_data['source']]
	    neighbor_population = df_combined[df_combined['source'] == neighbor_data['source']]

	    primary_alphas = primary_population[alpha_col].dropna()
	    neighbor_alphas = neighbor_population[alpha_col].dropna()
	    primary_srls = primary_population[srl_col].dropna()
	    neighbor_srls = neighbor_population[srl_col].dropna()

	    # Alpha and SRL are dropped of missing values independently, so all four
	    # samples must be checked: a t-test on fewer than two values yields NaN,
	    # which downstream would be reported as "Not significant".
	    samples = (primary_alphas, neighbor_alphas, primary_srls, neighbor_srls)
	    if any(len(sample) < 2 for sample in samples):
	        return {"error": "Insufficient data for statistical analysis"}

	    # Statistical tests
	    alpha_ttest = stats.ttest_ind(primary_alphas, neighbor_alphas)
	    srl_ttest = stats.ttest_ind(primary_srls, neighbor_srls)

	    # Effect sizes (Cohen's d)
	    alpha_effect_size = _cohens_d(primary_alphas, neighbor_alphas)
	    srl_effect_size = _cohens_d(primary_srls, neighbor_srls)

	    # Euclidean distance between the two individual samples
	    diagnostic_distance = np.sqrt(
	        (primary_data['alpha'] - neighbor_data['alpha']) ** 2
	        + (primary_data['srl'] - neighbor_data['srl']) ** 2
	    )

	    # Percentile of each sample within its own population
	    primary_alpha_percentile = stats.percentileofscore(primary_alphas, primary_data['alpha'])
	    neighbor_alpha_percentile = stats.percentileofscore(neighbor_alphas, neighbor_data['alpha'])
	    primary_srl_percentile = stats.percentileofscore(primary_srls, primary_data['srl'])
	    neighbor_srl_percentile = stats.percentileofscore(neighbor_srls, neighbor_data['srl'])

	    return {
	        "alpha_ttest_statistic": alpha_ttest.statistic,
	        "alpha_ttest_pvalue": alpha_ttest.pvalue,
	        "srl_ttest_statistic": srl_ttest.statistic,
	        "srl_ttest_pvalue": srl_ttest.pvalue,
	        "alpha_effect_size": alpha_effect_size,
	        "srl_effect_size": srl_effect_size,
	        "diagnostic_distance": diagnostic_distance,
	        "primary_alpha_percentile": primary_alpha_percentile,
	        "neighbor_alpha_percentile": neighbor_alpha_percentile,
	        "primary_srl_percentile": primary_srl_percentile,
	        "neighbor_srl_percentile": neighbor_srl_percentile,
	        "primary_population_size": len(primary_alphas),
	        "neighbor_population_size": len(neighbor_alphas),
	    }

	def find_nearest_neighbor_scientific(selected_row, df_combined, lens):
	    """Find nearest neighbor using only Euclidean distance in diagnostic space.

	    The search is restricted to rows of the opposite source: ``'Human'`` when
	    the selected row's source is ``'Dog'``, and ``'Dog'`` for anything else.

	    Args:
	        selected_row: Record with ``source``, ``diag_alpha_<lens>`` and
	            ``diag_srl_<lens>`` entries.
	        df_combined: Frame with the same columns holding all candidates.
	        lens: Lens name used to build the two diagnostic column names.

	    Returns:
	        ``(row, distance)`` for the closest candidate (the first one on ties),
	        or ``None`` when there are no opposite-source rows, the selected row
	        has a missing diagnostic value, or no candidate has a usable distance.
	    """
	    alpha_col = f"diag_alpha_{lens}"
	    srl_col = f"diag_srl_{lens}"

	    opposite_source = 'Human' if selected_row['source'] == 'Dog' else 'Dog'
	    # Boolean indexing already returns a new frame and nothing below mutates
	    # it, so no defensive copy of the whole population is needed.
	    candidates = df_combined[df_combined['source'] == opposite_source]
	    if len(candidates) == 0:
	        return None

	    selected_alpha = selected_row[alpha_col]
	    selected_srl = selected_row[srl_col]
	    # pd.isna also covers None / object-typed values, on which np.isnan
	    # raises TypeError.
	    if pd.isna(selected_alpha) or pd.isna(selected_srl):
	        return None

	    # Calculate Euclidean distances as a plain float array so the lookup
	    # below is purely positional and independent of the frame's index labels.
	    distances = np.sqrt(
	        (candidates[alpha_col] - selected_alpha) ** 2
	        + (candidates[srl_col] - selected_srl) ** 2
	    ).to_numpy(dtype=float)

	    usable = ~np.isnan(distances)
	    if not usable.any():
	        return None

	    # argmin over the usable entries only, mapped back to a position in
	    # `candidates`.
	    usable_positions = np.flatnonzero(usable)
	    nearest_pos = int(usable_positions[np.argmin(distances[usable])])
	    return candidates.iloc[nearest_pos], float(distances[nearest_pos])

	def create_scientific_diagnostic_plot(primary_data, neighbor_data, lens):
	    """Create scientifically rigorous diagnostic plots using ONLY real data.

	    Builds a 2x2 figure from the two samples' ``alpha`` / ``srl`` values:
	    alpha side by side, SRL side by side, both samples in the alpha-SRL
	    plane, and the same two points joined by a dashed line.  When either
	    argument is falsy, an empty figure carrying only an explanatory title is
	    returned instead.
	    """
	    if not (primary_data and neighbor_data):
	        return go.Figure(layout={"title": "Insufficient real data for analysis"})

	    lens_tag = lens.upper()
	    p_alpha, p_srl = primary_data['alpha'], primary_data['srl']
	    n_alpha, n_srl = neighbor_data['alpha'], neighbor_data['srl']

	    figure = make_subplots(
	        rows=2, cols=2,
	        subplot_titles=[
	            f"Alpha Values ({lens_tag} lens)",
	            f"SRL Values ({lens_tag} lens)",
	            "Alpha vs SRL Correlation",
	            "Population Context"
	        ]
	    )

	    # (trace, row, col) in drawing order; primary is red, neighbor is blue.
	    placements = [
	        # Alpha comparison
	        (go.Scatter(
	            x=[0], y=[p_alpha],
	            mode='markers', marker=dict(size=15, color='red'),
	            name=f"Primary: {primary_data['label']}", showlegend=True
	        ), 1, 1),
	        (go.Scatter(
	            x=[1], y=[n_alpha],
	            mode='markers', marker=dict(size=15, color='blue'),
	            name=f"Neighbor: {neighbor_data['label']}", showlegend=True
	        ), 1, 1),
	        # SRL comparison
	        (go.Scatter(
	            x=[0], y=[p_srl],
	            mode='markers', marker=dict(size=15, color='red'),
	            showlegend=False
	        ), 1, 2),
	        (go.Scatter(
	            x=[1], y=[n_srl],
	            mode='markers', marker=dict(size=15, color='blue'),
	            showlegend=False
	        ), 1, 2),
	        # Alpha vs SRL scatter
	        (go.Scatter(
	            x=[p_alpha], y=[p_srl],
	            mode='markers', marker=dict(size=20, color='red'),
	            name="Primary α-SRL", showlegend=False
	        ), 2, 1),
	        (go.Scatter(
	            x=[n_alpha], y=[n_srl],
	            mode='markers', marker=dict(size=20, color='blue'),
	            name="Neighbor α-SRL", showlegend=False
	        ), 2, 1),
	        # Distance visualization
	        (go.Scatter(
	            x=[p_alpha, n_alpha],
	            y=[p_srl, n_srl],
	            mode='lines+markers',
	            line=dict(color='purple', width=3, dash='dash'),
	            marker=dict(size=10, color=['red', 'blue']),
	            name="Euclidean Distance", showlegend=False
	        ), 2, 2),
	    ]
	    for trace, row_no, col_no in placements:
	        figure.add_trace(trace, row=row_no, col=col_no)

	    figure.update_layout(
	        title=f"Scientific CMT Diagnostic Analysis - {lens_tag} Lens",
	        height=600,
	        paper_bgcolor='white',
	        plot_bgcolor='white'
	    )

	    # (row, col) -> (x-axis title, y-axis title)
	    axis_titles = {
	        (1, 1): ("Sample", "Alpha Value"),
	        (1, 2): ("Sample", "SRL Value"),
	        (2, 1): ("Alpha", "SRL"),
	        (2, 2): ("Alpha", "SRL"),
	    }
	    for (row_no, col_no), (x_title, y_title) in axis_titles.items():
	        figure.update_xaxes(title_text=x_title, row=row_no, col=col_no)
	        figure.update_yaxes(title_text=y_title, row=row_no, col=col_no)

	    return figure

	def _failure_outputs(title, analysis_text="No analysis available",
	                     stats_text="No statistics available"):
	    """Build the (figure, primary, neighbor, stats) outputs shown when analysis cannot run."""
	    return (
	        go.Figure(layout={"title": title}),
	        title,
	        analysis_text,
	        stats_text
	    )


	def update_scientific_analysis(species, primary_file, neighbor_file, lens):
	    """Main analysis function using only real data and rigorous statistics.

	    Args:
	        species: Value matched against the ``source`` column of the
	            module-level ``df_combined``.
	        primary_file: Value matched against its ``filepath`` column.
	        neighbor_file: Not used; the comparison sample is always the nearest
	            opposite-source neighbor.  Kept so the call signature is stable.
	        lens: Lens name selecting the diagnostic columns.

	    Returns:
	        A 4-tuple ``(figure, primary_html, neighbor_html, stats_html)``.  On
	        any failure the figure is empty apart from a title and the three
	        strings carry the corresponding message; exceptions are reported the
	        same way rather than propagated.
	    """
	    # Local import keeps this block self-contained; CSV-derived text is
	    # escaped before being embedded in HTML so characters such as & or < in a
	    # file name or label cannot break the markup.
	    from html import escape

	    try:
	        # Get rows from real data (the filter is evaluated once)
	        matches = df_combined[
	            (df_combined["filepath"] == primary_file) &
	            (df_combined["source"] == species)
	        ]
	        if len(matches) == 0:
	            return _failure_outputs("Primary sample not found")
	        primary_row = matches.iloc[0]

	        # Find neighbor
	        neighbor_result = find_nearest_neighbor_scientific(primary_row, df_combined, lens)
	        if neighbor_result is None:
	            return _failure_outputs("No valid neighbor found")
	        neighbor_row, distance = neighbor_result

	        # Get real CMT data
	        primary_cmt = get_real_cmt_diagnostics(primary_row, lens)
	        neighbor_cmt = get_real_cmt_diagnostics(neighbor_row, lens)
	        if not primary_cmt or not neighbor_cmt:
	            return _failure_outputs("Invalid CMT data")

	        # Create scientific visualization
	        diagnostic_fig = create_scientific_diagnostic_plot(primary_cmt, neighbor_cmt, lens)

	        # Calculate statistics
	        stats_results = calculate_statistical_significance(
	            primary_cmt, neighbor_cmt, df_combined, lens
	        )

	        # Build information panels
	        primary_info = f"""
	<h4>📊 <b>Primary Sample</b></h4>
	<div style="background: rgba(240,240,250,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
	<p><b>File:</b> {escape(str(primary_cmt['filepath']))}</p>
	<p><b>Species:</b> {escape(str(primary_cmt['source']))}</p>
	<p><b>Label:</b> {escape(str(primary_cmt['label']))}</p>
	<p><b>CMT α ({lens}):</b> {primary_cmt['alpha']:.6f}</p>
	<p><b>CMT SRL ({lens}):</b> {primary_cmt['srl']:.6f}</p>
	</div>
	"""

	        neighbor_info = f"""
	<h4>🔗 <b>Nearest Neighbor</b></h4>
	<div style="background: rgba(240,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
	<p><b>File:</b> {escape(str(neighbor_cmt['filepath']))}</p>
	<p><b>Species:</b> {escape(str(neighbor_cmt['source']))}</p>
	<p><b>Label:</b> {escape(str(neighbor_cmt['label']))}</p>
	<p><b>CMT α ({lens}):</b> {neighbor_cmt['alpha']:.6f}</p>
	<p><b>CMT SRL ({lens}):</b> {neighbor_cmt['srl']:.6f}</p>
	<p><b>Distance:</b> {distance:.6f}</p>
	</div>
	"""

	        if 'error' not in stats_results:
	            stats_info = f"""
	<h4>🔬 <b>Statistical Analysis</b></h4>
	<div style="background: rgba(250,250,240,1); padding: 10px; border-radius: 8px; margin: 5px 0; color: black;">
	<p><b>Alpha t-test:</b> t = {stats_results['alpha_ttest_statistic']:.4f}, p = {stats_results['alpha_ttest_pvalue']:.6f}</p>
	<p><b>SRL t-test:</b> t = {stats_results['srl_ttest_statistic']:.4f}, p = {stats_results['srl_ttest_pvalue']:.6f}</p>
	<p><b>Effect Sizes (Cohen's d):</b></p>
	<p>• Alpha: {stats_results['alpha_effect_size']:.4f}</p>
	<p>• SRL: {stats_results['srl_effect_size']:.4f}</p>
	<p><b>Population Sizes:</b> {stats_results['primary_population_size']} vs {stats_results['neighbor_population_size']}</p>
	<p><b>Statistical Significance:</b></p>
	<p>• Alpha: {'Significant' if stats_results['alpha_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
	<p>• SRL: {'Significant' if stats_results['srl_ttest_pvalue'] < 0.05 else 'Not significant'}</p>
	</div>
	"""
	        else:
	            stats_info = f"<p>Statistical analysis failed: {stats_results['error']}</p>"

	        return diagnostic_fig, primary_info, neighbor_info, stats_info

	    except Exception as e:
	        # UI callback boundary: report the failure in every output instead of
	        # letting the exception escape into the event handler.
	        error_msg = f"Analysis error: {str(e)}"
	        return _failure_outputs(error_msg, error_msg, error_msg)

	# ---------------------------------------------------------------
	# Gradio Interface
	# ---------------------------------------------------------------
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")) as demo:
	    gr.Markdown("""
	# 🔬 Scientific CMT Diagnostic Analysis Engine
	Rigorous statistical analysis of real CMT transformation results

	## ⚠️ SCIENTIFIC INTEGRITY NOTICE ⚠️
	This interface uses ONLY real preprocessed CMT data with NO synthetic generation, interpolation, or speculation.

	What you see:
	- ✅ Real CMT diagnostic values (α, SRL) from actual transformations
	- ✅ Mathematically rigorous distance measures (Euclidean distance)
	- ✅ Proper statistical testing (t-tests, effect sizes, percentiles)
	- ✅ Scientific hypothesis testing with p-values and confidence measures

	What was REMOVED for scientific rigor:
	- ❌ Synthetic holographic field generation
	- ❌ Cubic interpolation of non-existent data
	- ❌ Speculative similarity metrics
	- ❌ Confirmation bias in pattern detection
	- ❌ Ungrounded "communication bridge" calculations
	""")

	    # The species dropdown defaults to "Dog", so the sample dropdown starts
	    # with the dog files.  Computed once instead of once per use.
	    dog_files = df_combined[df_combined["source"] == "Dog"]["filepath"].tolist()

	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### 🔬 Analysis Controls")

	            species_selection = gr.Dropdown(
	                label="Species",
	                choices=["Dog", "Human"],
	                value="Dog",
	                info="Select primary species for analysis"
	            )

	            lens_selection = gr.Dropdown(
	                label="Mathematical Lens",
	                choices=["gamma", "zeta", "airy", "bessel"],
	                value="gamma",
	                info="CMT lens function used for analysis"
	            )

	            primary_file_selection = gr.Dropdown(
	                label="Primary Sample",
	                choices=dog_files,
	                value=dog_files[0] if dog_files else "",
	                info="Select specific sample for analysis"
	            )

	            neighbor_file_selection = gr.Dropdown(
	                label="Comparison Sample",
	                choices=[],
	                value="",
	                info="Nearest neighbor will be automatically found"
	            )

	        with gr.Column(scale=2):
	            diagnostic_plot = gr.Plot(label="Scientific Diagnostic Analysis")

	    with gr.Row():
	        with gr.Column():
	            primary_info_display = gr.HTML(label="Primary Sample Analysis")
	        with gr.Column():
	            neighbor_info_display = gr.HTML(label="Neighbor Analysis")
	        with gr.Column():
	            stats_info_display = gr.HTML(label="Statistical Results")

	    # Update file choices when species changes
	    def update_file_choices(species):
	        """Return the sample dropdown repopulated with the files of `species`."""
	        choices = df_combined[df_combined["source"] == species]["filepath"].tolist()
	        return gr.Dropdown(choices=choices, value=choices[0] if choices else "")

	    analysis_inputs = [
	        species_selection, primary_file_selection,
	        neighbor_file_selection, lens_selection,
	    ]
	    analysis_outputs = [
	        diagnostic_plot, primary_info_display,
	        neighbor_info_display, stats_info_display,
	    ]

	    # A species change must refresh the sample list BEFORE the analysis
	    # runs.  Binding both handlers directly to the same change event ran the
	    # analysis with the new species but the previous species' file, which
	    # produced "Primary sample not found" and could overwrite the correct
	    # result if it finished last.  Chaining with .then() orders the two.
	    species_selection.change(
	        fn=update_file_choices,
	        inputs=[species_selection],
	        outputs=[primary_file_selection]
	    ).then(
	        fn=update_scientific_analysis,
	        inputs=analysis_inputs,
	        outputs=analysis_outputs
	    )

	    # Main analysis update for the remaining controls
	    for input_component in [primary_file_selection, lens_selection]:
	        input_component.change(
	            fn=update_scientific_analysis,
	            inputs=analysis_inputs,
	            outputs=analysis_outputs
	        )

	print("🔬 Scientific CMT Diagnostic Analysis Engine Ready!")

	if __name__ == "__main__":
	    demo.launch(share=False, debug=False)