Spaces:

whackthejacker
/

DataHubHub

Paused

App Files Files Community

DataHubHub / components /dataset_statistics.py

whackthejacker

Upload 34 files

43b66f1 verified 7 months ago

raw

history blame

6.05 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go

	def render_dataset_statistics(dataset, dataset_type):
	"""
	Renders statistical analysis of the dataset.

	Args:
	dataset: The dataset to analyze (pandas DataFrame)
	dataset_type: The type of dataset (csv, json, etc.)
	"""
	if dataset is None:
	st.warning("No dataset to analyze.")
	return

	st.markdown("<h3>Dataset Statistics</h3>", unsafe_allow_html=True)

	# Tabs for different kinds of statistics
	tab1, tab2, tab3 = st.tabs(["Summary Statistics", "Distribution Analysis", "Correlation Analysis"])

	with tab1:
	# Summary statistics
	st.markdown("### Summary Statistics")

	# Filter only numeric columns for statistics
	numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()

	if numeric_cols:
	# Display summary statistics
	st.dataframe(dataset[numeric_cols].describe().T.style.highlight_max(axis=1, color='#FFD21E'), use_container_width=True)

	# Top values for categorical columns
	categorical_cols = dataset.select_dtypes(exclude=[np.number]).columns.tolist()
	if categorical_cols:
	st.markdown("### Category Value Counts")
	selected_cat_col = st.selectbox("Select categorical column", categorical_cols)

	# Show top values and their counts
	value_counts = dataset[selected_cat_col].value_counts().head(10)
	fig = px.bar(
	x=value_counts.index,
	y=value_counts.values,
	title=f"Top 10 values in {selected_cat_col}",
	labels={"x": selected_cat_col, "y": "Count"},
	color_discrete_sequence=["#2563EB"]
	)
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.warning("No numeric columns found in the dataset.")

	with tab2:
	# Distribution analysis
	st.markdown("### Distribution Analysis")

	if numeric_cols:
	selected_num_col = st.selectbox("Select numeric column", numeric_cols)

	# Create distribution plot
	fig = px.histogram(
	dataset,
	x=selected_num_col,
	title=f"Distribution of {selected_num_col}",
	marginal="box",
	color_discrete_sequence=["#FFD21E"],
	template="simple_white"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Basic distribution stats
	col1, col2, col3, col4 = st.columns(4)
	with col1:
	st.metric("Mean", f"{dataset[selected_num_col].mean():.2f}")
	with col2:
	st.metric("Median", f"{dataset[selected_num_col].median():.2f}")
	with col3:
	st.metric("Min", f"{dataset[selected_num_col].min():.2f}")
	with col4:
	st.metric("Max", f"{dataset[selected_num_col].max():.2f}")
	else:
	st.warning("No numeric columns found in the dataset.")

	with tab3:
	# Correlation analysis
	st.markdown("### Correlation Analysis")

	if len(numeric_cols) > 1:
	# Compute correlation matrix
	corr_matrix = dataset[numeric_cols].corr()

	# Plot heatmap
	fig = px.imshow(
	corr_matrix,
	color_continuous_scale=["#84919A", "#FFFFFF", "#FFD21E"],
	title="Correlation Matrix",
	template="simple_white"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Top correlated features
	st.markdown("### Top Correlated Features")

	# Convert correlation matrix to a long format
	corr_pairs = []
	for i in range(len(corr_matrix.columns)):
	for j in range(i+1, len(corr_matrix.columns)):
	col1 = corr_matrix.columns[i]
	col2 = corr_matrix.columns[j]
	corr_value = corr_matrix.iloc[i, j]
	corr_pairs.append((col1, col2, corr_value))

	# Sort by absolute correlation
	corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)

	# Display top 10 correlated pairs
	if corr_pairs:
	top_pairs = pd.DataFrame(corr_pairs[:10], columns=["Feature 1", "Feature 2", "Correlation"])
	st.dataframe(
	top_pairs.style.format({
	"Correlation": "{:.4f}"
	}).background_gradient(subset=["Correlation"], cmap="coolwarm"),
	use_container_width=True
	)

	# Scatter plot for the top correlated pair
	if corr_pairs:
	top_pair = corr_pairs[0]
	fig = px.scatter(
	dataset,
	x=top_pair[0],
	y=top_pair[1],
	title=f"Scatter plot: {top_pair[0]} vs {top_pair[1]} (Corr: {top_pair[2]:.4f})",
	color_discrete_sequence=["#2563EB"],
	template="simple_white"
	)
	fig.add_traces(
	go.Scatter(
	x=[None],
	y=[None],
	mode='lines',
	line=dict(color="#FFD21E", width=3),
	name='Best Fit'
	)
	)
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.warning("Need at least two numeric columns for correlation analysis.")