Spaces:
Running
Running
File size: 6,051 Bytes
43b66f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
def _category_value_counts(column, limit=10):
    """Return the ``limit`` most frequent values of ``column`` with their counts.

    ``Series.value_counts`` hashes every value, so it raises ``TypeError``
    when a cell holds an unhashable object (a list or dict, for example).
    In that case the string representation of each value is counted instead
    so the chart can still be drawn.
    """
    try:
        counts = column.value_counts()
    except TypeError:
        counts = column.astype(str).value_counts()
    return counts.head(limit)


def _top_correlated_pairs(corr_matrix, limit=10):
    """Return up to ``limit`` ``(feature_a, feature_b, corr)`` tuples.

    Only the upper triangle of ``corr_matrix`` is visited, so each unordered
    pair appears once and the diagonal is skipped. Pairs whose correlation is
    NaN (e.g. one column is constant) are dropped: they carry no information
    and would make the sort by absolute value unreliable.
    """
    names = list(corr_matrix.columns)
    pairs = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            # Positional access keeps this correct even with duplicate labels.
            value = corr_matrix.iloc[i, j]
            if pd.notna(value):
                pairs.append((first, names[j], value))
    pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
    return pairs[:limit]


def _best_fit_line(x_series, y_series):
    """Return the end points of the least-squares line through the data.

    Returns:
        ``(x_ends, y_ends)`` as two length-2 arrays spanning the observed
        x range, or ``None`` when a line cannot be fitted (values that cannot
        be converted to float, fewer than two complete points, or no spread
        in x).
    """
    try:
        x = x_series.to_numpy(dtype=float, na_value=np.nan)
        y = y_series.to_numpy(dtype=float, na_value=np.nan)
    except (TypeError, ValueError):
        return None

    # Fit only on rows where both coordinates are present and finite.
    usable = np.isfinite(x) & np.isfinite(y)
    x, y = x[usable], y[usable]
    if x.size < 2 or x.min() == x.max():
        return None

    slope, intercept = np.polyfit(x, y, 1)
    x_ends = np.array([x.min(), x.max()])
    return x_ends, slope * x_ends + intercept


def _render_summary_tab(dataset, numeric_cols):
    """Render describe() statistics and the categorical value-count chart."""
    st.markdown("### Summary Statistics")

    if numeric_cols:
        summary = dataset[numeric_cols].describe().T
        st.dataframe(
            summary.style.highlight_max(axis=1, color='#FFD21E'),
            use_container_width=True,
        )
    else:
        st.warning("No numeric columns found in the dataset.")

    # Categorical counts do not depend on numeric columns, so they are shown
    # even when the dataset has no numeric data at all.
    categorical_cols = dataset.select_dtypes(exclude=[np.number]).columns.tolist()
    if not categorical_cols:
        return

    st.markdown("### Category Value Counts")
    selected_cat_col = st.selectbox("Select categorical column", categorical_cols)

    value_counts = _category_value_counts(dataset[selected_cat_col])
    fig = px.bar(
        x=value_counts.index,
        y=value_counts.values,
        title=f"Top 10 values in {selected_cat_col}",
        labels={"x": selected_cat_col, "y": "Count"},
        color_discrete_sequence=["#2563EB"],
    )
    st.plotly_chart(fig, use_container_width=True)


def _render_distribution_tab(dataset, numeric_cols):
    """Render a histogram and headline metrics for one numeric column."""
    st.markdown("### Distribution Analysis")

    if not numeric_cols:
        st.warning("No numeric columns found in the dataset.")
        return

    selected_num_col = st.selectbox("Select numeric column", numeric_cols)

    fig = px.histogram(
        dataset,
        x=selected_num_col,
        title=f"Distribution of {selected_num_col}",
        marginal="box",
        color_discrete_sequence=["#FFD21E"],
        template="simple_white",
    )
    st.plotly_chart(fig, use_container_width=True)

    # One metric per layout column: mean, median, min, max.
    values = dataset[selected_num_col]
    metrics = (
        ("Mean", values.mean()),
        ("Median", values.median()),
        ("Min", values.min()),
        ("Max", values.max()),
    )
    for container, (label, value) in zip(st.columns(4), metrics):
        with container:
            st.metric(label, f"{value:.2f}")


def _render_correlation_tab(dataset, numeric_cols):
    """Render the correlation heatmap, top pairs table and top-pair scatter."""
    st.markdown("### Correlation Analysis")

    if len(numeric_cols) < 2:
        st.warning("Need at least two numeric columns for correlation analysis.")
        return

    corr_matrix = dataset[numeric_cols].corr()

    fig = px.imshow(
        corr_matrix,
        color_continuous_scale=["#84919A", "#FFFFFF", "#FFD21E"],
        title="Correlation Matrix",
        template="simple_white",
    )
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("### Top Correlated Features")

    corr_pairs = _top_correlated_pairs(corr_matrix, limit=10)
    if not corr_pairs:
        # Every pairwise correlation was NaN (e.g. all columns constant).
        st.info("No valid pairwise correlations could be computed.")
        return

    top_pairs = pd.DataFrame(corr_pairs, columns=["Feature 1", "Feature 2", "Correlation"])
    st.dataframe(
        top_pairs.style.format({
            "Correlation": "{:.4f}"
        }).background_gradient(subset=["Correlation"], cmap="coolwarm"),
        use_container_width=True,
    )

    # Scatter plot for the most strongly correlated pair.
    feature_x, feature_y, corr_value = corr_pairs[0]
    fig = px.scatter(
        dataset,
        x=feature_x,
        y=feature_y,
        title=f"Scatter plot: {feature_x} vs {feature_y} (Corr: {corr_value:.4f})",
        color_discrete_sequence=["#2563EB"],
        template="simple_white",
    )

    # Overlay the actual least-squares line; skip it when no fit is possible.
    fit = _best_fit_line(dataset[feature_x], dataset[feature_y])
    if fit is not None:
        fit_x, fit_y = fit
        fig.add_trace(
            go.Scatter(
                x=fit_x,
                y=fit_y,
                mode='lines',
                line=dict(color="#FFD21E", width=3),
                name='Best Fit',
            )
        )
    st.plotly_chart(fig, use_container_width=True)


def render_dataset_statistics(dataset, dataset_type):
    """
    Renders statistical analysis of the dataset in three Streamlit tabs.

    The tabs are "Summary Statistics" (describe() table plus a bar chart of
    the most frequent values of a chosen non-numeric column), "Distribution
    Analysis" (histogram and mean/median/min/max of a chosen numeric column)
    and "Correlation Analysis" (heatmap, strongest pairs and a scatter plot
    with a fitted line for the strongest pair).

    Args:
        dataset: The dataset to analyze (pandas DataFrame). When ``None`` a
            warning is shown and nothing else is rendered.
        dataset_type: The type of dataset (csv, json, etc.). Accepted for
            interface compatibility; it is not used by this function.

    Returns:
        None. All output is written to the Streamlit page.
    """
    if dataset is None:
        st.warning("No dataset to analyze.")
        return

    st.markdown("<h3>Dataset Statistics</h3>", unsafe_allow_html=True)

    # Computed once and shared by all three tabs.
    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()

    summary_tab, distribution_tab, correlation_tab = st.tabs(
        ["Summary Statistics", "Distribution Analysis", "Correlation Analysis"]
    )

    with summary_tab:
        _render_summary_tab(dataset, numeric_cols)

    with distribution_tab:
        _render_distribution_tab(dataset, numeric_cols)

    with correlation_tab:
        _render_correlation_tab(dataset, numeric_cols)
|