Spaces:
Paused
Paused
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
def render_dataset_statistics(dataset, dataset_type):
    """
    Render summary, distribution, and correlation statistics for a dataset.

    The output is written to the current Streamlit page as three tabs
    ("Summary Statistics", "Distribution Analysis", "Correlation Analysis").
    Nothing is returned.

    Args:
        dataset: The dataset to analyze (pandas DataFrame). When ``None`` a
            warning is shown and nothing else is rendered.
        dataset_type: The type of dataset (csv, json, etc.). Not used by the
            rendering logic; kept so existing callers keep working.
    """
    if dataset is None:
        st.warning("No dataset to analyze.")
        return

    st.markdown("<h3>Dataset Statistics</h3>", unsafe_allow_html=True)

    # Split the columns once; every tab works from the same two lists.
    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = dataset.select_dtypes(exclude=[np.number]).columns.tolist()

    # Tabs for different kinds of statistics
    summary_tab, distribution_tab, correlation_tab = st.tabs(
        ["Summary Statistics", "Distribution Analysis", "Correlation Analysis"]
    )
    with summary_tab:
        _render_summary_tab(dataset, numeric_cols, categorical_cols)
    with distribution_tab:
        _render_distribution_tab(dataset, numeric_cols)
    with correlation_tab:
        _render_correlation_tab(dataset, numeric_cols)


def _render_summary_tab(dataset, numeric_cols, categorical_cols):
    """Render describe() statistics and the top values of one categorical column."""
    st.markdown("### Summary Statistics")

    if numeric_cols:
        summary = dataset[numeric_cols].describe().T
        st.dataframe(
            summary.style.highlight_max(axis=1, color='#FFD21E'),
            use_container_width=True,
        )
    else:
        st.warning("No numeric columns found in the dataset.")

    # Category counts do not depend on numeric columns, so they are rendered
    # even for datasets that are entirely categorical.
    if not categorical_cols:
        return

    st.markdown("### Category Value Counts")
    selected_cat_col = st.selectbox("Select categorical column", categorical_cols)

    value_counts = _top_value_counts(dataset[selected_cat_col])
    fig = px.bar(
        x=value_counts.index,
        y=value_counts.values,
        title=f"Top 10 values in {selected_cat_col}",
        labels={"x": selected_cat_col, "y": "Count"},
        color_discrete_sequence=["#2563EB"],
    )
    st.plotly_chart(fig, use_container_width=True)


def _top_value_counts(column, limit=10):
    """Return the ``limit`` most frequent values of ``column`` with their counts."""
    try:
        counts = column.value_counts()
    except TypeError:
        # Cells holding unhashable objects (e.g. lists or dicts) cannot be
        # counted directly; count their string form instead. Nulls are dropped
        # first so they are excluded, as value_counts() does by default.
        counts = column.dropna().astype(str).value_counts()
    return counts.head(limit)


def _render_distribution_tab(dataset, numeric_cols):
    """Render a histogram and headline statistics for one numeric column."""
    st.markdown("### Distribution Analysis")

    if not numeric_cols:
        st.warning("No numeric columns found in the dataset.")
        return

    selected_num_col = st.selectbox("Select numeric column", numeric_cols)

    # Create distribution plot
    fig = px.histogram(
        dataset,
        x=selected_num_col,
        title=f"Distribution of {selected_num_col}",
        marginal="box",
        color_discrete_sequence=["#FFD21E"],
        template="simple_white",
    )
    st.plotly_chart(fig, use_container_width=True)

    # Basic distribution stats, one metric per layout column.
    values = dataset[selected_num_col]
    metrics = [
        ("Mean", values.mean()),
        ("Median", values.median()),
        ("Min", values.min()),
        ("Max", values.max()),
    ]
    for slot, (label, value) in zip(st.columns(len(metrics)), metrics):
        with slot:
            st.metric(label, f"{value:.2f}")


def _render_correlation_tab(dataset, numeric_cols):
    """Render the correlation heatmap, the strongest pairs, and a scatter plot."""
    st.markdown("### Correlation Analysis")

    if len(numeric_cols) <= 1:
        st.warning("Need at least two numeric columns for correlation analysis.")
        return

    corr_matrix = dataset[numeric_cols].corr()

    # Fix the colour range to [-1, 1] so the white midpoint of the diverging
    # scale always corresponds to zero correlation.
    fig = px.imshow(
        corr_matrix,
        zmin=-1,
        zmax=1,
        color_continuous_scale=["#84919A", "#FFFFFF", "#FFD21E"],
        title="Correlation Matrix",
        template="simple_white",
    )
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("### Top Correlated Features")

    corr_pairs = _rank_correlated_pairs(corr_matrix)
    if not corr_pairs:
        st.info("No valid correlations could be computed for the numeric columns.")
        return

    # Display top 10 correlated pairs
    top_pairs = pd.DataFrame(
        corr_pairs[:10], columns=["Feature 1", "Feature 2", "Correlation"]
    )
    st.dataframe(
        top_pairs.style.format({
            "Correlation": "{:.4f}"
        }).background_gradient(subset=["Correlation"], cmap="coolwarm"),
        use_container_width=True,
    )

    # Scatter plot for the top correlated pair
    feature_x, feature_y, strongest = corr_pairs[0]
    fig = px.scatter(
        dataset,
        x=feature_x,
        y=feature_y,
        title=f"Scatter plot: {feature_x} vs {feature_y} (Corr: {strongest:.4f})",
        color_discrete_sequence=["#2563EB"],
        template="simple_white",
    )

    # Only add the "Best Fit" trace when a line can actually be computed, so
    # the legend never advertises a line that is not drawn.
    fit = _best_fit_line(dataset, feature_x, feature_y)
    if fit is not None:
        fit_x, fit_y = fit
        fig.add_traces(
            go.Scatter(
                x=fit_x,
                y=fit_y,
                mode='lines',
                line=dict(color="#FFD21E", width=3),
                name='Best Fit',
            )
        )
    st.plotly_chart(fig, use_container_width=True)


def _rank_correlated_pairs(corr_matrix):
    """Return (feature_1, feature_2, corr) tuples sorted by descending |corr|.

    Each unordered pair of distinct columns appears once. Pairs whose
    correlation is NaN are dropped because they cannot be ranked.
    """
    names = list(corr_matrix.columns)
    values = corr_matrix.to_numpy()

    pairs = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            corr_value = values[i, j]
            if not pd.isna(corr_value):
                pairs.append((first, names[j], corr_value))

    # list.sort is stable, so ties keep their column order.
    pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
    return pairs


def _best_fit_line(dataset, x_col, y_col):
    """Return (x, y) end points of the least-squares line, or None if undefined."""
    points = dataset[[x_col, y_col]].dropna()
    x_vals = points[x_col].to_numpy(dtype=float)
    y_vals = points[y_col].to_numpy(dtype=float)

    # Infinite values survive dropna() but would break the fit.
    finite = np.isfinite(x_vals) & np.isfinite(y_vals)
    x_vals = x_vals[finite]
    y_vals = y_vals[finite]

    # A line needs at least two points with distinct x values.
    if x_vals.size < 2 or x_vals.min() == x_vals.max():
        return None

    slope, intercept = np.polyfit(x_vals, y_vals, 1)
    x_ends = np.array([x_vals.min(), x_vals.max()])
    return x_ends, slope * x_ends + intercept