File size: 6,051 Bytes
43b66f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

def _rank_correlation_pairs(corr_matrix):
    """Return unique column pairs of a correlation matrix, strongest first.

    Only the upper triangle is visited so each pair appears once and the
    diagonal (self-correlation) is excluded. Pairs whose correlation is NaN
    (e.g. one column is constant) are dropped: NaN has no meaningful ordering
    and would otherwise make the "top" pair arbitrary.

    Args:
        corr_matrix: Square pandas DataFrame as returned by ``DataFrame.corr()``.

    Returns:
        List of ``(feature_1, feature_2, correlation)`` tuples sorted by
        absolute correlation in descending order.
    """
    columns = list(corr_matrix.columns)
    pairs = []
    for i, first in enumerate(columns):
        for j in range(i + 1, len(columns)):
            value = corr_matrix.iloc[i, j]
            if pd.notna(value):
                pairs.append((first, columns[j], float(value)))
    pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
    return pairs


def _render_summary_tab(dataset, numeric_cols, categorical_cols):
    """Render describe() output for numeric columns and top values for a categorical column."""
    st.markdown("### Summary Statistics")

    if numeric_cols:
        st.dataframe(
            dataset[numeric_cols].describe().T.style.highlight_max(axis=1, color='#FFD21E'),
            use_container_width=True
        )
    else:
        st.warning("No numeric columns found in the dataset.")

    # Shown independently of the numeric branch so that datasets made up only
    # of categorical columns still get a value-count chart.
    if categorical_cols:
        st.markdown("### Category Value Counts")
        selected_cat_col = st.selectbox("Select categorical column", categorical_cols)

        try:
            value_counts = dataset[selected_cat_col].value_counts().head(10)
        except TypeError:
            # Unhashable cell values (lists/dicts) cannot be counted directly;
            # count their string representation instead.
            value_counts = dataset[selected_cat_col].astype(str).value_counts().head(10)

        fig = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            title=f"Top 10 values in {selected_cat_col}",
            labels={"x": selected_cat_col, "y": "Count"},
            color_discrete_sequence=["#2563EB"]
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_distribution_tab(dataset, numeric_cols):
    """Render a histogram with box-plot marginal and basic metrics for one numeric column."""
    st.markdown("### Distribution Analysis")

    if not numeric_cols:
        st.warning("No numeric columns found in the dataset.")
        return

    selected_num_col = st.selectbox("Select numeric column", numeric_cols)

    fig = px.histogram(
        dataset,
        x=selected_num_col,
        title=f"Distribution of {selected_num_col}",
        marginal="box",
        color_discrete_sequence=["#FFD21E"],
        template="simple_white"
    )
    st.plotly_chart(fig, use_container_width=True)

    # One metric per layout column: Mean, Median, Min, Max.
    series = dataset[selected_num_col]
    metrics = [
        ("Mean", series.mean()),
        ("Median", series.median()),
        ("Min", series.min()),
        ("Max", series.max()),
    ]
    for container, (label, value) in zip(st.columns(4), metrics):
        with container:
            st.metric(label, f"{value:.2f}")


def _render_correlation_tab(dataset, numeric_cols):
    """Render the correlation heatmap, the top correlated pairs and a scatter of the strongest pair."""
    st.markdown("### Correlation Analysis")

    if len(numeric_cols) <= 1:
        st.warning("Need at least two numeric columns for correlation analysis.")
        return

    corr_matrix = dataset[numeric_cols].corr()

    fig = px.imshow(
        corr_matrix,
        color_continuous_scale=["#84919A", "#FFFFFF", "#FFD21E"],
        title="Correlation Matrix",
        template="simple_white"
    )
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("### Top Correlated Features")

    corr_pairs = _rank_correlation_pairs(corr_matrix)
    if not corr_pairs:
        st.info("No valid correlations could be computed for the numeric columns.")
        return

    top_pairs = pd.DataFrame(corr_pairs[:10], columns=["Feature 1", "Feature 2", "Correlation"])
    st.dataframe(
        top_pairs.style.format({
            "Correlation": "{:.4f}"
        }).background_gradient(subset=["Correlation"], cmap="coolwarm"),
        use_container_width=True
    )

    # Scatter plot for the most strongly correlated pair.
    x_col, y_col, top_corr = corr_pairs[0]
    fig = px.scatter(
        dataset,
        x=x_col,
        y=y_col,
        title=f"Scatter plot: {x_col} vs {y_col} (Corr: {top_corr:.4f})",
        color_discrete_sequence=["#2563EB"],
        template="simple_white"
    )

    # Least-squares line over rows where both values are present. Skipped when
    # fewer than two points remain or x has no spread (fit would be undefined).
    complete = dataset[[x_col, y_col]].dropna()
    x_vals = complete[x_col].to_numpy(dtype=float)
    y_vals = complete[y_col].to_numpy(dtype=float)
    if len(x_vals) >= 2 and x_vals.max() > x_vals.min():
        slope, intercept = np.polyfit(x_vals, y_vals, 1)
        x_ends = [float(x_vals.min()), float(x_vals.max())]
        fig.add_trace(
            go.Scatter(
                x=x_ends,
                y=[slope * x + intercept for x in x_ends],
                mode='lines',
                line=dict(color="#FFD21E", width=3),
                name='Best Fit'
            )
        )
    st.plotly_chart(fig, use_container_width=True)


def render_dataset_statistics(dataset, dataset_type):
    """
    Renders statistical analysis of the dataset.

    Draws three Streamlit tabs: summary statistics (plus value counts for a
    selected non-numeric column), the distribution of a selected numeric
    column, and a correlation analysis across numeric columns.

    Args:
        dataset: The dataset to analyze (pandas DataFrame). If None, a warning
            is shown and nothing else is rendered.
        dataset_type: The type of dataset (csv, json, etc.). Not used by this
            function; kept so existing callers keep working.
    """
    if dataset is None:
        st.warning("No dataset to analyze.")
        return

    st.markdown("<h3>Dataset Statistics</h3>", unsafe_allow_html=True)

    # Column partitions are computed once and shared by all tabs.
    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = dataset.select_dtypes(exclude=[np.number]).columns.tolist()

    # Tabs for different kinds of statistics
    summary_tab, distribution_tab, correlation_tab = st.tabs(
        ["Summary Statistics", "Distribution Analysis", "Correlation Analysis"]
    )

    with summary_tab:
        _render_summary_tab(dataset, numeric_cols, categorical_cols)

    with distribution_tab:
        _render_distribution_tab(dataset, numeric_cols)

    with correlation_tab:
        _render_correlation_tab(dataset, numeric_cols)