Spaces:

AMKhakbaz
/

AMKAPP

Running

File size: 15,655 Bytes

import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import streamlit as st

def is_matching_pattern(column, prefix):
    """
    Checks if the column name matches the pattern: prefix-number (1 to 3 digits).
    
    Parameters:
    - column: The column name to check.
    - prefix: The prefix to match.
    
    Returns:
    - True if matches, False otherwise.
    """
    if not column.startswith(prefix + '_'):
        return False
    suffix = column[len(prefix) + 1:]
    if 1 <= len(suffix) <= 3 and suffix.isdigit():
        return True
    return False

def analyze_single_column(df, column_name, type_option):
    """
    Function 1: Analyzes data based on the selected type.
    
    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column or prefix
    - type_option: 1 or 2
    
    Returns:
    - table: DataFrame containing frequency percentages
    - fig: Plotly Figure object
    """
    total_rows = len(df)
    
    if type_option == 1:
        if column_name not in df.columns:
            st.error(f"Column '{column_name}' not found in the Excel file.")
            return None, None
        # Calculate frequency
        frequency = df[column_name].value_counts(dropna=False).sort_index()
        percentage = (frequency / total_rows) * 100
        table = percentage.reset_index()
        table.columns = [column_name, 'Percentage']
        st.subheader("Frequency Table (Percentage):")
        st.dataframe(table)
        
        # Plot interactive histogram
        fig = go.Figure(data=[go.Bar(
            x=table[column_name].astype(str),
            y=table['Percentage'],
            marker=dict(color='rgba(173, 216, 230, 0.6)', 
                        line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
            width=0.4
        )])
        fig.update_layout(
            title=f"Histogram of '{column_name}'",
            xaxis_title=column_name,
            yaxis_title='Percentage',
            bargap=0.2,
            template='plotly_white'
        )
        st.plotly_chart(fig, use_container_width=True)
        return table, fig
        
    elif type_option == 2:
        # Column name pattern: name-number (1 to 3 digits)
        selected_columns = [col for col in df.columns if is_matching_pattern(col, column_name)]
        
        if not selected_columns:
            st.error(f"No columns matching the pattern '{column_name}-<number>' found in the Excel file.")
            return None, None
        
        results = {}
        for col in selected_columns:
            unique_values = df[col].dropna().unique()
            if len(unique_values) > 2:
                st.warning(f"Column '{col}' has more than two unique values. It was skipped.")
                continue
            value_counts = df[col].value_counts(dropna=False)
            percentage = (value_counts / total_rows) * 100
            for val, pct in percentage.items():
                results[val] = results.get(val, 0) + pct
        
        if not results:
            st.error("No valid columns with exactly two unique values were found.")
            return None, None
        
        # Sort the keys
        table = pd.DataFrame(list(results.items()), columns=[column_name, 'Percentage'])
        table = table.sort_values(by=column_name).reset_index(drop=True)
        st.subheader("Aggregated Frequency Table (Percentage):")
        st.dataframe(table)
        
        # Plot interactive histogram
        fig = go.Figure(data=[go.Bar(
            x=table[column_name].astype(str),
            y=table['Percentage'],
            marker=dict(color='rgba(173, 216, 230, 0.6)', 
                        line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
            width=0.4
        )])
        fig.update_layout(
            title=f"Histogram of Columns '{column_name}-<number>'",
            xaxis_title=column_name,
            yaxis_title='Percentage',
            bargap=0.2,
            template='plotly_white'
        )
        st.plotly_chart(fig, use_container_width=True)
        return table, fig
        
    else:
        st.error("The 'type_option' must be either 1 or 2.")
        return None, None

def analyze_multiple_columns(df, first_name, second_name, first_type, second_type):
    """
    Function 2: Analyzes data based on combinations of first and second types.
    
    Parameters:
    - df: pandas DataFrame
    - first_name: Name or prefix of the first column
    - second_name: Name or prefix of the second column
    - first_type: 1 or 2
    - second_type: 1 or 2
    
    Returns:
    - table: DataFrame containing contingency or frequency tables
    - figs: List of Plotly Figure objects
    """
    total_rows = len(df)
    
    # Helper functions to select columns based on type
    def select_columns_type1(name):
        if name in df.columns:
            return [name]
        else:
            st.error(f"Column '{name}' not found in the Excel file.")
            return []
    
    def select_columns_type2(name):
        selected = [col for col in df.columns if is_matching_pattern(col, name)]
        if not selected:
            st.error(f"No columns matching the pattern '{name}-<number>' found in the Excel file.")
        return selected
    
    # Select columns based on types
    first_columns = select_columns_type1(first_name) if first_type == 1 else select_columns_type2(first_name)
    second_columns = select_columns_type1(second_name) if second_type == 1 else select_columns_type2(second_name)
    
    if not first_columns or not second_columns:
        st.error("Column selection failed.")
        return None, None
    
    figs = []
    
    if first_type == 1 and second_type == 1:
        # Both types are 1
        if len(first_columns) != 1 or len(second_columns) != 1:
            st.error("When both first and second types are 1, exactly one column must be selected for each.")
            return None, None
        col1 = first_columns[0]
        col2 = second_columns[0]
        contingency = pd.crosstab(df[col1], df[col2], normalize='all') * 100
        st.subheader("Contingency Table (Percentage):")
        st.dataframe(contingency)
        
        # Plot interactive heatmap
        fig = go.Figure(data=go.Heatmap(
            z=contingency.values,
            x=contingency.columns,
            y=contingency.index,
            colorscale='Blues'
        ))
        fig.update_layout(
            title=f"Contingency Table between '{col1}' and '{col2}'",
            xaxis_title=col2,
            yaxis_title=col1,
            template='plotly_white'
        )
        st.plotly_chart(fig, use_container_width=True)
        figs.append(fig)
        
    elif first_type == 1 and second_type == 2:
        # First type is 1 and second type is 2
        if len(first_columns) != 1:
            st.error("When first type is 1, exactly one column must be selected.")
            return None, None
        col1 = first_columns[0]
        col2_list = second_columns
        unique_values_col1 = df[col1].dropna().unique()
        
        results = {}
        for val in unique_values_col1:
            filter_df = df[df[col1] == val]
            freq_table = {}
            for col2 in col2_list:
                frequency = filter_df[col2].value_counts(dropna=False)
                percentage = (frequency / total_rows) * 100
                freq_table[col2] = percentage.to_dict()
            results[val] = freq_table
        
        # Convert results to a multi-dimensional DataFrame
        final_df = pd.DataFrame(results).T
        st.subheader("Multi-dimensional Frequency Table (Percentage):")
        st.dataframe(final_df)
        
        # Plot interactive charts
        for val, data in results.items():
            st.subheader(f"Frequencies for '{col1}' = {val}")
            fig = make_subplots(rows=1, cols=len(col2_list), subplot_titles=col2_list)
            for i, col2 in enumerate(col2_list, 1):
                categories = list(data[col2].keys())
                values = list(data[col2].values())
                fig.add_trace(go.Bar(
                    x=categories,
                    y=values,
                    marker=dict(color='rgba(173, 216, 230, 0.6)', 
                                line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
                    width=0.4
                ), row=1, col=i)
                fig.update_xaxes(title_text=col2, row=1, col=i)
                fig.update_yaxes(title_text='Percentage', row=1, col=i)
            fig.update_layout(
                title=f"Frequencies for '{col1}' = {val}",
                template='plotly_white',
                showlegend=False
            )
            st.plotly_chart(fig, use_container_width=True)
            figs.append(fig)
        
    elif first_type == 2 and second_type == 1:
        # First type is 2 and second type is 1
        if len(second_columns) != 1:
            st.error("When second type is 1, exactly one column must be selected.")
            return None, None
        col2 = second_columns[0]
        col1_list = first_columns
        unique_values_col2 = df[col2].dropna().unique()
        
        results = {}
        for val in unique_values_col2:
            filter_df = df[df[col2] == val]
            freq_table = {}
            for col1 in col1_list:
                frequency = filter_df[col1].value_counts(dropna=False)
                percentage = (frequency / total_rows) * 100
                freq_table[col1] = percentage.to_dict()
            results[val] = freq_table
        
        # Convert results to a multi-dimensional DataFrame
        final_df = pd.DataFrame(results).T
        st.subheader("Multi-dimensional Frequency Table (Percentage):")
        st.dataframe(final_df)
        
        # Plot interactive charts
        for val, data in results.items():
            st.subheader(f"Frequencies for '{col2}' = {val}")
            fig = make_subplots(rows=1, cols=len(col1_list), subplot_titles=col1_list)
            for i, col1 in enumerate(col1_list, 1):
                categories = list(data[col1].keys())
                values = list(data[col1].values())
                fig.add_trace(go.Bar(
                    x=categories,
                    y=values,
                    marker=dict(color='rgba(173, 216, 230, 0.6)', 
                                line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
                    width=0.4
                ), row=1, col=i)
                fig.update_xaxes(title_text=col1, row=1, col=i)
                fig.update_yaxes(title_text='Percentage', row=1, col=i)
            fig.update_layout(
                title=f"Frequencies for '{col2}' = {val}",
                template='plotly_white',
                showlegend=False
            )
            st.plotly_chart(fig, use_container_width=True)
            figs.append(fig)
        
    elif first_type == 2 and second_type == 2:
        # Both types are 2
        col1_list = first_columns
        col2_list = second_columns
        
        results = {}
        for col2 in col2_list:
            for val in df[col2].dropna().unique():
                filter_df = df[df[col2] == val]
                frequency = {}
                for col1 in col1_list:
                    count = filter_df[col1].count()
                    frequency[col1] = (count / total_rows) * 100
                results[(col2, val)] = frequency
        
        if not results:
            st.error("No valid combinations found for both types being 2.")
            return None, None
        
        # Convert results to a multi-dimensional DataFrame
        index = pd.MultiIndex.from_tuples(results.keys(), names=['Second Column', 'Second Column Value'])
        final_df = pd.DataFrame(list(results.values()), index=index)
        st.subheader("Multi-dimensional Frequency Table (Percentage):")
        st.dataframe(final_df)
        
        # Plot interactive charts
        for (col2, val), data in results.items():
            st.subheader(f"Frequencies for '{col2}' = {val}")
            fig = go.Figure(data=[go.Bar(
                x=list(data.keys()),
                y=list(data.values()),
                marker=dict(color='rgba(173, 216, 230, 0.6)', 
                            line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
                width=0.4
            )])
            fig.update_layout(
                title=f"Frequencies for '{col2}' = {val}",
                xaxis_title=first_name,
                yaxis_title='Percentage',
                template='plotly_white'
            )
            st.plotly_chart(fig, use_container_width=True)
            figs.append(fig)
        
    else:
        st.error("The 'first_type' and 'second_type' must each be either 1 or 2.")
        return None, None
    
    return final_df if 'final_df' in locals() else table, figs

def main():
    st.title("📊 Data Analysis Application")
    st.write("""
    This application allows you to upload an Excel file and perform data analysis based on your selected parameters.
    """)
    
    # File uploader
    uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx", "xls"])
    
    if uploaded_file is not None:
        try:
            df = pd.read_excel(uploaded_file)
            st.success("File uploaded successfully!")
            st.subheader("Preview of Uploaded Data:")
            st.dataframe(df.head())  # Display first few rows
        except Exception as e:
            st.error(f"Error reading the Excel file: {e}")
            return
        
        st.markdown("---")
        
        # Function Selection
        st.header("🔍 Select Analysis Function")
        analysis_function = st.radio("Choose Function", ("Analyze Single Column", "Analyze Multiple Columns"))
        
        if analysis_function == "Analyze Single Column":
            st.header("📈 Analyze Single Column")
            column_name = st.text_input("Enter the column name or prefix:", "")
            type_option = st.selectbox("Select Type Option", ("1", "2"))
            
            if st.button("Run Analysis"):
                if column_name.strip() == "":
                    st.error("Please enter a valid column name.")
                else:
                    type_option_int = int(type_option)
                    table, fig = analyze_single_column(df, column_name, type_option_int)
                    if table is not None:
                        st.success("Analysis completed successfully!")
        
        elif analysis_function == "Analyze Multiple Columns":
            st.header("📊 Analyze Multiple Columns")
            first_name = st.text_input("Enter the first column name or prefix:", "")
            second_name = st.text_input("Enter the second column name or prefix:", "")
            first_type = st.selectbox("Select First Type Option", ("1", "2"), key='first_type')
            second_type = st.selectbox("Select Second Type Option", ("1", "2"), key='second_type')
            
            if st.button("Run Analysis"):
                if first_name.strip() == "" or second_name.strip() == "":
                    st.error("Please enter valid column names.")
                else:
                    first_type_int = int(first_type)
                    second_type_int = int(second_type)
                    table, figs = analyze_multiple_columns(df, first_name, second_name, first_type_int, second_type_int)
                    if table is not None:
                        st.success("Analysis completed successfully!")

if __name__ == "__main__":
    main()