Spaces:
Running
Running
import pandas as pd | |
import plotly.graph_objs as go | |
from plotly.subplots import make_subplots | |
import streamlit as st | |
def is_matching_pattern(column, prefix): | |
""" | |
Checks if the column name matches the pattern: prefix-number (1 to 3 digits). | |
Parameters: | |
- column: The column name to check. | |
- prefix: The prefix to match. | |
Returns: | |
- True if matches, False otherwise. | |
""" | |
if not column.startswith(prefix + '_'): | |
return False | |
suffix = column[len(prefix) + 1:] | |
if 1 <= len(suffix) <= 3 and suffix.isdigit(): | |
return True | |
return False | |
def analyze_single_column(df, column_name, type_option): | |
""" | |
Function 1: Analyzes data based on the selected type. | |
Parameters: | |
- df: pandas DataFrame | |
- column_name: Name of the column or prefix | |
- type_option: 1 or 2 | |
Returns: | |
- table: DataFrame containing frequency percentages | |
- fig: Plotly Figure object | |
""" | |
total_rows = len(df) | |
if type_option == 1: | |
if column_name not in df.columns: | |
st.error(f"Column '{column_name}' not found in the Excel file.") | |
return None, None | |
# Calculate frequency | |
frequency = df[column_name].value_counts(dropna=False).sort_index() | |
percentage = (frequency / total_rows) * 100 | |
table = percentage.reset_index() | |
table.columns = [column_name, 'Percentage'] | |
st.subheader("Frequency Table (Percentage):") | |
st.dataframe(table) | |
# Plot interactive histogram | |
fig = go.Figure(data=[go.Bar( | |
x=table[column_name].astype(str), | |
y=table['Percentage'], | |
marker=dict(color='rgba(173, 216, 230, 0.6)', | |
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)), | |
width=0.4 | |
)]) | |
fig.update_layout( | |
title=f"Histogram of '{column_name}'", | |
xaxis_title=column_name, | |
yaxis_title='Percentage', | |
bargap=0.2, | |
template='plotly_white' | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
return table, fig | |
elif type_option == 2: | |
# Column name pattern: name-number (1 to 3 digits) | |
selected_columns = [col for col in df.columns if is_matching_pattern(col, column_name)] | |
if not selected_columns: | |
st.error(f"No columns matching the pattern '{column_name}-<number>' found in the Excel file.") | |
return None, None | |
results = {} | |
for col in selected_columns: | |
unique_values = df[col].dropna().unique() | |
if len(unique_values) > 2: | |
st.warning(f"Column '{col}' has more than two unique values. It was skipped.") | |
continue | |
value_counts = df[col].value_counts(dropna=False) | |
percentage = (value_counts / total_rows) * 100 | |
for val, pct in percentage.items(): | |
results[val] = results.get(val, 0) + pct | |
if not results: | |
st.error("No valid columns with exactly two unique values were found.") | |
return None, None | |
# Sort the keys | |
table = pd.DataFrame(list(results.items()), columns=[column_name, 'Percentage']) | |
table = table.sort_values(by=column_name).reset_index(drop=True) | |
st.subheader("Aggregated Frequency Table (Percentage):") | |
st.dataframe(table) | |
# Plot interactive histogram | |
fig = go.Figure(data=[go.Bar( | |
x=table[column_name].astype(str), | |
y=table['Percentage'], | |
marker=dict(color='rgba(173, 216, 230, 0.6)', | |
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)), | |
width=0.4 | |
)]) | |
fig.update_layout( | |
title=f"Histogram of Columns '{column_name}-<number>'", | |
xaxis_title=column_name, | |
yaxis_title='Percentage', | |
bargap=0.2, | |
template='plotly_white' | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
return table, fig | |
else: | |
st.error("The 'type_option' must be either 1 or 2.") | |
return None, None | |
def analyze_multiple_columns(df, first_name, second_name, first_type, second_type): | |
""" | |
Function 2: Analyzes data based on combinations of first and second types. | |
Parameters: | |
- df: pandas DataFrame | |
- first_name: Name or prefix of the first column | |
- second_name: Name or prefix of the second column | |
- first_type: 1 or 2 | |
- second_type: 1 or 2 | |
Returns: | |
- table: DataFrame containing contingency or frequency tables | |
- figs: List of Plotly Figure objects | |
""" | |
total_rows = len(df) | |
# Helper functions to select columns based on type | |
def select_columns_type1(name): | |
if name in df.columns: | |
return [name] | |
else: | |
st.error(f"Column '{name}' not found in the Excel file.") | |
return [] | |
def select_columns_type2(name): | |
selected = [col for col in df.columns if is_matching_pattern(col, name)] | |
if not selected: | |
st.error(f"No columns matching the pattern '{name}-<number>' found in the Excel file.") | |
return selected | |
# Select columns based on types | |
first_columns = select_columns_type1(first_name) if first_type == 1 else select_columns_type2(first_name) | |
second_columns = select_columns_type1(second_name) if second_type == 1 else select_columns_type2(second_name) | |
if not first_columns or not second_columns: | |
st.error("Column selection failed.") | |
return None, None | |
figs = [] | |
if first_type == 1 and second_type == 1: | |
# Both types are 1 | |
if len(first_columns) != 1 or len(second_columns) != 1: | |
st.error("When both first and second types are 1, exactly one column must be selected for each.") | |
return None, None | |
col1 = first_columns[0] | |
col2 = second_columns[0] | |
contingency = pd.crosstab(df[col1], df[col2], normalize='all') * 100 | |
st.subheader("Contingency Table (Percentage):") | |
st.dataframe(contingency) | |
# Plot interactive heatmap | |
fig = go.Figure(data=go.Heatmap( | |
z=contingency.values, | |
x=contingency.columns, | |
y=contingency.index, | |
colorscale='Blues' | |
)) | |
fig.update_layout( | |
title=f"Contingency Table between '{col1}' and '{col2}'", | |
xaxis_title=col2, | |
yaxis_title=col1, | |
template='plotly_white' | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
figs.append(fig) | |
elif first_type == 1 and second_type == 2: | |
# First type is 1 and second type is 2 | |
if len(first_columns) != 1: | |
st.error("When first type is 1, exactly one column must be selected.") | |
return None, None | |
col1 = first_columns[0] | |
col2_list = second_columns | |
unique_values_col1 = df[col1].dropna().unique() | |
results = {} | |
for val in unique_values_col1: | |
filter_df = df[df[col1] == val] | |
freq_table = {} | |
for col2 in col2_list: | |
frequency = filter_df[col2].value_counts(dropna=False) | |
percentage = (frequency / total_rows) * 100 | |
freq_table[col2] = percentage.to_dict() | |
results[val] = freq_table | |
# Convert results to a multi-dimensional DataFrame | |
final_df = pd.DataFrame(results).T | |
st.subheader("Multi-dimensional Frequency Table (Percentage):") | |
st.dataframe(final_df) | |
# Plot interactive charts | |
for val, data in results.items(): | |
st.subheader(f"Frequencies for '{col1}' = {val}") | |
fig = make_subplots(rows=1, cols=len(col2_list), subplot_titles=col2_list) | |
for i, col2 in enumerate(col2_list, 1): | |
categories = list(data[col2].keys()) | |
values = list(data[col2].values()) | |
fig.add_trace(go.Bar( | |
x=categories, | |
y=values, | |
marker=dict(color='rgba(173, 216, 230, 0.6)', | |
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)), | |
width=0.4 | |
), row=1, col=i) | |
fig.update_xaxes(title_text=col2, row=1, col=i) | |
fig.update_yaxes(title_text='Percentage', row=1, col=i) | |
fig.update_layout( | |
title=f"Frequencies for '{col1}' = {val}", | |
template='plotly_white', | |
showlegend=False | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
figs.append(fig) | |
elif first_type == 2 and second_type == 1: | |
# First type is 2 and second type is 1 | |
if len(second_columns) != 1: | |
st.error("When second type is 1, exactly one column must be selected.") | |
return None, None | |
col2 = second_columns[0] | |
col1_list = first_columns | |
unique_values_col2 = df[col2].dropna().unique() | |
results = {} | |
for val in unique_values_col2: | |
filter_df = df[df[col2] == val] | |
freq_table = {} | |
for col1 in col1_list: | |
frequency = filter_df[col1].value_counts(dropna=False) | |
percentage = (frequency / total_rows) * 100 | |
freq_table[col1] = percentage.to_dict() | |
results[val] = freq_table | |
# Convert results to a multi-dimensional DataFrame | |
final_df = pd.DataFrame(results).T | |
st.subheader("Multi-dimensional Frequency Table (Percentage):") | |
st.dataframe(final_df) | |
# Plot interactive charts | |
for val, data in results.items(): | |
st.subheader(f"Frequencies for '{col2}' = {val}") | |
fig = make_subplots(rows=1, cols=len(col1_list), subplot_titles=col1_list) | |
for i, col1 in enumerate(col1_list, 1): | |
categories = list(data[col1].keys()) | |
values = list(data[col1].values()) | |
fig.add_trace(go.Bar( | |
x=categories, | |
y=values, | |
marker=dict(color='rgba(173, 216, 230, 0.6)', | |
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)), | |
width=0.4 | |
), row=1, col=i) | |
fig.update_xaxes(title_text=col1, row=1, col=i) | |
fig.update_yaxes(title_text='Percentage', row=1, col=i) | |
fig.update_layout( | |
title=f"Frequencies for '{col2}' = {val}", | |
template='plotly_white', | |
showlegend=False | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
figs.append(fig) | |
elif first_type == 2 and second_type == 2: | |
# Both types are 2 | |
col1_list = first_columns | |
col2_list = second_columns | |
results = {} | |
for col2 in col2_list: | |
for val in df[col2].dropna().unique(): | |
filter_df = df[df[col2] == val] | |
frequency = {} | |
for col1 in col1_list: | |
count = filter_df[col1].count() | |
frequency[col1] = (count / total_rows) * 100 | |
results[(col2, val)] = frequency | |
if not results: | |
st.error("No valid combinations found for both types being 2.") | |
return None, None | |
# Convert results to a multi-dimensional DataFrame | |
index = pd.MultiIndex.from_tuples(results.keys(), names=['Second Column', 'Second Column Value']) | |
final_df = pd.DataFrame(list(results.values()), index=index) | |
st.subheader("Multi-dimensional Frequency Table (Percentage):") | |
st.dataframe(final_df) | |
# Plot interactive charts | |
for (col2, val), data in results.items(): | |
st.subheader(f"Frequencies for '{col2}' = {val}") | |
fig = go.Figure(data=[go.Bar( | |
x=list(data.keys()), | |
y=list(data.values()), | |
marker=dict(color='rgba(173, 216, 230, 0.6)', | |
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)), | |
width=0.4 | |
)]) | |
fig.update_layout( | |
title=f"Frequencies for '{col2}' = {val}", | |
xaxis_title=first_name, | |
yaxis_title='Percentage', | |
template='plotly_white' | |
) | |
st.plotly_chart(fig, use_container_width=True) | |
figs.append(fig) | |
else: | |
st.error("The 'first_type' and 'second_type' must each be either 1 or 2.") | |
return None, None | |
return final_df if 'final_df' in locals() else table, figs | |
def main(): | |
st.title("📊 Data Analysis Application") | |
st.write(""" | |
This application allows you to upload an Excel file and perform data analysis based on your selected parameters. | |
""") | |
# File uploader | |
uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx", "xls"]) | |
if uploaded_file is not None: | |
try: | |
df = pd.read_excel(uploaded_file) | |
st.success("File uploaded successfully!") | |
st.subheader("Preview of Uploaded Data:") | |
st.dataframe(df.head()) # Display first few rows | |
except Exception as e: | |
st.error(f"Error reading the Excel file: {e}") | |
return | |
st.markdown("---") | |
# Function Selection | |
st.header("🔍 Select Analysis Function") | |
analysis_function = st.radio("Choose Function", ("Analyze Single Column", "Analyze Multiple Columns")) | |
if analysis_function == "Analyze Single Column": | |
st.header("📈 Analyze Single Column") | |
column_name = st.text_input("Enter the column name or prefix:", "") | |
type_option = st.selectbox("Select Type Option", ("1", "2")) | |
if st.button("Run Analysis"): | |
if column_name.strip() == "": | |
st.error("Please enter a valid column name.") | |
else: | |
type_option_int = int(type_option) | |
table, fig = analyze_single_column(df, column_name, type_option_int) | |
if table is not None: | |
st.success("Analysis completed successfully!") | |
elif analysis_function == "Analyze Multiple Columns": | |
st.header("📊 Analyze Multiple Columns") | |
first_name = st.text_input("Enter the first column name or prefix:", "") | |
second_name = st.text_input("Enter the second column name or prefix:", "") | |
first_type = st.selectbox("Select First Type Option", ("1", "2"), key='first_type') | |
second_type = st.selectbox("Select Second Type Option", ("1", "2"), key='second_type') | |
if st.button("Run Analysis"): | |
if first_name.strip() == "" or second_name.strip() == "": | |
st.error("Please enter valid column names.") | |
else: | |
first_type_int = int(first_type) | |
second_type_int = int(second_type) | |
table, figs = analyze_multiple_columns(df, first_name, second_name, first_type_int, second_type_int) | |
if table is not None: | |
st.success("Analysis completed successfully!") | |
if __name__ == "__main__": | |
main() |