AMKAPP / app3.py
AMKhakbaz's picture
Rename app.py to app3.py
b2949cd verified
raw
history blame
15.7 kB
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import streamlit as st
def is_matching_pattern(column, prefix):
"""
Checks if the column name matches the pattern: prefix-number (1 to 3 digits).
Parameters:
- column: The column name to check.
- prefix: The prefix to match.
Returns:
- True if matches, False otherwise.
"""
if not column.startswith(prefix + '_'):
return False
suffix = column[len(prefix) + 1:]
if 1 <= len(suffix) <= 3 and suffix.isdigit():
return True
return False
def analyze_single_column(df, column_name, type_option):
"""
Function 1: Analyzes data based on the selected type.
Parameters:
- df: pandas DataFrame
- column_name: Name of the column or prefix
- type_option: 1 or 2
Returns:
- table: DataFrame containing frequency percentages
- fig: Plotly Figure object
"""
total_rows = len(df)
if type_option == 1:
if column_name not in df.columns:
st.error(f"Column '{column_name}' not found in the Excel file.")
return None, None
# Calculate frequency
frequency = df[column_name].value_counts(dropna=False).sort_index()
percentage = (frequency / total_rows) * 100
table = percentage.reset_index()
table.columns = [column_name, 'Percentage']
st.subheader("Frequency Table (Percentage):")
st.dataframe(table)
# Plot interactive histogram
fig = go.Figure(data=[go.Bar(
x=table[column_name].astype(str),
y=table['Percentage'],
marker=dict(color='rgba(173, 216, 230, 0.6)',
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
width=0.4
)])
fig.update_layout(
title=f"Histogram of '{column_name}'",
xaxis_title=column_name,
yaxis_title='Percentage',
bargap=0.2,
template='plotly_white'
)
st.plotly_chart(fig, use_container_width=True)
return table, fig
elif type_option == 2:
# Column name pattern: name-number (1 to 3 digits)
selected_columns = [col for col in df.columns if is_matching_pattern(col, column_name)]
if not selected_columns:
st.error(f"No columns matching the pattern '{column_name}-<number>' found in the Excel file.")
return None, None
results = {}
for col in selected_columns:
unique_values = df[col].dropna().unique()
if len(unique_values) > 2:
st.warning(f"Column '{col}' has more than two unique values. It was skipped.")
continue
value_counts = df[col].value_counts(dropna=False)
percentage = (value_counts / total_rows) * 100
for val, pct in percentage.items():
results[val] = results.get(val, 0) + pct
if not results:
st.error("No valid columns with exactly two unique values were found.")
return None, None
# Sort the keys
table = pd.DataFrame(list(results.items()), columns=[column_name, 'Percentage'])
table = table.sort_values(by=column_name).reset_index(drop=True)
st.subheader("Aggregated Frequency Table (Percentage):")
st.dataframe(table)
# Plot interactive histogram
fig = go.Figure(data=[go.Bar(
x=table[column_name].astype(str),
y=table['Percentage'],
marker=dict(color='rgba(173, 216, 230, 0.6)',
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
width=0.4
)])
fig.update_layout(
title=f"Histogram of Columns '{column_name}-<number>'",
xaxis_title=column_name,
yaxis_title='Percentage',
bargap=0.2,
template='plotly_white'
)
st.plotly_chart(fig, use_container_width=True)
return table, fig
else:
st.error("The 'type_option' must be either 1 or 2.")
return None, None
def analyze_multiple_columns(df, first_name, second_name, first_type, second_type):
"""
Function 2: Analyzes data based on combinations of first and second types.
Parameters:
- df: pandas DataFrame
- first_name: Name or prefix of the first column
- second_name: Name or prefix of the second column
- first_type: 1 or 2
- second_type: 1 or 2
Returns:
- table: DataFrame containing contingency or frequency tables
- figs: List of Plotly Figure objects
"""
total_rows = len(df)
# Helper functions to select columns based on type
def select_columns_type1(name):
if name in df.columns:
return [name]
else:
st.error(f"Column '{name}' not found in the Excel file.")
return []
def select_columns_type2(name):
selected = [col for col in df.columns if is_matching_pattern(col, name)]
if not selected:
st.error(f"No columns matching the pattern '{name}-<number>' found in the Excel file.")
return selected
# Select columns based on types
first_columns = select_columns_type1(first_name) if first_type == 1 else select_columns_type2(first_name)
second_columns = select_columns_type1(second_name) if second_type == 1 else select_columns_type2(second_name)
if not first_columns or not second_columns:
st.error("Column selection failed.")
return None, None
figs = []
if first_type == 1 and second_type == 1:
# Both types are 1
if len(first_columns) != 1 or len(second_columns) != 1:
st.error("When both first and second types are 1, exactly one column must be selected for each.")
return None, None
col1 = first_columns[0]
col2 = second_columns[0]
contingency = pd.crosstab(df[col1], df[col2], normalize='all') * 100
st.subheader("Contingency Table (Percentage):")
st.dataframe(contingency)
# Plot interactive heatmap
fig = go.Figure(data=go.Heatmap(
z=contingency.values,
x=contingency.columns,
y=contingency.index,
colorscale='Blues'
))
fig.update_layout(
title=f"Contingency Table between '{col1}' and '{col2}'",
xaxis_title=col2,
yaxis_title=col1,
template='plotly_white'
)
st.plotly_chart(fig, use_container_width=True)
figs.append(fig)
elif first_type == 1 and second_type == 2:
# First type is 1 and second type is 2
if len(first_columns) != 1:
st.error("When first type is 1, exactly one column must be selected.")
return None, None
col1 = first_columns[0]
col2_list = second_columns
unique_values_col1 = df[col1].dropna().unique()
results = {}
for val in unique_values_col1:
filter_df = df[df[col1] == val]
freq_table = {}
for col2 in col2_list:
frequency = filter_df[col2].value_counts(dropna=False)
percentage = (frequency / total_rows) * 100
freq_table[col2] = percentage.to_dict()
results[val] = freq_table
# Convert results to a multi-dimensional DataFrame
final_df = pd.DataFrame(results).T
st.subheader("Multi-dimensional Frequency Table (Percentage):")
st.dataframe(final_df)
# Plot interactive charts
for val, data in results.items():
st.subheader(f"Frequencies for '{col1}' = {val}")
fig = make_subplots(rows=1, cols=len(col2_list), subplot_titles=col2_list)
for i, col2 in enumerate(col2_list, 1):
categories = list(data[col2].keys())
values = list(data[col2].values())
fig.add_trace(go.Bar(
x=categories,
y=values,
marker=dict(color='rgba(173, 216, 230, 0.6)',
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
width=0.4
), row=1, col=i)
fig.update_xaxes(title_text=col2, row=1, col=i)
fig.update_yaxes(title_text='Percentage', row=1, col=i)
fig.update_layout(
title=f"Frequencies for '{col1}' = {val}",
template='plotly_white',
showlegend=False
)
st.plotly_chart(fig, use_container_width=True)
figs.append(fig)
elif first_type == 2 and second_type == 1:
# First type is 2 and second type is 1
if len(second_columns) != 1:
st.error("When second type is 1, exactly one column must be selected.")
return None, None
col2 = second_columns[0]
col1_list = first_columns
unique_values_col2 = df[col2].dropna().unique()
results = {}
for val in unique_values_col2:
filter_df = df[df[col2] == val]
freq_table = {}
for col1 in col1_list:
frequency = filter_df[col1].value_counts(dropna=False)
percentage = (frequency / total_rows) * 100
freq_table[col1] = percentage.to_dict()
results[val] = freq_table
# Convert results to a multi-dimensional DataFrame
final_df = pd.DataFrame(results).T
st.subheader("Multi-dimensional Frequency Table (Percentage):")
st.dataframe(final_df)
# Plot interactive charts
for val, data in results.items():
st.subheader(f"Frequencies for '{col2}' = {val}")
fig = make_subplots(rows=1, cols=len(col1_list), subplot_titles=col1_list)
for i, col1 in enumerate(col1_list, 1):
categories = list(data[col1].keys())
values = list(data[col1].values())
fig.add_trace(go.Bar(
x=categories,
y=values,
marker=dict(color='rgba(173, 216, 230, 0.6)',
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
width=0.4
), row=1, col=i)
fig.update_xaxes(title_text=col1, row=1, col=i)
fig.update_yaxes(title_text='Percentage', row=1, col=i)
fig.update_layout(
title=f"Frequencies for '{col2}' = {val}",
template='plotly_white',
showlegend=False
)
st.plotly_chart(fig, use_container_width=True)
figs.append(fig)
elif first_type == 2 and second_type == 2:
# Both types are 2
col1_list = first_columns
col2_list = second_columns
results = {}
for col2 in col2_list:
for val in df[col2].dropna().unique():
filter_df = df[df[col2] == val]
frequency = {}
for col1 in col1_list:
count = filter_df[col1].count()
frequency[col1] = (count / total_rows) * 100
results[(col2, val)] = frequency
if not results:
st.error("No valid combinations found for both types being 2.")
return None, None
# Convert results to a multi-dimensional DataFrame
index = pd.MultiIndex.from_tuples(results.keys(), names=['Second Column', 'Second Column Value'])
final_df = pd.DataFrame(list(results.values()), index=index)
st.subheader("Multi-dimensional Frequency Table (Percentage):")
st.dataframe(final_df)
# Plot interactive charts
for (col2, val), data in results.items():
st.subheader(f"Frequencies for '{col2}' = {val}")
fig = go.Figure(data=[go.Bar(
x=list(data.keys()),
y=list(data.values()),
marker=dict(color='rgba(173, 216, 230, 0.6)',
line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
width=0.4
)])
fig.update_layout(
title=f"Frequencies for '{col2}' = {val}",
xaxis_title=first_name,
yaxis_title='Percentage',
template='plotly_white'
)
st.plotly_chart(fig, use_container_width=True)
figs.append(fig)
else:
st.error("The 'first_type' and 'second_type' must each be either 1 or 2.")
return None, None
return final_df if 'final_df' in locals() else table, figs
def main():
st.title("📊 Data Analysis Application")
st.write("""
This application allows you to upload an Excel file and perform data analysis based on your selected parameters.
""")
# File uploader
uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx", "xls"])
if uploaded_file is not None:
try:
df = pd.read_excel(uploaded_file)
st.success("File uploaded successfully!")
st.subheader("Preview of Uploaded Data:")
st.dataframe(df.head()) # Display first few rows
except Exception as e:
st.error(f"Error reading the Excel file: {e}")
return
st.markdown("---")
# Function Selection
st.header("🔍 Select Analysis Function")
analysis_function = st.radio("Choose Function", ("Analyze Single Column", "Analyze Multiple Columns"))
if analysis_function == "Analyze Single Column":
st.header("📈 Analyze Single Column")
column_name = st.text_input("Enter the column name or prefix:", "")
type_option = st.selectbox("Select Type Option", ("1", "2"))
if st.button("Run Analysis"):
if column_name.strip() == "":
st.error("Please enter a valid column name.")
else:
type_option_int = int(type_option)
table, fig = analyze_single_column(df, column_name, type_option_int)
if table is not None:
st.success("Analysis completed successfully!")
elif analysis_function == "Analyze Multiple Columns":
st.header("📊 Analyze Multiple Columns")
first_name = st.text_input("Enter the first column name or prefix:", "")
second_name = st.text_input("Enter the second column name or prefix:", "")
first_type = st.selectbox("Select First Type Option", ("1", "2"), key='first_type')
second_type = st.selectbox("Select Second Type Option", ("1", "2"), key='second_type')
if st.button("Run Analysis"):
if first_name.strip() == "" or second_name.strip() == "":
st.error("Please enter valid column names.")
else:
first_type_int = int(first_type)
second_type_int = int(second_type)
table, figs = analyze_multiple_columns(df, first_name, second_name, first_type_int, second_type_int)
if table is not None:
st.success("Analysis completed successfully!")
if __name__ == "__main__":
main()