Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from scipy.stats import norm | |
def sorting(df): | |
df.index = list(map(float, df.index)) | |
df = df.sort_index | |
return df | |
def figo(plot_type, df, title, xlabel=None, ylabel=None, legend_title=None, colorscale='Plotly3', width=800, height=600): | |
if plot_type == "Scatter": | |
fig = go.Figure() | |
for column in df.columns[0:]: | |
df.index = list(map(float, list(df.index))) | |
sorted_data = df.sort_index() | |
fig.add_trace(go.Scatter( | |
x=sorted_data[column], | |
y=sorted_data.index, | |
mode='lines+markers+text', | |
name=column, | |
text=sorted_data[column].round(0), | |
textposition="middle right" | |
)) | |
fig.update_layout( | |
title=title, | |
xaxis_title="Percentage", | |
yaxis_title="Category", | |
yaxis={'categoryorder': 'array', 'categoryarray': sorted_data.index}, | |
width=width, | |
height=height | |
) | |
elif plot_type == "Heatmap": | |
df = df.apply(pd.to_numeric, errors='coerce') | |
fig = go.Figure(data=go.Heatmap( | |
z=df.values, | |
x=df.columns, | |
y=df.index, | |
hoverongaps=False, | |
colorscale=colorscale | |
)) | |
fig.update_layout( | |
title={ | |
'text': title, | |
'y':0.95, | |
'x':0.5, | |
'xanchor': 'center', | |
'yanchor': 'top' | |
}, | |
xaxis_title=xlabel, | |
yaxis_title=ylabel, | |
legend_title=legend_title, | |
template="plotly_white", | |
width=width, | |
height=height | |
) | |
elif plot_type == "Bar": | |
fig = go.Figure() | |
col = df.name | |
fig.add_trace(go.Bar( | |
x=df.index, | |
y=df, | |
name=col | |
)) | |
fig.update_layout( | |
title={ | |
'text': title, | |
'y':0.95, | |
'x':0.5, | |
'xanchor': 'center', | |
'yanchor': 'top' | |
}, | |
xaxis_title=xlabel, | |
yaxis_title=ylabel, | |
legend_title=legend_title, | |
template="plotly_white", | |
barmode='group', | |
width=width, | |
height=height | |
) | |
else: | |
raise ValueError("Invalid plot_type. Supported types are 'Scatter', 'Heatmap', and 'Bar'.") | |
return fig | |
def is_matching_pattern(column, prefix): | |
if not column.startswith(prefix + '_'): | |
return False | |
suffix = column[len(prefix) + 1:] | |
if 1 <= len(suffix) <= 3 and suffix.isdigit(): | |
return True | |
return False | |
def multi_answer(df): | |
friquency = {} | |
for i in df.columns: | |
try: | |
unique_values = list(set(df[i].dropna()))[0] | |
friquency[str(unique_values)] = df[i].value_counts().get(unique_values, 0) | |
except Exception as e: | |
st.error(f"Warning: One of the data columns has no value.: {e}") | |
friquency[i] = 0 | |
friquency_dataframe = pd.DataFrame({"Value": friquency.keys(), "Friquency": friquency.values(), "Percentage": np.array(list(friquency.values()))/len(df.dropna(how='all'))*100}).sort_values(by='Value') | |
friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna(how='all')), 1] | |
return friquency_dataframe | |
def single_answer(df): | |
counter = df.value_counts() | |
friquency_dataframe = pd.DataFrame({ | |
'Value': counter.index, | |
'Frequency': counter.values, | |
'Percentage': (counter.values / counter.sum()) * 100}).sort_values(by='Value') | |
friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna()), 1] | |
return friquency_dataframe | |
def score_answer(df): | |
counter = df.value_counts().sort_index() | |
friquency_dataframe = pd.DataFrame({ | |
'Value': list(counter.index)+["Meen", "Variance"], | |
'Frequency': list(counter.values)+[df.mean(), df.var()], | |
'Percentage': list((counter.values / counter.sum()) * 100)+["", ""]}) | |
return friquency_dataframe | |
def two_variable_ss(df, var1, var2): | |
counter = df.groupby(var1)[var2].value_counts() | |
friquency_dataframe = counter.unstack(fill_value=0) | |
#friquency_dataframe = sorting(friquency_dataframe) | |
column_sums = friquency_dataframe.sum(axis=0) | |
percentage_dataframe = friquency_dataframe.div(column_sums, axis=1) * 100 | |
friquency_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,1])[:-1] | |
friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1]) | |
percentage_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,2])[:-1] | |
percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1]) | |
return percentage_dataframe, friquency_dataframe | |
def two_variable_sm(df, var1, var2): | |
unique_values = list(set(df[var1].dropna())) | |
value = multi_answer(df[var2]).iloc[:-1,0] | |
friquency_dataframe, percentage_dataframe = {}, {} | |
for i in unique_values: | |
dataframe = multi_answer(df[df[var1] == i][var2]).iloc[:-1,:] | |
friquency_dataframe[i], percentage_dataframe[i] = dataframe['Friquency'], dataframe['Percentage'] | |
friquency_dataframe = pd.DataFrame(friquency_dataframe) | |
percentage_dataframe = pd.DataFrame(percentage_dataframe) | |
friquency_dataframe.index, percentage_dataframe.index = value, value | |
#friquency_dataframe = sorting(friquency_dataframe) | |
#percentage_dataframe = sorting(percentage_dataframe) | |
friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1] | |
friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1]) | |
percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1] | |
percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1]) | |
return percentage_dataframe, friquency_dataframe | |
def two_variable_mm(df, var1, var2): | |
friquency_dataframe, percentage_dataframe = {}, {} | |
value = multi_answer(df[var2]).iloc[:-1,0] | |
for i in var1: | |
unique_values = list(set(df[i].dropna()))[0] | |
dataframe = multi_answer(df[df[i] == unique_values][var2]).iloc[:-1,:] | |
friquency_dataframe[i], percentage_dataframe[i] = dataframe['Friquency'], dataframe['Percentage'] | |
friquency_dataframe = pd.DataFrame(friquency_dataframe) | |
percentage_dataframe = pd.DataFrame(percentage_dataframe) | |
friquency_dataframe.index, percentage_dataframe.index = value, value | |
#friquency_dataframe = sorting(friquency_dataframe) | |
#percentage_dataframe = sorting(percentage_dataframe) | |
friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1] | |
friquency_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1]) | |
percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1] | |
percentage_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1]) | |
return percentage_dataframe, friquency_dataframe | |
# Functions related to Z-Test | |
def read_excel_sheets(file): | |
"""Reads an Excel file with multiple sheets and returns a dictionary of DataFrames.""" | |
try: | |
xls = pd.ExcelFile(file) | |
sheets_data = {sheet: xls.parse(sheet) for sheet in xls.sheet_names} | |
return sheets_data | |
except Exception as e: | |
st.error(f"❌ Error reading Excel file: {e}") | |
return None | |
def z_testes(n1, n2, p1, p2): | |
p_hat = ((n1*p1) + (n2*p2)) / (n1 + n2) | |
z = (p1 - p2) / ((p_hat * (1 - p_hat) * (1 / n1 + 1 / n2)) ** 0.5) | |
p_value = 2 * (1 - norm.cdf(abs(z))) | |
return p_value | |
def z_test_data(df): | |
styles = pd.DataFrame('', index=df.index, columns=df.columns) | |
num_rows, num_cols = df.shape | |
sample_size = df.iloc[-1, -1] # Total sample size | |
for i in range(num_rows -1): | |
for j in range(1, num_cols -1): | |
n1 = df.iloc[-1, -1] | |
n2 = df.iloc[-1, j] | |
p1 = df.iloc[i, -1]/100 | |
p2 = df.iloc[i, j]/100 | |
p_value = z_testes(n1, n2, p1, p2) | |
if pd.notnull(p_value) and p_value <= 0.05: | |
styles.iloc[i, j] = 'background-color: lightgreen' | |
return df.style.apply(lambda _: styles, axis=None) | |
def Z_test_dataframes(sheets_data): | |
"""Processes each sheet's DataFrame and computes new DataFrames with Z-test results.""" | |
result_dataframes = {} | |
for sheet_name, df in sheets_data.items(): | |
if df.empty: | |
st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.") | |
continue | |
df = df.set_index(df.columns[0]) # Use the first column as index | |
rows, cols = df.shape | |
if cols < 2: | |
st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.") | |
continue | |
new_df = pd.DataFrame(index=df.index[:-1], columns=df.columns[1:]) | |
for i, row_name in enumerate(df.index[:-1]): | |
for j, col_name in enumerate(df.columns[1:]): | |
try: | |
n1 = df.iloc[-1, 0] # x_I1 | |
n2 = df.iloc[-1, j+1] # x_Ij | |
p1 = df.iloc[i, 0] # x_1J | |
p2 = df.iloc[i, j+1] # x_ij | |
p_value = z_testes(n1, n2, p1, p2) | |
new_df.iloc[i, j] = p_value | |
except Exception as e: | |
st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}") | |
new_df.iloc[i, j] = np.nan | |
result_dataframes[sheet_name] = new_df | |
return result_dataframes | |
def analyze_z_test(file): | |
""" | |
Performs Z-Test analysis on the uploaded Excel file. | |
Parameters: | |
- file: Uploaded Excel file | |
Returns: | |
- result_dataframes: Dictionary of DataFrames with p-values | |
""" | |
sheets_data = read_excel_sheets(file) | |
if sheets_data is None: | |
return None | |
result_dataframes = Z_test_dataframes(sheets_data) | |
if not result_dataframes: | |
st.error("❌ No valid sheets found for Z-Test analysis.") | |
return None | |
st.write("### 📈 Processed Tables with Z-Test Results") | |
for sheet_name, df in result_dataframes.items(): | |
st.write(f"#### Sheet: {sheet_name}") | |
# Apply color coding based on p-value | |
def color_p_value(val): | |
try: | |
if pd.isna(val): | |
return 'background-color: lightgray' | |
elif val < 0.05: | |
return 'background-color: lightgreen' | |
else: | |
return 'background-color: lightcoral' | |
except: | |
return 'background-color: lightgray' | |
styled_df = df.style.applymap(color_p_value) | |
# Display the styled DataFrame | |
st.dataframe(styled_df, use_container_width=True) | |
return result_dataframes | |
# Streamlit User Interface | |
st.title("Data Analysis Application") | |
# Main options | |
main_option = st.selectbox("Please select an option:", ["Tabulation", "Hypothesis test", "Machine Learning", "Coding"]) | |
if main_option == "Tabulation": | |
st.header("Tabulation Analysis") | |
uploaded_file = st.file_uploader("Please upload your Excel file", type=["xlsx", "xls"]) | |
if uploaded_file: | |
try: | |
df = pd.read_excel(uploaded_file) | |
st.subheader("Displaying the first few rows of the DataFrame") | |
st.dataframe(df.head()) | |
tabulation_option = st.selectbox("Please select the type of analysis:", ["All", "Univariate", "Multivariate"]) | |
if tabulation_option == "All": | |
st.info("This section of the program is under development.") | |
elif tabulation_option == "Univariate": | |
uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer", "Score answer"]) | |
if uni_option == "Single answer": | |
var = st.text_input("Please enter the name of the desired column:") | |
if var: | |
if var in df.columns: | |
result_df = single_answer(df[var]) | |
st.subheader("Univariate Analysis Results") | |
st.dataframe(result_df) | |
fig = figo('Bar', result_df["Percentage"][:-1, ], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3') | |
st.plotly_chart(fig, use_container_width=True) | |
else: | |
st.error("The entered column was not found.") | |
elif uni_option == "Multiple answer": | |
var = st.text_input("Please enter the name of the desired column:") | |
if var: | |
matching_cols = [col for col in df.columns if is_matching_pattern(col, var)] | |
if matching_cols: | |
subset_df = df[matching_cols] | |
result_df = multi_answer(subset_df) | |
st.subheader("Multiple Answer Analysis Results") | |
st.dataframe(result_df) | |
fig = figo('Bar', result_df["Percentage"][:-1], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3') | |
st.plotly_chart(fig, use_container_width=True) | |
else: | |
st.error("No columns matching the entered pattern were found.") | |
elif uni_option == "Score answer": | |
var = st.text_input("Please enter the name of the desired column:") | |
if var: | |
subset_df = df[var] | |
result_df = score_answer(subset_df) | |
st.subheader("Score Answer Analysis Results") | |
st.dataframe(result_df) | |
fig = figo('Bar', result_df["Percentage"][:-1], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3') | |
st.plotly_chart(fig, use_container_width=True) | |
else: | |
st.error("No columns matching the entered pattern were found.") | |
elif tabulation_option == "Multivariate": | |
st.subheader("Multivariate Analysis") | |
var1 = st.text_input("Please enter the name of the first column:") | |
var2 = st.text_input("Please enter the name of the second column:") | |
if var1 and var2: | |
type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1') | |
type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer"], key='type2') | |
if type1 == "Single answer" and type2 == "Single answer": | |
percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2) | |
st.subheader("Percentage Table") | |
st.write(z_test_data(percentile_df)) | |
st.subheader("Frequency Table") | |
st.dataframe(frequency_df) | |
row, col = df.shape | |
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3) | |
st.plotly_chart(fig, use_container_width=True) | |
elif type1 == "Single answer" and type2 == "Multiple answer": | |
matching_cols = [col for col in df.columns if is_matching_pattern(col, var2)] | |
if matching_cols: | |
percentile_df, frequency_df = two_variable_sm(df[[var1] + matching_cols], var1, matching_cols) | |
st.subheader("Percentage Table") | |
st.write(z_test_data(percentile_df)) | |
st.subheader("Frequency Table") | |
st.dataframe(frequency_df) | |
row, col = df.shape | |
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3) | |
st.plotly_chart(fig, use_container_width=True) | |
else: | |
st.error("No columns matching the entered pattern were found.") | |
elif type1 == "Multiple answer" and type2 == "Multiple answer": | |
matching_cols1 = [col for col in df.columns if is_matching_pattern(col, var1)] | |
matching_cols2 = [col for col in df.columns if is_matching_pattern(col, var2)] | |
if matching_cols1 and matching_cols2: | |
percentile_df, frequency_df = two_variable_mm(df[matching_cols1 + matching_cols2], matching_cols1, matching_cols2) | |
st.subheader("Percentage Table") | |
st.write(z_test_data(percentile_df)) | |
st.subheader("Frequency Table") | |
st.dataframe(frequency_df) | |
row, col = df.shape | |
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3) | |
st.plotly_chart(fig, use_container_width=True) | |
else: | |
st.error("No columns matching the entered pattern were found.") | |
else: | |
st.info("This section of the program is under development.") | |
except Exception as e: | |
st.error(f"❌ Error reading the Excel file: {e}") | |
elif main_option == "Hypothesis test": | |
st.header("Hypothesis Testing") | |
hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"]) | |
if hypothesis_option != "Z test": | |
st.info("This section of the program is under development.") | |
else: | |
uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"]) | |
if uploaded_file: | |
result = analyze_z_test(uploaded_file) | |
if result: | |
st.success("Z-Test analysis completed successfully.") | |
elif main_option in ["Machine Learning", "Coding"]: | |
st.info("This section of the program is under development.") | |