AMKAPP / app.py
AMKhakbaz's picture
Update app.py
d6409d1 verified
raw
history blame
19 kB
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm
def sorting(df):
df.index = list(map(float, df.index))
df = df.sort_index
return df
def figo(plot_type, df, title, xlabel=None, ylabel=None, legend_title=None, colorscale='Plotly3', width=800, height=600):
if plot_type == "Scatter":
fig = go.Figure()
for column in df.columns[0:]:
df.index = list(map(float, list(df.index)))
sorted_data = df.sort_index()
fig.add_trace(go.Scatter(
x=sorted_data[column],
y=sorted_data.index,
mode='lines+markers+text',
name=column,
text=sorted_data[column].round(0),
textposition="middle right"
))
fig.update_layout(
title=title,
xaxis_title="Percentage",
yaxis_title="Category",
yaxis={'categoryorder': 'array', 'categoryarray': sorted_data.index},
width=width,
height=height
)
elif plot_type == "Heatmap":
df = df.apply(pd.to_numeric, errors='coerce')
fig = go.Figure(data=go.Heatmap(
z=df.values,
x=df.columns,
y=df.index,
hoverongaps=False,
colorscale=colorscale
))
fig.update_layout(
title={
'text': title,
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'
},
xaxis_title=xlabel,
yaxis_title=ylabel,
legend_title=legend_title,
template="plotly_white",
width=width,
height=height
)
elif plot_type == "Bar":
fig = go.Figure()
col = df.name
fig.add_trace(go.Bar(
x=df.index,
y=df,
name=col
))
fig.update_layout(
title={
'text': title,
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'
},
xaxis_title=xlabel,
yaxis_title=ylabel,
legend_title=legend_title,
template="plotly_white",
barmode='group',
width=width,
height=height
)
else:
raise ValueError("Invalid plot_type. Supported types are 'Scatter', 'Heatmap', and 'Bar'.")
return fig
def is_matching_pattern(column, prefix):
if not column.startswith(prefix + '_'):
return False
suffix = column[len(prefix) + 1:]
if 1 <= len(suffix) <= 3 and suffix.isdigit():
return True
return False
def multi_answer(df):
friquency = {}
for i in df.columns:
try:
unique_values = list(set(df[i].dropna()))[0]
friquency[str(unique_values)] = df[i].value_counts().get(unique_values, 0)
except Exception as e:
st.error(f"Warning: One of the data columns has no value.: {e}")
friquency[i] = 0
friquency_dataframe = pd.DataFrame({"Value": friquency.keys(), "Friquency": friquency.values(), "Percentage": np.array(list(friquency.values()))/len(df.dropna(how='all'))*100}).sort_values(by='Value')
friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna(how='all')), 1]
return friquency_dataframe
def single_answer(df):
counter = df.value_counts()
friquency_dataframe = pd.DataFrame({
'Value': counter.index,
'Frequency': counter.values,
'Percentage': (counter.values / counter.sum()) * 100}).sort_values(by='Value')
friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna()), 1]
return friquency_dataframe
def score_answer(df):
counter = df.value_counts().sort_index()
friquency_dataframe = pd.DataFrame({
'Value': list(counter.index)+["Meen", "Variance"],
'Frequency': list(counter.values)+[df.mean(), df.var()],
'Percentage': list((counter.values / counter.sum()) * 100)+["", ""]})
return friquency_dataframe
def two_variable_ss(df, var1, var2):
counter = df.groupby(var1)[var2].value_counts()
friquency_dataframe = counter.unstack(fill_value=0)
#friquency_dataframe = sorting(friquency_dataframe)
column_sums = friquency_dataframe.sum(axis=0)
percentage_dataframe = friquency_dataframe.div(column_sums, axis=1) * 100
friquency_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,1])[:-1]
friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1])
percentage_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,2])[:-1]
percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1])
return percentage_dataframe, friquency_dataframe
def two_variable_sm(df, var1, var2):
unique_values = list(set(df[var1].dropna()))
value = multi_answer(df[var2]).iloc[:-1,0]
friquency_dataframe, percentage_dataframe = {}, {}
for i in unique_values:
dataframe = multi_answer(df[df[var1] == i][var2]).iloc[:-1,:]
friquency_dataframe[i], percentage_dataframe[i] = dataframe['Friquency'], dataframe['Percentage']
friquency_dataframe = pd.DataFrame(friquency_dataframe)
percentage_dataframe = pd.DataFrame(percentage_dataframe)
friquency_dataframe.index, percentage_dataframe.index = value, value
#friquency_dataframe = sorting(friquency_dataframe)
#percentage_dataframe = sorting(percentage_dataframe)
friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1]
friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])
percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1]
percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])
return percentage_dataframe, friquency_dataframe
def two_variable_mm(df, var1, var2):
friquency_dataframe, percentage_dataframe = {}, {}
value = multi_answer(df[var2]).iloc[:-1,0]
for i in var1:
unique_values = list(set(df[i].dropna()))[0]
dataframe = multi_answer(df[df[i] == unique_values][var2]).iloc[:-1,:]
friquency_dataframe[i], percentage_dataframe[i] = dataframe['Friquency'], dataframe['Percentage']
friquency_dataframe = pd.DataFrame(friquency_dataframe)
percentage_dataframe = pd.DataFrame(percentage_dataframe)
friquency_dataframe.index, percentage_dataframe.index = value, value
#friquency_dataframe = sorting(friquency_dataframe)
#percentage_dataframe = sorting(percentage_dataframe)
friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1]
friquency_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1])
percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1]
percentage_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1])
return percentage_dataframe, friquency_dataframe
# Functions related to Z-Test
def read_excel_sheets(file):
"""Reads an Excel file with multiple sheets and returns a dictionary of DataFrames."""
try:
xls = pd.ExcelFile(file)
sheets_data = {sheet: xls.parse(sheet) for sheet in xls.sheet_names}
return sheets_data
except Exception as e:
st.error(f"❌ Error reading Excel file: {e}")
return None
def z_testes(n1, n2, p1, p2):
p_hat = ((n1*p1) + (n2*p2)) / (n1 + n2)
z = (p1 - p2) / ((p_hat * (1 - p_hat) * (1 / n1 + 1 / n2)) ** 0.5)
p_value = 2 * (1 - norm.cdf(abs(z)))
return p_value
def z_test_data(df):
styles = pd.DataFrame('', index=df.index, columns=df.columns)
num_rows, num_cols = df.shape
sample_size = df.iloc[-1, -1] # Total sample size
for i in range(num_rows -1):
for j in range(1, num_cols -1):
n1 = df.iloc[-1, -1]
n2 = df.iloc[-1, j]
p1 = df.iloc[i, -1]/100
p2 = df.iloc[i, j]/100
p_value = z_testes(n1, n2, p1, p2)
if pd.notnull(p_value) and p_value <= 0.05:
styles.iloc[i, j] = 'background-color: lightgreen'
return df.style.apply(lambda _: styles, axis=None)
def Z_test_dataframes(sheets_data):
"""Processes each sheet's DataFrame and computes new DataFrames with Z-test results."""
result_dataframes = {}
for sheet_name, df in sheets_data.items():
if df.empty:
st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.")
continue
df = df.set_index(df.columns[0]) # Use the first column as index
rows, cols = df.shape
if cols < 2:
st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.")
continue
new_df = pd.DataFrame(index=df.index[:-1], columns=df.columns[1:])
for i, row_name in enumerate(df.index[:-1]):
for j, col_name in enumerate(df.columns[1:]):
try:
n1 = df.iloc[-1, 0] # x_I1
n2 = df.iloc[-1, j+1] # x_Ij
p1 = df.iloc[i, 0] # x_1J
p2 = df.iloc[i, j+1] # x_ij
p_value = z_testes(n1, n2, p1, p2)
new_df.iloc[i, j] = p_value
except Exception as e:
st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}")
new_df.iloc[i, j] = np.nan
result_dataframes[sheet_name] = new_df
return result_dataframes
def analyze_z_test(file):
"""
Performs Z-Test analysis on the uploaded Excel file.
Parameters:
- file: Uploaded Excel file
Returns:
- result_dataframes: Dictionary of DataFrames with p-values
"""
sheets_data = read_excel_sheets(file)
if sheets_data is None:
return None
result_dataframes = Z_test_dataframes(sheets_data)
if not result_dataframes:
st.error("❌ No valid sheets found for Z-Test analysis.")
return None
st.write("### 📈 Processed Tables with Z-Test Results")
for sheet_name, df in result_dataframes.items():
st.write(f"#### Sheet: {sheet_name}")
# Apply color coding based on p-value
def color_p_value(val):
try:
if pd.isna(val):
return 'background-color: lightgray'
elif val < 0.05:
return 'background-color: lightgreen'
else:
return 'background-color: lightcoral'
except:
return 'background-color: lightgray'
styled_df = df.style.applymap(color_p_value)
# Display the styled DataFrame
st.dataframe(styled_df, use_container_width=True)
return result_dataframes
# Streamlit User Interface
st.title("Data Analysis Application")
# Main options
main_option = st.selectbox("Please select an option:", ["Tabulation", "Hypothesis test", "Machine Learning", "Coding"])
if main_option == "Tabulation":
st.header("Tabulation Analysis")
uploaded_file = st.file_uploader("Please upload your Excel file", type=["xlsx", "xls"])
if uploaded_file:
try:
df = pd.read_excel(uploaded_file)
st.subheader("Displaying the first few rows of the DataFrame")
st.dataframe(df.head())
tabulation_option = st.selectbox("Please select the type of analysis:", ["All", "Univariate", "Multivariate"])
if tabulation_option == "All":
st.info("This section of the program is under development.")
elif tabulation_option == "Univariate":
uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer", "Score answer"])
if uni_option == "Single answer":
var = st.text_input("Please enter the name of the desired column:")
if var:
if var in df.columns:
result_df = single_answer(df[var])
st.subheader("Univariate Analysis Results")
st.dataframe(result_df)
fig = figo('Bar', result_df["Percentage"][:-1, ], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
st.plotly_chart(fig, use_container_width=True)
else:
st.error("The entered column was not found.")
elif uni_option == "Multiple answer":
var = st.text_input("Please enter the name of the desired column:")
if var:
matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
if matching_cols:
subset_df = df[matching_cols]
result_df = multi_answer(subset_df)
st.subheader("Multiple Answer Analysis Results")
st.dataframe(result_df)
fig = figo('Bar', result_df["Percentage"][:-1], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
st.plotly_chart(fig, use_container_width=True)
else:
st.error("No columns matching the entered pattern were found.")
elif uni_option == "Score answer":
var = st.text_input("Please enter the name of the desired column:")
if var:
subset_df = df[var]
result_df = score_answer(subset_df)
st.subheader("Score Answer Analysis Results")
st.dataframe(result_df)
fig = figo('Bar', result_df["Percentage"][:-1], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
st.plotly_chart(fig, use_container_width=True)
else:
st.error("No columns matching the entered pattern were found.")
elif tabulation_option == "Multivariate":
st.subheader("Multivariate Analysis")
var1 = st.text_input("Please enter the name of the first column:")
var2 = st.text_input("Please enter the name of the second column:")
if var1 and var2:
type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1')
type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer"], key='type2')
if type1 == "Single answer" and type2 == "Single answer":
percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2)
st.subheader("Percentage Table")
st.write(z_test_data(percentile_df))
st.subheader("Frequency Table")
st.dataframe(frequency_df)
row, col = df.shape
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3)
st.plotly_chart(fig, use_container_width=True)
elif type1 == "Single answer" and type2 == "Multiple answer":
matching_cols = [col for col in df.columns if is_matching_pattern(col, var2)]
if matching_cols:
percentile_df, frequency_df = two_variable_sm(df[[var1] + matching_cols], var1, matching_cols)
st.subheader("Percentage Table")
st.write(z_test_data(percentile_df))
st.subheader("Frequency Table")
st.dataframe(frequency_df)
row, col = df.shape
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3)
st.plotly_chart(fig, use_container_width=True)
else:
st.error("No columns matching the entered pattern were found.")
elif type1 == "Multiple answer" and type2 == "Multiple answer":
matching_cols1 = [col for col in df.columns if is_matching_pattern(col, var1)]
matching_cols2 = [col for col in df.columns if is_matching_pattern(col, var2)]
if matching_cols1 and matching_cols2:
percentile_df, frequency_df = two_variable_mm(df[matching_cols1 + matching_cols2], matching_cols1, matching_cols2)
st.subheader("Percentage Table")
st.write(z_test_data(percentile_df))
st.subheader("Frequency Table")
st.dataframe(frequency_df)
row, col = df.shape
fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=col*40, height=row*3)
st.plotly_chart(fig, use_container_width=True)
else:
st.error("No columns matching the entered pattern were found.")
else:
st.info("This section of the program is under development.")
except Exception as e:
st.error(f"❌ Error reading the Excel file: {e}")
elif main_option == "Hypothesis test":
st.header("Hypothesis Testing")
hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"])
if hypothesis_option != "Z test":
st.info("This section of the program is under development.")
else:
uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"])
if uploaded_file:
result = analyze_z_test(uploaded_file)
if result:
st.success("Z-Test analysis completed successfully.")
elif main_option in ["Machine Learning", "Coding"]:
st.info("This section of the program is under development.")