Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import plotly.express as px
|
5 |
+
from scipy.stats import norm
|
6 |
+
|
7 |
+
# Define your helper functions
|
8 |
+
def is_matching_pattern(column, prefix):
    """Tell whether *column* is named ``<prefix>_<digits>``.

    The part after ``prefix + '_'`` must consist of one to three characters,
    all of which satisfy ``str.isdigit``.

    Parameters:
    - column: Column name to test (a string).
    - prefix: Question prefix the column name must start with.

    Returns:
    - True when the name matches the pattern, otherwise False.
    """
    marker = prefix + '_'
    if column.startswith(marker):
        tail = column[len(marker):]
        # An empty tail fails isdigit(), so "<prefix>_" alone is rejected.
        return tail.isdigit() and 1 <= len(tail) <= 3
    return False
|
15 |
+
|
16 |
+
def multi_answer(df):
    """Tabulate a multiple-answer question stored as one column per option.

    For every column the first non-missing value is taken as that option's
    code, and the number of rows holding that code is its frequency. The
    sample size is the number of rows with at least one non-missing value.

    Parameters:
    - df: DataFrame containing only the option columns of one question.

    Returns:
    - DataFrame with columns 'Value' (option code as a string), 'Frequency'
      and 'Percentile' (frequency divided by the sample size, i.e. a fraction
      rather than a 0-100 percentage), sorted by 'Value' and followed by a
      final 'Sample_size' row.
    """
    sample_size = len(df.dropna(how='all'))

    frequency = {}
    for name in df.columns:
        answered = df[name].dropna()
        if answered.empty:
            # Nobody chose this option, so its code cannot be read from the
            # data; report it under the column name instead of raising.
            frequency[str(name)] = 0
            continue
        # Take the first observed value (deterministic, unlike set ordering).
        code = answered.iloc[0]
        frequency[str(code)] = int((answered == code).sum())

    counts = np.array(list(frequency.values()), dtype=float)
    # Avoid dividing by zero when no row contains any answer.
    shares = counts / sample_size if sample_size else np.zeros_like(counts)

    frequency_dataframe = pd.DataFrame({
        "Value": list(frequency.keys()),
        "Frequency": list(frequency.values()),
        "Percentile": shares,
    }).sort_values(by='Value')
    frequency_dataframe.loc[len(frequency_dataframe)] = ['Sample_size', sample_size, 1]
    return frequency_dataframe
|
29 |
+
|
30 |
+
def single_answer(df):
    """Build a frequency table for a single-answer variable.

    Parameters:
    - df: The answers to tabulate. Callers in this file pass one DataFrame
      column (a Series); missing values are ignored.

    Returns:
    - DataFrame with columns 'Value', 'Frequency' and 'Percentage' (0-100),
      sorted by 'Value' and followed by a final 'Sample_size' row holding the
      number of non-missing answers.
    """
    counter = df.value_counts()
    total = counter.sum()

    frequency_dataframe = pd.DataFrame({
        'Value': counter.index,
        'Frequency': counter.values,
        'Percentage': (counter.values / total) * 100,
    }).sort_values(by='Value')

    # The summary row covers the whole sample, so on this column's 0-100
    # scale its share is 100 (not 1, which would read as one percent).
    frequency_dataframe.loc[len(frequency_dataframe)] = ['Sample_size', len(df.dropna()), 100.0]
    return frequency_dataframe
|
39 |
+
|
40 |
+
def two_variable_ss(df, var1, var2):
    """Cross-tabulate two single-answer variables.

    Parameters:
    - df: DataFrame containing both columns.
    - var1: Column whose values become the rows of the tables.
    - var2: Column whose values become the columns of the tables.

    Returns:
    - percentile_dataframe: Each cross-tab count divided by its column total
      (a fraction, not a 0-100 percentage).
    - frequency_dataframe: The cross-tab counts plus a 'Sample_size' row
      (non-missing count of every var2 value) and a 'Sample_size' column
      (non-missing count of every var1 value; the corner cell is the number
      of non-missing var1 answers).
    """
    counter = df.groupby(var1)[var2].value_counts()
    frequency_dataframe = counter.unstack(fill_value=0)

    column_sums = frequency_dataframe.sum(axis=0)
    percentile_dataframe = frequency_dataframe.div(column_sums, axis=1)

    # Align the marginal counts on labels rather than on position: a value
    # that only occurs next to a missing value of the other variable is
    # absent from the cross-tab, and a positional list would then have the
    # wrong length (or silently shift every following entry).
    column_sizes = df[var2].value_counts().reindex(frequency_dataframe.columns, fill_value=0)
    row_sizes = df[var1].value_counts().reindex(frequency_dataframe.index, fill_value=0)
    answered_var1 = int(df[var1].notna().sum())

    frequency_dataframe.loc['Sample_size'] = list(column_sizes)
    frequency_dataframe['Sample_size'] = list(row_sizes) + [answered_var1]

    return percentile_dataframe, frequency_dataframe
|
51 |
+
|
52 |
+
# Functions related to Z-Test
|
53 |
+
def read_excel_sheets(file):
    """Read every sheet of an Excel workbook.

    Parameters:
    - file: Path or file-like object accepted by ``pd.ExcelFile``.

    Returns:
    - Dictionary mapping sheet name to its DataFrame, or None when the
      workbook cannot be read (the error is shown with ``st.error``).
    """
    try:
        # The context manager closes the workbook once all sheets are parsed,
        # including when parsing one of them fails.
        with pd.ExcelFile(file) as xls:
            return {sheet: xls.parse(sheet) for sheet in xls.sheet_names}
    except Exception as e:
        st.error(f"❌ Error reading Excel file: {e}")
        return None
|
62 |
+
|
63 |
+
def z_testes(n1, n2, p1, p2):
    """Two-sided pooled Z-test for the difference between two proportions.

    Parameters:
    - n1, n2: Sample sizes of the two groups.
    - p1, p2: Observed proportions of the two groups.

    Returns:
    - The two-sided p-value, or NaN when the test is undefined (a
      non-positive or missing sample size, or a zero/invalid standard error).
    """
    # Guard explicitly: numpy scalars do not raise ZeroDivisionError, they
    # emit a warning and yield inf/nan, so an exception handler is unreliable.
    if not (n1 > 0 and n2 > 0):
        return np.nan

    pooled_p = (n1 * p1 + n2 * p2) / (n1 + n2)
    variance = pooled_p * (1 - pooled_p) * (1 / n1 + 1 / n2)
    # Also covers NaN (comparison is False) and a pooled proportion of 0 or 1.
    if not variance > 0:
        return np.nan

    z = (p1 - p2) / np.sqrt(variance)
    # The survival function keeps precision in the tail, where 1 - cdf
    # rounds to exactly 0.
    return 2 * norm.sf(abs(z))
|
73 |
+
|
74 |
+
def Z_test_dataframes(sheets_data):
    """Compute a table of Z-test p-values for every usable sheet.

    Each sheet's first column becomes the index. Within the remaining table
    the last row supplies the ``n`` arguments and every other row the ``p``
    arguments passed to ``z_testes``; the first data column is compared with
    each of the other columns in turn.

    Parameters:
    - sheets_data: Dictionary mapping sheet name to its DataFrame.

    Returns:
    - Dictionary mapping sheet name to a DataFrame of p-values (one row per
      non-final row, one column per non-first data column). Empty sheets and
      sheets with fewer than two data columns are skipped with a warning.
    """
    result_dataframes = {}

    for sheet_name, raw in sheets_data.items():
        if raw.empty:
            st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.")
            continue

        # Use the first column as index
        table = raw.set_index(raw.columns[0])
        if table.shape[1] < 2:
            st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.")
            continue

        body_rows = table.index[:-1]
        compared_cols = table.columns[1:]
        p_values = pd.DataFrame(index=body_rows, columns=compared_cols)

        for r, row_name in enumerate(body_rows):
            # Start at 1 so that c addresses the column inside `table`.
            for c, col_name in enumerate(compared_cols, start=1):
                try:
                    p_values.iloc[r, c - 1] = z_testes(
                        table.iloc[-1, 0],  # n of the first data column
                        table.iloc[-1, c],  # n of the compared column
                        table.iloc[r, 0],   # p of the first data column
                        table.iloc[r, c],   # p of the compared column
                    )
                except Exception as e:
                    st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}")
                    p_values.iloc[r, c - 1] = np.nan

        result_dataframes[sheet_name] = p_values

    return result_dataframes
|
103 |
+
|
104 |
+
def _p_value_cell_style(val):
    """Return the CSS background for one p-value cell.

    Missing or non-comparable values are gray, values below 0.05 are green
    and all other values are coral.
    """
    try:
        if pd.isna(val):
            return 'background-color: lightgray'
        if val < 0.05:
            return 'background-color: lightgreen'
        return 'background-color: lightcoral'
    except (TypeError, ValueError):
        # Non-numeric or array-like cell: cannot be compared with 0.05.
        return 'background-color: lightgray'


def analyze_z_test(file):
    """
    Performs Z-Test analysis on the uploaded Excel file and displays the
    colour-coded p-value table of every processed sheet.

    Parameters:
    - file: Uploaded Excel file

    Returns:
    - result_dataframes: Dictionary of DataFrames with p-values, or None when
      the file cannot be read or no sheet could be processed
    """
    sheets_data = read_excel_sheets(file)
    if sheets_data is None:
        return None

    result_dataframes = Z_test_dataframes(sheets_data)
    if not result_dataframes:
        st.error("❌ No valid sheets found for Z-Test analysis.")
        return None

    st.write("### 📈 Processed Tables with Z-Test Results")
    for sheet_name, df in result_dataframes.items():
        st.write(f"#### Sheet: {sheet_name}")

        # Styler.applymap is deprecated in favour of Styler.map; fall back
        # to the old name on pandas versions that predate the rename.
        styler = df.style
        apply_cellwise = getattr(styler, "map", None) or styler.applymap
        styled_df = apply_cellwise(_p_value_cell_style)

        # Display the styled DataFrame
        st.dataframe(styled_df, use_container_width=True)

    return result_dataframes
|
146 |
+
|
147 |
+
# Streamlit User Interface
|
148 |
+
def _render_univariate(df):
    """Draw the univariate controls and, once a column is named, its table and chart."""
    uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer"])
    var = st.text_input("Please enter the name of the desired column:")
    if not var:
        return

    if uni_option == "Single answer":
        if var not in df.columns:
            st.error("The entered column was not found.")
            return
        result_df = single_answer(df[var])
        st.subheader("Univariate Analysis Results")
        st.dataframe(result_df)

        fig = px.bar(result_df, x='Value', y='Percentage', title='Percentage Histogram')
        st.plotly_chart(fig, use_container_width=True)
    elif uni_option == "Multiple answer":
        # The entered name is a prefix; option columns are "<prefix>_<digits>".
        matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
        if not matching_cols:
            st.error("No columns matching the entered pattern were found.")
            return
        result_df = multi_answer(df[matching_cols])
        st.subheader("Multiple Answer Analysis Results")
        st.dataframe(result_df)

        fig = px.bar(result_df, x='Value', y='Percentile', title='Percentile Histogram')
        st.plotly_chart(fig, use_container_width=True)


def _render_multivariate(df):
    """Draw the two-variable controls and, for two single-answer columns, the cross-tab output."""
    st.subheader("Multivariate Analysis")
    var1 = st.text_input("Please enter the name of the first column:")
    var2 = st.text_input("Please enter the name of the second column:")
    if not (var1 and var2):
        return

    if var1 not in df.columns or var2 not in df.columns:
        st.error("One or both of the entered columns were not found.")
        return

    type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1')
    type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer"], key='type2')

    if type1 == "Single answer" and type2 == "Single answer":
        percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2)
        st.subheader("Percentile Table")
        st.dataframe(percentile_df)

        st.subheader("Frequency Table")
        st.dataframe(frequency_df)

        fig = px.imshow(percentile_df, text_auto=True, title='Percentile Heatmap')
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("This section of the program is under development.")


def _render_tabulation(df):
    """Preview the uploaded data and dispatch to the selected tabulation mode."""
    st.subheader("Displaying the first few rows of the DataFrame")
    st.dataframe(df.head())

    tabulation_option = st.selectbox("Please select the type of analysis:", ["All", "Univariate", "Multivariate"])

    if tabulation_option == "All":
        st.info("This section of the program is under development.")
    elif tabulation_option == "Univariate":
        _render_univariate(df)
    elif tabulation_option == "Multivariate":
        _render_multivariate(df)


st.title("Data Analysis Application")

# Main options
main_option = st.selectbox("Please select an option:", ["Tabulation", "Hypothesis test", "Machine Learning", "Coding"])

if main_option == "Tabulation":
    st.header("Tabulation Analysis")
    uploaded_file = st.file_uploader("Please upload your Excel file", type=["xlsx", "xls"])
    if uploaded_file:
        # Keep reading and analysis in separate handlers so that a failure
        # in the analysis is not reported as a file-reading problem.
        try:
            df = pd.read_excel(uploaded_file)
        except Exception as e:
            st.error(f"❌ Error reading the Excel file: {e}")
        else:
            try:
                _render_tabulation(df)
            except Exception as e:
                st.error(f"❌ Error during analysis: {e}")

elif main_option == "Hypothesis test":
    st.header("Hypothesis Testing")
    hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"])

    if hypothesis_option != "Z test":
        st.info("This section of the program is under development.")
    else:
        uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"])
        if uploaded_file:
            result = analyze_z_test(uploaded_file)
            if result:
                st.success("Z-Test analysis completed successfully.")

elif main_option in ["Machine Learning", "Coding"]:
    st.info("This section of the program is under development.")
|