AMKhakbaz committed on
Commit
74cbf4e
·
verified ·
1 Parent(s): dc9de67

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -0
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ from scipy.stats import norm
6
+
7
+ # Define your helper functions
8
def is_matching_pattern(column, prefix, max_digits=3):
    """Return True when *column* has the form ``<prefix>_<digits>``.

    The caller uses this to collect the sub-columns of a multiple-answer
    question (for the prefix ``Q5``: ``Q5_1``, ``Q5_2``, ...).

    Parameters:
    - column: Column label to test. Non-string labels (pandas allows e.g.
      integer headers) never match instead of raising AttributeError.
    - prefix: Text that must precede the underscore.
    - max_digits: Maximum number of digits allowed after the underscore.
      Defaults to 3, the limit that used to be hard-coded.

    Returns:
    - bool: True if the text after ``prefix + '_'`` is 1 to ``max_digits``
      digit characters, otherwise False.
    """
    if not isinstance(column, str):
        return False

    head = prefix + '_'
    if not column.startswith(head):
        return False

    suffix = column[len(head):]
    return 1 <= len(suffix) <= max_digits and suffix.isdigit()
15
+
16
def multi_answer(df):
    """Build a frequency table for a multiple-answer question.

    Parameters:
    - df: DataFrame holding only the question's sub-columns. Each column is
      presumably one answer option that contains its own code where chosen
      and NaN elsewhere (verify against the data files).

    Returns:
    - DataFrame with columns ``Value`` (the answer code as a string),
      ``Frequency`` (how often that code occurs in its column) and
      ``Percentile`` (frequency divided by the number of rows that have at
      least one non-null answer), sorted by ``Value`` and followed by a
      final ``Sample_size`` row.

    Columns that are entirely empty are skipped; previously they raised
    IndexError. If a column contains more than one distinct value, the most
    frequent one is reported (previously an arbitrary element of a set).
    """
    # Respondents who answered at least one option form the base.
    sample_size = len(df.dropna(how='all'))

    frequency = {}
    for column in df.columns:
        counts = df[column].value_counts()  # NaN is excluded by default
        if counts.empty:
            continue
        frequency[str(counts.index[0])] = counts.iloc[0]

    counts_list = list(frequency.values())
    frequency_dataframe = pd.DataFrame({
        "Value": list(frequency.keys()),
        "Frequency": counts_list,
        # sample_size is > 0 whenever at least one column produced a count.
        "Percentile": [count / sample_size for count in counts_list],
    }).sort_values(by='Value')
    frequency_dataframe.loc[len(frequency_dataframe)] = ['Sample_size', sample_size, 1]
    return frequency_dataframe
29
+
30
def single_answer(df):
    """Build a frequency table for a single-answer question.

    Parameters:
    - df: The question's column (a pandas Series, despite the name).

    Returns:
    - DataFrame with columns ``Value``, ``Frequency`` and ``Percentage``
      (share of the non-null answers, in percent), sorted by ``Value`` and
      followed by a final ``Sample_size`` row holding the number of
      non-null answers.
    """
    counter = df.value_counts()
    frequency_dataframe = pd.DataFrame({
        'Value': counter.index,
        'Frequency': counter.values,
        'Percentage': (counter.values / counter.sum()) * 100,
    }).sort_values(by='Value')
    # The Percentage column is expressed in percent, so the whole sample is
    # 100 (it used to be 1, which is only right for a 0-1 fraction).
    frequency_dataframe.loc[len(frequency_dataframe)] = ['Sample_size', len(df.dropna()), 100]
    return frequency_dataframe
39
+
40
def two_variable_ss(df, var1, var2):
    """Cross-tabulate two single-answer columns.

    Parameters:
    - df: DataFrame containing both columns.
    - var1: Name of the column whose values become the table rows.
    - var2: Name of the column whose values become the table columns.

    Returns:
    - percentile_dataframe: the cross-tab counts divided by their column
      sums (column proportions).
    - frequency_dataframe: the cross-tab counts with an extra
      ``Sample_size`` row (non-null count of each ``var2`` value) and an
      extra ``Sample_size`` column (non-null count of each ``var1`` value;
      the corner cell is the number of non-null ``var1`` answers).

    The marginal totals are matched to the table by label. They used to be
    assigned positionally, which raised ValueError (or misaligned) whenever
    a category occurred only in rows where the other variable was missing.
    """
    counter = df.groupby(var1)[var2].value_counts()
    frequency_dataframe = counter.unstack(fill_value=0)

    column_sums = frequency_dataframe.sum(axis=0)
    percentile_dataframe = frequency_dataframe.div(column_sums, axis=1)

    # Capture the labels before the summary row/column are appended.
    row_labels = frequency_dataframe.index
    column_labels = frequency_dataframe.columns
    column_totals = df[var2].value_counts().reindex(column_labels, fill_value=0)
    row_totals = df[var1].value_counts().reindex(row_labels, fill_value=0)

    frequency_dataframe.loc['Sample_size'] = column_totals.tolist()
    frequency_dataframe['Sample_size'] = row_totals.tolist() + [len(df[var1].dropna())]

    return percentile_dataframe, frequency_dataframe
51
+
52
+ # Functions related to Z-Test
53
def read_excel_sheets(file):
    """Read every sheet of an Excel workbook.

    Parameters:
    - file: Path or file-like object accepted by ``pd.ExcelFile``.

    Returns:
    - dict mapping each sheet name to its DataFrame, or None if the
      workbook could not be read (the error is shown via ``st.error``).
    """
    try:
        # The context manager closes the workbook reader once all sheets
        # are parsed; it was previously left open.
        with pd.ExcelFile(file) as xls:
            return {sheet: xls.parse(sheet) for sheet in xls.sheet_names}
    except Exception as e:
        st.error(f"❌ Error reading Excel file: {e}")
        return None
62
+
63
def z_testes(n1, n2, p1, p2):
    """Two-sided pooled Z-test for the difference of two proportions.

    Parameters:
    - n1, n2: Sample sizes of the two groups.
    - p1, p2: Observed proportions of the two groups. The pooled-variance
      formula only yields a real standard error for values in [0, 1].

    Returns:
    - The two-sided p-value, or NaN when the test is undefined
      (non-positive sample size, or a zero / non-finite standard error).
    """
    # NumPy scalars do not raise ZeroDivisionError on division by zero, so
    # the degenerate cases are checked explicitly rather than caught.
    if n1 <= 0 or n2 <= 0:
        return np.nan

    pooled_p = (n1 * p1 + n2 * p2) / (n1 + n2)
    with np.errstate(invalid='ignore'):
        se = np.sqrt(pooled_p * (1 - pooled_p) * (1 / n1 + 1 / n2))
    if not np.isfinite(se) or se == 0:
        return np.nan

    z = (p1 - p2) / se
    # The survival function keeps precision in the tail, where 1 - cdf
    # rounds to exactly 0.
    return 2 * norm.sf(abs(z))
73
+
74
def Z_test_dataframes(sheets_data):
    """Compute a table of Z-test p-values for every sheet.

    For each sheet the first column becomes the index. The last row is
    passed to ``z_testes`` as the sample sizes and every other row as the
    proportions. The first remaining column is the reference group: each
    other column is tested against it, row by row.

    Parameters:
    - sheets_data: dict mapping sheet names to DataFrames.

    Returns:
    - dict mapping sheet names to DataFrames of p-values (rows: all but the
      last row; columns: all but the reference column). Empty sheets and
      sheets with fewer than two data columns are skipped with a warning.
    """
    results = {}
    for sheet_name, sheet in sheets_data.items():
        if sheet.empty:
            st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.")
            continue

        table = sheet.set_index(sheet.columns[0])  # first column holds the row labels
        if table.shape[1] < 2:
            st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.")
            continue

        tested_rows = table.index[:-1]
        tested_cols = table.columns[1:]
        p_values = pd.DataFrame(index=tested_rows, columns=tested_cols)

        for r, row_name in enumerate(tested_rows):
            # Positions in `table` are shifted by one from `p_values`
            # because the reference column is dropped from the result.
            for c, col_name in enumerate(tested_cols, start=1):
                try:
                    p_values.iloc[r, c - 1] = z_testes(
                        table.iloc[-1, 0],  # reference sample size
                        table.iloc[-1, c],  # compared sample size
                        table.iloc[r, 0],   # reference proportion
                        table.iloc[r, c],   # compared proportion
                    )
                except Exception as e:
                    st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}")
                    p_values.iloc[r, c - 1] = np.nan

        results[sheet_name] = p_values

    return results
103
+
104
def _color_p_value(val):
    """Return the CSS background colour for one p-value cell."""
    try:
        if pd.isna(val):
            return 'background-color: lightgray'
        if val < 0.05:
            return 'background-color: lightgreen'
        return 'background-color: lightcoral'
    except (TypeError, ValueError):
        # Cells that cannot be compared with a number are shown as missing.
        return 'background-color: lightgray'


def analyze_z_test(file):
    """
    Performs Z-Test analysis on the uploaded Excel file and renders the
    colour-coded result tables in the Streamlit page.

    Parameters:
    - file: Uploaded Excel file

    Returns:
    - result_dataframes: Dictionary of DataFrames with p-values, or None
      when the file could not be read or no sheet could be analysed
    """
    sheets_data = read_excel_sheets(file)
    if sheets_data is None:
        return None

    result_dataframes = Z_test_dataframes(sheets_data)

    if not result_dataframes:
        st.error("❌ No valid sheets found for Z-Test analysis.")
        return None

    st.write("### 📈 Processed Tables with Z-Test Results")
    for sheet_name, df in result_dataframes.items():
        st.write(f"#### Sheet: {sheet_name}")

        # Styler.applymap is deprecated in favour of Styler.map; use the
        # new name when this pandas version has it, else fall back.
        styler = df.style
        apply_cellwise = getattr(styler, "map", None) or styler.applymap
        styled_df = apply_cellwise(_color_p_value)

        # Display the styled DataFrame
        st.dataframe(styled_df, use_container_width=True)

    return result_dataframes
146
+
147
+ # Streamlit User Interface
148
def _without_summary_row(result_df):
    """Drop the trailing 'Sample_size' row so charts show real categories only."""
    return result_df[result_df['Value'] != 'Sample_size']


def _render_univariate(df):
    """Render the single-answer / multiple-answer frequency analysis."""
    uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer"])

    if uni_option == "Single answer":
        var = st.text_input("Please enter the name of the desired column:")
        if not var:
            return
        if var not in df.columns:
            st.error("The entered column was not found.")
            return

        result_df = single_answer(df[var])
        st.subheader("Univariate Analysis Results")
        st.dataframe(result_df)

        fig = px.bar(_without_summary_row(result_df), x='Value', y='Percentage', title='Percentage Histogram')
        st.plotly_chart(fig, use_container_width=True)
    elif uni_option == "Multiple answer":
        var = st.text_input("Please enter the name of the desired column:")
        if not var:
            return
        # A multiple-answer question is spread over columns named <var>_<n>.
        matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
        if not matching_cols:
            st.error("No columns matching the entered pattern were found.")
            return

        result_df = multi_answer(df[matching_cols])
        st.subheader("Multiple Answer Analysis Results")
        st.dataframe(result_df)

        fig = px.bar(_without_summary_row(result_df), x='Value', y='Percentile', title='Percentile Histogram')
        st.plotly_chart(fig, use_container_width=True)


def _render_multivariate(df):
    """Render the two-variable cross-tabulation."""
    st.subheader("Multivariate Analysis")
    var1 = st.text_input("Please enter the name of the first column:")
    var2 = st.text_input("Please enter the name of the second column:")

    if not (var1 and var2):
        return
    if var1 not in df.columns or var2 not in df.columns:
        st.error("One or both of the entered columns were not found.")
        return

    type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1')
    type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer"], key='type2')

    if type1 == "Single answer" and type2 == "Single answer":
        percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2)
        st.subheader("Percentile Table")
        st.dataframe(percentile_df)

        st.subheader("Frequency Table")
        st.dataframe(frequency_df)

        fig = px.imshow(percentile_df, text_auto=True, title='Percentile Heatmap')
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("This section of the program is under development.")


def _render_tabulation(df):
    """Show a preview of the data and the selected tabulation analysis."""
    st.subheader("Displaying the first few rows of the DataFrame")
    st.dataframe(df.head())

    tabulation_option = st.selectbox("Please select the type of analysis:", ["All", "Univariate", "Multivariate"])

    if tabulation_option == "All":
        st.info("This section of the program is under development.")
    elif tabulation_option == "Univariate":
        _render_univariate(df)
    elif tabulation_option == "Multivariate":
        _render_multivariate(df)


st.title("Data Analysis Application")

# Main options
main_option = st.selectbox("Please select an option:", ["Tabulation", "Hypothesis test", "Machine Learning", "Coding"])

if main_option == "Tabulation":
    st.header("Tabulation Analysis")
    uploaded_file = st.file_uploader("Please upload your Excel file", type=["xlsx", "xls"])
    if uploaded_file:
        # Reading and analysing are guarded separately so that an analysis
        # failure is no longer reported as a file-reading problem.
        try:
            df = pd.read_excel(uploaded_file)
        except Exception as e:
            st.error(f"❌ Error reading the Excel file: {e}")
        else:
            try:
                _render_tabulation(df)
            except Exception as e:
                st.error(f"❌ Error while analysing the data: {e}")

elif main_option == "Hypothesis test":
    st.header("Hypothesis Testing")
    hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"])

    if hypothesis_option != "Z test":
        st.info("This section of the program is under development.")
    else:
        uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"])
        if uploaded_file:
            result = analyze_z_test(uploaded_file)
            if result:
                st.success("Z-Test analysis completed successfully.")

elif main_option in ["Machine Learning", "Coding"]:
    st.info("This section of the program is under development.")