Spaces:

AMKhakbaz
/

AMKAPP

Running

App Files Files Community

AMKAPP / app.py

AMKhakbaz

Update app.py

8fb73ac verified 3 months ago

raw

history blame

13.3 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from scipy.stats import norm

	def figo(plot_type, df, title, xlabel=None, ylabel=None, legend_title=None, colorscale='Plotly3'):

	if plot_type == "Heatmap":
	df = df.apply(pd.to_numeric, errors='coerce')

	fig = go.Figure(data=go.Heatmap(
	z=df.values,
	x=df.columns,
	y=df.index,
	hoverongaps=False,
	colorscale=colorscale
	))

	elif plot_type == "Bar":
	fig = go.Figure()
	col = df.name
	fig.add_trace(go.Bar(
	x=df.index,
	y=df,
	name=col
	))

	fig.update_layout(barmode='group')

	else:
	raise ValueError("Invalid plot_type. Supported types are 'Heatmap' and 'Bar'.")

	fig.update_layout(
	title={
	'text': title,
	'y':0.95,
	'x':0.5,
	'xanchor': 'center',
	'yanchor': 'top'
	},
	xaxis_title=xlabel,
	yaxis_title=ylabel,
	legend_title=legend_title,
	template="plotly_white"
	)

	return fig

	def is_matching_pattern(column, prefix):
	if not column.startswith(prefix + '_'):
	return False
	suffix = column[len(prefix) + 1:]
	if 1 <= len(suffix) <= 3 and suffix.isdigit():
	return True
	return False


	def multi_answer(df):
	friquency = {}
	for i in df.columns:
	unique_values = list(set(df[i].dropna()))[0]
	friquency[str(unique_values)] = df[i].value_counts().get(unique_values, 0)

	friquency_dataframe = pd.DataFrame(
	{"Value": friquency.keys(),
	"Friquency": friquency.values(),
	"Percentage": np.array(list(friquency.values()))/len(df.dropna(how='all'))*100}).sort_values(by='Value')
	friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna(how='all')), 1]
	return friquency_dataframe


	def single_answer(df):
	counter = df.value_counts()
	friquency_dataframe = pd.DataFrame({
	'Value': counter.index,
	'Frequency': counter.values,
	'Percentage': (counter.values / counter.sum()) * 100}).sort_values(by='Value')
	friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna()), 1]
	return friquency_dataframe

	def two_variable_ss(df, var1, var2):

	counter = df.groupby(var1)[var2].value_counts()
	friquency_dataframe = counter.unstack(fill_value=0)

	column_sums = friquency_dataframe.sum(axis=0)
	percentage_dataframe = friquency_dataframe.div(column_sums, axis=1)

	friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1])[:-1]
	friquency_dataframe['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])

	return percentage_dataframe, friquency_dataframe

	def two_variable_sm(df, var1, var2):
	unique_values = list(set(df[var1].dropna()))
	value = multi_answer(df[var2]).iloc[:-1,0]
	friquency_dataframe, percentage_dataframe = {}, {}

	for i in unique_values:
	dataframe = multi_answer(df[df[var1] == i][var2]).iloc[:-1,:]
	friquency_dataframe[i], percentage_dataframe[i] = dataframe['Friquency'], dataframe['Percentage']

	friquency_dataframe = pd.DataFrame(friquency_dataframe)
	percentage_dataframe = pd.DataFrame(percentage_dataframe)

	friquency_dataframe.index, percentage_dataframe.index = value, value

	friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])[:-1]
	friquency_dataframe['Sample_size'] = list(multi_answer(df[var2]).iloc[:,1])
	percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])[:-1]
	percentage_dataframe['Sample_size'] = list(multi_answer(df[var2]).iloc[:,1])

	return percentage_dataframe, friquency_dataframe

	# Functions related to Z-Test
	def read_excel_sheets(file):
	"""Reads an Excel file with multiple sheets and returns a dictionary of DataFrames."""
	try:
	xls = pd.ExcelFile(file)
	sheets_data = {sheet: xls.parse(sheet) for sheet in xls.sheet_names}
	return sheets_data
	except Exception as e:
	st.error(f"❌ Error reading Excel file: {e}")
	return None

	def z_testes(n1, n2, p1, p2):
	"""Performs Z-test for proportions and returns p-value."""
	try:
	pooled_p = (n1 * p1 + n2 * p2) / (n1 + n2)
	se = np.sqrt(pooled_p * (1 - pooled_p) * (1 / n1 + 1 / n2))
	z = (p1 - p2) / se
	p_value = 2 * (1 - norm.cdf(abs(z)))
	return p_value
	except ZeroDivisionError:
	return np.nan

	def Z_test_dataframes(sheets_data):
	"""Processes each sheet's DataFrame and computes new DataFrames with Z-test results."""
	result_dataframes = {}
	for sheet_name, df in sheets_data.items():
	if df.empty:
	st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.")
	continue
	df = df.set_index(df.columns[0]) # Use the first column as index
	rows, cols = df.shape
	if cols < 2:
	st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.")
	continue
	new_df = pd.DataFrame(index=df.index[:-1], columns=df.columns[1:])
	for i, row_name in enumerate(df.index[:-1]):
	for j, col_name in enumerate(df.columns[1:]):
	try:
	n1 = df.iloc[-1, 0] # x_I1
	n2 = df.iloc[-1, j+1] # x_Ij
	p1 = df.iloc[i, 0] # x_1J
	p2 = df.iloc[i, j+1] # x_ij
	p_value = z_testes(n1, n2, p1, p2)
	new_df.iloc[i, j] = p_value
	except Exception as e:
	st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}")
	new_df.iloc[i, j] = np.nan

	result_dataframes[sheet_name] = new_df

	return result_dataframes

	def analyze_z_test(file):
	"""
	Performs Z-Test analysis on the uploaded Excel file.

	Parameters:
	- file: Uploaded Excel file

	Returns:
	- result_dataframes: Dictionary of DataFrames with p-values
	"""
	sheets_data = read_excel_sheets(file)
	if sheets_data is None:
	return None

	result_dataframes = Z_test_dataframes(sheets_data)

	if not result_dataframes:
	st.error("❌ No valid sheets found for Z-Test analysis.")
	return None

	st.write("### 📈 Processed Tables with Z-Test Results")
	for sheet_name, df in result_dataframes.items():
	st.write(f"#### Sheet: {sheet_name}")

	# Apply color coding based on p-value
	def color_p_value(val):
	try:
	if pd.isna(val):
	return 'background-color: lightgray'
	elif val < 0.05:
	return 'background-color: lightgreen'
	else:
	return 'background-color: lightcoral'
	except:
	return 'background-color: lightgray'

	styled_df = df.style.applymap(color_p_value)

	# Display the styled DataFrame
	st.dataframe(styled_df, use_container_width=True)

	return result_dataframes

	# Streamlit User Interface
	st.title("Data Analysis Application")

	# Main options
	main_option = st.selectbox("Please select an option:", ["Tabulation", "Hypothesis test", "Machine Learning", "Coding"])

	if main_option == "Tabulation":
	st.header("Tabulation Analysis")
	uploaded_file = st.file_uploader("Please upload your Excel file", type=["xlsx", "xls"])
	if uploaded_file:
	try:
	df = pd.read_excel(uploaded_file)
	st.subheader("Displaying the first few rows of the DataFrame")
	st.dataframe(df.head())

	tabulation_option = st.selectbox("Please select the type of analysis:", ["All", "Univariate", "Multivariate"])

	if tabulation_option == "All":
	st.info("This section of the program is under development.")
	elif tabulation_option == "Univariate":
	uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer"])

	if uni_option == "Single answer":
	var = st.text_input("Please enter the name of the desired column:")
	if var:
	if var in df.columns:
	result_df = single_answer(df[var])
	st.subheader("Univariate Analysis Results")
	st.dataframe(result_df)

	fig = figo('Bar', result_df, title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.error("The entered column was not found.")
	elif uni_option == "Multiple answer":
	var = st.text_input("Please enter the name of the desired column:")
	if var:
	matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
	if matching_cols:
	subset_df = df[matching_cols]
	result_df = multi_answer(subset_df)
	st.subheader("Multiple Answer Analysis Results")
	st.dataframe(result_df)

	fig = figo('Bar', result_df, title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.error("No columns matching the entered pattern were found.")
	elif tabulation_option == "Multivariate":
	st.subheader("Multivariate Analysis")
	var1 = st.text_input("Please enter the name of the first column:")
	var2 = st.text_input("Please enter the name of the second column:")

	if var1 and var2:
	if var1 in df.columns and var2 in df.columns:
	type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1')
	type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer"], key='type2')

	if type1 == "Single answer" and type2 == "Single answer":
	percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2)
	st.subheader("Percentage Table")
	st.dataframe(percentile_df)

	st.subheader("Frequency Table")
	st.dataframe(frequency_df)

	fig = figo('Heatmap', percentile_df, title='Percentage Histogram', xlabel=var1, ylabel=var2, colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)

	elif type1 == "Single answer" and type2 == "Multiple answer":
	matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
	if matching_cols:
	percentile_df, frequency_df = two_variable_sm(df[[var1] + matching_cols], var1, matching_cols)
	st.subheader("Percentage Table")
	st.dataframe(percentile_df)

	st.subheader("Frequency Table")
	st.dataframe(frequency_df)

	fig = figo('Heatmap', percentile_df, title='Percentage Histogram', xlabel=var1, ylabel=var2, colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)

	else:
	st.error("No columns matching the entered pattern were found.")
	else:
	st.info("This section of the program is under development.")
	else:
	st.error("One or both of the entered columns were not found.")
	except Exception as e:
	st.error(f"❌ Error reading the Excel file: {e}")

	elif main_option == "Hypothesis test":
	st.header("Hypothesis Testing")
	hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"])

	if hypothesis_option != "Z test":
	st.info("This section of the program is under development.")
	else:
	uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"])
	if uploaded_file:
	result = analyze_z_test(uploaded_file)
	if result:
	st.success("Z-Test analysis completed successfully.")

	elif main_option in ["Machine Learning", "Coding"]:
	st.info("This section of the program is under development.")