Spaces:

AMKhakbaz
/

AMKAPP

Running

App Files Files Community

AMKAPP / app.py

AMKhakbaz

Update app.py

acf67b7 verified 2 days ago

raw

history blame contribute delete

38.1 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from scipy.stats import norm, t
	from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
	import plotly.figure_factory as ff
	from huggingface_hub import InferenceClient
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA
	import json
	import math
	from transformers import pipeline

	classifier = pipeline('zero-shot-classification', model='MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33')

	def sorting(df):
	df.index = list(map(float, df.index))
	df = df.sort_index

	return df

	def edit_strings(string_list):
	edited_list = []
	for string in string_list:
	if "_" in string:
	last_underscore_index = string.rfind("_")
	edited_string = string[:last_underscore_index]
	edited_list.append(edited_string)
	else:
	edited_list.append(string)
	return edited_list

	def equalize_list_lengths(input_dict):
	max_len = 0
	for key in input_dict:
	max_len = max(max_len, len(input_dict[key]))

	for key in input_dict:
	while len(input_dict[key]) < max_len:
	input_dict[key].append(None)

	return pd.DataFrame(input_dict)

	def figo(plot_type, df, title, xlabel=None, ylabel=None, legend_title=None, colorscale='Plotly3', width=800, height=600):
	if plot_type == "Scatter":
	fig = go.Figure()

	for column in df.columns[0:]:
	df.index = list(map(float, list(df.index)))
	sorted_data = df.sort_index()
	fig.add_trace(go.Scatter(
	x=sorted_data[column],
	y=sorted_data.index,
	mode='lines+markers+text',
	name=column,
	text=sorted_data[column].round(2),
	textposition="middle right"
	))

	fig.update_layout(
	title=title,
	xaxis_title="Percentage",
	yaxis_title="Category",
	yaxis={'categoryorder': 'array', 'categoryarray': sorted_data.index},
	width=width,
	height=height
	)

	elif plot_type == "Heatmap":
	df = df.apply(pd.to_numeric, errors='coerce')

	fig = go.Figure(data=go.Heatmap(
	z=df.values,
	x=df.columns,
	y=df.index,
	hoverongaps=False,
	colorscale=colorscale
	))

	fig.update_layout(
	title={
	'text': title,
	'y':0.95,
	'x':0.5,
	'xanchor': 'center',
	'yanchor': 'top'
	},
	xaxis_title=xlabel,
	yaxis_title=ylabel,
	legend_title=legend_title,
	template="plotly_white",
	width=width,
	height=height
	)

	elif plot_type == "Bar":
	fig = go.Figure()
	col = df.name
	fig.add_trace(go.Bar(
	x=df.index,
	y=df,
	name=col
	))

	fig.update_layout(
	title={
	'text': title,
	'y':0.95,
	'x':0.5,
	'xanchor': 'center',
	'yanchor': 'top'
	},
	xaxis_title=xlabel,
	yaxis_title=ylabel,
	legend_title=legend_title,
	template="plotly_white",
	barmode='group',
	width=width,
	height=height
	)

	else:
	raise ValueError("Invalid plot_type. Supported types are 'Scatter', 'Heatmap', and 'Bar'.")

	return fig

	def is_matching_pattern(column, prefix):
	if not column.startswith(prefix + '_'):
	return False
	suffix = column[len(prefix) + 1:]
	if 1 <= len(suffix) <= 3 and suffix.isdigit():
	return True
	return False

	def multi_answer(df):
	friquency = {}
	for i in df.columns:
	try:
	unique_values = list(set(df[i].dropna()))[0]
	friquency[str(unique_values)] = df[i].value_counts().get(unique_values, 0)
	except Exception as e:
	#st.error(f"Warning: One of the data columns has no value.: {e}")
	friquency[i] = 0


	friquency_dataframe = pd.DataFrame({"Value": friquency.keys(), 'Frequency': friquency.values(), "Percentage": np.array(list(friquency.values()))/len(df.dropna(how='all'))}).sort_values(by='Value')
	friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna(how='all')), 1]
	return friquency_dataframe

	def single_answer(df):
	counter = df.value_counts()
	friquency_dataframe = pd.DataFrame({
	'Value': counter.index,
	'Frequency': counter.values,
	'Percentage': (counter.values / counter.sum())}).sort_values(by='Value')
	friquency_dataframe.loc[len(friquency_dataframe)] = ['Sample_size', len(df.dropna()), 1]
	return friquency_dataframe

	def score_answer(df):
	counter = df.value_counts().sort_index()

	friquency_dataframe = pd.DataFrame({
	'Value': list(counter.index)+["Meen", "Variance"],
	'Frequency': list(counter.values)+[df.mean(), df.var()],
	'Percentage': list((counter.values / counter.sum()))+["", ""]})

	return friquency_dataframe

	def two_variable_ss(df, var1, var2):

	counter = df.groupby(var1)[var2].value_counts()
	friquency_dataframe = counter.unstack(fill_value=0)

	#friquency_dataframe = sorting(friquency_dataframe)

	column_sums = friquency_dataframe.sum(axis=0)
	percentage_dataframe = friquency_dataframe.div(column_sums, axis=1)

	friquency_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,1])[:-1]
	friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1])
	percentage_dataframe['Total'] = list(single_answer(df[var1]).iloc[:,2])[:-1]
	percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var2]).iloc[:,1])

	return percentage_dataframe, friquency_dataframe

	def two_variable_sm(df, var1, var2):
	unique_values = list(set(df[var1].dropna()))
	value = multi_answer(df[var2]).iloc[:-1,0]
	friquency_dataframe, percentage_dataframe = {}, {}

	for i in unique_values:
	dataframe = multi_answer(df[df[var1] == i][var2]).iloc[:-1,:]
	friquency_dataframe[i], percentage_dataframe[i] = dataframe['Frequency'], dataframe['Percentage']

	friquency_dataframe = pd.DataFrame(friquency_dataframe)
	percentage_dataframe = pd.DataFrame(percentage_dataframe)

	friquency_dataframe.index, percentage_dataframe.index = value, value

	#friquency_dataframe = sorting(friquency_dataframe)
	#percentage_dataframe = sorting(percentage_dataframe)

	friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1]
	friquency_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])
	percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1]
	percentage_dataframe.loc['Sample_size'] = list(single_answer(df[var1]).iloc[:,1])


	return percentage_dataframe, friquency_dataframe

	def two_variable_mm(df, var1, var2):
	friquency_dataframe, percentage_dataframe = {}, {}
	value = multi_answer(df[var2]).iloc[:-1,0]

	for i in var1:
	unique_values = list(set(df[i].dropna()))[0]
	dataframe = multi_answer(df[df[i] == unique_values][var2]).iloc[:-1,:]
	friquency_dataframe[i], percentage_dataframe[i] = dataframe['Frequency'], dataframe['Percentage']

	friquency_dataframe = pd.DataFrame(friquency_dataframe)
	percentage_dataframe = pd.DataFrame(percentage_dataframe)

	friquency_dataframe.index, percentage_dataframe.index = value, value

	#friquency_dataframe = sorting(friquency_dataframe)
	#percentage_dataframe = sorting(percentage_dataframe)

	friquency_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,1])[:-1]
	friquency_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1])
	percentage_dataframe['Total'] = list(multi_answer(df[var2]).iloc[:,2])[:-1]
	percentage_dataframe.loc['Sample_size'] = list(multi_answer(df[var1]).iloc[:,1])

	return percentage_dataframe, friquency_dataframe

	def two_variable_ssc(df, var1, var2):
	unique_values = list(set(df[var1].dropna()))
	mean_dataframe = {'Mean': [], 'Variation': [], 'Frequency': []}
	for i in unique_values:
	d = df[df[var1] == i][var2]
	mean_dataframe['Mean'] += [d.mean()]
	mean_dataframe['Variation'] += [d.var()]
	mean_dataframe['Frequency'] += [len(d)]

	mean_dataframe = pd.DataFrame(mean_dataframe)
	mean_dataframe.index = unique_values

	mean_dataframe.loc['Total'] = [df[var2].mean(), df[var2].var(), len(df[var2])]

	return mean_dataframe

	def two_variable_msc(df, var1, var2):
	mean_dataframe, unique_values = {'Mean': [], 'Variation': [], 'Frequency': []}, []
	for i in var1:
	d = df[i].dropna()
	j = list(set(df[i].dropna()))[0]
	d = df[df[i]==j][var2]
	unique_values += [j]
	mean_dataframe['Mean'] += [d.mean()]
	mean_dataframe['Variation'] += [d.var()]
	mean_dataframe['Frequency'] += [len(d)]

	mean_dataframe = pd.DataFrame(mean_dataframe)
	mean_dataframe.index = unique_values

	mean_dataframe.loc['Total'] = [df[var2].mean(), df[var2].var(), len(df[var2])]

	return mean_dataframe

	def funnel(df, dictionary):
	friquency = {}
	for i in dictionary.keys():
	if dictionary[i] == "Single":
	friquency[i] = list(single_answer(df[i])['Frequency'])[:-1]

	elif dictionary[i] == "Multi":
	matching_cols = [col for col in df.columns if is_matching_pattern(col, i)]
	friquency[i] = list(multi_answer(df[matching_cols])['Frequency'])[:-1]

	elif dictionary[i] == "Score":
	friquency[i] = list(score_answer(df[i])['Frequency'])[:-1]

	try:
	friquency = pd.DataFrame(friquency)
	except:
	friquency = equalize_list_lengths(friquency)

	first = None
	for key, value in dictionary.items():
	if value == "Single":
	first = key
	break

	percentage = friquency/len(df[first])

	return friquency, percentage

	def t_test(m1, m2, n1, n2, v1, v2):
	te = (m1 - m2) / ((v1/n1 + v2/n2)**0.5)
	p_value = 2 * (1 - t.cdf(abs(te), n1+n2-2))
	return p_value

	def z_testes(n1, n2, p1, p2):
	p_hat = ((n1p1) + (n2p2)) / (n1 + n2)
	z = (p1 - p2) / ((p_hat * (1 - p_hat) * (1 / n1 + 1 / n2)) ** 0.5)
	p_value = 2 * (1 - norm.cdf(abs(z)))
	return p_value

	def z_test_data(df):
	styles = pd.DataFrame('', index=df.index, columns=df.columns)
	num_rows, num_cols = df.shape

	for i in range(num_rows -1):
	for j in range(num_cols -1):
	n1 = df.iloc[-1, -1]
	n2 = df.iloc[-1, j]
	p1 = df.iloc[i, -1]
	p2 = df.iloc[i, j]
	p_value = z_testes(n1, n2, p1, p2)
	if pd.notnull(p_value) and p_value <= 0.05:
	styles.iloc[i, j] = 'background-color: lightgreen'

	return df.style.apply(lambda _: styles, axis=None)

	def t_test_data(df):
	rows, cols = df.shape
	styles = pd.DataFrame('', index=df.index, columns=df.columns)

	for i in range(rows-1):
	p_value = t_test(df['Mean'].iloc[-1,], df['Mean'].iloc[i], df['Frequency'].iloc[-1], df['Frequency'].iloc[i], df['Variation'].iloc[-1], df['Variation'].iloc[i])
	if p_value <= 0.05:
	styles.iloc[i, :] = 'background-color: lightgreen'

	return df.style.apply(lambda _: styles, axis=None)

	def Z_test_dataframes(sheets_data):
	"""Processes each sheet's DataFrame and computes new DataFrames with Z-test results."""
	result_dataframes = {}
	for sheet_name, df in sheets_data.items():
	if df.empty:
	st.warning(f"⚠️ Sheet '{sheet_name}' is empty and has been skipped.")
	continue
	df = df.set_index(df.columns[0]) # Use the first column as index
	rows, cols = df.shape
	if cols < 2:
	st.warning(f"⚠️ Sheet '{sheet_name}' does not have enough columns for analysis and has been skipped.")
	continue
	new_df = pd.DataFrame(index=df.index[:-1], columns=df.columns[1:])
	for i, row_name in enumerate(df.index[:-1]):
	for j, col_name in enumerate(df.columns[1:]):
	try:
	n1 = df.iloc[-1, 0] # x_I1
	n2 = df.iloc[-1, j+1] # x_Ij
	p1 = df.iloc[i, 0] # x_1J
	p2 = df.iloc[i, j+1] # x_ij
	p_value = z_testes(n1, n2, p1, p2)
	new_df.iloc[i, j] = p_value
	except Exception as e:
	st.error(f"❌ Error processing sheet '{sheet_name}', row '{row_name}', column '{col_name}': {e}")
	new_df.iloc[i, j] = np.nan

	result_dataframes[sheet_name] = new_df

	return result_dataframes

	def analyze_z_test(file):
	"""
	Performs Z-Test analysis on the uploaded Excel file.

	Parameters:
	- file: Uploaded Excel file

	Returns:
	- result_dataframes: Dictionary of DataFrames with p-values
	"""
	sheets_data = read_excel_sheets(file)
	if sheets_data is None:
	return None

	result_dataframes = Z_test_dataframes(sheets_data)

	if not result_dataframes:
	st.error("❌ No valid sheets found for Z-Test analysis.")
	return None

	st.write("### 📈 Processed Tables with Z-Test Results")
	for sheet_name, df in result_dataframes.items():
	st.write(f"#### Sheet: {sheet_name}")

	# Apply color coding based on p-value
	def color_p_value(val):
	try:
	if pd.isna(val):
	return 'background-color: lightgray'
	elif val < 0.05:
	return 'background-color: lightgreen'
	else:
	return 'background-color: lightcoral'
	except:
	return 'background-color: lightgray'

	styled_df = df.style.applymap(color_p_value)

	# Display the styled DataFrame
	st.dataframe(styled_df, use_container_width=True)

	return result_dataframes

	def join_dataframes(mlist, dataframes):
	max_rows = max(df.shape[0] for df in dataframes)
	result = pd.DataFrame()
	col_counts = {} # Dictionary to track column name occurrences

	for i, df in enumerate(dataframes):
	rows_to_add = max_rows - df.shape[0]
	if rows_to_add > 0:
	empty_rows = pd.DataFrame(index=range(rows_to_add), columns=df.columns)
	empty_rows = empty_rows.fillna(np.nan)
	df = pd.concat([df, empty_rows], ignore_index=True)

	for col in df.columns:
	original_col = col
	count = 1
	while col in result.columns:
	col = f"{original_col}_{count}"
	count += 1
	col_counts[original_col] = col_counts.get(original_col, 0) + 1
	df = df.rename(columns={original_col: col})

	result = pd.concat([result, df.reset_index(drop=True)], axis=1)

	return result

	def all_tabulation(df, main_dict, follow_dict):

	for j in main_dict["single"]:

	dataframe_list, name_list = (single_answer(df[j]),), [j]

	for i in follow_dict["single"]:
	dataframe_list = dataframe_list + (two_variable_ss(df[[j, i]], j, i)[0], )
	name_list.append(i)

	for i in follow_dict["multi"]:
	matching_cols1 = [col for col in df.columns if is_matching_pattern(col, i)]
	dataframe_list = dataframe_list + (two_variable_sm(df[[j] + matching_cols1], j, matching_cols1)[0].T, )
	name_list.append(i)

	for i in follow_dict["score"]:
	dataframe_list = dataframe_list + (two_variable_ssc(df[[j, i]], j, i), )
	name_list.append(i)

	st.subheader(j)
	st.markdown(j + ": " + ", ".join(follow_dict['single'] + follow_dict['multi'] + follow_dict['score']))
	st.dataframe(join_dataframes(name_list, dataframe_list))

	for j in main_dict["multi"]:

	matching_cols0 = [col for col in df.columns if is_matching_pattern(col, j)]
	dataframe_list, name_list = (multi_answer(df[matching_cols0]), ), [j]

	#for i in follow_dict["single"]:
	#dataframe_list.append(two_variable_ms(df[matching_cols0, i], matching_cols0, i).drop(columns=['Value'])[0])

	for i in follow_dict["multi"]:
	matching_cols1 = [col for col in df.columns if is_matching_pattern(col, i)]
	dataframe_list = dataframe_list + (two_variable_mm(df[matching_cols0 + matching_cols1], matching_cols0, matching_cols1)[0], )
	name_list.append(i)

	for i in follow_dict["score"]:
	dataframe_list = dataframe_list + (two_variable_msc(df[matching_cols0 + [i]], matching_cols0, i), )
	name_list.append(i)

	st.subheader(j)
	st.markdown(j + ": " + ", ".join(follow_dict['multi'] + follow_dict['score']))
	st.dataframe(join_dataframes(name_list, dataframe_list))

	for j in main_dict["score"]:

	dataframe_list, name_list = [single_answer(df[j])], [j]

	st.subheader(j)
	st.dataframe(join_dataframes(name_list, dataframe_list))

	def process_dataframe(df):
	df = df.fillna(0)
	for col in df.columns:
	df[col] = pd.Categorical(df[col])
	return df

	import numpy as np
	from sklearn.decomposition import PCA

	def pca_with_variance_threshold(data, threshold=0.80):
	pca = PCA()
	pca.fit(data)

	cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
	n_components = np.argmax(cumulative_variance >= threshold) + 1

	pca = PCA(n_components=n_components)
	transformed_data = pca.fit_transform(data)

	return transformed_data

	def hierarchical_clustering_with_plotly(df, linkage_method):

	df_encoded = df.apply(lambda x: pd.factorize(x)[0])

	Z = linkage(df_encoded, method=linkage_method)

	fig = ff.create_dendrogram(df_encoded, linkagefun=lambda x: Z, orientation='bottom')
	fig.update_layout(width=800, height=500)
	st.plotly_chart(fig)

	num_clusters = int(st.text_input("Enter the desired number of clusters"))

	clusters = fcluster(Z, num_clusters, criterion='maxclust')

	df['Cluster'] = clusters

	return df

	def kmeans_clustering(df, k):

	numeric_df = df.select_dtypes(include=['number'])

	if numeric_df.empty:
	raise ValueError("DataFrame does not contain any numeric columns for clustering.")

	kmeans = KMeans(n_clusters=k, random_state=0) # You can modify random_state
	df['cluster'] = kmeans.fit_predict(numeric_df)
	return df

	def sample_size_calculator(confidence_level, p, E):
	Z = norm.ppf(1 - (1 - confidence_level) / 2)

	n = (Z*2 p * (1 - p)) / (E**2)

	n = math.ceil(n)

	return n

	import pandas as pd

	def categorize_sentences(prompt, df, Text_name):
	texts = df[Text_name].tolist()

	labels = []
	for text in texts:
	result = classifier(text, candidate_labels=[prompt])
	labels.append(result['labels'][0])

	df['labels'] = labels
	return df

	def upload_and_select_dataframe():
	st.sidebar.title("File Upload")
	uploaded_files = st.sidebar.file_uploader("Choose CSV or Excel files", type=["csv", "xlsx", "xls", "xlsb"], accept_multiple_files=True)

	dataframes = {}
	dataframes["Sample Dataset"] = pd.read_excel("Sample dataset.xlsx")

	try:
	for uploaded_file in uploaded_files:
	try:
	if uploaded_file.name.endswith(('.csv')):
	df = pd.read_csv(uploaded_file)
	elif uploaded_file.name.endswith(('.xls', '.xlsx', '.xlsb')):
	df = pd.read_excel(uploaded_file)
	else:
	st.sidebar.error(f"Unsupported file type: {uploaded_file.name}")
	continue
	dataframes[uploaded_file.name] = df
	except Exception as e:
	st.sidebar.error(f"Error reading {uploaded_file.name}: {e}")

	if len(uploaded_files) > 7:
	st.sidebar.error('Maximum 7 files can be uploaded.')
	return None

	if dataframes:
	selected_file = st.sidebar.selectbox("Select a DataFrame", ["Select a dataset"] + list(dataframes.keys()))
	return dataframes[selected_file]
	else:
	st.sidebar.info("Please upload some files.")
	return None
	except:
	pass

	#st.markdown('[Click to register a suggestion or comment](https://docs.google.com/forms/d/e/1FAIpQLScLyP7bBbqMfGdspjL7Ij64UZ6v2KjqjKNbm8gwEsgWsFs_Qg/viewform?usp=header)')

	st.image("Insightzen.png", width=600)

	df = upload_and_select_dataframe()

	try:
	try:
	d = df.head()
	st.subheader("Data preview")
	st.dataframe(df.head())

	cols = edit_strings(df.columns)
	cols = sorted(list(set(cols)))
	except:
	pass

	main_option = st.selectbox("Please select an option:", ["Select a Task","Tabulation", "Funnel Analysis", "Segmentation Analysis", "Hypothesis test", "Machine Learning", "Sample Size Calculator" ,"Coding", "AI Chat"])

	if main_option == "Tabulation":
	st.header("Tabulation Analysis")

	tabulation_option = st.selectbox("Please select the type of analysis:", ["Univariate", "Multivariate", "All"])

	if tabulation_option == "All":

	st.sidebar.header("Settings")

	main_dict = {"single": [], "multi": [], "score": []}

	st.sidebar.subheader("Main")
	main_dict["single"] = st.sidebar.multiselect(
	'Main: Single answer questions',
	cols,
	default=[]
	)

	main_dict["multi"] = st.sidebar.multiselect(
	'Main: Multi answer questions',
	cols,
	default=[]
	)

	main_dict["score"] = st.sidebar.multiselect(
	'Main: Score answer questions',
	cols,
	default=[]
	)

	follow_dict = {"single": [], "multi": [], "score": []}

	st.sidebar.subheader("Follow")
	follow_dict["single"] = st.sidebar.multiselect(
	'Follow: Single answer questions',
	cols,
	default=[]
	)

	follow_dict["multi"] = st.sidebar.multiselect(
	'Follow: Multi answer questions',
	cols,
	default=[]
	)

	follow_dict["score"] = st.sidebar.multiselect(
	'Follow: Score answer questions',
	cols,
	default=[]
	)

	all_tabulation(df, main_dict, follow_dict)

	elif tabulation_option == "Univariate":
	uni_option = st.selectbox("Select the type of univariate analysis:", ["Multiple answer", "Single answer", "Score answer"])

	if uni_option == "Single answer":
	var = st.text_input("Please enter the name of the desired column:")
	if var:
	if var in df.columns:
	result_df = single_answer(df[var])
	st.subheader("Univariate Analysis Results")
	st.dataframe(result_df)

	fig = figo('Bar', result_df["Percentage"][:-1, ], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.error("The entered column was not found.")
	elif uni_option == "Multiple answer":
	var = st.text_input("Please enter the name of the desired column:")
	if var:
	matching_cols = [col for col in df.columns if is_matching_pattern(col, var)]
	if matching_cols:
	subset_df = df[matching_cols]
	result_df = multi_answer(subset_df)

	st.subheader("Multiple Answer Analysis Results")
	st.dataframe(result_df)

	fig = figo('Bar', result_df["Percentage"][:-1], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.error("No columns matching the entered pattern were found.")

	elif uni_option == "Score answer":
	var = st.text_input("Please enter the name of the desired column:")
	if var:
	subset_df = df[var]
	result_df = score_answer(subset_df)

	st.subheader("Score Answer Analysis Results")
	st.dataframe(result_df)

	fig = figo('Bar', result_df["Percentage"][:-2], title='Percentage Histogram', xlabel=var, ylabel='Percentage', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.error("No columns matching the entered pattern were found.")

	elif tabulation_option == "Multivariate":
	st.subheader("Multivariate Analysis")
	var1 = st.text_input("Please enter the name of the first column:")
	var2 = st.text_input("Please enter the name of the second column:")

	if var1 and var2:
	type1 = st.selectbox("Select the type of analysis for the first column:", ["Multiple answer", "Single answer"], key='type1')
	type2 = st.selectbox("Select the type of analysis for the second column:", ["Multiple answer", "Single answer", "Score answer"], key='type2')

	if type1 == "Single answer" and type2 == "Single answer":
	percentile_df, frequency_df = two_variable_ss(df[[var1, var2]], var1, var2)
	st.subheader("Percentage Table")
	st.write(z_test_data(percentile_df))

	st.subheader("Frequency Table")
	st.dataframe(frequency_df)

	row, col = df.shape
	fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=(col5)+5, height=(row25) + 10)
	st.plotly_chart(fig, use_container_width=True)

	elif type1 == "Single answer" and type2 == "Multiple answer":
	matching_cols = [col for col in df.columns if is_matching_pattern(col, var2)]
	if matching_cols:
	percentile_df, frequency_df = two_variable_sm(df[[var1] + matching_cols], var1, matching_cols)
	st.subheader("Percentage Table")
	st.write(z_test_data(percentile_df))

	st.subheader("Frequency Table")
	st.dataframe(frequency_df)

	row, col = df.shape
	fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=(col5)+5, height=(row25) + 10)
	st.plotly_chart(fig, use_container_width=True)

	else:
	st.error("No columns matching the entered pattern were found.")

	elif type1 == "Multiple answer" and type2 == "Multiple answer":
	matching_cols1 = [col for col in df.columns if is_matching_pattern(col, var1)]
	matching_cols2 = [col for col in df.columns if is_matching_pattern(col, var2)]
	if matching_cols1 and matching_cols2:
	percentile_df, frequency_df = two_variable_mm(df[matching_cols1 + matching_cols2], matching_cols1, matching_cols2)
	st.subheader("Percentage Table")
	st.write(z_test_data(percentile_df))

	st.subheader("Frequency Table")
	st.dataframe(frequency_df)

	row, col = df.shape
	fig = figo('Scatter', percentile_df.iloc[:-1,:], title='Percentage Scatter plot', width=(col5)+5, height=(row25) + 10)
	st.plotly_chart(fig, use_container_width=True)

	elif type1 == "Single answer" and type2 == "Score answer":

	mean_df = two_variable_ssc(df[[var1, var2]], var1, var2)
	st.subheader("Mean Table")
	st.write(t_test_data(mean_df))

	row, col = df.shape
	fig = figo('Bar', mean_df["Mean"][:-1], title='Mean Histogram', xlabel=var1, ylabel='Mean', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)


	elif type1 == "Multiple answer" and type2 == "Score answer":
	matching_cols1 = [col for col in df.columns if is_matching_pattern(col, var1)]
	if matching_cols1:
	mean_df = two_variable_msc(df[matching_cols1 + [var2]], matching_cols1, var2)
	st.subheader("Mean Table")
	st.write(t_test_data(mean_df))

	row, col = df.shape
	fig = figo('Bar', mean_df["Mean"][:-1], title='Mean Histogram', xlabel=var1, ylabel='Mean', colorscale='Plotly3')
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.info("This section of the program is under development.")

	elif main_option == "Funnel Analysis":
	st.header("Funnel")

	st.sidebar.header("Funnel Settings")
	single_list = st.sidebar.multiselect(
	'Single answer questions',
	cols,
	default=[]
	)

	multi_list = st.sidebar.multiselect(
	'Multi answer questions',
	cols,
	default=[]
	)
	selected_dict = {}

	for option in single_list:
	selected_dict[option] = "Single"
	for option in multi_list:
	selected_dict[option] = "Multi"

	funnel_frequency, funnel_percentage = funnel(df, selected_dict)
	st.subheader("Percentage Table")
	st.dataframe(funnel_percentage)

	st.subheader("Frequency Table")
	st.dataframe(funnel_frequency)

	st.sidebar.header("Chart Settings")

	bar_columns = st.sidebar.multiselect('Which columns should be displayed as bar charts?', sorted(funnel_percentage.columns))
	line_columns = st.sidebar.multiselect('Which columns should be displayed as line charts?', sorted(funnel_percentage.columns))

	funnel_percentage_cleaned = funnel_percentage.dropna(axis=0, how='all')

	columns = st.sidebar.multiselect('Sort by which questions?', sorted(funnel_percentage_cleaned.columns))
	sort_order = st.sidebar.radio('Sort Order', ['Ascending', 'Descending'])

	ascending = True if sort_order == 'Ascending' else False
	funnel_percentage_cleaned = funnel_percentage_cleaned.sort_values(by=columns, ascending=ascending)

	# Update index to string to ensure proper sorting in Plotly
	funnel_percentage_cleaned.index = funnel_percentage_cleaned.index.astype(str)

	fig = go.Figure()

	# Define modern and diverse color palette
	modern_colors = [
	"#FF6F61", "#6B5B95", "#88B04B", "#F7CAC9", "#92A8D1",
	"#955251", "#B565A7", "#009B77", "#DD4124", "#45B8AC"
	]

	# Add Bar traces with transparency and custom colors
	for idx, col in enumerate(bar_columns):
	funnel_percentage_col = funnel_percentage_cleaned[col]
	fig.add_trace(
	go.Bar(
	x=funnel_percentage_cleaned.index,
	y=funnel_percentage_col,
	name=col,
	marker_color=modern_colors[idx % len(modern_colors)], # Cycle through colors
	opacity=0.8 # Set transparency
	)
	)

	# Add Line traces with transparency and custom colors
	for idx, col in enumerate(line_columns):
	funnel_percentage_col = funnel_percentage_cleaned[col]
	fig.add_trace(
	go.Scatter(
	x=funnel_percentage_cleaned.index,
	y=funnel_percentage_col,
	mode='lines',
	name=col,
	line=dict(color=modern_colors[(idx + len(bar_columns)) % len(modern_colors)]), # Cycle through colors
	opacity=0.8 # Set transparency
	)
	)

	fig.update_layout(
	title="Combined Bar and Line Chart",
	xaxis_title="Brands",
	yaxis_title="Percentage",
	template="plotly_dark",
	barmode="group",
	xaxis=dict(
	tickmode='array',
	categoryorder='array',
	categoryarray=funnel_percentage_cleaned.index.tolist() # Ensure sorting of x-axis matches the DataFrame index
	)
	)

	st.plotly_chart(fig)

	elif main_option == "Segmentation Analysis":
	st.header("Segmentation Analysis")

	st.sidebar.header("Selection of questions")
	single_list = st.sidebar.multiselect(
	'Single answer questions',
	cols,
	default=[]
	)

	multi_list = st.sidebar.multiselect(
	'Multi answer questions',
	cols,
	default=[]
	)

	score_list = st.sidebar.multiselect(
	'Score answer questions',
	cols,
	default=[]
	)

	matching_cols1 = []
	for i in multi_list:
	matching_cols1 += [col for col in df.columns if is_matching_pattern(col, i)]

	df_clean = process_dataframe(df[single_list + matching_cols1])
	st.subheader("Selected Table")
	st.dataframe(df_clean)

	PCA_radio = st.sidebar.radio('PCA', ['Yes', 'No'])

	if PCA_radio == 'Yes':
	dfc = pd.DataFrame(pca_with_variance_threshold(df_clean, threshold=0.80))
	else:
	dfc = df_clean


	selected_method = st.sidebar.selectbox("Select the Linkage Method of Segmentation Analysis:", ['Hierarchical Clustering', 'K-means Clustering'])
	if selected_method == 'Hierarchical Clustering':
	linkage_method = st.sidebar.selectbox("Select the Linkage Method of Segmentation Analysis:", ['average', 'single', 'complete', 'weighted', 'centroid', 'median', 'ward'])
	df_cluster = hierarchical_clustering_with_plotly(dfc, linkage_method)
	if selected_method == 'K-means Clustering':
	k = int(st.text_input("Enter the desired number of clusters"))
	df_clean = kmeans_clustering(dfc, k)

	st.subheader("Cluster Table")
	st.dataframe(df_clean)

	elif main_option == "Hypothesis test":
	st.header("Hypothesis Testing")
	hypothesis_option = st.selectbox("Please select the type of hypothesis test:", ["Z test", "T test", "Chi-Square test", "ANOVA test"])

	if hypothesis_option != "Z test":
	st.info("This section of the program is under development.")
	else:
	uploaded_file = st.file_uploader("Please upload your Excel file for Z-Test", type=["xlsx", "xls"])
	if uploaded_file:
	result = analyze_z_test(uploaded_file)
	if result:
	st.success("Z-Test analysis completed successfully.")

	elif main_option == "Coding":
	selected_list = st.sidebar.multiselect(
	'Select the desired "Open Question" column.',
	cols,
	default=[]
	)
	df["id"] = df.index
	prompt_user = st.text_input("Write a brief description of the selected column question.")
	if st.button("Submit"):

	df2 = categorize_sentences(prompt_user, df, selected_list)

	st.subheader("Categorized data")
	st.dataframe(df2)

	elif main_option == "Machine Learning":
	st.info("This section of the program is under development.")

	elif main_option == "AI Chat":

	client = InferenceClient(
	provider="together",
	api_key="hf_xxxxxxxxxxxxxxxxxxxxxxxx"
	)

	messages = [
	{
	"role": "user",
	"content": "What is the capital of France?"
	}
	]

	stream = client.chat.completions.create(
	model="deepseek-ai/DeepSeek-R1",
	messages=messages,
	max_tokens=500,
	stream=True
	)

	for chunk in stream:
	st.warning(chunk.choices[0].delta.content, end="")

	elif main_option == "Sample Size Calculator":
	st.header("Sample Size Calculator")
	confidence_level = st.text_input("Confidence levels: (In percentage terms)")
	p = st.text_input("Estimated probability of success: (In percentage terms)")
	E = st.text_input("Margin of error: (In percentage terms)")
	try:
	confidence_level, p, E = float(confidence_level)/100, float(p)/100, float(E)/100
	n = sample_size_calculator(confidence_level, p, E)
	except:
	pass

	st.write(f"Sample size: {n}")

	except Exception as e:
	pass
	#st.error(f"❌ Error: {e}")