Spaces:

AMKhakbaz
/

AMKAPP

Running

App Files Files Community

AMKAPP / app3.py

AMKhakbaz

Rename app.py to app3.py

b2949cd verified 3 months ago

raw

history blame

15.7 kB

	import pandas as pd
	import plotly.graph_objs as go
	from plotly.subplots import make_subplots
	import streamlit as st

	def is_matching_pattern(column, prefix):
	"""
	Checks if the column name matches the pattern: prefix-number (1 to 3 digits).

	Parameters:
	- column: The column name to check.
	- prefix: The prefix to match.

	Returns:
	- True if matches, False otherwise.
	"""
	if not column.startswith(prefix + '_'):
	return False
	suffix = column[len(prefix) + 1:]
	if 1 <= len(suffix) <= 3 and suffix.isdigit():
	return True
	return False

	def analyze_single_column(df, column_name, type_option):
	"""
	Function 1: Analyzes data based on the selected type.

	Parameters:
	- df: pandas DataFrame
	- column_name: Name of the column or prefix
	- type_option: 1 or 2

	Returns:
	- table: DataFrame containing frequency percentages
	- fig: Plotly Figure object
	"""
	total_rows = len(df)

	if type_option == 1:
	if column_name not in df.columns:
	st.error(f"Column '{column_name}' not found in the Excel file.")
	return None, None
	# Calculate frequency
	frequency = df[column_name].value_counts(dropna=False).sort_index()
	percentage = (frequency / total_rows) * 100
	table = percentage.reset_index()
	table.columns = [column_name, 'Percentage']
	st.subheader("Frequency Table (Percentage):")
	st.dataframe(table)

	# Plot interactive histogram
	fig = go.Figure(data=[go.Bar(
	x=table[column_name].astype(str),
	y=table['Percentage'],
	marker=dict(color='rgba(173, 216, 230, 0.6)',
	line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
	width=0.4
	)])
	fig.update_layout(
	title=f"Histogram of '{column_name}'",
	xaxis_title=column_name,
	yaxis_title='Percentage',
	bargap=0.2,
	template='plotly_white'
	)
	st.plotly_chart(fig, use_container_width=True)
	return table, fig

	elif type_option == 2:
	# Column name pattern: name-number (1 to 3 digits)
	selected_columns = [col for col in df.columns if is_matching_pattern(col, column_name)]

	if not selected_columns:
	st.error(f"No columns matching the pattern '{column_name}-<number>' found in the Excel file.")
	return None, None

	results = {}
	for col in selected_columns:
	unique_values = df[col].dropna().unique()
	if len(unique_values) > 2:
	st.warning(f"Column '{col}' has more than two unique values. It was skipped.")
	continue
	value_counts = df[col].value_counts(dropna=False)
	percentage = (value_counts / total_rows) * 100
	for val, pct in percentage.items():
	results[val] = results.get(val, 0) + pct

	if not results:
	st.error("No valid columns with exactly two unique values were found.")
	return None, None

	# Sort the keys
	table = pd.DataFrame(list(results.items()), columns=[column_name, 'Percentage'])
	table = table.sort_values(by=column_name).reset_index(drop=True)
	st.subheader("Aggregated Frequency Table (Percentage):")
	st.dataframe(table)

	# Plot interactive histogram
	fig = go.Figure(data=[go.Bar(
	x=table[column_name].astype(str),
	y=table['Percentage'],
	marker=dict(color='rgba(173, 216, 230, 0.6)',
	line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
	width=0.4
	)])
	fig.update_layout(
	title=f"Histogram of Columns '{column_name}-<number>'",
	xaxis_title=column_name,
	yaxis_title='Percentage',
	bargap=0.2,
	template='plotly_white'
	)
	st.plotly_chart(fig, use_container_width=True)
	return table, fig

	else:
	st.error("The 'type_option' must be either 1 or 2.")
	return None, None

	def analyze_multiple_columns(df, first_name, second_name, first_type, second_type):
	"""
	Function 2: Analyzes data based on combinations of first and second types.

	Parameters:
	- df: pandas DataFrame
	- first_name: Name or prefix of the first column
	- second_name: Name or prefix of the second column
	- first_type: 1 or 2
	- second_type: 1 or 2

	Returns:
	- table: DataFrame containing contingency or frequency tables
	- figs: List of Plotly Figure objects
	"""
	total_rows = len(df)

	# Helper functions to select columns based on type
	def select_columns_type1(name):
	if name in df.columns:
	return [name]
	else:
	st.error(f"Column '{name}' not found in the Excel file.")
	return []

	def select_columns_type2(name):
	selected = [col for col in df.columns if is_matching_pattern(col, name)]
	if not selected:
	st.error(f"No columns matching the pattern '{name}-<number>' found in the Excel file.")
	return selected

	# Select columns based on types
	first_columns = select_columns_type1(first_name) if first_type == 1 else select_columns_type2(first_name)
	second_columns = select_columns_type1(second_name) if second_type == 1 else select_columns_type2(second_name)

	if not first_columns or not second_columns:
	st.error("Column selection failed.")
	return None, None

	figs = []

	if first_type == 1 and second_type == 1:
	# Both types are 1
	if len(first_columns) != 1 or len(second_columns) != 1:
	st.error("When both first and second types are 1, exactly one column must be selected for each.")
	return None, None
	col1 = first_columns[0]
	col2 = second_columns[0]
	contingency = pd.crosstab(df[col1], df[col2], normalize='all') * 100
	st.subheader("Contingency Table (Percentage):")
	st.dataframe(contingency)

	# Plot interactive heatmap
	fig = go.Figure(data=go.Heatmap(
	z=contingency.values,
	x=contingency.columns,
	y=contingency.index,
	colorscale='Blues'
	))
	fig.update_layout(
	title=f"Contingency Table between '{col1}' and '{col2}'",
	xaxis_title=col2,
	yaxis_title=col1,
	template='plotly_white'
	)
	st.plotly_chart(fig, use_container_width=True)
	figs.append(fig)

	elif first_type == 1 and second_type == 2:
	# First type is 1 and second type is 2
	if len(first_columns) != 1:
	st.error("When first type is 1, exactly one column must be selected.")
	return None, None
	col1 = first_columns[0]
	col2_list = second_columns
	unique_values_col1 = df[col1].dropna().unique()

	results = {}
	for val in unique_values_col1:
	filter_df = df[df[col1] == val]
	freq_table = {}
	for col2 in col2_list:
	frequency = filter_df[col2].value_counts(dropna=False)
	percentage = (frequency / total_rows) * 100
	freq_table[col2] = percentage.to_dict()
	results[val] = freq_table

	# Convert results to a multi-dimensional DataFrame
	final_df = pd.DataFrame(results).T
	st.subheader("Multi-dimensional Frequency Table (Percentage):")
	st.dataframe(final_df)

	# Plot interactive charts
	for val, data in results.items():
	st.subheader(f"Frequencies for '{col1}' = {val}")
	fig = make_subplots(rows=1, cols=len(col2_list), subplot_titles=col2_list)
	for i, col2 in enumerate(col2_list, 1):
	categories = list(data[col2].keys())
	values = list(data[col2].values())
	fig.add_trace(go.Bar(
	x=categories,
	y=values,
	marker=dict(color='rgba(173, 216, 230, 0.6)',
	line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
	width=0.4
	), row=1, col=i)
	fig.update_xaxes(title_text=col2, row=1, col=i)
	fig.update_yaxes(title_text='Percentage', row=1, col=i)
	fig.update_layout(
	title=f"Frequencies for '{col1}' = {val}",
	template='plotly_white',
	showlegend=False
	)
	st.plotly_chart(fig, use_container_width=True)
	figs.append(fig)

	elif first_type == 2 and second_type == 1:
	# First type is 2 and second type is 1
	if len(second_columns) != 1:
	st.error("When second type is 1, exactly one column must be selected.")
	return None, None
	col2 = second_columns[0]
	col1_list = first_columns
	unique_values_col2 = df[col2].dropna().unique()

	results = {}
	for val in unique_values_col2:
	filter_df = df[df[col2] == val]
	freq_table = {}
	for col1 in col1_list:
	frequency = filter_df[col1].value_counts(dropna=False)
	percentage = (frequency / total_rows) * 100
	freq_table[col1] = percentage.to_dict()
	results[val] = freq_table

	# Convert results to a multi-dimensional DataFrame
	final_df = pd.DataFrame(results).T
	st.subheader("Multi-dimensional Frequency Table (Percentage):")
	st.dataframe(final_df)

	# Plot interactive charts
	for val, data in results.items():
	st.subheader(f"Frequencies for '{col2}' = {val}")
	fig = make_subplots(rows=1, cols=len(col1_list), subplot_titles=col1_list)
	for i, col1 in enumerate(col1_list, 1):
	categories = list(data[col1].keys())
	values = list(data[col1].values())
	fig.add_trace(go.Bar(
	x=categories,
	y=values,
	marker=dict(color='rgba(173, 216, 230, 0.6)',
	line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
	width=0.4
	), row=1, col=i)
	fig.update_xaxes(title_text=col1, row=1, col=i)
	fig.update_yaxes(title_text='Percentage', row=1, col=i)
	fig.update_layout(
	title=f"Frequencies for '{col2}' = {val}",
	template='plotly_white',
	showlegend=False
	)
	st.plotly_chart(fig, use_container_width=True)
	figs.append(fig)

	elif first_type == 2 and second_type == 2:
	# Both types are 2
	col1_list = first_columns
	col2_list = second_columns

	results = {}
	for col2 in col2_list:
	for val in df[col2].dropna().unique():
	filter_df = df[df[col2] == val]
	frequency = {}
	for col1 in col1_list:
	count = filter_df[col1].count()
	frequency[col1] = (count / total_rows) * 100
	results[(col2, val)] = frequency

	if not results:
	st.error("No valid combinations found for both types being 2.")
	return None, None

	# Convert results to a multi-dimensional DataFrame
	index = pd.MultiIndex.from_tuples(results.keys(), names=['Second Column', 'Second Column Value'])
	final_df = pd.DataFrame(list(results.values()), index=index)
	st.subheader("Multi-dimensional Frequency Table (Percentage):")
	st.dataframe(final_df)

	# Plot interactive charts
	for (col2, val), data in results.items():
	st.subheader(f"Frequencies for '{col2}' = {val}")
	fig = go.Figure(data=[go.Bar(
	x=list(data.keys()),
	y=list(data.values()),
	marker=dict(color='rgba(173, 216, 230, 0.6)',
	line=dict(color='rgba(173, 216, 230, 1.0)', width=1)),
	width=0.4
	)])
	fig.update_layout(
	title=f"Frequencies for '{col2}' = {val}",
	xaxis_title=first_name,
	yaxis_title='Percentage',
	template='plotly_white'
	)
	st.plotly_chart(fig, use_container_width=True)
	figs.append(fig)

	else:
	st.error("The 'first_type' and 'second_type' must each be either 1 or 2.")
	return None, None

	return final_df if 'final_df' in locals() else table, figs

	def main():
	st.title("📊 Data Analysis Application")
	st.write("""
	This application allows you to upload an Excel file and perform data analysis based on your selected parameters.
	""")

	# File uploader
	uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx", "xls"])

	if uploaded_file is not None:
	try:
	df = pd.read_excel(uploaded_file)
	st.success("File uploaded successfully!")
	st.subheader("Preview of Uploaded Data:")
	st.dataframe(df.head()) # Display first few rows
	except Exception as e:
	st.error(f"Error reading the Excel file: {e}")
	return

	st.markdown("---")

	# Function Selection
	st.header("🔍 Select Analysis Function")
	analysis_function = st.radio("Choose Function", ("Analyze Single Column", "Analyze Multiple Columns"))

	if analysis_function == "Analyze Single Column":
	st.header("📈 Analyze Single Column")
	column_name = st.text_input("Enter the column name or prefix:", "")
	type_option = st.selectbox("Select Type Option", ("1", "2"))

	if st.button("Run Analysis"):
	if column_name.strip() == "":
	st.error("Please enter a valid column name.")
	else:
	type_option_int = int(type_option)
	table, fig = analyze_single_column(df, column_name, type_option_int)
	if table is not None:
	st.success("Analysis completed successfully!")

	elif analysis_function == "Analyze Multiple Columns":
	st.header("📊 Analyze Multiple Columns")
	first_name = st.text_input("Enter the first column name or prefix:", "")
	second_name = st.text_input("Enter the second column name or prefix:", "")
	first_type = st.selectbox("Select First Type Option", ("1", "2"), key='first_type')
	second_type = st.selectbox("Select Second Type Option", ("1", "2"), key='second_type')

	if st.button("Run Analysis"):
	if first_name.strip() == "" or second_name.strip() == "":
	st.error("Please enter valid column names.")
	else:
	first_type_int = int(first_type)
	second_type_int = int(second_type)
	table, figs = analyze_multiple_columns(df, first_name, second_name, first_type_int, second_type_int)
	if table is not None:
	st.success("Analysis completed successfully!")

	if __name__ == "__main__":
	main()