Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| # -----------------Numerical Statistics----------------- | |
def format_values(key, value):
    """Render a single statistic as a display string.

    Args:
        key: Label of the statistic. A label containing ``"Memory"`` makes
            the value render as a byte size; a label containing ``"%"``
            makes it render as a percentage.
        value: Raw statistic. Anything that is not an ``int`` or ``float``
            (for example a time value) is simply passed through ``str``.

    Returns:
        The formatted value as a string.
    """
    if not isinstance(value, (int, float)):
        # if value is a time
        return str(value)

    if "Memory" in key:
        # for memory usage
        units = ("B", "KB", "MB", "GB", "TB")
        ind = 0
        # Stop at the largest known unit: without the bound, values above
        # 1024 TB (or infinity) ran past "TB" and failed with a KeyError.
        while value > 1024 and ind < len(units) - 1:
            value /= 1024
            ind += 1
        return f"{value:.1f} {units[ind]}"

    if (value * 10) % 10 == 0:
        # if value is int but in a float form with 0 at last digit
        value = int(value)
        if abs(value) >= 1000000:
            # Returned directly, so no percentage formatting is applied here.
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # eliminate trailing zeros
        pre_value = float(f"{value:.4f}")
        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        # Reached when none of the comparisons above hold (e.g. NaN).
        value = str(value)

    if "%" in key:
        # for percentage, only use digits before notation sign for extreme small number
        value = f"{float(value):.1%}"
    return str(value)
def format_num_stats(data):
    """
    Format numerical statistics

    Args:
        data: Mapping that provides the keys ``nuniq``, ``npres``, ``nrows``,
            ``nreals``, ``mem_use``, ``mean``, ``min``, ``max``, ``nzero``,
            ``nneg``, ``std``, ``skew``, ``kurt`` and ``qntls``. ``qntls``
            must expose ``.index`` and ``.loc`` (presumably a pandas Series
            of quantiles keyed by probability -- confirm against caller).

    Returns:
        ``{"Overview": {...}}`` where the inner dict merges the overview,
        quantile and descriptive statistics, each value formatted by
        ``format_values``.

    Side effects:
        The index of ``data["qntls"]`` is rounded to two decimals in place.
    """

    def ratio(numerator, denominator):
        # A zero denominator (e.g. an empty column) yields NaN instead of
        # raising, mirroring the existing guard on the coefficient of
        # variation below.
        return numerator / denominator if denominator != 0 else np.nan

    nrows = data["nrows"]
    npres = data["npres"]
    ninf = npres - data["nreals"]

    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": ratio(data["nuniq"], npres),
        "Missing": nrows - npres,
        "Missing (%)": 1 - ratio(npres, nrows),
        "Infinite": ninf,
        "Infinite (%)": ratio(ninf, nrows),
        "Memory Size": data["mem_use"],
        "Mean": data["mean"],
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": ratio(data["nzero"], nrows),
        "Negatives": data["nneg"],
        "Negatives (%)": ratio(data["nneg"], nrows),
    }

    # Round the index labels so the literal lookups below (0.05, 0.25, ...)
    # can match them; this intentionally keeps the original in-place update.
    qntls = data["qntls"]
    qntls.index = np.round(qntls.index, 2)
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": qntls.loc[0.05],
        "Q1": qntls.loc[0.25],
        "Median": qntls.loc[0.50],
        "Q3": qntls.loc[0.75],
        "95-th Percentile": qntls.loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": qntls.loc[0.75] - qntls.loc[0.25],
    }

    descriptive = {
        "Mean": data["mean"],
        "Standard Deviation": data["std"],
        "Variance": data["std"] ** 2,
        "Sum": data["mean"] * npres,
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": ratio(data["std"], data["mean"]),
    }

    # All three groups are flattened into one "Overview" section. Keys that
    # repeat across groups (Mean, Minimum, Maximum) carry the same value and
    # keep the position of their first occurrence.
    merged = {**overview, **quantile, **descriptive}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
| # ----------------------------------------------------- | |
| # -----------------Categorical Statistics----------------- | |
def format_cat_stats(
    data
):
    """
    Format categorical statistics

    Args:
        data: Mapping with a ``"stats"`` entry (providing ``nuniq``,
            ``npres``, ``nrows`` and ``mem_use``) and a ``"len_stats"``
            entry (a mapping of label -> value).

    Returns:
        ``{"Overview": {...}}`` where the inner dict holds the overview
        statistics followed by the entries of ``len_stats``, each value
        formatted by ``format_values``.
    """

    def ratio(numerator, denominator):
        # A zero denominator (e.g. an empty column) yields NaN instead of
        # raising a ZeroDivisionError.
        return numerator / denominator if denominator != 0 else float("nan")

    stats = data['stats']
    len_stats = data['len_stats']

    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": ratio(stats["nuniq"], stats["npres"]),
        "Missing": stats["nrows"] - stats["npres"],
        "Missing (%)": 1 - ratio(stats["npres"], stats["nrows"]),
        "Memory Size": stats["mem_use"],
    }

    # Overview and length statistics are flattened into a single section;
    # on a key clash the length statistic wins, as with the original merge.
    merged = {**ov_stats, **len_stats}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
| # ----------------------------------------------------- | |
def format_ov_stats(stats):
    """Format dataset-level overview statistics.

    Args:
        stats: Mapping with exactly six values which, in iteration order,
            are: number of rows, number of columns, number of present
            (non-missing) cells, number of rows without duplicates, memory
            use, and the dtype counts. Only the order matters; the keys are
            not read.

    Returns:
        A ``(formatted, dtypes_cnt)`` tuple where ``formatted`` maps each
        overview label to its ``format_values`` string and ``dtypes_cnt`` is
        passed through unchanged.
    """

    def ratio(numerator, denominator):
        # An empty frame has zero rows and/or cells; report NaN rather than
        # failing with a ZeroDivisionError.
        return numerator / denominator if denominator != 0 else float("nan")

    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - ratio(npresent_cells, ncells),
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - ratio(nrows_wo_dups, nrows),
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": ratio(mem_use, nrows),
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt
def format_insights(data):
    """Flatten grouped insights into a two-column DataFrame.

    Args:
        data: Mapping whose values are lists of dicts of the form
            ``{category: description}``. The mapping's own keys are not
            used.

    Returns:
        A pandas DataFrame with the columns ``'Category'`` and
        ``'Description'``, one row per (category, description) pair, with
        the literal markers ``/*start*/`` and ``/*end*/`` removed from the
        descriptions. An input without any insight yields an empty frame
        that still has both columns.
    """
    data_list = []
    for value_list in data.values():
        for item in value_list:
            for category, description in item.items():
                data_list.append({'Category': category, 'Description': description})

    # Name the columns explicitly: built from an empty list the frame would
    # otherwise have no 'Description' column and the lookup below would
    # raise a KeyError.
    insights_df = pd.DataFrame(data_list, columns=['Category', 'Description'])

    # Strip the highlight markers, in the same order as before.
    for marker in (r'/\*start\*/', r'/\*end\*/'):
        insights_df['Description'] = insights_df['Description'].str.replace(marker, '', regex=True)
    return insights_df