Mustehson committed on
Commit
4e0396a
·
verified ·
1 Parent(s): a6502a8

Delete utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -162
utils.py DELETED
@@ -1,162 +0,0 @@
1
-
2
- import numpy as np
3
- import pandas as pd
4
- # -----------------Numerical Statistics-----------------
5
def format_values(key: str, value) -> str:
    """
    Render a single statistic as a display string, using its label to pick
    the format.

    Parameters:
        key: label of the statistic. A label containing "Memory" is rendered
            as a byte size; a label containing "%" is rendered as a
            percentage.
        value: the statistic itself. Anything that is not an ``int`` or
            ``float`` is returned through ``str(value)`` unchanged.

    Returns:
        The formatted value as a string.
    """
    if not isinstance(value, (int, float)):
        # e.g. the value is a time
        # NOTE(review): NumPy integer scalars are presumably not ``int``
        # instances and would take this early return too -- confirm that
        # callers convert them first where byte/percent formatting matters.
        return str(value)

    if "Memory" in key:
        units = ("B", "KB", "MB", "GB", "TB")
        unit_idx = 0
        # Stop at the largest known unit so very large sizes cannot index past
        # it, and use >= so that exactly 1024 rolls over to the next unit.
        while value >= 1024 and unit_idx < len(units) - 1:
            value /= 1024
            unit_idx += 1
        return f"{value:.1f} {units[unit_idx]}"

    if (value * 10) % 10 == 0:
        # The value is integral, possibly stored as a float such as 3.0
        value = int(value)
        if abs(value) >= 1000000:
            # Returned immediately, so percent formatting is skipped here
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # Round to four decimals, then drop the fraction if it became zero
        pre_value = float(f"{value:.4f}")
        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        # Only reached when every comparison above is false (e.g. NaN)
        value = str(value)

    if "%" in key:
        # ``value`` may already be a string at this point, so convert it back
        # to a float before applying the percent format
        value = f"{float(value):.1%}"
    return str(value)
40
-
41
def format_num_stats(data) -> dict:
    """
    Format numerical statistics

    Parameters:
        data: mapping read with the keys "nuniq", "npres", "nrows", "nreals",
            "mem_use", "mean", "min", "max", "nzero", "nneg", "std", "skew",
            "kurt" and "qntls". "qntls" must support ``.copy()``, ``.index``
            and ``.loc`` and is looked up at 0.05, 0.25, 0.5, 0.75 and 0.95
            (presumably a pandas Series indexed by quantile -- confirm
            against the caller).

    Returns:
        ``{"Overview": {...}}`` where the inner dict merges the overview,
        quantile and descriptive statistics, each value rendered by
        ``format_values``.
    """

    def ratio(numerator, denominator):
        # A ratio over zero rows/values is undefined: report NaN instead of
        # letting a ZeroDivisionError abort the whole report.
        return numerator / denominator if denominator != 0 else np.nan

    nrows = data["nrows"]
    npres = data["npres"]
    n_infinite = npres - data["nreals"]
    mean = data["mean"]
    std = data["std"]

    # Round the quantile labels on a copy so the caller's object keeps its
    # original index.
    qntls = data["qntls"].copy()
    qntls.index = np.round(qntls.index, 2)

    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": ratio(data["nuniq"], npres),
        "Missing": nrows - npres,
        "Missing (%)": 1 - ratio(npres, nrows),
        "Infinite": n_infinite,
        "Infinite (%)": ratio(n_infinite, nrows),
        "Memory Size": data["mem_use"],
        "Mean": mean,
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": ratio(data["nzero"], nrows),
        "Negatives": data["nneg"],
        "Negatives (%)": ratio(data["nneg"], nrows),
    }
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": qntls.loc[0.05],
        "Q1": qntls.loc[0.25],
        "Median": qntls.loc[0.50],
        "Q3": qntls.loc[0.75],
        "95-th Percentile": qntls.loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": qntls.loc[0.75] - qntls.loc[0.25],
    }
    descriptive = {
        "Mean": mean,
        "Standard Deviation": std,
        "Variance": std ** 2,
        "Sum": mean * npres,
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": ratio(std, mean),
    }

    # Labels shared between the groups ("Mean", "Minimum", "Maximum") carry
    # the same value in each, so merging keeps a single entry per label.
    merged = {**overview, **quantile, **descriptive}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
94
- # -----------------------------------------------------
95
-
96
-
97
- # -----------------Categorical Statistics-----------------
98
-
99
def format_cat_stats(data) -> dict:
    """
    Format categorical statistics

    Parameters:
        data: mapping with a "stats" entry (read with the keys "nuniq",
            "npres", "nrows" and "mem_use") and a "len_stats" entry whose
            ``items()`` are formatted as-is.

    Returns:
        ``{"Overview": {...}}`` where the inner dict merges the overview
        statistics with the entries of ``len_stats``, each value rendered by
        ``format_values``.
    """

    def ratio(numerator, denominator):
        # A ratio over zero rows/values is undefined: report NaN instead of
        # letting a ZeroDivisionError abort the whole report.
        return numerator / denominator if denominator != 0 else float("nan")

    stats = data["stats"]
    len_stats = data["len_stats"]

    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": ratio(stats["nuniq"], stats["npres"]),
        "Missing": stats["nrows"] - stats["npres"],
        "Missing (%)": 1 - ratio(stats["npres"], stats["nrows"]),
        "Memory Size": stats["mem_use"],
    }

    # On a label clash the ``len_stats`` entry wins, as it is merged last.
    merged = {**ov_stats, **len_stats}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
129
- # -----------------------------------------------------
130
-
131
-
132
def format_ov_stats(stats) -> tuple:
    """
    Format dataset-level overview statistics

    Parameters:
        stats: mapping whose values are, in iteration order: number of rows,
            number of columns, number of present cells, number of rows
            without duplicates, memory use, and the dtype counts.

    Returns:
        A ``(formatted, dtypes_cnt)`` tuple: ``formatted`` maps each overview
        label to its ``format_values`` rendering, and ``dtypes_cnt`` is
        passed through unchanged.
    """

    def ratio(numerator, denominator):
        # A ratio over zero rows/cells is undefined: report NaN instead of
        # letting a ZeroDivisionError abort the whole report.
        return numerator / denominator if denominator != 0 else float("nan")

    # The values are unpacked positionally, so this depends on the order in
    # which the caller inserted the entries of ``stats``.
    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - ratio(npresent_cells, ncells),
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - ratio(nrows_wo_dups, nrows),
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": ratio(mem_use, nrows),
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt
148
-
149
-
150
def format_insights(data):
    """
    Flatten grouped insights into a two-column DataFrame

    Parameters:
        data: mapping whose values are iterables of ``{category: description}``
            mappings. The keys of ``data`` itself are not used.

    Returns:
        A DataFrame with the columns "Category" and "Description", one row
        per (category, description) pair, with the ``/*start*/`` and
        ``/*end*/`` markers removed from the descriptions. Empty input yields
        an empty DataFrame that still has both columns.
    """
    rows = []
    for value_list in data.values():
        for item in value_list:
            for category, description in item.items():
                rows.append({'Category': category, 'Description': description})

    # Naming the columns explicitly keeps 'Description' present even when
    # there are no rows, so the clean-up below cannot raise KeyError.
    insights_df = pd.DataFrame(rows, columns=['Category', 'Description'])

    # Strip both highlight markers in a single pass
    insights_df['Description'] = insights_df['Description'].str.replace(
        r'/\*(?:start|end)\*/', '', regex=True
    )

    return insights_df