Zack committed on
Commit
782735f
·
1 Parent(s): a131676

fix: Drop all null columns

Browse files
Files changed (1) hide show
  1. app.py +3 -43
app.py CHANGED
@@ -10,14 +10,12 @@ scaler = json.load(f)
10
 
11
  TIME_STEPS = 288
12
 
13
- # Generated training sequences for use in the model.
14
  def create_sequences(values, time_steps=TIME_STEPS):
15
  output = []
16
  for i in range(len(values) - time_steps + 1):
17
  output.append(values[i : (i + time_steps)])
18
  return np.stack(output)
19
 
20
-
21
  def normalize_data(data):
22
  df_test_value = (data - scaler["mean"]) / scaler["std"]
23
  return df_test_value
@@ -31,21 +29,17 @@ def plot_test_data(df_test_value):
31
  return fig
32
 
33
  def get_anomalies(df_test_value):
34
- # Create sequences from test values.
35
  x_test = create_sequences(df_test_value.values)
36
  model = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
37
 
38
- # Get test MAE loss.
39
  x_test_pred = model.predict(x_test)
40
  test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
41
  test_mae_loss = test_mae_loss.reshape((-1))
42
 
43
- # Detect all the samples which are anomalies.
44
  anomalies = test_mae_loss > scaler["threshold"]
45
  return anomalies
46
 
47
  def plot_anomalies(df_test_value, data, anomalies):
48
- # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
49
  anomalous_data_indices = []
50
  for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
51
  if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
@@ -60,73 +54,38 @@ def plot_anomalies(df_test_value, data, anomalies):
60
  return fig
61
 
62
  def clean_data(df):
63
- # Drop rows with any null data
64
- # df = df.dropna()
65
-
66
- # Check if the DataFrame already contains the correct columns
67
  if "timestamp" in df.columns and "value" in df.columns:
68
  df["timestamp"] = pd.to_datetime(df["timestamp"])
69
  return df
70
-
71
- # Check if DataFrame contains the columns to be converted
72
  elif "Date" in df.columns and "Hour" in df.columns and "Hourly_Labor_Hours_Total" in df.columns:
73
- # Convert "Date" and "Hour" columns into datetime format
74
  df["timestamp"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
75
-
76
- # Handle the case where hour is 24
77
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] + pd.DateOffset(days=1)
78
  df["timestamp"] = df["timestamp"].dt.floor('h')
79
-
80
- # Keep only necessary columns
81
  df = df[["timestamp", "Hourly_Labor_Hours_Total"]]
82
-
83
- # Rename column
84
  df.rename(columns={"Hourly_Labor_Hours_Total": "value"}, inplace=True)
85
-
86
  elif "Date_CY" in df.columns and "Hour" in df.columns and "Net_Sales_CY" in df.columns:
87
- # Convert "Date_CY" and "Hour" columns into datetime format
88
  df["timestamp"] = pd.to_datetime(df["Date_CY"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
89
-
90
- # Handle the case where hour is 24
91
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] - pd.DateOffset(days=1)
92
  df["timestamp"] = df["timestamp"].dt.floor('h')
93
-
94
- # Keep only necessary columns
95
  df = df[["timestamp", "Net_Sales_CY"]]
96
-
97
- # Rename column
98
  df.rename(columns={"Net_Sales_CY": "value"}, inplace=True)
99
-
100
- # Drop rows where 'value' is NaN
101
  df = df.dropna(subset=['value'])
102
-
103
  return df
104
-
105
  else:
106
  raise ValueError("Dataframe does not contain necessary columns.")
107
 
108
  def master(file):
109
- # read file
110
  data = pd.read_csv(file.name)
111
-
112
- # clean data
113
  data = clean_data(data)
114
-
115
- # Convert timestamp to datetime after cleaning
116
  data['timestamp'] = pd.to_datetime(data['timestamp'])
117
-
118
  data.set_index("timestamp", inplace=True)
119
-
120
- # Check if data has enough records to create sequences
121
  if len(data) < TIME_STEPS:
122
  return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
123
-
124
  df_test_value = normalize_data(data)
125
- # plot input test data
126
  plot1 = plot_test_data(df_test_value)
127
- # predict
128
  anomalies = get_anomalies(df_test_value)
129
- #plot anomalous data points
130
  plot2 = plot_anomalies(df_test_value, data, anomalies)
131
  return plot2
132
 
@@ -142,3 +101,4 @@ iface = gr.Interface(
142
  )
143
 
144
  iface.launch()
 
 
10
 
11
  TIME_STEPS = 288
12
 
 
13
  def create_sequences(values, time_steps=TIME_STEPS):
14
  output = []
15
  for i in range(len(values) - time_steps + 1):
16
  output.append(values[i : (i + time_steps)])
17
  return np.stack(output)
18
 
 
19
  def normalize_data(data):
20
  df_test_value = (data - scaler["mean"]) / scaler["std"]
21
  return df_test_value
 
29
  return fig
30
 
31
  def get_anomalies(df_test_value):
 
32
  x_test = create_sequences(df_test_value.values)
33
  model = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
34
 
 
35
  x_test_pred = model.predict(x_test)
36
  test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
37
  test_mae_loss = test_mae_loss.reshape((-1))
38
 
 
39
  anomalies = test_mae_loss > scaler["threshold"]
40
  return anomalies
41
 
42
  def plot_anomalies(df_test_value, data, anomalies):
 
43
  anomalous_data_indices = []
44
  for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
45
  if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
 
54
  return fig
55
 
56
  def clean_data(df):
 
 
 
 
57
  if "timestamp" in df.columns and "value" in df.columns:
58
  df["timestamp"] = pd.to_datetime(df["timestamp"])
59
  return df
 
 
60
  elif "Date" in df.columns and "Hour" in df.columns and "Hourly_Labor_Hours_Total" in df.columns:
 
61
  df["timestamp"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
 
 
62
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] + pd.DateOffset(days=1)
63
  df["timestamp"] = df["timestamp"].dt.floor('h')
 
 
64
  df = df[["timestamp", "Hourly_Labor_Hours_Total"]]
 
 
65
  df.rename(columns={"Hourly_Labor_Hours_Total": "value"}, inplace=True)
 
66
  elif "Date_CY" in df.columns and "Hour" in df.columns and "Net_Sales_CY" in df.columns:
 
67
  df["timestamp"] = pd.to_datetime(df["Date_CY"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
 
 
68
  df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] - pd.DateOffset(days=1)
69
  df["timestamp"] = df["timestamp"].dt.floor('h')
 
 
70
  df = df[["timestamp", "Net_Sales_CY"]]
 
 
71
  df.rename(columns={"Net_Sales_CY": "value"}, inplace=True)
 
 
72
  df = df.dropna(subset=['value'])
 
73
  return df
 
74
  else:
75
  raise ValueError("Dataframe does not contain necessary columns.")
76
 
77
  def master(file):
 
78
  data = pd.read_csv(file.name)
79
+ print(f"Original data shape: {data.shape}") # Debug statement
 
80
  data = clean_data(data)
81
+ print(f"Cleaned data shape: {data.shape}") # Debug statement
 
82
  data['timestamp'] = pd.to_datetime(data['timestamp'])
 
83
  data.set_index("timestamp", inplace=True)
 
 
84
  if len(data) < TIME_STEPS:
85
  return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
 
86
  df_test_value = normalize_data(data)
 
87
  plot1 = plot_test_data(df_test_value)
 
88
  anomalies = get_anomalies(df_test_value)
 
89
  plot2 = plot_anomalies(df_test_value, data, anomalies)
90
  return plot2
91
 
 
101
  )
102
 
103
  iface.launch()
104
+