timeseries-anomaly-detection-autoencoders

Runtime error

App Files Files Community

Zack committed on Aug 1, 2023

Commit

782735f

1 Parent(s): a131676

fix: Drop all null columns

Browse files

Files changed (1) hide show

app.py +3 -43

app.py CHANGED Viewed

@@ -10,14 +10,12 @@ scaler = json.load(f)
 TIME_STEPS = 288
-# Generated training sequences for use in the model.
 def create_sequences(values, time_steps=TIME_STEPS):
     output = []
     for i in range(len(values) - time_steps + 1):
         output.append(values[i : (i + time_steps)])
     return np.stack(output)
 def normalize_data(data):
     df_test_value = (data - scaler["mean"]) / scaler["std"]
     return df_test_value
@@ -31,21 +29,17 @@ def plot_test_data(df_test_value):
     return fig
 def get_anomalies(df_test_value):
-    # Create sequences from test values.
     x_test = create_sequences(df_test_value.values)
     model = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
-    # Get test MAE loss.
     x_test_pred = model.predict(x_test)
     test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1)
     test_mae_loss = test_mae_loss.reshape((-1))
-    # Detect all the samples which are anomalies.
     anomalies = test_mae_loss > scaler["threshold"]
     return anomalies
 def plot_anomalies(df_test_value, data, anomalies):
-    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
     anomalous_data_indices = []
     for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
         if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
@@ -60,73 +54,38 @@ def plot_anomalies(df_test_value, data, anomalies):
     return fig
 def clean_data(df):
-    # Drop rows with any null data
-    # df = df.dropna()
-    # Check if the DataFrame already contains the correct columns
     if "timestamp" in df.columns and "value" in df.columns:
         df["timestamp"] = pd.to_datetime(df["timestamp"])
         return df
-    # Check if DataFrame contains the columns to be converted
     elif "Date" in df.columns and "Hour" in df.columns and "Hourly_Labor_Hours_Total" in df.columns:
-        # Convert "Date" and "Hour" columns into datetime format
         df["timestamp"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
-        # Handle the case where hour is 24
         df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] + pd.DateOffset(days=1)
         df["timestamp"] = df["timestamp"].dt.floor('h')
-        # Keep only necessary columns
         df = df[["timestamp", "Hourly_Labor_Hours_Total"]]
-        # Rename column
         df.rename(columns={"Hourly_Labor_Hours_Total": "value"}, inplace=True)
     elif "Date_CY" in df.columns and "Hour" in df.columns and "Net_Sales_CY" in df.columns:
-        # Convert "Date_CY" and "Hour" columns into datetime format
         df["timestamp"] = pd.to_datetime(df["Date_CY"]) + pd.to_timedelta(df["Hour"].astype(int), unit='h')
-        # Handle the case where hour is 24
         df.loc[df["timestamp"].dt.hour == 24, "timestamp"] = df["timestamp"] - pd.DateOffset(days=1)
         df["timestamp"] = df["timestamp"].dt.floor('h')
-        # Keep only necessary columns
         df = df[["timestamp", "Net_Sales_CY"]]
-        # Rename column
         df.rename(columns={"Net_Sales_CY": "value"}, inplace=True)
-        # Drop rows where 'value' is NaN
         df = df.dropna(subset=['value'])
         return df
     else:
         raise ValueError("Dataframe does not contain necessary columns.")
 def master(file):
-    # read file
     data = pd.read_csv(file.name)
-    # clean data
     data = clean_data(data)
-    # Convert timestamp to datetime after cleaning
     data['timestamp'] = pd.to_datetime(data['timestamp'])
     data.set_index("timestamp", inplace=True)
-    # Check if data has enough records to create sequences
     if len(data) < TIME_STEPS:
         return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
     df_test_value = normalize_data(data)
-    # plot input test data
     plot1 = plot_test_data(df_test_value)
-    # predict
     anomalies = get_anomalies(df_test_value)
-    #plot anomalous data points
     plot2 = plot_anomalies(df_test_value, data, anomalies)
     return plot2
@@ -142,3 +101,4 @@ iface = gr.Interface(
 )
 iface.launch()

 # Default window length used when slicing a series into sequences.
 TIME_STEPS = 288


 def create_sequences(values, time_steps=TIME_STEPS):
     """Stack every contiguous window of ``time_steps`` items from ``values``.

     Window ``i`` is ``values[i : i + time_steps]``; windows advance by one
     position, so ``len(values) - time_steps + 1`` windows are produced.

     Args:
         values: Sliceable sequence (e.g. a NumPy array) to cut into windows.
         time_steps: Length of each window.

     Returns:
         ``np.stack`` of all windows, with the window index as the first axis.
     """
     window_count = len(values) - time_steps + 1
     windows = [values[start : start + time_steps] for start in range(window_count)]
     return np.stack(windows)
 def normalize_data(data):
     """Standardise ``data`` using the ``mean`` and ``std`` entries of ``scaler``.

     Args:
         data: Values to normalise; must support subtraction and division
             by the stored statistics.

     Returns:
         ``(data - scaler["mean"]) / scaler["std"]``.
     """
     centered = data - scaler["mean"]
     return centered / scaler["std"]
     return fig
 def get_anomalies(df_test_value):
     """Flag the windows whose reconstruction error exceeds the threshold.

     The values of ``df_test_value`` are cut into windows with
     ``create_sequences``, reconstructed by the pretrained Keras model, and
     compared against ``scaler["threshold"]``.

     Args:
         df_test_value: Object exposing ``.values`` with the normalised series.

     Returns:
         Boolean array with one entry per element of the flattened
         mean-absolute-error array.
     """
     windows = create_sequences(df_test_value.values)
     pretrained = from_pretrained_keras("keras-io/timeseries-anomaly-detection")
     reconstructed = pretrained.predict(windows)

     # Mean absolute error taken along axis 1, then flattened to 1-D.
     abs_error = np.abs(reconstructed - windows)
     mae_per_window = np.mean(abs_error, axis=1).reshape(-1)

     return mae_per_window > scaler["threshold"]
 def plot_anomalies(df_test_value, data, anomalies):
     anomalous_data_indices = []
     for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1):
         if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]):
     return fig
 def _hourly_frame(df, date_col, value_col):
     """Build a ``timestamp``/``value`` frame from a date column plus ``Hour``."""
     # Adding the hour as a timedelta already rolls an "Hour" of 24 over to
     # 00:00 of the following day, so no separate hour-24 fix-up is needed
     # (``dt.hour`` is always in 0..23, so such a check could never match).
     stamps = pd.to_datetime(df[date_col]) + pd.to_timedelta(
         df["Hour"].astype(int), unit="h"
     )
     # Build a fresh frame instead of slicing and renaming in place, which
     # avoids chained-assignment warnings and leaves the caller's frame intact.
     tidy = pd.DataFrame({"timestamp": stamps.dt.floor("h"), "value": df[value_col]})
     # Rows without a value carry nothing to score downstream.
     return tidy.dropna(subset=["value"])


 def clean_data(df):
     """Normalise a supported input layout to ``timestamp`` and ``value`` columns.

     Supported layouts:
       * ``timestamp`` + ``value``: ``timestamp`` is parsed to datetime in
         place and the same frame (with any extra columns) is returned.
       * ``Date`` + ``Hour`` + ``Hourly_Labor_Hours_Total``
       * ``Date_CY`` + ``Hour`` + ``Net_Sales_CY``

     For the last two layouts a new frame with exactly the columns
     ``timestamp`` (date plus hour, floored to the hour) and ``value`` is
     returned, with rows whose value is null removed.

     Args:
         df: Input ``pandas.DataFrame``.

     Returns:
         A ``pandas.DataFrame`` containing ``timestamp`` and ``value`` columns.

     Raises:
         ValueError: If ``df`` matches none of the supported layouts.
     """
     columns = set(df.columns)

     if {"timestamp", "value"} <= columns:
         df["timestamp"] = pd.to_datetime(df["timestamp"])
         return df

     # This branch previously fell off the end of the function and returned
     # None; it now returns the converted frame like the sales branch does.
     if {"Date", "Hour", "Hourly_Labor_Hours_Total"} <= columns:
         return _hourly_frame(df, "Date", "Hourly_Labor_Hours_Total")

     if {"Date_CY", "Hour", "Net_Sales_CY"} <= columns:
         return _hourly_frame(df, "Date_CY", "Net_Sales_CY")

     raise ValueError("Dataframe does not contain necessary columns.")
 # master: end-to-end handler that reads the CSV at `file.name`, cleans and
 # normalises it, runs anomaly detection and returns the anomaly plot.
 def master(file):
     # Load the file from the path held in `file.name`.
     data = pd.read_csv(file.name)
+    print(f"Original data shape: {data.shape}")  # Debug statement
     # Bring the frame to the timestamp/value layout.
     data = clean_data(data)
+    print(f"Cleaned data shape: {data.shape}")  # Debug statement
     # Parse timestamps and use them as the index.
     data['timestamp'] = pd.to_datetime(data['timestamp'])
     data.set_index("timestamp", inplace=True)
     # With fewer than TIME_STEPS rows the function returns early.
     # NOTE(review): this early exit returns a plain string, whereas the normal
     # path returns the figure from plot_anomalies; confirm the caller copes.
     if len(data) < TIME_STEPS:
         return "Not enough data to create sequences. Need at least {} records.".format(TIME_STEPS)
     df_test_value = normalize_data(data)
     # NOTE(review): plot1 is built but never returned or otherwise used here.
     plot1 = plot_test_data(df_test_value)
     anomalies = get_anomalies(df_test_value)
     plot2 = plot_anomalies(df_test_value, data, anomalies)
     return plot2
 )
 iface.launch()