import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import uuid | |
import calendar | |
import numpy as np | |
# Set professional matplotlib styling with high resolution | |
#plt.style.use('vayuchat.mplstyle') | |
# Load the air-quality/meteorology table and convert its Timestamp column to
# pandas datetimes in the same step.
df = pd.read_csv("AQ_met_data.csv").assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"])
)
# Load the state metadata table and the NCAP funding table, in that order.
states_df, ncap_df = (
    pd.read_csv(csv_path)
    for csv_path in ("states_data.csv", "ncap_funding_data.csv")
)
# df is a pandas DataFrame with air quality data from India. The data is daily, from 2017 to 2024, and has the following columns and data types:
# Unnamed: 0 int64 | |
# Timestamp datetime64[ns] | |
# State object | |
# City object | |
# Station object | |
# site_id object | |
# Year int64 | |
# PM2.5 (µg/m³) float64 | |
# PM10 (µg/m³) float64 | |
# NO (µg/m³) float64 | |
# NO2 (µg/m³) float64 | |
# NOx (ppb) float64 | |
# NH3 (µg/m³) float64 | |
# SO2 (µg/m³) float64 | |
# CO (mg/m³) float64 | |
# Ozone (µg/m³) float64 | |
# AT (°C) float64 | |
# RH (%) float64 | |
# WS (m/s) float64 | |
# WD (deg) float64 | |
# RF (mm) float64 | |
# TOT-RF (mm) float64 | |
# SR (W/mt2) float64 | |
# BP (mmHg) float64 | |
# VWS (m/s) float64 | |
# dtype: object | |
# states_df is a pandas DataFrame with the population and area of each Indian state, and whether the state is a union territory.
# state object | |
# population int64 | |
# area (km2) int64 | |
# isUnionTerritory bool | |
# dtype: object | |
# ncap_df is a pandas DataFrame of funding released to Indian cities from 2019 to 2022 under the National Clean Air Program (NCAP).
# S. No. int64 | |
# state object | |
# city object | |
# Amount released during FY 2019-20 float64 | |
# Amount released during FY 2020-21 float64 | |
# Amount released during FY 2021-22 float64 | |
# Total fund released float64 | |
# Utilisation as on June 2022 float64 | |
# dtype: object | |
# Question: Compare the wind speed and PM2.5 levels during Delhi’s most polluted week (highest PM2.5) in December 2024 with the previous 15 days and the following 15 days on a time series plot. | |
# Generate code to answer the question and save result in 'answer' variable | |
# If creating a plot, save it with a unique filename and store the filename in 'answer' | |
# If returning text/numbers, store the result directly in 'answer' | |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import numpy as np

# Column names used throughout the analysis.
PM_COL = 'PM2.5 (µg/m³)'
WS_COL = 'WS (m/s)'
# Length of the "most polluted week" and of the context shown on either side.
WEEK_DAYS = 7
CONTEXT_DAYS = 15


def _delhi_daily_means(data, first_day, last_day):
    """Return Delhi's daily mean PM2.5 and wind speed between two dates.

    Rows whose City contains "Delhi" (case-insensitive) and whose calendar
    day lies in [first_day, last_day] are averaged per day, so that several
    rows sharing a day (e.g. one per station) collapse into a single value.
    Missing readings are skipped column by column by the mean.

    Returns a DataFrame indexed by day ('Timestamp') with PM_COL and WS_COL.
    """
    day = data['Timestamp'].dt.normalize()
    is_delhi = data['City'].str.contains('Delhi', case=False, na=False)
    selected = data.loc[
        is_delhi & day.between(first_day, last_day),
        ['Timestamp', PM_COL, WS_COL],
    ].copy()
    selected['Timestamp'] = selected['Timestamp'].dt.normalize()
    return selected.groupby('Timestamp')[[PM_COL, WS_COL]].mean().sort_index()


def _most_polluted_week(daily_pm, first_day, last_day):
    """Return (start, end) of the 7-day window with the highest mean PM2.5.

    Only windows lying entirely inside [first_day, last_day] and having a
    value for every one of their days are considered. Returns None when no
    such window exists.
    """
    # Time-based window: 7 calendar days, not 7 rows, so gaps cannot stretch it.
    weekly_mean = daily_pm.rolling(f'{WEEK_DAYS}D', min_periods=WEEK_DAYS).mean()
    week_span = pd.Timedelta(days=WEEK_DAYS - 1)
    candidates = weekly_mean.loc[first_day + week_span:last_day].dropna()
    if candidates.empty:
        return None
    week_end = candidates.idxmax()
    return week_end - week_span, week_end


def _plot_window(window, week_start, week_end):
    """Plot PM2.5 and wind speed on twin axes and return the saved filename.

    `window` needs the columns 'Timestamp', PM_COL and WS_COL. The span from
    week_start to week_end is shaded so it can be compared with the days
    before and after it.
    """
    fig, ax_pm = plt.subplots(figsize=(9, 6))
    try:
        # legend=False: a single combined legend is built below instead of
        # one legend per axis, which would overlap.
        sns.lineplot(data=window, x='Timestamp', y=PM_COL, ax=ax_pm,
                     label='PM2.5 (µg/m³)', color='tab:red', legend=False)
        ax_pm.set_ylabel('PM2.5 (µg/m³)', color='tab:red')
        ax_pm.tick_params(axis='y', labelcolor='tab:red')
        # The x label must go on the primary axis: the twin's x-axis is hidden.
        ax_pm.set_xlabel('Date')
        ax_pm.axvspan(week_start, week_end, color='tab:orange', alpha=0.2,
                      label='Most polluted week')

        ax_ws = ax_pm.twinx()
        sns.lineplot(data=window, x='Timestamp', y=WS_COL, ax=ax_ws,
                     label='Wind Speed (m/s)', color='tab:blue', legend=False)
        ax_ws.set_ylabel('Wind Speed (m/s)', color='tab:blue')
        ax_ws.tick_params(axis='y', labelcolor='tab:blue')

        handles_pm, labels_pm = ax_pm.get_legend_handles_labels()
        handles_ws, labels_ws = ax_ws.get_legend_handles_labels()
        # Attach the legend to the twin axis, which is drawn on top.
        ax_ws.legend(handles_pm + handles_ws, labels_pm + labels_ws,
                     loc='upper right')

        ax_pm.set_title('Delhi – PM2.5 and Wind Speed around Most Polluted Week (Dec 2024)')
        fig.autofmt_xdate()
        fig.tight_layout()

        # Unique name so repeated runs never overwrite an earlier plot.
        filename = f"plot_{uuid.uuid4().hex}.png"
        fig.savefig(filename, dpi=1200, bbox_inches='tight', facecolor='white')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
    return filename


def _build_answer(data):
    """Return the plot filename, or "Insufficient data" if no full week exists."""
    month_start = pd.Timestamp(year=2024, month=12, day=1)
    month_end = pd.Timestamp(year=2024, month=12, day=31)
    margin = pd.Timedelta(days=CONTEXT_DAYS)

    # Keep the context days outside December as well; filtering to December
    # alone would discard the "previous 15 days" the question asks for.
    daily = _delhi_daily_means(data, month_start - margin, month_end + margin)
    if daily.empty:
        return "Insufficient data"

    week = _most_polluted_week(daily[PM_COL], month_start, month_end)
    if week is None:
        return "Insufficient data"
    week_start, week_end = week

    # Plot whatever context is available; the days after the week may be
    # fewer than 15 when the dataset ends before the window does.
    window = daily.loc[week_start - margin:week_end + margin].reset_index()
    return _plot_window(window, week_start, week_end)


# Ensure data is loaded
if df.empty:
    answer = "No data available"
else:
    try:
        answer = _build_answer(df)
    except Exception:
        # Top-level boundary: always leave a readable result in `answer`
        # rather than letting the script crash.
        answer = "Unable to complete analysis with available data"