Spaces:
Running
Running
File size: 10,724 Bytes
7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 74a75f8 7727a49 74a75f8 3231b63 7727a49 3231b63 |
|
from nfl_data_py import nfl_data_py as nfl
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('chained_assignment',None)
pd.set_option('display.max_columns',None)
import os
import datetime as dt
# Resolve the "Data" folder that lives next to this script's own directory.
current_directory = os.path.dirname(os.path.abspath(__file__))
parent_directory = os.path.dirname(current_directory)
data_directory = os.path.join(parent_directory, 'Data')

# A season is labelled by the calendar year it starts in: from August through
# December that is the current year, from January through July the previous one.
year = dt.datetime.now().year
month = dt.datetime.now().month
current_season = year - 1 if month < 8 else year
def _possession_clock_to_seconds(clock):
    """
    Convert a "minutes:seconds" string to a number of seconds (null -> 0).
    """
    if pd.isnull(clock):
        return 0
    parts = clock.split(':')
    return int(parts[0]) * 60 + int(parts[1])


def get_pbp_data(get_seasons=None):
    """
    Pull data from nflFastR's Github repo.

    The play-by-play frame is loaded through ``nfl.import_pbp_data`` and gets
    one extra column, ``TOP_seconds``: the ``drive_time_of_possession`` value
    of each row converted from "minutes:seconds" text to seconds, with
    missing values mapped to 0.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to request. ``None`` (the default) is treated as an empty
        list, so no list object is shared between calls.

    Returns
    -------
    pandas.DataFrame
        The play-by-play data with the added ``TOP_seconds`` column.
    """
    seasons = [] if get_seasons is None else get_seasons
    pbp = nfl.import_pbp_data(seasons)
    pbp['TOP_seconds'] = pbp['drive_time_of_possession'].apply(_possession_clock_to_seconds)
    return pbp
def build_gbg_data(get_seasons=None):
    """
    Build a game-by-game dataset to use for prediction models.

    For every (season, team) pair the play-by-play rows of that team's games
    are turned into per-play indicator columns, aggregated to one row per
    game, and then converted to running season-to-date values ordered by
    ``GP``: ``W``/``L`` become cumulative sums and every column from
    ``W_PCT`` onward becomes an expanding mean. For seasons other than
    ``current_season`` the values are shifted down by one game, so a row
    only carries information from the team's earlier games that season.
    Home and away rows of the same game are then joined side by side (away
    columns get an ``.Away`` suffix) and the game date is attached.

    Output files, written into ``data_directory``:

    * ``gbg_this_year.csv`` - rows of ``current_season``; written only when
      ``current_season`` is in ``get_seasons``.
    * ``gbg.csv`` - rows of every other season; written unless
      ``get_seasons`` is exactly ``[current_season]``.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to load. ``None`` (the default) is treated as an empty list,
        so no list object is shared between calls.

    Returns
    -------
    None
    """
    get_seasons = [] if get_seasons is None else get_seasons

    print('Loading play-by-play data.')
    pbp = get_pbp_data(get_seasons)
    game_date_dict = dict(pbp[['game_id','game_date']].values)
    # Sorted so the row order of the output files is reproducible; iterating
    # a plain set of strings can change order from one run to the next.
    teams = sorted(set(pbp['home_team'].dropna()) | set(pbp['away_team'].dropna()))
    seasons = pbp['season'].unique()

    # How each per-play column collapses to one value per game. The key order
    # matters: the positional column slices further down rely on it.
    features = {
        'GP':'mean',
        'W':'mean',
        'L':'mean',
        'W_PCT':'mean',
        'TOP':'sum',
        'FGA':'sum',
        'FGM':'sum',
        'FG_PCT':'mean',
        'PassTD':'sum',
        'RushTD':'sum',
        'PassTD_Allowed':'sum',
        'RushTD_Allowed':'sum',
        'PassYds':'sum',
        'RushYds':'sum',
        'PassYds_Allowed':'sum',
        'RushYds_Allowed':'sum',
        'Fum':'sum',
        'Fum_Allowed':'sum',
        'INT':'sum',
        'INT_Allowed':'sum',
        'Sacks':'sum',
        'Sacks_Allowed':'sum',
        'Penalties':'sum',
        'FirstDowns':'sum',
        '3rdDownConverted':'sum',
        '3rdDownFailed':'sum',
        '3rdDownAllowed':'sum',
        '3rdDownDefended':'sum',
        'PTS':'mean',
        'PointDiff':'mean'
    }

    print('Building game-by-game data.')
    frames = []
    for season in seasons:
        print(season)
        for team_name in tqdm(teams):
            # Explicit copy: the feature columns below are written onto this
            # frame, not onto a view of ``pbp``.
            team = pbp.loc[((pbp['home_team']==team_name) | (pbp['away_team']==team_name)) & (pbp['season']==season)].copy()
            if team.empty:
                # The team has no games in this season; it would contribute no rows.
                continue

            # create features
            team['GP'] = team['week']  # the week number stands in for games played
            # A positive ``result`` is treated as a home win, a negative one as an away win.
            team['W'] = [1 if r>0 and team_name==h else 1 if r<0 and team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]
            team['L'] = [0 if r>0 and team_name==h else 0 if r<0 and team_name==a else 1 for r,a,h in team[['result','away_team','home_team']].values]
            team['W_PCT'] = team['W']/team['GP']
            team['TOP'] = [t if team_name==p else 0 for t,p in team[['TOP_seconds','posteam']].values]
            team['FGA'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','field_goal_attempt']].values]
            team['FGM'] = [1 if team_name==p and f=='made' else 0 for p,f in team[['posteam','field_goal_result']].values]
            # 0/0 on plays without an attempt yields NaN, so only attempts
            # should count toward the per-game mean (pandas skips NaN by default).
            team['FG_PCT'] = team['FGM']/team['FGA']
            team['PassTD'] = np.where((team['posteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
            team['RushTD'] = np.where((team['posteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
            team['PassTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
            team['RushTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
            team['PassYds'] = [y if p==team_name else 0 for p,y in team[['posteam','passing_yards']].values]
            team['RushYds'] = [y if p==team_name else 0 for p,y in team[['posteam','rushing_yards']].values]
            team['PassYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','passing_yards']].values]
            team['RushYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','rushing_yards']].values]
            team['Fum'] = np.where((team['defteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
            team['Fum_Allowed'] = np.where((team['posteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
            team['INT'] = np.where((team['defteam'] == team_name) & (team['interception'] == 1), 1, 0)
            team['INT_Allowed'] = np.where((team['posteam'] == team_name) & (team['interception'] == 1), 1, 0)
            team['Sacks'] = np.where((team['defteam'] == team_name) & (team['sack'] == 1), 1, 0)
            team['Sacks_Allowed'] = np.where((team['posteam'] == team_name) & (team['sack'] == 1), 1, 0)
            team['Penalties'] = np.where((team['penalty_team'] == team_name), 1, 0)
            team['FirstDowns'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','first_down']].values]
            team['3rdDownConverted'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_converted']].values]
            team['3rdDownFailed'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_failed']].values]
            team['3rdDownAllowed'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_converted']].values]
            team['3rdDownDefended'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_failed']].values]
            team['PTS'] = [ap if at==team_name else hp if ht==team_name else None for ht,at,hp,ap in team[['home_team','away_team','home_score','away_score']].values]
            team['PointDiff'] = [r if team_name==h else -r if team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]

            # aggregate from play-by-play to game-by-game
            game = team.groupby('game_id').agg(features).reset_index().sort_values('GP')
            # Running win/loss totals over the season.
            game[['W','L']] = game[['W','L']].expanding().sum()
            # Columns are game_id, GP, W, L, W_PCT, ...; everything from W_PCT
            # onward becomes a season-to-date average.
            game[game.columns[4:]] = game[game.columns[4:]].expanding().mean()
            if season != current_season:
                # Shift everything except game_id down one game so a row only
                # holds values from the team's previous games.
                game[game.columns[1:]] = game[game.columns[1:]].shift()
            game['TEAM'] = team_name
            game['Season'] = season
            frames.append(game)

    # One concat at the end avoids re-copying the accumulated frame on every
    # loop iteration.
    data = pd.concat(frames) if frames else pd.DataFrame()

    # separate home and away data and merge
    data = data.merge(pbp[['game_id','home_team','away_team']].drop_duplicates())
    home = data.loc[data['home_team']==data['TEAM']]
    away = data.loc[data['away_team']==data['TEAM']].add_suffix('.Away')
    gbg = home.merge(away,left_on='game_id',right_on='game_id.Away')
    gbg = gbg.drop(columns=['TEAM','TEAM.Away','home_team.Away','away_team.Away','Season.Away','game_id.Away'])
    gbg['game_date'] = gbg['game_id'].map(game_date_dict)

    # save current data
    if current_season in get_seasons:
        gbg_this_year = gbg.loc[gbg['Season']==current_season]
        file_path = os.path.join(data_directory, 'gbg_this_year.csv')
        gbg_this_year.to_csv(file_path, index=False)

    # save historical data
    if get_seasons != [current_season]:
        gbg = gbg.loc[gbg['Season']!=current_season]
        file_path = os.path.join(data_directory, 'gbg.csv')
        gbg.to_csv(file_path, index=False)
def _decimal_to_american(decimal_odds):
    """
    Convert decimal odds to American (moneyline) odds, rounded to an integer.
    """
    if decimal_odds >= 2:
        return round((decimal_odds - 1) * 100)
    return round(-100 / (decimal_odds - 1))


def add_odds_data():
    """
    Get odds from Australian Sports Betting's free online dataset and merge it with game-by-game data.

    Reads ``gbg.csv`` and ``gbg_this_year.csv`` from ``data_directory``,
    joins each with the closing odds on a key built from the game date, home
    team abbreviation and away team abbreviation, adds the target columns
    ``Home-Team-Cover`` (1 = covered, 0 = not covered, 2 = push),
    ``Home-Team-Win`` and ``Over``, and writes ``gbg_and_odds.csv`` and
    ``gbg_and_odds_this_year.csv`` back into ``data_directory`` with one row
    per ``game_id``.

    Returns
    -------
    None
    """
    # get team abbreviations
    team_descriptions = nfl.import_team_desc()
    team_abbreviation_dict = dict(team_descriptions[['team_name','team_abbr']].values)

    # get odds
    odds = pd.read_excel('https://www.aussportsbetting.com/historical_data/nfl.xlsx')
    for side in ('Home Team', 'Away Team'):
        # Map Washington's former names to the current one before the lookup.
        odds[side] = odds[side].str.replace('Washington Redskins','Washington Commanders').str.replace('Washington Football Team','Washington Commanders')
        # 'LAR' is rewritten to 'LA', presumably the abbreviation used in the
        # game-by-game files - confirm against the play-by-play data.
        odds[f'{side} Abbrev'] = odds[side].map(team_abbreviation_dict).str.replace('LAR','LA')

    # Explicit copy so the columns added below land on an independent frame.
    odds = odds[['Date','Home Score','Away Score','Home Team Abbrev','Away Team Abbrev','Home Odds Close','Away Odds Close','Total Score Close','Home Line Close']].copy()
    odds['Key'] = odds['Date'].astype(str) + odds['Home Team Abbrev'] + odds['Away Team Abbrev']
    # Rows with any missing value (including unmapped team names) are dropped.
    odds = odds.drop(columns=['Date','Home Team Abbrev','Away Team Abbrev']).dropna()
    odds['Home Odds'] = [_decimal_to_american(i) for i in odds['Home Odds Close']]
    odds['Away Odds'] = [_decimal_to_american(i) for i in odds['Away Odds Close']]
    # Profit on a one-unit stake at the closing decimal odds; a tie returns 0.
    odds['Home Winnings'] = [ho-1 if h>a else -1 if a>h else 0 for ho,h,a in odds[['Home Odds Close','Home Score','Away Score']].values]
    odds['Away Winnings'] = [ao-1 if a>h else -1 if h>a else 0 for ao,h,a in odds[['Away Odds Close','Home Score','Away Score']].values]

    # load gbg data (both files are read before anything is written)
    gbg = pd.read_csv(os.path.join(data_directory, 'gbg.csv'))
    gbg_this_year = pd.read_csv(os.path.join(data_directory, 'gbg_this_year.csv'))

    # merge and save
    targets = (
        (gbg, 'gbg_and_odds.csv'),
        (gbg_this_year, 'gbg_and_odds_this_year.csv'),
    )
    for games, output_name in targets:
        games['Key'] = games['game_date'].astype(str) + games['home_team'] + games['away_team']
        gbg_and_odds = games.merge(odds, on='Key')
        # Margin compared with the negated closing line; an exact tie is a push (2).
        gbg_and_odds['Home-Team-Cover'] = [1 if (h-a)>-l else 0 if (h-a)<-l else 2 for h,a,l in gbg_and_odds[['Home Score','Away Score','Home Line Close']].values]
        gbg_and_odds['Home-Team-Win'] = (gbg_and_odds['Home Score']>gbg_and_odds['Away Score']).astype(int)
        # A combined score equal to the closing total is recorded as 0.
        gbg_and_odds['Over'] = ((gbg_and_odds['Home Score'] + gbg_and_odds['Away Score'])>gbg_and_odds['Total Score Close']).astype(int)
        file_path = os.path.join(data_directory, output_name)
        gbg_and_odds.drop_duplicates(subset='game_id').to_csv(file_path, index=False)
|