Spaces:
Running
Running
File size: 10,724 Bytes
7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 7727a49 3231b63 74a75f8 3231b63 7727a49 74a75f8 7727a49 74a75f8 3231b63 7727a49 3231b63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
from nfl_data_py import nfl_data_py as nfl
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('chained_assignment',None)
pd.set_option('display.max_columns',None)
import os
import datetime as dt
# Locate the project's Data folder relative to this file:
# <directory containing this file>/../Data
current_directory = os.path.dirname(os.path.abspath(__file__))
parent_directory = os.path.dirname(current_directory)
data_directory = os.path.join(parent_directory, 'Data')

# Read the clock once so that year and month always come from the same
# instant (two separate now() calls could straddle a year boundary).
_now = dt.datetime.now()
year = _now.year
month = _now.month

# August through December count as the season named after the current
# calendar year; January through July count as the previous year's season.
current_season = year if month >= 8 else year - 1
def get_pbp_data(get_seasons=None):
    """
    Pull data from nflFastR's Github repo.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to request; handed straight to ``nfl.import_pbp_data``.
        ``None`` (the default) is treated as an empty list.

    Returns
    -------
    The play-by-play frame returned by ``nfl.import_pbp_data`` with one extra
    column, ``TOP_seconds``: ``drive_time_of_possession`` ("MM:SS" text)
    converted to whole seconds, or 0 where the value is missing.
    """
    # Use None as the default rather than a shared mutable list.
    seasons = [] if get_seasons is None else get_seasons
    pbp = nfl.import_pbp_data(seasons)

    def clock_to_seconds(clock):
        # Missing possession times contribute nothing.
        if pd.isnull(clock):
            return 0
        parts = clock.split(':')
        return int(parts[0]) * 60 + int(parts[1])

    pbp['TOP_seconds'] = pbp['drive_time_of_possession'].apply(clock_to_seconds)
    return pbp
# How each per-play feature is collapsed to a single row per game.
# Insertion order matters: after aggregation the first three entries
# (GP, W, L) are treated differently from everything that follows.
_GBG_AGGREGATIONS = {
    'GP': 'mean',
    'W': 'mean',
    'L': 'mean',
    'W_PCT': 'mean',
    'TOP': 'sum',
    'FGA': 'sum',
    'FGM': 'sum',
    'FG_PCT': 'mean',
    'PassTD': 'sum',
    'RushTD': 'sum',
    'PassTD_Allowed': 'sum',
    'RushTD_Allowed': 'sum',
    'PassYds': 'sum',
    'RushYds': 'sum',
    'PassYds_Allowed': 'sum',
    'RushYds_Allowed': 'sum',
    'Fum': 'sum',
    'Fum_Allowed': 'sum',
    'INT': 'sum',
    'INT_Allowed': 'sum',
    'Sacks': 'sum',
    'Sacks_Allowed': 'sum',
    'Penalties': 'sum',
    'FirstDowns': 'sum',
    '3rdDownConverted': 'sum',
    '3rdDownFailed': 'sum',
    '3rdDownAllowed': 'sum',
    '3rdDownDefended': 'sum',
    'PTS': 'mean',
    'PointDiff': 'mean',
}


def _add_play_features(team, team_name):
    """Add the per-play feature columns for ``team_name`` to ``team`` in place."""
    is_home = team['home_team'] == team_name
    is_away = team['away_team'] == team_name
    on_offense = team['posteam'] == team_name
    on_defense = team['defteam'] == team_name
    result = team['result']

    def flag(mask):
        return np.where(mask, 1, 0)

    # NOTE(review): GP is the schedule week, not a count of games played, and
    # W_PCT is therefore W / week for each play -- confirm this is intended.
    team['GP'] = team['week']
    # `result` is compared from the home side: positive means the home team won.
    won = (is_home & (result > 0)) | (is_away & (result < 0))
    team['W'] = flag(won)
    # NOTE(review): anything that is not a win (including a tie) counts as L.
    team['L'] = flag(~won)
    team['W_PCT'] = team['W'] / team['GP']

    # NOTE(review): TOP_seconds comes from a drive-level column; if it is
    # repeated on every play of a drive, the per-game sum over-counts --
    # TODO confirm against the play-by-play schema.
    team['TOP'] = np.where(on_offense, team['TOP_seconds'], 0)
    team['FGA'] = flag(on_offense & (team['field_goal_attempt'] == 1))
    team['FGM'] = flag(on_offense & (team['field_goal_result'] == 'made'))
    # 0/0 on plays without an attempt gives NaN, which the later mean skips.
    team['FG_PCT'] = team['FGM'] / team['FGA']

    pass_td = team['pass_touchdown'] == 1
    rush_td = team['rush_touchdown'] == 1
    team['PassTD'] = flag(on_offense & pass_td)
    team['RushTD'] = flag(on_offense & rush_td)
    team['PassTD_Allowed'] = flag(on_defense & pass_td)
    team['RushTD_Allowed'] = flag(on_defense & rush_td)

    team['PassYds'] = np.where(on_offense, team['passing_yards'], 0)
    team['RushYds'] = np.where(on_offense, team['rushing_yards'], 0)
    team['PassYds_Allowed'] = np.where(on_defense, team['passing_yards'], 0)
    team['RushYds_Allowed'] = np.where(on_defense, team['rushing_yards'], 0)

    # Turnovers and sacks: the plain name is what this team's defense
    # produced, "_Allowed" is what this team's offense gave up.
    fumble_lost = team['fumble_lost'] == 1
    intercepted = team['interception'] == 1
    sacked = team['sack'] == 1
    team['Fum'] = flag(on_defense & fumble_lost)
    team['Fum_Allowed'] = flag(on_offense & fumble_lost)
    team['INT'] = flag(on_defense & intercepted)
    team['INT_Allowed'] = flag(on_offense & intercepted)
    team['Sacks'] = flag(on_defense & sacked)
    team['Sacks_Allowed'] = flag(on_offense & sacked)

    team['Penalties'] = flag(team['penalty_team'] == team_name)
    team['FirstDowns'] = flag(on_offense & (team['first_down'] == 1))

    third_converted = team['third_down_converted'] == 1
    third_failed = team['third_down_failed'] == 1
    team['3rdDownConverted'] = flag(on_offense & third_converted)
    team['3rdDownFailed'] = flag(on_offense & third_failed)
    team['3rdDownAllowed'] = flag(on_defense & third_converted)
    team['3rdDownDefended'] = flag(on_defense & third_failed)

    team['PTS'] = np.where(
        is_away, team['away_score'],
        np.where(is_home, team['home_score'], np.nan),
    )
    team['PointDiff'] = np.where(
        is_home, result,
        np.where(is_away, -result, 0),
    )


def _team_game_rows(season_pbp, team_name, season, lag_one_game):
    """
    Build one team's game-by-game rows for one season.

    Returns ``None`` when the team has no plays in ``season_pbp``.
    """
    involved = (season_pbp['home_team'] == team_name) | (season_pbp['away_team'] == team_name)
    # Work on an explicit copy so the new columns never touch the shared frame.
    team = season_pbp.loc[involved].copy()
    if team.empty:
        return None

    # create features
    _add_play_features(team, team_name)

    # aggregate from play-by-play to game-by-game
    game = team.groupby('game_id').agg(_GBG_AGGREGATIONS).reset_index().sort_values('GP')

    stat_columns = list(_GBG_AGGREGATIONS)
    # W and L become running totals; every stat from W_PCT onward becomes a
    # running (season-to-date) average of the per-game values.
    game[['W', 'L']] = game[['W', 'L']].expanding().sum()
    averaged = stat_columns[3:]
    game[averaged] = game[averaged].expanding().mean()

    if lag_one_game:
        # Move every stat down one game; presumably so a row only reflects
        # the games played before it. The first game of the season is NaN.
        game[stat_columns] = game[stat_columns].shift()

    game['TEAM'] = team_name
    game['Season'] = season
    return game


def build_gbg_data(get_seasons=None):
    """
    Build a game-by-game dataset to use for prediction models.

    Play-by-play data for ``get_seasons`` is loaded with ``get_pbp_data``,
    turned into season-to-date stats per team and game, and joined so each
    output row is one game: the home team's columns under their plain names
    and the away team's columns suffixed with ``.Away``.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to build. ``None`` (the default) is treated as an empty list.

    Side effects
    ------------
    Writes ``gbg_this_year.csv`` to ``data_directory`` when ``current_season``
    is among ``get_seasons``, and writes ``gbg.csv`` (all other seasons)
    unless ``get_seasons`` is exactly ``[current_season]``. Rows of seasons
    other than ``current_season`` are shifted by one game; current-season
    rows are not. Returns nothing.
    """
    # None instead of a shared mutable default; list() so the comparison with
    # [current_season] below also behaves for tuples and ranges.
    get_seasons = [] if get_seasons is None else list(get_seasons)

    print('Loading play-by-play data.')
    pbp = get_pbp_data(get_seasons)
    game_date_dict = (
        pbp.drop_duplicates('game_id', keep='last')
        .set_index('game_id')['game_date']
        .to_dict()
    )
    # Sorted so that the row order of the output is reproducible.
    teams = sorted(set(pbp['home_team'].dropna()) | set(pbp['away_team'].dropna()))
    seasons = pbp['season'].unique()

    print('Building game-by-game data.')
    frames = []
    for season in seasons:
        print(season)
        season_pbp = pbp.loc[pbp['season'] == season]
        lag_one_game = season != current_season
        for team_name in tqdm(teams):
            game = _team_game_rows(season_pbp, team_name, season, lag_one_game)
            if game is not None:
                frames.append(game)
    # A single concat at the end avoids re-copying the growing frame on
    # every iteration.
    data = pd.concat(frames) if frames else pd.DataFrame()

    # separate home and away data and merge
    matchups = pbp[['game_id', 'home_team', 'away_team']].drop_duplicates()
    data = data.merge(matchups)
    home = data.loc[data['home_team'] == data['TEAM']]
    away = data.loc[data['away_team'] == data['TEAM']].add_suffix('.Away')
    gbg = home.merge(away, left_on='game_id', right_on='game_id.Away')
    gbg = gbg.drop(columns=['TEAM', 'TEAM.Away', 'home_team.Away', 'away_team.Away', 'Season.Away', 'game_id.Away'])
    gbg['game_date'] = gbg['game_id'].map(game_date_dict)

    os.makedirs(data_directory, exist_ok=True)

    # save current data
    if current_season in get_seasons:
        gbg_this_year = gbg.loc[gbg['Season'] == current_season]
        file_path = os.path.join(data_directory, 'gbg_this_year.csv')
        gbg_this_year.to_csv(file_path, index=False)

    # save historical data
    if get_seasons != [current_season]:
        gbg = gbg.loc[gbg['Season'] != current_season]
        file_path = os.path.join(data_directory, 'gbg.csv')
        gbg.to_csv(file_path, index=False)
def _decimal_to_american(decimal_odds):
    """Convert decimal odds to rounded American (moneyline) odds."""
    if decimal_odds >= 2:
        return round((decimal_odds - 1) * 100)
    return round(-100 / (decimal_odds - 1))


def add_odds_data():
    """
    Get odds from Australian Sports Betting's free online dataset and merge it with game-by-game data.

    Reads ``gbg.csv`` and ``gbg_this_year.csv`` from ``data_directory``,
    inner-joins each with the odds on a key of game date + home abbreviation
    + away abbreviation, adds the outcome columns ``Home-Team-Cover``
    (1 = home covered, 0 = did not, 2 = landed exactly on the line),
    ``Home-Team-Win`` and ``Over``, and writes ``gbg_and_odds.csv`` and
    ``gbg_and_odds_this_year.csv`` with one row per ``game_id``.
    Returns nothing.
    """
    # get team abbreviations
    team_descriptions = nfl.import_team_desc()
    team_abbreviation_dict = dict(team_descriptions[['team_name', 'team_abbr']].values)

    # get odds
    odds = pd.read_excel('https://www.aussportsbetting.com/historical_data/nfl.xlsx')
    for side in ('Home', 'Away'):
        # Map Washington's earlier names onto the current one, then map
        # full names to abbreviations.
        full_name = (
            odds[f'{side} Team']
            .str.replace('Washington Redskins', 'Washington Commanders', regex=False)
            .str.replace('Washington Football Team', 'Washington Commanders', regex=False)
        )
        odds[f'{side} Team'] = full_name
        # 'LAR' is rewritten to 'LA', presumably to match the abbreviation
        # used in the game-by-game files.
        odds[f'{side} Team Abbrev'] = (
            full_name.map(team_abbreviation_dict).str.replace('LAR', 'LA', regex=False)
        )

    odds = odds[['Date', 'Home Score', 'Away Score', 'Home Team Abbrev', 'Away Team Abbrev',
                 'Home Odds Close', 'Away Odds Close', 'Total Score Close', 'Home Line Close']].copy()
    odds['Key'] = odds['Date'].astype(str) + odds['Home Team Abbrev'] + odds['Away Team Abbrev']
    # Rows missing a score, a closing price/line or a key are discarded.
    odds = odds.drop(columns=['Date', 'Home Team Abbrev', 'Away Team Abbrev']).dropna()

    odds['Home Odds'] = [_decimal_to_american(price) for price in odds['Home Odds Close']]
    odds['Away Odds'] = [_decimal_to_american(price) for price in odds['Away Odds Close']]

    # Profit on a one-unit bet at the closing decimal price: price - 1 for a
    # win, -1 for a loss, 0 when the scores are level.
    home_won = odds['Home Score'] > odds['Away Score']
    away_won = odds['Away Score'] > odds['Home Score']
    odds['Home Winnings'] = np.where(home_won, odds['Home Odds Close'] - 1, np.where(away_won, -1, 0))
    odds['Away Winnings'] = np.where(away_won, odds['Away Odds Close'] - 1, np.where(home_won, -1, 0))

    # load gbg data (both inputs are read before anything is written)
    output_names = ('gbg_and_odds.csv', 'gbg_and_odds_this_year.csv')
    inputs = [
        pd.read_csv(os.path.join(data_directory, name))
        for name in ('gbg.csv', 'gbg_this_year.csv')
    ]

    # merge and save
    for games, output_name in zip(inputs, output_names):
        games['Key'] = games['game_date'].astype(str) + games['home_team'] + games['away_team']
        gbg_and_odds = games.merge(odds, left_on='Key', right_on='Key')

        margin = gbg_and_odds['Home Score'] - gbg_and_odds['Away Score']
        needed = -gbg_and_odds['Home Line Close']
        gbg_and_odds['Home-Team-Cover'] = np.where(margin > needed, 1, np.where(margin < needed, 0, 2))
        gbg_and_odds['Home-Team-Win'] = (gbg_and_odds['Home Score'] > gbg_and_odds['Away Score']).astype(int)
        total_points = gbg_and_odds['Home Score'] + gbg_and_odds['Away Score']
        gbg_and_odds['Over'] = (total_points > gbg_and_odds['Total Score Close']).astype(int)

        file_path = os.path.join(data_directory, output_name)
        gbg_and_odds.drop_duplicates(subset='game_id').to_csv(file_path, index=False)
|