lschlessinger committed on
Commit
0e1f732
·
1 Parent(s): d72caf0

refactor: split out core logic from app

Browse files
Files changed (3) hide show
  1. app.py +27 -259
  2. match_parser.py +261 -0
  3. util.py +8 -0
app.py CHANGED
@@ -1,272 +1,36 @@
1
  from pathlib import Path
2
- from typing import Optional, Tuple
3
 
4
  import gradio as gr
5
- import matplotlib.pyplot as plt
6
- import numpy as np
7
- import pandas as pd
8
- import plotly.graph_objects as go
9
- import seaborn as sns
10
- from wordcloud import WordCloud
11
 
12
-
13
- def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
14
- columns = {
15
- "Rating": "rating",
16
- "Result": "result",
17
- "Scores": "scores",
18
- "Opponent": "opponent",
19
- "OpponentRating": "opponent_rating",
20
- }
21
-
22
- if is_tournament:
23
- columns.update({
24
- "TournamentStartDate": "tournament_start_date",
25
- "TournamentEndDate": "tournament_end_date",
26
- " Touranament": "tournament",
27
- })
28
- else:
29
- columns.update({
30
- "EventDate": "event_date",
31
- "LeagueName": "league_name"
32
- })
33
-
34
- return df.rename(columns=columns)
35
-
36
-
37
- def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
38
- if is_tournament:
39
- df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
40
- df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
41
- df["tournament"] = df["tournament"].astype('category')
42
- else:
43
- df["event_date"] = pd.to_datetime(df["event_date"])
44
- df["league_name"] = df["league_name"].astype('string')
45
-
46
- df["rating"] = df["rating"].astype('int')
47
- df["result"] = df["result"].astype('category')
48
- df["scores"] = df["scores"].astype('string')
49
- df["opponent"] = df["opponent"].astype('category')
50
- df["opponent_rating"] = df["opponent_rating"].astype('int')
51
-
52
- return df
53
-
54
- def snake_case_to_human_readable(s: str) -> str:
55
- return " ".join(s.capitalize().split("_"))
56
-
57
- def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
58
- """Make a data frame's columns human-readable."""
59
- if df is None:
60
- return None
61
-
62
- nat_to_none = lambda x: None if x == "NaT" else x
63
- if is_tournament:
64
- if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
65
- df['tournament_start_date'] = df['tournament_start_date'].dt.date.astype(str).apply(nat_to_none)
66
- df['tournament_end_date'] = df['tournament_end_date'].dt.date.astype(str).apply(nat_to_none)
67
-
68
- def create_date(tournament_start_date, tournament_end_date):
69
- missing_start_date = tournament_start_date is None
70
- missing_end_date = tournament_end_date is None
71
- if not missing_start_date and not missing_end_date:
72
- if tournament_start_date is not tournament_end_date:
73
- return ' - '.join((tournament_start_date, tournament_end_date))
74
- else:
75
- return tournament_start_date
76
- else:
77
- return tournament_start_date if missing_end_date else tournament_end_date
78
-
79
- df["date"] = df.apply(lambda row: create_date(row['tournament_start_date'], row['tournament_end_date']), axis=1)
80
- df = df.drop(columns=["tournament_start_date", "tournament_end_date"])
81
-
82
- # Move date to the front.
83
- columns = list(df.columns)
84
- columns.insert(0, columns.pop(columns.index("date")))
85
- df = df.loc[:, columns]
86
- else:
87
- if "event_date" in df.columns:
88
- df['event_date'] = df['event_date'].dt.date.astype(str).apply(nat_to_none)
89
- df = df.rename(columns={"league_name": "league"})
90
-
91
- df = df.rename(columns=lambda c: snake_case_to_human_readable(c))
92
- return df
93
-
94
- def _check_match_type(match_type: str) -> str:
95
- allowed_match_types = {"tournament", "league"}
96
- if match_type not in allowed_match_types:
97
- raise ValueError(
98
- f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
99
- return match_type
100
-
101
-
102
- def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
103
- key_name = "tournament_end_date" if is_tournament else "event_date"
104
- return df[key_name].nunique()
105
-
106
-
107
- def get_current_rating(df: pd.DataFrame) -> int:
108
- return df.rating.iloc[0]
109
-
110
-
111
- def get_max_rating(df: pd.DataFrame) -> int:
112
- return df.rating.max()
113
-
114
-
115
- def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
116
- fig = plt.figure()
117
- plt.title('Matches per competition')
118
- sns.histplot(df.groupby('tournament' if is_tournament else "event_date").size())
119
- plt.xlabel('Number of matches in competition')
120
- return fig
121
-
122
-
123
- def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
124
- fig = plt.figure()
125
- key_name = "tournament" if is_tournament else "league_name"
126
- wordcloud = WordCloud().generate(" ".join(df[key_name].values.tolist()))
127
- plt.imshow(wordcloud, interpolation='bilinear')
128
- plt.axis("off")
129
- return fig
130
-
131
-
132
- def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
133
- fig = plt.figure()
134
- wordcloud = WordCloud().generate(" ".join(df.opponent.values.tolist()))
135
- plt.imshow(wordcloud, interpolation='bilinear')
136
- plt.axis("off")
137
- return fig
138
-
139
-
140
- def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
141
- fig = go.Figure()
142
- fig.add_trace(go.Scatter(x=df["tournament_end_date" if is_tournament else "event_date"],
143
- y=df["rating"],
144
- mode='lines+markers',
145
- line=dict( width=0.9),
146
- marker=dict(size=4))),
147
-
148
- fig.update_layout(
149
- title='Rating over time',
150
- xaxis_title='Competition date',
151
- yaxis_title='Rating',
152
- showlegend=False,
153
- template="plotly_white",
154
- )
155
-
156
- return fig
157
-
158
-
159
- def get_max_abs_int(int_csv_str: str) -> int:
160
- """Get the max absolute value int from an int CSV."""
161
- ints = [abs(int(i.strip())) for i in int_csv_str.split(',') if i]
162
- return max(ints)
163
-
164
-
165
- def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
166
- if not is_tournament:
167
- return None
168
- df_non_null = df.loc[~df.scores.isna()]
169
- return df_non_null.iloc[[df_non_null.scores.apply(get_max_abs_int).argmax()]]
170
-
171
-
172
- def get_win_loss_record_str(group_df) -> str:
173
- if len(group_df) > 0:
174
- win_loss_counts = group_df.value_counts()
175
- n_wins = win_loss_counts.Won if hasattr(win_loss_counts, "Won") else 0
176
- n_losses = win_loss_counts.Lost if hasattr(win_loss_counts, "Lost") else 0
177
- else:
178
- n_wins = 0
179
- n_losses = 0
180
-
181
- return f"{n_wins}, {n_losses}"
182
-
183
-
184
- def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
185
- df_with_opponents = df.loc[df.opponent != "-, -"]
186
-
187
- most_common_opponents_df = df_with_opponents.groupby('opponent').agg({"result": [get_win_loss_record_str, "size"]})
188
- most_common_opponents_df.columns = most_common_opponents_df.columns.get_level_values(1)
189
- most_common_opponents_df.rename({"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"}, axis=1,
190
- inplace=True)
191
- most_common_opponents_df["Opponent"] = most_common_opponents_df.index
192
- return most_common_opponents_df.sort_values("Number of matches", ascending=False)[
193
- ["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
194
-
195
-
196
- def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
197
- """Get the top-n wins sorted by opponent rating."""
198
- return df.loc[df.result == 'Won'].sort_values("opponent_rating", ascending=False).head(top_n)
199
-
200
-
201
- def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
202
- """Get the top-n wins sorted by rating difference."""
203
- df['rating_difference'] = df['opponent_rating'] - df['rating']
204
- return df.loc[df.result == 'Won'].sort_values("rating_difference", ascending=False).head(top_n)
205
-
206
-
207
- def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
208
- return df.iloc[df.opponent_rating.idxmax()].to_frame().transpose()
209
-
210
-
211
- def get_opponent_rating_distr_fig(df: pd.DataFrame):
212
- fig = plt.figure()
213
- plt.title('Opponent rating distribution')
214
- sns.histplot(data=df, x="opponent_rating", hue='result')
215
- plt.xlabel('Opponent rating')
216
- return fig
217
-
218
-
219
- def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
220
- fig, ax = plt.subplots(figsize=(12, 8))
221
- plt.title(f'Opponent rating distribution over time')
222
- x_key_name = "tournament_end_date" if is_tournament else "event_date"
223
- sns.violinplot(data=df,
224
- x=df[x_key_name].dt.year,
225
- y="opponent_rating",
226
- hue="result",
227
- split=True,
228
- inner='points',
229
- cut=1,
230
- ax=ax)
231
- plt.xticks(rotation=30)
232
- plt.xlabel('Competition year')
233
- plt.ylabel('Opponent rating')
234
- return fig
235
-
236
-
237
- def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
238
- match_type = _check_match_type(file_path.name.split('_')[0])
239
- is_tournament = match_type == "tournament"
240
-
241
- df = pd.read_csv(file_path)
242
- df = _rename_columns(df, is_tournament)
243
- df = _fix_dtypes(df, is_tournament)
244
-
245
- return df, is_tournament
246
 
247
 
248
  def usatt_rating_analyzer(file_obj):
249
  # Load data.
250
- df, is_tournament = load_match_df(Path(file_obj.name))
 
251
 
252
  # Create outputs.
253
- current_rating = get_current_rating(df)
254
- peak_rating = get_max_rating(df)
255
- n_competitions_played = get_num_competitions_played(df, is_tournament)
 
 
256
  n_matches_played = len(df)
257
- matches_per_competition_fig = get_matches_per_competition_fig(df, is_tournament)
258
- opponent_name_word_cloud_fig = get_opponent_name_word_cloud_fig(df)
259
- competition_name_word_cloud_fig = get_competition_name_word_cloud_fig(df, is_tournament)
260
- most_frequent_opponents = make_df_columns_readable(get_most_frequent_opponents(df), is_tournament)
261
- best_wins = make_df_columns_readable(get_best_wins(df), is_tournament)
262
- biggest_upsets = make_df_columns_readable(get_biggest_upsets(df), is_tournament)
263
- highest_rated_opponent = make_df_columns_readable(get_highest_rated_opponent(df), is_tournament)
264
- rating_over_time_fig = get_rating_over_time_fig(df, is_tournament)
265
- match_with_longest_game = make_df_columns_readable(get_match_with_longest_game(df, is_tournament), is_tournament)
266
- opponent_rating_distr_fig = get_opponent_rating_distr_fig(df)
267
- opponent_rating_dist_over_time_fig = get_opponent_rating_dist_over_time_fig(df, is_tournament)
268
-
269
- return (current_rating,
 
270
  peak_rating,
271
  n_competitions_played,
272
  n_matches_played,
@@ -302,7 +66,7 @@ with gr.Blocks() as demo:
302
  """)
303
  with gr.Row():
304
  with gr.Column():
305
- input_file = gr.File(label='USATT Results File', file_types=['file'])
306
  btn = gr.Button(analyze_btn_title)
307
 
308
  gr.Markdown("""<br />
@@ -313,6 +77,9 @@ with gr.Blocks() as demo:
313
  """)
314
 
315
  with gr.Group():
 
 
 
316
  with gr.Row():
317
  with gr.Column():
318
  current_rating_box = gr.Textbox(lines=1, label="Current rating")
@@ -371,6 +138,7 @@ with gr.Blocks() as demo:
371
 
372
  inputs = [input_file]
373
  outputs = [
 
374
  current_rating_box,
375
  peak_rating_box,
376
  num_comps_box,
 
1
  from pathlib import Path
 
2
 
3
  import gradio as gr
 
 
 
 
 
 
4
 
5
+ import match_parser as mp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  def usatt_rating_analyzer(file_obj):
9
  # Load data.
10
+ file_path = Path(file_obj.name)
11
+ df, is_tournament = mp.load_match_df(file_path)
12
 
13
  # Create outputs.
14
+ print(file_path.stem)
15
+ player_name = mp.get_player_name(file_path.stem)
16
+ current_rating = mp.get_current_rating(df)
17
+ peak_rating = mp.get_max_rating(df)
18
+ n_competitions_played = mp.get_num_competitions_played(df, is_tournament)
19
  n_matches_played = len(df)
20
+ matches_per_competition_fig = mp.get_matches_per_competition_fig(df, is_tournament)
21
+ opponent_name_word_cloud_fig = mp.get_opponent_name_word_cloud_fig(df)
22
+ competition_name_word_cloud_fig = mp.get_competition_name_word_cloud_fig(df, is_tournament)
23
+ most_frequent_opponents = mp.make_df_columns_readable(mp.get_most_frequent_opponents(df), is_tournament)
24
+ best_wins = mp.make_df_columns_readable(mp.get_best_wins(df), is_tournament)
25
+ biggest_upsets = mp.make_df_columns_readable(mp.get_biggest_upsets(df), is_tournament)
26
+ highest_rated_opponent = mp.make_df_columns_readable(mp.get_highest_rated_opponent(df), is_tournament)
27
+ rating_over_time_fig = mp.get_rating_over_time_fig(df, is_tournament)
28
+ match_with_longest_game = mp.make_df_columns_readable(mp.get_match_with_longest_game(df, is_tournament), is_tournament)
29
+ opponent_rating_distr_fig = mp.get_opponent_rating_distr_fig(df)
30
+ opponent_rating_dist_over_time_fig = mp.get_opponent_rating_dist_over_time_fig(df, is_tournament)
31
+
32
+ return (player_name,
33
+ current_rating,
34
  peak_rating,
35
  n_competitions_played,
36
  n_matches_played,
 
66
  """)
67
  with gr.Row():
68
  with gr.Column():
69
+ input_file = gr.File(label='USATT Results File', file_types=['file'], keepfilename=True)
70
  btn = gr.Button(analyze_btn_title)
71
 
72
  gr.Markdown("""<br />
 
77
  """)
78
 
79
  with gr.Group():
80
+ with gr.Row():
81
+ with gr.Column():
82
+ player_name_box = gr.Textbox(lines=1, label="Player name")
83
  with gr.Row():
84
  with gr.Column():
85
  current_rating_box = gr.Textbox(lines=1, label="Current rating")
 
138
 
139
  inputs = [input_file]
140
  outputs = [
141
+ player_name_box,
142
  current_rating_box,
143
  peak_rating_box,
144
  num_comps_box,
match_parser.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional, Tuple
4
+
5
+ import matplotlib.pyplot as plt
6
+ import pandas as pd
7
+ import plotly.graph_objects as go
8
+ import requests
9
+ import seaborn as sns
10
+ from bs4 import BeautifulSoup
11
+ from wordcloud import WordCloud
12
+
13
+ from util import get_max_abs_int, snake_case_to_human_readable
14
+
15
+
16
+ def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
17
+ columns = {
18
+ "Rating": "rating",
19
+ "Result": "result",
20
+ "Scores": "scores",
21
+ "Opponent": "opponent",
22
+ "OpponentRating": "opponent_rating",
23
+ }
24
+
25
+ if is_tournament:
26
+ columns.update({
27
+ "TournamentStartDate": "tournament_start_date",
28
+ "TournamentEndDate": "tournament_end_date",
29
+ " Touranament": "tournament",
30
+ })
31
+ else:
32
+ columns.update({
33
+ "EventDate": "event_date",
34
+ "LeagueName": "league_name"
35
+ })
36
+
37
+ return df.rename(columns=columns)
38
+
39
+
40
+ def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
41
+ if is_tournament:
42
+ df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
43
+ df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
44
+ df["tournament"] = df["tournament"].astype('category')
45
+ else:
46
+ df["event_date"] = pd.to_datetime(df["event_date"])
47
+ df["league_name"] = df["league_name"].astype('string')
48
+
49
+ df["rating"] = df["rating"].astype('int')
50
+ df["result"] = df["result"].astype('category')
51
+ df["scores"] = df["scores"].astype('string')
52
+ df["opponent"] = df["opponent"].astype('category')
53
+ df["opponent_rating"] = df["opponent_rating"].astype('int')
54
+
55
+ return df
56
+
57
+
58
+
59
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
    """Make a data frame's columns human-readable.

    For tournament data that has both tournament date columns, the two are
    merged into a single leading ``date`` column ("start - end", or just the
    one date when both are equal or one is missing). For league data the
    ``event_date`` column is rendered as text and ``league_name`` is renamed
    to ``league``. Finally every column name is passed through
    ``snake_case_to_human_readable``.

    Args:
        df: The frame to format, or ``None``.
        is_tournament: Whether ``df`` holds tournament (vs. league) matches.

    Returns:
        A new, formatted frame, or ``None`` when ``df`` is ``None``. The
        frame passed in is not modified.
    """
    if df is None:
        return None

    # Work on a copy so the caller's frame (often a slice of the full match
    # table) is never written to.
    df = df.copy()

    def date_strings(column: pd.Series) -> list:
        """Render a date column as ISO date strings, with None for missing."""
        # to_datetime also accepts object columns that hold timestamps.
        dates = pd.to_datetime(column).dt.date
        return [None if pd.isna(date) else str(date) for date in dates]

    def create_date(tournament_start_date, tournament_end_date):
        """Combine two optional date strings into one display string."""
        if tournament_start_date is None:
            return tournament_end_date
        if tournament_end_date is None:
            return tournament_start_date
        # Compare by value: equal dates are distinct string objects, so an
        # identity check would render one-day events as "d - d".
        if tournament_start_date != tournament_end_date:
            return ' - '.join((tournament_start_date, tournament_end_date))
        return tournament_start_date

    if is_tournament:
        if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
            start_dates = date_strings(df["tournament_start_date"])
            end_dates = date_strings(df["tournament_end_date"])

            # Built from a plain list so that an empty frame is handled too.
            df["date"] = [create_date(start, end) for start, end in zip(start_dates, end_dates)]
            df = df.drop(columns=["tournament_start_date", "tournament_end_date"])

            # Move date to the front.
            columns = ["date"] + [column for column in df.columns if column != "date"]
            df = df.loc[:, columns]
    else:
        if "event_date" in df.columns:
            df["event_date"] = date_strings(df["event_date"])
        df = df.rename(columns={"league_name": "league"})

    return df.rename(columns=snake_case_to_human_readable)
95
+
96
+ def _check_match_type(match_type: str) -> str:
97
+ allowed_match_types = {"tournament", "league"}
98
+ if match_type not in allowed_match_types:
99
+ raise ValueError(
100
+ f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
101
+ return match_type
102
+
103
+
104
def fetch_player_name(profile_id: int, timeout: float = 10.0) -> str:
    """Fetch a player name from the USATT website.

    note: the profile ID is NOT the USATT number.

    Args:
        profile_id: Identifier used in the profile page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the profile header's title element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the expected name element.
    """
    url = f"https://usatt.simplycompete.com/userAccount/up/{profile_id}"
    logging.info("Fetching player name from %s", url)

    # Without a timeout a stalled server would hang the request forever.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    profile_elt = soup.find("div", class_="profile-header")
    title_elt = profile_elt.find(class_="title") if profile_elt is not None else None
    if title_elt is None:
        # Report a missing element explicitly instead of failing on None.
        raise ValueError(f"Could not find a player name on profile page {url}.")

    return title_elt.text.strip()
115
+
116
+
117
+ def get_player_name(file_stem: str) -> str:
118
+ profile_id = int(file_stem.split(" ")[0].replace("_", "").split("matches")[-1])
119
+ return fetch_player_name(profile_id)
120
+
121
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count the distinct competition dates in ``df``.

    Tournament data is counted by ``tournament_end_date``; league data by
    ``event_date``.
    """
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    return df[date_column].nunique()
124
+
125
+
126
def get_current_rating(df: pd.DataFrame) -> int:
    """Return the ``rating`` value of the first row of ``df``.

    NOTE(review): presumably the export lists the most recent match first;
    confirm against the data source.
    """
    ratings = df["rating"]
    return ratings.iloc[0]
128
+
129
+
130
def get_max_rating(df: pd.DataFrame) -> int:
    """Return the largest value in the ``rating`` column of ``df``."""
    ratings = df["rating"]
    return ratings.max()
132
+
133
+
134
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a histogram figure of the number of matches per competition.

    Matches are grouped by ``tournament`` for tournament data and by
    ``event_date`` for league data. Returns the new matplotlib figure.
    """
    figure = plt.figure()
    plt.title('Matches per competition')

    group_key = 'tournament' if is_tournament else "event_date"
    matches_per_competition = df.groupby(group_key).size()
    sns.histplot(matches_per_competition)

    plt.xlabel('Number of matches in competition')
    return figure
140
+
141
+
142
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a word-cloud figure from the competition names in ``df``.

    Names come from ``tournament`` for tournament data and ``league_name``
    for league data; they are joined with spaces before being handed to the
    word cloud. Returns the new matplotlib figure.
    """
    figure = plt.figure()

    if is_tournament:
        name_column = "tournament"
    else:
        name_column = "league_name"
    competition_names = df[name_column].values.tolist()
    cloud = WordCloud().generate(" ".join(competition_names))

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return figure
149
+
150
+
151
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
    """Build a word-cloud figure from the ``opponent`` column of ``df``.

    The opponent names are joined with spaces before being handed to the
    word cloud. Returns the new matplotlib figure.
    """
    figure = plt.figure()

    opponent_names = df.opponent.values.tolist()
    cloud = WordCloud().generate(" ".join(opponent_names))

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    return figure
157
+
158
+
159
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a plotly line chart of ``rating`` against the competition date.

    The x-axis uses ``tournament_end_date`` for tournament data and
    ``event_date`` for league data. Returns the plotly figure.
    """
    fig = go.Figure()

    date_column = "tournament_end_date" if is_tournament else "event_date"
    rating_trace = go.Scatter(
        x=df[date_column],
        y=df["rating"],
        mode='lines+markers',
        line=dict(width=0.9),
        marker=dict(size=4),
    )
    fig.add_trace(rating_trace)

    layout_options = dict(
        title='Rating over time',
        xaxis_title='Competition date',
        yaxis_title='Rating',
        showlegend=False,
        template="plotly_white",
    )
    fig.update_layout(**layout_options)

    return fig
176
+
177
+
178
+
179
+
180
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Return the match whose scores contain the largest absolute value.

    Args:
        df: Match table with a ``scores`` column of comma-separated ints.
        is_tournament: Whether ``df`` holds tournament matches.

    Returns:
        A one-row frame with the selected match, or ``None`` for league
        data or when no row has scores.
    """
    if not is_tournament:
        return None

    df_with_scores = df.loc[df.scores.notna()]
    if df_with_scores.empty:
        # argmax on an empty selection raises; there is simply no such match.
        return None

    longest_game_position = df_with_scores.scores.apply(get_max_abs_int).argmax()
    # A list indexer keeps the result a one-row DataFrame rather than a Series.
    return df_with_scores.iloc[[longest_game_position]]
185
+
186
+
187
def get_win_loss_record_str(group_df) -> str:
    """Format the "Won" and "Lost" counts of a result series as "wins, losses".

    A label that does not occur, or an empty series, counts as 0.
    """
    if len(group_df) == 0:
        return f"{0}, {0}"

    counts = group_df.value_counts()
    n_wins = counts["Won"] if "Won" in counts.index else 0
    n_losses = counts["Lost"] if "Lost" in counts.index else 0
    return f"{n_wins}, {n_losses}"
197
+
198
+
199
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` opponents with the most matches.

    Rows whose opponent is the placeholder "-, -" are ignored. The result
    has the columns "Opponent", "Number of matches" and "Win/loss record",
    sorted by number of matches in descending order.
    """
    known_opponents = df.loc[df.opponent != "-, -"]

    per_opponent = known_opponents.groupby('opponent').agg({"result": [get_win_loss_record_str, "size"]})
    # Drop the outer "result" level; the inner level holds the aggregation names.
    per_opponent.columns = per_opponent.columns.get_level_values(1)
    per_opponent.rename(
        {"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"},
        axis=1,
        inplace=True,
    )
    per_opponent["Opponent"] = per_opponent.index

    by_match_count = per_opponent.sort_values("Number of matches", ascending=False)
    output_columns = ["Opponent", "Number of matches", "Win/loss record"]
    return by_match_count[output_columns].head(top_n)
209
+
210
+
211
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the top-n wins sorted by opponent rating."""
    is_win = df.result == 'Won'
    wins = df.loc[is_win]
    wins_by_opponent_rating = wins.sort_values("opponent_rating", ascending=False)
    return wins_by_opponent_rating.head(top_n)
214
+
215
+
216
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Get the top-n wins sorted by rating difference.

    The rating difference is ``opponent_rating - rating``, so the largest
    values are wins against the most higher-rated opponents.

    Args:
        df: Match table with ``result``, ``rating`` and ``opponent_rating``.
        top_n: Maximum number of rows to return.

    Returns:
        A new frame holding the selected wins plus a ``rating_difference``
        column. ``df`` itself is left unchanged.
    """
    wins = df.loc[df.result == 'Won']
    # assign() returns a new frame, so the helper column never leaks into the
    # caller's table (and from there into other outputs built from it).
    wins = wins.assign(rating_difference=wins['opponent_rating'] - wins['rating'])
    return wins.sort_values("rating_difference", ascending=False).head(top_n)
220
+
221
+
222
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
    """Return the match against the highest-rated opponent.

    Args:
        df: Match table with an ``opponent_rating`` column.

    Returns:
        A one-row frame holding the first match with the maximum
        ``opponent_rating``, with the original column dtypes preserved.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # argmax is positional, so it pairs correctly with iloc whatever the
    # index labels are; the list indexer keeps a DataFrame with its dtypes.
    best_position = df["opponent_rating"].argmax()
    return df.iloc[[best_position]]
224
+
225
+
226
def get_opponent_rating_distr_fig(df: pd.DataFrame):
    """Build a histogram figure of ``opponent_rating``, coloured by ``result``.

    Returns the new matplotlib figure.
    """
    figure = plt.figure()
    plt.title('Opponent rating distribution')

    histogram_options = dict(data=df, x="opponent_rating", hue='result')
    sns.histplot(**histogram_options)

    plt.xlabel('Opponent rating')
    return figure
232
+
233
+
234
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a per-year violin plot of ``opponent_rating``, split by ``result``.

    The year is taken from ``tournament_end_date`` for tournament data and
    from ``event_date`` for league data. Returns the new matplotlib figure.
    """
    figure, axes = plt.subplots(figsize=(12, 8))
    plt.title('Opponent rating distribution over time')

    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    competition_year = df[date_column].dt.year

    sns.violinplot(
        data=df,
        x=competition_year,
        y="opponent_rating",
        hue="result",
        split=True,
        inner='points',
        cut=1,
        ax=axes,
    )

    plt.xticks(rotation=30)
    plt.xlabel('Competition year')
    plt.ylabel('Opponent rating')
    return figure
250
+
251
+
252
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
    """Load a match CSV and report whether it holds tournament matches.

    The match type is the part of the file name before the first underscore
    and must be accepted by ``_check_match_type``. The CSV is then read, its
    columns renamed and its dtypes fixed.

    Returns:
        The prepared frame and ``True`` for tournament data, ``False`` for
        league data.
    """
    file_name_prefix = file_path.name.split('_')[0]
    is_tournament = _check_match_type(file_name_prefix) == "tournament"

    raw_df = pd.read_csv(file_path)
    renamed_df = _rename_columns(raw_df, is_tournament)
    typed_df = _fix_dtypes(renamed_df, is_tournament)

    return typed_df, is_tournament
261
+
util.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
def snake_case_to_human_readable(s: str) -> str:
    """Capitalize ``s`` and turn every underscore into a space."""
    capitalized = s.capitalize()
    return capitalized.replace("_", " ")
3
+
4
+
5
def get_max_abs_int(int_csv_str: str) -> int:
    """Get the max absolute value int from an int CSV.

    Empty and whitespace-only fields (e.g. from a trailing comma) are
    ignored.

    Args:
        int_csv_str: Comma-separated integers, e.g. "11, -13, 9".

    Returns:
        The largest absolute value among the integers.

    Raises:
        ValueError: If a field is not an integer or there are no integers.
    """
    # Test the stripped token so that " " is skipped rather than parsed.
    magnitudes = [abs(int(token)) for token in int_csv_str.split(',') if token.strip()]
    if not magnitudes:
        raise ValueError(f"No integers found in {int_csv_str!r}.")
    return max(magnitudes)