Commit
·
0e1f732
1
Parent(s):
d72caf0
refactor: split out core logic from app
Browse files- app.py +27 -259
- match_parser.py +261 -0
- util.py +8 -0
app.py
CHANGED
@@ -1,272 +1,36 @@
|
|
1 |
from pathlib import Path
|
2 |
-
from typing import Optional, Tuple
|
3 |
|
4 |
import gradio as gr
|
5 |
-
import matplotlib.pyplot as plt
|
6 |
-
import numpy as np
|
7 |
-
import pandas as pd
|
8 |
-
import plotly.graph_objects as go
|
9 |
-
import seaborn as sns
|
10 |
-
from wordcloud import WordCloud
|
11 |
|
12 |
-
|
13 |
-
def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
|
14 |
-
columns = {
|
15 |
-
"Rating": "rating",
|
16 |
-
"Result": "result",
|
17 |
-
"Scores": "scores",
|
18 |
-
"Opponent": "opponent",
|
19 |
-
"OpponentRating": "opponent_rating",
|
20 |
-
}
|
21 |
-
|
22 |
-
if is_tournament:
|
23 |
-
columns.update({
|
24 |
-
"TournamentStartDate": "tournament_start_date",
|
25 |
-
"TournamentEndDate": "tournament_end_date",
|
26 |
-
" Touranament": "tournament",
|
27 |
-
})
|
28 |
-
else:
|
29 |
-
columns.update({
|
30 |
-
"EventDate": "event_date",
|
31 |
-
"LeagueName": "league_name"
|
32 |
-
})
|
33 |
-
|
34 |
-
return df.rename(columns=columns)
|
35 |
-
|
36 |
-
|
37 |
-
def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
|
38 |
-
if is_tournament:
|
39 |
-
df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
|
40 |
-
df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
|
41 |
-
df["tournament"] = df["tournament"].astype('category')
|
42 |
-
else:
|
43 |
-
df["event_date"] = pd.to_datetime(df["event_date"])
|
44 |
-
df["league_name"] = df["league_name"].astype('string')
|
45 |
-
|
46 |
-
df["rating"] = df["rating"].astype('int')
|
47 |
-
df["result"] = df["result"].astype('category')
|
48 |
-
df["scores"] = df["scores"].astype('string')
|
49 |
-
df["opponent"] = df["opponent"].astype('category')
|
50 |
-
df["opponent_rating"] = df["opponent_rating"].astype('int')
|
51 |
-
|
52 |
-
return df
|
53 |
-
|
54 |
-
def snake_case_to_human_readable(s: str) -> str:
|
55 |
-
return " ".join(s.capitalize().split("_"))
|
56 |
-
|
57 |
-
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
|
58 |
-
"""Make a data frame's columns human-readable."""
|
59 |
-
if df is None:
|
60 |
-
return None
|
61 |
-
|
62 |
-
nat_to_none = lambda x: None if x == "NaT" else x
|
63 |
-
if is_tournament:
|
64 |
-
if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
|
65 |
-
df['tournament_start_date'] = df['tournament_start_date'].dt.date.astype(str).apply(nat_to_none)
|
66 |
-
df['tournament_end_date'] = df['tournament_end_date'].dt.date.astype(str).apply(nat_to_none)
|
67 |
-
|
68 |
-
def create_date(tournament_start_date, tournament_end_date):
|
69 |
-
missing_start_date = tournament_start_date is None
|
70 |
-
missing_end_date = tournament_end_date is None
|
71 |
-
if not missing_start_date and not missing_end_date:
|
72 |
-
if tournament_start_date is not tournament_end_date:
|
73 |
-
return ' - '.join((tournament_start_date, tournament_end_date))
|
74 |
-
else:
|
75 |
-
return tournament_start_date
|
76 |
-
else:
|
77 |
-
return tournament_start_date if missing_end_date else tournament_end_date
|
78 |
-
|
79 |
-
df["date"] = df.apply(lambda row: create_date(row['tournament_start_date'], row['tournament_end_date']), axis=1)
|
80 |
-
df = df.drop(columns=["tournament_start_date", "tournament_end_date"])
|
81 |
-
|
82 |
-
# Move date to the front.
|
83 |
-
columns = list(df.columns)
|
84 |
-
columns.insert(0, columns.pop(columns.index("date")))
|
85 |
-
df = df.loc[:, columns]
|
86 |
-
else:
|
87 |
-
if "event_date" in df.columns:
|
88 |
-
df['event_date'] = df['event_date'].dt.date.astype(str).apply(nat_to_none)
|
89 |
-
df = df.rename(columns={"league_name": "league"})
|
90 |
-
|
91 |
-
df = df.rename(columns=lambda c: snake_case_to_human_readable(c))
|
92 |
-
return df
|
93 |
-
|
94 |
-
def _check_match_type(match_type: str) -> str:
|
95 |
-
allowed_match_types = {"tournament", "league"}
|
96 |
-
if match_type not in allowed_match_types:
|
97 |
-
raise ValueError(
|
98 |
-
f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
|
99 |
-
return match_type
|
100 |
-
|
101 |
-
|
102 |
-
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
|
103 |
-
key_name = "tournament_end_date" if is_tournament else "event_date"
|
104 |
-
return df[key_name].nunique()
|
105 |
-
|
106 |
-
|
107 |
-
def get_current_rating(df: pd.DataFrame) -> int:
|
108 |
-
return df.rating.iloc[0]
|
109 |
-
|
110 |
-
|
111 |
-
def get_max_rating(df: pd.DataFrame) -> int:
|
112 |
-
return df.rating.max()
|
113 |
-
|
114 |
-
|
115 |
-
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
|
116 |
-
fig = plt.figure()
|
117 |
-
plt.title('Matches per competition')
|
118 |
-
sns.histplot(df.groupby('tournament' if is_tournament else "event_date").size())
|
119 |
-
plt.xlabel('Number of matches in competition')
|
120 |
-
return fig
|
121 |
-
|
122 |
-
|
123 |
-
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
|
124 |
-
fig = plt.figure()
|
125 |
-
key_name = "tournament" if is_tournament else "league_name"
|
126 |
-
wordcloud = WordCloud().generate(" ".join(df[key_name].values.tolist()))
|
127 |
-
plt.imshow(wordcloud, interpolation='bilinear')
|
128 |
-
plt.axis("off")
|
129 |
-
return fig
|
130 |
-
|
131 |
-
|
132 |
-
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
|
133 |
-
fig = plt.figure()
|
134 |
-
wordcloud = WordCloud().generate(" ".join(df.opponent.values.tolist()))
|
135 |
-
plt.imshow(wordcloud, interpolation='bilinear')
|
136 |
-
plt.axis("off")
|
137 |
-
return fig
|
138 |
-
|
139 |
-
|
140 |
-
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
|
141 |
-
fig = go.Figure()
|
142 |
-
fig.add_trace(go.Scatter(x=df["tournament_end_date" if is_tournament else "event_date"],
|
143 |
-
y=df["rating"],
|
144 |
-
mode='lines+markers',
|
145 |
-
line=dict( width=0.9),
|
146 |
-
marker=dict(size=4))),
|
147 |
-
|
148 |
-
fig.update_layout(
|
149 |
-
title='Rating over time',
|
150 |
-
xaxis_title='Competition date',
|
151 |
-
yaxis_title='Rating',
|
152 |
-
showlegend=False,
|
153 |
-
template="plotly_white",
|
154 |
-
)
|
155 |
-
|
156 |
-
return fig
|
157 |
-
|
158 |
-
|
159 |
-
def get_max_abs_int(int_csv_str: str) -> int:
|
160 |
-
"""Get the max absolute value int from an int CSV."""
|
161 |
-
ints = [abs(int(i.strip())) for i in int_csv_str.split(',') if i]
|
162 |
-
return max(ints)
|
163 |
-
|
164 |
-
|
165 |
-
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
|
166 |
-
if not is_tournament:
|
167 |
-
return None
|
168 |
-
df_non_null = df.loc[~df.scores.isna()]
|
169 |
-
return df_non_null.iloc[[df_non_null.scores.apply(get_max_abs_int).argmax()]]
|
170 |
-
|
171 |
-
|
172 |
-
def get_win_loss_record_str(group_df) -> str:
|
173 |
-
if len(group_df) > 0:
|
174 |
-
win_loss_counts = group_df.value_counts()
|
175 |
-
n_wins = win_loss_counts.Won if hasattr(win_loss_counts, "Won") else 0
|
176 |
-
n_losses = win_loss_counts.Lost if hasattr(win_loss_counts, "Lost") else 0
|
177 |
-
else:
|
178 |
-
n_wins = 0
|
179 |
-
n_losses = 0
|
180 |
-
|
181 |
-
return f"{n_wins}, {n_losses}"
|
182 |
-
|
183 |
-
|
184 |
-
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
|
185 |
-
df_with_opponents = df.loc[df.opponent != "-, -"]
|
186 |
-
|
187 |
-
most_common_opponents_df = df_with_opponents.groupby('opponent').agg({"result": [get_win_loss_record_str, "size"]})
|
188 |
-
most_common_opponents_df.columns = most_common_opponents_df.columns.get_level_values(1)
|
189 |
-
most_common_opponents_df.rename({"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"}, axis=1,
|
190 |
-
inplace=True)
|
191 |
-
most_common_opponents_df["Opponent"] = most_common_opponents_df.index
|
192 |
-
return most_common_opponents_df.sort_values("Number of matches", ascending=False)[
|
193 |
-
["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
|
194 |
-
|
195 |
-
|
196 |
-
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
|
197 |
-
"""Get the top-n wins sorted by opponent rating."""
|
198 |
-
return df.loc[df.result == 'Won'].sort_values("opponent_rating", ascending=False).head(top_n)
|
199 |
-
|
200 |
-
|
201 |
-
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
|
202 |
-
"""Get the top-n wins sorted by rating difference."""
|
203 |
-
df['rating_difference'] = df['opponent_rating'] - df['rating']
|
204 |
-
return df.loc[df.result == 'Won'].sort_values("rating_difference", ascending=False).head(top_n)
|
205 |
-
|
206 |
-
|
207 |
-
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
|
208 |
-
return df.iloc[df.opponent_rating.idxmax()].to_frame().transpose()
|
209 |
-
|
210 |
-
|
211 |
-
def get_opponent_rating_distr_fig(df: pd.DataFrame):
|
212 |
-
fig = plt.figure()
|
213 |
-
plt.title('Opponent rating distribution')
|
214 |
-
sns.histplot(data=df, x="opponent_rating", hue='result')
|
215 |
-
plt.xlabel('Opponent rating')
|
216 |
-
return fig
|
217 |
-
|
218 |
-
|
219 |
-
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
|
220 |
-
fig, ax = plt.subplots(figsize=(12, 8))
|
221 |
-
plt.title(f'Opponent rating distribution over time')
|
222 |
-
x_key_name = "tournament_end_date" if is_tournament else "event_date"
|
223 |
-
sns.violinplot(data=df,
|
224 |
-
x=df[x_key_name].dt.year,
|
225 |
-
y="opponent_rating",
|
226 |
-
hue="result",
|
227 |
-
split=True,
|
228 |
-
inner='points',
|
229 |
-
cut=1,
|
230 |
-
ax=ax)
|
231 |
-
plt.xticks(rotation=30)
|
232 |
-
plt.xlabel('Competition year')
|
233 |
-
plt.ylabel('Opponent rating')
|
234 |
-
return fig
|
235 |
-
|
236 |
-
|
237 |
-
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
|
238 |
-
match_type = _check_match_type(file_path.name.split('_')[0])
|
239 |
-
is_tournament = match_type == "tournament"
|
240 |
-
|
241 |
-
df = pd.read_csv(file_path)
|
242 |
-
df = _rename_columns(df, is_tournament)
|
243 |
-
df = _fix_dtypes(df, is_tournament)
|
244 |
-
|
245 |
-
return df, is_tournament
|
246 |
|
247 |
|
248 |
def usatt_rating_analyzer(file_obj):
|
249 |
# Load data.
|
250 |
-
|
|
|
251 |
|
252 |
# Create outputs.
|
253 |
-
|
254 |
-
|
255 |
-
|
|
|
|
|
256 |
n_matches_played = len(df)
|
257 |
-
matches_per_competition_fig = get_matches_per_competition_fig(df, is_tournament)
|
258 |
-
opponent_name_word_cloud_fig = get_opponent_name_word_cloud_fig(df)
|
259 |
-
competition_name_word_cloud_fig = get_competition_name_word_cloud_fig(df, is_tournament)
|
260 |
-
most_frequent_opponents = make_df_columns_readable(get_most_frequent_opponents(df), is_tournament)
|
261 |
-
best_wins = make_df_columns_readable(get_best_wins(df), is_tournament)
|
262 |
-
biggest_upsets = make_df_columns_readable(get_biggest_upsets(df), is_tournament)
|
263 |
-
highest_rated_opponent = make_df_columns_readable(get_highest_rated_opponent(df), is_tournament)
|
264 |
-
rating_over_time_fig = get_rating_over_time_fig(df, is_tournament)
|
265 |
-
match_with_longest_game = make_df_columns_readable(get_match_with_longest_game(df, is_tournament), is_tournament)
|
266 |
-
opponent_rating_distr_fig = get_opponent_rating_distr_fig(df)
|
267 |
-
opponent_rating_dist_over_time_fig = get_opponent_rating_dist_over_time_fig(df, is_tournament)
|
268 |
-
|
269 |
-
return (
|
|
|
270 |
peak_rating,
|
271 |
n_competitions_played,
|
272 |
n_matches_played,
|
@@ -302,7 +66,7 @@ with gr.Blocks() as demo:
|
|
302 |
""")
|
303 |
with gr.Row():
|
304 |
with gr.Column():
|
305 |
-
input_file = gr.File(label='USATT Results File', file_types=['file'])
|
306 |
btn = gr.Button(analyze_btn_title)
|
307 |
|
308 |
gr.Markdown("""<br />
|
@@ -313,6 +77,9 @@ with gr.Blocks() as demo:
|
|
313 |
""")
|
314 |
|
315 |
with gr.Group():
|
|
|
|
|
|
|
316 |
with gr.Row():
|
317 |
with gr.Column():
|
318 |
current_rating_box = gr.Textbox(lines=1, label="Current rating")
|
@@ -371,6 +138,7 @@ with gr.Blocks() as demo:
|
|
371 |
|
372 |
inputs = [input_file]
|
373 |
outputs = [
|
|
|
374 |
current_rating_box,
|
375 |
peak_rating_box,
|
376 |
num_comps_box,
|
|
|
1 |
from pathlib import Path
|
|
|
2 |
|
3 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
import match_parser as mp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
def usatt_rating_analyzer(file_obj):
|
9 |
# Load data.
|
10 |
+
file_path = Path(file_obj.name)
|
11 |
+
df, is_tournament = mp.load_match_df(file_path)
|
12 |
|
13 |
# Create outputs.
|
14 |
+
print(file_path.stem)
|
15 |
+
player_name = mp.get_player_name(file_path.stem)
|
16 |
+
current_rating = mp.get_current_rating(df)
|
17 |
+
peak_rating = mp.get_max_rating(df)
|
18 |
+
n_competitions_played = mp.get_num_competitions_played(df, is_tournament)
|
19 |
n_matches_played = len(df)
|
20 |
+
matches_per_competition_fig = mp.get_matches_per_competition_fig(df, is_tournament)
|
21 |
+
opponent_name_word_cloud_fig = mp.get_opponent_name_word_cloud_fig(df)
|
22 |
+
competition_name_word_cloud_fig = mp.get_competition_name_word_cloud_fig(df, is_tournament)
|
23 |
+
most_frequent_opponents = mp.make_df_columns_readable(mp.get_most_frequent_opponents(df), is_tournament)
|
24 |
+
best_wins = mp.make_df_columns_readable(mp.get_best_wins(df), is_tournament)
|
25 |
+
biggest_upsets = mp.make_df_columns_readable(mp.get_biggest_upsets(df), is_tournament)
|
26 |
+
highest_rated_opponent = mp.make_df_columns_readable(mp.get_highest_rated_opponent(df), is_tournament)
|
27 |
+
rating_over_time_fig = mp.get_rating_over_time_fig(df, is_tournament)
|
28 |
+
match_with_longest_game = mp.make_df_columns_readable(mp.get_match_with_longest_game(df, is_tournament), is_tournament)
|
29 |
+
opponent_rating_distr_fig = mp.get_opponent_rating_distr_fig(df)
|
30 |
+
opponent_rating_dist_over_time_fig = mp.get_opponent_rating_dist_over_time_fig(df, is_tournament)
|
31 |
+
|
32 |
+
return (player_name,
|
33 |
+
current_rating,
|
34 |
peak_rating,
|
35 |
n_competitions_played,
|
36 |
n_matches_played,
|
|
|
66 |
""")
|
67 |
with gr.Row():
|
68 |
with gr.Column():
|
69 |
+
input_file = gr.File(label='USATT Results File', file_types=['file'], keepfilename=True)
|
70 |
btn = gr.Button(analyze_btn_title)
|
71 |
|
72 |
gr.Markdown("""<br />
|
|
|
77 |
""")
|
78 |
|
79 |
with gr.Group():
|
80 |
+
with gr.Row():
|
81 |
+
with gr.Column():
|
82 |
+
player_name_box = gr.Textbox(lines=1, label="Player name")
|
83 |
with gr.Row():
|
84 |
with gr.Column():
|
85 |
current_rating_box = gr.Textbox(lines=1, label="Current rating")
|
|
|
138 |
|
139 |
inputs = [input_file]
|
140 |
outputs = [
|
141 |
+
player_name_box,
|
142 |
current_rating_box,
|
143 |
peak_rating_box,
|
144 |
num_comps_box,
|
match_parser.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Optional, Tuple
|
4 |
+
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import pandas as pd
|
7 |
+
import plotly.graph_objects as go
|
8 |
+
import requests
|
9 |
+
import seaborn as sns
|
10 |
+
from bs4 import BeautifulSoup
|
11 |
+
from wordcloud import WordCloud
|
12 |
+
|
13 |
+
from util import get_max_abs_int, snake_case_to_human_readable
|
14 |
+
|
15 |
+
|
16 |
+
def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
|
17 |
+
columns = {
|
18 |
+
"Rating": "rating",
|
19 |
+
"Result": "result",
|
20 |
+
"Scores": "scores",
|
21 |
+
"Opponent": "opponent",
|
22 |
+
"OpponentRating": "opponent_rating",
|
23 |
+
}
|
24 |
+
|
25 |
+
if is_tournament:
|
26 |
+
columns.update({
|
27 |
+
"TournamentStartDate": "tournament_start_date",
|
28 |
+
"TournamentEndDate": "tournament_end_date",
|
29 |
+
" Touranament": "tournament",
|
30 |
+
})
|
31 |
+
else:
|
32 |
+
columns.update({
|
33 |
+
"EventDate": "event_date",
|
34 |
+
"LeagueName": "league_name"
|
35 |
+
})
|
36 |
+
|
37 |
+
return df.rename(columns=columns)
|
38 |
+
|
39 |
+
|
40 |
+
def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
|
41 |
+
if is_tournament:
|
42 |
+
df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
|
43 |
+
df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
|
44 |
+
df["tournament"] = df["tournament"].astype('category')
|
45 |
+
else:
|
46 |
+
df["event_date"] = pd.to_datetime(df["event_date"])
|
47 |
+
df["league_name"] = df["league_name"].astype('string')
|
48 |
+
|
49 |
+
df["rating"] = df["rating"].astype('int')
|
50 |
+
df["result"] = df["result"].astype('category')
|
51 |
+
df["scores"] = df["scores"].astype('string')
|
52 |
+
df["opponent"] = df["opponent"].astype('category')
|
53 |
+
df["opponent_rating"] = df["opponent_rating"].astype('int')
|
54 |
+
|
55 |
+
return df
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
    """Return a display-ready version of a match table.

    For tournament tables that carry both ``tournament_start_date`` and
    ``tournament_end_date``, the two columns are replaced by a single leading
    ``date`` column: one date when both ends are equal or only one is known,
    ``"<start> - <end>"`` otherwise, and ``None`` when both are missing.
    For league tables, ``event_date`` (when present) is rendered as text and
    ``league_name`` is renamed to ``league``. Finally every column name is
    passed through ``snake_case_to_human_readable``.

    Args:
        df: Table to format, or ``None``.
        is_tournament: True for a tournament table, False for a league table.

    Returns:
        A new formatted frame, or ``None`` when ``df`` is ``None``. The input
        frame is not modified.
    """
    if df is None:
        return None

    def _date_text(value) -> Optional[str]:
        """Render one timestamp as 'YYYY-MM-DD', or None when it is missing."""
        return None if pd.isna(value) else str(value.date())

    def _date_span(start: Optional[str], end: Optional[str]) -> Optional[str]:
        """Combine a start and an end date string into one display value."""
        if start is None:
            return end
        if end is None:
            return start
        # Compare by value: equal dates coming from two different cells are
        # distinct string objects, so an identity check would treat a
        # single-day tournament as a date range.
        return start if start == end else ' - '.join((start, end))

    # Work on a copy so the caller's frame (often a slice of the full match
    # table) is never written to.
    df = df.copy()

    if is_tournament:
        span_columns = ["tournament_start_date", "tournament_end_date"]
        if all(column in df.columns for column in span_columns):
            # pd.to_datetime leaves datetime columns unchanged and also
            # accepts object columns that hold timestamps.
            starts = [_date_text(v) for v in pd.to_datetime(df["tournament_start_date"])]
            ends = [_date_text(v) for v in pd.to_datetime(df["tournament_end_date"])]

            df = df.drop(columns=span_columns)
            # Insert at position 0 so the date is the first displayed column.
            df.insert(0, "date", [_date_span(s, e) for s, e in zip(starts, ends)])
    else:
        if "event_date" in df.columns:
            df["event_date"] = [_date_text(v) for v in pd.to_datetime(df["event_date"])]
        df = df.rename(columns={"league_name": "league"})

    return df.rename(columns=snake_case_to_human_readable)
|
95 |
+
|
96 |
+
def _check_match_type(match_type: str) -> str:
|
97 |
+
allowed_match_types = {"tournament", "league"}
|
98 |
+
if match_type not in allowed_match_types:
|
99 |
+
raise ValueError(
|
100 |
+
f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
|
101 |
+
return match_type
|
102 |
+
|
103 |
+
|
104 |
+
def fetch_player_name(profile_id: int, timeout: float = 10.0) -> str:
    """Fetch a player name from the USATT website.

    Note: the profile ID is NOT the USATT number.

    Args:
        profile_id: Profile ID used to build the profile page URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        The stripped text of the ``title`` element inside the page's
        ``profile-header`` div.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the page does not contain the expected name element.
    """
    url = f"https://usatt.simplycompete.com/userAccount/up/{profile_id}"
    logging.info("Fetching player name from %s", url)

    page = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    profile_elt = soup.find("div", class_="profile-header")
    title_elt = profile_elt.find(class_="title") if profile_elt is not None else None
    if title_elt is None:
        # Without this check a missing element shows up as an AttributeError
        # on None, which hides the real cause.
        raise ValueError(f"Could not find a player name on {url}.")

    return title_elt.text.strip()
|
115 |
+
|
116 |
+
|
117 |
+
def get_player_name(file_stem: str) -> str:
    """Look up a player's name from the stem of a results file name.

    The profile ID is taken from the text before the first space: underscores
    are removed and whatever follows the last ``"matches"`` is parsed as an
    integer.

    NOTE(review): presumably stems look like ``tournament_matches_<id>``,
    optionally followed by a space and a browser-added suffix - confirm
    against real export names.

    Args:
        file_stem: File name without its extension.

    Returns:
        The name returned by ``fetch_player_name`` for the parsed profile ID.

    Raises:
        ValueError: If the extracted text is not an integer.
    """
    leading_token = file_stem.split(" ")[0]
    without_underscores = leading_token.replace("_", "")
    id_text = without_underscores.split("matches")[-1]
    return fetch_player_name(int(id_text))
|
120 |
+
|
121 |
+
def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
    """Count the distinct competition dates in a match table.

    Args:
        df: Match table.
        is_tournament: True to count distinct ``tournament_end_date`` values,
            False to count distinct ``event_date`` values.

    Returns:
        The number of unique values in the selected date column.
    """
    if is_tournament:
        date_column = "tournament_end_date"
    else:
        date_column = "event_date"
    return df[date_column].nunique()
|
124 |
+
|
125 |
+
|
126 |
+
def get_current_rating(df: pd.DataFrame) -> int:
    """Return the rating stored in the first row of the match table.

    NOTE(review): this treats the first row as the most recent match;
    presumably the export is ordered newest-first - confirm.
    """
    ratings = df["rating"]
    return ratings.iloc[0]
|
128 |
+
|
129 |
+
|
130 |
+
def get_max_rating(df: pd.DataFrame) -> int:
    """Return the largest value in the ``rating`` column."""
    ratings = df["rating"]
    return ratings.max()
|
132 |
+
|
133 |
+
|
134 |
+
def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a histogram of how many matches were played per competition.

    Args:
        df: Match table.
        is_tournament: True to group matches by ``tournament``, False to
            group them by ``event_date``.

    Returns:
        The matplotlib figure holding the histogram.
    """
    group_column = 'tournament' if is_tournament else "event_date"
    matches_per_competition = df.groupby(group_column).size()

    fig, ax = plt.subplots()
    ax.set_title('Matches per competition')
    sns.histplot(matches_per_competition, ax=ax)
    ax.set_xlabel('Number of matches in competition')
    return fig
|
140 |
+
|
141 |
+
|
142 |
+
def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a word cloud figure from the competition names in a match table.

    Args:
        df: Match table.
        is_tournament: True to read names from ``tournament``, False to read
            them from ``league_name``.

    Returns:
        The matplotlib figure showing the word cloud, with axes hidden. The
        figure is blank when there are no names to draw.
    """
    key_name = "tournament" if is_tournament else "league_name"
    # Missing names are not strings and would make str.join raise TypeError,
    # so drop them before building the text.
    names = df[key_name].dropna().astype(str)
    text = " ".join(names)

    fig, ax = plt.subplots()
    ax.axis("off")
    # NOTE(review): WordCloud.generate is expected to reject a text with no
    # words, so an empty table yields a blank figure instead - confirm.
    if text.strip():
        wordcloud = WordCloud().generate(text)
        ax.imshow(wordcloud, interpolation='bilinear')
    return fig
|
149 |
+
|
150 |
+
|
151 |
+
def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
    """Build a word cloud figure from the opponent names in a match table.

    Args:
        df: Match table with an ``opponent`` column.

    Returns:
        The matplotlib figure showing the word cloud, with axes hidden. The
        figure is blank when there are no names to draw.
    """
    # Missing names are not strings and would make str.join raise TypeError,
    # so drop them before building the text.
    names = df["opponent"].dropna().astype(str)
    text = " ".join(names)

    fig, ax = plt.subplots()
    ax.axis("off")
    # NOTE(review): WordCloud.generate is expected to reject a text with no
    # words, so an empty table yields a blank figure instead - confirm.
    if text.strip():
        wordcloud = WordCloud().generate(text)
        ax.imshow(wordcloud, interpolation='bilinear')
    return fig
|
157 |
+
|
158 |
+
|
159 |
+
def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Build a Plotly line chart of the player's rating against competition date.

    Args:
        df: Match table with a ``rating`` column.
        is_tournament: True to plot against ``tournament_end_date``, False to
            plot against ``event_date``.

    Returns:
        The Plotly figure.
    """
    date_column = "tournament_end_date" if is_tournament else "event_date"
    rating_trace = go.Scatter(
        x=df[date_column],
        y=df["rating"],
        mode='lines+markers',
        line={"width": 0.9},
        marker={"size": 4},
    )

    fig = go.Figure()
    fig.add_trace(rating_trace)
    fig.update_layout(
        title='Rating over time',
        xaxis_title='Competition date',
        yaxis_title='Rating',
        showlegend=False,
        template="plotly_white",
    )
    return fig
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
    """Return the match whose scores contain the largest absolute value.

    Each non-missing ``scores`` entry is reduced with ``get_max_abs_int`` and
    the row with the largest result is selected.

    Args:
        df: Match table with a ``scores`` column.
        is_tournament: Only tournament tables are analysed.

    Returns:
        A one-row frame holding the selected match, or ``None`` for league
        tables and for tables in which no match has scores recorded.
    """
    if not is_tournament:
        return None

    scored_matches = df.loc[df.scores.notna()]
    if scored_matches.empty:
        # argmax raises on an empty series; report "no result" the same way
        # the league case does.
        return None

    longest_position = scored_matches.scores.apply(get_max_abs_int).argmax()
    # Double brackets keep the result a DataFrame rather than a Series.
    return scored_matches.iloc[[longest_position]]
|
185 |
+
|
186 |
+
|
187 |
+
def get_win_loss_record_str(group_df) -> str:
    """Format the results in a group as ``"<wins>, <losses>"``.

    Args:
        group_df: Series of match results; entries equal to ``"Won"`` and
            ``"Lost"`` are counted.

    Returns:
        The two counts joined by ``", "``; ``"0, 0"`` for an empty group.
    """
    if len(group_df) == 0:
        return "0, 0"

    tally = group_df.value_counts()
    # A label that never occurs is absent from the tally, so default to 0.
    n_wins = tally.get("Won", 0)
    n_losses = tally.get("Lost", 0)
    return f"{n_wins}, {n_losses}"
|
197 |
+
|
198 |
+
|
199 |
+
def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Summarise the opponents faced most often.

    Rows whose opponent is the ``"-, -"`` entry are excluded before grouping.

    Args:
        df: Match table with ``opponent`` and ``result`` columns.
        top_n: Maximum number of opponents to return.

    Returns:
        A frame with the columns ``Opponent``, ``Number of matches`` and
        ``Win/loss record``, ordered by number of matches, largest first.
    """
    named_opponents = df.loc[df.opponent != "-, -"]

    summary = named_opponents.groupby('opponent').agg({"result": [get_win_loss_record_str, "size"]})
    # The aggregation yields two-level column labels; keep only the level
    # that carries the aggregation names.
    summary.columns = summary.columns.get_level_values(1)
    summary = summary.rename(
        columns={"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"})
    summary["Opponent"] = summary.index

    by_frequency = summary.sort_values("Number of matches", ascending=False)
    display_columns = ["Opponent", "Number of matches", "Win/loss record"]
    return by_frequency[display_columns].head(top_n)
|
209 |
+
|
210 |
+
|
211 |
+
def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return up to ``top_n`` won matches, highest opponent rating first.

    Args:
        df: Match table with ``result`` and ``opponent_rating`` columns.
        top_n: Maximum number of rows to return.

    Returns:
        The selected rows of ``df`` whose ``result`` is ``"Won"``.
    """
    wins = df.loc[df["result"] == 'Won']
    ranked_wins = wins.sort_values("opponent_rating", ascending=False)
    return ranked_wins.head(top_n)
|
214 |
+
|
215 |
+
|
216 |
+
def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Return up to ``top_n`` won matches, largest rating deficit first.

    The rating difference is ``opponent_rating - rating`` and is included in
    the result as a ``rating_difference`` column.

    Args:
        df: Match table with ``result``, ``rating`` and ``opponent_rating``
            columns. It is not modified.
        top_n: Maximum number of rows to return.

    Returns:
        The selected won matches, each with its ``rating_difference``.
    """
    # assign() returns a new frame, so the extra column does not leak into
    # the caller's table and into every analysis that runs after this one.
    with_difference = df.assign(rating_difference=df['opponent_rating'] - df['rating'])
    wins = with_difference.loc[with_difference.result == 'Won']
    return wins.sort_values("rating_difference", ascending=False).head(top_n)
|
220 |
+
|
221 |
+
|
222 |
+
def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
    """Return the match played against the highest-rated opponent.

    Args:
        df: Match table with an ``opponent_rating`` column.

    Returns:
        A one-row frame holding the match with the largest
        ``opponent_rating``; column dtypes are those of ``df``.
    """
    # argmax gives a position, which is what iloc expects; idxmax gives an
    # index label and only matches a position on a default 0..n-1 index.
    top_position = df.opponent_rating.argmax()
    # Selecting with a list keeps a DataFrame and preserves each column's
    # dtype, unlike a round trip through a single-row Series.
    return df.iloc[[top_position]]
|
224 |
+
|
225 |
+
|
226 |
+
def get_opponent_rating_distr_fig(df: pd.DataFrame):
    """Build a histogram of opponent ratings, coloured by match result.

    Args:
        df: Match table with ``opponent_rating`` and ``result`` columns.

    Returns:
        The matplotlib figure holding the histogram.
    """
    fig, ax = plt.subplots()
    ax.set_title('Opponent rating distribution')
    sns.histplot(data=df, x="opponent_rating", hue='result', ax=ax)
    ax.set_xlabel('Opponent rating')
    return fig
|
232 |
+
|
233 |
+
|
234 |
+
def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
    """Build per-year violin plots of opponent ratings, split by match result.

    Args:
        df: Match table with ``opponent_rating`` and ``result`` columns and a
            datetime competition-date column.
        is_tournament: True to take the year from ``tournament_end_date``,
            False to take it from ``event_date``.

    Returns:
        The matplotlib figure holding the plot.
    """
    date_column = "tournament_end_date" if is_tournament else "event_date"
    competition_year = df[date_column].dt.year

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title('Opponent rating distribution over time')
    sns.violinplot(
        data=df,
        x=competition_year,
        y="opponent_rating",
        hue="result",
        split=True,
        inner='points',
        cut=1,
        ax=ax,
    )
    # The axes created above are the current axes, so this rotates their
    # x tick labels.
    plt.xticks(rotation=30)
    ax.set_xlabel('Competition year')
    ax.set_ylabel('Opponent rating')
    return fig
|
250 |
+
|
251 |
+
|
252 |
+
def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
    """Load a results CSV into a cleaned match table.

    The match type is the part of the file name before the first underscore
    and must be ``"tournament"`` or ``"league"``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(df, is_tournament)`` pair: the table after column renaming and
        dtype conversion, and whether it is a tournament table.

    Raises:
        ValueError: If the file name does not start with a supported match type.
    """
    name_prefix, _, _ = file_path.name.partition('_')
    is_tournament = _check_match_type(name_prefix) == "tournament"

    raw_df = pd.read_csv(file_path)
    renamed_df = _rename_columns(raw_df, is_tournament)
    return _fix_dtypes(renamed_df, is_tournament), is_tournament
|
261 |
+
|
util.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def snake_case_to_human_readable(s: str) -> str:
    """Turn a snake_case name into a sentence-style label.

    The first character is upper-cased, the rest lower-cased, and every
    underscore becomes a space.
    """
    capitalized = s.capitalize()
    return capitalized.replace("_", " ")
|
3 |
+
|
4 |
+
|
5 |
+
def get_max_abs_int(int_csv_str: str) -> int:
    """Get the max absolute value int from an int CSV.

    Tokens are separated by commas; surrounding whitespace is ignored and
    empty or whitespace-only tokens are skipped.

    Args:
        int_csv_str: Comma-separated integers, e.g. ``"11, -9, 7"``.

    Returns:
        The largest absolute value among the integers.

    Raises:
        ValueError: If a token is not an integer or no integers are present.
    """
    # Strip before filtering so a whitespace-only token (for example after a
    # trailing ", ") is skipped instead of reaching int("").
    stripped_tokens = (token.strip() for token in int_csv_str.split(','))
    magnitudes = [abs(int(token)) for token in stripped_tokens if token]
    if not magnitudes:
        raise ValueError(f"No integers found in {int_csv_str!r}.")
    return max(magnitudes)
|