Spaces:

lschlessinger
/

usatt-rating-analyzer

Running

App Files Files Community

lschlessinger committed on Feb 1, 2023

Commit

0e1f732

1 Parent(s): d72caf0

refactor: split out core logic from app

Browse files

Files changed (3) hide show

app.py +27 -259
match_parser.py +261 -0
util.py +8 -0

app.py CHANGED Viewed

@@ -1,272 +1,36 @@
 from pathlib import Path
-from typing import Optional, Tuple
 import gradio as gr
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import plotly.graph_objects as go
-import seaborn as sns
-from wordcloud import WordCloud
-def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
-    columns = {
-        "Rating": "rating",
-        "Result": "result",
-        "Scores": "scores",
-        "Opponent": "opponent",
-        "OpponentRating": "opponent_rating",
-    }
-    if is_tournament:
-        columns.update({
-            "TournamentStartDate": "tournament_start_date",
-            "TournamentEndDate": "tournament_end_date",
-            " Touranament": "tournament",
-        })
-    else:
-        columns.update({
-            "EventDate": "event_date",
-            "LeagueName": "league_name"
-        })
-    return df.rename(columns=columns)
-def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
-    if is_tournament:
-        df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"])
-        df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"])
-        df["tournament"] = df["tournament"].astype('category')
-    else:
-        df["event_date"] = pd.to_datetime(df["event_date"])
-        df["league_name"] = df["league_name"].astype('string')
-    df["rating"] = df["rating"].astype('int')
-    df["result"] = df["result"].astype('category')
-    df["scores"] = df["scores"].astype('string')
-    df["opponent"] = df["opponent"].astype('category')
-    df["opponent_rating"] = df["opponent_rating"].astype('int')
-    return df
-def snake_case_to_human_readable(s: str) -> str:
-    return " ".join(s.capitalize().split("_"))
-def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
-    """Make a data frame's columns human-readable."""
-    if df is None:
-        return None
-    nat_to_none = lambda x: None if x == "NaT" else x
-    if is_tournament:
-        if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
-            df['tournament_start_date'] = df['tournament_start_date'].dt.date.astype(str).apply(nat_to_none)
-            df['tournament_end_date'] = df['tournament_end_date'].dt.date.astype(str).apply(nat_to_none)
-            def create_date(tournament_start_date, tournament_end_date):
-                missing_start_date = tournament_start_date is None
-                missing_end_date = tournament_end_date is None
-                if not missing_start_date and not missing_end_date:
-                    if tournament_start_date is not tournament_end_date:
-                        return ' - '.join((tournament_start_date, tournament_end_date))
-                    else:
-                        return tournament_start_date
-                else:
-                    return tournament_start_date if missing_end_date else tournament_end_date
-            df["date"] = df.apply(lambda row: create_date(row['tournament_start_date'], row['tournament_end_date']), axis=1)
-            df = df.drop(columns=["tournament_start_date", "tournament_end_date"])
-            # Move date to the front.
-            columns = list(df.columns)
-            columns.insert(0, columns.pop(columns.index("date")))
-            df = df.loc[:, columns]
-    else:
-        if "event_date" in df.columns:
-            df['event_date'] = df['event_date'].dt.date.astype(str).apply(nat_to_none)
-        df = df.rename(columns={"league_name": "league"})
-    df = df.rename(columns=lambda c: snake_case_to_human_readable(c))
-    return df
-def _check_match_type(match_type: str) -> str:
-    allowed_match_types = {"tournament", "league"}
-    if match_type not in allowed_match_types:
-        raise ValueError(
-            f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
-    return match_type
-def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
-    key_name = "tournament_end_date" if is_tournament else "event_date"
-    return df[key_name].nunique()
-def get_current_rating(df: pd.DataFrame) -> int:
-    return df.rating.iloc[0]
-def get_max_rating(df: pd.DataFrame) -> int:
-    return df.rating.max()
-def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
-    fig = plt.figure()
-    plt.title('Matches per competition')
-    sns.histplot(df.groupby('tournament' if is_tournament else "event_date").size())
-    plt.xlabel('Number of matches in competition')
-    return fig
-def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
-    fig = plt.figure()
-    key_name = "tournament" if is_tournament else "league_name"
-    wordcloud = WordCloud().generate(" ".join(df[key_name].values.tolist()))
-    plt.imshow(wordcloud, interpolation='bilinear')
-    plt.axis("off")
-    return fig
-def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
-    fig = plt.figure()
-    wordcloud = WordCloud().generate(" ".join(df.opponent.values.tolist()))
-    plt.imshow(wordcloud, interpolation='bilinear')
-    plt.axis("off")
-    return fig
-def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
-    fig = go.Figure()
-    fig.add_trace(go.Scatter(x=df["tournament_end_date" if is_tournament else "event_date"],
-                             y=df["rating"],
-                             mode='lines+markers',
-                             line=dict( width=0.9),
-                             marker=dict(size=4))),
-    fig.update_layout(
-        title='Rating over time',
-        xaxis_title='Competition date',
-        yaxis_title='Rating',
-        showlegend=False,
-        template="plotly_white",
-    )
-    return fig
-def get_max_abs_int(int_csv_str: str) -> int:
-    """Get the max absolute value int from an int CSV."""
-    ints = [abs(int(i.strip())) for i in int_csv_str.split(',') if i]
-    return max(ints)
-def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
-    if not is_tournament:
-        return None
-    df_non_null = df.loc[~df.scores.isna()]
-    return df_non_null.iloc[[df_non_null.scores.apply(get_max_abs_int).argmax()]]
-def get_win_loss_record_str(group_df) -> str:
-    if len(group_df) > 0:
-        win_loss_counts = group_df.value_counts()
-        n_wins = win_loss_counts.Won if hasattr(win_loss_counts, "Won") else 0
-        n_losses = win_loss_counts.Lost if hasattr(win_loss_counts, "Lost") else 0
-    else:
-        n_wins = 0
-        n_losses = 0
-    return f"{n_wins}, {n_losses}"
-def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
-    df_with_opponents = df.loc[df.opponent != "-, -"]
-    most_common_opponents_df = df_with_opponents.groupby('opponent').agg({"result": [get_win_loss_record_str, "size"]})
-    most_common_opponents_df.columns = most_common_opponents_df.columns.get_level_values(1)
-    most_common_opponents_df.rename({"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"}, axis=1,
-                                    inplace=True)
-    most_common_opponents_df["Opponent"] = most_common_opponents_df.index
-    return most_common_opponents_df.sort_values("Number of matches", ascending=False)[
-        ["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
-def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
-    """Get the top-n wins sorted by opponent rating."""
-    return df.loc[df.result == 'Won'].sort_values("opponent_rating", ascending=False).head(top_n)
-def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
-    """Get the top-n wins sorted by rating difference."""
-    df['rating_difference'] = df['opponent_rating'] - df['rating']
-    return df.loc[df.result == 'Won'].sort_values("rating_difference", ascending=False).head(top_n)
-def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
-    return df.iloc[df.opponent_rating.idxmax()].to_frame().transpose()
-def get_opponent_rating_distr_fig(df: pd.DataFrame):
-    fig = plt.figure()
-    plt.title('Opponent rating distribution')
-    sns.histplot(data=df, x="opponent_rating", hue='result')
-    plt.xlabel('Opponent rating')
-    return fig
-def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
-    fig, ax = plt.subplots(figsize=(12, 8))
-    plt.title(f'Opponent rating distribution over time')
-    x_key_name = "tournament_end_date" if is_tournament else "event_date"
-    sns.violinplot(data=df,
-                   x=df[x_key_name].dt.year,
-                   y="opponent_rating",
-                   hue="result",
-                   split=True,
-                   inner='points',
-                   cut=1,
-                   ax=ax)
-    plt.xticks(rotation=30)
-    plt.xlabel('Competition year')
-    plt.ylabel('Opponent rating')
-    return fig
-def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
-    match_type = _check_match_type(file_path.name.split('_')[0])
-    is_tournament = match_type == "tournament"
-    df = pd.read_csv(file_path)
-    df = _rename_columns(df, is_tournament)
-    df = _fix_dtypes(df, is_tournament)
-    return df, is_tournament
 def usatt_rating_analyzer(file_obj):
     # Load data.
-    df, is_tournament = load_match_df(Path(file_obj.name))
     # Create outputs.
-    current_rating = get_current_rating(df)
-    peak_rating = get_max_rating(df)
-    n_competitions_played = get_num_competitions_played(df, is_tournament)
     n_matches_played = len(df)
-    matches_per_competition_fig = get_matches_per_competition_fig(df, is_tournament)
-    opponent_name_word_cloud_fig = get_opponent_name_word_cloud_fig(df)
-    competition_name_word_cloud_fig = get_competition_name_word_cloud_fig(df, is_tournament)
-    most_frequent_opponents = make_df_columns_readable(get_most_frequent_opponents(df), is_tournament)
-    best_wins = make_df_columns_readable(get_best_wins(df), is_tournament)
-    biggest_upsets = make_df_columns_readable(get_biggest_upsets(df), is_tournament)
-    highest_rated_opponent = make_df_columns_readable(get_highest_rated_opponent(df), is_tournament)
-    rating_over_time_fig = get_rating_over_time_fig(df, is_tournament)
-    match_with_longest_game = make_df_columns_readable(get_match_with_longest_game(df, is_tournament), is_tournament)
-    opponent_rating_distr_fig = get_opponent_rating_distr_fig(df)
-    opponent_rating_dist_over_time_fig = get_opponent_rating_dist_over_time_fig(df, is_tournament)
-    return (current_rating,
             peak_rating,
             n_competitions_played,
             n_matches_played,
@@ -302,7 +66,7 @@ with gr.Blocks() as demo:
     """)
     with gr.Row():
         with gr.Column():
-            input_file = gr.File(label='USATT Results File', file_types=['file'])
             btn = gr.Button(analyze_btn_title)
     gr.Markdown("""<br />
@@ -313,6 +77,9 @@ with gr.Blocks() as demo:
     """)
     with gr.Group():
         with gr.Row():
             with gr.Column():
                 current_rating_box = gr.Textbox(lines=1, label="Current rating")
@@ -371,6 +138,7 @@ with gr.Blocks() as demo:
     inputs = [input_file]
     outputs = [
         current_rating_box,
         peak_rating_box,
         num_comps_box,

 from pathlib import Path
 import gradio as gr
+import match_parser as mp
 def usatt_rating_analyzer(file_obj):
     # Load data.
+    file_path = Path(file_obj.name)
+    df, is_tournament = mp.load_match_df(file_path)
     # Create outputs.
+    print(file_path.stem)
+    player_name = mp.get_player_name(file_path.stem)
+    current_rating = mp.get_current_rating(df)
+    peak_rating = mp.get_max_rating(df)
+    n_competitions_played = mp.get_num_competitions_played(df, is_tournament)
     n_matches_played = len(df)
+    matches_per_competition_fig = mp.get_matches_per_competition_fig(df, is_tournament)
+    opponent_name_word_cloud_fig = mp.get_opponent_name_word_cloud_fig(df)
+    competition_name_word_cloud_fig = mp.get_competition_name_word_cloud_fig(df, is_tournament)
+    most_frequent_opponents = mp.make_df_columns_readable(mp.get_most_frequent_opponents(df), is_tournament)
+    best_wins = mp.make_df_columns_readable(mp.get_best_wins(df), is_tournament)
+    biggest_upsets = mp.make_df_columns_readable(mp.get_biggest_upsets(df), is_tournament)
+    highest_rated_opponent = mp.make_df_columns_readable(mp.get_highest_rated_opponent(df), is_tournament)
+    rating_over_time_fig = mp.get_rating_over_time_fig(df, is_tournament)
+    match_with_longest_game = mp.make_df_columns_readable(mp.get_match_with_longest_game(df, is_tournament), is_tournament)
+    opponent_rating_distr_fig = mp.get_opponent_rating_distr_fig(df)
+    opponent_rating_dist_over_time_fig = mp.get_opponent_rating_dist_over_time_fig(df, is_tournament)
+    return (player_name,
+            current_rating,
             peak_rating,
             n_competitions_played,
             n_matches_played,
     """)
     with gr.Row():
         with gr.Column():
+            input_file = gr.File(label='USATT Results File', file_types=['file'], keepfilename=True)
             btn = gr.Button(analyze_btn_title)
     gr.Markdown("""<br />
     """)
     with gr.Group():
+        with gr.Row():
+            with gr.Column():
+                player_name_box = gr.Textbox(lines=1, label="Player name")
         with gr.Row():
             with gr.Column():
                 current_rating_box = gr.Textbox(lines=1, label="Current rating")
     inputs = [input_file]
     outputs = [
+        player_name_box,
         current_rating_box,
         peak_rating_box,
         num_comps_box,

match_parser.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import logging
+from pathlib import Path
+from typing import Optional, Tuple
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.graph_objects as go
+import requests
+import seaborn as sns
+from bs4 import BeautifulSoup
+from wordcloud import WordCloud
+from util import get_max_abs_int, snake_case_to_human_readable
+def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
+    """Map the raw export headers onto the snake_case names used in this module.
+
+    Headers common to both export kinds are always mapped; the tournament- or
+    league-specific headers are added depending on ``is_tournament``.
+    """
+    shared_headers = {
+        "Rating": "rating",
+        "Result": "result",
+        "Scores": "scores",
+        "Opponent": "opponent",
+        "OpponentRating": "opponent_rating",
+    }
+    tournament_headers = {
+        "TournamentStartDate": "tournament_start_date",
+        "TournamentEndDate": "tournament_end_date",
+        # Key kept verbatim (leading space and spelling included); presumably it
+        # mirrors the header in the exported CSV -- confirm before "fixing" it.
+        " Touranament": "tournament",
+    }
+    league_headers = {
+        "EventDate": "event_date",
+        "LeagueName": "league_name",
+    }
+    extra_headers = tournament_headers if is_tournament else league_headers
+    return df.rename(columns={**shared_headers, **extra_headers})
+def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame:
+    """Convert the renamed columns of ``df`` to their working dtypes, in place.
+
+    Date columns are parsed with ``pd.to_datetime``; the remaining columns are
+    cast with ``astype``. The same (modified) frame is returned.
+    """
+    if is_tournament:
+        date_columns = ("tournament_start_date", "tournament_end_date")
+        dtypes = {"tournament": "category"}
+    else:
+        date_columns = ("event_date",)
+        dtypes = {"league_name": "string"}
+    # Columns shared by both export kinds.
+    dtypes.update(
+        rating="int",
+        result="category",
+        scores="string",
+        opponent="category",
+        opponent_rating="int",
+    )
+    for name in date_columns:
+        df[name] = pd.to_datetime(df[name])
+    for name, dtype in dtypes.items():
+        df[name] = df[name].astype(dtype)
+    return df
+def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]:
+    """Return a copy of ``df`` prepared for display, or ``None`` if ``df`` is ``None``.
+
+    Date columns are turned into date strings (missing dates become ``None``).
+    For tournament frames that carry both date columns, the two are merged
+    into a single leading ``date`` column; for league frames ``league_name``
+    is renamed to ``league``. Finally every column name is passed through
+    ``snake_case_to_human_readable``.
+    """
+    if df is None:
+        return None
+    # Work on a copy so the caller's frame is never modified.
+    df = df.copy()
+    def dates_as_text(column: pd.Series) -> pd.Series:
+        # Missing timestamps stringify to "NaT"; map those to None.
+        return column.dt.date.astype(str).apply(lambda text: None if text == "NaT" else text)
+    def create_date(start, end):
+        if start is None:
+            return end
+        # Compare by value: equal date strings are generally distinct objects,
+        # so an identity check would render a one-day event as "d - d".
+        if end is None or start == end:
+            return start
+        return ' - '.join((start, end))
+    if is_tournament:
+        if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns:
+            starts = dates_as_text(df["tournament_start_date"])
+            ends = dates_as_text(df["tournament_end_date"])
+            df = df.drop(columns=["tournament_start_date", "tournament_end_date"])
+            # A plain list (rather than a row-wise apply) also works on empty frames.
+            df["date"] = [create_date(start, end) for start, end in zip(starts, ends)]
+            # Move date to the front.
+            ordered = ["date"] + [name for name in df.columns if name != "date"]
+            df = df.loc[:, ordered]
+    else:
+        if "event_date" in df.columns:
+            df["event_date"] = dates_as_text(df["event_date"])
+        df = df.rename(columns={"league_name": "league"})
+    return df.rename(columns=snake_case_to_human_readable)
+def _check_match_type(match_type: str) -> str:
+    """Return ``match_type`` unchanged if it is supported, else raise ``ValueError``."""
+    allowed_match_types = {"tournament", "league"}
+    if match_type in allowed_match_types:
+        return match_type
+    raise ValueError(
+        f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.")
+def fetch_player_name(profile_id: int) -> str:
+    """Fetch a player's name from their profile page on the USATT website.
+
+    note: the profile ID is NOT the USATT number.
+
+    Raises:
+        requests.RequestException: if the request fails, times out, or the
+            server answers with an HTTP error status.
+        ValueError: if the page does not contain the expected profile header.
+    """
+    url = f"https://usatt.simplycompete.com/userAccount/up/{profile_id}"
+    logging.info(f"Fetching player name from {url}")
+    # Without a timeout, requests waits indefinitely on an unresponsive server.
+    page = requests.get(url, timeout=10)
+    # Fail on 4xx/5xx responses instead of scraping an error page.
+    page.raise_for_status()
+    soup = BeautifulSoup(page.content, "html.parser")
+    profile_elt = soup.find("div", class_="profile-header")
+    title_elt = profile_elt.find(class_="title") if profile_elt is not None else None
+    if title_elt is None:
+        raise ValueError(f"Could not find a player name on the profile page {url}")
+    return title_elt.text.strip()
+def get_player_name(file_stem: str) -> str:
+    """Look up the player name for the profile ID embedded in a results file stem.
+
+    The ID is whatever follows the last "matches" in the first space-separated
+    token of ``file_stem`` once underscores are removed; it must parse as an
+    integer.
+    """
+    first_token = file_stem.split(" ")[0]
+    id_text = first_token.replace("_", "").split("matches")[-1]
+    return fetch_player_name(int(id_text))
+def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int:
+    """Count the distinct competition dates in ``df``.
+
+    Tournament data is counted by ``tournament_end_date``, league data by
+    ``event_date``.
+    """
+    if is_tournament:
+        date_column = "tournament_end_date"
+    else:
+        date_column = "event_date"
+    return df[date_column].nunique()
+def get_current_rating(df: pd.DataFrame) -> int:
+    """Return the rating stored in the first row of ``df``."""
+    # NOTE(review): presumably the export lists the most recent match first -- confirm.
+    ratings = df["rating"]
+    return ratings.iloc[0]
+def get_max_rating(df: pd.DataFrame) -> int:
+    """Return the highest value found in the ``rating`` column."""
+    ratings = df["rating"]
+    return ratings.max()
+def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool):
+    """Build a histogram of the number of matches played per competition.
+
+    Rows are grouped by ``tournament`` for tournament data and by
+    ``event_date`` for league data. Returns the matplotlib figure.
+    """
+    group_key = 'tournament' if is_tournament else "event_date"
+    fig = plt.figure()
+    plt.title('Matches per competition')
+    matches_per_competition = df.groupby(group_key).size()
+    sns.histplot(matches_per_competition)
+    plt.xlabel('Number of matches in competition')
+    return fig
+def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool):
+    """Render a word cloud of competition names and return the matplotlib figure.
+
+    Names come from ``tournament`` for tournament data and from
+    ``league_name`` for league data.
+    """
+    name_column = "tournament" if is_tournament else "league_name"
+    # One space-separated string holding every row's competition name.
+    all_names = " ".join(df[name_column].tolist())
+    fig = plt.figure()
+    cloud = WordCloud().generate(all_names)
+    plt.imshow(cloud, interpolation='bilinear')
+    plt.axis("off")
+    return fig
+def get_opponent_name_word_cloud_fig(df: pd.DataFrame):
+    """Render a word cloud of opponent names and return the matplotlib figure."""
+    # One space-separated string holding every row's opponent name.
+    all_names = " ".join(df["opponent"].tolist())
+    fig = plt.figure()
+    cloud = WordCloud().generate(all_names)
+    plt.imshow(cloud, interpolation='bilinear')
+    plt.axis("off")
+    return fig
+def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool):
+    """Build a plotly line chart of rating against competition date.
+
+    The x axis uses ``tournament_end_date`` for tournament data and
+    ``event_date`` for league data.
+    """
+    date_column = "tournament_end_date" if is_tournament else "event_date"
+    rating_trace = go.Scatter(
+        x=df[date_column],
+        y=df["rating"],
+        mode='lines+markers',
+        line={"width": 0.9},
+        marker={"size": 4},
+    )
+    fig = go.Figure()
+    fig.add_trace(rating_trace)
+    fig.update_layout(
+        title='Rating over time',
+        xaxis_title='Competition date',
+        yaxis_title='Rating',
+        showlegend=False,
+        template="plotly_white",
+    )
+    return fig
+def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]:
+    """Return the match whose scores contain the largest absolute value.
+
+    Each non-null ``scores`` entry is reduced with ``get_max_abs_int`` and the
+    row with the highest result is returned as a one-row frame.
+
+    Returns:
+        ``None`` for league data and when no row has a recorded score.
+    """
+    if not is_tournament:
+        return None
+    scored = df.loc[df["scores"].notna()]
+    if scored.empty:
+        # argmax raises on an empty sequence; there is no such match to report.
+        return None
+    longest_pos = scored["scores"].apply(get_max_abs_int).argmax()
+    # argmax is positional, hence iloc; the inner list keeps a DataFrame result.
+    return scored.iloc[[longest_pos]]
+def get_win_loss_record_str(group_df) -> str:
+    """Summarise a series of results as the string ``"<wins>, <losses>"``.
+
+    Wins are the entries equal to ``Won`` and losses those equal to ``Lost``;
+    a label that never occurs counts as 0.
+    """
+    tallies = group_df.value_counts()
+    n_wins = tallies.get("Won", 0)
+    n_losses = tallies.get("Lost", 0)
+    return f"{n_wins}, {n_losses}"
+def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
+    """Return the ``top_n`` opponents faced most often, with the record against each.
+
+    Rows whose opponent is the "-, -" placeholder are ignored. The result has
+    the columns ``Opponent``, ``Number of matches`` and ``Win/loss record``
+    and is sorted by number of matches, highest first.
+    """
+    df_with_opponents = df.loc[df["opponent"] != "-, -"]
+    # observed=True: when "opponent" is categorical, grouping on every category
+    # would bring back the placeholder filtered out above (and any other
+    # unused category) as a zero-match row.
+    per_opponent = df_with_opponents.groupby("opponent", observed=True).agg(
+        {"result": [get_win_loss_record_str, "size"]})
+    # Drop the outer "result" level; keep only the aggregation names.
+    per_opponent.columns = per_opponent.columns.get_level_values(1)
+    per_opponent = per_opponent.rename(
+        columns={"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"})
+    per_opponent["Opponent"] = per_opponent.index
+    ranked = per_opponent.sort_values("Number of matches", ascending=False)
+    return ranked[["Opponent", "Number of matches", "Win/loss record"]].head(top_n)
+def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
+    """Return the ``top_n`` won matches against the highest-rated opponents."""
+    wins = df.loc[df["result"] == 'Won']
+    ranked = wins.sort_values("opponent_rating", ascending=False)
+    return ranked.head(top_n)
+def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
+    """Return the ``top_n`` won matches with the largest rating deficit.
+
+    The returned frame has an extra ``rating_difference`` column
+    (``opponent_rating - rating``) and is sorted by it, highest first. The
+    input frame is left untouched.
+    """
+    wins = df.loc[df["result"] == 'Won']
+    # assign() returns a new frame, so the column is not added to the caller's
+    # data as a hidden side effect.
+    wins = wins.assign(rating_difference=wins["opponent_rating"] - wins["rating"])
+    return wins.sort_values("rating_difference", ascending=False).head(top_n)
+def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame:
+    """Return, as a one-row frame, the match against the highest-rated opponent.
+
+    On ties the first such row is returned.
+    """
+    # argmax yields a position, which is what iloc expects (idxmax yields an
+    # index label and only coincides with it on a default RangeIndex). Selecting
+    # with a list keeps a DataFrame and preserves each column's dtype.
+    best_pos = df["opponent_rating"].argmax()
+    return df.iloc[[best_pos]]
+def get_opponent_rating_distr_fig(df: pd.DataFrame):
+    """Build a histogram of opponent ratings, coloured by match result.
+
+    Returns the matplotlib figure.
+    """
+    fig = plt.figure()
+    plt.title('Opponent rating distribution')
+    histogram_options = {"data": df, "x": "opponent_rating", "hue": 'result'}
+    sns.histplot(**histogram_options)
+    plt.xlabel('Opponent rating')
+    return fig
+def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool):
+    """Build per-year split violin plots of opponent rating by match result.
+
+    The year is taken from ``tournament_end_date`` for tournament data and
+    from ``event_date`` for league data. Returns the matplotlib figure.
+    """
+    date_column = "tournament_end_date" if is_tournament else "event_date"
+    competition_year = df[date_column].dt.year
+    fig, ax = plt.subplots(figsize=(12, 8))
+    plt.title('Opponent rating distribution over time')
+    sns.violinplot(
+        data=df,
+        x=competition_year,
+        y="opponent_rating",
+        hue="result",
+        split=True,
+        inner='points',
+        cut=1,
+        ax=ax,
+    )
+    plt.xticks(rotation=30)
+    plt.xlabel('Competition year')
+    plt.ylabel('Opponent rating')
+    return fig
+def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]:
+    """Load a results CSV and report whether it holds tournament matches.
+
+    The match type is the part of the file name before the first underscore
+    and must pass ``_check_match_type``. The CSV is read, its columns are
+    renamed and their dtypes fixed.
+
+    Returns:
+        The prepared frame and ``True`` for tournament data (``False`` for league data).
+    """
+    type_prefix = file_path.name.partition('_')[0]
+    is_tournament = _check_match_type(type_prefix) == "tournament"
+    raw = pd.read_csv(file_path)
+    renamed = _rename_columns(raw, is_tournament)
+    return _fix_dtypes(renamed, is_tournament), is_tournament

util.py ADDED Viewed

	@@ -0,0 +1,8 @@

+def snake_case_to_human_readable(s: str) -> str:
+    """Capitalize ``s`` (first character upper, the rest lower) and turn underscores into spaces."""
+    capitalized = s.capitalize()
+    return capitalized.replace("_", " ")
+def get_max_abs_int(int_csv_str: str) -> int:
+    """Get the max absolute value int from an int CSV.
+
+    Whitespace around each comma-separated value is ignored, and empty or
+    whitespace-only values (e.g. from a trailing comma) are skipped.
+
+    Raises:
+        ValueError: if a value is not an integer or the string holds no values.
+    """
+    # Strip before filtering so that whitespace-only tokens are skipped rather
+    # than handed to int().
+    tokens = (token.strip() for token in int_csv_str.split(','))
+    magnitudes = [abs(int(token)) for token in tokens if token]
+    if not magnitudes:
+        raise ValueError(f"No integers found in '{int_csv_str}'.")
+    return max(magnitudes)