import os
import math
from datetime import datetime

import pandas as pd
from huggingface_hub import CommitScheduler, hf_hub_download

from .models import models

# Default K-factor (determines how much a single match affects ratings)
DEFAULT_K_FACTOR = 32
# Default starting Elo
DEFAULT_ELO = 1500

LEADERBOARD_FN = './utils/leaderboard/arena_elo_leaderboard.csv'
REPO_ID = "aizip-dev/Arena-Metadata"

# Pull the latest leaderboard from the Hub and keep a local working copy
# inside the folder that CommitScheduler watches.
hub_leaderboard_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="arena_elo_leaderboard.csv",
    repo_type="dataset",
)
df = pd.read_csv(hub_leaderboard_path)
print(f"Successfully loaded leaderboard from the Hub. {len(df)} models.")
os.makedirs(os.path.dirname(LEADERBOARD_FN), exist_ok=True)
df.to_csv(LEADERBOARD_FN, index=False)
print(f"Leaderboard copied to {LEADERBOARD_FN} for CommitScheduler.")

# Push any changes in utils/leaderboard back to the dataset repo every minute.
leaderboard_scheduler = CommitScheduler(
    repo_id=REPO_ID,
    folder_path="utils/leaderboard",
    repo_type="dataset",
    every=1,  # minutes between scheduled commits
)
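
# Note on concurrency (illustrative, not executed at import time):
# CommitScheduler uploads the watched folder from a background thread, so
# writers should hold `leaderboard_scheduler.lock` while modifying files in
# it, as save_leaderboard_data() does below. A minimal sketch, assuming some
# DataFrame `df` to persist:
#
#   with leaderboard_scheduler.lock:
#       df.to_csv(LEADERBOARD_FN, index=False)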
def prepare_url(model_dict: dict):
    """
    Build a Hugging Face URL for each model.

    Parameters:
    - model_dict: Dictionary mapping model names to Hub repo paths

    Returns:
    - Dictionary mapping model names to full Hugging Face URLs
    """
    url_dict = {}
    for name, repo_path in model_dict.items():
        # Construct the URL from the repo path (e.g. "org/model-name")
        url_dict[name] = f"https://huggingface.co/{repo_path}"
    return url_dict
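
# Example (illustrative; entries come from .models, shown here with a repo
# path taken from the original mapping):
#
#   prepare_url({"Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct"})
#   # -> {"Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct"}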
# Mapping of model names to their Hugging Face URLs, built from .models
model_to_hf = prepare_url(models)
def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
    """
    Calculate Elo rating changes for two models.

    Parameters:
    - winner_rating: Winner's current rating
    - loser_rating: Loser's current rating
    - k_factor: How much a single match affects ratings
    - draw: Whether the match was a draw

    Returns:
    - (winner_change, loser_change): Rating changes to apply
    """
    # Expected scores (win probabilities) from the logistic Elo curve
    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))

    if draw:
        # For a draw, both get 0.5 points
        actual_winner = 0.5
        actual_loser = 0.5
    else:
        # For a win, the winner gets 1 point and the loser gets 0
        actual_winner = 1.0
        actual_loser = 0.0

    # Rating change is K times the surprise (actual minus expected score)
    winner_change = k_factor * (actual_winner - expected_winner)
    loser_change = k_factor * (actual_loser - expected_loser)
    return winner_change, loser_change
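
# Worked example (illustrative): with both models at 1500, each expected
# score is 0.5, so a decisive win with k_factor=32 yields +16 / -16, while a
# draw between equals changes nothing:
#
#   calculate_elo_changes(1500, 1500)             # -> (16.0, -16.0)
#   calculate_elo_changes(1500, 1500, draw=True)  # -> (0.0, 0.0)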
def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
    """
    Calculate a confidence interval for an Elo rating.

    Parameters:
    - elo_rating: The current Elo rating
    - num_games: Number of games played
    - confidence: Confidence level (default: 0.95 for 95% confidence)

    Returns:
    - margin: The margin of error for the confidence interval
    """
    if num_games == 0:
        return float('inf')

    # Z-score for the given confidence level; fall back to 95% for
    # unrecognized values
    z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z = z_scores.get(confidence, 1.96)

    # Approximate standard deviation of the Elo estimate; the factor
    # 400/sqrt(num_games) is a common rule of thumb
    std_dev = 400 / math.sqrt(num_games)

    # Margin of error
    margin = z * std_dev
    return margin
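
# Worked example (illustrative): after 100 games, std_dev = 400/sqrt(100) = 40,
# so the 95% margin is 1.96 * 40 ≈ 78.4 Elo points:
#
#   calculate_confidence_interval(1500, 100)  # -> ~78.4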
def load_leaderboard_data():
    """
    Loads the leaderboard data from the leaderboard CSV file.
    Returns the data in a format compatible with the application.
    """
    # Initialize the results structure with both win/loss/tie counts and Elo ratings
    results = {
        "wins": {},
        "losses": {},
        "ties": {},
        "votes": 0,
        "elo": {},
        "games_played": {},
        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    try:
        csv_path = LEADERBOARD_FN
        # Check if the file exists and load it
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Process the rows into our structure
            for _, row in df.iterrows():
                model = row['model']
                results["wins"][model] = row['wins']
                results["losses"][model] = row['losses']
                results["ties"][model] = row['ties']
                results["elo"][model] = row['elo']
                results["games_played"][model] = row['games_played']

            # Each vote yields either one win plus one loss, or a tie recorded
            # for both models, so the per-model totals double-count votes;
            # halve the grand total to count each vote once.
            total_outcomes = sum(
                results["wins"][m] + results["losses"][m] + results["ties"][m]
                for m in results["wins"]
            )
            results["votes"] = total_outcomes // 2
        else:
            # If the file doesn't exist, pre-populate with default values
            print("Leaderboard file not found. Initializing with default values.")
            from .models import model_names
            for model in model_names:
                results["wins"][model] = 0
                results["losses"][model] = 0
                results["ties"][model] = 0
                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
                results["games_played"][model] = 0
        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if the file can't be loaded
        return results
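
# Expected CSV schema (matching what save_leaderboard_data writes below):
#   model, elo, wins, losses, ties, games_played, confidence_interval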
def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
    """
    Updates Elo ratings based on a match result.

    Parameters:
    - results: The current leaderboard results dictionary
    - model_a: Name of model A
    - model_b: Name of model B
    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie, 'neither' for no winner
    - k_factor: How much this match affects ratings

    Returns:
    - Updated results dictionary
    """
    # Initialize ratings if not present
    if model_a not in results["elo"]:
        results["elo"][model_a] = DEFAULT_ELO
        results["games_played"][model_a] = 0
    if model_b not in results["elo"]:
        results["elo"][model_b] = DEFAULT_ELO
        results["games_played"][model_b] = 0

    # Get current ratings
    rating_a = results["elo"][model_a]
    rating_b = results["elo"][model_b]

    # Handle the different outcomes
    if winner == 'left':
        # Model A won
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
    elif winner == 'right':
        # Model B won
        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
    elif winner == 'tie':
        # It's a tie
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
    else:  # 'neither' case - no winner
        # No rating changes, but still log the game
        change_a, change_b = 0, 0

    # Apply rating changes
    results["elo"][model_a] = rating_a + change_a
    results["elo"][model_b] = rating_b + change_b

    # Update games played counters
    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1

    # Update timestamp
    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return results
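
# Illustrative usage (model names here are hypothetical):
#
#   results = load_leaderboard_data()
#   results = update_elo_ratings(results, "model-a", "model-b", winner="left")
#   # "model-a" gains rating, "model-b" loses rating, and both log one game.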
def save_leaderboard_data(results):
    """
    Saves the current leaderboard results back to the CSV file.

    Parameters:
    - results: The results dictionary with wins, losses, ties, elo, etc.
    """
    try:
        csv_path = LEADERBOARD_FN

        # Convert the results dictionary to a DataFrame
        data = []
        for model in results["elo"]:
            # Calculate confidence interval
            games_played = results["games_played"].get(model, 0)
            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)
            data.append({
                'model': model,
                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                'wins': results["wins"].get(model, 0),
                'losses': results["losses"].get(model, 0),
                'ties': results["ties"].get(model, 0),
                'games_played': games_played,
                'confidence_interval': round(confidence_interval, 1)
            })
        df = pd.DataFrame(data)

        # Sort by Elo rating (descending)
        df = df.sort_values(by='elo', ascending=False)

        # Hold the scheduler lock so a background commit never sees a
        # partially written file
        with leaderboard_scheduler.lock:
            df.to_csv(csv_path, index=False)
        print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")
def generate_leaderboard_html(results):
    """
    Generate HTML for displaying the leaderboard with Elo ratings.

    Parameters:
    - results: The current leaderboard results dictionary

    Returns:
    - HTML string for the leaderboard
    """
    # Models to hide from the leaderboard display (but keep in battles)
    HIDDEN_MODELS = ["icecream-3b"]

    # Prepare model data for the HTML table
    model_data = []
    for model in results["elo"]:
        # Skip hidden models in the display
        if model in HIDDEN_MODELS:
            continue

        elo = results["elo"].get(model, DEFAULT_ELO)
        wins = results["wins"].get(model, 0)
        losses = results["losses"].get(model, 0)
        ties = results["ties"].get(model, 0)
        total_comparisons = wins + losses + ties
        # Count a tie as half a win when computing the win rate
        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0

        # Calculate confidence interval
        games_played = results["games_played"].get(model, 0)
        confidence = calculate_confidence_interval(elo, games_played)

        model_data.append({
            "model": model,
            "elo": elo,
            "wins": wins,
            "losses": losses,
            "ties": ties,
            "comparisons": total_comparisons,
            "win_rate": win_rate,
            "confidence": confidence
        })

    # Sort by Elo rating (descending)
    model_data.sort(key=lambda x: x["elo"], reverse=True)

    # Start building the HTML table
    html = """
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th class="centered">Rank</th>
                <th>Model</th>
                <th>Elo Rating</th>
                <th class="centered">Win Rate (%)</th>
                <th class="centered">Wins</th>
                <th class="centered">Losses</th>
                <th class="centered">Ties</th>
                <th class="centered">Comparisons</th>
            </tr>
        </thead>
        <tbody>
    """

    # Add one row per model
    for rank, data in enumerate(model_data, 1):
        model = data["model"]
        elo = data["elo"]
        wins = data["wins"]
        losses = data["losses"]
        ties = data["ties"]
        comparisons = data["comparisons"]
        win_rate = data["win_rate"]
        confidence = data["confidence"]

        # Link the model name to its Hugging Face page if known
        if model in model_to_hf:
            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'
        else:
            model_html = model

        # Format Elo with its confidence interval
        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"

        # Add the row to the table
        html += f"""
            <tr>
                <td class="centered"><strong>{rank}</strong></td>
                <td>{model_html}</td>
                <td class="elo-col">{elo_html}</td>
                <td class="centered">{win_rate:.1%}</td>
                <td class="centered">{wins}</td>
                <td class="centered">{losses}</td>
                <td class="centered">{ties}</td>
                <td class="centered">{comparisons}</td>
            </tr>
        """

    # Close the HTML table
    html += """
        </tbody>
    </table>
    """
    return html
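
# Illustrative usage (assuming this module is wired into a Gradio app;
# gr.HTML renders the raw HTML string):
#
#   import gradio as gr
#   leaderboard_display = gr.HTML(value=generate_leaderboard_html(load_leaderboard_data()))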
def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
    """
    Enhanced version of submit_vote that calculates and applies Elo rating changes.
    This replaces the original submit_vote_fixed function.

    Parameters:
    - m_a: Model A name
    - m_b: Model B name
    - winner: 'left', 'right', 'tie', or 'neither'
    - feedback: List of feedback options selected
    - current_results: The current leaderboard state

    Returns:
    - Updated results and UI components
    """
    if winner is None:
        print("Warning: Submit called without a winner selected.")
        return {}

    # current_results could be stale on non-persistent storage, so reload
    # from the latest local copy before applying the update
    recent_results = load_leaderboard_data()

    # Update Elo ratings
    updated_results = update_elo_ratings(recent_results.copy(), m_a, m_b, winner)

    # Update vote count
    updated_results["votes"] = updated_results.get("votes", 0) + 1

    # Save updated results
    save_leaderboard_data(updated_results)

    # Generate the HTML leaderboard
    leaderboard_html = generate_leaderboard_html(updated_results)

    # Import gradio locally for the gr.update objects
    import gradio as gr
    return [
        True, updated_results,
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(visible=True),
        gr.update(visible=False), gr.update(visible=True),
        gr.update(interactive=False), gr.update(value=leaderboard_html, visible=True),
        gr.update(elem_classes=["results-revealed"]),
        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
    ]