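"""Generates WhisperKit dashboard data from benchmark results.

Reads benchmark and summary JSON files from the downloaded evals dataset,
then writes performance metrics (JSON Lines), device support matrices (CSV),
and per-commit test coverage reports into dashboard_data/. Pass "download"
as the first CLI argument to refresh the benchmark dataset first.
"""
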
import glob
import json
import os
import shutil
import sys
import urllib.parse
from collections import defaultdict
from datetime import datetime
from statistics import mean

import pandas as pd
import requests
from dotenv import load_dotenv
from huggingface_hub import login

from constants import BASE_WHISPERKIT_BENCHMARK_URL
from text_normalizer import text_normalizer
from utils import compute_average_wer, download_dataset, download_json_from_github


def fetch_evaluation_data(url):
    """
    Fetches evaluation data from the given URL.

    :param url: The URL to fetch the evaluation data from.
    :returns: The evaluation data as a dictionary.
    :raises SystemExit: If the request fails.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        sys.exit(f"Failed to fetch WhisperKit evals: {response.text}")


def generate_device_map(base_dir):
    """
    Generates a mapping of device identifiers to their corresponding device models.

    This function iterates through all summary files in the specified base
    directory and its subdirectories, extracting device identifier and device
    model information. It stores this information in a dictionary, where the
    keys are device identifiers and the values are device models. The map is
    also written to dashboard_data/device_map.json.

    :param base_dir: The base directory to search for summary files.
    :returns: A dictionary mapping device identifiers to device models.
    """
    device_map = {}

    # Find all summary files recursively
    summary_files = glob.glob(f"{base_dir}/**/*summary*.json", recursive=True)
    for file_path in summary_files:
        try:
            with open(file_path, "r") as f:
                data = json.load(f)
            # Extract device information and create a simple mapping
            if "deviceModel" in data and "deviceIdentifier" in data:
                device_map[data["deviceIdentifier"]] = data["deviceModel"]
        except json.JSONDecodeError:
            print(f"Error reading {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # Save the device map to the dashboard data directory
    output_path = "dashboard_data/device_map.json"
    with open(output_path, "w") as f:
        json.dump(device_map, f, indent=4, sort_keys=True)

    return device_map
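
# get_device_name below reads the device_map.json written above; illustrative
# (hypothetical) shape: {"Mac14,2": "MacBook Air (M2)", "iPhone15,2": "iPhone 14 Pro"}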


def get_device_name(device):
    """
    Gets the device name from the device map if it exists.

    :param device: String representing the device name.
    :returns: The device name from the device map if it exists, otherwise the
        input device name.
    """
    with open("dashboard_data/device_map.json", "r") as f:
        device_map = json.load(f)
    return device_map.get(device, device).replace(" ", "_")


def process_benchmark_file(file_path, dataset_dfs, results, releases):
    """
    Processes a single benchmark file and updates the results dictionary.

    This function reads a benchmark JSON file, extracts relevant information,
    and updates the results dictionary with various metrics including WER,
    speed, tokens per second, and quality of inference (QoI).

    :param file_path: Path to the benchmark JSON file.
    :param dataset_dfs: Dictionary of DataFrames containing dataset information.
    :param results: Dictionary to store the processed results.
    :param releases: Set of release commit hashes to process.
    """
    with open(file_path, "r") as file:
        test_results = json.load(file)

    if len(test_results) == 0:
        return
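
    # The parent directory name encodes the run as <commitTimestamp>_<commitHash>
    # (format inferred from the parsing below).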
    commit_hash_timestamp = file_path.split("/")[-2]
    commit_timestamp, commit_hash = commit_hash_timestamp.split("_")
    if commit_hash not in releases:
        return

    first_test_result = test_results[0]
    model = first_test_result["testInfo"]["model"]
    device = first_test_result["testInfo"]["device"]
    dataset_dir = first_test_result["testInfo"]["datasetDir"]

    if "iPhone" in device or "iPad" in device:
        # Normalize versions like "17.2.0" to "17.2"
        version_numbers = first_test_result["staticAttributes"]["osVersion"].split(".")
        if len(version_numbers) == 3 and version_numbers[-1] == "0":
            version_numbers.pop()
        os_name = "iOS" if "iPhone" in device else "iPadOS"
        os_info = f"{os_name}_{'.'.join(version_numbers)}"
    else:
        os_info = f"macOS_{first_test_result['staticAttributes']['osVersion']}"

    timestamp = first_test_result["testInfo"]["date"]
    key = (model, device, os_info, commit_timestamp)
    dataset_name = dataset_dir

    for test_result in test_results:
        test_info = test_result["testInfo"]
        audio_file_name = test_info["audioFile"]
        dataset_df = dataset_dfs[dataset_name]

        wer_entry = {
            "prediction": text_normalizer(test_info["prediction"]),
            "reference": text_normalizer(test_info["reference"]),
        }
        results[key]["timestamp"] = timestamp
        results[key]["average_wer"].append(wer_entry)
        results[key]["dataset_wer"][dataset_name].append(wer_entry)

        input_audio_seconds = test_info["timings"]["inputAudioSeconds"]
        full_pipeline = test_info["timings"]["fullPipeline"]
        total_decoding_loops = test_info["timings"]["totalDecodingLoops"]

        results[key]["dataset_speed"][dataset_name][
            "inputAudioSeconds"
        ] += input_audio_seconds
        results[key]["dataset_speed"][dataset_name]["fullPipeline"] += full_pipeline
        results[key]["speed"]["inputAudioSeconds"] += input_audio_seconds
        results[key]["speed"]["fullPipeline"] += full_pipeline
        results[key]["commit_hash"] = commit_hash
        results[key]["commit_timestamp"] = commit_timestamp
        results[key]["dataset_tokens_per_second"][dataset_name][
            "totalDecodingLoops"
        ] += total_decoding_loops
        results[key]["dataset_tokens_per_second"][dataset_name][
            "fullPipeline"
        ] += full_pipeline
        results[key]["tokens_per_second"]["totalDecodingLoops"] += total_decoding_loops
        results[key]["tokens_per_second"]["fullPipeline"] += full_pipeline

        # QoI: 1 if WhisperKit's WER on this file does not exceed the
        # reference WER recorded in the dataset evals, else 0.
        audio = audio_file_name.split(".")[0]
        if dataset_name == "earnings22-10mins":
            audio = audio.split("-")[0]
        dataset_row = dataset_df.loc[dataset_df["file"].str.contains(audio)].iloc[0]
        reference_wer = dataset_row["wer"]
        prediction_wer = test_info["wer"]
        results[key]["qoi"].append(1 if prediction_wer <= reference_wer else 0)


def process_summary_file(file_path, results, releases):
    """
    Processes a summary file and updates the results dictionary with device
    support information.

    This function reads a summary JSON file, extracts information about
    supported and failed models for a specific device and OS combination, and
    updates the results dictionary accordingly. It creates separate entries
    for each release.

    :param file_path: Path to the summary JSON file.
    :param results: Dictionary to store the processed results.
    :param releases: Set of release commit hashes to process.
    """
    with open(file_path, "r") as file:
        summary_data = json.load(file)
    if summary_data["commitHash"] not in releases:
        return

    device = summary_data["deviceIdentifier"]
    os_info = f"{'iPadOS' if 'iPad' in device else summary_data['osType']} {summary_data['osVersion']}"
    commit_hash = summary_data["commitHash"]
    commit_timestamp = summary_data["commitTimestamp"]
    test_file_name = file_path.split("/")[-1]
    test_timestamp = test_file_name.split("_")[-1].replace(".json", "")

    key = (device, os_info, commit_hash)
    if key in results:
        # Keep only the most recent run for each (device, OS, commit) entry
        existing_commit_timestamp = results[key]["commitTimestamp"]
        existing_test_timestamp = results[key]["testTimestamp"]
        existing_commit_dt = datetime.strptime(
            existing_commit_timestamp, "%Y-%m-%dT%H%M%S"
        )
        new_commit_dt = datetime.strptime(commit_timestamp, "%Y-%m-%dT%H%M%S")
        existing_test_dt = datetime.strptime(existing_test_timestamp, "%Y-%m-%dT%H%M%S")
        new_test_dt = datetime.strptime(test_timestamp, "%Y-%m-%dT%H%M%S")
        if new_test_dt < existing_test_dt or new_commit_dt < existing_commit_dt:
            return
    else:
        results[key] = {}

    supported_models = set(summary_data["modelsTested"])
    failed_models = set()

    # Determine the expected dataset count from any model without failures
    # (defaults to 2)
    dataset_count = 2
    for model, value in summary_data["testResults"].items():
        if model not in summary_data["failureInfo"]:
            dataset_count = len(value)
            break

    # A model with failure info still counts as supported if it completed
    # test results for every dataset
    for failed_model in summary_data["failureInfo"]:
        if (
            failed_model in summary_data["testResults"]
            and len(summary_data["testResults"][failed_model]) == dataset_count
        ):
            continue
        supported_models.discard(failed_model)
        failed_models.add(failed_model)

    results[key]["supportedModels"] = supported_models
    results[key]["commitHash"] = commit_hash
    results[key]["commitTimestamp"] = commit_timestamp
    results[key]["testTimestamp"] = test_timestamp
    results[key]["failedModels"] = (failed_models, file_path)
    results["modelsTested"] |= supported_models
    results["devices"].add(device)


def calculate_and_save_performance_results(
    performance_results, performance_output_path
):
    """
    Calculates final performance metrics and saves them to a JSON Lines file.

    This function processes the raw performance data, calculates average
    metrics, and writes the final results to a JSON Lines file, with each
    entry representing a unique combination of model, device, and OS.

    :param performance_results: Dictionary containing raw performance data.
    :param performance_output_path: Path to save the processed performance results.
    :returns: List of (model, device, os) tuples that ran slower than real time.
    """
    not_supported = []
    with open(performance_output_path, "w") as performance_file:
        for key, data in performance_results.items():
            model, device, os_info, timestamp = key
            speed = round(
                data["speed"]["inputAudioSeconds"] / data["speed"]["fullPipeline"], 2
            )
            # Slower than real time (speed < 1.0) is treated as not supported
            if speed < 1.0:
                not_supported.append((model, device, os_info))
                continue

            performance_entry = {
                "model": model.replace("_", "/"),
                "device": get_device_name(device).replace("_", " "),
                "os": os_info.replace("_", " "),
                "timestamp": data["timestamp"],
                "speed": speed,
                "tokens_per_second": round(
                    data["tokens_per_second"]["totalDecodingLoops"]
                    / data["tokens_per_second"]["fullPipeline"],
                    2,
                ),
                "dataset_speed": {
                    dataset: round(
                        speed_info["inputAudioSeconds"] / speed_info["fullPipeline"], 2
                    )
                    for dataset, speed_info in data["dataset_speed"].items()
                },
                "dataset_tokens_per_second": {
                    dataset: round(
                        tps_info["totalDecodingLoops"] / tps_info["fullPipeline"], 2
                    )
                    for dataset, tps_info in data["dataset_tokens_per_second"].items()
                },
                "average_wer": compute_average_wer(data["average_wer"]),
                "dataset_average_wer": {
                    dataset: compute_average_wer(data["dataset_wer"][dataset])
                    for dataset in data["dataset_wer"]
                },
                "qoi": round(mean(data["qoi"]), 2),
                "commit_hash": data["commit_hash"],
                "commit_timestamp": data["commit_timestamp"],
            }
            json.dump(performance_entry, performance_file)
            performance_file.write("\n")
    return not_supported
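
# performance_data.json is JSON Lines: one object per (model, device, OS)
# combination. Illustrative (hypothetical) entry:
#   {"model": "openai/whisper-tiny", "device": "MacBook Air (M2)",
#    "os": "macOS 14.5", "speed": 21.5, "tokens_per_second": 110.0, ...}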


def calculate_and_save_support_results(
    support_results, not_supported, support_output_path
):
    """
    Calculates device support results and saves them to separate CSV files for
    each release.

    This function processes the device support data and creates separate CSV
    files showing which models are supported on different devices and OS
    versions, using checkmarks, warning signs, question marks, or
    "Not Supported" to indicate support status.

    :param support_results: Dictionary containing device support information.
    :param not_supported: List of (model, device, os) tuples that are not supported.
    :param support_output_path: Base path to save the processed support results.
    """
    all_models = sorted(support_results["modelsTested"])

    # Group results by commit hash
    results_by_commit = {}
    for key, data in support_results.items():
        if key in ["modelsTested", "devices"]:
            continue
        device, os_info, commit_hash = key
        if commit_hash not in results_by_commit:
            results_by_commit[commit_hash] = {
                "data": {},
                "devices": set(),
                "timestamp": data["commitTimestamp"],
            }
        results_by_commit[commit_hash]["data"][key] = data
        results_by_commit[commit_hash]["devices"].add(device)

    # Generate a separate CSV for each commit
    for commit_hash, commit_data in results_by_commit.items():
        commit_devices = sorted(commit_data["devices"])
        df = pd.DataFrame(index=all_models, columns=["Model"] + commit_devices)

        for model in all_models:
            row = {"Model": model}
            for device in commit_devices:
                row[device] = ""
            for key, data in commit_data["data"].items():
                device, os_info, _ = key
                supported_models = data["supportedModels"]
                failed_models, file_path = data["failedModels"]
                directories = file_path.split("/")
                commit_file, summary_file = directories[-2], directories[-1]
                url = f"{BASE_WHISPERKIT_BENCHMARK_URL}/{commit_file}/{urllib.parse.quote(summary_file)}"
                if model in supported_models:
                    current_value = row[device]
                    new_value = (
                        f"✅ {os_info}"
                        if current_value == ""
                        else f"{current_value}<p>✅ {os_info}</p>"
                    )
                elif model in failed_models:
                    # Failed cells link to the summary file for that run
                    current_value = row[device]
                    new_value = (
                        f"""⚠️ <a style='color: #3B82F6; text-decoration: underline; text-decoration-style: dotted;' href={url}>{os_info}</a>"""
                        if current_value == ""
                        else f"""{current_value}<p>⚠️ <a style='color: #3B82F6; text-decoration: underline; text-decoration-style: dotted;' href={url}>{os_info}</a></p>"""
                    )
                else:
                    current_value = row[device]
                    new_value = (
                        f"? {os_info}"
                        if current_value == ""
                        else f"{current_value}<p>? {os_info}</p>"
                    )
                row[device] = new_value
            df.loc[model] = row

        # Mark unsupported combinations for this commit. Note: support_results
        # keys are (device, os, commit_hash), so match on the device field.
        commit_not_supported = [
            (model, device, os_info)
            for model, device, os_info in not_supported
            if any(
                key[2] == commit_hash and key[0] == device
                for key in support_results
                if key not in ["modelsTested", "devices"]
            )
        ]
        remove_unsupported_cells(df, commit_not_supported)

        # Format column headers
        cols = df.columns.tolist()
        cols = ["Model"] + [
            f"""{get_device_name(col).replace("_", " ")} ({col})"""
            for col in cols
            if col != "Model"
        ]
        df.columns = cols

        # Save to a commit-specific file
        output_path = support_output_path.replace(".csv", f"_{commit_hash[:7]}.csv")
        df.to_csv(output_path, index=True)


def remove_unsupported_cells(df, not_supported):
    """
    Updates the DataFrame to mark unsupported model-device combinations.

    This function reads a configuration file to determine which models are
    supported on which devices. It then iterates over the DataFrame and sets
    the value to "Not Supported" for any model-device combination that is not
    supported according to the configuration.

    :param df: A Pandas DataFrame where the index represents models and columns
        represent devices.
    :param not_supported: List of (model, device, os) tuples that are not supported.
    """
    with open("dashboard_data/config.json", "r") as file:
        config_data = json.load(file)

    device_support = config_data["device_support"]
    for info in device_support:
        identifiers = set(info["identifiers"])
        supported = set(info["models"]["supported"])
        for model in df.index:
            for device in df.columns:
                if (
                    any(identifier in device for identifier in identifiers)
                    and model not in supported
                ):
                    df.at[model, device] = "Not Supported"

    for model, device, _ in not_supported:
        # Guard against devices that are not columns of this commit's table
        if device in df.columns:
            df.at[model, device] = "Not Supported"


def download_device_json_safe(file_path):
    """
    Safely downloads a device JSON file from GitHub, returning None if it
    doesn't exist.

    :param file_path: Path to the JSON file within the repository.
    :returns: The JSON data as a dictionary, or None if the file doesn't exist.
    """
    try:
        return download_json_from_github(file_path=file_path)
    except SystemExit:
        # File doesn't exist or another error occurred
        return None


def load_device_json_local(file_path):
    """
    Safely loads a local device JSON file, returning None if it doesn't exist.

    :param file_path: Local path to the JSON file.
    :returns: The JSON data as a dictionary, or None if the file doesn't exist.
    """
    try:
        with open(file_path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return None


def build_chip_mapping():
    """
    Builds a mapping from device SKUs to their chip types.

    :returns: Dictionary where keys are device SKUs and values are chip types.
    """
    sku_to_chip = {}

    for platform_file in ("iPad", "iPhone", "Mac"):
        data = load_device_json_local(f"dashboard_data/{platform_file}.json")
        if not data or "total_menu" not in data:
            continue
        for device_info in data["total_menu"].values():
            if "sku" in device_info and "chip" in device_info:
                # iPad stores sku as an array; iPhone and Mac as a single string
                skus = (
                    device_info["sku"]
                    if isinstance(device_info["sku"], list)
                    else [device_info["sku"]]
                )
                for sku in skus:
                    sku_to_chip[sku] = device_info["chip"]

    return sku_to_chip
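
# Illustrative (hypothetical) entries of the resulting mapping:
#   {"iPhone15,2": "A16", "iPad14,1": "A15", "Mac14,2": "M2"}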


def get_platform_from_sku(sku):
    """
    Determines the platform (iPad, iPhone, Mac) from a device SKU.

    :param sku: Device SKU string.
    :returns: Platform string ('iPad', 'iPhone', 'Mac') or 'Unknown'.
    """
    if sku.startswith("iPad"):
        return "iPad"
    elif sku.startswith("iPhone"):
        return "iPhone"
    elif sku.startswith(("Mac", "iMac")):
        # "MacBook..." identifiers also match the "Mac" prefix
        return "Mac"
    else:
        return "Unknown"


def normalize_chip_name(chip):
    """
    Normalizes chip names for consistent grouping.

    :param chip: Raw chip name from device JSON.
    :returns: Normalized chip name.
    """
    # Handle variations like "M4 Pro" -> "M4" while keeping distinct
    # generations separate
    chip = chip.strip()

    # A-series Pro variants stay separate since they have different
    # capabilities (e.g. A17 Pro vs. A17)
    if chip.startswith("A") and "Pro" in chip:
        return chip

    # M-series Pro/Max/Ultra variants are grouped with their base generation
    if chip.startswith("M"):
        parts = chip.split()
        if len(parts) > 0:
            return parts[0]  # Just "M1", "M2", etc.

    return chip
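
# Examples: normalize_chip_name("M4 Pro") -> "M4" (M-series variants grouped),
# while normalize_chip_name("A17 Pro") -> "A17 Pro" (A-series Pro kept distinct).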


def build_sku_group_mapping():
    """
    Builds a mapping from individual SKUs to all SKUs that share the same chip
    on the same platform.

    This implements chip-based coverage where testing one device with a
    specific chip provides coverage for all devices with that chip on the same
    platform.

    :returns: Dictionary where keys are individual SKUs and values are sets of
        all SKUs in that chip group.
    """
    sku_to_chip = build_chip_mapping()
    sku_to_group = {}

    # Group SKUs by (platform, normalized chip)
    platform_chip_groups = {}
    for sku, chip in sku_to_chip.items():
        key = (get_platform_from_sku(sku), normalize_chip_name(chip))
        if key not in platform_chip_groups:
            platform_chip_groups[key] = set()
        platform_chip_groups[key].add(sku)

    # Reverse mapping: each SKU maps to all SKUs in its chip group
    for sku, chip in sku_to_chip.items():
        key = (get_platform_from_sku(sku), normalize_chip_name(chip))
        sku_to_group[sku] = platform_chip_groups[key]

    return sku_to_group
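
# For example (hypothetical SKUs): if iPhone15,2 and iPhone15,3 both carry the
# A16 chip, each maps to the group {"iPhone15,2", "iPhone15,3"}.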


def expand_tested_devices(tested_devices, sku_mapping):
    """
    Expands tested devices to include all SKUs in the same group.

    :param tested_devices: Set of device SKUs that were actually tested.
    :param sku_mapping: Dictionary mapping individual SKUs to their complete groups.
    :returns: Expanded set of devices including all SKUs in the same groups.
    """
    expanded_devices = set(tested_devices)
    for device in tested_devices:
        if device in sku_mapping:
            # Add all SKUs from the same chip group
            expanded_devices.update(sku_mapping[device])
    return expanded_devices
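
# Example with a hypothetical mapping:
#   expand_tested_devices({"iPhone15,2"}, {"iPhone15,2": {"iPhone15,2", "iPhone15,3"}})
#   -> {"iPhone15,2", "iPhone15,3"}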


def get_test_iphones():
    """
    Gets iPhone SKU identifiers from the local iPhone.json file.
    """
    iphone_data = load_device_json_local("dashboard_data/iPhone.json")
    if iphone_data and "total_menu" in iphone_data:
        return {
            device_info["sku"]
            for device_info in iphone_data["total_menu"].values()
            if "sku" in device_info
        }
    return set()


def get_test_macs():
    """
    Gets Mac SKU identifiers from the local Mac.json file.
    """
    mac_data = load_device_json_local("dashboard_data/Mac.json")
    if mac_data and "total_menu" in mac_data:
        return {
            device_info["sku"]
            for device_info in mac_data["total_menu"].values()
            if "sku" in device_info
        }
    return set()


def get_all_supported_devices():
    """
    Gets all supported device identifiers from the config file.

    :returns: A set of device identifiers.
    """
    with open("dashboard_data/config.json", "r") as f:
        config = json.load(f)
    devices = set()
    for device_group in config["device_support"]:
        devices.update(device_group["identifiers"])
    return devices


def get_tested_devices_for_commit(performance_results, support_results, commit_hash):
    """
    Gets all device identifiers that were actually tested for a specific
    commit, including all SKUs in the same chip groups as tested devices.

    Uses chip-based coverage logic where testing one device with a specific
    chip provides coverage for all devices with that chip on the same platform.

    :returns: A set of device identifiers.
    """
    tested_devices = set()

    # From performance results (benchmark files)
    for key, result in performance_results.items():
        if len(key) >= 4 and result.get("commit_hash") == commit_hash:
            _, device, _, _ = key
            tested_devices.add(device)

    # From support results (summary files)
    for key, result in support_results.items():
        if key in ["modelsTested", "devices"]:
            continue
        if len(key) >= 3 and result.get("commitHash") == commit_hash:
            device, _, _ = key
            tested_devices.add(device)

    # Expand to include all SKUs in the same chip groups across platforms
    sku_mapping = build_sku_group_mapping()
    return expand_tested_devices(tested_devices, sku_mapping)


def get_tested_os_versions_for_commit(
    performance_results, support_results, commit_hash
):
    """
    Gets all OS versions that were actually tested for a specific commit.

    :returns: A set of OS version strings like 'iOS_17.2' or 'macOS_14.5'.
    """
    tested_os_versions = set()

    # From performance results (benchmark files)
    for key, result in performance_results.items():
        if len(key) >= 4 and result.get("commit_hash") == commit_hash:
            _, _, os_info, _ = key
            tested_os_versions.add(os_info)

    # From support results (summary files)
    for key, result in support_results.items():
        if key in ["modelsTested", "devices"]:
            continue
        if len(key) >= 3 and result.get("commitHash") == commit_hash:
            _, os_version, _ = key
            # Convert "iOS 17.2" to "iOS_17.2" for consistency
            tested_os_versions.add(os_version.replace(" ", "_"))

    return tested_os_versions


def check_target_os_coverage(tested_os_versions):
    """
    Check if the tested OS versions include ALL of the target OS versions:
    - macOS 14, 15, 26
    - iOS 17, 18, 26 (noting that iOS and iPadOS are the same under the hood)

    Returns (is_fully_covered: bool, covered_versions: list, missing_versions: list)
    """
    target_macos_versions = {14, 15, 26}
    target_ios_versions = {17, 18, 26}

    covered_macos = set()
    covered_ios = set()
    for os_version in tested_os_versions:
        # Parse OS version strings like "iOS_17.2" or "macOS_14.5"
        if "_" in os_version:
            os_type, version_str = os_version.split("_", 1)
            try:
                # Extract the major version number
                major_version = int(version_str.split(".")[0])
                if os_type == "macOS" and major_version in target_macos_versions:
                    covered_macos.add(major_version)
                elif (
                    os_type in ["iOS", "iPadOS"]
                    and major_version in target_ios_versions
                ):
                    covered_ios.add(major_version)
            except (ValueError, IndexError):
                # Skip versions we can't parse
                continue

    # Check what's missing
    missing_macos = target_macos_versions - covered_macos
    missing_ios = target_ios_versions - covered_ios

    # Format covered and missing versions
    covered_versions = [f"macOS {v}" for v in sorted(covered_macos)]
    covered_versions.extend([f"iOS {v}" for v in sorted(covered_ios)])
    missing_versions = [f"macOS {v}" for v in sorted(missing_macos)]
    missing_versions.extend([f"iOS {v}" for v in sorted(missing_ios)])

    # Fully covered only if nothing is missing
    is_fully_covered = len(missing_versions) == 0
    return is_fully_covered, covered_versions, missing_versions
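
# Example:
#   check_target_os_coverage({"iOS_17.2", "macOS_14.5"})
#   -> (False, ["macOS 14", "iOS 17"], ["macOS 15", "macOS 26", "iOS 18", "iOS 26"])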


def check_chip_coverage(tested_devices):
    """
    Check if the tested devices provide complete chip coverage for each platform.

    Target coverage:
    - iPad: A14, A15, A16, A17 Pro, M1, M2, M3, M4
    - iPhone: A14, A15, A16, A17 Pro, A18, A18 Pro
    - Mac: M1, M2, M3, M4

    :param tested_devices: Set of device SKUs that were tested.
    :returns: (is_fully_covered: bool, platform_coverage: dict, missing_chips: dict)
    """
    # Define target chips for each platform
    target_chips = {
        "iPad": {"A14", "A15", "A16", "A17 Pro", "M1", "M2", "M3", "M4"},
        "iPhone": {"A14", "A15", "A16", "A17 Pro", "A18", "A18 Pro"},
        "Mac": {"M1", "M2", "M3", "M4"},
    }

    # Build mapping from SKUs to chips
    sku_to_chip = build_chip_mapping()

    # Track which chips were tested for each platform
    tested_chips = {"iPad": set(), "iPhone": set(), "Mac": set()}
    for device_sku in tested_devices:
        if device_sku in sku_to_chip:
            platform = get_platform_from_sku(device_sku)
            chip = normalize_chip_name(sku_to_chip[device_sku])
            if platform in tested_chips:
                tested_chips[platform].add(chip)

    # Calculate coverage for each platform
    platform_coverage = {}
    missing_chips = {}
    for platform, target_set in target_chips.items():
        covered_set = tested_chips[platform]
        missing_set = target_set - covered_set
        platform_coverage[platform] = {
            "total_chips": len(target_set),
            "tested_chips": len(covered_set),
            "coverage_percentage": (len(covered_set) / len(target_set)) * 100
            if target_set
            else 0,
            "covered_chips": sorted(covered_set),
            "missing_chips": sorted(missing_set),
        }
        missing_chips[platform] = sorted(missing_set)

    # Overall coverage is complete only if every platform is fully covered
    is_fully_covered = all(
        len(missing_chips[platform]) == 0 for platform in target_chips
    )
    return is_fully_covered, platform_coverage, missing_chips
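
# Sketch of one platform's entry when all target Mac chips were exercised
# (hypothetical run): platform_coverage["Mac"] ->
#   {"total_chips": 4, "tested_chips": 4, "coverage_percentage": 100.0,
#    "covered_chips": ["M1", "M2", "M3", "M4"], "missing_chips": []}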


def generate_test_coverage_report(
    performance_results, support_results, output_dir="dashboard_data"
):
    """
    Generates test coverage reports for each commit, showing which devices
    were tested vs. skipped.
    """
    # Get all possible devices from the config
    all_devices = get_all_supported_devices()

    # Collect all unique commit hashes from both result sets
    commit_hashes = set()
    for key, result in performance_results.items():
        if len(key) >= 4 and result.get("commit_hash"):
            commit_hashes.add(result["commit_hash"])
    for key, result in support_results.items():
        if key in ["modelsTested", "devices"]:
            continue
        if len(key) >= 3 and result.get("commitHash"):
            commit_hashes.add(result["commitHash"])
    print(f"Found {len(commit_hashes)} commit hashes to analyze")

    # Generate a coverage report for each commit
    for commit_hash in commit_hashes:
        tested_devices = get_tested_devices_for_commit(
            performance_results, support_results, commit_hash
        )
        tested_os_versions = get_tested_os_versions_for_commit(
            performance_results, support_results, commit_hash
        )

        # Check target OS coverage
        os_fully_covered, covered_versions, missing_versions = check_target_os_coverage(
            tested_os_versions
        )
        # Check chip coverage for all platforms
        chip_fully_covered, platform_coverage, missing_chips = check_chip_coverage(
            tested_devices
        )

        skipped_devices = all_devices - tested_devices

        coverage_report = {
            "commit_hash": commit_hash,
            "total_devices": len(all_devices),
            "tested_devices": len(tested_devices),
            "skipped_devices": len(skipped_devices),
            "coverage_percentage": (len(tested_devices) / len(all_devices)) * 100,
            # Convert sets to lists for JSON serialization
            "tested_device_list": list(tested_devices),
            "skipped_device_list": list(skipped_devices),
            "tested_os_versions": list(tested_os_versions),
            "has_target_os_coverage": os_fully_covered,
            "covered_target_versions": covered_versions,
            "missing_target_versions": missing_versions,
            "has_target_chip_coverage": chip_fully_covered,
            "platform_chip_coverage": platform_coverage,
            "missing_target_chips": missing_chips,
        }

        # Save the report for this commit
        output_file = os.path.join(output_dir, f"test_coverage_{commit_hash}.json")
        with open(output_file, "w") as f:
            json.dump(coverage_report, f, indent=2)

        os_coverage_info = (
            f"OS coverage: {'✅ Complete' if os_fully_covered else '❌ Incomplete'}"
        )
        chip_coverage_info = (
            f"Chip coverage: {'✅ Complete' if chip_fully_covered else '❌ Incomplete'}"
        )
        print(
            f"Generated coverage report for commit {commit_hash}: "
            f"{len(tested_devices)}/{len(all_devices)} devices tested "
            f"({coverage_report['coverage_percentage']:.1f}%) "
            f"({os_coverage_info}, {chip_coverage_info})"
        )


def main():
    """
    Main function to orchestrate the performance data generation process.

    This function performs the following steps:
    1. Downloads benchmark data if requested.
    2. Fetches evaluation data for various datasets.
    3. Processes benchmark files and summary files.
    4. Calculates and saves performance and support results.
    5. Generates test coverage reports for each commit.
    """
    source_xcresult_repo = "argmaxinc/whisperkit-evals-dataset"
    source_xcresult_subfolder = "benchmark_data/"
    source_xcresult_directory = f"{source_xcresult_repo}/{source_xcresult_subfolder}"
    if len(sys.argv) > 1 and sys.argv[1] == "download":
        try:
            shutil.rmtree(source_xcresult_repo)
        except FileNotFoundError:
            print("Nothing to remove.")
        download_dataset(
            source_xcresult_repo, source_xcresult_repo, source_xcresult_subfolder
        )

    datasets = {
        "Earnings-22": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "LibriSpeech": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
        "earnings22-10mins": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "librispeech-10mins": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
        "earnings22-12hours": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json",
        "librispeech": "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true",
    }

    dataset_dfs = {}
    for dataset_name, url in datasets.items():
        evals = fetch_evaluation_data(url)
        dataset_dfs[dataset_name] = pd.json_normalize(evals["results"])

    performance_results = defaultdict(
        lambda: {
            "average_wer": [],
            "dataset_wer": defaultdict(list),
            "qoi": [],
            "speed": {"inputAudioSeconds": 0, "fullPipeline": 0},
            "tokens_per_second": {"totalDecodingLoops": 0, "fullPipeline": 0},
            "dataset_speed": defaultdict(
                lambda: {"inputAudioSeconds": 0, "fullPipeline": 0}
            ),
            "dataset_tokens_per_second": defaultdict(
                lambda: {"totalDecodingLoops": 0, "fullPipeline": 0}
            ),
            "timestamp": None,
            "commit_hash": None,
            "commit_timestamp": None,
            "test_timestamp": None,
        }
    )
    support_results = {"modelsTested": set(), "devices": set()}

    generate_device_map(source_xcresult_directory)

    with open("dashboard_data/version.json", "r") as f:
        version = json.load(f)
    releases = set(version["releases"])

    for subdir, _, files in os.walk(source_xcresult_directory):
        for filename in files:
            file_path = os.path.join(subdir, filename)
            if not filename.endswith(".json"):
                continue
            elif "summary" in filename:
                process_summary_file(file_path, support_results, releases)
            else:
                process_benchmark_file(
                    file_path, dataset_dfs, performance_results, releases
                )

    not_supported = calculate_and_save_performance_results(
        performance_results, "dashboard_data/performance_data.json"
    )
    calculate_and_save_support_results(
        support_results, not_supported, "dashboard_data/support_data.csv"
    )

    # Generate test coverage reports
    generate_test_coverage_report(performance_results, support_results)


if __name__ == "__main__":
    main()