# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os

from tqdm import tqdm

import datasets
import evaluate
from seametrics.user_friendly.utils import payload_to_uf_metrics, UFM
from seametrics.payload import Payload

import wandb

_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
@article{milan2016mot16,
title={MOT16: A benchmark for multi-object tracking},
author={Milan, Anton and Leal-Taix{\'e}, Laura and Reid, Ian and Roth, Stefan and Schindler, Konrad},
journal={arXiv preprint arXiv:1603.00831},
year={2016}
}
"""

_DESCRIPTION = """\
The MOT Metrics module is designed to evaluate multi-object tracking (MOT)
algorithms by computing various metrics based on predicted and ground-truth
bounding boxes. It serves as a crucial tool in assessing the performance of MOT
systems, aiding in the iterative improvement of tracking algorithms."""

_KWARGS_DESCRIPTION = """
Calculates how good the predictions are compared to some references, using several scores.
Args:
    predictions: list of predictions to score, one entry per sequence. Each entry
        contains the predicted bounding boxes as nested sequences of floats.
    references: list of references, one entry per prediction. Each entry maps a
        filter-range name (e.g. "all") to the ground-truth bounding boxes of the
        corresponding sequence.
    iou_threshold (`float`, *optional*):
        Minimum Intersection over Union (IoU) for a detection to count as a
        true positive. Defaults to 1e-10.
    recognition_thresholds (`list` of `float`, *optional*):
        Recognition thresholds used to derive the mostly-tracked scores and
        counts. Defaults to [0.3, 0.5, 0.8].
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class UserFriendlyMetrics(evaluate.Metric):
    """Computes user-friendly multi-object tracking metrics from predicted and ground-truth bounding boxes."""

    def __init__(
        self,
        iou_threshold: float = 1e-10,
        recognition_thresholds=[0.3, 0.5, 0.8],
        filter_dict={"name": "area", "ranges": [("all", [0, 1e5**2])]},
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Save parameters for later use in _compute and compute_from_payload.
        self.iou_threshold = iou_threshold
        self.filter_dict = filter_dict
        self.recognition_thresholds = recognition_thresholds
        self.metric = UFM(iou_threshold, recognition_thresholds)

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
module_type="metric", description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, # This defines the format of each prediction and reference features=datasets.Features({ "predictions": datasets.Sequence( datasets.Sequence(datasets.Value("float")) ), "references": datasets.Features({ "all": datasets.Sequence(datasets.Sequence(datasets.Value("float")))} ) }), #couldn't get this to work # Additional links to the codebase or references codebase_urls=["http://github.com/path/to/codebase/of/new_module"], reference_urls=["http://path.to.reference.url/new_module"], ) def _download_and_prepare(self, dl_manager): """Optional: download external resources useful to compute the scores""" # TODO: Download external resources if needed pass def _compute( self, predictions, references, ): results = {} filter_ranges = self.filter_dict["ranges"] for filter_range in filter_ranges: filter_range_name = filter_range[0] range_results = {} for sequence_predictions, sequence_references in zip(predictions, references): sequence_range_results = self.metric.calculate( sequence_predictions, sequence_references[filter_range_name], ) range_results = sum_dicts(range_results, sequence_range_results) results[filter_range_name] = self.metric.derive_scores(range_results, self.recognition_thresholds) return results def compute_from_payload(self, payload: Payload, ): results = {} for model_name in payload.models: results[model_name] = {"overall": {}, "per_sequence": {}} # per-sequence loop progress_bar = tqdm(payload.sequences.items()) for seq_name, sequence in progress_bar: progress_bar.set_description(f"Getting sequence payload: {seq_name}") # create new payload only with specific sequence and model sequence_payload = Payload( dataset=payload.dataset, gt_field_name=payload.gt_field_name, models=[model_name], sequences={seq_name: sequence} ) progress_bar.set_description(f"Processing sequence: {seq_name}") predictions, references = payload_to_uf_metrics(sequence_payload, model_name=model_name, filter_dict=self.filter_dict) results[model_name]["per_sequence"][seq_name] = self._compute(predictions=predictions, references=references) # overall model_payload = Payload( dataset=payload.dataset, gt_field_name=payload.gt_field_name, models=[model_name], sequences=payload.sequences ) predictions, references = payload_to_uf_metrics(model_payload, model_name=model_name, filter_dict=self.filter_dict) results[model_name]["overall"] = self._compute(predictions=predictions, references=references) return results def wandb( self, results, wandb_section: str = None, wandb_runs = None, wandb_project="user_friendly_metrics", log_plots: bool = True, debug: bool = False, log_per_sequence = False ): """ Logs metrics to Weights and Biases (wandb) for tracking and visualization, including categorized bar charts for overall metrics. Args: results (dict): Results dictionary with 'overall' and 'per_sequence' keys. wandb_section (str, optional): W&B section for metric grouping. Defaults to None. wandb_project (str, optional): The name of the wandb project. Defaults to 'user_friendly_metrics'. log_plots (bool, optional): Generates categorized bar charts for overall metrics. Defaults to True. debug (bool, optional): Logs detailed summaries and histories to the terminal console. Defaults to False. 
""" current_datetime = datetime.datetime.now() formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") wandb.login(key=os.getenv("WANDB_API_KEY")) if wandb_runs is not None: assert len(wandb_runs) == len(results), "runs and results must have the same length" else: wandb_runs = [f"{i}_{formatted_datetime}" for i in list(results.keys())] for wandb_run_name, result in zip(wandb_runs, results.values()): self.wandb_run(result = result, wandb_run_name = wandb_run_name, wandb_project = wandb_project, debug = debug, wandb_section = wandb_section, log_plots = log_plots, log_per_sequence = log_per_sequence) def wandb_run(self, result, wandb_run_name, wandb_project, debug, wandb_section = None, log_plots = True, log_per_sequence = False): run = wandb.init( project = wandb_project, name = wandb_run_name, reinit = True, settings = wandb.Settings(silent=not debug), ) categories = { "user_friendly_metrics": { f"mostly_tracked_score_{str(threshold).replace('.', '_')}" for threshold in self.recognition_thresholds }, "evaluation_metrics_dev": { "recall", }, "user_friendly_metrics_dev": { f"mostly_tracked_count_{str(threshold).replace('.', '_')}" for threshold in self.recognition_thresholds }.union("unique_object_count"), "predictions_summary": { "tp", "fn", }, } chart_data = {key: [] for key in categories.keys()} # Log overall metrics if "overall" in result: for metric, value in result["overall"]["all"].items(): log_key = ( f"{wandb_section}/overall/{metric}" if wandb_section else f"overall/{metric}" ) run.log({log_key: value}) if debug: print(f" {log_key} = {value}") for category, metrics in categories.items(): if metric in metrics: chart_data[category].append([metric, value]) print("----------------------------------------------------") if log_plots: for category, data in chart_data.items(): if data: table_data = [[label, value] for label, value in data] table = wandb.Table(data=table_data, columns=["metrics", "value"]) run.log( { f"{category}_bar_chart": wandb.plot.bar( table, "metrics", "value", title=f"{category.replace('_', ' ').title()}", ) } ) if log_per_sequence: if "per_sequence" in result: sorted_sequences = sorted( result["per_sequence"].items(), key=lambda x: next(iter(x[1].values()), {}).get("all", {}).get("recall", 0), reverse=True, # Set to True for descending order ) for sequence_name, sequence_data in sorted_sequences: for metric, value in sequence_data["all"].items(): log_key = ( f"{wandb_section}/per_sequence/{sequence_name}/{metric}" if wandb_section else f"per_sequence/{sequence_name}/{metric}" ) run.log({log_key: value}) if debug: print(f" {log_key} = {value}") print("----------------------------------------------------") if debug: print("\nDebug Mode: Logging Summary and History") print(f"Results Summary:\n{result}") print(f"WandB Settings:\n{run.settings}") print("All metrics have been logged.") run.finish() def sum_dicts(*dicts): """ Sums multiple dictionaries with depth one. If keys overlap, their values are summed. If keys are unique, they are simply included in the result. Args: *dicts: Any number of dictionaries to be summed. Returns: A single dictionary with the summed values. """ result = {} for d in dicts: for key, value in d.items(): if key in result: result[key] += value else: result[key] = value return result