# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
from scipy.optimize import linear_sum_assignment

from ...utils._param_validation import StrOptions, validate_params
from ...utils.validation import check_array, check_consistent_length

__all__ = ["consensus_score"]


def _check_rows_and_columns(a, b):
    """Unpack and validate the row and column indicators of two bicluster sets.

    Parameters
    ----------
    a, b : tuple (rows, columns)
        Each tuple holds the row indicators and the column indicators of one
        set of biclusters.

    Returns
    -------
    a_rows, a_cols, b_rows, b_cols
        The four inputs after being passed through ``check_array``.
    """
    # Within each set, rows and columns must describe the same number of
    # biclusters.
    check_consistent_length(*a)
    check_consistent_length(*b)
    # ``ensure_2d=False`` so that inputs that are not 2D are not rejected on
    # dimensionality alone.
    a_rows, a_cols = (check_array(x, ensure_2d=False) for x in a)
    b_rows, b_cols = (check_array(x, ensure_2d=False) for x in b)
    return a_rows, a_cols, b_rows, b_cols


def _jaccard(a_rows, a_cols, b_rows, b_cols):
    """Jaccard coefficient on the elements of the two biclusters.

    The number of elements of a bicluster is computed as the number of
    selected rows times the number of selected columns; the shared elements
    are counted the same way from the element-wise products of the indicators.

    Returns 0.0 when both biclusters are empty (the union has no elements),
    instead of evaluating 0 / 0.
    """
    intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()

    a_size = a_rows.sum() * a_cols.sum()
    b_size = b_rows.sum() * b_cols.sum()
    union = a_size + b_size - intersection

    # 0 / 0 would produce NaN, which ``linear_sum_assignment`` rejects with an
    # unrelated "invalid numeric entries" error further down the line.
    if union == 0:
        return 0.0
    return intersection / union


def _pairwise_similarity(a, b, similarity):
    """Compute the pairwise similarity matrix of two sets of biclusters.

    Parameters
    ----------
    a, b : tuple (rows, columns)
        Row and column indicators of the two sets of biclusters.
    similarity : callable
        Called as ``similarity(a_rows, a_columns, b_rows, b_columns)`` on the
        indicators of one bicluster from each set.

    Returns
    -------
    result : ndarray
        ``result[i, j]`` is the similarity of ``a``'s bicluster ``i`` and
        ``b``'s bicluster ``j``.
    """
    a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
    # Materialised once because it is traversed again for every bicluster of a.
    b_pairs = list(zip(b_rows, b_cols))
    return np.array(
        [
            [similarity(row_a, col_a, row_b, col_b) for row_b, col_b in b_pairs]
            for row_a, col_a in zip(a_rows, a_cols)
        ]
    )


@validate_params(
    {
        "a": [tuple],
        "b": [tuple],
        "similarity": [callable, StrOptions({"jaccard"})],
    },
    prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
    """The similarity of two sets of biclusters.

    Similarity between individual biclusters is computed. Then the best
    matching between sets is found by solving a linear sum assignment problem,
    using a modified Jonker-Volgenant algorithm.
    The final score is the sum of similarities divided by the size of
    the larger set.

    Read more in the :ref:`User Guide <biclustering>`.

    Parameters
    ----------
    a : tuple (rows, columns)
        Tuple of row and column indicators for a set of biclusters.

    b : tuple (rows, columns)
        Another set of biclusters like ``a``.

    similarity : 'jaccard' or callable, default='jaccard'
        May be the string "jaccard" to use the Jaccard coefficient, or
        any function that takes four arguments, each of which is a 1d
        indicator vector: (a_rows, a_columns, b_rows, b_columns).

    Returns
    -------
    consensus_score : float
       Consensus score, a non-negative value, sum of similarities
       divided by size of larger set.

    See Also
    --------
    scipy.optimize.linear_sum_assignment : Solve the linear sum assignment
        problem.

    References
    ----------
    * Hochreiter, Bodenhofer, et al., 2010. FABIA: factor analysis
      for bicluster acquisition.

    Examples
    --------
    >>> from sklearn.metrics import consensus_score
    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
    >>> consensus_score(a, b, similarity='jaccard')
    np.float64(1.0)
    """
    if similarity == "jaccard":
        similarity = _jaccard
    matrix = _pairwise_similarity(a, b, similarity)
    # The solver minimises cost, so similarities are turned into costs.
    row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
    n_a = len(a[0])
    n_b = len(b[0])
    # Dividing by the larger set size means unmatched biclusters lower the
    # score.
    return matrix[row_indices, col_indices].sum() / max(n_a, n_b)