ArneBinder committed on
Commit d868d2e · verified · 1 Parent(s): 5fbc03a

update from https://github.com/ArneBinder/argumentation-structure-identification/pull/529

Files changed (39)
  1. src/analysis/combine_job_returns.py +207 -21
  2. src/analysis/common.py +27 -6
  3. src/analysis/compare_job_returns.py +1 -1
  4. src/analysis/format_metric_results.py +269 -0
  5. src/analysis/get_json_field_as_string.py +55 -0
  6. src/analysis/show_inference_params_on_quality_and_throughput.py +485 -0
  7. src/datamodules/__init__.py +1 -1
  8. src/datamodules/datamodule_with_sampler.py +59 -0
  9. src/dataset/processing.py +88 -3
  10. src/demo/annotation_utils.py +6 -56
  11. src/demo/backend_utils.py +50 -12
  12. src/demo/retrieve_and_dump_all_relevant.py +82 -38
  13. src/document/processing.py +300 -1
  14. src/evaluate.py +3 -3
  15. src/evaluate_documents.py +1 -1
  16. src/hydra_callbacks/save_job_return_value.py +67 -4
  17. src/langchain_modules/basic_pie_document_store.py +3 -1
  18. src/langchain_modules/datasets_pie_document_store.py +1 -1
  19. src/metrics/__init__.py +7 -1
  20. src/metrics/connected_component_sizes.py +43 -0
  21. src/metrics/coref.py +223 -0
  22. src/metrics/coref_sklearn.py +158 -43
  23. src/metrics/f1_with_bootstrapping.py +103 -0
  24. src/metrics/f1_with_threshold.py +33 -0
  25. src/metrics/ranking_sklearn.py +193 -0
  26. src/metrics/score_distribution.py +13 -4
  27. src/metrics/semantically_same_ranking.py +448 -0
  28. src/metrics/tpfpfn.py +193 -0
  29. src/models/__init__.py +2 -1
  30. src/models/sequence_classification_with_pooler.py +65 -30
  31. src/predict.py +2 -2
  32. src/serializer/__init__.py +4 -1
  33. src/serializer/interface.py +1 -2
  34. src/serializer/json.py +7 -121
  35. src/start_demo.py +3 -2
  36. src/train.py +2 -3
  37. src/utils/graph_utils.py +47 -0
  38. src/utils/inference_utils.py +4 -1
  39. src/utils/pdf_utils/process_pdf.py +1 -1
src/analysis/combine_job_returns.py CHANGED
@@ -1,4 +1,5 @@
 import pyrootutils
 
 root = pyrootutils.setup_root(
     search_from=__file__,
@@ -6,14 +7,18 @@ root = pyrootutils.setup_root(
     pythonpath=True,
     dotenv=False,
 )
-
 import argparse
 import os
 
 import pandas as pd
 
 from src.analysis.common import read_nested_jsons
 
 
 def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[str | None, str]:
     parts = path_and_maybe_id.split(separator, 1)
@@ -22,7 +27,7 @@ def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[
     return parts[0], parts[1]
 
 
-def get_file_paths(paths_file: str, file_name: str, use_aggregated: bool) -> dict[str, str]:
     with open(paths_file, "r") as f:
         paths_maybe_with_ids = f.readlines()
     ids, paths = zip(*[separate_path_and_id(path.strip()) for path in paths_maybe_with_ids])
@@ -31,10 +36,40 @@ def get_file_paths(paths_file: str, file_name: str, use_aggregated: bool) -> dic
     file_base_name, ext = os.path.splitext(file_name)
     file_name = f"{file_base_name}.aggregated{ext}"
     file_paths = [os.path.join(path, file_name) for path in paths]
-    return {
-        id if id is not None else f"idx={idx}": path
         for idx, (id, path) in enumerate(zip(ids, file_paths))
-    }
 
 
 def main(
@@ -46,17 +81,36 @@ def main(
     format: str,
     transpose: bool = False,
     unpack_multirun_results: bool = False,
     in_percent: bool = False,
     reset_index: bool = False,
 ):
     file_paths = get_file_paths(
         paths_file=paths_file, file_name=file_name, use_aggregated=use_aggregated
     )
     data = read_nested_jsons(json_paths=file_paths)
 
     if columns is not None:
-        columns_multi_index = [tuple(col.split("/")) for col in columns]
         try:
             data_series = [data[col] for col in columns_multi_index]
         except KeyError as e:
             print(
@@ -88,35 +142,129 @@ def main(
     for level in sorted(unique_column_levels, reverse=True):
         data.columns = data.columns.droplevel(level)
 
-    if unpack_multirun_results:
         index_names = list(data.index.names)
-        data_series_lists = data.unstack()
-        data = pd.DataFrame.from_records(
-            data_series_lists.values, index=data_series_lists.index
-        ).stack()
-        for _, index_name in enumerate(index_names):
-            data = data.unstack(index_name)
-        data = data.T
 
     # needs to happen before rounding, otherwise the rounding will be off
     if in_percent:
-        data = data * 100
 
     if round_precision is not None:
         data = data.round(round_precision)
 
     # needs to happen before transposing
     if format == "markdown_mean_and_std":
-        if "mean" not in data.columns or "std" not in data.columns:
-            raise ValueError("Columns 'mean' and 'std' are required for this format.")
-        # create a single column with mean and std in the format: mean ± std
-        data = pd.DataFrame(
-            data["mean"].astype(str) + " ± " + data["std"].astype(str), columns=["mean ± std"]
-        )
 
     if transpose:
         data = data.T
 
     if reset_index:
         data = data.reset_index()
 
@@ -148,7 +296,17 @@ if __name__ == "__main__":
     parser.add_argument(
         "--unpack-multirun-results", action="store_true", help="Unpack multirun results"
     )
     parser.add_argument("--transpose", action="store_true", help="Transpose the table")
     parser.add_argument(
         "--round-precision",
         type=int,
@@ -160,6 +318,34 @@
     parser.add_argument(
         "--reset-index", action="store_true", help="Reset the index of the combined job returns"
     )
     parser.add_argument(
         "--format",
         type=str,
 
1
  import pyrootutils
2
+ from pandas import MultiIndex
3
 
4
  root = pyrootutils.setup_root(
5
  search_from=__file__,
 
7
  pythonpath=True,
8
  dotenv=False,
9
  )
 
10
  import argparse
11
+ import logging
12
  import os
13
+ from typing import Iterable, List, Optional, Tuple
14
 
15
+ import numpy as np
16
  import pandas as pd
17
 
18
  from src.analysis.common import read_nested_jsons
19
 
20
+ logger = logging.getLogger(__name__)
21
+
22
 
23
  def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[str | None, str]:
24
  parts = path_and_maybe_id.split(separator, 1)
 
27
  return parts[0], parts[1]
28
 
29
 
30
+ def get_file_paths(paths_file: str, file_name: str, use_aggregated: bool) -> List[Tuple[str, str]]:
31
  with open(paths_file, "r") as f:
32
  paths_maybe_with_ids = f.readlines()
33
  ids, paths = zip(*[separate_path_and_id(path.strip()) for path in paths_maybe_with_ids])
 
36
  file_base_name, ext = os.path.splitext(file_name)
37
  file_name = f"{file_base_name}.aggregated{ext}"
38
  file_paths = [os.path.join(path, file_name) for path in paths]
39
+ return [
40
+ (id if id is not None else f"idx={idx}", path)
41
  for idx, (id, path) in enumerate(zip(ids, file_paths))
42
+ ]
43
+
44
+
45
+ def get_job_id_col(index: pd.MultiIndex) -> Optional[Tuple]:
46
+ for idx in index:
47
+ if "job_id" in idx:
48
+ return idx
49
+ return None
50
+
51
+
52
+ def stringify(value: str | int | float | None | tuple | list) -> str:
53
+ if isinstance(value, str):
54
+ return value
55
+ if value is None:
56
+ return ""
57
+ if isinstance(value, float) and np.isnan(value):
58
+ return ""
59
+ if isinstance(value, (int, float)):
60
+ return str(value)
61
+ if isinstance(value, Iterable):
62
+ entries = [stringify(v) for v in value]
63
+ return "/".join(v for v in entries if v)
64
+ return value
65
+
66
+
67
+ def remove_part_from_multi_index(index: pd.MultiIndex, part: str) -> pd.MultiIndex:
68
+ new_index = []
69
+ for idx in index:
70
+ new_idx = tuple([i for i in idx if i != part])
71
+ new_index.append(new_idx)
72
+ return MultiIndex.from_tuples(new_index)
73
 
74
 
75
  def main(
 
81
  format: str,
82
  transpose: bool = False,
83
  unpack_multirun_results: bool = False,
84
+ unpack_multirun_results_with_job_id: bool = False,
85
  in_percent: bool = False,
86
  reset_index: bool = False,
87
+ sort_columns: bool = False,
88
+ stringify_column_names: bool = False,
89
+ column_regex_blacklist: Optional[List[str]] = None,
90
+ column_regex_whitelist: Optional[List[str]] = None,
91
+ replace_in_col_names: Optional[List[Tuple[str, str]]] = None,
92
  ):
93
  file_paths = get_file_paths(
94
  paths_file=paths_file, file_name=file_name, use_aggregated=use_aggregated
95
  )
96
  data = read_nested_jsons(json_paths=file_paths)
97
 
98
+ job_id_col = get_job_id_col(data.columns)
99
+
100
  if columns is not None:
101
+ columns_multi_index = [
102
+ tuple([part or np.nan for part in col.split("/")]) for col in columns
103
+ ]
104
+ if unpack_multirun_results_with_job_id:
105
+ if job_id_col is None:
106
+ raise ValueError("Job ID column not found in the data.")
107
+ if job_id_col not in columns_multi_index:
108
+ columns_multi_index.append(job_id_col)
109
  try:
110
+ available_cols = data.columns.tolist()
111
+ for col in columns_multi_index:
112
+ if col not in available_cols:
113
+ raise KeyError(f"Column {col} not found in the data.")
114
  data_series = [data[col] for col in columns_multi_index]
115
  except KeyError as e:
116
  print(
 
142
  for level in sorted(unique_column_levels, reverse=True):
143
  data.columns = data.columns.droplevel(level)
144
 
145
+ if unpack_multirun_results or unpack_multirun_results_with_job_id:
146
  index_names = list(data.index.names)
147
+ data_series_lists = data.copy()
148
+ job_ids = None
149
+ if job_id_col in data_series_lists.columns:
150
+ job_ids_series = data_series_lists.pop(job_id_col)
151
+ job_ids_frame = pd.DataFrame(pd.DataFrame.from_records(job_ids_series.values))
152
+ job_ids_frame.index = job_ids_series.index
153
+ # check that all rows are identical
154
+ if job_ids_frame.nunique().max():
155
+ job_ids = job_ids_frame.iloc[0]
156
+ else:
157
+ logger.warning(
158
+ "Job IDs are not identical across all rows. Cannot unpack "
159
+ "multirun results with job ids as columns."
160
+ )
161
+
162
+ while not isinstance(data_series_lists, pd.Series):
163
+ data_series_lists = data_series_lists.stack(future_stack=True)
164
+ data_series_lists = data_series_lists.dropna()
165
+ data = pd.DataFrame.from_records(data_series_lists.values, index=data_series_lists.index)
166
+ if job_ids is not None:
167
+ data.columns = job_ids
168
+ num_col_levels = data.index.nlevels - len(index_names)
169
+ for _ in range(num_col_levels):
170
+ data = data.unstack()
171
+ data.columns = data.columns.swaplevel(0, -1)
172
+ data = data.dropna(how="all", axis="columns")
173
 
174
  # needs to happen before rounding, otherwise the rounding will be off
175
  if in_percent:
176
+ float_columns = data.select_dtypes(include=["float64", "float32"]).columns
177
+ data[float_columns] = data[float_columns] * 100
178
 
179
  if round_precision is not None:
180
  data = data.round(round_precision)
181
 
182
  # needs to happen before transposing
183
  if format == "markdown_mean_and_std":
184
+ if data.columns.nlevels == 1:
185
+ data.columns = pd.MultiIndex.from_tuples([(col,) for col in data.columns.tolist()])
186
+
187
+ # get mean columns
188
+ mean_col_names = [col for col in data.columns if "mean" in col]
189
+ mean_columns = data[mean_col_names].copy()
190
+ # remove all "mean" from col names
191
+ mean_columns.columns = remove_part_from_multi_index(mean_columns.columns, "mean")
192
+ # get std columns
193
+ std_col_names = [col for col in data.columns if "std" in col]
194
+ std_columns = data[std_col_names].copy()
195
+ # remove all "std" from col names
196
+ std_columns.columns = remove_part_from_multi_index(std_columns.columns, "std")
197
+ # sanity check
198
+ if not mean_columns.columns.equals(std_columns.columns):
199
+ raise ValueError("Mean and std columns do not match.")
200
+ mean_and_std = mean_columns.astype(str) + " ± " + std_columns.astype(str)
201
+ mean_and_std.columns = [
202
+ ("mean ± std",) + (tuple(col) if col != ((),) else ()) for col in mean_columns.columns
203
+ ]
204
+ # remove mean and std columns from data
205
+ # we can not use drop because the columns is a multiindex that may contain NaNs
206
+ other_cols = [
207
+ col for col in data.columns if col not in set(mean_col_names + std_col_names)
208
+ ]
209
+ data = data[other_cols]
210
+ # add mean and std columns to data
211
+ data = pd.concat([data, mean_and_std], axis=1)
212
+ if data.columns.nlevels == 1:
213
+ data.columns = data.columns.to_flat_index()
214
+ data.columns = [
215
+ "/".join(col) if isinstance(col, tuple) else col for col in data.columns
216
+ ]
217
 
218
  if transpose:
219
  data = data.T
220
 
221
+ if sort_columns:
222
+ # sort columns to get a deterministic order
223
+ data = data.sort_index(axis=1)
224
+
225
+ if stringify_column_names:
226
+ # Convert MultiIndex columns to string representation
227
+ data.columns = data.columns.map(stringify)
228
+
229
+ if column_regex_blacklist is not None:
230
+ # Remove columns that match any of the regex patterns in the blacklist
231
+ for pattern in column_regex_blacklist:
232
+ data = data.loc[:, ~data.columns.str.contains(pattern, regex=True)]
233
+
234
+ if column_regex_whitelist is not None:
235
+ # keep only columns that match any of the regex patterns in the whitelist
236
+ data = data.loc[
237
+ :, data.columns.str.contains("|".join(column_regex_whitelist), regex=True)
238
+ ]
239
+
240
+ if replace_in_col_names is not None:
241
+ for old_value, new_value in replace_in_col_names:
242
+ data.columns = data.columns.str.replace(old_value, new_value, regex=False)
243
+
244
+ else:
245
+ if column_regex_blacklist is not None:
246
+ logger.warning(
247
+ "Column regex blacklist is ignored when stringify_column_names is False."
248
+ )
249
+ if column_regex_whitelist is not None:
250
+ logger.warning(
251
+ "Column regex whitelist is ignored when stringify_column_names is False."
252
+ )
253
+ if replace_in_col_names is not None:
254
+ logger.warning(
255
+ "Replace in column names is ignored when stringify_column_names is False."
256
+ )
257
+
258
+ # remove empty rows
259
+ # get rows that contain only nan or "nan ± nan"
260
+ empty_rows = data.apply(
261
+ lambda row: all(
262
+ pd.isna(value) or (isinstance(value, str) and value == "nan ± nan") for value in row
263
+ ),
264
+ axis=1,
265
+ )
266
+ data = data[~empty_rows]
267
+
268
  if reset_index:
269
  data = data.reset_index()
270
 
 
296
  parser.add_argument(
297
  "--unpack-multirun-results", action="store_true", help="Unpack multirun results"
298
  )
299
+ parser.add_argument(
300
+ "--unpack-multirun-results-with-job-id",
301
+ action="store_true",
302
+ help="Unpack multirun results with job ID",
303
+ )
304
  parser.add_argument("--transpose", action="store_true", help="Transpose the table")
305
+ parser.add_argument(
306
+ "--sort-columns",
307
+ action="store_true",
308
+ help="Sort the columns of the combined job returns",
309
+ )
310
  parser.add_argument(
311
  "--round-precision",
312
  type=int,
 
318
  parser.add_argument(
319
  "--reset-index", action="store_true", help="Reset the index of the combined job returns"
320
  )
321
+ parser.add_argument(
322
+ "--stringify-column-names",
323
+ action="store_true",
324
+ help="Stringify the column names of the combined job returns (useful for multi-index columns)",
325
+ )
326
+ parser.add_argument(
327
+ "--column-regex-blacklist",
328
+ type=str,
329
+ nargs="+",
330
+ default=None,
331
+ help="List of regex patterns to match column names. "
332
+ "Columns that match any of the patterns will be removed.",
333
+ )
334
+ parser.add_argument(
335
+ "--column-regex-whitelist",
336
+ type=str,
337
+ nargs="+",
338
+ default=None,
339
+ help="List of regex patterns to match column names. "
340
+ "Only columns that match any of the patterns will be kept.",
341
+ )
342
+ parser.add_argument(
343
+ "--replace-in-col-names",
344
+ type=lambda s: s.split(":", 1),
345
+ nargs="+",
346
+ default=None,
347
+ help='List of strings in the format "<old_value>:<new_value>" to replace substrings in column names.',
348
+ )
349
  parser.add_argument(
350
  "--format",
351
  type=str,
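Usage note (not part of the diff, a hedged sketch): the post-processing flags added above can be combined in a single call. Only options visible in this diff are shown; the input arguments (paths file, file name, column selection) are omitted, and the regex pattern and replacement value are hypothetical placeholders:

python src/analysis/combine_job_returns.py <input arguments> \
    --unpack-multirun-results-with-job-id \
    --stringify-column-names \
    --sort-columns \
    --column-regex-whitelist "f1" \
    --replace-in-col-names "metric/:" \
    --round-precision 2 \
    --format markdown_mean_and_std

Note that the regex blacklist/whitelist and column-name replacement only take effect together with --stringify-column-names; otherwise the script logs a warning and ignores them.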
src/analysis/common.py CHANGED
@@ -1,5 +1,5 @@
 import json
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import pandas as pd
 
@@ -26,21 +26,42 @@ def read_nested_json(path: str) -> pd.DataFrame:
 
 
 def read_nested_jsons(
-    json_paths: Dict[str, str],
+    json_paths: List[Tuple[str, str]],
     default_key_values: Optional[Dict[str, str]] = None,
     column_level_names: Optional[List[str]] = None,
 ) -> pd.DataFrame:
-    identifier_strings = json_paths.keys()
-    dfs = [read_nested_json(json_paths[identifier_str]) for identifier_str in identifier_strings]
+    dfs = [read_nested_json(json_path) for identifier_str, json_path in json_paths]
     new_index_levels = pd.MultiIndex.from_frame(
         pd.DataFrame(
             [
                 parse_identifier(identifier_str, default_key_values or {})
-                for identifier_str in identifier_strings
+                for identifier_str, _ in json_paths
             ]
         )
     )
-    dfs_concat = pd.concat(dfs, keys=list(new_index_levels), names=new_index_levels.names, axis=0)
+    if len(set(list(new_index_levels))) == len(list(new_index_levels)):
+        dfs_concat = pd.concat(
+            dfs, keys=list(new_index_levels), names=new_index_levels.names, axis=0
+        )
+    else:
+        dfs_new = []
+        ids_unique = []
+        for identifier_str in new_index_levels:
+            if identifier_str not in ids_unique:
+                ids_unique.append(identifier_str)
+        # first combine the dataframes with same ids along the columns
+        for identifier_str in ids_unique:
+            dfs_with_id = [df for df, idx in zip(dfs, new_index_levels) if idx == identifier_str]
+            # assert that all columns are distinct
+            if len(set([tuple(col) for df in dfs_with_id for col in df.columns])) != sum(
+                [len(df.columns) for df in dfs_with_id]
+            ):
+                raise ValueError(
+                    "There are duplicate columns across the dataframes with the same identifier."
+                )
+            dfs_id_concat = pd.concat(dfs_with_id, axis=1)
+            dfs_new.append(dfs_id_concat)
+        dfs_concat = pd.concat(dfs_new, keys=ids_unique, names=new_index_levels.names, axis=0)
     dfs_concat.columns = pd.MultiIndex.from_tuples(
         [col.split("/") for col in dfs_concat.columns], names=column_level_names
     )
src/analysis/compare_job_returns.py CHANGED
@@ -173,7 +173,7 @@ def combine_job_returns_and_plot(
 
     if job_return_paths is not None:
         df_all = read_nested_jsons(
-            json_paths=job_return_paths,
+            json_paths=list(job_return_paths.items()),
             default_key_values=default_key_values,
             column_level_names=column_level_names,
         )
src/analysis/format_metric_results.py ADDED
@@ -0,0 +1,269 @@
1
+ #!/usr/bin/env python
2
+ import argparse
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ from pie_modules.utils import flatten_dict
9
+
10
+
11
+ def str2record(s: str | None, sep_parts: str = "-", sep_k_v: str = "=") -> pd.Series:
12
+ if s is None or s.strip() == "" or s == "None":
13
+ return pd.Series()
14
+ return pd.Series(dict(k_v.split(sep_k_v, 1) for k_v in s.split(sep_parts)))
15
+
16
+
17
+ def separate_path_and_id(path_and_maybe_id: str, separator: str = ":") -> tuple[str | None, str]:
18
+ parts = path_and_maybe_id.split(separator, 1)
19
+ if len(parts) == 1:
20
+ return None, parts[0]
21
+ return parts[0], parts[1]
22
+
23
+
24
+ def load_data_from_json(path: str | Path) -> pd.DataFrame:
25
+ with open(path, "r") as f:
26
+ data_json = json.load(f)
27
+ data_flat = flatten_dict(data_json)
28
+ return pd.DataFrame(data_flat)
29
+
30
+
31
+ def main(
32
+ path: str | Path,
33
+ remove_col_prefix: str | None = None,
34
+ sparse_col_prefix: str | None = None,
35
+ tail_cols: list[str] | None = None,
36
+ sort_cols: list[str] | None = None,
37
+ split_col: str | None = None,
38
+ replace_in_col_names: list[tuple[str, str]] | None = None,
39
+ round_precision: int | None = None,
40
+ in_percent: bool = False,
41
+ common_prefix_separator: str | None = None,
42
+ column_regex_blacklist: list[str] | None = None,
43
+ column_regex_whitelist: list[str] | None = None,
44
+ format: str = "markdown",
45
+ ) -> None:
46
+
47
+ if str(path).lower().endswith(".json"):
48
+ result = load_data_from_json(path)
49
+ elif str(path).lower().endswith(".txt"):
50
+ with open(path, "r") as f:
51
+ index_data = [separate_path_and_id(line.strip()) for line in f.readlines()]
52
+ data_list = []
53
+ for meta_id, meta_path in index_data:
54
+ data = load_data_from_json(os.path.join(meta_path, "job_return_value.json"))
55
+ if meta_id is not None:
56
+ job_id_prefix = meta_id.replace(",", "-")
57
+ data["job_id"] = job_id_prefix + "-" + data["job_id"].astype(str)
58
+ data = data.set_index("job_id")
59
+ data_list.append(data)
60
+ result = pd.concat(data_list, axis=1).reset_index()
61
+ else:
62
+ raise ValueError("Unsupported file format. Please provide a .json or .txt file.")
63
+
64
+ if remove_col_prefix is not None:
65
+ result.columns = result.columns.str.replace(r"^" + remove_col_prefix, "", regex=True)
66
+
67
+ if sparse_col_prefix is not None:
68
+ # get all columns that contain just one not-nan value
69
+ # number_of_non_nan_values = len(df) - df.isna().sum()
70
+ # df_sparse = df.loc[:, number_of_non_nan_values == 1]
71
+ sparse_cols = [col for col in result.columns if col.startswith(sparse_col_prefix)]
72
+ other_cols = [col for col in result.columns if col not in sparse_cols]
73
+
74
+ value_col = f"{sparse_col_prefix}value"
75
+ name_col = f"{sparse_col_prefix}name"
76
+ result = result.melt(
77
+ id_vars=other_cols, value_vars=sparse_cols, var_name=name_col, value_name=value_col
78
+ ).dropna(
79
+ subset=[value_col]
80
+ ) # keep rows with a value
81
+ # strip the "f1-" prefix, leaving just the numeric threshold
82
+ result[name_col] = result[name_col].str.replace(r"^" + sparse_col_prefix, "", regex=True)
83
+ # convert the column to numeric (if possible)
84
+ try:
85
+ result[name_col] = pd.to_numeric(result[name_col])
86
+ except ValueError:
87
+ # if it fails, just keep it as a string
88
+ pass
89
+
90
+ if split_col is not None:
91
+ new_frame = result[split_col].apply(str2record)
92
+ result = pd.concat([result.drop(columns=[split_col]), new_frame], axis=1)
93
+
94
+ if in_percent:
95
+ float_columns = result.select_dtypes(include=["float64", "float32"]).columns
96
+ result[float_columns] = result[float_columns] * 100
97
+
98
+ if round_precision is not None:
99
+ # round all columns to the given precision
100
+ result = result.round(round_precision)
101
+
102
+ if common_prefix_separator is not None:
103
+ # remove common prefix from values in all string columns
104
+ obj_columns = result.select_dtypes(include=["object"]).columns
105
+ for obj_col in obj_columns:
106
+ # get the common prefix
107
+ common_prefix = os.path.commonprefix(result[obj_col].dropna().astype(str).tolist())
108
+ # find last occurrence of the common_prefix_separator
109
+ last_occurrence = common_prefix.rfind(common_prefix_separator)
110
+ if last_occurrence != -1:
111
+ # truncate the common prefix after the last occurrence of the separator
112
+ common_prefix = common_prefix[: last_occurrence + len(common_prefix_separator)]
113
+ # remove the common prefix (including the separator) from the column
114
+ result[obj_col] = result[obj_col].str.replace(r"^" + common_prefix, "", regex=True)
115
+
116
+ # sort columns to get a deterministic order
117
+ result = result.sort_index(axis=1)
118
+
119
+ if tail_cols is not None:
120
+ front_cols = [c for c in result.columns if c not in tail_cols]
121
+ result = result[front_cols + tail_cols]
122
+
123
+ if sort_cols is not None:
124
+ result = result.sort_values(sort_cols)
125
+ # also move the sort columns to the front
126
+ result = result[sort_cols + [c for c in result.columns if c not in sort_cols]]
127
+
128
+ if column_regex_blacklist is not None:
129
+ # remove columns that match any of the regex patterns in the blacklist
130
+ for pattern in column_regex_blacklist:
131
+ result = result.loc[:, ~result.columns.str.contains(pattern, regex=True)]
132
+
133
+ if column_regex_whitelist is not None:
134
+ # keep only columns that match any of the regex patterns in the whitelist
135
+ result = result.loc[
136
+ :, result.columns.str.contains("|".join(column_regex_whitelist), regex=True)
137
+ ]
138
+
139
+ if replace_in_col_names is not None:
140
+ for old_value, new_value in replace_in_col_names:
141
+ result.columns = result.columns.str.replace(old_value, new_value, regex=False)
142
+
143
+ if format == "markdown":
144
+ result_str = result.to_markdown(index=False)
145
+ elif format == "csv":
146
+ result_str = result.to_csv(index=False)
147
+ elif format == "tsv":
148
+ result_str = result.to_csv(index=False, sep="\t")
149
+ elif format == "json":
150
+ result_str = result.to_json(orient="records", lines=True)
151
+ else:
152
+ raise ValueError(
153
+ f"Unsupported format: {format}. Supported formats are: markdown, csv, json."
154
+ )
155
+
156
+ print(result_str)
157
+
158
+
159
+ if __name__ == "__main__":
160
+ """
161
+ Example usage:
162
+
163
+ python src/analysis/format_metric_results.py \
164
+ logs/document_evaluation/multiruns/default/2025-05-21_11-59-19/job_return_value.json \
165
+ --remove-col-prefix train/ \
166
+ --sparse-col-prefix f1- \
167
+ --split-col job_id \
168
+ --tail-cols num_positives num_total \
169
+ --sort-cols experiment model \
170
+ --round-precision 4
171
+ """
172
+
173
+ parser = argparse.ArgumentParser(
174
+ description="Process a JSON file containing metric results (from multirun) and print as Markdown table."
175
+ )
176
+ parser.add_argument(
177
+ "path",
178
+ type=str,
179
+ help="Path to the JSON file to process. The JSON file is expected to contain "
180
+ "a (maybe nested) dictionary where each leave entry is a list of values with "
181
+ "the same length.",
182
+ )
183
+ parser.add_argument(
184
+ "--remove-col-prefix",
185
+ type=str,
186
+ default=None,
187
+ help="Prefix to remove from column names.",
188
+ )
189
+ parser.add_argument(
190
+ "--sparse-col-prefix",
191
+ type=str,
192
+ default=None,
193
+ help="Prefix of sparse columns. All sparse columns will be melted into "
194
+ "two columns: <prefix>name and <prefix>value. The name column will "
195
+ "be converted to numeric if possible.",
196
+ )
197
+
198
+ parser.add_argument(
199
+ "--split-col",
200
+ type=str,
201
+ default=None,
202
+ help="Column to split into multiple columns. The format of the "
203
+ "column entries is expected to be: <key_1>=<value_a>-<key_2>=<value_b>-...",
204
+ )
205
+ parser.add_argument(
206
+ "--tail-cols",
207
+ type=str,
208
+ nargs="+",
209
+ default=None,
210
+ help="Columns to move to the end.",
211
+ )
212
+ parser.add_argument(
213
+ "--sort-cols",
214
+ type=str,
215
+ nargs="+",
216
+ default=None,
217
+ help="Columns to sort by (they will be moved to the front).",
218
+ )
219
+ parser.add_argument(
220
+ "--replace-in-col-names",
221
+ type=lambda s: s.split(":", 1),
222
+ nargs="+",
223
+ default=None,
224
+ help='List of strings in the format "<old_value>:<new_value>" to replace substrings in column names.',
225
+ )
226
+ parser.add_argument(
227
+ "--round-precision",
228
+ type=int,
229
+ default=None,
230
+ help="Number of decimal places to round to.",
231
+ )
232
+ parser.add_argument(
233
+ "--in-percent",
234
+ action="store_true",
235
+ default=False,
236
+ help="If set, all float columns will be multiplied by 100 to convert them to percentages.",
237
+ )
238
+ parser.add_argument(
239
+ "--common-prefix-separator",
240
+ type=str,
241
+ default=None,
242
+ help="For all string columns, remove the common prefix up to the last occurrence of this separator.",
243
+ )
244
+ parser.add_argument(
245
+ "--column-regex-blacklist",
246
+ type=str,
247
+ nargs="+",
248
+ default=None,
249
+ help="List of regex patterns to match column names. "
250
+ "Columns that match any of the patterns will be removed.",
251
+ )
252
+ parser.add_argument(
253
+ "--column-regex-whitelist",
254
+ type=str,
255
+ nargs="+",
256
+ default=None,
257
+ help="List of regex patterns to match column names. "
258
+ "Only columns that match any of the patterns will be kept.",
259
+ )
260
+ parser.add_argument(
261
+ "--format",
262
+ type=str,
263
+ default="markdown",
264
+ choices=["markdown", "csv", "tsv", "json"],
265
+ help="Format to print the result in. Supported formats are: markdown, csv, json.",
266
+ )
267
+
268
+ kwargs = vars(parser.parse_args())
269
+ main(**kwargs)
src/analysis/get_json_field_as_string.py ADDED
@@ -0,0 +1,55 @@
+import json
+
+
+def main(
+    paths: list[str],
+    field: list[str],
+    format: str = "plain",
+) -> None:
+    result = []
+    for path in paths:
+        with open(path, "r") as f:
+            data = json.load(f)
+        value = data
+        for key in field:
+            value = value.get(key)
+        if not isinstance(value, list):
+            value = [value]
+        result.extend(value)
+    if format == "plain":
+        print(",".join(map(str, result)))
+    elif format == "python":
+        result_str = str(result)
+        print(result_str.replace(" ", ""))
+    else:
+        raise ValueError(f"Unknown format: {format}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Get a field from one or more JSON files and print to stdout."
+    )
+    parser.add_argument(
+        "paths",
+        type=lambda x: x.split(","),
+        help="Comma-separated list of paths to the JSON files to process.",
+    )
+    parser.add_argument(
+        "--field",
+        type=str,
+        required=True,
+        nargs="+",
+        help="Field to extract from the JSON files. Can be a nested field by providing multiple entries.",
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        default="plain",
+        choices=["plain", "python"],
+    )
+
+    args = parser.parse_args()
+    kwargs = vars(args)
+    main(**kwargs)
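For reference, a minimal invocation sketch of this new helper (not part of the diff); the file paths and field names are hypothetical placeholders. The positional argument is a comma-separated list of JSON files, and --field takes one entry per nesting level:

python src/analysis/get_json_field_as_string.py \
    logs/eval_a/job_return_value.json,logs/eval_b/job_return_value.json \
    --field test f1 \
    --format plain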
src/analysis/show_inference_params_on_quality_and_throughput.py ADDED
@@ -0,0 +1,485 @@
1
+ import argparse
2
+ import json
3
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import plotly.express as px
8
+
9
+
10
+ def get_col_name(col: str) -> str:
11
+ parts = [part[1:-1] for part in col[1:-1].split(", ") if part[1:-1] != ""]
12
+ return parts[-1]
13
+
14
+
15
+ def get_idx_entry(s: str, keep_only_last_part: bool = False) -> Tuple[str, str]:
16
+ k, v = s.split("=", 1)
17
+ if keep_only_last_part:
18
+ k = k.split(".")[-1]
19
+ return k, v
20
+
21
+
22
+ def get_idx_dict(job_id: str, keep_only_last_part: bool = False) -> Dict[str, str]:
23
+ return dict(
24
+ get_idx_entry(part, keep_only_last_part=keep_only_last_part) for part in job_id.split("-")
25
+ )
26
+
27
+
28
+ def unflatten_index(
29
+ index: Iterable[str],
30
+ keep_only_last_part: bool = False,
31
+ dtypes: Optional[Dict[str, Any]] = None,
32
+ ) -> pd.MultiIndex:
33
+ as_df = pd.DataFrame.from_records(
34
+ [get_idx_dict(idx, keep_only_last_part=keep_only_last_part) for idx in index]
35
+ )
36
+ if dtypes is not None:
37
+ dtypes_valid = {col: dtype for col, dtype in dtypes.items() if col in as_df.columns}
38
+ as_df = as_df.astype(dtypes_valid)
39
+ return pd.MultiIndex.from_frame(as_df.convert_dtypes())
40
+
41
+
42
+ def col_to_str(col_entries: Iterable[str], names: Iterable[Optional[str]], sep: str) -> str:
43
+ return sep.join(
44
+ [
45
+ f"{name}={col_entry}" if name is not None else col_entry
46
+ for col_entry, name in zip(col_entries, names)
47
+ ]
48
+ )
49
+
50
+
51
+ def flatten_index(index: pd.MultiIndex, names: Optional[List[Optional[str]]] = None) -> pd.Index:
52
+ names = names or index.names
53
+ if names is None:
54
+ raise ValueError("names must be provided if index has no names")
55
+ return pd.Index([col_to_str(col, names=names, sep=",") for col in index])
56
+
57
+
58
+ def prepare_quality_and_throughput_dfs(
59
+ metric_data_path: str,
60
+ job_return_value_path: str,
61
+ char_total: int,
62
+ index_dtypes: Optional[Dict[str, Any]] = None,
63
+ job_id_prefix: Optional[str] = None,
64
+ ) -> Tuple[pd.DataFrame, pd.Series]:
65
+
66
+ with open(metric_data_path) as f:
67
+ data = json.load(f)
68
+
69
+ # save result from above command in "data" (use only last ouf the output line!)
70
+ df = pd.DataFrame.from_dict(data)
71
+ df.columns = [get_col_name(col) for col in df.columns]
72
+ f1_series = df.set_index([col for col in df.columns if col != "f1"])["f1"]
73
+ f1_df = f1_series.apply(lambda x: pd.Series(x)).T
74
+
75
+ with open(job_return_value_path) as f:
76
+ job_return_value = json.load(f)
77
+
78
+ job_ids = job_return_value["job_id"]
79
+ if job_id_prefix is not None:
80
+ job_ids = [
81
+ f"{job_id_prefix},{job_id}" if job_id.strip() != "" else job_id_prefix
82
+ for job_id in job_ids
83
+ ]
84
+ index = unflatten_index(
85
+ job_ids,
86
+ keep_only_last_part=True,
87
+ dtypes=index_dtypes,
88
+ )
89
+ prediction_time_series = pd.Series(
90
+ job_return_value["prediction_time"], index=index, name="prediction_time"
91
+ )
92
+ f1_df.index = prediction_time_series.index
93
+
94
+ k_chars_per_s = char_total / (prediction_time_series * 1000)
95
+ k_chars_per_s.name = "1k_chars_per_s"
96
+
97
+ return f1_df, k_chars_per_s
98
+
99
+
100
+ def get_pareto_front_mask(df: pd.DataFrame, x_col: str, y_col: str) -> pd.Series:
101
+ """
102
+ Return a boolean mask indicating which rows belong to the Pareto front.
103
+ In this version, we assume you want to maximize both x_col and y_col.
104
+
105
+ A point A is said to dominate point B if:
106
+ A[x_col] >= B[x_col] AND
107
+ A[y_col] >= B[y_col] AND
108
+ at least one is strictly greater.
109
+ Then B is not on the Pareto front.
110
+
111
+ Parameters
112
+ ----------
113
+ df : pd.DataFrame
114
+ DataFrame containing the data points.
115
+ x_col : str
116
+ Name of the column to treat as the first objective (maximize).
117
+ y_col : str
118
+ Name of the column to treat as the second objective (maximize).
119
+
120
+ Returns
121
+ -------
122
+ pd.Series
123
+ A boolean Series (aligned with df.index) where True means
124
+ the row is on the Pareto front.
125
+ """
126
+ # Extract the relevant columns as a NumPy array for speed.
127
+ data = df[[x_col, y_col]].values
128
+ n = len(data)
129
+ is_dominated = np.zeros(n, dtype=bool)
130
+
131
+ for i in range(n):
132
+ # If it's already marked dominated, skip checks
133
+ if is_dominated[i]:
134
+ continue
135
+
136
+ for j in range(n):
137
+ if i == j:
138
+ continue
139
+ # Check if j dominates i
140
+ if (
141
+ data[j, 0] >= data[i, 0]
142
+ and data[j, 1] >= data[i, 1]
143
+ and (data[j, 0] > data[i, 0] or data[j, 1] > data[i, 1])
144
+ ):
145
+ is_dominated[i] = True
146
+ break
147
+
148
+ # Return True for points not dominated by any other
149
+ return pd.Series(~is_dominated, index=df.index)
150
+
151
+
152
+ def main(
153
+ job_return_value_path_test: List[str],
154
+ job_return_value_path_val: List[str],
155
+ metric_data_path_test: List[str],
156
+ metric_data_path_val: List[str],
157
+ char_total_test: int,
158
+ char_total_val: int,
159
+ job_id_prefixes: Optional[List[str]] = None,
160
+ metric_filters: Optional[List[str]] = None,
161
+ index_filters: Optional[List[str]] = None,
162
+ index_blacklist: Optional[List[str]] = None,
163
+ label_mapping: Optional[Dict[str, str]] = None,
164
+ plot_method: str = "line", # can be "scatter" or "line"
165
+ pareto_front: bool = False,
166
+ show_as: str = "figure",
167
+ columns: Optional[List[str]] = None,
168
+ color_column: Optional[str] = None,
169
+ ):
170
+ label_mapping = label_mapping or {}
171
+ if job_id_prefixes is not None:
172
+ if len(job_id_prefixes) != len(job_return_value_path_test):
173
+ raise ValueError(
174
+ f"job_id_prefixes ({len(job_id_prefixes)}) and "
175
+ f"job_return_value_path_test ({len(job_return_value_path_test)}) "
176
+ f"must have the same length"
177
+ )
178
+ # replace empty strings with None
179
+ job_id_prefixes_with_none = [
180
+ job_id_prefix if job_id_prefix != "" else None for job_id_prefix in job_id_prefixes
181
+ ]
182
+ else:
183
+ job_id_prefixes_with_none = [None] * len(job_return_value_path_test)
184
+
185
+ # combine input data for test and val
186
+ char_total = {"test": char_total_test, "val": char_total_val}
187
+ metric_data_path = {"test": metric_data_path_test, "val": metric_data_path_val}
188
+ job_return_value_path = {"test": job_return_value_path_test, "val": job_return_value_path_val}
189
+ # prepare dataframes
190
+ common_kwargs = dict(
191
+ index_dtypes={
192
+ "max_argument_distance": int,
193
+ "max_length": int,
194
+ "num_beams": int,
195
+ }
196
+ )
197
+ f1_df_list: Dict[str, List[pd.DataFrame]] = {"test": [], "val": []}
198
+ k_chars_per_s_list: Dict[str, List[pd.Series]] = {"test": [], "val": []}
199
+ for split in metric_data_path:
200
+ if len(metric_data_path[split]) != len(job_return_value_path[split]):
201
+ raise ValueError(
202
+ f"metric_data_path[{split}] ({len(metric_data_path[split])}) and "
203
+ f"job_return_value_path[{split}] ({len(job_return_value_path[split])}) "
204
+ f"must have the same length"
205
+ )
206
+ for current_metric_data_path, current_job_return_value_path, job_id_prefix in zip(
207
+ metric_data_path[split], job_return_value_path[split], job_id_prefixes_with_none
208
+ ):
209
+ current_f1_df, current_k_chars_per_s = prepare_quality_and_throughput_dfs(
210
+ current_metric_data_path,
211
+ current_job_return_value_path,
212
+ char_total=char_total[split],
213
+ job_id_prefix=job_id_prefix,
214
+ **common_kwargs,
215
+ )
216
+ f1_df_list[split].append(current_f1_df)
217
+ k_chars_per_s_list[split].append(current_k_chars_per_s)
218
+ f1_df_dict = {split: pd.concat(f1_df_list[split], axis=0) for split in f1_df_list}
219
+ k_chars_per_s_dict = {
220
+ split: pd.concat(k_chars_per_s_list[split], axis=0) for split in k_chars_per_s_list
221
+ }
222
+
223
+ # combine dataframes for test and val
224
+ f1_df = pd.concat(f1_df_dict, names=["split"] + f1_df_dict["test"].index.names)
225
+ f1_df.columns = [col_to_str(col, names=f1_df.columns.names, sep=",") for col in f1_df.columns]
226
+ k_chars_per_s = pd.concat(
227
+ k_chars_per_s_dict,
228
+ names=["split"] + k_chars_per_s_dict["test"].index.names,
229
+ )
230
+
231
+ # combine quality and throughput data
232
+ df_plot = pd.concat([f1_df, k_chars_per_s], axis=1)
233
+ df_plot = (
234
+ df_plot.reset_index()
235
+ .set_index(list(f1_df.index.names) + [k_chars_per_s.name])
236
+ .unstack("split")
237
+ )
238
+ df_plot.columns = flatten_index(df_plot.columns, names=[None, "split"])
239
+
240
+ # remove all columns that are not needed
241
+ if metric_filters is not None:
242
+ for fil in metric_filters:
243
+ df_plot.drop(columns=[col for col in df_plot.columns if fil not in col], inplace=True)
244
+ df_plot.columns = [col.replace(fil, "") for col in df_plot.columns]
245
+
246
+ # flatten the columns
247
+ df_plot.columns = [
248
+ ",".join([part for part in col.split(",") if part != ""]) for col in df_plot.columns
249
+ ]
250
+
251
+ v: Any
252
+ if index_filters is not None:
253
+ for k_v in index_filters:
254
+ k, v = k_v.split("=")
255
+ if k in common_kwargs["index_dtypes"]:
256
+ v = common_kwargs["index_dtypes"][k](v)
257
+ df_plot = df_plot.xs(v, level=k, axis=0)
258
+
259
+ if index_blacklist is not None:
260
+ for k_v in index_blacklist:
261
+ k, v = k_v.split("=")
262
+ if k in common_kwargs["index_dtypes"]:
263
+ v = common_kwargs["index_dtypes"][k](v)
264
+ df_plot = df_plot.drop(v, level=k, axis=0)
265
+
266
+ if columns is not None:
267
+ df_plot = df_plot[columns]
268
+
269
+ x = "1k_chars_per_s"
270
+ y = df_plot.columns
271
+
272
+ if pareto_front:
273
+ for col in y:
274
+ current_data = df_plot[col].dropna().reset_index(x).copy()
275
+ pareto_front_mask = get_pareto_front_mask(current_data, x_col=x, y_col=col)
276
+ current_data.loc[~pareto_front_mask, col] = np.nan
277
+ current_data_reset = current_data.reset_index().set_index(df_plot.index.names)
278
+ df_plot[col] = current_data_reset[col]
279
+
280
+ # remove nan rows
281
+ df_plot = df_plot.dropna(how="all")
282
+
283
+ # plot
284
+ # Create a custom color sequence (concatenating multiple palettes if needed)
285
+ custom_colors = px.colors.qualitative.Dark24 + px.colors.qualitative.Light24
286
+
287
+ text_cols = list(df_plot.index.names)
288
+ text_cols.remove(x)
289
+ df_plot_reset = df_plot.reset_index()
290
+ if len(text_cols) > 1:
291
+ df_plot_reset[",".join(text_cols)] = (
292
+ df_plot_reset[text_cols].astype(str).agg(", ".join, axis=1)
293
+ )
294
+ text_col = ",".join(text_cols)
295
+
296
+ if show_as == "figure":
297
+ _plot_method = getattr(px, plot_method)
298
+ df_plot_sorted = df_plot_reset.sort_values(by=x)
299
+ fig = _plot_method(
300
+ df_plot_sorted,
301
+ x=x,
302
+ y=y,
303
+ text=text_col if plot_method != "scatter" else None,
304
+ color=color_column,
305
+ color_discrete_sequence=custom_colors,
306
+ hover_data=text_cols,
307
+ )
308
+
309
+ # set connectgaps to True to connect the lines
310
+ fig.update_traces(connectgaps=True)
311
+
312
+ legend_title = "Evaluation Setup"
313
+ if metric_filters:
314
+ whitelist_filters_mapped = [label_mapping.get(fil, fil) for fil in metric_filters]
315
+ legend_title += f" ({', '.join(whitelist_filters_mapped)})"
316
+
317
+ text_cols_mapped = [label_mapping.get(col, col) for col in text_cols]
318
+ title = f"Impact of {', '.join(text_cols_mapped)} on Prediction Quality and Throughput"
319
+ if index_filters:
320
+ index_filters_mapped = [label_mapping.get(fil, fil) for fil in index_filters]
321
+ title += f" ({', '.join(index_filters_mapped)})"
322
+ if pareto_front:
323
+ title += " (Pareto Front)"
324
+
325
+ fig.update_layout(
326
+ xaxis_title="Throughput (1k chars/s)",
327
+ yaxis_title="Quality (F1)",
328
+ title=title,
329
+ # center the title
330
+ title_x=0.2,
331
+ # black title
332
+ title_font=dict(color="black"),
333
+ # change legend title
334
+ legend_title=legend_title,
335
+ font_family="Computer Modern",
336
+ # white background
337
+ plot_bgcolor="white",
338
+ paper_bgcolor="white",
339
+ )
340
+ update_axes_kwargs = dict(
341
+ tickfont=dict(color="black"),
342
+ title_font=dict(color="black"),
343
+ ticks="inside", # ensure tick markers are drawn
344
+ tickcolor="black",
345
+ tickwidth=1,
346
+ ticklen=10,
347
+ linecolor="black",
348
+ # show grid
349
+ gridcolor="lightgray",
350
+ )
351
+ fig.update_yaxes(**update_axes_kwargs)
352
+ fig.update_xaxes(**update_axes_kwargs)
353
+
354
+ fig.show()
355
+ elif show_as == "markdown":
356
+ # Print the DataFrame as a Markdown table
357
+ print(df_plot_reset.to_markdown(index=False, floatfmt=".4f"))
358
+ elif show_as == "json":
359
+ # Print the DataFrame as a JSON object
360
+ print(df_plot_reset.to_json(orient="columns", indent=4))
361
+ else:
362
+ raise ValueError(f"Unknown show_as value: {show_as}. Use 'figure', 'markdown' or 'json'.")
363
+
364
+
365
+ if __name__ == "__main__":
366
+
367
+ """
368
+ # Example usage 1 (pipeline model, data from data source: https://github.com/ArneBinder/pie-document-level/issues/388#issuecomment-2752829257):
369
+ python src/analysis/show_inference_params_on_quality_and_throughput.py \
370
+ --job-return-value-path-test logs/prediction/multiruns/default/2025-03-26_01-31-05/job_return_value.json \
371
+ --job-return-value-path-val logs/prediction/multiruns/default/2025-03-26_16-49-36/job_return_value.json \
372
+ --metric-data-path-test data/evaluation/argumentation_structure/inference_pipeline_test.json \
373
+ --metric-data-path-val data/evaluation/argumentation_structure/inference_pipeline_validation.json \
374
+ --metric-filters task=are discont_comp=true split=val
375
+
376
+ # Example usage 2 (joint model, data from: https://github.com/ArneBinder/pie-document-level/issues/390#issuecomment-2759888004)
377
+ python src/analysis/show_inference_params_on_quality_and_throughput.py \
378
+ --job-return-value-path-test logs/prediction/multiruns/default/2025-03-28_01-34-07/job_return_value.json \
379
+ --job-return-value-path-val logs/prediction/multiruns/default/2025-03-28_02-57-00/job_return_value.json \
380
+ --metric-data-path-test data/evaluation/argumentation_structure/inference_joint_test.json \
381
+ --metric-data-path-val data/evaluation/argumentation_structure/inference_joint_validation.json \
382
+ --metric-filters task=are discont_comp=true split=val \
383
+ --plot-method scatter
384
+ """
385
+
386
+ parser = argparse.ArgumentParser()
387
+ parser.add_argument(
388
+ "--job-return-value-path-test",
389
+ type=str,
390
+ nargs="+",
391
+ required=True,
392
+ )
393
+ parser.add_argument(
394
+ "--job-return-value-path-val",
395
+ type=str,
396
+ nargs="+",
397
+ required=True,
398
+ )
399
+ parser.add_argument(
400
+ "--metric-data-path-test",
401
+ type=str,
402
+ nargs="+",
403
+ required=True,
404
+ )
405
+ parser.add_argument(
406
+ "--metric-data-path-val",
407
+ type=str,
408
+ nargs="+",
409
+ required=True,
410
+ )
411
+ parser.add_argument(
412
+ "--job-id-prefixes",
413
+ type=str,
414
+ nargs="*",
415
+ default=None,
416
+ )
417
+ parser.add_argument(
418
+ "--plot-method",
419
+ type=str,
420
+ default="line",
421
+ choices=["scatter", "line"],
422
+ help="Plot method to use (default: line)",
423
+ )
424
+ parser.add_argument(
425
+ "--color-column",
426
+ type=str,
427
+ default=None,
428
+ help="Column to use for colour coding (default: None)",
429
+ )
430
+ parser.add_argument(
431
+ "--metric-filters",
432
+ type=str,
433
+ nargs="*",
434
+ default=None,
435
+ help="Filters to apply to the metric data in the format 'key=value'",
436
+ )
437
+ parser.add_argument(
438
+ "--index-filters",
439
+ type=str,
440
+ nargs="*",
441
+ default=None,
442
+ help="Filters to apply to the index data in the format 'key=value'",
443
+ )
444
+ parser.add_argument(
445
+ "--index-blacklist",
446
+ type=str,
447
+ nargs="*",
448
+ default=None,
449
+ help="Blacklist to apply to the index data in the format 'key=value'",
450
+ )
451
+ parser.add_argument(
452
+ "--columns",
453
+ type=str,
454
+ nargs="*",
455
+ default=None,
456
+ help="Columns to plot (default: all)",
457
+ )
458
+ parser.add_argument(
459
+ "--pareto-front",
460
+ action="store_true",
461
+ help="Whether to show only the pareto front",
462
+ )
463
+ parser.add_argument(
464
+ "--show-as",
465
+ type=str,
466
+ default="figure",
467
+ choices=["figure", "markdown", "json"],
468
+ help="How to show the results (default: figure)",
469
+ )
470
+
471
+ kwargs = vars(parser.parse_args())
472
+
473
+ main(
474
+ char_total_test=383154,
475
+ char_total_val=182794,
476
+ label_mapping={
477
+ "max_argument_distance": "Max. Argument Distance",
478
+ "max_length": "Max. Length",
479
+ "num_beams": "Num. Beams",
480
+ "task=are": "ARE",
481
+ "discont_comp=true": "Discont. Comp.",
482
+ "split=val": "Validation Split",
483
+ },
484
+ **kwargs,
485
+ )
src/datamodules/__init__.py CHANGED
@@ -1 +1 @@
-from .datamodule import PieDataModule
+from .datamodule_with_sampler import PieDataModuleWithSampler
src/datamodules/datamodule_with_sampler.py ADDED
@@ -0,0 +1,59 @@
+import logging
+from typing import Optional, Union
+
+from pytorch_ie import PieDataModule
+from pytorch_ie.core.taskmodule import IterableTaskEncodingDataset, TaskEncodingDataset
+from torch.utils.data import DataLoader, Sampler
+
+from .components.sampler import ImbalancedDatasetSampler
+
+logger = logging.getLogger(__name__)
+
+
+class PieDataModuleWithSampler(PieDataModule):
+
+    def __init__(
+        self,
+        train_sampler: Optional[str] = None,
+        dont_shuffle_train: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.train_sampler_name = train_sampler
+        self.dont_shuffle_train = dont_shuffle_train
+
+    def get_train_sampler(
+        self,
+        dataset: Union[TaskEncodingDataset, IterableTaskEncodingDataset],
+    ) -> Optional[Sampler]:
+        if self.train_sampler_name is None:
+            return None
+        elif self.train_sampler_name == "imbalanced_dataset":
+            # for now, this works only with targets that have a single entry
+            return ImbalancedDatasetSampler(
+                dataset, callback_get_label=lambda ds: [x.targets[0] for x in ds]
+            )
+        else:
+            raise ValueError(f"unknown sampler name: {self.train_sampler_name}")
+
+    def train_dataloader(self) -> DataLoader:
+        ds = self.data_split(self.train_split)
+        sampler = self.get_train_sampler(dataset=ds)
+        # don't shuffle if we explicitly set dont_shuffle_train,
+        # if the dataset is streamed, or if we use a sampler
+        shuffle = not (
+            self.dont_shuffle_train
+            or isinstance(ds, IterableTaskEncodingDataset)
+            or sampler is not None
+        )
+
+        if not shuffle:
+            logger.warning("not shuffling train dataloader")
+        return DataLoader(
+            dataset=ds,
+            sampler=sampler,
+            collate_fn=self.taskmodule.collate,
+            shuffle=shuffle,
+            **self.dataloader_kwargs,
+        )
src/dataset/processing.py CHANGED
@@ -1,9 +1,16 @@
-from typing import Callable, Type, Union
 
 from pie_datasets import Dataset, DatasetDict
 from pytorch_ie import Document
 from pytorch_ie.utils.hydra import resolve_optional_document_type, resolve_target
 
 
 # TODO: simply use use DatasetDict.map() with set_batch_size_to_split_size=True and
 # batched=True instead when https://github.com/ArneBinder/pie-datasets/pull/155 is merged
@@ -11,7 +18,7 @@ def apply_func_to_splits(
     dataset: DatasetDict,
     function: Union[str, Callable],
     result_document_type: Type[Document],
-    **kwargs
 ):
     resolved_func = resolve_target(function)
     resolved_document_type = resolve_optional_document_type(document_type=result_document_type)
@@ -23,7 +30,85 @@ def apply_func_to_splits(
            batched=True,
            batch_size=len(split),
            result_document_type=resolved_document_type,
-           **kwargs
        )
        result_dict[split_name] = converted_dataset
    return DatasetDict(result_dict)
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import Callable, Dict, List, Optional, Type, TypeVar, Union
4
 
5
  from pie_datasets import Dataset, DatasetDict
6
+ from pie_modules.documents import TextPairDocumentWithLabeledSpansAndBinaryCorefRelations
7
  from pytorch_ie import Document
8
+ from pytorch_ie.annotations import BinaryRelation, Span
9
+ from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
10
  from pytorch_ie.utils.hydra import resolve_optional_document_type, resolve_target
11
 
12
+ logger = logging.getLogger(__name__)
13
+
14
 
15
  # TODO: simply use use DatasetDict.map() with set_batch_size_to_split_size=True and
16
  # batched=True instead when https://github.com/ArneBinder/pie-datasets/pull/155 is merged
 
18
  dataset: DatasetDict,
19
  function: Union[str, Callable],
20
  result_document_type: Type[Document],
21
+ **kwargs,
22
  ):
23
  resolved_func = resolve_target(function)
24
  resolved_document_type = resolve_optional_document_type(document_type=result_document_type)
 
30
  batched=True,
31
  batch_size=len(split),
32
  result_document_type=resolved_document_type,
33
+ **kwargs,
34
  )
35
  result_dict[split_name] = converted_dataset
36
  return DatasetDict(result_dict)
37
+
38
+
39
+ S = TypeVar("S", bound=Span)
40
+
41
+
42
+ def shift_span(span: S, offset: int) -> S:
43
+ """Shift the start and end of a span by a given offset."""
44
+ return span.copy(start=span.start + offset, end=span.end + offset)
45
+
46
+
47
+ D = TypeVar("D", bound=TextDocumentWithLabeledSpansAndBinaryRelations)
48
+
49
+
50
+ def add_predicted_semantically_same_relations_to_document(
51
+ document: D,
52
+ doc_id2docs_with_predictions: Dict[
53
+ str, TextPairDocumentWithLabeledSpansAndBinaryCorefRelations
54
+ ],
55
+ relation_label: str,
56
+ argument_label_blacklist: Optional[List[str]] = None,
57
+ verbose: bool = False,
58
+ ) -> D:
59
+
60
+ # create lookup for detached versions of the spans (attached span != detached span even if they are the same)
61
+ span2span = {span.copy(): span for span in document.labeled_spans}
62
+ for text_pair_doc_with_preds in doc_id2docs_with_predictions.get(document.id, []):
63
+ offset = text_pair_doc_with_preds.metadata["original_doc_span"]["start"]
64
+ offset_pair = text_pair_doc_with_preds.metadata["original_doc_span_pair"]["start"]
65
+ for coref_rel in text_pair_doc_with_preds.binary_coref_relations.predictions:
66
+ head = shift_span(coref_rel.head, offset=offset)
67
+ if head not in span2span:
68
+ if verbose:
69
+ logger.warning(f"doc_id={document.id}: Head span {head} not found.")
70
+ continue
71
+ tail = shift_span(coref_rel.tail, offset=offset_pair)
72
+ if tail not in span2span:
73
+ if verbose:
74
+ logger.warning(f"doc_id={document.id}: Tail span {tail} not found.")
75
+ continue
76
+ if argument_label_blacklist is not None and (
77
+ span2span[head].label in argument_label_blacklist
78
+ or span2span[tail].label in argument_label_blacklist
79
+ ):
80
+ continue
81
+ new_rel = BinaryRelation(
82
+ head=span2span[head],
83
+ tail=span2span[tail],
84
+ label=relation_label,
85
+ score=coref_rel.score,
86
+ )
87
+ document.binary_relations.predictions.append(new_rel)
88
+ return document
89
+
90
+
91
+ def integrate_coref_predictions_from_text_pair_documents(
92
+ dataset: DatasetDict, data_dir: str, **kwargs
93
+ ) -> DatasetDict:
94
+
95
+ dataset_with_predictions = DatasetDict.from_json(data_dir=data_dir)
96
+
97
+ for split_name in dataset.keys():
98
+ ds_with_predictions = dataset_with_predictions[split_name]
99
+ original_doc_id2docs = defaultdict(list)
100
+ for doc in ds_with_predictions:
101
+ original_doc_id = doc.metadata["original_doc_id"]
102
+ if original_doc_id != doc.metadata["original_doc_id_pair"]:
103
+ raise ValueError(
104
+ f"Original document IDs do not match: "
105
+ f"{original_doc_id} != {doc.metadata['original_doc_id_pair']}. "
106
+ f"Cross-document coref is not supported."
107
+ )
108
+ original_doc_id2docs[original_doc_id].append(doc)
109
+
110
+ dataset[split_name] = dataset[split_name].map(
111
+ function=add_predicted_semantically_same_relations_to_document,
112
+ fn_kwargs=dict(doc_id2docs_with_predictions=original_doc_id2docs, **kwargs),
113
+ )
114
+ return dataset
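A hedged usage sketch of the helper above (dataset paths, the relation label, and the blacklist are placeholders, not taken from this commit):

    from pie_datasets import DatasetDict
    from src.dataset.processing import integrate_coref_predictions_from_text_pair_documents

    # load the original dataset and merge in coref predictions that were produced
    # on text-pair documents, as new "semantically_same" relations
    dataset = DatasetDict.from_json(data_dir="data/datasets/my_dataset")  # placeholder path
    dataset = integrate_coref_predictions_from_text_pair_documents(
        dataset=dataset,
        data_dir="data/predictions/coref_text_pairs",  # placeholder path to the predicted pair documents
        relation_label="semantically_same",            # assumed relation label
        argument_label_blacklist=None,
        verbose=True,
    )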
src/demo/annotation_utils.py CHANGED
@@ -1,6 +1,6 @@
1
  import json
2
  import logging
3
- from typing import Iterable, Optional, Sequence, Union
4
 
5
  import gradio as gr
6
  from hydra.utils import instantiate
@@ -41,59 +41,6 @@ def get_merger() -> SpansViaRelationMerger:
41
  )
42
 
43
 
44
- def annotate_document(
45
- document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
46
- argumentation_model: Pipeline,
47
- handle_parts_of_same: bool = False,
48
- ) -> Union[
49
- TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
50
- TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
51
- ]:
52
- """Annotate a document with the provided pipeline.
53
-
54
- Args:
55
- document: The document to annotate.
56
- argumentation_model: The pipeline to use for annotation.
57
- handle_parts_of_same: Whether to merge spans that are part of the same entity into a single multi span.
58
- """
59
-
60
- # execute prediction pipeline
61
- result: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions = argumentation_model(
62
- document, inplace=True
63
- )
64
-
65
- if handle_parts_of_same:
66
- merger = get_merger()
67
- result = merger(result)
68
-
69
- return result
70
-
71
-
72
- def annotate_documents(
73
- documents: Sequence[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions],
74
- argumentation_model: Pipeline,
75
- handle_parts_of_same: bool = False,
76
- ) -> Union[
77
- Sequence[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions],
78
- Sequence[TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions],
79
- ]:
80
- """Annotate a sequence of documents with the provided pipeline.
81
-
82
- Args:
83
- documents: The documents to annotate.
84
- argumentation_model: The pipeline to use for annotation.
85
- handle_parts_of_same: Whether to merge spans that are part of the same entity into a single multi span.
86
- """
87
- # execute prediction pipeline
88
- result = argumentation_model(documents, inplace=True)
89
-
90
- if handle_parts_of_same:
91
- merger = get_merger()
92
- result = [merger(document) for document in result]
93
-
94
- return result
95
-
96
-
97
  def create_document(
98
  text: str, doc_id: str, split_regex: Optional[str] = None
99
  ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
@@ -143,14 +90,17 @@ def create_documents(
143
  ]
144
 
145
 
146
- def load_argumentation_model(config_str: str, **kwargs) -> Pipeline:
147
  try:
148
  config = parse_config(config_str, format="yaml")
 
 
 
149
 
150
  # for PIE AutoPipeline, we need to handle the revision separately for
151
  # the taskmodule and the model
152
  if (
153
- config.get("_target_") == "pytorch_ie.auto.AutoPipeline.from_pretrained"
154
  and "revision" in config
155
  ):
156
  revision = config.pop("revision")
 
1
  import json
2
  import logging
3
+ from typing import Iterable, Optional, Sequence
4
 
5
  import gradio as gr
6
  from hydra.utils import instantiate
 
41
  )
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def create_document(
45
  text: str, doc_id: str, split_regex: Optional[str] = None
46
  ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
 
90
  ]
91
 
92
 
93
+ def load_argumentation_model(config_str: str, **kwargs) -> Optional[Pipeline]:
94
  try:
95
  config = parse_config(config_str, format="yaml")
96
+ if config is None or config == {}:
97
+ gr.Warning("Empty argumentation model config provided. No model loaded.")
98
+ return None
99
 
100
  # for PIE AutoPipeline, we need to handle the revision separately for
101
  # the taskmodule and the model
102
  if (
103
+ config.get("_target_", "").strip().endswith("AutoPipeline.from_pretrained")
104
  and "revision" in config
105
  ):
106
  revision = config.pop("revision")
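A minimal sketch of calling the revised loader (the model identifier and the parameter name are placeholders; only the AutoPipeline target and revision handling are taken from the code above):

    from src.demo.annotation_utils import load_argumentation_model

    config_str = """
    _target_: pytorch_ie.auto.AutoPipeline.from_pretrained
    pretrained_model_name_or_path: some-org/some-argumentation-model  # placeholder model id, assumed parameter name
    revision: main
    """
    pipeline = load_argumentation_model(config_str)  # returns None for an empty config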
src/demo/backend_utils.py CHANGED
@@ -12,10 +12,11 @@ from pie_datasets import Dataset, IterableDataset, load_dataset
12
  from pytorch_ie import Pipeline
13
  from pytorch_ie.documents import (
14
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
 
15
  )
16
  from tqdm import tqdm
17
 
18
- from src.demo.annotation_utils import annotate_documents, create_documents
19
  from src.demo.data_utils import load_text_from_arxiv
20
  from src.demo.rendering_utils import (
21
  RENDER_WITH_DISPLACY,
@@ -54,7 +55,7 @@ def add_annotated_pie_documents(
54
  def process_texts(
55
  texts: Iterable[str],
56
  doc_ids: Iterable[str],
57
- argumentation_model: Pipeline,
58
  retriever: DocumentAwareSpanRetriever,
59
  split_regex_escaped: Optional[str],
60
  handle_parts_of_same: bool = False,
@@ -68,13 +69,21 @@ def process_texts(
68
  doc_ids=doc_ids,
69
  split_regex=split_regex_escaped,
70
  )
71
- if verbose:
72
- gr.Info(f"Annotate {len(pie_documents)} documents...")
73
- pie_documents = annotate_documents(
74
- documents=pie_documents,
75
- argumentation_model=argumentation_model,
76
- handle_parts_of_same=handle_parts_of_same,
77
- )
 
 
 
 
 
 
 
 
78
  add_annotated_pie_documents(
79
  retriever=retriever,
80
  pie_documents=pie_documents,
@@ -93,12 +102,41 @@ def add_annotated_pie_documents_from_dataset(
93
  dataset = load_dataset(**load_dataset_kwargs)
94
  if not isinstance(dataset, (Dataset, IterableDataset)):
95
  raise gr.Error("Loaded dataset is not of type PIE (Iterable)Dataset.")
96
- dataset_converted = dataset.to_document_type(
97
- TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
 
99
  add_annotated_pie_documents(
100
  retriever=retriever,
101
- pie_documents=dataset_converted,
102
  use_predicted_annotations=False,
103
  verbose=verbose,
104
  )
 
12
  from pytorch_ie import Pipeline
13
  from pytorch_ie.documents import (
14
  TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
15
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
16
  )
17
  from tqdm import tqdm
18
 
19
+ from src.demo.annotation_utils import create_documents, get_merger
20
  from src.demo.data_utils import load_text_from_arxiv
21
  from src.demo.rendering_utils import (
22
  RENDER_WITH_DISPLACY,
 
55
  def process_texts(
56
  texts: Iterable[str],
57
  doc_ids: Iterable[str],
58
+ argumentation_model: Optional[Pipeline],
59
  retriever: DocumentAwareSpanRetriever,
60
  split_regex_escaped: Optional[str],
61
  handle_parts_of_same: bool = False,
 
69
  doc_ids=doc_ids,
70
  split_regex=split_regex_escaped,
71
  )
72
+ if argumentation_model is not None:
73
+ if verbose:
74
+ gr.Info(f"Annotate {len(pie_documents)} documents...")
75
+ pie_documents = argumentation_model(pie_documents, inplace=True)
76
+ else:
77
+ gr.Warning(
78
+ "Annotation is disabled (no model was loaded). No annotations will be added to the documents."
79
+ )
80
+
81
+ # this needs to be done also if the documents are not annotated because
82
+ # it adjusts the document type
83
+ if handle_parts_of_same:
84
+ merger = get_merger()
85
+ pie_documents = [merger(document) for document in pie_documents]
86
+
87
  add_annotated_pie_documents(
88
  retriever=retriever,
89
  pie_documents=pie_documents,
 
102
  dataset = load_dataset(**load_dataset_kwargs)
103
  if not isinstance(dataset, (Dataset, IterableDataset)):
104
  raise gr.Error("Loaded dataset is not of type PIE (Iterable)Dataset.")
105
+ try:
106
+ dataset_converted = dataset.to_document_type(
107
+ TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
108
+ )
109
+ except ValueError:
110
+ gr.Warning(
111
+ "The dataset does not seem to have registered converter to create multi-spans. "
112
+ "Try to Load as single-spans and to convert to multi-spans manually ..."
113
+ )
114
+ dataset_converted_single_span = dataset.to_document_type(
115
+ TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
116
+ )
117
+ merger = get_merger()
118
+ dataset_converted = dataset_converted_single_span.map(
119
+ merger,
120
+ result_document_type=TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
121
+ )
122
+
123
+ def _clear_metadata(
124
+ doc: TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
125
+ ) -> TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions:
126
+ result = doc.copy()
127
+ result.metadata = dict()
128
+ return result
129
+
130
+ # adding documents with differing metadata formats to the retriever breaks it,
131
+ # so we clear the metadata field beforehand
132
+ dataset_converted_without_metadata = dataset_converted.map(
133
+ _clear_metadata,
134
+ result_document_type=TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
135
  )
136
+
137
  add_annotated_pie_documents(
138
  retriever=retriever,
139
+ pie_documents=dataset_converted_without_metadata,
140
  use_predicted_annotations=False,
141
  verbose=verbose,
142
  )
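A hedged call sketch for the now-optional argumentation model (the retriever is assumed to be an already configured DocumentAwareSpanRetriever; remaining parameters keep their defaults):

    from src.demo.backend_utils import process_texts

    process_texts(
        texts=["Some document text ..."],
        doc_ids=["doc_0"],
        argumentation_model=None,   # allowed now: emits a warning and skips annotation
        retriever=retriever,
        split_regex_escaped=None,
        handle_parts_of_same=True,  # still merged, so the document type matches the retriever
    )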
src/demo/retrieve_and_dump_all_relevant.py CHANGED
@@ -10,21 +10,24 @@ root = pyrootutils.setup_root(
10
  import argparse
11
  import logging
12
  import os
13
- from typing import Dict, List, Optional, Tuple
14
 
15
  import pandas as pd
16
  from pie_datasets import Dataset, DatasetDict
17
  from pytorch_ie import Annotation
18
  from pytorch_ie.annotations import BinaryRelation, MultiSpan, Span
19
 
20
- from document.types import (
21
- RelatedRelation,
22
- TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
23
- )
24
  from src.demo.retriever_utils import (
25
  retrieve_all_relevant_spans,
26
  retrieve_all_relevant_spans_for_all_documents,
 
 
27
  retrieve_relevant_spans,
 
 
 
 
 
28
  )
29
  from src.langchain_modules import DocumentAwareSpanRetrieverWithRelations
30
 
@@ -131,14 +134,17 @@ def add_result_to_gold_data(
131
  base_annotation_mapping=base_annotation_mapping,
132
  )
133
  )
134
- doc_and_span_id2annotation.update(
135
- get_doc_and_span_id2annotation_mapping(
136
- span_ids=result["ref_span_id"],
137
- doc_ids=result["doc_id"],
138
- retriever=retriever,
139
- base_annotation_mapping=base_annotation_mapping,
 
 
 
 
140
  )
141
- )
142
  doc_and_span_id2annotation.update(
143
  get_doc_and_span_id2annotation_mapping(
144
  span_ids=result["query_span_id"],
@@ -159,38 +165,51 @@ def add_result_to_gold_data(
159
  (row.query_doc_id, row.query_span_id)
160
  ]
161
  doc_id, span = doc_and_span_id2annotation[(row.doc_id, row.span_id)]
162
- doc_id2, ref_span = doc_and_span_id2annotation[(row.doc_id, row.ref_span_id)]
163
  if doc_id != query_doc_id:
164
  raise ValueError("doc_id and query_doc_id must be the same")
165
- if doc_id != doc_id2:
166
- raise ValueError("doc_id and ref_doc_id must be the same")
167
  doc = doc_id2doc[doc_id]
168
- link_rel = BinaryRelation(
169
- head=query_span, tail=ref_span, label=link_relation_label, score=row.sim_score
170
- )
171
- doc.binary_relations.predictions.append(link_rel)
172
- head_and_tail2relation = doc_id2head_tail2relation[doc_id]
173
- related_rel_label = row.type
174
- if related_rel_label.endswith(reversed_relation_suffix):
175
- base_rel = head_and_tail2relation[(span, ref_span)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  else:
177
- base_rel = head_and_tail2relation[(ref_span, span)]
178
- related_rel = RelatedRelation(
179
- head=query_span,
180
- tail=span,
181
- link_relation=link_rel,
182
- relation=base_rel,
183
- label=related_rel_label,
184
- score=link_rel.score * base_rel.score,
185
- )
186
- doc.related_relations.predictions.append(related_rel)
187
 
188
  dataset = Dataset.from_documents(list(doc_id2doc.values()))
189
  dataset_dict = DatasetDict({split: dataset})
190
  if not os.path.exists(dataset_out_dir):
191
  os.makedirs(dataset_out_dir, exist_ok=True)
192
 
193
- dataset_dict.to_json(dataset_out_dir)
194
 
195
 
196
  if __name__ == "__main__":
@@ -216,6 +235,13 @@ if __name__ == "__main__":
216
  type=str,
217
  required=True,
218
  )
 
 
 
 
 
 
 
219
  parser.add_argument(
220
  "--query_doc_id",
221
  type=str,
@@ -282,6 +308,24 @@ if __name__ == "__main__":
282
  logger.info(f"loading data from {args.data_path}...")
283
  retriever.load_from_disc(args.data_path)
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  search_kwargs = {"k": args.top_k, "score_threshold": args.threshold}
286
  if args.doc_id_whitelist is not None:
287
  search_kwargs["doc_id_whitelist"] = args.doc_id_whitelist
@@ -293,7 +337,7 @@ if __name__ == "__main__":
293
  all_spans_for_all_documents = None
294
  for doc_id_pair in args.query_target_doc_id_pairs:
295
  query_doc_id, target_doc_id = doc_id_pair.split(":")
296
- current_result = retrieve_all_relevant_spans(
297
  retriever=retriever,
298
  query_doc_id=query_doc_id,
299
  doc_id_whitelist=[target_doc_id],
@@ -319,16 +363,16 @@ if __name__ == "__main__":
319
 
320
  elif args.query_span_id is not None:
321
  logger.warning(f"retrieving results for single span: {args.query_span_id}")
322
- all_spans_for_all_documents = retrieve_relevant_spans(
323
  retriever=retriever, query_span_id=args.query_span_id, **search_kwargs
324
  )
325
  elif args.query_doc_id is not None:
326
  logger.warning(f"retrieving results for single document: {args.query_doc_id}")
327
- all_spans_for_all_documents = retrieve_all_relevant_spans(
328
  retriever=retriever, query_doc_id=args.query_doc_id, **search_kwargs
329
  )
330
  else:
331
- all_spans_for_all_documents = retrieve_all_relevant_spans_for_all_documents(
332
  retriever=retriever, **search_kwargs
333
  )
334
 
 
10
  import argparse
11
  import logging
12
  import os
13
+ from typing import Callable, Dict, List, Optional, Tuple
14
 
15
  import pandas as pd
16
  from pie_datasets import Dataset, DatasetDict
17
  from pytorch_ie import Annotation
18
  from pytorch_ie.annotations import BinaryRelation, MultiSpan, Span
19
 
 
 
 
 
20
  from src.demo.retriever_utils import (
21
  retrieve_all_relevant_spans,
22
  retrieve_all_relevant_spans_for_all_documents,
23
+ retrieve_all_similar_spans,
24
+ retrieve_all_similar_spans_for_all_documents,
25
  retrieve_relevant_spans,
26
+ retrieve_similar_spans,
27
+ )
28
+ from src.document.types import (
29
+ RelatedRelation,
30
+ TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
31
  )
32
  from src.langchain_modules import DocumentAwareSpanRetrieverWithRelations
33
 
 
134
  base_annotation_mapping=base_annotation_mapping,
135
  )
136
  )
137
+ # a ref_span_id is only present when we process relevant span retriever results
138
+ # (for similar span retriever results, we only have query_span_id)
139
+ if "ref_span_id" in result.columns:
140
+ doc_and_span_id2annotation.update(
141
+ get_doc_and_span_id2annotation_mapping(
142
+ span_ids=result["ref_span_id"],
143
+ doc_ids=result["doc_id"],
144
+ retriever=retriever,
145
+ base_annotation_mapping=base_annotation_mapping,
146
+ )
147
  )
 
148
  doc_and_span_id2annotation.update(
149
  get_doc_and_span_id2annotation_mapping(
150
  span_ids=result["query_span_id"],
 
165
  (row.query_doc_id, row.query_span_id)
166
  ]
167
  doc_id, span = doc_and_span_id2annotation[(row.doc_id, row.span_id)]
 
168
  if doc_id != query_doc_id:
169
  raise ValueError("doc_id and query_doc_id must be the same")
 
 
170
  doc = doc_id2doc[doc_id]
171
+
172
+ # if we have a reference span, we need to construct the related relation
173
+ if hasattr(row, "ref_span_id"):
174
+ doc_id2, ref_span = doc_and_span_id2annotation[(row.doc_id, row.ref_span_id)]
175
+ if doc_id != doc_id2:
176
+ raise ValueError("doc_id and ref_doc_id must be the same")
177
+
178
+ # create a link relation between the query span and the reference span
179
+ link_rel = BinaryRelation(
180
+ head=query_span, tail=ref_span, label=link_relation_label, score=row.sim_score
181
+ )
182
+ doc.binary_relations.predictions.append(link_rel)
183
+
184
+ head_and_tail2relation = doc_id2head_tail2relation[doc_id]
185
+ related_rel_label = row.type
186
+ if related_rel_label.endswith(reversed_relation_suffix):
187
+ base_rel = head_and_tail2relation[(span, ref_span)]
188
+ else:
189
+ base_rel = head_and_tail2relation[(ref_span, span)]
190
+ related_rel = RelatedRelation(
191
+ head=query_span,
192
+ tail=span,
193
+ link_relation=link_rel,
194
+ relation=base_rel,
195
+ label=related_rel_label,
196
+ score=link_rel.score * base_rel.score,
197
+ )
198
+ doc.related_relations.predictions.append(related_rel)
199
+ # otherwise, we just ...
200
  else:
201
+ # ... create a link relation between the query span and the returned span
202
+ link_rel = BinaryRelation(
203
+ head=query_span, tail=span, label=link_relation_label, score=row.sim_score
204
+ )
205
+ doc.binary_relations.predictions.append(link_rel)
 
 
 
 
 
206
 
207
  dataset = Dataset.from_documents(list(doc_id2doc.values()))
208
  dataset_dict = DatasetDict({split: dataset})
209
  if not os.path.exists(dataset_out_dir):
210
  os.makedirs(dataset_out_dir, exist_ok=True)
211
 
212
+ dataset_dict.to_json(dataset_out_dir, mode="w")
213
 
214
 
215
  if __name__ == "__main__":
 
235
  type=str,
236
  required=True,
237
  )
238
+ parser.add_argument(
239
+ "-v",
240
+ "--variant",
241
+ choices=["relevant", "similar"],
242
+ default="relevant",
243
+ help="Variant of the retriever to use: 'relevant' for relevant spans, 'similar' for similar spans.",
244
+ )
245
  parser.add_argument(
246
  "--query_doc_id",
247
  type=str,
 
308
  logger.info(f"loading data from {args.data_path}...")
309
  retriever.load_from_disc(args.data_path)
310
 
311
+ methods: Dict[str, Callable]
312
+ if args.variant == "relevant":
313
+ logger.info("using *relevant* span retriever methods")
314
+ methods = {
315
+ "retrieve_all_spans": retrieve_all_relevant_spans,
316
+ "retrieve_spans": retrieve_relevant_spans,
317
+ "retrieve_all_spans_for_all_documents": retrieve_all_relevant_spans_for_all_documents,
318
+ }
319
+ elif args.variant == "similar":
320
+ logger.info("using *similar* span retriever methods")
321
+ methods = {
322
+ "retrieve_all_spans": retrieve_all_similar_spans,
323
+ "retrieve_spans": retrieve_similar_spans,
324
+ "retrieve_all_spans_for_all_documents": retrieve_all_similar_spans_for_all_documents,
325
+ }
326
+ else:
327
+ raise ValueError(f"unknown method: {args.variant}")
328
+
329
  search_kwargs = {"k": args.top_k, "score_threshold": args.threshold}
330
  if args.doc_id_whitelist is not None:
331
  search_kwargs["doc_id_whitelist"] = args.doc_id_whitelist
 
337
  all_spans_for_all_documents = None
338
  for doc_id_pair in args.query_target_doc_id_pairs:
339
  query_doc_id, target_doc_id = doc_id_pair.split(":")
340
+ current_result = methods["retrieve_all_spans"](
341
  retriever=retriever,
342
  query_doc_id=query_doc_id,
343
  doc_id_whitelist=[target_doc_id],
 
363
 
364
  elif args.query_span_id is not None:
365
  logger.warning(f"retrieving results for single span: {args.query_span_id}")
366
+ all_spans_for_all_documents = methods["retrieve_spans"](
367
  retriever=retriever, query_span_id=args.query_span_id, **search_kwargs
368
  )
369
  elif args.query_doc_id is not None:
370
  logger.warning(f"retrieving results for single document: {args.query_doc_id}")
371
+ all_spans_for_all_documents = methods["retrieve_all_spans"](
372
  retriever=retriever, query_doc_id=args.query_doc_id, **search_kwargs
373
  )
374
  else:
375
+ all_spans_for_all_documents = methods["retrieve_all_spans_for_all_documents"](
376
  retriever=retriever, **search_kwargs
377
  )
378
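A hedged invocation sketch of the extended script (paths and ids are placeholders; flag spellings other than --variant and --query_doc_id are assumed from the argparse destinations, and further required arguments are omitted):

    python src/demo/retrieve_and_dump_all_relevant.py \
        --data_path outputs/retriever_dump \
        --variant similar \
        --query_doc_id doc_0 \
        --top_k 10 \
        --threshold 0.95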
 
src/document/processing.py CHANGED
@@ -1,20 +1,23 @@
1
  from __future__ import annotations
2
 
 
3
  import logging
4
  from collections import defaultdict
5
  from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union
6
 
7
  from pie_modules.utils.span import have_overlap
8
  from pytorch_ie import AnnotationLayer
9
- from pytorch_ie.annotations import LabeledMultiSpan, LabeledSpan, MultiSpan, Span
10
  from pytorch_ie.core import Document
11
  from pytorch_ie.core.document import Annotation, _enumerate_dependencies
 
12
 
13
  from src.document.types import (
14
  RelatedRelation,
15
  TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
16
  )
17
  from src.utils import distance, distance_slices
 
18
  from src.utils.span_utils import get_overlap_len
19
 
20
  logger = logging.getLogger(__name__)
@@ -123,6 +126,69 @@ def remove_partitions_by_labels(
123
  D_text = TypeVar("D_text", bound=Document)
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def replace_substrings_in_text(
127
  document: D_text, replacements: Dict[str, str], enforce_same_length: bool = True
128
  ) -> D_text:
@@ -512,3 +578,236 @@ def add_related_relations_from_binary_relations(
512
  )
513
 
514
  return document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import itertools
4
  import logging
5
  from collections import defaultdict
6
  from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union
7
 
8
  from pie_modules.utils.span import have_overlap
9
  from pytorch_ie import AnnotationLayer
10
+ from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan, MultiSpan, Span
11
  from pytorch_ie.core import Document
12
  from pytorch_ie.core.document import Annotation, _enumerate_dependencies
13
+ from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
14
 
15
  from src.document.types import (
16
  RelatedRelation,
17
  TextDocumentWithLabeledMultiSpansBinaryRelationsLabeledPartitionsAndRelatedRelations,
18
  )
19
  from src.utils import distance, distance_slices
20
+ from src.utils.graph_utils import get_connected_components
21
  from src.utils.span_utils import get_overlap_len
22
 
23
  logger = logging.getLogger(__name__)
 
126
  D_text = TypeVar("D_text", bound=Document)
127
 
128
 
129
+ def remove_annotations_by_label(
130
+ document: D, layer2label_blacklist: Dict[str, List[str]], verbose: bool = False
131
+ ) -> D:
132
+ """Remove annotations with labels in the blacklist from a document.
133
+
134
+ Args:
135
+ document: The document to process.
136
+ layer2label_blacklist: A mapping from layer names to lists of labels to remove.
137
+ verbose: Whether to log the number of removed annotations.
138
+
139
+ Returns:
140
+ The processed document.
141
+ """
142
+
143
+ result = document.copy(with_annotations=False)
144
+ override_annotations: Dict[str, Dict[int, Annotation]] = defaultdict(dict)
145
+ removed_annotations: Dict[str, Set[int]] = defaultdict(set)
146
+ for layer_name, labels in layer2label_blacklist.items():
147
+ # process gold annotations and predictions
148
+ for src_layer, tgt_layer in [
149
+ (document[layer_name], result[layer_name]),
150
+ (document[layer_name].predictions, result[layer_name].predictions),
151
+ ]:
152
+ current_override_annotations = dict()
153
+ current_removed_annotations = set()
154
+ for annotation in src_layer:
155
+ label = getattr(annotation, "label")
156
+ if label is None:
157
+ raise ValueError(
158
+ f"Annotation {annotation} has no label. Please check the annotation type."
159
+ )
160
+ if label not in labels:
161
+ current_override_annotations[annotation._id] = annotation.copy()
162
+ else:
163
+ current_removed_annotations.add(annotation._id)
164
+ tgt_layer.extend(current_override_annotations.values())
165
+
166
+ override_annotations[layer_name].update(current_override_annotations)
167
+ removed_annotations[layer_name].update(current_removed_annotations)
168
+ if verbose:
169
+ num_removed = {
170
+ layer_name: len(removed_ids) for layer_name, removed_ids in removed_annotations.items()
171
+ }
172
+ if len(num_removed) > 0:
173
+ num_total = {
174
+ layer_name: len(kept_ids) + num_removed[layer_name]
175
+ for layer_name, kept_ids in override_annotations.items()
176
+ }
177
+ logger.warning(
178
+ f"doc.id={document.id}: Removed {num_removed} (total: {num_total}) "
179
+ f"annotations with label blacklists {layer2label_blacklist}"
180
+ )
181
+
182
+ result.add_all_annotations_from_other(
183
+ other=document,
184
+ removed_annotations=removed_annotations,
185
+ override_annotations=override_annotations,
186
+ strict=False,
187
+ verbose=False,
188
+ )
189
+ return result
190
+
191
+
192
  def replace_substrings_in_text(
193
  document: D_text, replacements: Dict[str, str], enforce_same_length: bool = True
194
  ) -> D_text:
 
578
  )
579
 
580
  return document
581
+
582
+
583
+ T = TypeVar("T", bound=TextDocumentWithLabeledSpansAndBinaryRelations)
584
+
585
+
586
+ def remove_discontinuous_spans(
587
+ document: T,
588
+ parts_of_same_relation: str,
589
+ verbose: bool = False,
590
+ ) -> T:
591
+ """
592
+ Remove discontinuous spans, i.e. spans linked via the parts_of_same relation, together with all relations attached to them.
593
+
594
+ Args:
595
+ document: The document to process.
596
+ parts_of_same_relation: The label of the relations that link parts of the same (discontinuous) span.
597
+ verbose: Whether to print debug information.
598
+
599
+ Returns:
600
+ The processed document.
601
+ """
602
+ result = document.copy()
603
+ spans = result.labeled_spans.clear()
604
+ rels = result.binary_relations.clear()
605
+
606
+ segment_spans = set()
607
+ segment_rels = set()
608
+ # collect all spans that are linked
609
+ for rel in rels:
610
+ if rel.label == parts_of_same_relation:
611
+ segment_spans.add(rel.head)
612
+ segment_spans.add(rel.tail)
613
+ segment_rels.add(rel)
614
+
615
+ for span in spans:
616
+ if span not in segment_spans:
617
+ result.labeled_spans.append(span)
618
+
619
+ other_rels_dropped = set()
620
+ for rel in rels:
621
+ if rel not in segment_rels:
622
+ if rel.head not in segment_spans and rel.tail not in segment_spans:
623
+ result.binary_relations.append(rel)
624
+ else:
625
+ other_rels_dropped.add(rel)
626
+
627
+ if verbose:
628
+ if len(segment_rels) > 0:
629
+ logger.warning(
630
+ f"doc={document.id}: Dropped {len(segment_rels)} segment rels "
631
+ f"and {len(other_rels_dropped)} other rels "
632
+ f"({round((len(document.binary_relations) - len(result.binary_relations)) * 100 / len(document.binary_relations), 1)}% "
633
+ f"of all relations dropped)"
634
+ )
635
+ return result
636
+
637
+
638
+ def close_clusters_transitively(
639
+ document: D, relation_layer: str, link_relation_label: str, verbose: bool = False
640
+ ) -> D:
641
+ """
642
+ Close clusters transitively by adding relations between all pairs of spans in the same cluster.
643
+
644
+ Args:
645
+ document: The document to process.
646
+ relation_layer: The name of the relation layer.
647
+ link_relation_label: The label of the link relation.
648
+ verbose: Whether to print debug information.
649
+
650
+ Returns:
651
+ The processed document.
652
+ """
653
+ result = document.copy()
654
+
655
+ connected_components: List[List[Annotation]] = get_connected_components(
656
+ relations=result[relation_layer],
657
+ link_relation_label=link_relation_label,
658
+ add_singletons=False,
659
+ )
660
+ # detach from document
661
+ relations = result[relation_layer].clear()
662
+ # use set to speed up membership checks
663
+ relations_set = set(relations)
664
+ n_before = len(relations)
665
+ for cluster in connected_components:
666
+ for head, tail in itertools.combinations(sorted(cluster), 2):
667
+ rel = BinaryRelation(
668
+ head=head,
669
+ tail=tail,
670
+ label=link_relation_label,
671
+ )
672
+ rel_reversed = BinaryRelation(
673
+ head=tail,
674
+ tail=head,
675
+ label=link_relation_label,
676
+ )
677
+ if rel not in relations_set and rel_reversed not in relations_set:
678
+ # append to relations to keep the order
679
+ relations.append(rel)
680
+ relations_set.add(rel)
681
+
682
+ result[relation_layer].extend(relations)
683
+ if verbose:
684
+ num_added = len(relations) - n_before
685
+ if num_added > 0:
686
+ logger.warning(
687
+ f"doc.id={document.id}: added {num_added} relations to {relation_layer} layer"
688
+ )
689
+
690
+ return result
691
+
692
+
693
+ def get_ancestor_layers(children: Dict[str, Set[str]], layer: str) -> Set[str]:
694
+ """
695
+ Get all ancestor layers of a given layer in the dependency graph.
696
+
697
+ Args:
698
+ children: A mapping from layers to their children layers.
699
+ layer: The layer for which to find ancestors.
700
+
701
+ Returns:
702
+ A set of ancestor layers.
703
+ """
704
+ ancestors = set()
705
+
706
+ def _get_ancestors(current_layer: str):
707
+ for parent_layer, child_layers in children.items():
708
+ if current_layer in child_layers:
709
+ ancestors.add(parent_layer)
710
+ _get_ancestors(parent_layer)
711
+
712
+ _get_ancestors(layer)
713
+ # drop the _artificial_root
714
+ ancestors.discard("_artificial_root")
715
+ return ancestors
716
+
717
+
718
+ def remove_binary_relations_by_partition_labels(
719
+ document: D,
720
+ partition_layer: str,
721
+ relation_layer: str,
722
+ partition_label_whitelist: Optional[List[List[str]]] = None,
723
+ partition_label_blacklist: Optional[List[List[str]]] = None,
724
+ verbose: bool = False,
725
+ ) -> D:
726
+ """
727
+ Remove binary relations whose head and tail partition label pair is not in the
728
+ whitelist or is in the blacklist.
729
+
730
+ Args:
731
+ document: The document to process.
732
+ partition_layer: The name of the partition layer.
733
+ relation_layer: The name of the relation layer.
734
+ partition_label_whitelist: The list of head-tail label pairs to keep.
735
+ partition_label_blacklist: The list of head-tail label pairs to remove.
736
+ verbose: Whether to print the removed relations to console.
737
+
738
+ Returns:
739
+ The processed document.
740
+ """
741
+ result = document.copy()
742
+
743
+ relation_annotation_layer = result[relation_layer]
744
+ # get all layers that target the relation layer
745
+ relation_dependent_layers = get_ancestor_layers(
746
+ children=result._annotation_graph, layer=relation_layer
747
+ )
748
+ # clear all layers that depend on the relation layer
749
+ for layer_name in relation_dependent_layers:
750
+ dependent_layer = result[layer_name]
751
+ gold_anns_cleared = dependent_layer.clear()
752
+ pred_anns_cleared = dependent_layer.predictions.clear()
753
+ if len(gold_anns_cleared) > 0 or len(pred_anns_cleared) > 0:
754
+ if verbose:
755
+ logger.warning(
756
+ f"doc.id={document.id}: Cleared {len(gold_anns_cleared)} gold and "
757
+ f"{len(pred_anns_cleared)} predicted annotations from layer {layer_name} "
758
+ f"because it depends on the relation layer {relation_layer}."
759
+ )
760
+
761
+ span2partition = {}
762
+ span_layer: AnnotationLayer
763
+ for span_layer in relation_annotation_layer.target_layers.values():
764
+ for span in list(span_layer) + list(span_layer.predictions):
765
+ if isinstance(span, Span):
766
+ span_start, span_end = span.start, span.end
767
+ elif isinstance(span, MultiSpan):
768
+ span_start, span_end = min(start for start, _ in span.slices), max(
769
+ end for _, end in span.slices
770
+ )
771
+ else:
772
+ raise ValueError(f"Unsupported span type: {type(span)}")
773
+ found_partition = False
774
+ for partition in result[partition_layer]:
775
+ if partition.start <= span_start and span_end <= partition.end:
776
+ span2partition[span] = partition
777
+ found_partition = True
778
+ break
779
+ if not found_partition:
780
+ raise ValueError(f"No partition found for span {span}")
781
+
782
+ if partition_label_whitelist is not None:
783
+ partition_label_whitelist_tuples = [tuple(pair) for pair in partition_label_whitelist]
784
+ else:
785
+ partition_label_whitelist_tuples = None
786
+ if partition_label_blacklist is not None:
787
+ partition_label_blacklist_tuples = [tuple(pair) for pair in partition_label_blacklist]
788
+ else:
789
+ partition_label_blacklist_tuples = None
790
+
791
+ for relation_base_layer in [relation_annotation_layer, relation_annotation_layer.predictions]:
792
+ # get all relations and clear the layer
793
+ relations = relation_base_layer.clear()
794
+ for relation in relations:
795
+ head_partition = span2partition[relation.head]
796
+ tail_partition = span2partition[relation.tail]
797
+ pair = (head_partition.label, tail_partition.label)
798
+ if (
799
+ partition_label_whitelist_tuples is None
800
+ or pair in partition_label_whitelist_tuples
801
+ ) and (
802
+ partition_label_blacklist_tuples is None
803
+ or pair not in partition_label_blacklist_tuples
804
+ ):
805
+ relation_base_layer.append(relation)
806
+ else:
807
+ if verbose:
808
+ logger.info(
809
+ f"Removing relation {relation} because its partitions "
810
+ f"({pair}) are not in the whitelist or are in the blacklist."
811
+ )
812
+
813
+ return result
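A hedged sketch of how the new document-processing helpers compose (layer names follow the PIE document types used above; the labels are placeholders):

    from src.document.processing import (
        close_clusters_transitively,
        remove_annotations_by_label,
        remove_binary_relations_by_partition_labels,
    )

    doc = remove_annotations_by_label(
        doc, layer2label_blacklist={"labeled_spans": ["data"]}, verbose=True  # placeholder label
    )
    doc = close_clusters_transitively(
        doc, relation_layer="binary_relations", link_relation_label="semantically_same"  # assumed label
    )
    doc = remove_binary_relations_by_partition_labels(
        doc,
        partition_layer="labeled_partitions",
        relation_layer="binary_relations",
        partition_label_whitelist=[["abstract", "abstract"], ["abstract", "title"]],  # placeholder pairs
    )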
src/evaluate.py CHANGED
@@ -41,13 +41,13 @@ from omegaconf import DictConfig
41
  from pie_datasets import DatasetDict
42
  from pie_modules.models import * # noqa: F403
43
  from pie_modules.taskmodules import * # noqa: F403
 
44
  from pytorch_ie.core import PyTorchIEModel, TaskModule
45
  from pytorch_ie.models import * # noqa: F403
46
  from pytorch_ie.taskmodules import * # noqa: F403
47
  from pytorch_lightning import Trainer
48
 
49
  from src import utils
50
- from src.datamodules import PieDataModule
51
  from src.models import * # noqa: F403
52
  from src.taskmodules import * # noqa: F403
53
 
@@ -80,8 +80,8 @@ def evaluate(cfg: DictConfig) -> Tuple[dict, dict]:
80
  log.info(f"Instantiating taskmodule <{cfg.taskmodule._target_}>")
81
  taskmodule: TaskModule = hydra.utils.instantiate(cfg.taskmodule, _convert_="partial")
82
 
83
- # auto-convert the dataset if the metric specifies a document type
84
- dataset = taskmodule.convert_dataset(dataset)
85
 
86
  # Init pytorch-ie datamodule
87
  log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>")
 
41
  from pie_datasets import DatasetDict
42
  from pie_modules.models import * # noqa: F403
43
  from pie_modules.taskmodules import * # noqa: F403
44
+ from pytorch_ie import PieDataModule
45
  from pytorch_ie.core import PyTorchIEModel, TaskModule
46
  from pytorch_ie.models import * # noqa: F403
47
  from pytorch_ie.taskmodules import * # noqa: F403
48
  from pytorch_lightning import Trainer
49
 
50
  from src import utils
 
51
  from src.models import * # noqa: F403
52
  from src.taskmodules import * # noqa: F403
53
 
 
80
  log.info(f"Instantiating taskmodule <{cfg.taskmodule._target_}>")
81
  taskmodule: TaskModule = hydra.utils.instantiate(cfg.taskmodule, _convert_="partial")
82
 
83
+ # auto-convert the dataset if the taskmodule specifies a document type
84
+ dataset = dataset.to_document_type(taskmodule, downcast=False)
85
 
86
  # Init pytorch-ie datamodule
87
  log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>")
src/evaluate_documents.py CHANGED
@@ -73,7 +73,7 @@ def evaluate_documents(cfg: DictConfig) -> Tuple[dict, dict]:
73
  metric: DocumentMetric = hydra.utils.instantiate(cfg.metric, _convert_="partial")
74
 
75
  # auto-convert the dataset if the metric specifies a document type
76
- dataset = metric.convert_dataset(dataset)
77
 
78
  # Init lightning loggers
79
  loggers = utils.instantiate_dict_entries(cfg, "logger")
 
73
  metric: DocumentMetric = hydra.utils.instantiate(cfg.metric, _convert_="partial")
74
 
75
  # auto-convert the dataset if the metric specifies a document type
76
+ dataset = dataset.to_document_type(metric, downcast=False)
77
 
78
  # Init lightning loggers
79
  loggers = utils.instantiate_dict_entries(cfg, "logger")
src/hydra_callbacks/save_job_return_value.py CHANGED
@@ -3,7 +3,7 @@ import logging
3
  import os
4
  import pickle
5
  from pathlib import Path
6
- from typing import Any, Dict, Generator, List, Optional, Tuple, Union
7
 
8
  import numpy as np
9
  import pandas as pd
@@ -174,6 +174,46 @@ def overrides_to_identifiers(overrides_per_result: List[List[str]], sep: str = "
174
  return identifiers
175
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  class SaveJobReturnValueCallback(Callback):
178
  """Save the job return-value in ${output_dir}/{job_return_value_filename}.
179
 
@@ -200,6 +240,10 @@ class SaveJobReturnValueCallback(Callback):
200
  multirun_create_ids_from_overrides: bool (default: True)
201
  Create job identifiers from the overrides of the jobs in a multi-run. If False, the job index is used as
202
  identifier.
 
 
 
 
203
  markdown_round_digits: int (default: 3)
204
  The number of digits to round the values in the markdown file. If None, no rounding is applied.
205
  multirun_job_id_key: str (default: "job_id")
@@ -220,6 +264,8 @@ class SaveJobReturnValueCallback(Callback):
220
  integrate_multirun_result: bool = False,
221
  multirun_aggregator_blacklist: Optional[List[str]] = None,
222
  multirun_create_ids_from_overrides: bool = True,
 
 
223
  markdown_round_digits: Optional[int] = 3,
224
  multirun_job_id_key: str = "job_id",
225
  paths_file: Optional[str] = None,
@@ -234,6 +280,8 @@ class SaveJobReturnValueCallback(Callback):
234
  self.multirun_aggregator_blacklist = multirun_aggregator_blacklist
235
  self.multirun_create_ids_from_overrides = multirun_create_ids_from_overrides
236
  self.multirun_job_id_key = multirun_job_id_key
 
 
237
  self.markdown_round_digits = markdown_round_digits
238
  self.multirun_paths_file = multirun_paths_file
239
  self.multirun_path_id = multirun_path_id
@@ -253,10 +301,21 @@ class SaveJobReturnValueCallback(Callback):
253
 
254
  def on_multirun_end(self, config: DictConfig, **kwargs: Any) -> None:
255
  job_ids: Union[List[str], List[int]]
256
- if self.multirun_create_ids_from_overrides:
257
- job_ids = overrides_to_identifiers([jr.overrides for jr in self.job_returns])
 
 
 
 
 
 
258
  else:
259
- job_ids = list(range(len(self.job_returns)))
 
 
 
 
 
260
 
261
  if self.integrate_multirun_result:
262
  # rearrange the job return-values of all jobs from a multi-run into a dict of lists (maybe nested),
@@ -368,6 +427,10 @@ class SaveJobReturnValueCallback(Callback):
368
  if job_id_column in result.columns:
369
  result = result.set_index(job_id_column)
370
  result.index.name = self.multirun_job_id_key
 
 
 
 
371
  else:
372
  # Otherwise, we have only one value for each key. We convert the dict to a pandas Series.
373
  series = pd.Series(obj_py_flat)
 
3
  import os
4
  import pickle
5
  from pathlib import Path
6
+ from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union
7
 
8
  import numpy as np
9
  import pandas as pd
 
174
  return identifiers
175
 
176
 
177
+ def identifier2dict(
178
+ identifier: str, record_sep: str = "-", key_value_sep: str = "="
179
+ ) -> Dict[str, str]:
180
+ """Converts a single identifier to a dict. The identifier is expected to be separated by "-".
181
+ Values are allowed to contain "-" as well, but keys are not. Key and value are separated by "=".
182
+
183
+ Example:
184
+ >>> identifier = "a=1-b=my-stuff"
185
+ >>> identifier2dict(identifier)
186
+ {'a': '1', 'b': 'my-stuff'}
187
+ """
188
+ parts = identifier.split(record_sep)
189
+ result = {}
190
+ last_key = None
191
+ for part in parts:
192
+ if key_value_sep in part:
193
+ last_key, value = part.split(key_value_sep, 1)
194
+ result[last_key] = value
195
+ else:
196
+ if last_key is None:
197
+ raise ValueError(
198
+ f'Invalid identifier: {identifier} (keys must not contain the record_sep="{record_sep}")'
199
+ )
200
+ result[last_key] += record_sep + part
201
+ return result
202
+
203
+
204
+ def identifiers_to_multiindex(identifiers: Iterable[str], **kwargs) -> pd.MultiIndex:
205
+ """Converts a list of identifiers to a MultiIndex. See identifier2dict for the
206
+ format of the identifiers.
207
+
208
+ Example:
209
+ >>> identifiers = ["a=1-b=my-stuff", "a=2-b=yes", "a=3"]
210
+ >>> identifiers_to_multiindex(identifiers, record_sep="-", key_value_sep="=")
211
+ MultiIndex([(1, 'my-stuff'), (2, 'yes'), (3, nan)], names=['a', 'b'])
212
+ """
213
+ frame = pd.DataFrame([identifier2dict(identifier, **kwargs) for identifier in identifiers])
214
+ return pd.MultiIndex.from_frame(frame)
215
+
216
+
217
  class SaveJobReturnValueCallback(Callback):
218
  """Save the job return-value in ${output_dir}/{job_return_value_filename}.
219
 
 
240
  multirun_create_ids_from_overrides: bool (default: True)
241
  Create job identifiers from the overrides of the jobs in a multi-run. If False, the job index is used as
242
  identifier.
243
+ multirun_ids: List[str] or List[int] (default: None)
244
+ If provided, the job identifiers from the config are used instead of the overrides or the job index.
245
+ markdown_split_index: bool (default: False)
246
+ If True, the index of the markdown file is split into multiple columns based on the separator "-".
247
  markdown_round_digits: int (default: 3)
248
  The number of digits to round the values in the markdown file. If None, no rounding is applied.
249
  multirun_job_id_key: str (default: "job_id")
 
264
  integrate_multirun_result: bool = False,
265
  multirun_aggregator_blacklist: Optional[List[str]] = None,
266
  multirun_create_ids_from_overrides: bool = True,
267
+ multirun_ids: Optional[Union[List[str], List[int]]] = None,
268
+ markdown_split_index: bool = False,
269
  markdown_round_digits: Optional[int] = 3,
270
  multirun_job_id_key: str = "job_id",
271
  paths_file: Optional[str] = None,
 
280
  self.multirun_aggregator_blacklist = multirun_aggregator_blacklist
281
  self.multirun_create_ids_from_overrides = multirun_create_ids_from_overrides
282
  self.multirun_job_id_key = multirun_job_id_key
283
+ self.multirun_ids = multirun_ids
284
+ self.markdown_split_index = markdown_split_index
285
  self.markdown_round_digits = markdown_round_digits
286
  self.multirun_paths_file = multirun_paths_file
287
  self.multirun_path_id = multirun_path_id
 
301
 
302
  def on_multirun_end(self, config: DictConfig, **kwargs: Any) -> None:
303
  job_ids: Union[List[str], List[int]]
304
+ if self.multirun_ids is not None:
305
+ # use the job_ids from the config
306
+ if len(self.multirun_ids) != len(self.job_returns):
307
+ raise ValueError(
308
+ f"Number of job_ids ({len(self.multirun_ids)}) does not match number of job returns ({len(self.job_returns)})"
309
+ )
310
+ # convert ListConfig to list
311
+ job_ids = list(self.multirun_ids) # type: ignore
312
  else:
313
+ if self.multirun_create_ids_from_overrides:
314
+ job_ids = overrides_to_identifiers(
315
+ [jr.overrides for jr in self.job_returns], sep="-"
316
+ )
317
+ else:
318
+ job_ids = list(range(len(self.job_returns)))
319
 
320
  if self.integrate_multirun_result:
321
  # rearrange the job return-values of all jobs from a multi-run into a dict of lists (maybe nested),
 
427
  if job_id_column in result.columns:
428
  result = result.set_index(job_id_column)
429
  result.index.name = self.multirun_job_id_key
430
+
431
+ if self.markdown_split_index:
432
+ result.index = identifiers_to_multiindex(result.index, record_sep="-")
433
+ result = result.reset_index()
434
  else:
435
  # Otherwise, we have only one value for each key. We convert the dict to a pandas Series.
436
  series = pd.Series(obj_py_flat)
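A minimal sketch of the new callback options (the id values are placeholders; in practice these fields are set via the hydra config):

    from src.hydra_callbacks.save_job_return_value import SaveJobReturnValueCallback

    callback = SaveJobReturnValueCallback(
        integrate_multirun_result=True,
        multirun_ids=["model=bert", "model=roberta"],  # overrides the ids derived from the job overrides
        markdown_split_index=True,  # split "key=value-key=value" ids into separate index columns
        markdown_round_digits=3,
    )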
src/langchain_modules/basic_pie_document_store.py CHANGED
@@ -52,6 +52,7 @@ class BasicPieDocumentStore(PieDocumentStore):
52
  shutil.rmtree(pie_documents_path)
53
  os.makedirs(pie_documents_path, exist_ok=True)
54
  doc_ids_iter = iter(self.client.yield_keys())
 
55
  while batch_doc_ids := list(islice(doc_ids_iter, batch_size or 1000)):
56
  all_doc_ids.extend(batch_doc_ids)
57
  docs = self.client.mget(batch_doc_ids)
@@ -63,7 +64,8 @@ class BasicPieDocumentStore(PieDocumentStore):
63
  {k: v for k, v in doc.metadata.items() if k != self.METADATA_KEY_PIE_DOCUMENT}
64
  )
65
  pie_dataset = Dataset.from_documents(pie_docs)
66
- DatasetDict({"train": pie_dataset}).to_json(path=pie_documents_path)
 
67
  if len(all_doc_ids) > 0:
68
  doc_ids_path = os.path.join(path, "doc_ids.json")
69
  with open(doc_ids_path, "w") as f:
 
52
  shutil.rmtree(pie_documents_path)
53
  os.makedirs(pie_documents_path, exist_ok=True)
54
  doc_ids_iter = iter(self.client.yield_keys())
55
+ mode = "w"
56
  while batch_doc_ids := list(islice(doc_ids_iter, batch_size or 1000)):
57
  all_doc_ids.extend(batch_doc_ids)
58
  docs = self.client.mget(batch_doc_ids)
 
64
  {k: v for k, v in doc.metadata.items() if k != self.METADATA_KEY_PIE_DOCUMENT}
65
  )
66
  pie_dataset = Dataset.from_documents(pie_docs)
67
+ DatasetDict({"train": pie_dataset}).to_json(path=pie_documents_path, mode=mode)
68
+ mode = "a" # append after the first batch
69
  if len(all_doc_ids) > 0:
70
  doc_ids_path = os.path.join(path, "doc_ids.json")
71
  with open(doc_ids_path, "w") as f:
src/langchain_modules/datasets_pie_document_store.py CHANGED
@@ -118,7 +118,7 @@ class DatasetsPieDocumentStore(PieDocumentStore):
118
  logger.warning(f"Removing existing directory: {pie_documents_path}")
119
  shutil.rmtree(pie_documents_path)
120
  os.makedirs(pie_documents_path, exist_ok=True)
121
- DatasetDict({"train": self._data}).to_json(pie_documents_path)
122
  doc_ids_path = os.path.join(path, "doc_ids.json")
123
  with open(doc_ids_path, "w") as f:
124
  json.dump(all_doc_ids, f)
 
118
  logger.warning(f"Removing existing directory: {pie_documents_path}")
119
  shutil.rmtree(pie_documents_path)
120
  os.makedirs(pie_documents_path, exist_ok=True)
121
+ DatasetDict({"train": self._data}).to_json(pie_documents_path, mode="w")
122
  doc_ids_path = os.path.join(path, "doc_ids.json")
123
  with open(doc_ids_path, "w") as f:
124
  json.dump(all_doc_ids, f)
src/metrics/__init__.py CHANGED
@@ -1,3 +1,9 @@
1
- from .coref_sklearn import CorefMetricsSKLearn
 
 
2
  from .coref_torchmetrics import CorefMetricsTorchmetrics
 
 
3
  from .score_distribution import ScoreDistribution
 
 
 
1
+ from .connected_component_sizes import ConnectedComponentSizes
2
+ from .coref import CorefHoiF1
3
+ from .coref_sklearn import BinaryClassificationMetricsSKLearn
4
  from .coref_torchmetrics import CorefMetricsTorchmetrics
5
+ from .f1_with_threshold import F1WithThresholdMetric
6
+ from .ranking_sklearn import RankingMetricsSKLearn
7
  from .score_distribution import ScoreDistribution
8
+ from .semantically_same_ranking import SemanticallySameRankingMetric
9
+ from .tpfpfn import TPFFPFNMetric
src/metrics/connected_component_sizes.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import Counter
3
+ from typing import Dict, List, TypeVar
4
+
5
+ from pytorch_ie import Annotation, AnnotationLayer, Document, DocumentStatistic
6
+ from pytorch_ie.annotations import BinaryRelation
7
+
8
+ from src.utils.graph_utils import get_connected_components
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ A = TypeVar("A")
13
+
14
+
15
+ # TODO: remove when "counts" aggregation function is available in DocumentStatistic
16
+ def count_func(values: List[int]) -> Dict[int, int]:
17
+ """Counts the number of occurrences of each value in the list."""
18
+ counter = Counter(values)
19
+ result = {k: counter[k] for k in sorted(counter)}
20
+ return result
21
+
22
+
23
+ class ConnectedComponentSizes(DocumentStatistic):
24
+ # TODO: use "counts" aggregation function when available in DocumentStatistic
25
+ DEFAULT_AGGREGATION_FUNCTIONS = ["src.metrics.connected_component_sizes.count_func"]
26
+
27
+ def __init__(self, relation_layer: str, link_relation_label: str, **kwargs) -> None:
28
+ super().__init__(**kwargs)
29
+ self.relation_layer = relation_layer
30
+ self.link_relation_label = link_relation_label
31
+
32
+ def _collect(self, document: Document) -> List[int]:
33
+ relations: AnnotationLayer[BinaryRelation] = document[self.relation_layer]
34
+ spans: AnnotationLayer[Annotation] = document[self.relation_layer].target_layer
35
+
36
+ connected_components: List[List] = get_connected_components(
37
+ elements=spans,
38
+ relations=relations,
39
+ link_relation_label=self.link_relation_label,
40
+ add_singletons=True,
41
+ )
42
+ new_component_sizes = [len(component) for component in connected_components]
43
+ return new_component_sizes
src/metrics/coref.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ from typing import Dict, Hashable, List, Optional, Sequence, Tuple, TypeVar
3
+
4
+ import numpy as np
5
+ from pytorch_ie import Annotation, Document, DocumentMetric
6
+ from pytorch_ie.annotations import BinaryRelation
7
+
8
+ from src.utils.graph_utils import get_connected_components
9
+
10
+
11
+ class CorefHoiEvaluator(object):
12
+ def __init__(self, metric, beta=1):
13
+ self.p_num = 0
14
+ self.p_den = 0
15
+ self.r_num = 0
16
+ self.r_den = 0
17
+ self.metric = metric
18
+ self.beta = beta
19
+
20
+ def update(self, predicted, gold, mention_to_predicted, mention_to_gold):
21
+ if self.metric == ceafe_simplified:
22
+ pn, pd, rn, rd = self.metric(predicted, gold)
23
+ else:
24
+ pn, pd = self.metric(predicted, mention_to_gold)
25
+ rn, rd = self.metric(gold, mention_to_predicted)
26
+ self.p_num += pn
27
+ self.p_den += pd
28
+ self.r_num += rn
29
+ self.r_den += rd
30
+
31
+ def f1(self, p_num, p_den, r_num, r_den, beta=1):
32
+ p = 0 if p_den == 0 else p_num / float(p_den)
33
+ r = 0 if r_den == 0 else r_num / float(r_den)
34
+ return 0 if p + r == 0 else (1 + beta * beta) * p * r / (beta * beta * p + r)
35
+
36
+ def get_f1(self):
37
+ return self.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=self.beta)
38
+
39
+ def get_recall(self):
40
+ return 0 if self.r_num == 0 else self.r_num / float(self.r_den)
41
+
42
+ def get_precision(self):
43
+ return 0 if self.p_num == 0 else self.p_num / float(self.p_den)
44
+
45
+ def get_prf(self):
46
+ return self.get_precision(), self.get_recall(), self.get_f1()
47
+
48
+ def get_counts(self):
49
+ return self.p_num, self.p_den, self.r_num, self.r_den
50
+
51
+
52
+ def b_cubed_simplified(clusters, mention_to_gold):
53
+ num, dem = 0, 0
54
+ for c in clusters:
55
+ if len(c) == 1:
56
+ continue
57
+
58
+ gold_counts = Counter()
59
+ correct = 0
60
+ for m in c:
61
+ if m in mention_to_gold:
62
+ gold_counts[tuple(mention_to_gold[m])] += 1
63
+ for c2, count in gold_counts.items():
64
+ if len(c2) != 1:
65
+ correct += count * count
66
+
67
+ num += correct / float(len(c))
68
+ dem += len(c)
69
+ return num, dem
70
+
71
+
72
+ def muc_simplified(clusters, mention_to_gold):
73
+ tp, p = 0, 0
74
+ for c in clusters:
75
+ p += len(c) - 1
76
+ tp += len(c)
77
+ linked = set()
78
+ for m in c:
79
+ if m in mention_to_gold:
80
+ linked.add(mention_to_gold[m])
81
+ else:
82
+ tp -= 1
83
+ tp -= len(linked)
84
+ return tp, p
85
+
86
+
87
+ def phi4_simplified(c1, c2):
88
+ return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))
89
+
90
+
91
+ def ceafe_simplified(clusters, gold_clusters):
92
+ # lazy import to not force scipy installation
93
+ from scipy.optimize import linear_sum_assignment as linear_assignment
94
+
95
+ clusters = [c for c in clusters if len(c) != 1]
96
+ scores = np.zeros((len(gold_clusters), len(clusters)))
97
+ for i in range(len(gold_clusters)):
98
+ for j in range(len(clusters)):
99
+ scores[i, j] = phi4_simplified(gold_clusters[i], clusters[j])
100
+ matching = linear_assignment(-scores)
101
+ matching = np.transpose(np.asarray(matching))
102
+ similarity = sum(scores[matching[:, 0], matching[:, 1]])
103
+ return similarity, len(clusters), similarity, len(gold_clusters)
104
+
105
+
106
+ def lea_simplified(clusters, mention_to_gold):
107
+ num, dem = 0, 0
108
+
109
+ for c in clusters:
110
+ if len(c) == 1:
111
+ continue
112
+
113
+ common_links = 0
114
+ all_links = len(c) * (len(c) - 1) / 2.0
115
+ for i, m in enumerate(c):
116
+ if m in mention_to_gold:
117
+ for m2 in c[i + 1 :]:
118
+ if m2 in mention_to_gold and mention_to_gold[m] == mention_to_gold[m2]:
119
+ common_links += 1
120
+
121
+ num += len(c) * common_links / float(all_links)
122
+ dem += len(c)
123
+
124
+ return num, dem
125
+
126
+
127
+ H = TypeVar("H", bound=Hashable)
128
+
129
+
130
+ class CorefHoiF1(DocumentMetric):
131
+ """
132
+ Coreference evaluation based on official coref-hoi evaluation script, i.e.,
133
+ https://github.com/lxucs/coref-hoi/blob/5ddfc3b64a5519c3555b5a57e47ab2f03c104a60/metrics.py.
134
+
135
+ The metric expects documents with a relation layer that contains binary relations
136
+ between mentions from the same coreference cluster. Works with relations targeting
137
+ mentions from multiple layers (e.g., cross-textual relations).
138
+
139
+ Args:
140
+ relation_layer: The name of the relation layer that contains the link relations.
141
+ include_singletons: If True (default), singletons will be included in the evaluation.
142
+ link_relation_label: If provided, only the relations with this label will be used
143
+ to create the clusters.
144
+ link_relation_relation_score_threshold: If provided, only the relations with a score
145
+ greater than or equal to this threshold will be used to create the clusters.
146
+ """
147
+
148
+ def __init__(
149
+ self,
150
+ relation_layer: str,
151
+ include_singletons: bool = True,
152
+ link_relation_label: Optional[str] = None,
153
+ link_relation_relation_score_threshold: Optional[float] = None,
154
+ ) -> None:
155
+ super().__init__()
156
+ self.relation_layer = relation_layer
157
+ self.link_relation_label = link_relation_label
158
+ self.include_singletons = include_singletons
159
+ self.link_relation_relation_score_threshold = link_relation_relation_score_threshold
160
+
161
+ def reset(self) -> None:
162
+ self.evaluators = [
163
+ CorefHoiEvaluator(m) for m in (muc_simplified, b_cubed_simplified, ceafe_simplified)
164
+ ]
165
+
166
+ def prepare_clusters_with_mapping(
167
+ self, mentions: Sequence[Annotation], relations: Sequence[BinaryRelation]
168
+ ) -> Tuple[List[List[Annotation]], Dict[Annotation, Tuple[Annotation]]]:
169
+
170
+ # get connected components based on binary relations
171
+ connected_components = get_connected_components(
172
+ elements=mentions,
173
+ relations=relations,
174
+ link_relation_label=self.link_relation_label,
175
+ link_relation_relation_score_threshold=self.link_relation_relation_score_threshold,
176
+ add_singletons=self.include_singletons,
177
+ )
178
+
179
+ # store all clustered mentions in a list and
180
+ # create a map from each mention to its cluster
181
+ # (i.e. to the list of spans that includes all other mentions from the same cluster)
182
+ clusters = []
183
+ mention_to_cluster = dict()
184
+ for cluster in connected_components:
185
+ clusters.append(cluster)
186
+ for mention in cluster:
187
+ mention_to_cluster[mention] = tuple(cluster)
188
+
189
+ return clusters, mention_to_cluster
190
+
191
+ def _update(self, doc: Document) -> None:
192
+ relation_layer = doc[self.relation_layer]
193
+ gold_mentions = []
194
+ predicted_mentions = []
195
+ for mention_layer in relation_layer.target_layers.values():
196
+ gold_mentions.extend(mention_layer)
197
+ predicted_mentions.extend(mention_layer.predictions)
198
+
199
+ # prepare the clusters and mention-to-cluster mapping needed for evaluation
200
+ predicted_clusters, mention_to_predicted = self.prepare_clusters_with_mapping(
201
+ mentions=predicted_mentions, relations=relation_layer.predictions
202
+ )
203
+ gold_clusters, mention_to_gold = self.prepare_clusters_with_mapping(
204
+ mentions=gold_mentions, relations=relation_layer
205
+ )
206
+
207
+ for e in self.evaluators:
208
+ e.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
209
+
210
+ def get_f1(self) -> float:
211
+ return sum(e.get_f1() for e in self.evaluators) / len(self.evaluators)
212
+
213
+ def get_recall(self) -> float:
214
+ return sum(e.get_recall() for e in self.evaluators) / len(self.evaluators)
215
+
216
+ def get_precision(self) -> float:
217
+ return sum(e.get_precision() for e in self.evaluators) / len(self.evaluators)
218
+
219
+ def get_prf(self) -> Tuple[float, float, float]:
220
+ return self.get_precision(), self.get_recall(), self.get_f1()
221
+
222
+ def _compute(self) -> float:
223
+ return self.get_f1()
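
A minimal usage sketch for the new `CorefHoiF1` metric. The relation layer name `binary_coref_relations` and the `documents` iterable are assumptions for illustration; the call pattern relies on the generic `pytorch_ie` `DocumentMetric` interface (`__call__` to update, `compute()` to aggregate).

```python
# Hedged sketch, not part of the PR: evaluating coreference predictions.
from src.metrics.coref import CorefHoiF1  # assumed module path

metric = CorefHoiF1(
    relation_layer="binary_coref_relations",  # assumption: depends on the document type
    include_singletons=True,
    link_relation_relation_score_threshold=0.5,  # keep only confident predicted links
)

for doc in documents:  # `documents`: iterable of documents that already carry predictions
    metric(doc)

# average F1 over the simplified MUC, B-cubed and CEAF-e evaluators
print(metric.compute())
```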
src/metrics/coref_sklearn.py CHANGED
@@ -1,15 +1,13 @@
1
  import logging
2
  import math
3
- from typing import Any, Callable, Dict, List, Optional, Union
4
 
5
  import numpy as np
6
- import torch
7
  from pandas import MultiIndex
8
- from pie_modules.documents import TextPairDocumentWithLabeledSpansAndBinaryCorefRelations
9
- from pytorch_ie import DocumentMetric
10
  from pytorch_ie.core.metric import T
11
  from pytorch_ie.utils.hydra import resolve_target
12
- from torchmetrics import Metric, MetricCollection
13
 
14
  from src.hydra_callbacks.save_job_return_value import to_py_obj
15
 
@@ -24,6 +22,14 @@ def get_num_positives(targets: List[int], preds: List[float], positive_idx: int
24
  return len([v for v in targets if v == positive_idx])
25
 
26
 
 
 
 
 
 
 
 
 
27
  def discretize(
28
  values: List[float], threshold: Union[float, List[float], dict]
29
  ) -> Union[List[float], Dict[Any, List[float]]]:
@@ -40,20 +46,97 @@ def discretize(
40
  raise TypeError(f"threshold has unknown type: {threshold}")
41
 
42
 
43
- class CorefMetricsSKLearn(DocumentMetric):
44
- DOCUMENT_TYPE = TextPairDocumentWithLabeledSpansAndBinaryCorefRelations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def __init__(
47
  self,
48
  metrics: Dict[str, str],
 
 
49
  thresholds: Optional[Dict[str, float]] = None,
50
  default_target_idx: int = 0,
51
  default_prediction_score: float = 0.0,
52
  show_as_markdown: bool = False,
53
  markdown_precision: int = 4,
54
- plot: bool = False,
 
 
 
 
 
55
  ):
56
- self.metrics = {name: resolve_target(metric) for name, metric in metrics.items()}
57
  self.thresholds = thresholds or {}
58
  thresholds_not_in_metrics = {
59
  name: t for name, t in self.thresholds.items() if name not in self.metrics
@@ -62,11 +145,25 @@ class CorefMetricsSKLearn(DocumentMetric):
62
  logger.warning(
63
  f"there are discretizing thresholds that do not have a metric: {thresholds_not_in_metrics}"
64
  )
 
 
65
  self.default_target_idx = default_target_idx
66
  self.default_prediction_score = default_prediction_score
67
  self.show_as_markdown = show_as_markdown
68
  self.markdown_precision = markdown_precision
69
- self.plot = plot
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  super().__init__()
72
 
@@ -74,50 +171,55 @@ class CorefMetricsSKLearn(DocumentMetric):
74
  self._preds: List[float] = []
75
  self._targets: List[int] = []
76
 
77
- def _update(self, document: TextPairDocumentWithLabeledSpansAndBinaryCorefRelations) -> None:
78
- target_args2idx = {
79
- (rel.head, rel.tail): int(rel.score) for rel in document.binary_coref_relations
 
 
 
80
  }
81
- prediction_args2score = {
82
- (rel.head, rel.tail): rel.score for rel in document.binary_coref_relations.predictions
 
 
83
  }
84
- all_args = set(target_args2idx) | set(prediction_args2score)
85
  all_targets: List[int] = []
86
  all_predictions: List[float] = []
87
  for args in all_args:
88
- target_idx = target_args2idx.get(args, self.default_target_idx)
89
- prediction_score = prediction_args2score.get(args, self.default_prediction_score)
90
  all_targets.append(target_idx)
91
  all_predictions.append(prediction_score)
92
- # prediction_scores = torch.tensor(all_predictions)
93
- # target_indices = torch.tensor(all_targets)
94
- # self.metrics.update(preds=prediction_scores, target=target_indices)
95
  self._preds.extend(all_predictions)
96
  self._targets.extend(all_targets)
97
 
98
- def do_plot(self):
99
- raise NotImplementedError()
100
 
101
  from matplotlib import pyplot as plt
102
 
103
  # Get the number of metrics
104
- num_metrics = len(self.metrics)
105
 
106
  # Calculate rows and columns for subplots (aim for a square-like layout)
107
- ncols = math.ceil(math.sqrt(num_metrics))
108
- nrows = math.ceil(num_metrics / ncols)
109
 
110
  # Create the subplots
111
  fig, ax_list = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 10))
112
 
113
  # Flatten the ax_list if necessary (in case of multiple rows/columns)
114
- ax_list = ax_list.flatten().tolist() # Ensure it's a list, and flatten it if necessary
 
 
 
115
 
116
- # Ensure that we pass exactly the number of axes required by metrics
117
- ax_list = ax_list[:num_metrics]
118
-
119
- # Plot the metrics using the list of axes
120
- self.metrics.plot(ax=ax_list, together=False)
121
 
122
  # Adjust layout to avoid overlapping plots
123
  plt.tight_layout()
@@ -125,23 +227,35 @@ class CorefMetricsSKLearn(DocumentMetric):
125
 
126
  def _compute(self) -> T:
127
 
128
- if self.plot:
129
- self.do_plot()
130
 
131
  result = {}
132
  for name, metric in self.metrics.items():
133
 
134
  if name in self.thresholds:
135
- preds = discretize(values=self._preds, threshold=self.thresholds[name])
 
 
 
 
 
 
 
 
 
 
136
  else:
137
  preds = self._preds
138
- if isinstance(preds, dict):
139
- metric_results = {
140
- t: metric(self._targets, t_preds) for t, t_preds in preds.items()
141
- }
142
- # just get the max
143
- max_t, max_v = max(metric_results.items(), key=lambda k_v: k_v[1])
144
- result[f"{name}-{max_t}"] = max_v
 
 
145
  else:
146
  result[name] = metric(self._targets, preds)
147
 
@@ -149,7 +263,8 @@ class CorefMetricsSKLearn(DocumentMetric):
149
  if self.show_as_markdown:
150
  import pandas as pd
151
 
152
- series = pd.Series(result)
 
153
  if isinstance(series.index, MultiIndex):
154
  if len(series.index.levels) > 1:
155
  # in fact, this is not a series anymore
 
1
  import logging
2
  import math
3
+ from typing import Any, Callable, Dict, List, Optional, Union, overload
4
 
5
  import numpy as np
 
6
  from pandas import MultiIndex
7
+ from pie_modules.utils import flatten_dict
8
+ from pytorch_ie import Document, DocumentMetric
9
  from pytorch_ie.core.metric import T
10
  from pytorch_ie.utils.hydra import resolve_target
 
11
 
12
  from src.hydra_callbacks.save_job_return_value import to_py_obj
13
 
 
22
  return len([v for v in targets if v == positive_idx])
23
 
24
 
25
+ @overload
26
+ def discretize(values: List[float], threshold: float) -> List[float]: ...
27
+
28
+
29
+ @overload
30
+ def discretize(values: List[float], threshold: List[float]) -> Dict[Any, List[float]]: ...
31
+
32
+
33
  def discretize(
34
  values: List[float], threshold: Union[float, List[float], dict]
35
  ) -> Union[List[float], Dict[Any, List[float]]]:
 
46
  raise TypeError(f"threshold has unknown type: {threshold}")
47
 
48
 
49
+ def get_metric_func(name: str) -> Callable:
50
+ if name.endswith("_curve"):
51
+ from sklearn.metrics import auc
52
+
53
+ base_func = resolve_target(name)
54
+
55
+ def wrapper(targets: List[int], preds: List[float], **kwargs):
56
+ x, y, thresholds = base_func(targets, preds, **kwargs)
57
+ return auc(y, x)
58
+
59
+ return wrapper
60
+ else:
61
+ return resolve_target(name)
62
+
63
+
64
+ def bootstrap(
65
+ metric_fn: Callable[[List[int], Union[List[int], List[float]]], float],
66
+ targets: List[int],
67
+ predictions: Union[List[int], List[float]],
68
+ n: int = 1_000,
69
+ random_state: int | None = None,
70
+ alpha: float = 0.95,
71
+ ) -> Dict[str, float]:
72
+ """
73
+ Returns mean and a two–sided (1–alpha) bootstrap CI for any
74
+ pair-wise classification or ranking metric.
75
+
76
+ Parameters
77
+ ----------
78
+ metric_fn Metric function taking (targets, prediction) lists.
79
+ targets Ground-truth 0/1 labels.
80
+ prediction Scores or hard predictions (same length as `targets`).
81
+ n Number of bootstrap replicates (after skipping degenerate ones).
82
+ random_state Seed for reproducibility.
83
+ alpha Confidence level (default 0.95 → 95 % CI).
84
+
85
+ Notes
86
+ -----
87
+ * A replicate that contains only one class is discarded
88
+ because many sklearn metrics are undefined in that case.
89
+ * If all replicates are discarded an exception is raised.
90
+ """
91
+ y = np.asarray(targets)
92
+ yhat = np.asarray(predictions)
93
+ if y.shape[0] != yhat.shape[0]:
94
+ raise ValueError("`targets` and `prediction` must have the same length")
95
+
96
+ rng = np.random.default_rng(random_state)
97
+ idx = np.arange(y.shape[0])
98
+ vals_list: list[float] = []
99
+
100
+ while len(vals_list) < n:
101
+ sample_idx = rng.choice(idx, size=idx.shape[0], replace=True)
102
+ y_samp, yhat_samp = y[sample_idx], yhat[sample_idx]
103
+
104
+ # skip all-positive or all-negative bootstrap samples
105
+ if y_samp.min() == y_samp.max():
106
+ continue
107
+
108
+ vals_list.append(metric_fn(y_samp.tolist(), yhat_samp.tolist()))
109
+
110
+ if not vals_list:
111
+ raise RuntimeError("No valid bootstrap replicate contained both classes.")
112
+
113
+ vals = np.asarray(vals_list, dtype=float)
114
+ lower = np.percentile(vals, (1 - alpha) / 2 * 100)
115
+ upper = np.percentile(vals, (1 + alpha) / 2 * 100)
116
+
117
+ return {"mean": float(vals.mean()), "low": float(lower), "high": float(upper)}
118
+
119
+
120
+ class BinaryClassificationMetricsSKLearn(DocumentMetric):
121
 
122
  def __init__(
123
  self,
124
  metrics: Dict[str, str],
125
+ layer: str,
126
+ label: Optional[str] = None,
127
  thresholds: Optional[Dict[str, float]] = None,
128
  default_target_idx: int = 0,
129
  default_prediction_score: float = 0.0,
130
  show_as_markdown: bool = False,
131
  markdown_precision: int = 4,
132
+ bootstrap: Optional[list[str]] = None,
133
+ bootstrap_n: int = 1_000,
134
+ bootstrap_random_state: int | None = None,
135
+ bootstrap_alpha: float = 0.95,
136
+ create_plots: bool = True,
137
+ plots: Optional[Dict[str, str]] = None,
138
  ):
139
+ self.metrics = {name: get_metric_func(metric) for name, metric in metrics.items()}
140
  self.thresholds = thresholds or {}
141
  thresholds_not_in_metrics = {
142
  name: t for name, t in self.thresholds.items() if name not in self.metrics
 
145
  logger.warning(
146
  f"there are discretizing thresholds that do not have a metric: {thresholds_not_in_metrics}"
147
  )
148
+ self.annotation_layer_name = layer
149
+ self.annotation_label = label
150
  self.default_target_idx = default_target_idx
151
  self.default_prediction_score = default_prediction_score
152
  self.show_as_markdown = show_as_markdown
153
  self.markdown_precision = markdown_precision
154
+ if create_plots:
155
+ self.plots = {
156
+ name: resolve_target(plot_func) for name, plot_func in (plots or {}).items()
157
+ }
158
+ else:
159
+ self.plots = {}
160
+
161
+ self.bootstrap = set(bootstrap or [])
162
+ self.bootstrap_kwargs = {
163
+ "n": bootstrap_n,
164
+ "random_state": bootstrap_random_state,
165
+ "alpha": bootstrap_alpha,
166
+ }
167
 
168
  super().__init__()
169
 
 
171
  self._preds: List[float] = []
172
  self._targets: List[int] = []
173
 
174
+ def _update(self, document: Document) -> None:
175
+ annotation_layer = document[self.annotation_layer_name]
176
+ target2idx = {
177
+ ann: int(ann.score)
178
+ for ann in annotation_layer
179
+ if self.annotation_label is None or ann.label == self.annotation_label
180
  }
181
+ prediction2score = {
182
+ ann: ann.score
183
+ for ann in annotation_layer.predictions
184
+ if self.annotation_label is None or ann.label == self.annotation_label
185
  }
186
+ all_args = set(target2idx) | set(prediction2score)
187
  all_targets: List[int] = []
188
  all_predictions: List[float] = []
189
  for args in all_args:
190
+ target_idx = target2idx.get(args, self.default_target_idx)
191
+ prediction_score = prediction2score.get(args, self.default_prediction_score)
192
  all_targets.append(target_idx)
193
  all_predictions.append(prediction_score)
194
+
 
 
195
  self._preds.extend(all_predictions)
196
  self._targets.extend(all_targets)
197
 
198
+ def create_plots(self):
 
199
 
200
  from matplotlib import pyplot as plt
201
 
202
  # Get the number of metrics
203
+ num_plots = len(self.plots)
204
 
205
  # Calculate rows and columns for subplots (aim for a square-like layout)
206
+ ncols = math.ceil(math.sqrt(num_plots))
207
+ nrows = math.ceil(num_plots / ncols)
208
 
209
  # Create the subplots
210
  fig, ax_list = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 10))
211
 
212
  # Flatten the ax_list if necessary (in case of multiple rows/columns)
213
+ if num_plots > 1:
214
+ ax_list = ax_list.flatten().tolist() # Ensure it's a list, and flatten it if necessary
215
+ else:
216
+ ax_list = [ax_list]
217
 
218
+ # Create each plot
219
+ for ax, (name, plot_func) in zip(ax_list, self.plots.items()):
220
+ # Set the title for each subplot
221
+ ax.set_title(name)
222
+ plot_func(y_true=self._targets, y_pred=self._preds, ax=ax)
223
 
224
  # Adjust layout to avoid overlapping plots
225
  plt.tight_layout()
 
227
 
228
  def _compute(self) -> T:
229
 
230
+ if len(self.plots) > 0:
231
+ self.create_plots()
232
 
233
  result = {}
234
  for name, metric in self.metrics.items():
235
 
236
  if name in self.thresholds:
237
+ preds_dict = discretize(values=self._preds, threshold=self.thresholds[name])
238
+ if isinstance(preds_dict, dict):
239
+ metric_results = {
240
+ t: metric(self._targets, t_preds) for t, t_preds in preds_dict.items()
241
+ }
242
+ # just get the max
243
+ max_t, max_v = max(metric_results.items(), key=lambda k_v: k_v[1])
244
+ result[f"{name}_threshold"] = max_t
245
+ preds = discretize(values=self._preds, threshold=max_t)
246
+ else:
247
+ preds = preds_dict
248
  else:
249
  preds = self._preds
250
+
251
+ if name in self.bootstrap:
252
+ # bootstrap the metric
253
+ result[name] = bootstrap(
254
+ metric_fn=metric,
255
+ targets=self._targets,
256
+ predictions=preds,
257
+ **self.bootstrap_kwargs, # type: ignore
258
+ )
259
  else:
260
  result[name] = metric(self._targets, preds)
261
 
 
263
  if self.show_as_markdown:
264
  import pandas as pd
265
 
266
+ result_flat = flatten_dict(result)
267
+ series = pd.Series(result_flat)
268
  if isinstance(series.index, MultiIndex):
269
  if len(series.index.levels) > 1:
270
  # in fact, this is not a series anymore
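
For orientation, a hedged configuration sketch for the renamed `BinaryClassificationMetricsSKLearn` (formerly `CorefMetricsSKLearn`). The sklearn function paths exist as written; the layer name and the particular metric, threshold and bootstrap choices are assumptions.

```python
# Hedged sketch, not part of the PR.
from src.metrics.coref_sklearn import BinaryClassificationMetricsSKLearn

metric = BinaryClassificationMetricsSKLearn(
    layer="binary_coref_relations",  # assumption: any layer with scored binary annotations
    metrics={
        "auroc": "sklearn.metrics.roc_auc_score",
        "auc_pr": "sklearn.metrics.precision_recall_curve",  # *_curve entries are wrapped into an AUC
        "f1": "sklearn.metrics.f1_score",
    },
    thresholds={"f1": 0.5},   # scores are discretized before calling f1_score
    bootstrap=["auroc"],      # report mean plus a 95% bootstrap confidence interval
    bootstrap_n=1000,
    show_as_markdown=True,
)
```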
src/metrics/f1_with_bootstrapping.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
+ from collections import defaultdict
2
+ from functools import partial
3
+ from typing import Callable, Collection, Dict, Hashable, List, Optional, Set, Tuple
4
+
5
+ import pandas as pd
+ from pie_modules.metrics import F1Metric
6
+ from pytorch_ie import Annotation, Document
7
+
+ logger = logging.getLogger(__name__)
+
8
+ def has_one_of_the_labels(ann: Annotation, label_field: str, labels: Collection[str]) -> bool:
9
+ return getattr(ann, label_field) in labels
10
+
11
+
12
+ def has_this_label(ann: Annotation, label_field: str, label: str) -> bool:
13
+ return getattr(ann, label_field) == label
14
+
15
+
16
+ class F1WithBootstrappingMetric(F1Metric):
17
+ def __init__(self, *args, bootstrap_n: int = 0, **kwargs):
18
+ super().__init__(*args, **kwargs)
19
+ self.bootstrap_n = bootstrap_n
20
+
21
+
22
+ def reset(self) -> None:
23
+ self.tp: Dict[str, Set[Annotation]] = defaultdict(set)
24
+ self.fp: Dict[str, Set[Annotation]] = defaultdict(set)
25
+ self.fn: Dict[str, Set[Annotation]] = defaultdict(set)
26
+
27
+ def calculate_tp_fp_fn(
28
+ self,
29
+ document: Document,
30
+ annotation_filter: Optional[Callable[[Annotation], bool]] = None,
31
+ annotation_processor: Optional[Callable[[Annotation], Hashable]] = None,
32
+ ) -> Tuple[Set[Annotation], Set[Annotation], Set[Annotation]]:
33
+ annotation_processor = annotation_processor or (lambda ann: ann)
34
+ annotation_filter = annotation_filter or (lambda ann: True)
35
+ predicted_annotations = {
36
+ annotation_processor(ann)
37
+ for ann in document[self.layer].predictions
38
+ if annotation_filter(ann)
39
+ }
40
+ gold_annotations = {
41
+ annotation_processor(ann) for ann in document[self.layer] if annotation_filter(ann)
42
+ }
43
+ return predicted_annotations & gold_annotations, predicted_annotations - gold_annotations, gold_annotations - predicted_annotations
44
+
45
+
46
+ def add_tp_fp_fn(self, tp: Set[Annotation], fp: Set[Annotation], fn: Set[Annotation], label: str) -> None:
47
+ self.tp[label].update(tp)
48
+ self.fp[label].update(fp)
49
+ self.fn[label].update(fn)
50
+
51
+ def _update(self, document: Document) -> None:
52
+ new_values = self.calculate_tp_fp_fn(
53
+ document=document,
54
+ annotation_filter=(
55
+ partial(has_one_of_the_labels, label_field=self.label_field, labels=self.labels)
56
+ if self.per_label and not self.infer_labels
57
+ else None
58
+ ),
59
+ annotation_processor=self.annotation_processor,
60
+ )
61
+ self.add_tp_fp_fn(*new_values, label="MICRO")
62
+ if self.infer_labels:
63
+ layer = document[self.layer]
64
+ # collect labels from gold data and predictions
65
+ for ann in list(layer) + list(layer.predictions):
66
+ label = getattr(ann, self.label_field)
67
+ if label not in self.labels:
68
+ self.labels.append(label)
69
+ if self.per_label:
70
+ for label in self.labels:
71
+ new_values = self.calculate_tp_fp_fn(
72
+ document=document,
73
+ annotation_filter=partial(
74
+ has_this_label, label_field=self.label_field, label=label
75
+ ),
76
+ annotation_processor=self.annotation_processor,
77
+ )
78
+ self.add_tp_fp_fn(*new_values, label=label)
79
+
80
+ def _compute(self) -> Dict[str, Dict[str, float]]:
81
+ res = dict()
82
+ if self.per_label:
83
+ res["MACRO"] = {"f1": 0.0, "p": 0.0, "r": 0.0}
84
+ for label in self.tp.keys():
85
+ tp, fp, fn = (
86
+ len(self.tp[label]),
87
+ len(self.fp[label]),
88
+ len(self.fn[label]),
89
+ )
90
+ if tp == 0:
91
+ p, r, f1 = 0.0, 0.0, 0.0
92
+ else:
93
+ p = tp / (tp + fp)
94
+ r = tp / (tp + fn)
95
+ f1 = 2 * p * r / (p + r)
96
+ res[label] = {"f1": f1, "p": p, "r": r, "s": tp + fn}
97
+ if self.per_label and label in self.labels:
98
+ res["MACRO"]["f1"] += f1 / len(self.labels)
99
+ res["MACRO"]["p"] += p / len(self.labels)
100
+ res["MACRO"]["r"] += r / len(self.labels)
101
+ if self.show_as_markdown:
102
+ logger.info(f"\n{self.layer}:\n{pd.DataFrame(res).round(3).T.to_markdown()}")
103
+ return res
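
Unlike a plain count-based F1, this variant keeps the concrete TP/FP/FN annotation sets per label, presumably so that the stored `bootstrap_n` can later drive resampled confidence intervals. The final scores still reduce to the usual count arithmetic, e.g.:

```python
# Self-contained illustration of the precision/recall/F1 arithmetic in _compute()
# (the counts are made-up numbers, not taken from the PR).
tp, fp, fn = 8, 2, 4

p = tp / (tp + fp)        # 0.8
r = tp / (tp + fn)        # ~0.667
f1 = 2 * p * r / (p + r)  # ~0.727
print(round(p, 3), round(r, 3), round(f1, 3))
```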
src/metrics/f1_with_threshold.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Hashable, Optional, Tuple
2
+
3
+ from pie_modules.metrics import F1Metric
4
+ from pytorch_ie import Annotation, Document
5
+
6
+
7
+ class F1WithThresholdMetric(F1Metric):
8
+ def __init__(self, *args, threshold: float = 0.0, **kwargs):
9
+ super().__init__(*args, **kwargs)
10
+ self.threshold = threshold
11
+
12
+ def calculate_counts(
13
+ self,
14
+ document: Document,
15
+ annotation_filter: Optional[Callable[[Annotation], bool]] = None,
16
+ annotation_processor: Optional[Callable[[Annotation], Hashable]] = None,
17
+ ) -> Tuple[int, int, int]:
18
+ annotation_processor = annotation_processor or (lambda ann: ann)
19
+ annotation_filter = annotation_filter or (lambda ann: True)
20
+ predicted_annotations = {
21
+ annotation_processor(ann)
22
+ for ann in document[self.layer].predictions
23
+ if annotation_filter(ann) and getattr(ann, "score", 0.0) >= self.threshold
24
+ }
25
+ gold_annotations = {
26
+ annotation_processor(ann)
27
+ for ann in document[self.layer]
28
+ if annotation_filter(ann) and getattr(ann, "score", 0.0) >= self.threshold
29
+ }
30
+ tp = len([ann for ann in predicted_annotations & gold_annotations])
31
+ fn = len([ann for ann in gold_annotations - predicted_annotations])
32
+ fp = len([ann for ann in predicted_annotations - gold_annotations])
33
+ return tp, fp, fn
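
A hedged usage sketch: the only new argument is `threshold`, which drops low-score annotations on both the gold and the predicted side before counting. The remaining arguments are inherited from pie_modules' `F1Metric`, so their exact names are assumed here.

```python
# Hedged sketch, not part of the PR.
from src.metrics.f1_with_threshold import F1WithThresholdMetric

metric = F1WithThresholdMetric(
    layer="binary_relations",      # assumption: name of the relation layer
    labels=["semantically_same"],  # assumption: inherited F1Metric argument
    threshold=0.9,                 # only annotations with score >= 0.9 are counted
)
```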
src/metrics/ranking_sklearn.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import Callable, Dict, List, Optional, Sequence, Union
4
+
5
+ from pandas import MultiIndex
6
+ from pytorch_ie import Annotation, AnnotationLayer, Document, DocumentMetric
7
+ from pytorch_ie.annotations import BinaryRelation
8
+ from pytorch_ie.core.metric import T
9
+ from pytorch_ie.utils.hydra import resolve_target
10
+
11
+ from src.hydra_callbacks.save_job_return_value import to_py_obj
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class RankingMetricsSKLearn(DocumentMetric):
17
+ """Ranking metrics for documents with binary relations.
18
+
19
+ This metric computes the ranking metrics for retrieval tasks, where
20
+ relation heads are the queries and the relation tails are the candidates.
21
+ The metric is computed for each head and the results are averaged. It is meant to
22
+ be used with Scikit-learn metrics such as `sklearn.metrics.ndcg_score` (Normalized
23
+ Discounted Cumulative Gain), `sklearn.metrics.label_ranking_average_precision_score`
24
+ (LRAP), etc., see
25
+ https://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics.
26
+
27
+ Args:
28
+ metrics (Dict[str, Union[str, Callable]]): A dictionary of metric names and their
29
+ corresponding functions. The function can be a string (name of the function, e.g.,
30
+ sklearn.metrics.ndcg_score) or a callable.
31
+ layer (str): The name of the annotation layer containing the binary relations, e.g.,
32
+ "binary_relations" when applied to TextDocumentsWithLabeledSpansAndBinaryRelations.
33
+ use_manual_average (Optional[List[str]]): A list of metric names to use for manual
34
+ averaging. If provided, the metric scores will be calculated for each
35
+ head and then averaged. Otherwise, all true and predicted scores will be
36
+ passed to the metric function at once.
37
+ exclude_singletons (Optional[List[str]]): A list of metric names to exclude singletons
38
+ from the computation, i.e., entries (heads) where the number of candidates is 1.
39
+ label (Optional[str]): If provided, only the relations with this label will be used
40
+ to compute the metrics. This is useful for filtering out relations that are not
41
+ relevant for the task at hand (e.g., when having multiple relation types in the
42
+ same layer).
43
+ score_threshold (float): If provided, only the relations with a score greater than or
44
+ equal to this threshold will be used to compute the metrics.
45
+ default_score (float): The default score to use for missing relations, either in the
46
+ target or prediction. Default is 0.0.
47
+ use_all_spans (bool): Whether to consider all spans in the document as queries and
48
+ candidates or only the spans that are present in the target and prediction.
49
+ span_label_blacklist (Optional[List[str]]): If provided, ignore the relations with
50
+ heads/tails that are in this list. When using use_all_spans=True, this also
51
+ restricts the candidates to those that are not in the blacklist.
52
+ show_as_markdown (bool): Whether to show the results as markdown. Default is False.
53
+ markdown_precision (int): The precision for displaying the results in markdown.
54
+ Default is 4.
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ metrics: Dict[str, Union[str, Callable]],
60
+ layer: str,
61
+ use_manual_average: Optional[List[str]] = None,
62
+ exclude_singletons: Optional[List[str]] = None,
63
+ label: Optional[str] = None,
64
+ score_threshold: float = 0.0,
65
+ default_score: float = 0.0,
66
+ use_all_spans: bool = False,
67
+ span_label_blacklist: Optional[List[str]] = None,
68
+ show_as_markdown: bool = False,
69
+ markdown_precision: int = 4,
70
+ plot: bool = False,
71
+ ):
72
+ self.metrics = {
73
+ name: resolve_target(metric) if isinstance(metric, str) else metric
74
+ for name, metric in metrics.items()
75
+ }
76
+ self.use_manual_average = set(use_manual_average or [])
77
+ self.exclude_singletons = set(exclude_singletons or [])
78
+ self.annotation_layer_name = layer
79
+ self.annotation_label = label
80
+ self.score_threshold = score_threshold
81
+ self.default_score = default_score
82
+ self.use_all_spans = use_all_spans
83
+ self.span_label_blacklist = span_label_blacklist
84
+ self.show_as_markdown = show_as_markdown
85
+ self.markdown_precision = markdown_precision
86
+ self.plot = plot
87
+
88
+ super().__init__()
89
+
90
+ def reset(self) -> None:
91
+ self._preds: List[List[float]] = []
92
+ self._targets: List[List[float]] = []
93
+
94
+ def get_head2tail2score(
95
+ self, relations: Sequence[BinaryRelation]
96
+ ) -> Dict[Annotation, Dict[Annotation, float]]:
97
+ result: Dict[Annotation, Dict[Annotation, float]] = defaultdict(dict)
98
+ for rel in relations:
99
+ if (
100
+ (self.annotation_label is None or rel.label == self.annotation_label)
101
+ and (rel.score >= self.score_threshold)
102
+ and (
103
+ self.span_label_blacklist is None
104
+ or (
105
+ rel.head.label not in self.span_label_blacklist
106
+ and rel.tail.label not in self.span_label_blacklist
107
+ )
108
+ )
109
+ ):
110
+ result[rel.head][rel.tail] = rel.score
111
+
112
+ return result
113
+
114
+ def _update(self, document: Document) -> None:
115
+ annotation_layer: AnnotationLayer[BinaryRelation] = document[self.annotation_layer_name]
116
+
117
+ target_head2tail2score = self.get_head2tail2score(annotation_layer)
118
+ prediction_head2tail2score = self.get_head2tail2score(annotation_layer.predictions)
119
+ all_spans = set()
120
+ # get spans from all layers targeted by the annotation (relation) layer
121
+ for span_layer in annotation_layer.target_layers.values():
122
+ all_spans.update(span_layer)
123
+
124
+ if self.span_label_blacklist is not None:
125
+ all_spans = {span for span in all_spans if span.label not in self.span_label_blacklist}
126
+
127
+ if self.use_all_spans:
128
+ all_heads = all_spans
129
+ else:
130
+ all_heads = set(target_head2tail2score) | set(prediction_head2tail2score)
131
+
132
+ all_targets: List[List[float]] = []
133
+ all_predictions: List[List[float]] = []
134
+ for head in all_heads:
135
+ target_tail2score = target_head2tail2score.get(head, {})
136
+ prediction_tail2score = prediction_head2tail2score.get(head, {})
137
+ if self.use_all_spans:
138
+ # use all spans as tails
139
+ tails = set(span for span in all_spans if span != head)
140
+ else:
141
+ # use only the tails that are in the target or prediction
142
+ tails = set(target_tail2score) | set(prediction_tail2score)
143
+ target_scores = [target_tail2score.get(t, self.default_score) for t in tails]
144
+ prediction_scores = [prediction_tail2score.get(t, self.default_score) for t in tails]
145
+ all_targets.append(target_scores)
146
+ all_predictions.append(prediction_scores)
147
+
148
+ self._targets.extend(all_targets)
149
+ self._preds.extend(all_predictions)
150
+
151
+ def do_plot(self):
152
+ raise NotImplementedError()
153
+
154
+ def _compute(self) -> T:
155
+
156
+ if self.plot:
157
+ self.do_plot()
158
+
159
+ result = {}
160
+ for name, metric in self.metrics.items():
161
+ targets, preds = self._targets, self._preds
162
+ if name in self.exclude_singletons:
163
+ targets = [t for t in targets if len(t) > 1]
164
+ preds = [p for p in preds if len(p) > 1]
165
+ num_singletons = len(self._targets) - len(targets)
166
+ logger.warning(
167
+ f"Excluding {num_singletons} singletons (out of {len(self._targets)} "
168
+ f"entries) from {name} metric calculation."
169
+ )
170
+
171
+ if name in self.use_manual_average:
172
+ scores = [
173
+ metric(y_true=[tgts], y_score=[prds]) for tgts, prds in zip(targets, preds)
174
+ ]
175
+ result[name] = sum(scores) / len(scores) if len(scores) > 0 else 0.0
176
+ else:
177
+ result[name] = metric(y_true=targets, y_score=preds)
178
+
179
+ result = to_py_obj(result)
180
+ if self.show_as_markdown:
181
+ import pandas as pd
182
+
183
+ series = pd.Series(result)
184
+ if isinstance(series.index, MultiIndex):
185
+ if len(series.index.levels) > 1:
186
+ # in fact, this is not a series anymore
187
+ series = series.unstack(-1)
188
+ else:
189
+ series.index = series.index.get_level_values(0)
190
+ logger.info(
191
+ f"{self.current_split}\n{series.round(self.markdown_precision).to_markdown()}"
192
+ )
193
+ return result
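
A hedged configuration sketch for `RankingMetricsSKLearn`. The sklearn function paths exist as written; the layer name and the decision to average per query are illustrative assumptions (manual averaging avoids passing ragged per-query candidate lists to sklearn in a single call).

```python
# Hedged sketch, not part of the PR.
from src.metrics.ranking_sklearn import RankingMetricsSKLearn

metric = RankingMetricsSKLearn(
    layer="binary_relations",  # assumption: heads act as queries, tails as candidates
    metrics={
        "ndcg": "sklearn.metrics.ndcg_score",
        "lrap": "sklearn.metrics.label_ranking_average_precision_score",
    },
    use_manual_average=["ndcg", "lrap"],  # score each query separately, then average
    exclude_singletons=["ndcg"],          # skip queries with a single candidate
    score_threshold=0.5,
    show_as_markdown=True,
)
```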
src/metrics/score_distribution.py CHANGED
@@ -1,9 +1,12 @@
 
1
  from collections import defaultdict
2
  from typing import Any, Dict, List, Optional, Tuple
3
 
4
  import pandas as pd
5
  from pytorch_ie import Document, DocumentMetric
6
 
 
 
7
 
8
  class ScoreDistribution(DocumentMetric):
9
  """Computes the distribution of prediction scores for annotations in a layer. The scores are
@@ -36,7 +39,8 @@ class ScoreDistribution(DocumentMetric):
36
  plotly_use_create_distplot: bool = True,
37
  plotly_barmode: Optional[str] = None,
38
  plotly_marginal: Optional[str] = "violin",
39
- plotly_font_size: int = 18,
 
40
  plotly_font_family: Optional[str] = None,
41
  plotly_background_color: Optional[str] = None,
42
  ):
@@ -52,7 +56,12 @@ class ScoreDistribution(DocumentMetric):
52
  self.plotly_use_create_distplot = plotly_use_create_distplot
53
  self.plotly_barmode = plotly_barmode
54
  self.plotly_marginal = plotly_marginal
55
- self.plotly_font_size = plotly_font_size
 
 
 
 
 
56
  self.plotly_font_family = plotly_font_family
57
  self.plotly_background_color = plotly_background_color
58
  self.scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
@@ -231,7 +240,7 @@ class ScoreDistribution(DocumentMetric):
231
  width=800,
232
  title_text=description,
233
  title_x=0.5,
234
- font=dict(size=self.plotly_font_size),
235
  legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
236
  )
237
  if self.plotly_barmode is not None:
@@ -290,7 +299,7 @@ class ScoreDistribution(DocumentMetric):
290
  width=800,
291
  title_text=f"Mean Binned Scores for {self.mapped_layer}",
292
  title_x=0.5,
293
- font=dict(size=self.plotly_font_size),
294
  )
295
  fig.update_layout(
296
  legend=dict(
 
1
+ import logging
2
  from collections import defaultdict
3
  from typing import Any, Dict, List, Optional, Tuple
4
 
5
  import pandas as pd
6
  from pytorch_ie import Document, DocumentMetric
7
 
8
+ logger = logging.getLogger()
9
+
10
 
11
  class ScoreDistribution(DocumentMetric):
12
  """Computes the distribution of prediction scores for annotations in a layer. The scores are
 
39
  plotly_use_create_distplot: bool = True,
40
  plotly_barmode: Optional[str] = None,
41
  plotly_marginal: Optional[str] = "violin",
42
+ plotly_font: Optional[Dict[str, Any]] = None,
43
+ plotly_font_size: Optional[int] = None,
44
  plotly_font_family: Optional[str] = None,
45
  plotly_background_color: Optional[str] = None,
46
  ):
 
56
  self.plotly_use_create_distplot = plotly_use_create_distplot
57
  self.plotly_barmode = plotly_barmode
58
  self.plotly_marginal = plotly_marginal
59
+ self.plotly_font = plotly_font or {}
60
+ if plotly_font_size is not None:
61
+ logger.warning(
62
+ "Parameter 'plotly_font_size' is deprecated. Use 'plotly_font' with 'size' key instead."
63
+ )
64
+ self.plotly_font["size"] = plotly_font_size
65
  self.plotly_font_family = plotly_font_family
66
  self.plotly_background_color = plotly_background_color
67
  self.scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
 
240
  width=800,
241
  title_text=description,
242
  title_x=0.5,
243
+ font=self.plotly_font,
244
  legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
245
  )
246
  if self.plotly_barmode is not None:
 
299
  width=800,
300
  title_text=f"Mean Binned Scores for {self.mapped_layer}",
301
  title_x=0.5,
302
+ font=self.plotly_font,
303
  )
304
  fig.update_layout(
305
  legend=dict(
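
The deprecation shim for `plotly_font_size` simply folds the scalar into the new `plotly_font` dict that is later passed to `update_layout(font=...)`; a small self-contained sketch of the resulting behaviour:

```python
# Illustration of the __init__ logic above (values are made up).
plotly_font = {"family": "Serif"}  # new-style argument
plotly_font_size = 18              # deprecated argument, still accepted with a warning

if plotly_font_size is not None:
    plotly_font["size"] = plotly_font_size  # the scalar overrides/extends the dict

print(plotly_font)  # {'family': 'Serif', 'size': 18}
```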
src/metrics/semantically_same_ranking.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import warnings
3
+ from collections import defaultdict
4
+ from functools import partial
5
+ from typing import Callable, Iterable, List, Optional, Set, Tuple
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from pytorch_ie import DocumentMetric
10
+ from pytorch_ie.annotations import BinaryRelation
11
+ from sklearn.metrics import average_precision_score, ndcg_score
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ NEG_INF = -1e9 # smaller than any real score
16
+
17
+ # metrics
18
+
19
+
20
+ def true_mrr(y_true: np.ndarray, y_score: np.ndarray, k: int | None = None) -> float:
21
+ """
22
+ Macro MRR over *all* queries.
23
+ • Reciprocal rank is 0 when a query has no relevant item.
24
+ • If k is given, restrict the search to the top-k list.
25
+ """
26
+ if y_true.size == 0:
27
+ return np.nan
28
+
29
+ rr = []
30
+ for t, s in zip(y_true, y_score):
31
+ if t.sum() == 0:
32
+ rr.append(0.0)
33
+ continue
34
+
35
+ order = np.argsort(-s)
36
+ if k is not None:
37
+ order = order[:k]
38
+
39
+ # first position where t == 1, +1 for 1-based rank
40
+ first_hit = np.flatnonzero(t[order] > 0)
41
+ rank = first_hit[0] + 1 if first_hit.size else np.inf
42
+ rr.append(0.0 if np.isinf(rank) else 1.0 / rank)
43
+
44
+ return np.mean(rr)
45
+
46
+
47
+ def macro_ndcg(y_true: np.ndarray, y_score: np.ndarray, k: int | None = None) -> float:
48
+ """
49
+ Macro NDCG@k over all queries.
50
+
51
+ ndcg_score returns 0 when a query has no positives, so no masking is required.
52
+ """
53
+ if y_true.size == 0:
54
+ return np.nan
55
+ return ndcg_score(y_true, y_score, k=k)
56
+
57
+
58
+ def macro_map(y_true: np.ndarray, y_score: np.ndarray) -> float:
59
+ """
60
+ Macro MAP: mean of Average-Precision per query.
61
+ Queries without positives contribute AP = 0.
62
+ """
63
+ if y_true.size == 0:
64
+ return np.nan
65
+
66
+ ap = []
67
+ for t, s in zip(y_true, y_score):
68
+ if t.sum() == 0:
69
+ ap.append(0.0)
70
+ else:
71
+ ap.append(average_precision_score(t, s))
72
+ return np.mean(ap)
73
+
74
+
75
+ def ap_micro(y_true: np.ndarray, y_score: np.ndarray) -> float:
76
+ """
77
+ Micro AP over the entire pool (unchanged).
78
+ """
79
+ with warnings.catch_warnings():
80
+ warnings.filterwarnings("ignore", message="No positive class found in y_true")
81
+ return average_precision_score(y_true.ravel(), y_score.ravel())
82
+
83
+
84
+ # ---------------------------
85
+ # Recall@k
86
+ # ---------------------------
87
+
88
+
89
+ def recall_at_k_micro(y_true: np.ndarray, y_score: np.ndarray, k: int = 5) -> float:
90
+ """
91
+ Micro Recall@k (a.k.a. instance-level recall)
92
+
93
+ – Each *positive instance* counts once, regardless of which query it belongs to.
94
+ – Denominator = total #positives across the whole pool.
95
+ """
96
+ total_pos = y_true.sum()
97
+ if total_pos == 0:
98
+ return np.nan
99
+
100
+ topk = np.argsort(-y_score, axis=1)[:, :k] # indices of top-k per query
101
+ rows = np.arange(topk.shape[0])[:, None]
102
+
103
+ hits = (y_true[rows, topk] > 0).sum() # total #hits (instances)
104
+ return hits / total_pos
105
+
106
+
107
+ def recall_at_k_macro(y_true: np.ndarray, y_score: np.ndarray, k: int = 5) -> float:
108
+ """
109
+ Macro Recall@k (query-level recall)
110
+
111
+ – First compute recall per *query* (#hits / #positives in that query).
112
+ – Then average across all queries that actually contain ≥1 positive.
113
+ """
114
+ mask = y_true.sum(axis=1) > 0 # keep only valid queries
115
+ if not mask.any():
116
+ return np.nan
117
+
118
+ Yt, Ys = y_true[mask], y_score[mask]
119
+ topk = np.argsort(-Ys, axis=1)[:, :k]
120
+ rows = np.arange(Yt.shape[0])[:, None]
121
+
122
+ hits_per_q = (Yt[rows, topk] > 0).sum(axis=1) # shape: (n_queries,)
123
+ pos_per_q = Yt.sum(axis=1)
124
+
125
+ return np.mean(hits_per_q / pos_per_q) # average of query recalls
126
+
127
+
128
+ # ---------------------------
129
+ # Precision@k
130
+ # ---------------------------
131
+
132
+
133
+ def precision_at_k_micro(y_true: np.ndarray, y_score: np.ndarray, k: int = 5) -> float:
134
+ """
135
+ Micro Precision@k (pool-level precision)
136
+
137
+ – Numerator = total #hits across all queries.
138
+ – Denominator = total #predictions considered (n_queries · k).
139
+ """
140
+ if y_true.size == 0:
141
+ return np.nan
142
+
143
+ topk = np.argsort(-y_score, axis=1)[:, :k]
144
+ rows = np.arange(topk.shape[0])[:, None]
145
+
146
+ hits = (y_true[rows, topk] > 0).sum()
147
+ total_pred = y_true.shape[0] * k
148
+ return hits / total_pred
149
+
150
+
151
+ def precision_at_k_macro(y_true: np.ndarray, y_score: np.ndarray, k: int = 5) -> float:
152
+ """
153
+ Macro Precision@k (query-level precision)
154
+
155
+ – Compute precision = (#hits / k) for each query, **including those with zero positives**,
156
+ then average.
157
+ """
158
+ if y_true.size == 0:
159
+ return np.nan
160
+
161
+ topk = np.argsort(-y_score, axis=1)[:, :k]
162
+ rows = np.arange(topk.shape[0])[:, None]
163
+
164
+ rel = y_true[rows, topk] > 0 # shape: (n_queries, k)
165
+ precision_per_q = rel.mean(axis=1) # mean over k positions
166
+ return precision_per_q.mean()
167
+
168
+
169
+ # helper methods
170
+
171
+
172
+ def bootstrap(
173
+ metric_fn: Callable[[np.ndarray, np.ndarray], float],
174
+ y_true: np.ndarray,
175
+ y_score: np.ndarray,
176
+ n: int = 1000,
177
+ rng=None,
178
+ ) -> dict[str, float]:
179
+ rng = np.random.default_rng(rng)
180
+ idx = np.arange(len(y_true))
181
+ vals: list[float] = []
182
+
183
+ while len(vals) < n:
184
+ sample = rng.choice(idx, size=len(idx), replace=True)
185
+ t = y_true[sample]
186
+ s = y_score[sample]
187
+ if t.sum() == 0: # no positive at all → resample
188
+ continue
189
+ vals.append(metric_fn(t, s))
190
+
191
+ result = np.asarray(vals)
192
+ # get 95% confidence interval
193
+ lo, hi = np.percentile(result, [2.5, 97.5])
194
+ return {"mean": result.mean(), "low": lo, "high": hi}
195
+
196
+
197
+ def evaluate_with_ranx(
198
+ pred_rels: set[BinaryRelation],
199
+ target_rels: set[BinaryRelation],
200
+ metrics: list[str],
201
+ include_queries_without_gold: bool = True,
202
+ ) -> dict[str, float]:
203
+
204
+ # lazy import to not require ranx via requirements.txt
205
+ import ranx
206
+
207
+ all_rels = set(pred_rels) | set(target_rels)
208
+ all_heads = {rel.head for rel in all_rels}
209
+ head2id = {head: f"q_{idx}" for idx, head in enumerate(sorted(all_heads))}
210
+ tail_and_label2id = {(ann.tail, ann.label): f"d_{idx}" for idx, ann in enumerate(all_rels)}
211
+
212
+ qrels_dict: dict[str, dict[str, int]] = defaultdict(dict) # {query_id: {doc_id: 1}}
213
+ run_dict: dict[str, dict[str, float]] = defaultdict(dict) # {query_id: {doc_id: score}}
214
+
215
+ for target_rel in target_rels:
216
+ query_id = head2id[target_rel.head]
217
+ doc_id = tail_and_label2id[(target_rel.tail, target_rel.label)]
218
+ if target_rel.score != 1.0:
219
+ raise ValueError(
220
+ f"target score must be 1.0, but got {target_rel.score} for {target_rel}"
221
+ )
222
+ qrels_dict[query_id][doc_id] = 1
223
+
224
+ for pred_rel in pred_rels:
225
+ query_id = head2id[pred_rel.head]
226
+ doc_id = tail_and_label2id[(pred_rel.tail, pred_rel.label)]
227
+ run_dict[query_id][doc_id] = pred_rel.score
228
+
229
+ if include_queries_without_gold:
230
+ # add query ids that have no gold entries to qrels_dict (with empty relevance)
231
+ for query_id in set(head2id.values()) - set(qrels_dict):
232
+ qrels_dict[query_id] = {}
233
+
234
+ # evaluate
235
+ qrels = ranx.Qrels(qrels_dict)
236
+ run = ranx.Run(run_dict)
237
+ results = ranx.evaluate(qrels, run, metrics, make_comparable=True)
238
+ return results
239
+
240
+
241
+ def deduplicate_relations(
242
+ relations: Iterable[BinaryRelation], caption: str
243
+ ) -> Set[BinaryRelation]:
244
+ pred2scores = defaultdict(set)
245
+ for ann in relations:
246
+ pred2scores[ann].add(round(ann.score, 4))
247
+ # warning for duplicates
248
+ preds_with_duplicates = [ann for ann, scores in pred2scores.items() if len(scores) > 1]
249
+ if len(preds_with_duplicates) > 0:
250
+ logger.warning(
251
+ f"there are {len(preds_with_duplicates)} {caption} with duplicates: "
252
+ f"{preds_with_duplicates}. We will take the max score for each annotation."
253
+ )
254
+
255
+ # take the max score for each annotation
256
+ result = {ann.copy(score=max(scores)) for ann, scores in pred2scores.items()}
257
+ return result
258
+
259
+
260
+ def construct_y_true_and_score(
261
+ preds: Iterable[BinaryRelation], targets: Iterable[BinaryRelation]
262
+ ) -> Tuple[np.ndarray, np.ndarray]:
263
+
264
+ # helper constructs
265
+ all_anns = set(preds) | set(targets)
266
+ head2relations = defaultdict(list)
267
+ for ann in all_anns:
268
+ head2relations[ann.head].append(ann)
269
+ target2score = {rel: rel.score for rel in targets}
270
+ pred2score = {rel: rel.score for rel in preds}
271
+
272
+ max_len = max(len(relations) for relations in head2relations.values())
273
+ target_rows, pred_rows = [], []
274
+ for query in head2relations:
275
+ relations = head2relations[query]
276
+ # score for pairs missing from the predictions: NEG_INF ranks them last
277
+ missing_pred_score = NEG_INF  # alternatives considered: 0.0 or a tiny random score
278
+ missing_target_score = 0
279
+ query_scores = [
280
+ (target2score.get(ann, missing_target_score), pred2score.get(ann, missing_pred_score))
281
+ for ann in relations
282
+ ]
283
+
284
+ # sort by descending order of prediction score
285
+ query_scores_sorted = np.array(sorted(query_scores, key=lambda x: x[1], reverse=True))
286
+
287
+ # pad with zeros so every row has the same length
288
+ pad_width = max_len - len(query_scores)
289
+ query_target = np.pad(
290
+ query_scores_sorted[:, 0], (0, pad_width), constant_values=missing_target_score
291
+ )
292
+ query_pred = np.pad(
293
+ query_scores_sorted[:, 1], (0, pad_width), constant_values=missing_pred_score
294
+ )
295
+
296
+ target_rows.append(query_target)
297
+ pred_rows.append(query_pred)
298
+
299
+ y_true = np.vstack(target_rows) # shape (n_queries, max_len)
300
+ y_score = np.vstack(pred_rows)
301
+
302
+ return y_true, y_score
303
+
304
+
305
+ class SemanticallySameRankingMetric(DocumentMetric):
306
+
307
+ def __init__(
308
+ self,
309
+ layer: str,
310
+ label: Optional[str] = None,
311
+ add_reversed: bool = False,
312
+ require_positive_gold: bool = False,
313
+ bootstrap_n: Optional[int] = None,
314
+ k_values: Optional[List[int]] = None,
315
+ return_coverage: bool = True,
316
+ show_as_markdown: bool = False,
317
+ use_ranx: bool = False,
318
+ add_stats_to_result: bool = False,
319
+ ) -> None:
320
+ super().__init__()
321
+ self.layer = layer
322
+ self.label = label
323
+ self.add_reversed = add_reversed
324
+ self.require_positive_gold = require_positive_gold
325
+ self.bootstrap_n = bootstrap_n
326
+ self.k_values = k_values if k_values is not None else [1, 5, 10]
327
+ self.return_coverage = return_coverage
328
+ self.show_as_markdown = show_as_markdown
329
+ self.use_ranx = use_ranx
330
+ self.add_stats_to_result = add_stats_to_result
331
+
332
+ self.metrics = {
333
+ "macro_ndcg": macro_ndcg,
334
+ "macro_mrr": true_mrr,
335
+ "macro_map": macro_map,
336
+ "micro_ap": ap_micro,
337
+ }
338
+ for name, func in [
339
+ ("macro_ndcg", macro_ndcg),
340
+ ("micro_recall", recall_at_k_micro),
341
+ ("micro_precision", precision_at_k_micro),
342
+ ("macro_recall", recall_at_k_macro),
343
+ ("macro_precision", precision_at_k_macro),
344
+ ]:
345
+ for k in self.k_values:
346
+ self.metrics[f"{name}@{k}"] = partial(func, k=k) # type: ignore
347
+
348
+ self.ranx_metrics = ["map", "mrr", "ndcg"]
349
+ for name in ["recall", "precision", "ndcg"]:
350
+ for k in self.k_values:
351
+ self.ranx_metrics.append(f"{name}@{k}")
352
+
353
+ def reset(self) -> None:
354
+ """
355
+ Reset the metric to its initial state.
356
+ """
357
+ self._preds: List[BinaryRelation] = []
358
+ self._targets: List[BinaryRelation] = []
359
+
360
+ def _update(self, document):
361
+ layer = document[self.layer]
362
+ ann: BinaryRelation
363
+ for ann in layer:
364
+ if self.label is None or ann.label == self.label:
365
+ if ann.score > 0.0:
366
+ self._targets.append(ann.copy())
367
+ if self.add_reversed:
368
+ self._targets.append(ann.copy(head=ann.tail, tail=ann.head))
369
+
370
+ for ann in layer.predictions:
371
+ if self.label is None or ann.label == self.label:
372
+ if ann.score > 0.0:
373
+ self._preds.append(ann.copy())
374
+ if self.add_reversed:
375
+ self._preds.append(ann.copy(head=ann.tail, tail=ann.head))
376
+
377
+ def _compute(self):
378
+ # take the max score for each annotation
379
+ preds_deduplicated = deduplicate_relations(self._preds, "predictions")
380
+ targets_deduplicated = deduplicate_relations(self._targets, "targets")
381
+
382
+ stats = {
383
+ "gold": len(targets_deduplicated),
384
+ "preds": len(preds_deduplicated),
385
+ "queries": len(
386
+ set(ann.head for ann in targets_deduplicated)
387
+ | set(ann.head for ann in preds_deduplicated)
388
+ ),
389
+ }
390
+
391
+ if self.use_ranx:
392
+ if self.bootstrap_n is not None:
393
+ raise ValueError(
394
+ "Ranx does not support bootstrapping. Please set bootstrap_n=None."
395
+ )
396
+
397
+ scores = evaluate_with_ranx(
398
+ preds_deduplicated,
399
+ targets_deduplicated,
400
+ metrics=self.ranx_metrics,
401
+ include_queries_without_gold=not self.require_positive_gold,
402
+ )
403
+ if self.add_stats_to_result:
404
+ scores.update(stats)
405
+ # logger.info(f"results via ranx:\n{pd.Series(ranx_result).sort_index().round(3).to_markdown()}")
406
+ df = pd.DataFrame.from_records([scores], index=["score"])
407
+ else:
408
+
409
+ y_true, y_score = construct_y_true_and_score(
410
+ preds=preds_deduplicated, targets=targets_deduplicated
411
+ )
412
+
413
+ # original definition ─ share of queries with ≥1 positive
414
+ coverage = (y_true.sum(axis=1) > 0).mean()
415
+
416
+ # keep only queries that actually have at least one gold positive
417
+ if self.require_positive_gold:
418
+ mask = y_true.sum(axis=1) > 0 # shape: (n_queries,)
419
+ y_true = y_true[mask]
420
+ y_score = y_score[mask]
421
+
422
+ if self.bootstrap_n is not None:
423
+ scores = {
424
+ name: bootstrap(fn, y_true, y_score, n=self.bootstrap_n)
425
+ for name, fn in self.metrics.items()
426
+ }
427
+ if self.add_stats_to_result:
428
+ scores["stats"] = stats
429
+ df = pd.DataFrame(scores)
430
+ else:
431
+ scores = {name: fn(y_true, y_score) for name, fn in self.metrics.items()}
432
+ if self.add_stats_to_result:
433
+ scores.update(stats)
434
+ df = pd.DataFrame.from_records([scores], index=["score"])
435
+
436
+ if self.return_coverage and not self.use_ranx:  # coverage is only computed in the non-ranx branch
437
+ scores["coverage"] = coverage
438
+
439
+ if self.show_as_markdown:
440
+ if not self.add_stats_to_result:
441
+ logger.info(
443
+ f'\nstatistics ({self.layer}):\n{pd.Series(stats, name="value").to_markdown()}'
444
+ )
446
+ logger.info(f"\n{self.layer}:\n{df.round(4).T.to_markdown()}")
447
+
448
+ return scores
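
A self-contained toy example contrasting the micro and macro Recall@k helpers defined above (the numbers are invented for illustration; micro divides total hits by total positives, macro averages per-query recalls):

```python
import numpy as np

from src.metrics.semantically_same_ranking import recall_at_k_macro, recall_at_k_micro

# 2 queries with 4 candidates each; 1 marks a relevant candidate
y_true = np.array([[1, 1, 0, 0],
                   [0, 0, 0, 1]])
y_score = np.array([[0.9, 0.2, 0.8, 0.1],   # relevant items ranked 1st and 4th
                    [0.7, 0.6, 0.5, 0.4]])  # the only relevant item ranked last

print(recall_at_k_micro(y_true, y_score, k=2))  # 1 hit / 3 positives ≈ 0.333
print(recall_at_k_macro(y_true, y_score, k=2))  # (1/2 + 0/1) / 2 = 0.25
```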
src/metrics/tpfpfn.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from functools import partial
4
+ from typing import (
5
+ Any,
6
+ Callable,
7
+ Collection,
8
+ Dict,
9
+ Hashable,
10
+ List,
11
+ Optional,
12
+ Tuple,
13
+ TypeAlias,
14
+ Union,
15
+ )
16
+
17
+ from pytorch_ie.core import Annotation, Document, DocumentMetric
18
+ from pytorch_ie.utils.hydra import resolve_target
19
+
20
+ from src.document.types import RelatedRelation
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def has_one_of_the_labels(ann: Annotation, label_field: str, labels: Collection[str]) -> bool:
26
+ return getattr(ann, label_field) in labels
27
+
28
+
29
+ def has_this_label(ann: Annotation, label_field: str, label: str) -> bool:
30
+ return getattr(ann, label_field) == label
31
+
32
+
33
+ InstanceType: TypeAlias = Tuple[Document, Annotation]
34
+ InstancesType: TypeAlias = Tuple[List[InstanceType], List[InstanceType], List[InstanceType]]
35
+
36
+
37
+ class TPFFPFNMetric(DocumentMetric):
38
+ """Computes the lists of True Positive, False Positive, and False Negative
39
+ annotations for a given layer. If labels are provided, it also computes
40
+ the counts for each label separately.
41
+
42
+ Works only with `RelatedRelation` annotations for now.
43
+
44
+ Args:
45
+ layer: The layer to compute the metrics for.
46
+ labels: If provided, calculate metrics for each label.
47
+ label_field: The field to use for the label. Defaults to "label".
48
+ """
49
+
50
+ def __init__(
51
+ self,
52
+ layer: str,
53
+ labels: Optional[Union[Collection[str], str]] = None,
54
+ label_field: str = "label",
55
+ annotation_processor: Optional[Union[Callable[[Annotation], Hashable], str]] = None,
56
+ ):
57
+ super().__init__()
58
+ self.layer = layer
59
+ self.label_field = label_field
60
+ self.annotation_processor: Optional[Callable[[Annotation], Hashable]]
61
+ if isinstance(annotation_processor, str):
62
+ self.annotation_processor = resolve_target(annotation_processor)
63
+ else:
64
+ self.annotation_processor = annotation_processor
65
+
66
+ self.per_label = labels is not None
67
+ self.infer_labels = False
68
+ if self.per_label:
69
+ if isinstance(labels, str):
70
+ if labels != "INFERRED":
71
+ raise ValueError(
72
+ "labels can only be 'INFERRED' if per_label is True and labels is a string"
73
+ )
74
+ self.labels = []
75
+ self.infer_labels = True
76
+ elif isinstance(labels, Collection):
77
+ if not all(isinstance(label, str) for label in labels):
78
+ raise ValueError("labels must be a collection of strings")
79
+ if "MICRO" in labels or "MACRO" in labels:
80
+ raise ValueError(
81
+ "labels cannot contain 'MICRO' or 'MACRO' because they are used to capture aggregated metrics"
82
+ )
83
+ if len(labels) == 0:
84
+ raise ValueError("labels cannot be empty")
85
+ self.labels = list(labels)
86
+ else:
87
+ raise ValueError("labels must be a string or a collection of strings")
88
+
89
+ def reset(self):
90
+ self.tp_fp_fn = defaultdict(lambda: (list(), list(), list()))
91
+
92
+ def get_tp_fp_fn(
93
+ self,
94
+ document: Document,
95
+ annotation_filter: Optional[Callable[[Annotation], bool]] = None,
96
+ annotation_processor: Optional[Callable[[Annotation], Hashable]] = None,
97
+ ) -> InstancesType:
98
+ annotation_processor = annotation_processor or (lambda ann: ann)
99
+ annotation_filter = annotation_filter or (lambda ann: True)
100
+ predicted_annotations = {
101
+ annotation_processor(ann)
102
+ for ann in document[self.layer].predictions
103
+ if annotation_filter(ann)
104
+ }
105
+ gold_annotations = {
106
+ annotation_processor(ann) for ann in document[self.layer] if annotation_filter(ann)
107
+ }
108
+ tp = [(document, ann) for ann in predicted_annotations & gold_annotations]
109
+ fn = [(document, ann) for ann in gold_annotations - predicted_annotations]
110
+ fp = [(document, ann) for ann in predicted_annotations - gold_annotations]
111
+ return tp, fp, fn
112
+
113
+ def add_annotations(self, annotations: InstancesType, label: str):
114
+ self.tp_fp_fn[label] = (
115
+ self.tp_fp_fn[label][0] + annotations[0],
116
+ self.tp_fp_fn[label][1] + annotations[1],
117
+ self.tp_fp_fn[label][2] + annotations[2],
118
+ )
119
+
120
+ def _update(self, document: Document):
121
+ new_tp_fp_fn = self.get_tp_fp_fn(
122
+ document=document,
123
+ annotation_filter=(
124
+ partial(has_one_of_the_labels, label_field=self.label_field, labels=self.labels)
125
+ if self.per_label and not self.infer_labels
126
+ else None
127
+ ),
128
+ annotation_processor=self.annotation_processor,
129
+ )
130
+ self.add_annotations(new_tp_fp_fn, label="MICRO")
131
+ if self.infer_labels:
132
+ layer = document[self.layer]
133
+ # collect labels from gold data and predictions
134
+ for ann in list(layer) + list(layer.predictions):
135
+ label = getattr(ann, self.label_field)
136
+ if label not in self.labels:
137
+ self.labels.append(label)
138
+ if self.per_label:
139
+ for label in self.labels:
140
+ new_tp_fp_fn = self.get_tp_fp_fn(
141
+ document=document,
142
+ annotation_filter=partial(
143
+ has_this_label, label_field=self.label_field, label=label
144
+ ),
145
+ annotation_processor=self.annotation_processor,
146
+ )
147
+ self.add_annotations(new_tp_fp_fn, label=label)
148
+
149
+ def format_texts(self, texts: List[str]) -> str:
150
+ return "<SEP>".join(texts)
151
+
152
+ def format_annotation(self, ann: Annotation) -> Dict[str, Any]:
153
+ if isinstance(ann, RelatedRelation):
154
+
155
+ head_resolved = ann.head.resolve()
156
+ tail_resolved = ann.tail.resolve()
157
+ ref_resolved = ann.reference_span.resolve()
158
+ return {
159
+ "related_label": ann.label,
160
+ "related_score": round(ann.score, 3),
161
+ "query_label": head_resolved[0],
162
+ "query_texts": self.format_texts(head_resolved[1]),
163
+ "query_score": round(ann.head.score, 3),
164
+ "ref_label": ref_resolved[0],
165
+ "ref_texts": self.format_texts(ref_resolved[1]),
166
+ "ref_score": round(ann.reference_span.score, 3),
167
+ "rec_label": tail_resolved[0],
168
+ "rec_texts": self.format_texts(tail_resolved[1]),
169
+ "rec_score": round(ann.tail.score, 3),
170
+ }
171
+ else:
172
+ raise NotImplementedError
173
+ # return ann.resolve()
174
+
175
+ def format_instance(self, instance: InstanceType) -> Dict[str, Any]:
176
+ document, annotation = instance
177
+ result = self.format_annotation(annotation)
178
+ if getattr(document, "id", None) is not None:
179
+ result["document_id"] = document.id
180
+ return result
181
+
182
+ def _compute(self) -> Dict[str, Dict[str, list]]:
183
+ res = dict()
184
+ for k, instances in self.tp_fp_fn.items():
185
+ res[k] = {
186
+ "tp": [self.format_instance(instance) for instance in instances[0]],
187
+ "fp": [self.format_instance(instance) for instance in instances[1]],
188
+ "fn": [self.format_instance(instance) for instance in instances[2]],
189
+ }
190
+
191
+ # if self.show_as_markdown:
192
+ # logger.info(f"\n{self.layer}:\n{pd.DataFrame(res).round(3).T.to_markdown()}")
193
+ return res
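
A hedged usage sketch for the new metric: it is meant for error analysis, so the computed TP/FP/FN instances can be dumped for inspection. The layer name is an assumption (the layer must contain `RelatedRelation` annotations), and `labels="INFERRED"` lets the metric collect labels from both gold and predicted annotations.

```python
# Hedged sketch, not part of the PR.
import json

from src.metrics.tpfpfn import TPFFPFNMetric

metric = TPFFPFNMetric(layer="related_relations", labels="INFERRED")
for doc in documents:  # assumption: iterable of documents with predictions
    metric(doc)

with open("tp_fp_fn.json", "w") as f:
    json.dump(metric.compute(), f, indent=2)
```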
src/models/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
  from .sequence_classification import SimpleSequenceClassificationModelWithInputTypeIds
2
  from .sequence_classification_with_pooler import (
 
3
  SequencePairSimilarityModelWithMaxCosineSim,
4
- SequencePairSimilarityModelWithPooler2,
5
  SequencePairSimilarityModelWithPoolerAndAdapter,
6
  )
 
1
  from .sequence_classification import SimpleSequenceClassificationModelWithInputTypeIds
2
  from .sequence_classification_with_pooler import (
3
+ SequencePairSimilarityModelDummy,
4
  SequencePairSimilarityModelWithMaxCosineSim,
5
+ SequencePairSimilarityModelWithMaxCosineSimAndAdapter,
6
  SequencePairSimilarityModelWithPoolerAndAdapter,
7
  )
src/models/sequence_classification_with_pooler.py CHANGED
@@ -1,12 +1,11 @@
 import abc
 import logging
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, List, Optional
 
 import torch
 import torch.nn.functional as F
 from adapters import AutoAdapterModel
 from pie_modules.models import SequencePairSimilarityModelWithPooler
-from pie_modules.models.components.pooler import MENTION_POOLING
 from pie_modules.models.sequence_classification_with_pooler import (
     InputType,
     OutputType,
@@ -20,31 +19,11 @@ from torch import FloatTensor, Tensor
 from transformers import AutoConfig, PreTrainedModel
 from transformers.modeling_outputs import SequenceClassifierOutput
 
-from src.models.components.pooler import SpanMeanPooler
-
 logger = logging.getLogger(__name__)
 
 
-class SequenceClassificationModelWithPoolerBase2(
-    SequenceClassificationModelWithPoolerBase, abc.ABC
-):
-    def setup_pooler(self, input_dim: int) -> Tuple[Callable, int]:
-        aggregate = self.pooler_config.get("aggregate", "max")
-        if self.pooler_config["type"] == MENTION_POOLING and aggregate != "max":
-            if aggregate == "mean":
-                pooler_config = dict(self.pooler_config)
-                pooler_config.pop("type")
-                pooler_config.pop("aggregate")
-                pooler = SpanMeanPooler(input_dim=input_dim, **pooler_config)
-                return pooler, pooler.output_dim
-            else:
-                raise ValueError(f"Unknown aggregation method: {aggregate}")
-        else:
-            return super().setup_pooler(input_dim)
-
-
 class SequenceClassificationModelWithPoolerAndAdapterBase(
-    SequenceClassificationModelWithPoolerBase2, abc.ABC
+    SequenceClassificationModelWithPoolerBase, abc.ABC
 ):
     def __init__(self, adapter_name_or_path: Optional[str] = None, **kwargs):
         self.adapter_name_or_path = adapter_name_or_path
@@ -66,13 +45,6 @@ class SequenceClassificationModelWithPoolerAndAdapterBase(
         return model
 
 
-@PyTorchIEModel.register()
-class SequencePairSimilarityModelWithPooler2(
-    SequencePairSimilarityModelWithPooler, SequenceClassificationModelWithPoolerBase2
-):
-    pass
-
-
 @PyTorchIEModel.register()
 class SequencePairSimilarityModelWithPoolerAndAdapter(
     SequencePairSimilarityModelWithPooler, SequenceClassificationModelWithPoolerAndAdapterBase
@@ -164,3 +136,66 @@ class SequencePairSimilarityModelWithMaxCosineSimAndAdapter(
     SequencePairSimilarityModelWithMaxCosineSim, SequencePairSimilarityModelWithPoolerAndAdapter
 ):
     pass
+
+
+@PyTorchIEModel.register()
+class SequencePairSimilarityModelDummy(SequencePairSimilarityModelWithPooler):
+
+    def __init__(
+        self,
+        method: str = "random",
+        random_seed: Optional[int] = None,
+        **kwargs,
+    ):
+        self.method = method
+        self.random_seed = random_seed
+        super().__init__(**kwargs)
+
+    def setup_classifier(
+        self, pooler_output_dim: int
+    ) -> Callable[[torch.FloatTensor, torch.FloatTensor], torch.FloatTensor]:
+        if self.method == "random":
+            generator = torch.Generator(device=self.device)
+            if self.random_seed is not None:
+                generator = generator.manual_seed(self.random_seed)
+
+            def binary_classify_random(
+                inputs: torch.FloatTensor,
+                inputs_pair: torch.FloatTensor,
+            ) -> torch.FloatTensor:
+                """Randomly classifies pairs of inputs as similar or not similar."""
+                # Generate random logits in the range of [0, 1]
+                logits = torch.rand(inputs.size(0), device=self.device, generator=generator)
+                return logits
+
+            return binary_classify_random
+        elif self.method == "zero":
+
+            def binary_classify_zero(
+                inputs: torch.FloatTensor,
+                inputs_pair: torch.FloatTensor,
+            ) -> torch.FloatTensor:
+                """Classifies pairs of inputs as not similar (logit = 0)."""
+                # Return a tensor of zeros with the same batch size
+                logits = torch.zeros(inputs.size(0), device=self.device)
+                return logits
+
+            return binary_classify_zero
+        else:
+            raise ValueError(
+                f"Unknown method: {self.method}. Supported methods are 'random' and 'zero'."
+            )
+
+    def setup_loss_fct(self) -> Callable:
+        def loss_fct(logits: FloatTensor, labels: FloatTensor) -> FloatTensor:
+            raise NotImplementedError(
+                "Dummy model does not support loss function, as it is not used for training."
+            )
+
+        return loss_fct
+
+    def get_pooled_output(self, model_inputs, pooler_inputs) -> torch.FloatTensor:
+        # Just return a tensor of zeros in the shape of the batch size
+        # so that the classifier can construct dummy logits in the correct shape.
+        bs = pooler_inputs["start_indices"].size(0)
+        return torch.zeros(bs, device=self.device)
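
A minimal instantiation sketch for the dummy baseline added above (illustrative, not part of this commit; the encoder keyword is an assumption inherited from SequencePairSimilarityModelWithPooler):

from src.models import SequencePairSimilarityModelDummy

# evaluation-only baseline: emits random similarity logits in [0, 1)
model = SequencePairSimilarityModelDummy(
    model_name_or_path="bert-base-uncased",  # assumed keyword of the parent class
    method="random",
    random_seed=42,
)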
src/predict.py CHANGED
@@ -113,8 +113,8 @@ def predict(cfg: DictConfig) -> Tuple[dict, dict]:
         .to(dtype=pipeline.model.dtype)
     )
 
-    # auto-convert the dataset if the metric specifies a document type
-    dataset = pipeline.taskmodule.convert_dataset(dataset)
+    # auto-convert the dataset if the taskmodule specifies a document type
+    dataset = dataset.to_document_type(pipeline.taskmodule, downcast=False)
 
     # Init the serializer
     serializer: Optional[DocumentSerializer] = None
src/serializer/__init__.py CHANGED
@@ -1 +1,4 @@
-from .json import JsonSerializer, JsonSerializer2
+from .json import JsonSerializer
+
+# backward compatibility
+JsonSerializer2 = JsonSerializer
src/serializer/interface.py CHANGED
@@ -12,5 +12,4 @@ class DocumentSerializer(ABC):
     """
 
     @abstractmethod
-    def __call__(self, documents: Iterable[Document]) -> Any:
-        pass
+    def __call__(self, documents: Iterable[Document], append: bool = False, **kwargs) -> Any: ...
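
For reference, a minimal custom serializer against this extended interface (illustrative only, not part of the commit; the class name and the returned metadata dict are made up):

from typing import Any, Iterable

from pytorch_ie.core import Document

from src.serializer.interface import DocumentSerializer


class CountingSerializer(DocumentSerializer):
    """Illustrative only: counts documents instead of writing them to disk."""

    def __call__(self, documents: Iterable[Document], append: bool = False, **kwargs) -> Any:
        docs = list(documents)
        # a real serializer would persist `docs` here and honor `append`
        return {"num_documents": len(docs), "append": append}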
 
src/serializer/json.py CHANGED
@@ -1,11 +1,7 @@
-import json
-import os
-from typing import Dict, Iterable, List, Optional, Sequence, Type, TypeVar
+from typing import Dict, Iterable, Optional, Sequence, Type, TypeVar
 
 from pie_datasets import Dataset, DatasetDict, IterableDataset
-from pie_datasets.core.dataset_dict import METADATA_FILE_NAME
 from pytorch_ie.core import Document
-from pytorch_ie.utils.hydra import resolve_optional_document_type, serialize_document_type
 
 from src.serializer.interface import DocumentSerializer
 from src.utils.logging_utils import get_pylogger
@@ -28,125 +24,13 @@ class JsonSerializer(DocumentSerializer):
     def __init__(self, **kwargs):
         self.default_kwargs = kwargs
 
-    @classmethod
-    def write(
-        cls,
-        documents: Iterable[Document],
-        path: str,
-        file_name: str = "documents.jsonl",
-        metadata_file_name: str = METADATA_FILE_NAME,
-        split: Optional[str] = None,
-        **kwargs,
-    ) -> Dict[str, str]:
-        realpath = os.path.realpath(path)
-        log.info(f'serialize documents to "{realpath}" ...')
-        os.makedirs(realpath, exist_ok=True)
-
-        if not isinstance(documents, Sequence):
-            documents = list(documents)
-
-        # dump metadata including the document_type
-        if len(documents) == 0:
-            raise Exception("cannot serialize empty list of documents")
-        document_type = type(documents[0])
-        metadata = {"document_type": serialize_document_type(document_type)}
-        full_metadata_file_name = os.path.join(realpath, metadata_file_name)
-        if os.path.exists(full_metadata_file_name):
-            # load previous metadata
-            with open(full_metadata_file_name) as f:
-                previous_metadata = json.load(f)
-            if previous_metadata != metadata:
-                raise ValueError(
-                    f"metadata file {full_metadata_file_name} already exists, "
-                    "but the content does not match the current metadata"
-                    "\nprevious metadata: {previous_metadata}"
-                    "\ncurrent metadata: {metadata}"
-                )
-        else:
-            with open(full_metadata_file_name, "w") as f:
-                json.dump(metadata, f, indent=2)
-
-        if split is not None:
-            realpath = os.path.join(realpath, split)
-            os.makedirs(realpath, exist_ok=True)
-        full_file_name = os.path.join(realpath, file_name)
-        if as_json_lines(file_name):
-            # if the file already exists, append to it
-            mode = "a" if os.path.exists(full_file_name) else "w"
-            with open(full_file_name, mode) as f:
-                for doc in documents:
-                    f.write(json.dumps(doc.asdict(), **kwargs) + "\n")
-        else:
-            docs_list = [doc.asdict() for doc in documents]
-            if os.path.exists(full_file_name):
-                # load previous documents
-                with open(full_file_name) as f:
-                    previous_doc_list = json.load(f)
-                docs_list = previous_doc_list + docs_list
-            with open(full_file_name, "w") as f:
-                json.dump(docs_list, fp=f, **kwargs)
-        return {"path": realpath, "file_name": file_name, "metadata_file_name": metadata_file_name}
-
-    @classmethod
-    def read(
-        cls,
-        path: str,
-        document_type: Optional[Type[D]] = None,
-        file_name: str = "documents.jsonl",
-        metadata_file_name: str = METADATA_FILE_NAME,
-        split: Optional[str] = None,
-    ) -> List[D]:
-        realpath = os.path.realpath(path)
-        log.info(f'load documents from "{realpath}" ...')
-
-        # try to load metadata including the document_type
-        full_metadata_file_name = os.path.join(realpath, metadata_file_name)
-        if os.path.exists(full_metadata_file_name):
-            with open(full_metadata_file_name) as f:
-                metadata = json.load(f)
-            document_type = resolve_optional_document_type(metadata.get("document_type"))
-
-        if document_type is None:
-            raise Exception("document_type is required to load serialized documents")
-
-        if split is not None:
-            realpath = os.path.join(realpath, split)
-        full_file_name = os.path.join(realpath, file_name)
-        documents = []
-        if as_json_lines(str(file_name)):
-            with open(full_file_name) as f:
-                for line in f:
-                    json_dict = json.loads(line)
-                    documents.append(document_type.fromdict(json_dict))
-        else:
-            with open(full_file_name) as f:
-                json_list = json.load(f)
-            for json_dict in json_list:
-                documents.append(document_type.fromdict(json_dict))
-        return documents
-
-    def read_with_defaults(self, **kwargs) -> List[D]:
-        all_kwargs = {**self.default_kwargs, **kwargs}
-        return self.read(**all_kwargs)
-
-    def write_with_defaults(self, **kwargs) -> Dict[str, str]:
-        all_kwargs = {**self.default_kwargs, **kwargs}
-        return self.write(**all_kwargs)
-
-    def __call__(self, documents: Iterable[Document], **kwargs) -> Dict[str, str]:
-        return self.write_with_defaults(documents=documents, **kwargs)
-
-
-class JsonSerializer2(DocumentSerializer):
-    def __init__(self, **kwargs):
-        self.default_kwargs = kwargs
-
     @classmethod
     def write(
         cls,
         documents: Iterable[Document],
         path: str,
         split: str = "train",
+        append: bool = False,
     ) -> Dict[str, str]:
         if not isinstance(documents, (Dataset, IterableDataset)):
             if not isinstance(documents, Sequence):
@@ -154,7 +38,7 @@ class JsonSerializer2(DocumentSerializer):
             else:
                 documents = Dataset.from_documents(documents)
         dataset_dict = DatasetDict({split: documents})
-        dataset_dict.to_json(path=path)
+        dataset_dict.to_json(path=path, mode="a" if append else "w")
         return {"path": path, "split": split}
 
     @classmethod
@@ -181,5 +65,7 @@ class JsonSerializer2(DocumentSerializer):
         all_kwargs = {**self.default_kwargs, **kwargs}
         return self.write(**all_kwargs)
 
-    def __call__(self, documents: Iterable[Document], **kwargs) -> Dict[str, str]:
-        return self.write_with_defaults(documents=documents, **kwargs)
+    def __call__(
+        self, documents: Iterable[Document], append: bool = False, **kwargs
+    ) -> Dict[str, str]:
+        return self.write_with_defaults(documents=documents, append=append, **kwargs)
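
Usage sketch for the new append flag (illustrative, not part of this commit; constructor arguments, the batch variables, and the on-disk layout produced by DatasetDict.to_json are assumptions):

from src.serializer import JsonSerializer

serializer = JsonSerializer(path="predictions/out", split="test")
serializer(first_batch)                 # first call overwrites the split on disk
serializer(second_batch, append=True)   # later calls append to the same split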
src/start_demo.py CHANGED
@@ -331,8 +331,9 @@ def main(cfg: DictConfig) -> None:
                 visible=pdf_fulltext_extractor is not None,
             )
 
-            enable_acl_venue_loading = pdf_fulltext_extractor is not None and cfg.get(
-                "acl_anthology_pdf_dir"
+            enable_acl_venue_loading = (
+                pdf_fulltext_extractor is not None
+                and cfg.get("acl_anthology_data_dir") is not None
             )
             acl_anthology_venues = gr.Textbox(
                 label="ACL Anthology Venues",
src/train.py CHANGED
@@ -45,7 +45,7 @@ from pie_modules.models import SimpleGenerativeModel
 from pie_modules.models.interface import RequiresTaskmoduleConfig
 from pie_modules.taskmodules import *  # noqa: F403
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
-from pytorch_ie import Pipeline
+from pytorch_ie import PieDataModule, Pipeline
 from pytorch_ie.core import PyTorchIEModel, TaskModule
 from pytorch_ie.models import *  # noqa: F403
 from pytorch_ie.models.interface import RequiresModelNameOrPath, RequiresNumClasses
@@ -55,7 +55,6 @@ from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.loggers import Logger
 
 from src import utils
-from src.datamodules import PieDataModule
 from src.models import *  # noqa: F403
 from src.serializer.interface import DocumentSerializer
 from src.taskmodules import *  # noqa: F403
@@ -135,7 +134,7 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
     )
 
     # auto-convert the dataset if the taskmodule specifies a document type
-    dataset = taskmodule.convert_dataset(dataset)
+    dataset = dataset.to_document_type(taskmodule, downcast=False)
 
     # Init pytorch-ie datamodule
     log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>")
src/utils/graph_utils.py ADDED
@@ -0,0 +1,47 @@
+from typing import Hashable, List, Optional, Sequence, TypeVar
+
+from pytorch_ie.annotations import BinaryRelation
+
+H = TypeVar("H", bound=Hashable)
+
+
+def get_connected_components(
+    relations: Sequence[BinaryRelation],
+    elements: Optional[Sequence[H]] = None,
+    link_relation_label: Optional[str] = None,
+    link_relation_relation_score_threshold: Optional[float] = None,
+    add_singletons: bool = False,
+) -> List[List[H]]:
+    try:
+        import networkx as nx
+    except ImportError:
+        raise ImportError(
+            "NetworkX must be installed to use the SpansViaRelationMerger. "
+            "You can install NetworkX with `pip install networkx`."
+        )
+
+    # convert list of relations to a graph to easily calculate connected components to merge
+    g = nx.Graph()
+    link_relations = []
+    other_relations = []
+    elem2edge_relation = {}
+    for rel in relations:
+        if (link_relation_label is None or rel.label == link_relation_label) and (
+            link_relation_relation_score_threshold is None
+            or rel.score >= link_relation_relation_score_threshold
+        ):
+            link_relations.append(rel)
+            g.add_edge(rel.head, rel.tail)
+            elem2edge_relation[rel.head] = rel
+            elem2edge_relation[rel.tail] = rel
+        else:
+            other_relations.append(rel)
+
+    if add_singletons:
+        if elements is None:
+            raise ValueError("elements must be provided if add_singletons is True")
+        # add singletons to the graph
+        for elem in elements:
+            if elem not in elem2edge_relation:
+                g.add_node(elem)
+    return list(nx.connected_components(g))
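
Usage sketch for get_connected_components (illustrative, not part of this commit; the layer names and the "link" label are assumptions about the calling code):

# spans connected transitively via "link" relations end up in one component;
# add_singletons=True also returns unlinked spans as single-element clusters.
components = get_connected_components(
    relations=document["binary_relations"].predictions,
    elements=document["labeled_spans"],
    link_relation_label="link",
    link_relation_relation_score_threshold=0.5,
    add_singletons=True,
)
for cluster in components:
    print([str(span) for span in cluster])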
src/utils/inference_utils.py CHANGED
@@ -50,6 +50,8 @@ def predict_and_serialize(
         batch_iter = [dataset]
     else:
         batch_iter = document_batch_iter(dataset=dataset, batch_size=document_batch_size)
+
+    append = False
     for docs_batch in batch_iter:
        if pipeline is not None:
            t_start = timeit.default_timer()
@@ -60,13 +62,14 @@
         if serializer is not None:
             # the serializer should not return the serialized documents, but write them to disk
             # and instead return some metadata such as the path to the serialized documents
-            serializer_result = serializer(docs_batch)
+            serializer_result = serializer(docs_batch, append=append)
             if "serializer" in result and result["serializer"] != serializer_result:
                 log.warning(
                     f"serializer result changed from {result['serializer']} to {serializer_result}"
                     " during prediction. Only the last result is returned."
                 )
             result["serializer"] = serializer_result
+            append = True
 
     if prediction_time is not None:
         result["prediction_time"] = prediction_time
src/utils/pdf_utils/process_pdf.py CHANGED
@@ -138,7 +138,7 @@ def process_pdf_file(
     os.makedirs(output_dir, exist_ok=True)
 
     # get paper id as the name of the file
-    paper_id = ".".join(input_file.split("/")[-1].split(".")[:-1])
+    paper_id = os.path.splitext(os.path.basename(input_file))[0]
     tei_file = os.path.join(temp_dir, f"{paper_id}.tei.xml")
     output_file = os.path.join(output_dir, f"{paper_id}.json")
 