File size: 2,308 Bytes
382191a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Chatbots using API-based services."""
from __future__ import annotations

import os
from dataclasses import dataclass

import config


@dataclass(frozen=True)
class GptMtInstance:
    """A single test-set instance from the GPT-MT dataset.

    Frozen, so instances are immutable and hashable.

    Attributes:
        data: The source-language input sentence.
        label: The target-side reference sentence from the test set.
        doc_id: ID of the document the sentence belongs to.
        lang_pair: Language pair as a four-letter code, e.g. "deen"
            (first two chars = source language, last two = target).
    """

    data: str
    label: str
    doc_id: str
    lang_pair: str


def process_data(
    input_dir: str,
    lang_pairs: list[str],
) -> list[GptMtInstance]:
    """Load GPT-MT test-set instances for the given language pairs.

    Args:
        input_dir: Root directory of the GPT-MT data checkout; test data is
            read from ``<input_dir>/evaluation/testset``.
        lang_pairs: Language pairs as four-letter codes, e.g. "deen"
            (source language "de", target language "en").

    Returns:
        One ``GptMtInstance`` per test sentence, in file order, concatenated
        across language pairs in the order given.

    Raises:
        FileNotFoundError: If a source/target/doc-id file is missing.
    """
    data: list[GptMtInstance] = []
    eval_dir = os.path.join(input_dir, "evaluation", "testset")
    for lang_pair in lang_pairs:
        # Four-letter pair code: first two chars = source, last two = target.
        src_lang, trg_lang = lang_pair[:2], lang_pair[2:]
        src_file = os.path.join(
            eval_dir, "wmt-testset", lang_pair, f"test.{src_lang}-{trg_lang}.{src_lang}"
        )
        trg_file = os.path.join(
            eval_dir, "wmt-testset", lang_pair, f"test.{src_lang}-{trg_lang}.{trg_lang}"
        )
        doc_file = os.path.join(
            eval_dir,
            "wmt-testset-docids",
            lang_pair,
            f"test.{src_lang}-{trg_lang}.docids",
        )
        # Explicit UTF-8: the WMT test sets are multilingual, so relying on
        # the platform default encoding breaks on non-UTF-8 locales.
        with open(src_file, encoding="utf-8") as src_in, open(
            trg_file, encoding="utf-8"
        ) as trg_in, open(doc_file, encoding="utf-8") as doc_in:
            # NOTE(review): zip() stops at the shortest file, silently
            # dropping trailing lines; assumes the three files are
            # line-aligned and equal length — TODO confirm upstream.
            for src_line, trg_line, doc_line in zip(src_in, trg_in, doc_in):
                data.append(
                    GptMtInstance(
                        src_line.strip(), trg_line.strip(), doc_line.strip(), lang_pair
                    )
                )
    return data


def process_output(
    input_dir: str,
    lang_pairs: list[str],
    model_preset: str,
) -> list[str]:
    """Load a model's system outputs for the given language pairs.

    Args:
        input_dir: Root directory of the GPT-MT data checkout; outputs are
            read from ``<input_dir>/evaluation/system-outputs/<model_path>``.
        lang_pairs: Language pairs as four-letter codes, e.g. "deen".
        model_preset: Key into ``config.model_configs`` selecting the model
            whose ``path`` names its output subdirectory.

    Returns:
        One stripped output line per test sentence, in file order,
        concatenated across language pairs (parallel to ``process_data``).

    Raises:
        KeyError: If ``model_preset`` is not in ``config.model_configs``.
        FileNotFoundError: If an output file is missing.
    """
    data: list[str] = []
    model_path = config.model_configs[model_preset].path
    system_dir = os.path.join(input_dir, "evaluation", "system-outputs", model_path)
    for lang_pair in lang_pairs:
        # Four-letter pair code: first two chars = source, last two = target.
        src_lang, trg_lang = lang_pair[:2], lang_pair[2:]
        sys_file = os.path.join(
            system_dir, lang_pair, f"test.{src_lang}-{trg_lang}.{trg_lang}"
        )
        # Explicit UTF-8: system outputs are multilingual, so relying on the
        # platform default encoding breaks on non-UTF-8 locales.
        with open(sys_file, encoding="utf-8") as sys_in:
            data.extend(sys_line.strip() for sys_line in sys_in)
    return data