# File size: 857 Bytes
# 3bd5293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import pandas as pd

# Rebuild the final synthetic-caption dataset: take the base table, replace its
# "caption_multi" column with the values found in the translated shards under
# "data_translated", tidy the columns, and write the result to a feather file.

# os.listdir returns entries in arbitrary order; sort so the concatenated row
# order (and therefore the output) is reproducible between runs and machines.
filenames = sorted(os.listdir("data_translated"))
df = pd.read_feather("data/ccs_synthetic_multi.feather")
df_list = [pd.read_feather(os.path.join("data_translated", filename)) for filename in filenames]
# ignore_index=True gives a fresh 0..n-1 index, same as concat + reset_index(drop=True).
df_multi = pd.concat(df_list, ignore_index=True)

# Drop the existing "caption_multi" and pull in the one from the translated
# shards, matched on the "index" column. The left join keeps every base row.
# NOTE(review): if an "index" value occurs more than once across the shards,
# the left merge will duplicate the matching base row -- confirm shards are
# disjoint.
df = df.drop("caption_multi", axis=1)
df = df.merge(df_multi[["caption_multi", "index"]], how="left", on="index")

# Keep only the published columns, in this order.
df = df[
    [
        "caption",
        "caption_sv",
        "caption_multi",
        "url",
        "multi_language_code",
        "multi_language_name",
        "multi_target",
        "target_code",
        "opus_mt_url",
        "index",
    ]
]

df = df.rename(columns={"multi_target": "multiple_target_model"})
# Strip the URL prefix as a literal string. regex=False makes this independent
# of the pandas version: before pandas 2.0 the default treated the pattern as a
# regular expression, where "." matches any character.
df["opus_mt_url"] = df["opus_mt_url"].str.replace(
    "https://huggingface.co/", "", regex=False
)

df.to_feather("ccs_synthetic.feather")