medmac01
Added multilingual_clip module
3bd5293
raw
history blame
857 Bytes
import os
import pandas as pd
# Rebuild the synthetic caption table: the `caption_multi` column of the base
# feather file is replaced by the `caption_multi` values found in the shard
# files under `data_translated/` (joined on the `index` column), and the
# result is written to `ccs_synthetic.feather`.

# os.listdir returns entries in arbitrary order and also lists hidden files
# (e.g. .DS_Store, .gitkeep). Sort for reproducible runs and skip dot-files so
# read_feather is only handed shard files.
filenames = sorted(
    filename
    for filename in os.listdir("data_translated")
    if not filename.startswith(".")
)
if not filenames:
    # pd.concat would raise an opaque "No objects to concatenate" otherwise.
    raise ValueError("No shard files found in 'data_translated'")

df = pd.read_feather("data/ccs_synthetic_multi.feather")

# Load every shard and stack them into one frame with a fresh 0..n-1 index.
df_list = [
    pd.read_feather(os.path.join("data_translated", filename))
    for filename in filenames
]
df_multi = pd.concat(df_list, ignore_index=True)

# Drop the existing caption_multi so the merge below does not produce
# suffixed duplicate columns (caption_multi_x / caption_multi_y).
df = df.drop("caption_multi", axis=1)

# Left join keeps every row of the base table; rows whose `index` has no
# match in the shards get NaN in caption_multi.
# NOTE(review): if `index` is not unique across shards, this join will
# duplicate base rows -- confirm the shards do not overlap.
df = df.merge(df_multi[["caption_multi", "index"]], how="left", on="index")

# Select the output columns and fix their order.
df = df[
    [
        "caption",
        "caption_sv",
        "caption_multi",
        "url",
        "multi_language_code",
        "multi_language_name",
        "multi_target",
        "target_code",
        "opus_mt_url",
        "index",
    ]
]
df = df.rename(columns={"multi_target": "multiple_target_model"})

# Strip the URL prefix as a literal string. regex=False is explicit because
# pandas < 2.0 defaulted to regex=True, where the dots would match any char.
df["opus_mt_url"] = df["opus_mt_url"].str.replace(
    "https://huggingface.co/", "", regex=False
)

df.to_feather("ccs_synthetic.feather")