Spaces:
Running
Running
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse | |
import os | |
from pathlib import Path | |
import sys | |
pwd = os.path.abspath(os.path.dirname(__file__)) | |
sys.path.append(os.path.join(pwd, "../../")) | |
import pandas as pd | |
from toolbox.torch.utils.data.vocabulary import Vocabulary | |
def get_args():
    """Parse this script's command-line options.

    Recognized options (all parsed as strings):
        --vocabulary_dir: defaults to "vocabulary".
        --train_dataset: defaults to "train.xlsx".
        --valid_dataset: defaults to "valid.xlsx".

    Returns:
        The ``argparse.Namespace`` obtained from parsing ``sys.argv``.
    """
    # (flag, default) pairs, registered in this order.
    option_defaults = (
        ("--vocabulary_dir", "vocabulary"),
        ("--train_dataset", "train.xlsx"),
        ("--valid_dataset", "valid.xlsx"),
    )

    arg_parser = argparse.ArgumentParser()
    for flag, default_value in option_defaults:
        arg_parser.add_argument(flag, default=default_value, type=str)
    return arg_parser.parse_args()
def main():
    """Build a label vocabulary from the train and valid spreadsheets.

    Reads the two Excel files named by ``--train_dataset`` and
    ``--valid_dataset``, adds the ``"labels"`` value of every row to the
    ``"labels"`` namespace of a ``Vocabulary`` (train rows first, then valid
    rows), and saves the vocabulary to the ``--vocabulary_dir`` path.
    """
    args = get_args()

    # Both spreadsheets are loaded up front, before the vocabulary exists,
    # so an unreadable file fails before any vocabulary work starts.
    datasets = [
        pd.read_excel(args.train_dataset),
        pd.read_excel(args.valid_dataset),
    ]

    vocabulary = Vocabulary()

    # train first, then valid
    for dataset in datasets:
        for _, row in dataset.iterrows():
            vocabulary.add_token_to_namespace(row["labels"], namespace="labels")

    vocabulary.save_to_files(args.vocabulary_dir)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()