Update all_datasets.py
Browse files — all_datasets.py (+3 −43)
all_datasets.py
CHANGED
|
@@ -61,54 +61,14 @@ def sentiment_dataset(path_folder, train_file_name, test_file_name):
|
|
| 61 |
custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
|
| 62 |
return custom_tokenized
|
| 63 |
|
| 64 |
-
# support function for ner task
def get_dict_map(data, mode="token"):
    """Build forward/inverse index lookup tables from sentence data.

    Args:
        data: list of sentences, each a list of ``[token, tag]`` pairs.
        mode: ``"token"`` to index the tokens (pair element 0); any other
            value indexes the tags (pair element 1).

    Returns:
        tuple ``(tok2idx, idx2tok)``: item-to-index and index-to-item dicts.
    """
    # Pick which element of each [token, tag] pair to index.
    field = 0 if mode == "token" else 1
    # Sort the de-duplicated vocabulary so id assignment is deterministic
    # across runs: plain set iteration order depends on PYTHONHASHSEED,
    # which made saved label mappings irreproducible.
    vocab = sorted({pair[field] for sent in data for pair in sent})
    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}
    return tok2idx, idx2tok
|
| 73 |
-
|
| 74 |
-
def read_csv_to_ner_data(path):
    """Load a token/label CSV into per-sentence ``[token, label]`` pairs.

    The CSV is expected to have "token" and "label" columns; sentence
    boundaries are blank token cells, which pandas reads back as NaN.

    Args:
        path: path to the UTF-8 encoded CSV file.

    Returns:
        list of sentences, each a list of ``[token, label]`` pairs.
        Empty sentences (from consecutive separator rows) are omitted —
        downstream NER featurization skips empty sentences anyway.
    """
    frame = pd.read_csv(path, encoding="utf-8")
    # replace_all is the module's text-normalization helper.
    toks = [replace_all(t) for t in frame["token"]]
    labs = list(frame["label"])

    sentences = []
    current = []
    for txt, lab in zip(toks, labs):
        # A NaN token cell marks a sentence boundary.
        if str(txt) != "nan":
            current.append([txt, lab])
        elif current:
            sentences.append(current)
            current = []
    # Bug fix: keep the trailing sentence even when the file does not end
    # with a blank separator row (the original silently dropped it).
    if current:
        sentences.append(current)
    return sentences
|
| 101 |
-
|
| 102 |
# get feature for ner task
|
| 103 |
def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
|
| 104 |
features = []
|
| 105 |
tokens = []
|
| 106 |
tag_ids = []
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
df = read_csv_to_ner_data(os.path.join(path, file_name))
|
| 111 |
-
tag2idx, idx2tag = get_dict_map(df, 'tag')
|
| 112 |
for id, tokens in enumerate(data):
|
| 113 |
if tokens == []:
|
| 114 |
continue
|
|
|
|
| 61 |
custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
|
| 62 |
return custom_tokenized
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# get feature for ner task
|
| 65 |
def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
|
| 66 |
features = []
|
| 67 |
tokens = []
|
| 68 |
tag_ids = []
|
| 69 |
+
|
| 70 |
+
idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
|
| 71 |
+
tag2idx = {v: k for k, v in idx2tag.items()}
|
|
|
|
|
|
|
| 72 |
for id, tokens in enumerate(data):
|
| 73 |
if tokens == []:
|
| 74 |
continue
|