Update all_datasets.py
Browse files — all_datasets.py (+3 −43)
all_datasets.py
CHANGED
|
@@ -61,54 +61,14 @@ def sentiment_dataset(path_folder, train_file_name, test_file_name):
|
|
| 61 |
custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
|
| 62 |
return custom_tokenized
|
| 63 |
|
| 64 |
-
# support function for ner task
def get_dict_map(data, mode="token"):
    """Build forward/inverse index lookup tables from sentence data.

    Args:
        data: list of sentences, each a list of ``[token, tag]`` pairs.
        mode: ``"token"`` to index the tokens (pair element 0); any other
            value indexes the tags (pair element 1).

    Returns:
        tuple ``(tok2idx, idx2tok)``: item-to-index and index-to-item dicts.
    """
    # Pick which element of each [token, tag] pair to index.
    field = 0 if mode == "token" else 1
    # Sort the de-duplicated vocabulary so id assignment is deterministic
    # across runs: plain set iteration order depends on PYTHONHASHSEED,
    # which made saved label mappings irreproducible.
    vocab = sorted({pair[field] for sent in data for pair in sent})
    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}
    return tok2idx, idx2tok
|
| 73 |
-
|
| 74 |
-
def read_csv_to_ner_data(path):
    """Load a token/label CSV into per-sentence ``[token, label]`` pairs.

    The CSV is expected to have "token" and "label" columns; sentence
    boundaries are blank token cells, which pandas reads back as NaN.

    Args:
        path: path to the UTF-8 encoded CSV file.

    Returns:
        list of sentences, each a list of ``[token, label]`` pairs.
        Empty sentences (from consecutive separator rows) are omitted —
        downstream NER featurization skips empty sentences anyway.
    """
    frame = pd.read_csv(path, encoding="utf-8")
    # replace_all is the module's text-normalization helper.
    toks = [replace_all(t) for t in frame["token"]]
    labs = list(frame["label"])

    sentences = []
    current = []
    for txt, lab in zip(toks, labs):
        # A NaN token cell marks a sentence boundary.
        if str(txt) != "nan":
            current.append([txt, lab])
        elif current:
            sentences.append(current)
            current = []
    # Bug fix: keep the trailing sentence even when the file does not end
    # with a blank separator row (the original silently dropped it).
    if current:
        sentences.append(current)
    return sentences
|
| 101 |
-
|
| 102 |
# get feature for ner task
|
| 103 |
def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
|
| 104 |
features = []
|
| 105 |
tokens = []
|
| 106 |
tag_ids = []
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
df = read_csv_to_ner_data(os.path.join(path, file_name))
|
| 111 |
-
tag2idx, idx2tag = get_dict_map(df, 'tag')
|
| 112 |
for id, tokens in enumerate(data):
|
| 113 |
if tokens == []:
|
| 114 |
continue
|
|
|
|
| 61 |
custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
|
| 62 |
return custom_tokenized
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# get feature for ner task
|
| 65 |
def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
|
| 66 |
features = []
|
| 67 |
tokens = []
|
| 68 |
tag_ids = []
|
| 69 |
+
|
| 70 |
+
idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
|
| 71 |
+
tag2idx = {v: k for k, v in idx2tag.items()}
|
|
|
|
|
|
|
| 72 |
for id, tokens in enumerate(data):
|
| 73 |
if tokens == []:
|
| 74 |
continue
|