# -*- coding: utf-8 -*-
"""
Helpers for reading token-per-line NER data (one token per line, columns
separated by TAB, sentences separated by a blank line).

Created on Tue Mar 10 16:34:12 2020

@author: luol2
"""
import numpy as np
import io
import sys


def ml_intext(file):
    """Read a token-per-line NER file and split it into sentences.

    The file is read as UTF-8, leading/trailing whitespace is stripped, and
    the text is split into sentence blocks on blank lines. Parsing of the
    blocks is delegated to ``ml_intext_fn``.

    Args:
        file: Path of the file to read.

    Returns:
        A ``(data_list, label_list)`` tuple in the format described in
        ``ml_intext_fn``: ``[[[w1, label], [w2, label]], ...]`` plus the flat
        list of last-column values. An empty (or whitespace-only) file
        yields ``([], [])``.
    """
    # The context manager closes the handle even if reading/decoding fails.
    with open(file, 'r', encoding='utf-8') as fin:
        text = fin.read().strip()

    # ''.split('\n\n') is [''], which would otherwise produce one bogus
    # sentence holding a single empty token with an empty label.
    if not text:
        return [], []
    return ml_intext_fn(text.split('\n\n'))


def ml_intext_fn(alltexts):
    """Parse already-split sentence blocks into token rows and labels.

    Args:
        alltexts: Iterable of strings, one per sentence. Each string holds
            one token per line, with the columns of a line separated by TAB
            (e.g. the result of ``text.strip().split('\\n\\n')``).

    Returns:
        A ``(data_list, label_list)`` tuple. ``data_list`` has one entry per
        element of ``alltexts`` (same order, nothing is filtered out), each
        entry being the list of column lists of that sentence.
        ``label_list`` is the flat list of the last column of every line,
        in reading order.
    """
    data_list = []
    label_list = []
    for sents in alltexts:
        sentence = [line.split('\t') for line in sents.split('\n')]
        data_list.append(sentence)
        # The label is whatever sits in the last column of each line.
        label_list.extend(columns[-1] for columns in sentence)
    return data_list, label_list


# ---------------------------------------------------------------------------
# NOTE(review): this copy of the file is damaged between the start of
# out_BIO and the end of char_vocab. The text after "if j" and before
# "max_len:" is missing -- it looks as if a "<...>" span was stripped as
# markup (TODO confirm against the original file). The rest of out_BIO, the
# header and setup of char_vocab, and anything defined between the two are
# therefore unknown, and neither function is defined here. The surviving
# text is kept verbatim below (its line breaks and indentation are lost)
# instead of being rebuilt by guesswork.
#
#   # model predict result to conll evalute format [token answer predict]
#   def out_BIO(file,raw_pre,raw_input,label_set):
#   fout=open(file,'w',encoding='utf-8')
#   for i in range(len(raw_input)): for j in range(len(raw_input[i])): if j
#
#   [... text lost here ...]
#
#   max_len: max_len=word_len print(seg[0]) for i in range(word_len):
#   if seg[0][i] not in char_vocab: char_vocab.append(seg[0][i])
#   #else: # fout.write(line) fin.close() #fout.close()
#   for ele in char_vocab: fout_char.write(ele+'\n') fout_char.close()
#   print('max_len:',max_len)
# ---------------------------------------------------------------------------


if __name__=='__main__':
    # Example invocation kept from the original (char_vocab is part of the
    # damaged region described above):
#    infile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO'
#    #outfile='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/data/pubmed_unlabel/mutation_disease_1990.ner_BIO_new'
#    outfile_char='//panfs/pan1/bionlp/lulab/luoling/HPO_project/AutoPhe/src/nn_model/vocab/char_vocab'
#    #processing_text(file)
#    char_vocab(infile,outfile_char)
    a=[1,2,3]
    print(a[:-1])