|
|
|
"""
|
|
Created on Tue Mar 10 16:34:12 2020
|
|
|
|
@author: luol2
|
|
"""
|
|
import numpy as np
|
|
import io
|
|
import sys
|
|
|
|
def ml_intext(file):
    """Read a tab-separated, blank-line-delimited text file into nested lists.

    The whole file is read, stripped of leading/trailing whitespace, split
    into sentence blocks on empty lines, each block into lines, and each
    line into its tab-separated fields.

    Args:
        file: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences; each sentence is a list of rows; each row is
        the list of tab-separated fields of one line. An empty file yields
        ``[[['']]]`` because splitting an empty string still produces one
        empty field.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding raises.
    with open(file, 'r', encoding='utf-8') as fin:
        alltexts = fin.read().strip().split('\n\n')

    data_list = []
    for sents in alltexts:
        data_list.append([line.split('\t') for line in sents.split('\n')])
    return data_list
|
|
|
|
def ml_intext_fn(ml_input):
    """Parse an in-memory tab-separated, blank-line-delimited string.

    Same parsing as the file-based reader, but the text is supplied directly
    as ``ml_input`` and passed through an ``io.StringIO`` buffer.

    Args:
        ml_input: The text to parse.

    Returns:
        A list of sentences; each sentence is a list of rows; each row is
        the list of tab-separated fields of one line.
    """
    buffer = io.StringIO(ml_input)
    content = buffer.read()
    buffer.close()

    # Sentences are separated by an empty line, rows by a newline,
    # fields by a tab.
    return [
        [row.split('\t') for row in block.split('\n')]
        for block in content.strip().split('\n\n')
    ]
|
|
|
|
|
|
def out_BIO_crf(file,raw_pre,raw_input,label_set):
    """Write tokens with their predicted labels to a tab-separated file.

    For every token row, one line ``first_field<TAB>last_field<TAB>tag`` is
    written; every sentence is followed by an empty line.

    Args:
        file: Path of the UTF-8 output file (overwritten).
        raw_pre: Per-sentence sequences of predicted label ids, indexed by
            token position.
        raw_input: Per-sentence lists of token rows (sequences of strings).
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Raises:
        KeyError: If a predicted id is missing from ``label_set``.
    """
    # `with` closes the file even if a label lookup fails mid-write.
    with open(file, 'w', encoding='utf-8') as fout:
        for i, sentence in enumerate(raw_input):
            for j, token in enumerate(sentence):
                if j < len(raw_pre[i]):
                    label_tag = label_set[str(raw_pre[i][j])]
                else:
                    # Tokens beyond the prediction length default to 'O'.
                    label_tag = 'O'
                fout.write(token[0] + '\t' + token[-1] + '\t' + label_tag + '\n')
            fout.write('\n')
|
|
def out_BIO_crf_fn(raw_pre,raw_input,label_set):
    """Render tokens with their predicted labels as a tab-separated string.

    Each token row becomes ``first_field<TAB>last_field<TAB>tag``; each
    sentence is terminated by an empty line.

    Args:
        raw_pre: Per-sentence sequences of predicted label ids, indexed by
            token position.
        raw_input: Per-sentence lists of token rows (sequences of strings).
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Returns:
        The formatted text.
    """
    pieces = []
    for sent_idx, sentence in enumerate(raw_input):
        for tok_idx, token in enumerate(sentence):
            if tok_idx >= len(raw_pre[sent_idx]):
                # No prediction for this position.
                tag = 'O'
            else:
                tag = label_set[str(raw_pre[sent_idx][tok_idx])]
            pieces.append(token[0] + '\t' + token[-1] + '\t' + tag + '\n')
        pieces.append('\n')
    return ''.join(pieces)
|
|
def out_BIO_softmax(file,raw_pre,raw_input,label_set):
    """Write tokens with their arg-max predicted labels to a tab-separated file.

    For every token row, one line ``first_field<TAB>last_field<TAB>tag`` is
    written, where the tag is looked up from the index of the largest value
    in that token's prediction vector. Every sentence is followed by an
    empty line.

    Args:
        file: Path of the UTF-8 output file (overwritten).
        raw_pre: Per-sentence sequences of per-token score vectors, indexed
            by token position.
        raw_input: Per-sentence lists of token rows (sequences of strings).
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Raises:
        KeyError: If an arg-max id is missing from ``label_set``.
    """
    # `with` closes the file even if a label lookup fails mid-write.
    with open(file, 'w', encoding='utf-8') as fout:
        for i, sentence in enumerate(raw_input):
            for j, token in enumerate(sentence):
                if j < len(raw_pre[i]):
                    label_id = np.argmax(raw_pre[i][j])
                    label_tag = label_set[str(label_id)]
                else:
                    # Tokens beyond the prediction length default to 'O'.
                    label_tag = 'O'
                fout.write(token[0] + '\t' + token[-1] + '\t' + label_tag + '\n')
            fout.write('\n')
|
|
def out_BIO_softmax_fn(raw_pre,raw_input,label_set):
    """Render tokens with their arg-max predicted labels as a string.

    Each token row becomes ``first_field<TAB>last_field<TAB>tag``, with the
    tag looked up from the index of the largest value in the token's
    prediction vector; each sentence is terminated by an empty line.

    Args:
        raw_pre: Per-sentence sequences of per-token score vectors, indexed
            by token position.
        raw_input: Per-sentence lists of token rows (sequences of strings).
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Returns:
        The formatted text.
    """
    out = io.StringIO()
    for sent_idx, sentence in enumerate(raw_input):
        for tok_idx, token in enumerate(sentence):
            tag = 'O'  # fallback when there is no prediction for this position
            if tok_idx < len(raw_pre[sent_idx]):
                best = np.argmax(raw_pre[sent_idx][tok_idx])
                tag = label_set[str(best)]
            out.write(token[0] + '\t' + token[-1] + '\t' + tag + '\n')
        out.write('\n')
    return out.getvalue()
|
|
|
|
def out_BIO_BERT_softmax(file,raw_pre,raw_input,label_set):
    """Write tokens with arg-max labels to a file, using per-token indices.

    The last field of each token row is used as the index into that
    sentence's predictions (presumably the token's position in the model's
    sub-token sequence -- confirm against the caller). One line
    ``field0<TAB>field1<TAB>tag`` is written per token and every sentence
    is followed by an empty line.

    Args:
        file: Path of the UTF-8 output file (overwritten).
        raw_pre: Per-sentence sequences of score vectors.
        raw_input: Per-sentence lists of token rows; fields 0 and 1 are
            strings and the last field is an integer index.
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Raises:
        KeyError: If an arg-max id is missing from ``label_set``.
    """
    # `with` closes the file even if a label lookup fails mid-write.
    with open(file, 'w', encoding='utf-8') as fout:
        for i, sentence in enumerate(raw_input):
            for token in sentence:
                pre_index = token[-1]
                if pre_index < len(raw_pre[i]):
                    label_id = np.argmax(raw_pre[i][pre_index])
                    label_tag = label_set[str(label_id)]
                else:
                    # Index beyond the available predictions defaults to 'O'.
                    label_tag = 'O'
                fout.write(token[0] + '\t' + token[1] + '\t' + label_tag + '\n')
            fout.write('\n')
|
|
def out_BIO_BERT_softmax_fn(raw_pre,raw_input,label_set):
    """Render tokens with arg-max labels as a string, using per-token indices.

    The last field of each token row selects which prediction vector of the
    sentence is used. Each token becomes ``field0<TAB>field1<TAB>tag`` and
    each sentence is terminated by an empty line.

    Args:
        raw_pre: Per-sentence sequences of score vectors.
        raw_input: Per-sentence lists of token rows; fields 0 and 1 are
            strings and the last field is an integer index.
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Returns:
        The formatted text.
    """
    chunks = []
    for sent_idx, sentence in enumerate(raw_input):
        for token in sentence:
            if token[-1] < len(raw_pre[sent_idx]):
                winner = np.argmax(raw_pre[sent_idx][token[-1]])
                tag = label_set[str(winner)]
            else:
                # Index has no matching prediction.
                tag = 'O'
            chunks.append(token[0] + '\t' + token[1] + '\t' + tag + '\n')
        chunks.append('\n')
    return ''.join(chunks)
|
|
def out_BIO_BERT_crf(file,raw_pre,raw_input,label_set):
    """Write tokens with predicted labels to a file, using per-token indices.

    The last field of each token row is used as the index into that
    sentence's predicted label ids (presumably the token's position in the
    model's sub-token sequence -- confirm against the caller). One line
    ``field0<TAB>field1<TAB>tag`` is written per token and every sentence
    is followed by an empty line.

    Args:
        file: Path of the UTF-8 output file (overwritten).
        raw_pre: Per-sentence sequences of predicted label ids.
        raw_input: Per-sentence lists of token rows; fields 0 and 1 are
            strings and the last field is an integer index.
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Raises:
        KeyError: If a predicted id is missing from ``label_set``.
    """
    # `with` closes the file even if a label lookup fails mid-write.
    with open(file, 'w', encoding='utf-8') as fout:
        for i, sentence in enumerate(raw_input):
            for token in sentence:
                pre_index = token[-1]
                if pre_index < len(raw_pre[i]):
                    label_tag = label_set[str(raw_pre[i][pre_index])]
                else:
                    # Index beyond the available predictions defaults to 'O'.
                    label_tag = 'O'
                fout.write(token[0] + '\t' + token[1] + '\t' + label_tag + '\n')
            fout.write('\n')
|
|
def out_BIO_BERT_crf_fn(raw_pre,raw_input,label_set):
    """Render tokens with predicted labels as a string, using per-token indices.

    The last field of each token row selects which predicted label id of the
    sentence is used. Each token becomes ``field0<TAB>field1<TAB>tag`` and
    each sentence is terminated by an empty line.

    Args:
        raw_pre: Per-sentence sequences of predicted label ids.
        raw_input: Per-sentence lists of token rows; fields 0 and 1 are
            strings and the last field is an integer index.
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Returns:
        The formatted text.
    """
    out = io.StringIO()
    for sent_idx, sentence in enumerate(raw_input):
        for token in sentence:
            tag = 'O'  # used when the index has no matching prediction
            if token[-1] < len(raw_pre[sent_idx]):
                tag = label_set[str(raw_pre[sent_idx][token[-1]])]
            out.write(token[0] + '\t' + token[1] + '\t' + tag + '\n')
        out.write('\n')
    return out.getvalue()
|
|
|
|
def out_BIO_BERT_softmax_score_fn(raw_pre,raw_input,label_set):
    """Render tokens with arg-max labels and their scores as a string.

    Predictions are indexed by token position. Each token becomes
    ``first_field<TAB>last_field<TAB>tag<TAB>score`` where the score is the
    winning value rounded to four decimals; tokens without a prediction get
    tag ``O`` and score ``0.0``. Each sentence is terminated by an empty line.

    Args:
        raw_pre: Per-sentence sequences of per-token score vectors.
        raw_input: Per-sentence lists of token rows (sequences of strings).
        label_set: Mapping from ``str(label_id)`` to the label tag.

    Returns:
        The formatted text.
    """
    rows = []
    for sent_idx, sentence in enumerate(raw_input):
        for tok_idx, token in enumerate(sentence):
            if tok_idx >= len(raw_pre[sent_idx]):
                tag, score = 'O', 0.0
            else:
                best = np.argmax(raw_pre[sent_idx][tok_idx])
                score = round(raw_pre[sent_idx][tok_idx][best], 4)
                tag = label_set[str(best)]
            rows.append(token[0] + '\t' + token[-1] + '\t' + tag + '\t' + str(score) + '\n')
        rows.append('\n')
    return ''.join(rows)
|
|
|
|
def char_vocab(infile,outfile_char):
    """Collect the character vocabulary of the first column of a TSV file.

    Every non-blank line of ``infile`` is split on tabs and the characters
    of its first field are added, in first-seen order, to a vocabulary that
    starts with the entry ``oov_char``. The vocabulary is written to
    ``outfile_char`` one entry per line. Each first field that sets a new
    maximum length is printed as it is encountered, and the final maximum
    length is printed at the end.

    Args:
        infile: Path of the UTF-8 input file.
        outfile_char: Path of the UTF-8 vocabulary file to write
            (overwritten).
    """
    vocab = ['oov_char']
    # Companion set gives O(1) membership tests; the list keeps the
    # first-seen order for output.
    seen = set(vocab)
    max_len = 0

    # Both handles are opened up front (input first, as before) and are
    # closed even if reading or writing raises.
    with open(infile, 'r', encoding='utf-8') as fin, \
            open(outfile_char, 'w', encoding='utf-8') as fout_char:
        for line in fin:
            if line.strip() == '':
                continue
            word = line.split('\t')[0]
            if len(word) > max_len:
                max_len = len(word)
                print(word)
            for ch in word:
                if ch not in seen:
                    seen.add(ch)
                    vocab.append(ch)

        fout_char.writelines(ele + '\n' for ele in vocab)

    print('max_len:', max_len)
|
|
|
|
|
|
if __name__ == '__main__':
    # Ad-hoc scratch check: print a small list without its last element.
    a = [1, 2, 3]
    print(a[:-1])
|
|
|