|
|
|
"""
|
|
Created on Mon Mar 1 15:33:54 2021
|
|
|
|
@author: luol2
|
|
"""
|
|
|
|
def _collect_spans(tags):
    """Group one BIO tag sequence into ``{entity_type: [[start, end], ...]}``.

    ``start`` and ``end`` are inclusive token indices. An entity is opened by
    a ``B-`` tag, or by an ``I-`` tag that has nothing to continue (first
    token, previous tag is ``'O'``, or no entity has been opened yet). An
    ``I-`` tag that follows a ``B-``/``I-`` tag continues the open entity and
    keeps the type of the tag that opened it. An entity is closed when the
    sequence ends or the next tag is ``'O'`` or starts with ``B-``.
    """
    spans = {}
    start, ent_type = 0, None
    last = len(tags) - 1
    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            start, ent_type = i, tag[2:]
        elif tag.startswith('I-'):
            # Orphan I- (nothing to its left to continue): treat it as the
            # beginning of a new entity instead of reusing stale state.
            if i == 0 or tags[i - 1] == 'O' or ent_type is None:
                start, ent_type = i, tag[2:]
        else:
            # 'O' (or any other tag) neither opens nor closes anything here.
            continue
        if i == last or tags[i + 1] == 'O' or tags[i + 1].startswith('B-'):
            spans.setdefault(ent_type, []).append([start, i])
    return spans


def BIO_tag(tokens):
    """Extract gold and predicted entity spans from one tagged sentence.

    Args:
        tokens: list of lines, each tab-separated with the gold BIO tag in
            column 1 and the predicted BIO tag in column 2 (column 0 is not
            read).

    Returns:
        A tuple ``(gold_entity, pre_entity)``. Each is a dict mapping an
        entity type (the tag text after ``B-``/``I-``) to a list of
        ``[start, end]`` inclusive token-index pairs, in order of appearance.

    Raises:
        IndexError: if a line has fewer than three tab-separated columns.

    Both columns are decoded with the same rules, so an ``I-`` tag that
    starts a sentence or follows ``'O'`` opens a new entity in the gold
    column exactly as it does in the prediction column.
    """
    gold_tags = []
    pre_tags = []
    for token in tokens:
        segs = token.split('\t')
        gold_tags.append(segs[1])
        pre_tags.append(segs[2])
    return _collect_spans(gold_tags), _collect_spans(pre_tags)
|
|
|
|
|
|
def NER_Evaluation(file=None):
    """Print per-type and overall precision/recall/F1 for a prediction file.

    Args:
        file: path of the file to evaluate. Sentences are separated by a
            blank line and each line is passed to ``BIO_tag``. When omitted,
            the hard-coded ``dev_pre.conll_all`` path below is used.

    A predicted mention counts as a true positive only when the same
    ``[start, end]`` pair exists for the same entity type in the gold spans
    of that sentence. Any ratio whose denominator is zero is reported as 0.
    Nothing is returned; results are printed.
    """
    if file is None:
        path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/Kfold/BiLSTM-CRF/'
        file = path+'dev_pre.conll_all'
    with open(file, 'r', encoding='utf-8') as fin:
        all_sentence = fin.read().strip().split('\n\n')

    # Metrics[entity_type] = [true positives, gold count, predicted count]
    Metrics = {}
    for sentence in all_sentence:
        # An empty file (or 3+ consecutive newlines) yields empty chunks that
        # contain no token lines to parse.
        if not sentence.strip():
            continue
        gold_entity, pre_entity = BIO_tag(sentence.split('\n'))

        for entity_type, mentions in gold_entity.items():
            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
        for entity_type, mentions in pre_entity.items():
            counts = Metrics.setdefault(entity_type, [0, 0, 0])
            counts[2] += len(mentions)
            gold_mentions = gold_entity.get(entity_type, [])
            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
    print(Metrics)

    TP, Gold_num, Pre_num = 0, 0, 0
    for ele, (tp, gold_num, pre_num) in Metrics.items():
        p = tp / pre_num if pre_num else 0
        r = tp / gold_num if gold_num else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        TP += tp
        Gold_num += gold_num
        Pre_num += pre_num
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

    # Micro-averaged scores over all entity types, guarded against empty
    # gold/prediction sets and against zero true positives.
    P = TP / Pre_num if Pre_num else 0
    R = TP / Gold_num if Gold_num else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
|
|
|
|
def NER_Evaluation_fn(file, max_sentences=5000):
    """Evaluate a prediction file and return the overall (micro) F1.

    Args:
        file: path of the file to evaluate. Sentences are separated by a
            blank line and each line is passed to ``BIO_tag``.
        max_sentences: only the first ``max_sentences`` sentences are
            evaluated (default 5000, the limit this function has always
            applied). Pass ``None`` to evaluate every sentence.

    Returns:
        The overall F1 computed from the summed true-positive, gold and
        predicted counts of all entity types; 0 when it is undefined.

    A predicted mention counts as a true positive only when the same
    ``[start, end]`` pair exists for the same entity type in the gold spans
    of that sentence. Per-type and overall scores are also printed.
    """
    with open(file, 'r', encoding='utf-8') as fin:
        all_sentence = fin.read().strip().split('\n\n')
    if max_sentences is not None:
        all_sentence = all_sentence[:max_sentences]

    # Metrics[entity_type] = [true positives, gold count, predicted count]
    Metrics = {}
    for sentence in all_sentence:
        # An empty file (or 3+ consecutive newlines) yields empty chunks that
        # contain no token lines to parse.
        if not sentence.strip():
            continue
        gold_entity, pre_entity = BIO_tag(sentence.split('\n'))

        for entity_type, mentions in gold_entity.items():
            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
        for entity_type, mentions in pre_entity.items():
            counts = Metrics.setdefault(entity_type, [0, 0, 0])
            counts[2] += len(mentions)
            gold_mentions = gold_entity.get(entity_type, [])
            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
    print(Metrics)

    TP, Gold_num, Pre_num = 0, 0, 0
    for ele, (tp, gold_num, pre_num) in Metrics.items():
        p = tp / pre_num if pre_num else 0
        r = tp / gold_num if gold_num else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        TP += tp
        Gold_num += gold_num
        Pre_num += pre_num
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

    # Micro-averaged scores; every denominator is guarded so an input with
    # no gold entities or no predictions yields 0 instead of raising.
    P = TP / Pre_num if Pre_num else 0
    R = TP / Gold_num if Gold_num else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
    return F1
|
|
|
|
# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    NER_Evaluation()
|
|
|