"""
Entity-level NER evaluation (precision / recall / F1) for BIO-tagged CoNLL files.

Created on Mon Mar  1 15:33:54 2021

@author: luol2
"""
|
|
|
def _bio_spans(tags):
    """Collect entity spans from one column of BIO tags.

    Args:
        tags: list of tag strings for one sentence (``B-<type>``,
            ``I-<type>`` or ``O``).

    Returns:
        dict mapping entity type to a list of ``[start, end]`` token index
        pairs (both inclusive), in order of appearance.
    """
    spans = {}
    start = 0
    current_type = None  # None means no entity is currently open
    last = len(tags) - 1
    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            start, current_type = i, tag[2:]
        elif tag.startswith('I-'):
            # An I- tag with no open entity (sentence start or right after
            # an 'O') is treated leniently as the beginning of a new entity.
            # Otherwise it extends the open entity, whose type stays the one
            # given by its first tag.
            if current_type is None:
                start, current_type = i, tag[2:]
        else:
            continue
        # The entity ends here when the sentence ends or the next tag is
        # 'O' or starts another entity.
        if i == last or tags[i + 1] == 'O' or tags[i + 1].startswith('B-'):
            spans.setdefault(current_type, []).append([start, i])
            current_type = None
    return spans


def BIO_tag(tokens):
    """Extract gold and predicted entity spans from one BIO-tagged sentence.

    Each element of ``tokens`` is a tab-separated line whose column 1 holds
    the gold tag and column 2 the predicted tag (column 0 is presumably the
    token text; it is not read here).

    Both columns are decoded with the same rules. In particular an ``I-`` tag
    that does not continue an entity starts a new one, so a malformed gold
    column no longer raises ``UnboundLocalError`` or reuses the span of an
    earlier entity.

    Args:
        tokens: list of tab-separated token lines for a single sentence.

    Returns:
        ``(gold_entity, pre_entity)``: two dicts mapping entity type to a
        list of ``[start, end]`` token index pairs (inclusive).

    Raises:
        IndexError: if a line has fewer than three tab-separated columns.
    """
    rows = [token.split('\t') for token in tokens]
    gold_entity = _bio_spans([row[1] for row in rows])
    pre_entity = _bio_spans([row[2] for row in rows])
    return gold_entity, pre_entity
|
|
|
|
|
def NER_Evaluation(path='//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/Kfold/BiLSTM-CRF/'):
    """Print per-type and overall entity-level P/R/F1 for ``dev_pre.conll_all``.

    The file ``path + 'dev_pre.conll_all'`` is read as blank-line-separated
    sentences of tab-separated token lines, which are decoded with
    ``BIO_tag``. A predicted mention is a true positive when the same
    ``[start, end]`` span exists in the gold mentions of the same type.

    Args:
        path: directory prefix (including the trailing separator) of the
            prediction file. Defaults to the original hard-coded location.
    """
    def prf(tp, gold_num, pre_num):
        # Every ratio falls back to 0 when its denominator is 0 so that an
        # empty or entity-free file does not raise ZeroDivisionError.
        p = tp / pre_num if pre_num else 0
        r = tp / gold_num if gold_num else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        return p, r, f1

    with open(path+'dev_pre.conll_all', 'r', encoding='utf-8') as fin:
        all_sentence = fin.read().strip().split('\n\n')

    Metrics = {}  # entity type -> [true positives, gold count, predicted count]
    for sentence in all_sentence:
        # Drop empty lines (empty file, extra blank line between sentences);
        # BIO_tag needs three columns on every line.
        tokens = [line for line in sentence.split('\n') if line]
        if not tokens:
            continue
        gold_entity, pre_entity = BIO_tag(tokens)

        for entity_type, mentions in gold_entity.items():
            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
        for entity_type, mentions in pre_entity.items():
            counts = Metrics.setdefault(entity_type, [0, 0, 0])
            counts[2] += len(mentions)
            gold_mentions = gold_entity.get(entity_type, [])
            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
    print(Metrics)

    TP, Gold_num, Pre_num = 0, 0, 0
    for ele, (tp, gold_num, pre_num) in Metrics.items():
        p, r, f1 = prf(tp, gold_num, pre_num)
        TP += tp
        Gold_num += gold_num
        Pre_num += pre_num
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

    P, R, F1 = prf(TP, Gold_num, Pre_num)
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
|
|
|
def NER_Evaluation_fn(file, max_sentences=5000):
    """Print entity-level P/R/F1 for a BIO-tagged CoNLL file and return F1.

    The file is read as blank-line-separated sentences of tab-separated
    token lines, which are decoded with ``BIO_tag``. A predicted mention is
    a true positive when the same ``[start, end]`` span exists in the gold
    mentions of the same type.

    Args:
        file: path of the prediction file to evaluate.
        max_sentences: evaluate only the first ``max_sentences`` sentences.
            The default of 5000 keeps the original cap; pass ``None`` to
            evaluate the whole file.

    Returns:
        The overall (micro-averaged) F1 over all entity types; 0 when there
        are no true positives.
    """
    def prf(tp, gold_num, pre_num):
        # Every ratio falls back to 0 when its denominator is 0 so that an
        # empty or entity-free file does not raise ZeroDivisionError.
        p = tp / pre_num if pre_num else 0
        r = tp / gold_num if gold_num else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        return p, r, f1

    with open(file, 'r', encoding='utf-8') as fin:
        all_sentence = fin.read().strip().split('\n\n')

    Metrics = {}  # entity type -> [true positives, gold count, predicted count]
    # Slicing with None keeps every sentence.
    for sentence in all_sentence[:max_sentences]:
        # Drop empty lines (empty file, extra blank line between sentences);
        # BIO_tag needs three columns on every line.
        tokens = [line for line in sentence.split('\n') if line]
        if not tokens:
            continue
        gold_entity, pre_entity = BIO_tag(tokens)

        for entity_type, mentions in gold_entity.items():
            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
        for entity_type, mentions in pre_entity.items():
            counts = Metrics.setdefault(entity_type, [0, 0, 0])
            counts[2] += len(mentions)
            gold_mentions = gold_entity.get(entity_type, [])
            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
    print(Metrics)

    TP, Gold_num, Pre_num = 0, 0, 0
    for ele, (tp, gold_num, pre_num) in Metrics.items():
        p, r, f1 = prf(tp, gold_num, pre_num)
        TP += tp
        Gold_num += gold_num
        Pre_num += pre_num
        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

    P, R, F1 = prf(TP, Gold_num, Pre_num)
    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
    return F1
|
|
|
if __name__=='__main__':
    # Script entry point: evaluate the default dev-set prediction file.
    NER_Evaluation()
|
|