|
|
|
"""
|
|
Created on Mon Mar 1 15:33:54 2021
|
|
|
|
@author: luol2
|
|
"""
|
|
|
|
|
|
def Rel_Evaluation(prefile):
    """Print entity-level precision, recall and F1 for a tagged prediction file.

    The file is read as blank-line-separated sentences with one token per
    line and tab-separated columns: token, gold label, predicted label.
    Every ``<GENE>`` ... ``</GENE>`` span is scored exactly once.  Its gold
    and predicted labels come from the opening-tag line; for a non-``O``
    label the first two characters are dropped (presumably a ``B-``/``I-``
    prefix -- confirm against the data).  If the opening tag carries ``O``,
    the first non-``O`` label met later in the span is used instead.

    Scoring per span:
      * both labels set and equal   -> TP
      * both labels set but differ  -> FP and FN
      * only the prediction is set  -> FP
      * only the gold label is set  -> FN

    Args:
        prefile: Path to the UTF-8 encoded prediction file.

    Returns:
        None.  ``TP,FP,FN`` and ``P,R,F1`` are printed to stdout.
    """
    # Context manager so the handle is released even if reading fails.
    with open(prefile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    TP = 0
    FP = 0
    FN = 0
    for sentence in all_in:
        tokens = sentence.split('\n')
        n_tokens = len(tokens)
        token_id = 0
        while token_id < n_tokens:
            seg = tokens[token_id].split('\t')
            if seg[0] != '<GENE>':
                token_id += 1
                continue

            # Labels attached to the opening tag, without their 2-char prefix.
            temp_gold = seg[1] if seg[1] == 'O' else seg[1][2:]
            temp_pre = seg[2] if seg[2] == 'O' else seg[2][2:]

            # Advance to the closing tag.  The bounds checks keep an
            # unterminated entity from raising IndexError: such a span simply
            # ends with the sentence.
            # NOTE(review): assumes the two fallback checks below belong to
            # this scan loop (applied to every token after the first inner
            # one, including the closing tag) -- confirm the intended nesting.
            token_id += 1
            while (token_id < n_tokens
                   and tokens[token_id].split('\t')[0] != '</GENE>'):
                token_id += 1
                if token_id >= n_tokens:
                    break
                seg = tokens[token_id].split('\t')
                if seg[1] != 'O' and temp_gold == 'O':
                    temp_gold = seg[1][2:]
                if seg[2] != 'O' and temp_pre == 'O':
                    temp_pre = seg[2][2:]

            if temp_pre != 'O' and temp_gold != 'O':
                if temp_pre == temp_gold:
                    TP += 1
                else:
                    # Wrong relation type: a spurious prediction and a miss.
                    FP += 1
                    FN += 1
            elif temp_pre != 'O':
                FP += 1
            elif temp_gold != 'O':
                FN += 1

            # Step past the closing tag (or past the end of the sentence).
            token_id += 1

    # A zero denominator yields 0 instead of raising ZeroDivisionError.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0

    print('TP,FP,FN:', TP, FP, FN)
    print('P,R,F1:', P, R, F1)
|
|
|
|
|
|
def Rel_Evaluation_fn(prefile):
    """Score a tagged prediction file at entity level and return the F1.

    The file is read as blank-line-separated sentences with one token per
    line and tab-separated columns: token, gold label, predicted label.
    Every ``<GENE>`` ... ``</GENE>`` span is scored exactly once.  Its gold
    and predicted labels come from the opening-tag line; for a non-``O``
    label the first two characters are dropped (presumably a ``B-``/``I-``
    prefix -- confirm against the data).  If the opening tag carries ``O``,
    the first non-``O`` label met later in the span is used instead.

    Scoring per span:
      * both labels set and equal   -> TP
      * both labels set but differ  -> FP and FN
      * only the prediction is set  -> FP
      * only the gold label is set  -> FN

    Args:
        prefile: Path to the UTF-8 encoded prediction file.

    Returns:
        The F1 score (``0`` when precision + recall is zero).  ``TP,FP,FN``
        and ``P,R,F1`` are also printed to stdout.
    """
    # Context manager so the handle is released even if reading fails.
    with open(prefile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    TP = 0
    FP = 0
    FN = 0
    for sentence in all_in:
        tokens = sentence.split('\n')
        n_tokens = len(tokens)
        token_id = 0
        while token_id < n_tokens:
            seg = tokens[token_id].split('\t')
            if seg[0] != '<GENE>':
                token_id += 1
                continue

            # Labels attached to the opening tag, without their 2-char prefix.
            temp_gold = seg[1] if seg[1] == 'O' else seg[1][2:]
            temp_pre = seg[2] if seg[2] == 'O' else seg[2][2:]

            # Advance to the closing tag.  The bounds checks keep an
            # unterminated entity from raising IndexError: such a span simply
            # ends with the sentence.
            # NOTE(review): assumes the two fallback checks below belong to
            # this scan loop (applied to every token after the first inner
            # one, including the closing tag) -- confirm the intended nesting.
            token_id += 1
            while (token_id < n_tokens
                   and tokens[token_id].split('\t')[0] != '</GENE>'):
                token_id += 1
                if token_id >= n_tokens:
                    break
                seg = tokens[token_id].split('\t')
                if seg[1] != 'O' and temp_gold == 'O':
                    temp_gold = seg[1][2:]
                if seg[2] != 'O' and temp_pre == 'O':
                    temp_pre = seg[2][2:]

            if temp_pre != 'O' and temp_gold != 'O':
                if temp_pre == temp_gold:
                    TP += 1
                else:
                    # A wrong relation type is both a spurious prediction and
                    # a missed gold relation; counting only the FP would
                    # overstate recall.
                    FP += 1
                    FN += 1
            elif temp_pre != 'O':
                FP += 1
            elif temp_gold != 'O':
                FN += 1

            # Step past the closing tag (or past the end of the sentence).
            token_id += 1

    print('TP,FP,FN:', TP, FP, FN)

    # A zero denominator yields 0 instead of raising ZeroDivisionError.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0

    print('P,R,F1:', P, R, F1)
    return F1
|
|
|
|
def Rel_Evaluation_Hugface_fn(prefile, ARG2_label='gene1s'):
    """Score relation labels on marker tokens, overall and per relation type.

    The file is read as blank-line-separated sentences with one token per
    line and tab-separated columns: token, gold label, predicted label.
    Only lines whose first column equals ``ARG2_label`` are scored; a label
    counts as a relation when it contains the substring ``ARG2``.

    Scoring per marker line:
      * gold is a relation and prediction is identical   -> TP
      * both are relations but differ -> FN for the gold label,
        FP for the predicted label
      * only gold is a relation                          -> FN
      * only the prediction is a relation                -> FP

    Args:
        prefile: Path to the UTF-8 encoded prediction file.
        ARG2_label: Token text that marks a candidate second argument.

    Returns:
        A tuple ``([P, R, F1], rel_metrics)``.  ``rel_metrics`` maps each
        relation label to ``[p, r, f1]``.  All values are rounded to four
        decimals.  The same figures are printed to stdout.
    """
    # Context manager so the handle is released even if reading fails.
    with open(prefile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    TP = 0
    FP = 0
    FN = 0
    result_dict = {}  # relation label -> [TP, FP, FN]
    for sentence in all_in:
        for token in sentence.split('\n'):
            seg = token.split('\t')
            if seg[0] != ARG2_label:
                continue
            gold, pred = seg[1], seg[2]
            gold_is_rel = 'ARG2' in gold
            pred_is_rel = 'ARG2' in pred
            if gold_is_rel and pred == gold:
                result_dict.setdefault(gold, [0, 0, 0])[0] += 1
                TP += 1
            elif gold_is_rel and pred_is_rel:
                # Wrong relation type: the gold label is missed and the
                # predicted label is spurious.
                result_dict.setdefault(gold, [0, 0, 0])[2] += 1
                result_dict.setdefault(pred, [0, 0, 0])[1] += 1
                FP += 1
                FN += 1
            elif gold_is_rel:
                result_dict.setdefault(gold, [0, 0, 0])[2] += 1
                FN += 1
            elif pred_is_rel:
                result_dict.setdefault(pred, [0, 0, 0])[1] += 1
                FP += 1

    # Per-label metrics; a zero denominator yields 0 rather than an error.
    rel_metrics = {}
    for rel_type, (tp, fp, fn) in result_dict.items():
        p = tp / (tp + fp) if tp + fp else 0
        r = tp / (tp + fn) if tp + fn else 0
        f1 = 2 * p * r / (p + r) if p + r else 0
        rel_metrics[rel_type] = [round(p, 4), round(r, 4), round(f1, 4)]

    # Micro-averaged metrics over all marker lines.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    P = round(P, 4)
    R = round(R, 4)
    F1 = round(F1, 4)

    print('mertics:\n', rel_metrics)
    print('\nTP,FP,FN:', TP, FP, FN)
    print('Overall P,R,F1:', P, R, F1)
    return [P, R, F1], rel_metrics
|
|
|
|
def Rel_Evaluation_AIO_fn(prefile):
    """Score relation labels carried by ``<GENE>`` marker tokens.

    The file is read as blank-line-separated sentences with one token per
    line and tab-separated columns: token, gold label, predicted label.
    Only lines whose first column is ``<GENE>`` are scored; a label counts
    as a relation when it contains the substring ``ARG2-``.

    Scoring per marker line:
      * gold is a relation and prediction is identical  -> TP
      * both are relations but differ                   -> FP and FN
      * only gold is a relation                         -> FN
      * only the prediction is a relation               -> FP

    Args:
        prefile: Path to the UTF-8 encoded prediction file.

    Returns:
        ``[P, R, F1]`` rounded to four decimals.  The counts and metrics are
        also printed to stdout.
    """
    # Context manager so the handle is released even if reading fails.
    with open(prefile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    TP = 0
    FP = 0
    FN = 0
    for sentence in all_in:
        for token in sentence.split('\n'):
            seg = token.split('\t')
            if seg[0] != '<GENE>':
                continue
            gold, pred = seg[1], seg[2]
            pred_is_rel = 'ARG2-' in pred
            if 'ARG2-' in gold:
                if pred == gold:
                    TP += 1
                elif pred_is_rel:
                    # Wrong relation type: spurious prediction plus a miss.
                    FP += 1
                    FN += 1
                else:
                    FN += 1
            elif pred_is_rel:
                FP += 1

    # A zero denominator yields 0 instead of raising ZeroDivisionError.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    P = round(P, 4)
    R = round(R, 4)
    F1 = round(F1, 4)

    print('TP,FP,FN:', TP, FP, FN)
    print('P,R,F1:', P, R, F1)
    return [P, R, F1]
|
|
|
|
def Rel_Evaluation_AIO_GC_fn(prefile):
    """Score relation labels carried by ``<CHEMICAL>`` marker tokens.

    The file is read as blank-line-separated sentences with one token per
    line and tab-separated columns: token, gold label, predicted label.
    Only lines whose first column is ``<CHEMICAL>`` are scored; a label
    counts as a relation when it contains the substring ``ARG2-``.

    Scoring per marker line:
      * gold is a relation and prediction is identical  -> TP
      * both are relations but differ                   -> FP and FN
      * only gold is a relation                         -> FN
      * only the prediction is a relation               -> FP

    Args:
        prefile: Path to the UTF-8 encoded prediction file.

    Returns:
        ``[P, R, F1]`` rounded to four decimals.  The counts and metrics are
        also printed to stdout.
    """
    # Context manager so the handle is released even if reading fails.
    with open(prefile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    TP = 0
    FP = 0
    FN = 0
    for sentence in all_in:
        for token in sentence.split('\n'):
            seg = token.split('\t')
            if seg[0] != '<CHEMICAL>':
                continue
            gold, pred = seg[1], seg[2]
            pred_is_rel = 'ARG2-' in pred
            if 'ARG2-' in gold:
                if pred == gold:
                    TP += 1
                elif pred_is_rel:
                    # Wrong relation type: spurious prediction plus a miss.
                    FP += 1
                    FN += 1
                else:
                    FN += 1
            elif pred_is_rel:
                FP += 1

    # A zero denominator yields 0 instead of raising ZeroDivisionError.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0
    P = round(P, 4)
    R = round(R, 4)
    F1 = round(F1, 4)

    print('TP,FP,FN:', TP, FP, FN)
    print('P,R,F1:', P, R, F1)
    return [P, R, F1]
|
|
|
|
def office_evaluation(goldfile, prefile):
    """Compare predicted relation lines with gold relation lines and print scores.

    Both files are read as UTF-8 text with one relation per line and
    tab-separated columns; the second column is used as the relation type.
    A prediction is correct when the identical line occurs in the gold file
    under the same relation type.  Duplicate lines within a file count once.

    Printed to stdout: the raw ``{type: [TP, FP, FN]}`` table, per-type
    precision/recall/F1, the macro-averaged F1 (``ave_f1``), and the
    micro-averaged counts and precision/recall/F1.

    Lines with fewer than two columns (for example the single empty line
    produced by an empty prediction file) are skipped.  Predicted relation
    types that never occur in the gold file are scored as false positives.

    Args:
        goldfile: Path to the gold relations file.
        prefile: Path to the predicted relations file.

    Returns:
        None.
    """
    # Context managers so the handles are released even if reading fails.
    with open(goldfile, 'r', encoding='utf-8') as fin_gold:
        all_gold = fin_gold.read().strip().split('\n')
    with open(prefile, 'r', encoding='utf-8') as fin_pre:
        all_pre = fin_pre.read().strip().split('\n')

    gold_result = {}  # relation type -> set of gold lines
    pre_result = {}   # relation type -> set of predicted lines
    all_result = {}   # relation type -> [TP, FP, FN]

    for line in all_gold:
        seg = line.split('\t')
        if len(seg) < 2:
            continue  # blank or malformed line: no relation type to read
        all_result.setdefault(seg[1], [0, 0, 0])
        gold_result.setdefault(seg[1], set()).add(line)

    for line in all_pre:
        seg = line.split('\t')
        if len(seg) < 2:
            continue  # blank or malformed line: no relation type to read
        # Registered after the gold types so gold types keep their position
        # in the report; prediction-only types follow and count as FP.
        all_result.setdefault(seg[1], [0, 0, 0])
        pre_result.setdefault(seg[1], set()).add(line)

    for rel_type, counts in all_result.items():
        gold_set = gold_result.get(rel_type, set())
        pre_set = pre_result.get(rel_type, set())
        counts[0] = len(gold_set & pre_set)  # TP
        counts[1] = len(pre_set - gold_set)  # FP
        counts[2] = len(gold_set - pre_set)  # FN

    ave_f = 0
    TP, FP, FN = 0, 0, 0
    print(all_result)
    for rel_type, (tp, fp, fn) in all_result.items():
        TP += tp
        FP += fp
        FN += fn
        tem_p = tp / (tp + fp) if tp + fp else 0
        tem_r = tp / (tp + fn) if tp + fn else 0
        tem_f = 2 * tem_p * tem_r / (tem_p + tem_r) if tem_p + tem_r else 0
        # Each type contributes exactly once to the macro average.
        ave_f += tem_f
        print('%s:p=%.4f,r=%.4f,f=%.4f' % (rel_type, tem_p, tem_r, tem_f))

    # A zero denominator yields 0 instead of raising ZeroDivisionError.
    P = TP / (TP + FP) if TP + FP else 0
    R = TP / (TP + FN) if TP + FN else 0
    F1 = 2 * P * R / (P + R) if P + R else 0

    print('Overall:')
    print('ave_f1:', ave_f / len(all_result) if all_result else 0)
    print('TP=%d, FP=%d, FN=%d' % (TP, FP, FN))
    print('P=%.4f, R=%.4f, F1=%.4f' % (P, R, F1))
|
|
|
|
|
|
if __name__ == '__main__':
    # Score the relation predictions for the dev set against the gold file.
    path = '//panfs/pan1/bionlplab/luol2/BC7DrugProt/results/'
    office_evaluation(
        path + 'dev/dev_gold_relations.tsv',
        path + 'drugprot_dev_LSTM-CRF-ES_pre.tsv',
    )

    print('............')

    # NOTE(review): Rel_Evaluation_check is not defined in the visible part of
    # this file -- confirm it exists elsewhere, or point this call at the
    # intended evaluation function.
    Rel_Evaluation_check(
        '//panfs/pan1/bionlplab/luol2/BC7DrugProt/check/dev_pre_temp.conll'
    )
|
|
|