GNorm2-docker / src_python /GeneNER /Evaluation_ner.py

Upload folder using huggingface_hub

d5062c8 verified over 1 year ago

9.17 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Mon Mar 1 15:33:54 2021

	@author: luol2
	"""
	# from BIO format to entity
	def BIO_tag(tokens):
	    """Turn one sentence of BIO-tagged lines into gold and predicted entity spans.

	    Args:
	        tokens: list of lines for a single sentence. Each line has at least
	            three tab-separated columns: token, gold tag, predicted tag.
	            Tags are ``B-<type>``, ``I-<type>`` or ``O``.

	    Returns:
	        ``(gold_entity, pre_entity)``: two dicts mapping an entity type to a
	        list of ``[start, end]`` token indices (both inclusive), in the
	        order the entities occur in the sentence.

	    Raises:
	        IndexError: if a line has fewer than three tab-separated columns.
	    """
	    columns = [line.split('\t') for line in tokens]
	    gold_entity = _bio_spans([cols[1] for cols in columns])
	    pre_entity = _bio_spans([cols[2] for cols in columns])
	    return gold_entity, pre_entity


	def _bio_spans(tags):
	    """Group a sequence of BIO tags into ``{type: [[start, end], ...]}``."""
	    spans = {}
	    start, label = None, None
	    last = len(tags) - 1
	    for i, tag in enumerate(tags):
	        if tag.startswith('B-'):
	            start, label = i, tag[2:]
	        elif tag.startswith('I-'):
	            # An I- tag with nothing to continue (sentence start, right
	            # after an O, or no entity opened yet) opens a new entity
	            # itself. Otherwise it extends the open entity, which keeps
	            # the type it was opened with.
	            if start is None or tags[i - 1] == 'O':
	                start, label = i, tag[2:]
	        else:
	            continue  # 'O' (or an unrecognised tag) is outside any entity

	        # The entity ends here when this is the last token or the next tag
	        # does not continue it.
	        if i == last or tags[i + 1] == 'O' or tags[i + 1].startswith('B-'):
	            spans.setdefault(label, []).append([start, i])
	    return spans

	# Input format: one token per line as "token \t gold \t prediction"; sentences are separated by a blank line.
	def NER_Evaluation(file=None):
	    """Print entity-level precision, recall and F1 for a CoNLL prediction file.

	    The file holds one token per line as tab-separated token, gold tag and
	    predicted tag; sentences are separated by a blank line. A predicted
	    entity counts as a true positive only if a gold entity of the same type
	    has exactly the same start and end token.

	    Args:
	        file: path of the file to score. Defaults to ``dev_pre.conll_all``
	            under the hard-coded BiLSTM-CRF model directory.

	    Returns:
	        None. The raw counts, per-type scores and overall scores are printed.
	    """
	    def ratio(numerator, denominator):
	        # An undefined ratio (nothing predicted / nothing annotated) scores 0.
	        return numerator/denominator if denominator else 0

	    def f_score(precision, recall):
	        total = precision+recall
	        return 2*precision*recall/total if total else 0

	    if file is None:
	        path = '//panfs/pan1/bionlp/lulab/luoling/OpenBioIE_project/models/Kfold/BiLSTM-CRF/'
	        file = path+'dev_pre.conll_all'
	    with open(file, 'r', encoding='utf-8') as fin:
	        all_sentence = fin.read().strip().split('\n\n')

	    Metrics = {}  # {'entity_type': [TP, gold_num, pre_num]}
	    for sentence in all_sentence:
	        # Drop blank lines so an empty file or extra separators do not
	        # reach BIO_tag as column-less tokens.
	        tokens = [line for line in sentence.split('\n') if line.strip()]
	        if not tokens:
	            continue
	        gold_entity, pre_entity = BIO_tag(tokens)
	        for entity_type, mentions in gold_entity.items():
	            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
	        for entity_type, mentions in pre_entity.items():
	            counts = Metrics.setdefault(entity_type, [0, 0, 0])
	            counts[2] += len(mentions)
	            gold_mentions = gold_entity.get(entity_type, [])
	            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
	    print(Metrics)

	    TP, Gold_num, Pre_num = 0, 0, 0
	    for ele, (tp, gold_num, pre_num) in Metrics.items():
	        p = ratio(tp, pre_num)
	        r = ratio(tp, gold_num)
	        f1 = f_score(p, r)
	        TP += tp
	        Gold_num += gold_num
	        Pre_num += pre_num
	        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

	    P = ratio(TP, Pre_num)
	    R = ratio(TP, Gold_num)
	    F1 = f_score(P, R)
	    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))

	def NER_Evaluation_fn(file, max_sentences=5000):
	    """Score a CoNLL prediction file and return the overall entity-level F1.

	    The file holds one token per line as tab-separated token, gold tag and
	    predicted tag; sentences are separated by a blank line. A predicted
	    entity counts as a true positive only if a gold entity of the same type
	    has exactly the same start and end token. Raw counts, per-type scores
	    and overall scores are also printed.

	    Args:
	        file: path of the file to score.
	        max_sentences: only the first ``max_sentences`` sentences are
	            scored (default 5000). Pass ``None`` to score the whole file.
	            NOTE(review): the 5000 default silently truncates larger
	            files; confirm callers rely on it.

	    Returns:
	        The overall F1 over all entity types, or 0 when nothing matched.
	    """
	    def ratio(numerator, denominator):
	        # An undefined ratio (nothing predicted / nothing annotated) scores 0.
	        return numerator/denominator if denominator else 0

	    def f_score(precision, recall):
	        total = precision+recall
	        return 2*precision*recall/total if total else 0

	    with open(file, 'r', encoding='utf-8') as fin:
	        all_sentence = fin.read().strip().split('\n\n')

	    Metrics = {}  # {'entity_type': [TP, gold_num, pre_num]}
	    for sentence in all_sentence[:max_sentences]:
	        # Drop blank lines so an empty file or extra separators do not
	        # reach BIO_tag as column-less tokens.
	        tokens = [line for line in sentence.split('\n') if line.strip()]
	        if not tokens:
	            continue
	        gold_entity, pre_entity = BIO_tag(tokens)
	        for entity_type, mentions in gold_entity.items():
	            Metrics.setdefault(entity_type, [0, 0, 0])[1] += len(mentions)
	        for entity_type, mentions in pre_entity.items():
	            counts = Metrics.setdefault(entity_type, [0, 0, 0])
	            counts[2] += len(mentions)
	            gold_mentions = gold_entity.get(entity_type, [])
	            counts[0] += sum(1 for mention in mentions if mention in gold_mentions)
	    print(Metrics)

	    TP, Gold_num, Pre_num = 0, 0, 0
	    for ele, (tp, gold_num, pre_num) in Metrics.items():
	        p = ratio(tp, pre_num)
	        r = ratio(tp, gold_num)
	        f1 = f_score(p, r)
	        TP += tp
	        Gold_num += gold_num
	        Pre_num += pre_num
	        print(ele+': P=%.5f, R=%.5f, F1=%.5f' % (p,r,f1))

	    P = ratio(TP, Pre_num)
	    R = ratio(TP, Gold_num)
	    F1 = f_score(P, R)
	    print("Overall: P=%.5f, R=%.5f, F1=%.5f"% (P,R,F1))
	    return F1

	# Script entry point: score the default dev-set prediction file.
	if __name__ == '__main__':
	    NER_Evaluation()