|
|
|
|
|
import sys
|
|
import io
|
|
import stanza
|
|
|
|
# Shared stanza pipeline used by ssplit_token(); built once at import time.
# The 'tokenize': 'spacy' processor setting selects stanza's spaCy-backed
# tokenizer (see the stanza tokenization documentation).
# NOTE(review): package is the *string* 'None', not the None singleton —
# confirm this is what the installed stanza version expects.
nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'},package='None')

# Entity types that fill the two argument roles: generate_seq_input() builds
# one instance per 'arg1' identifier and labels the matching 'arg2' mentions.
REL_ENT={'arg1':'Species',
         'arg2':'Gene'}

# [start marker, end marker] tokens inserted around entity mentions.
# 'arg1'/'arg2' mark the entities taking part in the current instance;
# 'gene'/'species' mark every other mention of that (lower-cased) type.
ENTITY_TAG={'arg1':['arg1s','arg1e'],
            'arg2':['arg2s','arg2e'],
            'gene':['gene1s','gene1e'],
            'species':['species1s','species1e']
            }
|
|
|
|
|
|
def ssplit_token(infile):
    """Tokenize every document in *infile* and remap entity offsets.

    The file is read as blank-line-separated documents.  Within a document
    the first line carries the title after a ``|t|`` separator, the second
    line the abstract after ``|a|``, and each further line is a
    tab-separated annotation: fields 1 and 2 are character offsets (start,
    exclusive end) into ``title + ' ' + abstract``, field 4 is the entity
    type and field 5 an identifier string.

    Args:
        infile: path of the UTF-8 encoded input file.

    Returns:
        str: for each document, the tokens joined by single spaces (with a
        trailing space) on one line, then one tab-separated line per
        annotation (field 0, new start, new end, the matching slice of the
        tokenized text, type, field 5), then an empty line.  Any type other
        than ``'Species'`` or ``'Gene'`` is written as ``'Gene'``.

    Raises:
        SystemExit: with status 1, after printing diagnostics, when the
        original text cannot be aligned character by character with the
        tokenized text.
    """
    with open(infile, 'r', encoding='utf-8') as fin:
        all_in = fin.read().strip().split('\n\n')

    # Characters skipped during alignment: ASCII space, no-break space
    # (U+00A0), thin space (U+2009), en space (U+2002), four-per-em space
    # (U+2005) and hair space (U+200A).
    space_chars = {' ', chr(160), chr(8201), chr(8194), chr(8197), chr(8202)}

    out_parts = []
    for doc_text in all_in:
        lines = doc_text.split('\n')
        ori_text = lines[0].split('|t|')[1] + ' ' + lines[1].split('|a|')[1]
        entity_all = [line.split('\t') for line in lines[2:]]

        # Sentence-split and tokenize; every token is followed by one space.
        doc_stanza = nlp(ori_text)
        token_text = ''.join(
            word.text + ' '
            for sent in doc_stanza.sentences
            for word in sent.words
            if word.text != ' '
        )

        # index_map[i] is the position in token_text of ori_text[i];
        # whitespace characters keep the placeholder -1.
        index_map = [-1] * len(ori_text)
        j = 0
        for i, ch in enumerate(ori_text):
            if ch in space_chars:
                continue
            probe = j
            try:
                if ch != token_text[j]:
                    # Skip the separator the tokenizer inserted, then scan
                    # forward to the next occurrence of this character.
                    j += 1
                    probe = j
                    while ch != token_text[j]:
                        j += 1
            except IndexError:
                # Ran off the end of token_text: the texts cannot be aligned.
                # Slices (not indexing) keep the diagnostics themselves safe.
                print('doc', doc_text)
                print('token_text:', token_text)
                print('error:', ori_text[max(0, i - 10):i + 10], 'i:', ch,
                      'j:', token_text[probe:probe + 1], ',',
                      token_text[max(0, probe - 10):probe + 10])
                print(ord(ch), ord(' '))
                sys.exit(1)
            index_map[i] = j
            j += 1

        out_parts.append(token_text + '\n')
        for ele in entity_all:
            start = int(ele[1])
            end = int(ele[2])
            # If a boundary falls on whitespace, move one character inwards.
            new_ents = index_map[start]
            if new_ents == -1:
                new_ents = index_map[start + 1]
            last = index_map[end - 1]
            if last == -1:
                last = index_map[end - 2]
            new_ente = last + 1
            new_ent = token_text[new_ents:new_ente]
            # Every type other than Species/Gene is written as Gene.
            ent_type = ele[4] if ele[4] in ('Species', 'Gene') else 'Gene'
            out_parts.append(
                ele[0] + '\t' + str(new_ents) + '\t' + str(new_ente) + '\t'
                + new_ent + '\t' + ent_type + '\t' + ele[5] + '\n'
            )
        out_parts.append('\n')
    return ''.join(out_parts)
|
|
|
|
|
|
def corpus_noNest(token_input):
    """Remove nested/overlapping entities and normalise gene identifiers.

    Args:
        token_input: text in the layout returned by ``ssplit_token``:
            blank-line-separated documents whose first line is the tokenized
            text and whose remaining lines are tab-separated annotations
            (field 1 = start, field 2 = end, field 4 = type,
            field 5 = identifier string).

    Returns:
        str: the same layout, where
        * annotations are ordered by ``[start, end]`` (identical annotation
          lines collapse to one because they are keyed by the raw line);
        * each run of annotations whose start is not greater than the
          largest end seen so far is reduced by ``find_max_entity``;
        * for ``'Gene'`` annotations the last field is replaced by the
          comma-joined ``Species:<n>`` parts of its comma-separated
          identifiers, de-duplicated and sorted by ``<n>``.
    """
    documents = token_input.strip().split('\n\n')
    out_parts = []

    for doc in documents:
        lines = doc.split('\n')
        context = lines[0]
        entity_list = []

        if len(lines) > 1:
            spans = {}
            for line in lines[1:]:
                segs = line.split('\t')
                spans[line] = [int(segs[1]), int(segs[2])]
            ordered = [
                line.split('\t')
                for line, _ in sorted(spans.items(), key=lambda kv: kv[1])
            ]

            def flush(group):
                # A lone entity is kept as is; an overlapping group is
                # reduced by find_max_entity.
                if len(group) == 1:
                    entity_list.append(group[0])
                else:
                    entity_list.extend(find_max_entity(group, context))

            nest_list = [ordered[0]]
            max_eid = int(ordered[0][2])
            for segs in ordered[1:]:
                if int(segs[1]) > max_eid:
                    # Starts after everything seen so far: close the group.
                    flush(nest_list)
                    nest_list = [segs]
                else:
                    nest_list.append(segs)
                if int(segs[2]) > max_eid:
                    max_eid = int(segs[2])
            flush(nest_list)

        out_parts.append(context + '\n')
        for ele in entity_list:
            if ele[4] == 'Gene':
                species_of = {}
                for gene_id in ele[5].split(','):
                    # Keep from 'Species:' up to, but excluding, the last
                    # character of the identifier.
                    temp_id = gene_id[gene_id.find('Species:'):-1]
                    species_of[temp_id] = int(temp_id[len('Species:'):])
                final_gene_id = ','.join(
                    key for key, _ in sorted(species_of.items(),
                                             key=lambda kv: kv[1])
                )
                out_parts.append('\t'.join(ele[:-1]) + '\t' + final_gene_id + '\n')
            else:
                out_parts.append('\t'.join(ele) + '\n')
        out_parts.append('\n')

    return ''.join(out_parts)
|
|
|
|
def find_max_entity(nest_list,text):
    """Reduce a group of overlapping entities.

    Every entity whose type field (index 4) is ``'Species'`` is kept, in
    input order.  Of the remaining entities only the longest one
    (``end - start`` from fields 2 and 1, first one wins on ties) is kept
    and is placed after the Species entries.

    Args:
        nest_list: list of annotation field lists.
        text: unused; kept for call compatibility.

    Returns:
        list: the retained entities.  A group consisting only of Species
        entries is returned without duplicating its first element, and an
        empty group yields an empty list.
    """
    final_tem = []
    longest = None
    max_len = 0
    for entity in nest_list:
        if entity[4] == 'Species':
            final_tem.append(entity)
            continue
        cur_len = int(entity[2]) - int(entity[1])
        if longest is None or cur_len > max_len:
            longest = entity
            max_len = cur_len
    # Only add a non-Species winner if one exists; otherwise nothing is
    # appended (previously nest_list[0] was appended a second time).
    if longest is not None:
        final_tem.append(longest)
    return final_tem
|
|
|
|
|
|
def _mark_entities(token_text, entity_all, arg1_spans, arg2_spans):
    """Return *token_text* with marker tokens inserted around entities.

    Entities are processed in list order; one that starts before the end of
    the previously marked entity is skipped.  Spans are compared as
    ``[start, end]`` string pairs.
    """
    pieces = []
    text_sid = 0
    for ent in entity_all:
        ent_sid = int(ent[1])
        if ent_sid < text_sid:
            continue  # overlaps an entity that is already marked
        span = [ent[1], ent[2]]
        if span in arg1_spans:
            tags = ENTITY_TAG['arg1']
        elif span in arg2_spans:
            tags = ENTITY_TAG['arg2']
        else:
            tags = ENTITY_TAG[ent[4].lower()]
        pieces.append(token_text[text_sid:ent_sid] + ' ' + tags[0] + ' '
                      + ent[3] + ' ' + tags[1] + ' ')
        text_sid = int(ent[2])
    pieces.append(token_text[text_sid:])
    return ''.join(pieces)


def _label_tokens(sen_tokens, rules):
    """Turn marked-up tokens into ``token<TAB>label`` rows.

    *rules* is an ordered list of
    ``(start_marker, end_marker, out_start, out_end, label)``.  A token that
    contains ``start_marker`` opens a span: ``out_start`` is emitted in its
    place, following tokens are emitted until one equals ``end_marker``,
    then ``out_end`` is emitted, all with ``label``.  Tokens outside any
    span get the label ``O``.
    """
    rows = []
    token_id = 0
    while token_id < len(sen_tokens):
        token = sen_tokens[token_id]
        for start, end, out_start, out_end, label in rules:
            if start in token:
                rows.append(out_start + '\t' + label)
                token_id += 1
                while sen_tokens[token_id] != end:
                    rows.append(sen_tokens[token_id] + '\t' + label)
                    token_id += 1
                rows.append(out_end + '\t' + label)
                break
        else:
            rows.append(token + '\tO')
        token_id += 1
    return rows


def generate_seq_input(nonest_input,outfile):
    """Write sequence-labelling instances built from de-nested annotations.

    Args:
        nonest_input: text in the layout returned by ``corpus_noNest``
            (blank-line-separated documents: tokenized text line followed by
            tab-separated annotation lines).
        outfile: path of the UTF-8 file to write.

    For every document one instance is produced per distinct identifier
    (last field) of the ``REL_ENT['arg1']`` entities.  In that instance the
    arg1 mentions with this identifier are wrapped in the ``arg1`` markers.
    ``REL_ENT['arg2']`` mentions whose comma-separated identifier list
    contains it are wrapped in the markers of the arg2 entity type and
    labelled ``ARG2`` (markers included).  All other mentions are wrapped in
    the markers of their own type, and every remaining token is labelled
    ``O``.  Instances are written one ``token<TAB>label`` per line,
    separated by blank lines, with a final newline.

    Documents without annotation lines produce no instance (previously they
    raised ``IndexError``).  The output file is opened only after all
    instances have been built.
    """
    all_in = nonest_input.strip().split('\n\n')

    # Labelling rules, in matching priority order.
    plain_rules = [
        (ENTITY_TAG[kind][0], ENTITY_TAG[kind][1],
         ENTITY_TAG[kind][0], ENTITY_TAG[kind][1], 'O')
        for kind in ('arg1', 'gene', 'species')
    ]
    # arg2 markers are rewritten to the markers of the arg2 entity type.
    arg2_out = ENTITY_TAG[REL_ENT['arg2'].lower()]
    arg2_rule = (ENTITY_TAG['arg2'][0], ENTITY_TAG['arg2'][1],
                 arg2_out[0], arg2_out[1], 'ARG2')
    rules_with_arg2 = [plain_rules[0], arg2_rule] + plain_rules[1:]

    final_input = []
    for doc in all_in:
        lines = doc.split('\n')
        token_text = lines[0]

        entity_arg1 = {}   # arg1 identifier -> list of [start, end]
        entity_arg2 = {}   # identifier listed on an arg2 -> list of [start, end]
        entity_all = []
        for line in lines[1:]:
            seg = line.split('\t')
            if seg[4] == REL_ENT['arg1']:
                entity_arg1.setdefault(seg[-1], []).append([seg[1], seg[2]])
            elif seg[4] == REL_ENT['arg2']:
                for gene_spe_id in seg[-1].split(','):
                    entity_arg2.setdefault(gene_spe_id, []).append([seg[1], seg[2]])
            entity_all.append(seg)

        for cur_ele, arg1_spans in entity_arg1.items():
            if cur_ele in entity_arg2:
                arg2_spans = entity_arg2[cur_ele]
                rules = rules_with_arg2
            else:
                # No related arg2 mention: no arg2 marking or labelling.
                arg2_spans = []
                rules = plain_rules
            ner_text = _mark_entities(token_text, entity_all, arg1_spans, arg2_spans)
            final_input.append('\n'.join(_label_tokens(ner_text.split(), rules)))

    with open(outfile, 'w', encoding='utf-8') as fout:
        fout.write('\n\n'.join(final_input))
        fout.write('\n')
|
|
|
|
def check_entity_pos(line,relations):
    """Check marker nesting and argument presence in a marked-up line.

    Args:
        line: text whose space-separated tokens may contain the start/end
            markers listed in ``ENTITY_TAG``.
        relations: mapping whose keys are compared with the text before the
            first ``|`` of each token that contains the arg2 start marker.

    Returns:
        tuple: ``(status, tokens, entity_num)`` where ``tokens`` is
        ``line.split(' ')``, ``entity_num`` counts the start markers seen
        per kind, and ``status`` is
        * ``-1`` when start and end markers are not balanced,
        * ``0`` when an arg1 is present but some key of *relations* was not
          seen as an arg2, or when arg2 occurs without any arg1,
        * ``1`` otherwise.

    Marker kinds missing from ``ENTITY_TAG`` (such as ``'chemical'``) are
    ignored instead of raising ``KeyError``, and an end marker without a
    matching start marker is reported as ``-1`` instead of raising
    ``IndexError``.
    """
    seg = line.split(' ')
    stack_ent = []
    entity_num = {'arg1':0,'arg2':0, 'gene':0,'chemical':0}
    temp_arg2 = []
    unbalanced = False

    def has_tag(token, kind, pos):
        # True when the marker for *kind* is configured and occurs in token.
        tags = ENTITY_TAG.get(kind)
        return tags is not None and token.find(tags[pos]) >= 0

    for token in seg:
        for kind in ('gene', 'chemical', 'arg1', 'arg2'):
            if has_tag(token, kind, 0):
                entity_num[kind] += 1
                if kind == 'arg2':
                    temp_arg2.append(token.split('|')[0])
                stack_ent.append(token)
                break
        else:
            if any(has_tag(token, kind, 1)
                   for kind in ('arg1', 'arg2', 'gene', 'chemical')):
                if stack_ent:
                    stack_ent.pop()
                else:
                    unbalanced = True  # end marker without a start marker

    if unbalanced or stack_ent:
        return (-1, seg, entity_num)

    if entity_num['arg1'] != 0:
        for arg2_id in relations.keys():
            if arg2_id not in temp_arg2:
                # not every related arg2 is present in this line
                return (0, seg, entity_num)
    if entity_num['arg2'] != 0 and entity_num['arg1'] == 0:
        # arg2 without any arg1
        return (0, seg, entity_num)
    return (1, seg, entity_num)
|
|
|
|
def check_entity_neg(line):
    """Check that arg1/gene/chemical markers in *line* are balanced.

    Args:
        line: text whose space-separated tokens may contain the start/end
            markers listed in ``ENTITY_TAG``.

    Returns:
        tuple: ``(status, tokens, entity_num)`` where ``tokens`` is
        ``line.split(' ')``, ``entity_num`` counts the start markers seen
        per kind, and ``status`` is ``1`` when every start marker has a
        matching end marker and ``-1`` otherwise.

    Marker kinds missing from ``ENTITY_TAG`` (such as ``'chemical'``) are
    ignored instead of raising ``KeyError``, and an end marker without a
    matching start marker is reported as ``-1`` instead of raising
    ``IndexError``.
    """
    seg = line.split(' ')
    stack_ent = []
    entity_num = {'arg1':0,'gene':0,'chemical':0}
    unbalanced = False

    def has_tag(token, kind, pos):
        # True when the marker for *kind* is configured and occurs in token.
        tags = ENTITY_TAG.get(kind)
        return tags is not None and token.find(tags[pos]) >= 0

    for token in seg:
        for kind in ('gene', 'chemical', 'arg1'):
            if has_tag(token, kind, 0):
                entity_num[kind] += 1
                stack_ent.append(token)
                break
        else:
            if any(has_tag(token, kind, 1)
                   for kind in ('arg1', 'gene', 'chemical')):
                if stack_ent:
                    stack_ent.pop()
                else:
                    unbalanced = True  # end marker without a start marker

    if unbalanced or stack_ent:
        return (-1, seg, entity_num)
    return (1, seg, entity_num)
|
|
|
|
def get_one_entity(nest_list,cur_ent,rel_entity2_id):
    """Pick the entities to keep from a group of overlapping entities.

    Each entity is a sequence with an identifier at index 1, a type at
    index 2 and start/end offsets at indexes 3 and 4.

    Args:
        nest_list: the overlapping entities.
        cur_ent: identifier of the current entity.
        rel_entity2_id: container of identifiers of related arg2 entities.

    Returns:
        list:
        * ``[entity]`` as soon as an entity whose identifier equals
          *cur_ent* is reached;
        * otherwise every entity whose identifier is in *rel_entity2_id*,
          if there is any;
        * otherwise a single preferred entity.  The first entity is the
          initial candidate.  A longer entity replaces it when the
          candidate has type ``REL_ENT['arg1']`` or the newcomer has type
          ``REL_ENT['arg2']``.  An entity that is not longer replaces it
          only when the candidate is of the arg1 type and the newcomer of
          the arg2 type.  For an empty *nest_list* this is ``[[]]``.
    """
    max_len = 0
    max_entity = []
    final_entity = []
    for entity in nest_list:
        if entity[1] == cur_ent:
            # The current entity always wins outright.
            return [entity]
        if entity[1] in rel_entity2_id:
            final_entity.append(entity)
            continue

        # From here on neither `entity` nor `max_entity` is in
        # rel_entity2_id, so the former membership re-checks were dead code.
        length = int(entity[4]) - int(entity[3])
        if not max_entity:
            replace = True
        elif length > max_len:
            replace = (max_entity[2] == REL_ENT['arg1']
                       or entity[2] == REL_ENT['arg2'])
        else:
            replace = (max_entity[2] == REL_ENT['arg1']
                       and entity[2] == REL_ENT['arg2'])
        if replace:
            max_len = length
            max_entity = entity

    if not final_entity:
        final_entity.append(max_entity)
    return final_entity
|
|
|
|
if __name__=='__main__':

    # Default corpus locations; either may be overridden on the command line:
    #   python <this script> [infile [outfile]]
    infile = sys.argv[1] if len(sys.argv) > 1 else '../../TrainingSet/No505/SA.Train.txt'
    outfile = sys.argv[2] if len(sys.argv) > 2 else '../../TrainingSet/No505/SA.Train.conll'

    # 1) tokenize the documents and remap entity offsets
    token_input = ssplit_token(infile)

    # 2) drop nested/overlapping entities and normalise gene identifiers
    nonest_input = corpus_noNest(token_input)

    # 3) write the token/label sequences
    generate_seq_input(nonest_input, outfile)