|
#include "AbbrvE.h" |
|
#include <sstream> |
|
|
|
namespace iret { |
|
|
|
Find_Seq::Find_Seq( void ) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ |
|
seq_i.push_back("i"); |
|
seq_i.push_back("ii"); |
|
seq_i.push_back("iii"); |
|
seq_i.push_back("iv"); |
|
seq_i.push_back("v"); |
|
seq_i.push_back("vi"); |
|
|
|
seq_I.push_back("I"); |
|
seq_I.push_back("II"); |
|
seq_I.push_back("III"); |
|
seq_I.push_back("IV"); |
|
seq_I.push_back("V"); |
|
seq_I.push_back("VI"); |
|
|
|
seq_a.push_back("a"); |
|
seq_a.push_back("b"); |
|
seq_a.push_back("c"); |
|
seq_a.push_back("d"); |
|
seq_a.push_back("e"); |
|
seq_a.push_back("f"); |
|
|
|
seq_A.push_back("A"); |
|
seq_A.push_back("B"); |
|
seq_A.push_back("C"); |
|
seq_A.push_back("D"); |
|
seq_A.push_back("E"); |
|
seq_A.push_back("F"); |
|
|
|
} |
|
|
|
void |
|
Find_Seq::flag_seq( int numa, char* abbs[] ) { |
|
|
|
my_numa = numa; |
|
my_abbs = abbs; |
|
|
|
my_rate.resize(numa); |
|
for ( int i = 0; i < numa; ++i ) |
|
my_rate[i] = true; |
|
|
|
find_seq(seq_i); |
|
find_seq(seq_I); |
|
find_seq(seq_a); |
|
find_seq(seq_A); |
|
|
|
create_seq(); |
|
} |
|
|
|
|
|
void |
|
Find_Seq::find_seq( const vector<string> & seq ) { |
|
|
|
for ( int i_abbr = 0; i_abbr < my_numa-1; ++i_abbr ) { |
|
|
|
|
|
if ( seq[0] == my_abbs[i_abbr] ) { |
|
int i_seq = 1; |
|
while ( i_seq < seq.size() and |
|
i_seq + i_abbr < my_numa and |
|
seq[i_seq] == my_abbs[i_abbr + i_seq ] ) |
|
++i_seq; |
|
|
|
if ( i_seq > 1 ) |
|
for ( int i = 0; i < i_seq; ++i ) |
|
my_rate[i_abbr+i] = false; |
|
} |
|
|
|
} |
|
} |
|
|
|
void |
|
Find_Seq::create_seq( void ) { |
|
|
|
for ( int i_abbr = 0; i_abbr < my_numa; ++i_abbr ) { |
|
|
|
size_t len = std::strlen( my_abbs[i_abbr] ); |
|
|
|
if ( my_abbs[i_abbr][len-1] == '1' ) { |
|
|
|
|
|
string prefix( my_abbs[i_abbr], len-1 ); |
|
size_t seq_len = my_numa - i_abbr; |
|
vector<string> seq; |
|
|
|
for ( int i= 1; i <= seq_len; ++i ) { |
|
std::ostringstream stream(prefix,std::ios::app); |
|
stream << i; |
|
seq.push_back( stream.str() ); |
|
} |
|
|
|
|
|
find_seq(seq); |
|
} |
|
} |
|
} |
|
|
|
|
|
AbbrvE::AbbrvE(long ta,long wrd_spc){ |
|
tta=ta; |
|
word_space=wrd_spc; |
|
abbl=new char*[tta]; |
|
abbs=new char*[tta]; |
|
nt=new int[tta]; |
|
lst=new char*[word_space]; |
|
numa=num=0; |
|
pMt=new MPtok; |
|
setup_Test(); |
|
} |
|
|
|
AbbrvE::~AbbrvE(){ |
|
if(numa)cleara(); |
|
clear(); |
|
delete [] abbl; |
|
delete [] abbs; |
|
delete [] nt; |
|
delete [] lst; |
|
delete pMt; |
|
} |
|
|
|
void AbbrvE::Extract(char*pch){ |
|
long i,j,k,u,flag; |
|
int ix; |
|
|
|
if ( strlen(pch) <= 0 ) |
|
return; |
|
|
|
token(pch); |
|
|
|
i=j=k=0; |
|
flag=0; |
|
while(i<num){ |
|
if(!strcmp("(",lst[i])){ |
|
if(flag)k=j+1; |
|
if((i>k)&&(strcmp(")",lst[i-1]))){ |
|
j=i; |
|
flag=1; |
|
} |
|
} |
|
if(!strcmp(")",lst[i])){ |
|
if(!flag){j=k=i+1;} |
|
else { |
|
if(((j>k)&&(i<j+12))&&(i>j+1)){ |
|
if(k<j-10)k=j-10; |
|
strcpy(cnam,lst[k]); |
|
for(u=k+1;u<j;u++){ |
|
strcat(cnam," "); |
|
strcat(cnam,lst[u]); |
|
} |
|
ix=strlen(cnam); |
|
abbl[numa]=new char[ix+1]; |
|
strcpy(abbl[numa],cnam); |
|
|
|
strcpy(cnam,lst[j+1]); |
|
for(u=j+2;u<i;u++){ |
|
strcat(cnam," "); |
|
strcat(cnam,lst[u]); |
|
} |
|
nt[numa]=i-j-1; |
|
ix=strlen(cnam); |
|
abbs[numa]=new char[ix+1]; |
|
strcpy(abbs[numa],cnam); |
|
if(Test(abbs[numa]))numa++; |
|
else{ |
|
|
|
delete [] abbs[numa]; |
|
delete [] abbl[numa]; |
|
} |
|
flag=0; |
|
} |
|
else { |
|
flag=0; |
|
k=i+1; |
|
} |
|
} |
|
} |
|
i++; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
void AbbrvE::Extract2(const char*pch){ |
|
long i,j,k,u,ii,jj,kk,flag; |
|
int ix; |
|
char openCh[2], closeCh[2]; |
|
|
|
token2(pch); |
|
|
|
for(jj=0; jj<2; jj++) { |
|
i=j=k=0; |
|
flag=0; |
|
|
|
if(jj==0) { strcpy(openCh,"("); strcpy(closeCh,")"); } |
|
else if(jj==1) { strcpy(openCh,"["); strcpy(closeCh,"]"); } |
|
|
|
while(i<num){ |
|
if(!strcmp(openCh,lst[i])){ |
|
if(flag)k=j+1; |
|
if((i>k)&&(strcmp(closeCh,lst[i-1]))){ |
|
j=i; |
|
flag=1; |
|
} |
|
} |
|
if(!strcmp(closeCh,lst[i])){ |
|
if(!flag){j=k=i+1;} |
|
else { |
|
if(((j>k)&&(i<j+12))&&(i>j+1)){ |
|
if(k<j-10)k=j-10; |
|
strcpy(cnam,lst[k]); |
|
for(u=k+1;u<j;u++){ |
|
strcat(cnam," "); |
|
strcat(cnam,lst[u]); |
|
} |
|
ix=strlen(cnam); |
|
abbl[numa]=new char[ix+1]; |
|
strcpy(abbl[numa],cnam); |
|
|
|
strcpy(cnam,lst[j+1]); |
|
for(u=j+2;u<i;u++){ |
|
strcat(cnam," "); |
|
strcat(cnam,lst[u]); |
|
} |
|
nt[numa]=i-j-1; |
|
ix=strlen(cnam); |
|
|
|
|
|
ii=0; |
|
while(ii<ix) { |
|
if( ((cnam[ii]==';')&&(cnam[ii+1]==' ')) || |
|
((cnam[ii]==',')&&(cnam[ii+1]==' ')) ) { |
|
ix=ii+1; |
|
cnam[ii]='\0'; |
|
break; |
|
} |
|
ii++; |
|
} |
|
|
|
|
|
abbs[numa]=new char[ix+1]; |
|
strcpy(abbs[numa],cnam); |
|
if(Test(abbs[numa]))numa++; |
|
else{ |
|
|
|
delete [] abbs[numa]; |
|
delete [] abbl[numa]; |
|
} |
|
flag=0; |
|
} |
|
else { |
|
flag=0; |
|
k=i+1; |
|
} |
|
} |
|
} |
|
i++; |
|
} |
|
} |
|
} |
|
|
|
|
|
void AbbrvE::token(const char *pch){ |
|
long i=1,j=0,k=0; |
|
long u=1,flag=0; |
|
char c,*str=cnam; |
|
int size=strlen(pch); |
|
if(size>cnam_size) { |
|
cerr<<"Scratch space "<<cnam_size<<", needed "<<size<<endl; |
|
exit(1); |
|
} |
|
clear(); |
|
cnam[0]=pch[0]; |
|
while(c=pch[i]){ |
|
switch(c){ |
|
case '(': if(isblank(str[u-1])){ |
|
str[u++]=pch[i++]; |
|
if(!isblank(pch[i])){ |
|
str[u++]=' '; |
|
} |
|
flag=1; |
|
} |
|
else str[u++]=pch[i++]; |
|
break; |
|
case ')': if(flag){ |
|
if(!isblank(str[u-1])){ |
|
str[u++]=' '; |
|
str[u++]=pch[i++]; |
|
} |
|
if(!isblank(pch[i]))str[u++]=' '; |
|
flag=0; |
|
} |
|
else str[u++]=pch[i++]; |
|
break; |
|
default: str[u++]=pch[i++]; |
|
} |
|
} |
|
while((u>0)&&(isblank(str[u-1])))u--; |
|
str[u]='\0'; |
|
|
|
while(str[j]){ |
|
while(isblank(str[j]))j++; |
|
i=j; |
|
while((str[j])&&(!isblank(str[j])))j++; |
|
lst[k]=new char[j-i+1]; |
|
strncpy(lst[k],str+i,j-i); |
|
lst[k][j-i]='\0'; |
|
if(str[j]){ |
|
k++; |
|
j++; |
|
} |
|
} |
|
num=k+1; |
|
} |
|
|
|
|
|
|
|
|
|
void AbbrvE::token2(const char *pch){ |
|
long i=1,j=0,k=0; |
|
long u=1; |
|
vector<bool> openChFlag1,openChFlag2; |
|
long cflag; |
|
long ii, jj, kk, sz; |
|
char c,*str=cnam; |
|
clear(); |
|
cnam[0]=pch[0]; |
|
while(c=pch[i]){ |
|
switch(c){ |
|
case '(': |
|
|
|
ii=kk=i; |
|
cflag=0; |
|
while(pch[ii] && !isblank(pch[ii])) { |
|
if(pch[ii]=='(') cflag -= 1; |
|
else if(pch[ii]==')') { cflag += 1; kk=ii; } |
|
ii++; |
|
} |
|
|
|
if(!cflag && isalnum(pch[kk+1])) { |
|
while(i<ii) str[u++]=pch[i++]; |
|
break; |
|
} |
|
|
|
|
|
if(isblank(str[u-1])){ |
|
str[u++]=pch[i++]; |
|
if(!isblank(pch[i])){ |
|
str[u++]=' '; |
|
} |
|
openChFlag1.push_back(true); |
|
} |
|
else { |
|
str[u++]=pch[i++]; |
|
openChFlag1.push_back(false); |
|
} |
|
|
|
break; |
|
|
|
case ')': sz = openChFlag1.size(); |
|
if(sz>0 && openChFlag1[sz-1]){ |
|
if(!isblank(str[u-1])){ |
|
str[u++]=' '; |
|
str[u++]=pch[i++]; |
|
} |
|
|
|
else if(!isblank(pch[i+1])){ |
|
str[u++]=pch[i++]; |
|
} |
|
|
|
|
|
if(!isblank(pch[i]))str[u++]=' '; |
|
} |
|
else str[u++]=pch[i++]; |
|
|
|
if(sz>0) openChFlag1.pop_back(); |
|
|
|
break; |
|
|
|
case '[': |
|
|
|
ii=kk=i; |
|
cflag=0; |
|
while(pch[ii] && !isblank(pch[ii])) { |
|
if(pch[ii]=='[') cflag -= 1; |
|
else if(pch[ii]==']') { cflag += 1; kk=ii; } |
|
ii++; |
|
} |
|
|
|
if(!cflag && isalnum(pch[kk+1])) { |
|
while(i<ii) str[u++]=pch[i++]; |
|
break; |
|
} |
|
|
|
|
|
if(isblank(str[u-1])){ |
|
str[u++]=pch[i++]; |
|
if(!isblank(pch[i])){ |
|
str[u++]=' '; |
|
} |
|
openChFlag2.push_back(true); |
|
} |
|
else { |
|
str[u++]=pch[i++]; |
|
openChFlag2.push_back(false); |
|
} |
|
|
|
break; |
|
|
|
case ']': sz=openChFlag2.size(); |
|
if(sz>0 && openChFlag2[sz-1]){ |
|
if(!isblank(str[u-1])){ |
|
str[u++]=' '; |
|
str[u++]=pch[i++]; |
|
} |
|
|
|
else if(!isblank(pch[i+1])){ |
|
str[u++]=pch[i++]; |
|
} |
|
|
|
if(!isblank(pch[i]))str[u++]=' '; |
|
} |
|
else str[u++]=pch[i++]; |
|
|
|
if(sz>0) openChFlag2.pop_back(); |
|
|
|
break; |
|
default: str[u++]=pch[i++]; |
|
} |
|
} |
|
while((u>0)&&(isblank(str[u-1])))u--; |
|
str[u]='\0'; |
|
|
|
while(str[j]){ |
|
while(isblank(str[j]))j++; |
|
i=j; |
|
while((str[j])&&(!isblank(str[j])))j++; |
|
lst[k]=new char[j-i+1]; |
|
strncpy(lst[k],str+i,j-i); |
|
lst[k][j-i]='\0'; |
|
if(str[j]){ |
|
k++; |
|
j++; |
|
} |
|
} |
|
num=k+1; |
|
} |
|
|
|
|
|
void AbbrvE::clear(void){ |
|
for ( int i=0; i<num; i++ ) { |
|
delete [] lst[i]; |
|
} |
|
num=0; |
|
} |
|
|
|
void AbbrvE::cleara(void){ |
|
long i; |
|
for(i=0;i<numa;i++){ |
|
delete [] abbl[i]; |
|
delete [] abbs[i]; |
|
} |
|
numa=0; |
|
} |
|
|
|
#if 0 |
|
|
|
|
|
int AbbrvE::Test(const char *str){ |
|
long i,j,k; |
|
char b,c; |
|
|
|
if(strchr(str,'='))return(0); |
|
if(!strcmp(str,"author's transl"))return(0); |
|
if(!strcmp(str,"proceedings"))return(0); |
|
|
|
if(!strcmp(str,"see"))return(0); |
|
if(!strcmp(str,"and"))return(0); |
|
if(!strcmp(str,"comment"))return(0); |
|
if(!strcmp(str,"letter"))return(0); |
|
|
|
if((str[0]=='e')&&(str[1]=='g')){ |
|
if(!(c=str[2])||(c=='.')||(c==','))return(0); |
|
} |
|
if((str[0]=='s')&&(str[1]=='e')&&(str[2]=='e')&&(((b=str[3])==' ')||(b==',')))return(0); |
|
if('p'==tolower(str[0])){ |
|
if(strchr(str+1,'<'))return(0); |
|
} |
|
i=j=k=0; |
|
while((c=str[i])&&(c!=' ')){ |
|
i++; |
|
if(isdigit(c))j++; |
|
if(isalpha(c))k++; |
|
if((i==j)&&(i==3))return(0); |
|
} |
|
if((i==j)||(k==0))return(0); |
|
else return(1); |
|
} |
|
|
|
#endif |
|
|
|
bool AbbrvE::prefix_match( const char *str ) { |
|
size_t size = strlen(str); |
|
for ( int i = 0; i < prefix.size(); ++i ) { |
|
string& pre = prefix[i]; |
|
if ( size > pre.size() and |
|
0 == pre.compare( 0, pre.size(), str, pre.size() ) ) |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
|
|
|
|
bool AbbrvE::Test(const char *str){ |
|
|
|
if ( match.find(str) != match.end() ) return false; |
|
if ( prefix_match(str) ) return false; |
|
|
|
size_t length, letters, digits; |
|
length = letters = digits = 0; |
|
|
|
char c; |
|
while((c=str[length])&&(c!=' ')){ |
|
length++; |
|
if ( isdigit(c) ) digits++; |
|
if ( isalpha(c) ) letters++; |
|
|
|
if( length==digits and length>=3 ) return false; |
|
} |
|
if ( digits == length ) return false; |
|
if ( letters <= 0 ) return false; |
|
|
|
return true; |
|
} |
|
|
|
void AbbrvE::setup_Test( void ) { |
|
|
|
match.insert("author's transl"); |
|
match.insert("proceedings"); |
|
match.insert("see"); |
|
match.insert("and"); |
|
match.insert("comment"); |
|
match.insert("letter"); |
|
match.insert("eg"); |
|
|
|
prefix.push_back("="); |
|
prefix.push_back("eg."); |
|
prefix.push_back("eg,"); |
|
prefix.push_back("see "); |
|
prefix.push_back("see,"); |
|
prefix.push_back("p<"); |
|
prefix.push_back("P<"); |
|
|
|
|
|
match.insert("e.g."); |
|
match.insert("ie"); |
|
match.insert("i.e."); |
|
match.insert("mean"); |
|
match.insert("age"); |
|
match.insert("std"); |
|
match.insert("range"); |
|
match.insert("young"); |
|
match.insert("old"); |
|
match.insert("male"); |
|
match.insert("female"); |
|
|
|
} |
|
|
|
void AbbrvE::Proc(char *pxh){ |
|
long i,j; |
|
char *pch,*ptr; |
|
pMt->segment(pxh); |
|
for(i=0;i<pMt->sent.size();i++){ |
|
Extract2( (pMt->sent[i]).c_str() ); |
|
} |
|
|
|
seq.flag_seq( numa, abbs ); |
|
j=0; |
|
for(i=0;i<numa;i++){ |
|
if( seq.rate(i) ){ |
|
if(j<i){ |
|
pch=abbl[i]; |
|
if(ptr=strchr(pch,'|')){ |
|
*ptr='/'; |
|
ptr++; |
|
while(ptr=strchr(pch,'|')){ |
|
*ptr='/'; |
|
ptr++; |
|
} |
|
} |
|
abbl[j]=pch; |
|
pch=abbs[i]; |
|
if(ptr=strchr(pch,'|')){ |
|
*ptr='/'; |
|
ptr++; |
|
while(ptr=strchr(pch,'|')){ |
|
*ptr='/'; |
|
ptr++; |
|
} |
|
} |
|
abbs[j]=pch; |
|
nt[j]=nt[i]; |
|
} |
|
j++; |
|
} |
|
else { |
|
delete [] abbl[i]; |
|
delete [] abbs[i]; |
|
} |
|
} |
|
|
|
numa=j; |
|
} |
|
|
|
} |
|
|