using namespace std; | |
using namespace iret; | |
class WordData { | |
public: | |
WordData(const char *wrdname="wrdset3", const char *stpname="stop", | |
const char *lfsname="Lf1chSf"); | |
~WordData(); | |
Chash wrdset; //sigle word in MEDLINE | |
Hash stp; //stopword | |
Hash lfs; //lfs (1-ch sf) for FirstLet match cases >=2 | |
}; | |
class WordData;  // forward declaration: AbbrStra only stores a pointer to it

// Abstract base class of the short-form (SF) / long-form (LF) matching
// strategies. A concrete strategy implements strategy(); the helpers declared
// here tokenize the candidate text, search for a match model (mod) and test
// conditions on that model.
class AbbrStra {
public:
  AbbrStra();
  // Virtual because this is a polymorphic base: concrete strategies are handed
  // out as AbbrStra* (see StratUtil::strat_factory), and deleting a derived
  // object through a base pointer with a non-virtual destructor is undefined
  // behavior.
  virtual ~AbbrStra();

  // --- tokenizing / string helpers
  // NOTE(review): helpers without a comment are not documented in this file;
  // their exact semantics live in the implementation.
  void token(const char *str, char lst[1000][1000]);     // tokenize & set ntk
  long tokenize(const char *str, char lst[1000][1000]);  // tokenize & return # tokens
  long num_token(const char *str);                       // return # tokens
  long first_ch(const char *str, char *fch, long num);
  long is_upperal(const char *str);
  long is_alpha(const char *str);
  void str_tolower(const char *str1, char *str2);
  long get_str(const char *str1, char *str2, long num);
  bool isupper_str(const char *str);
  bool is_onealpha(const char *str);
  long count_upperstr(const char *str);
  // return # upper-case 1st letters of consecutive tokens (backward)
  void get_alpha(const char *str1, char *str2);
  // set str2 with only the alphabetic characters of str1
  bool lf_ok(const char *shrtf, const char *longf);

  // --- strategy interface
  virtual bool set_condition(const char *sf);
  // must set nonAlphaSF=true if want to use SF containing non-alphabet
  virtual long strategy(const char *sf, const char *str) = 0;
  // sf & str will be lower-cased (Oct-25-2007)

  // --- match search
  long search_backward(long sloc, long tnum, long tloc, const char *sf, bool first);
  // search backward to find a match starting from sf[sloc]
  // Returns 1 if matches. sf[0] must match with the beginning of a word
  long search_backward_adv(const char *sf, bool first);
  // Searches for the next model setting. Returns 1 if it finds one.
  void extract_lf(long begin, long end);
  // save strings from begin to end of tok to lf
  void extract_lf(long begin, long end, const char *str);
  // save strings from begin to end of str's tok to lf

  // --- after mod is set, check conditions
  // nsf: # ch in sf, nsw: # allowed skip words,
  // general: true allows 1st ch match after non-alnum
  bool exist_skipword(long nsf);
  // true if at least one skip word exists
  bool exist_n_skipwords(long nsf, long n);
  // true if n consecutive skip words exist between tokens, but never more than n
  bool exist_n_stopwords(long nsf, long n);
  // true if n consecutive skip stopwords exist between tokens, but never more than n
  bool stopword_ok(long nsf, long nsw);
  // true if at most (can be 0) nsw skip stopwords in a row exist
  bool skip_stop_ok(long nsf, long nsw, long n);
  // true if at most (can be 0) nsw skip words, which include at least n
  // stopwords, in a row exist
  bool skip_stop_ok2(long nsf, long nsw, long n);
  // true if nsw skip words, which include at least n stopwords, in a row exist
  bool skipword_ok(long nsf, long nsw);
  // true if at most (can be 0) nsw skip words in a row exist
  bool is_subword(long nsf);
  // true if the matching string is the beginning of a tok or a word in wrdlist
  bool is_BeginWrdMatch(long nsf, bool general);
  // true if the beginning ch of a word matches
  // if general is true, allow match after non-alnum (eg, 1-alpha)
  bool is_WithinWrdMatch(long nsf, bool general);
  // true if within-word match
  // if general is true, 1-Alpha: 'A' is not a within-word match
  bool is_FirstLetMatch(long nsf, bool general);
  // true if each ch of sf matches the 1st ch of a word
  // (true: Alpha anyword Beta (AB))
  // if general=true, true: 1-Alpha Beta, Alpha-Beta
  bool is_FirstLetMatch2(long nsf, bool general);
  // at least one 1-Alpha
  bool is_FirstLetSMatch(const char *sf, bool general);
  // true if first letters match & 's' matches the last ch of lf
  bool is_ContLetMatch(long nsf);
  // true if two or more consecutive letters match

  // --- state
  char *pch;                  // sf applied to a strategy
  char *ps, *pl;              // sf, potential lf
  char sf[100], text[10000];  // sf & potential lf used in a strategy
  char lf[10000];             // lf found by a strategy
  char tok[1000][1000];       // tokens of potential lf
  // lower-cased after strategy, original after extract_lf(b,e,str)
  long ntk;                   // # tokens
  long mod[100][2];           // match locations of tok with a given sf
  // mod[sf_inx][0]=tok inx, mod[sf_inx][1]=match loc in tok[mod[sf_inx][0]]

  // --- statistics, for each n_ch-SF
  long npairs;        // selected pairs for this strategy
  long tpairs;        // total pairs
  long nsfs;          // # selected unique sfs for this strategy
  long nmatchs;       // # matches (successful strategy & given sf == real sf)
  long amatchs;       // # accumulated matches up to this strategy
  long setCondition;  // SF condition
  long greaterEqNsf;  // if 1 select SF |SF|>=nsf
  WordData *wData;    // NOTE(review): ownership is not visible here -- confirm
};
/*
  alpha beta gamma (ABG)
*/
class FirstLet : public AbbrStra { | |
public: | |
virtual bool set_condition(const char *str1, const char *str2, char *str); | |
virtual long strategy(const char *sf, const char *str); | |
}; | |
class FirstLetOneChSF : public AbbrStra { | |
public: | |
virtual bool set_condition(const char *str1, const char *str2, char *str); | |
virtual long strategy(const char *sf, const char *str); | |
}; | |
/*
  - sf ch matches with 1st ch or ch right after non-alphanum of lf,
    but at least one match must be right after non-alphanum
    (eg, success: 1-alpha 2-beta (AB), alpha-beta (AB);
         fail: alpha beta (AB))
*/
class FirstLetGen : public AbbrStra {
public:
  // Implements AbbrStra::strategy: each sf ch matches the 1st ch, or the ch
  // right after a non-alphanumeric, of the lf; at least one match must be
  // right after a non-alphanumeric.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - sf ch matches with 1st ch or ch right after non-alphanum of lf
    (eg, success: 1-alpha 2-beta (AB), alpha-beta (AB),
         alpha beta (AB))
*/
class FirstLetGen2 : public AbbrStra {
public:
  // Implements AbbrStra::strategy: each sf ch matches the 1st ch, or the ch
  // right after a non-alphanumeric, of the lf (plain "alpha beta (AB)" is
  // accepted as well).
  virtual long strategy(const char *sf, const char *str);
};
/*
  For sf consisting of capital letters & lower-case 's'
  - First letter & 's' in the last token of lf
    (success: Alpha Betas (ABs), 1-Alpha Betas (ABs),
              1-Alpha-Betas (ABs), Alpha BetaS (ABs);
     fail: Alpha Beta xxs (ABs))
*/
class FirstLetGenS : public AbbrStra {
public:
  // Overrides AbbrStra::set_condition. sf must be an original sf;
  // true if sf is like ABCs.
  virtual bool set_condition(const char *sf);
  // Implements AbbrStra::strategy: first-letter match plus an 's' in the last
  // token of the lf, for sf made of capital letters and a lower-case 's'.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - sf ch matches with 1st ch or ch right after non-alphanum of lf
  - allowing one skip stopword between tokens (no more than one in a row),
    at least one skip stopword in total
    (eg, success: alpha and beta (AB), 1-alpha and beta (AB);
         fail: alpha beta (AB), alpha word beta (AB))
*/
class FirstLetGenStp : public AbbrStra {
public:
  // Implements AbbrStra::strategy: generalized first-letter match that allows
  // one skip stopword between tokens (no more than one in a row) and requires
  // at least one skip stopword in total.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - same as FirstLetGenStp except that it allows 2 skip stopwords
    & requires at least one occurrence of two consecutive skip stopwords
*/
class FirstLetGenStp2 : public AbbrStra {
public:
  // Implements AbbrStra::strategy: like FirstLetGenStp, but for 2 skip
  // stopwords, with at least one run of two consecutive skip stopwords.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - same as FirstLetGenStp except that any word, not only a stopword, may be skipped
*/
class FirstLetGenSkp : public AbbrStra {
public:
  // Implements AbbrStra::strategy: like FirstLetGenStp, but the skipped word
  // may be any word instead of a stopword.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - a matching sub-string must be a word
    (eg, success: AlphaBeta (AB), where Beta is a word;
                  x-AlphaBeta (AB))
  - at least one within-word match
    (eg, fail: Alpha Beta Word (ABW), Alpha x-Beta x-Word (ABW);
         success: AlphaBeta Word (ABW), x-AlphaBeta inWord (ABW))
*/
class WithinWrdWrd : public AbbrStra {
public:
  // Implements AbbrStra::strategy: a matching sub-string must itself be a
  // word, and at least one match must be a within-word match.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - WithinWrdWrd w/ Begin Word Match
    (success: AlphaBeta x-Word (ABW);
     fail: AlphaBeta inWord (ABW))
*/
class WithinWrdFWrd : public AbbrStra {
public:
  // Implements AbbrStra::strategy: WithinWrdWrd combined with a begin-word
  // match requirement.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - WithinWrdFWrd w/ allowing one skip word between tokens (no more than one in a row),
    at least one skip word in total
    (success: AlphaBeta zzz x-Word zzz (ABW);
     fail: AlphaBeta x-Word (ABW), AlphaBeta zzz yyy x-Word (ABW))
*/
class WithinWrdFWrdSkp : public AbbrStra {
public:
  // Implements AbbrStra::strategy: WithinWrdFWrd that allows one skip word
  // between tokens (no more than one in a row) and requires at least one skip
  // word in total.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - at least one within-word match
    (success: Alpha InXyy (AX), x-Alpha InXyy (AX);
     fail: Alpha Xyy (AX), Alpha 1-Xyy (AX))
*/
class WithinWrdLet : public AbbrStra {
public:
  // Implements AbbrStra::strategy: requires at least one within-word match.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - WithinWrdLet w/ Begin Word Match
    (fail: Alpha InXyy (AX), x-Alpha InXyy (AX);
     success: AlphaXyy Word (AXW), x-AlphaXyy 1-Word (AXW))
*/
class WithinWrdFLet : public AbbrStra {
public:
  // Implements AbbrStra::strategy: WithinWrdLet combined with a begin-word
  // match requirement.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - WithinWrdFLet w/ allowing one skip word between tokens (no more than one in a row),
    at least one skip word in total
    (success: AlphaXyy zzz Word zzz (AXW);
     fail: AlphaXyy Word (AXW), AlphaXyy zzz yyy Word (AXW))
*/
class WithinWrdFLetSkp : public AbbrStra {
public:
  // Implements AbbrStra::strategy: WithinWrdFLet that allows one skip word
  // between tokens (no more than one in a row) and requires at least one skip
  // word in total.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - any two consecutive letters matching w/ begin word match
    eg) ABxxx (AB), 1-ABxxx (AB), ABxxx Cxxx (ABC), Axxx BCxxx (ABC),
        prolactin (PRL), succinylcholine (SCh)
*/
class ContLet : public AbbrStra {
public:
  // Implements AbbrStra::strategy: any two consecutive letters matching,
  // together with a begin-word match.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - ContLet w/ allowing one skip word between tokens (no more than one in a row),
    at least one skip word in total
*/
class ContLetSkp : public AbbrStra {
public:
  // Implements AbbrStra::strategy: ContLet that allows one skip word between
  // tokens (no more than one in a row) and requires at least one skip word in
  // total.
  virtual long strategy(const char *sf, const char *str);
};
/*
  - match can occur anywhere
  - allow one skip word between tokens (no more than one in a row)
    (success: Alpha yXyy (AX), Alpha yXyy word (AX),
              1-Alpha yXyy word (AX))
*/
class AnyLet : public AbbrStra {
public:
  // Implements AbbrStra::strategy: a match can occur anywhere; one skip word
  // is allowed between tokens (no more than one in a row).
  virtual long strategy(const char *sf, const char *str);
};
class AbbrStra;  // forward declaration: only a pointer to it is used here

// Utilities for choosing and creating strategies: assigns an SF to a group
// and keeps, per group, the sequence of strategy names to try.
// Standard-library types are spelled with std:: so this declaration does not
// depend on a file-scope using-directive.
class StratUtil {
public:
  // Returns a strategy object for the given strategy name.
  // NOTE(review): presumably heap-allocated and owned by the caller -- confirm
  // against the implementation.
  AbbrStra *strat_factory(std::string name);
  std::vector<std::string> get_strats(std::string s);
  // get the strategy sequence for a given #-ch SF group
  void push_back_strat(std::string sgp, std::string strat);
  bool group_sf(const char *sf, std::string &grp);
  // check if sf is ok and assign a group
  bool group_sf(const char *sf, const char *lf, std::string &grp);
  // add the condition |lf|>|sf|
  void remove_nonAlnum(const char *str1, char *str2);
  // remove non-alnum in str1 and save it to str2
  long exist_upperal(const char *str);  // return 1 if an upper-case char exists, 0 otherwise
  long num_token(const char *str);      // return # tokens

  // Strategy name sequences per SF group.
  // NOTE(review): names suggest alphabetic / numeric / special-character SFs
  // grouped by SF length -- confirm against group_sf().
  std::vector<std::string> Al1, Al2, Al3, Al4, Al5;
  std::vector<std::string> Num2, Num3, Num4, Num5;
  std::vector<std::string> Spec2, Spec3, Spec4, Spec5;
};