#ifndef ABBRVE_H #define ABBRVE_H #include #include #include #include #include using namespace std; namespace iret { typedef vector strings; class Find_Seq { public: Find_Seq( void ); // flag the SFs whether part of sequence or not void flag_seq( int numa, char* abbs[] ); // true if good SF, false if part of sequence bool rate( int i ) const { my_rate[i]; } private: void find_seq( const vector & seq ); void create_seq( void ); // const works with c++0x /* const */ strings seq_i; /* const */ strings seq_I; /* const */ strings seq_a; /* const */ strings seq_A; vector my_rate; int my_numa; char ** my_abbs; // really char *[], but that doesn't work }; class AbbrvE { public: AbbrvE(long ta=10000,long wrd_spc=10000); //Sets space for extracted //potential abbreviations to ta & word_space to wrd_spc ~AbbrvE(void); void Extract(char *pch); //Extracts possible long-short form //pairs, but does not attempt to find the relationship void Extract2(const char *pch); //extened version (Jan-9-2008) bool Test(const char *str); //Tests a single token and returns true //if the token should be a possible first token of a short form void Rate(void); //Sets ratings for the proposed pairs. Effort to //remove (a), (b), etc., sequence markers void token(const char *str); //Produces a list of tokens in order of //of occurrence in the string. void token2(const char *str); //extended version (Jan-9-2008) void cleara(void); //Clear the abbl & abbs memory of strings void clear(void); //Clear the lst memory of words //Application functions void Proc(char *pch); //Accepts a natural language statement and //processes to final results stored in tta, abbs, and abbl //Need to call cleara function after each use of this function // Internal routines: // setup data for Test method void setup_Test( void ); bool prefix_match( const char *str ); // does str begins with a prefix? //Data long tta; //Total possible abbreviations extracted //default 10k long numa; //number of abbreviations in current extract char **abbl; //Long form space, hold up to 10 tokens char **abbs; //Short form space, hold up to 10 tokens Find_Seq seq; // identify sequences to ignore int *nt; //Number of tokens within parentheses long word_space; //Space in lst for tokens //default 10k long num; //Number of tokens char **lst; //Holds the tokens static const int cnam_size=100000; char cnam[cnam_size]; //Work space MPtok *pMt; //Pointer at tokenizer class. Used to segment text //in Proc function. // Test data set match; // bad SF to match exactly vector prefix; // bad SF to match prefix }; } #endif