steventango
/

GNorm2-docker

Model card Files Files and versions Community

GNorm2-docker / Library /AbbrStra.h

steventango

Upload folder using huggingface_hub

d5062c8 verified about 1 year ago

raw

history blame contribute delete

10.4 kB

	#ifndef ABBRSTRA_H
	#define ABBRSTRA_H

	#include <vector>
	#include <string>
	#include <Hash.h>

	using namespace std;
	using namespace iret;


	/*
	Word resources used by the abbreviation strategies: a word set, a stopword
	list and a list of long forms for 1-ch short forms.
	*/
	class WordData {
	public:
		//Each argument is the name of one resource (C string).
		//NOTE(review): how a name is resolved (file, set, path) is decided by
		//the implementation, which is not visible in this header -- confirm there.
		WordData(const char *wrdname="wrdset3", const char *stpname="stop",
			const char *lfsname="Lf1chSf");

		~WordData();

		Chash wrdset; //single word in MEDLINE
		Hash stp; //stopword
		Hash lfs; //lfs (1-ch sf) for FirstLet match cases >=2
	};


	class WordData; //re-declaration only, so the wData member below can name the type

	/*
	Abstract base class of the abbreviation matching strategies.
	A subclass implements strategy(); the members below hold the short form (sf),
	the tokens of the potential long form (lf) and the match locations (mod)
	that the helper predicates inspect.
	*/
	class AbbrStra {
	public:
		AbbrStra();
		//virtual: this class is a polymorphic base (see strategy()), so objects
		//of derived classes may be destroyed through an AbbrStra pointer
		virtual ~AbbrStra();
		void token(const char *str, char lst[1000][1000]); //tokenize & set ntk
		long tokenize(const char *str, char lst[1000][1000]); //tokenize & return # tokens
		long num_token(const char *str); //return # tokens
		long first_ch(const char *str, char *fch, long num);
		long is_upperal(const char *str);
		long is_alpha(const char *str);
		void str_tolower(const char *str1, char *str2);
		long get_str(const char *str1, char *str2, long num);
		bool isupper_str(const char *str);
		bool is_onealpha(const char *str);
		long count_upperstr(const char *str);
		//return # upper-case 1st letter of consecutive tokens (backward)
		void get_alpha(const char *str1, char *str2);
		//set str2 with only alphabet of str1
		bool lf_ok(const char *shrtf, const char *longf);

		virtual bool set_condition(const char *sf);
		//must set nonAlphaSF=true if want to use SF containing non-alphabet
		virtual long strategy(const char *sf, const char *str) = 0;
		//sf & str will be lower-cased (Oct-25-2007)
		long search_backward(long sloc, long tnum, long tloc, const char *sf, bool first);
		//search backward to find match starting from sf[sloc]
		//Returns 1 if matches. sf[0] must match with begin word
		long search_backward_adv(const char *sf, bool first);
		//Searches for next model setting. Returns 1 if finds one.
		void extract_lf(long begin, long end);
		//save strings from begin to end of tok to lf
		void extract_lf(long begin, long end, const char *str);
		//save strings from begin to end of str's tok to lf

		//---after set mod check conditions
		//nsf:# ch in sf, nsw:# allowed skipword, general:true allow 1st ch match after non-alnum
		bool exist_skipword(long nsf);
		//true if at least one skip word exists
		bool exist_n_skipwords(long nsf, long n);
		//true if exist n consecutive skip words between tokens but cannot be more than n
		bool exist_n_stopwords(long nsf, long n);
		//true if exist n consecutive skip stopwords between tokens but cannot be more than n
		bool stopword_ok(long nsf, long nsw);
		//true if at most (can be 0) nsw skip stopword in row exists
		bool skip_stop_ok(long nsf, long nsw, long n);
		//true if at most (can be 0) nsw skip word, which include at least n stopwords, in row exists
		bool skip_stop_ok2(long nsf, long nsw, long n);
		//true if nsw skip word, which include at least n stopwords, in row exists
		bool skipword_ok(long nsf, long nsw);
		//true if at most (can be 0) nsw skip word in row exists
		bool is_subword(long nsf);
		//true if matching string is begin of a tok or a word in wrdlist
		bool is_BeginWrdMatch(long nsf, bool general);
		//true if beginning ch of a word match
		//if general is true, allow match after non-alnum (eg, 1-alpha)
		bool is_WithinWrdMatch(long nsf, bool general);
		//true if within word match
		//if general is true, 1-Alpha: 'A' is not within word match
		bool is_FirstLetMatch(long nsf, bool general);
		//true if each ch of sf match with 1st ch of word
		//(true: Alpha anyword Beta (AB))
		//if general=true, true: 1-Alpha Beta, Alpha-Beta
		bool is_FirstLetMatch2(long nsf, bool general);
		//at least one 1-Alpha
		bool is_FirstLetSMatch(const char *sf, bool general);
		//true if first letter match & 's' match with last ch of lf
		bool is_ContLetMatch(long nsf);
		//true if two or more consecutive letter match
		//---

		char *pch; //sf applied to a strategy
		char *ps, *pl; //sf, potential lf
		char sf[100], text[10000]; //sf & potential lf used in a strategy
		char lf[10000]; //lf found by a strategy
		char tok[1000][1000]; //token of potential lf
		//lower after strategy, original after extract_lf(b,e,str)
		long ntk; //# tokens
		long mod[100][2]; //match locations of tok with a given sf
		//mod[sf_inx][0]=tok inx, mod[sf_inx][1]=match loc in tok[mod[sf_inx][0]]

		//for each n_ch-SF
		long npairs; //selected pairs for this strategy
		long tpairs; //total pairs
		long nsfs; //# selected unique sfs for this strategy
		long nmatchs; //# matches (success strategy & given sf == real sf)
		long amatchs; //# accumulated matches up to this strategy
		long setCondition; //SF condition
		long greaterEqNsf; //if 1 select SF with |SF|>=nsf

		WordData *wData;
	};


	/*
	First-letter strategy.
	eg) alpha beta gamma (ABG)
	*/
	class FirstLet : public AbbrStra {
	public:
		//Three-argument overload: it does not override the one-argument
		//AbbrStra::set_condition(const char *sf) but hides it in this class.
		//NOTE(review): the roles of str1/str2/str are not visible in this
		//header -- confirm against the implementation.
		virtual bool set_condition(const char *str1, const char *str2, char *str);
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	NOTE(review): presumably the first-letter strategy for one-character
	short forms (going by the class name) -- confirm against the implementation.
	*/
	class FirstLetOneChSF : public AbbrStra {
	public:
		//Three-argument overload: it does not override the one-argument
		//AbbrStra::set_condition(const char *sf) but hides it in this class.
		virtual bool set_condition(const char *str1, const char *str2, char *str);
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- sf ch matches with 1st ch or ch right after non-alphanum of lf,
	  but at least one match must be right after non-alphanum
	  (eg, success: 1-alpha 2-beta (AB), alpha-beta (AB)
	       fail:    alpha beta (AB))
	*/
	class FirstLetGen : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- sf ch matches with 1st ch or ch right after non-alphanum of lf
	  (eg, success: 1-alpha 2-beta (AB), alpha-beta (AB),
	                alpha beta (AB))
	*/
	class FirstLetGen2 : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	For sf consisting of capital letters & lower-case 's'
	- First letter & 's' in the last token of lf
	  (success: Alpha Betas (ABs), 1-Alpha Betas (ABs),
	            1-Alpha-Betas (ABs), Alpha BetaS (ABs)
	   fail:    Alpha Beta xxs (ABs))
	*/
	class FirstLetGenS : public AbbrStra {
	public:
		virtual bool set_condition(const char *sf); //sf must be an original sf
		//true if sf is like ABCs
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- sf ch matches with 1st ch or ch right after non-alphanum of lf
	- allowing one skip stopword between tokens (no more than one in row),
	  at least one skip stopword in total
	  (eg, success: alpha and beta (AB), 1-alpha and beta (AB)
	       fail:    alpha beta (AB), alpha word beta (AB))
	*/
	class FirstLetGenStp : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- same as FirstLetGenStp except for 2 skip stopwords
	  & at least one occurrence of two consecutive skip stopwords
	*/
	class FirstLetGenStp2 : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- same as FirstLetGenStp except that any word (not only a stopword)
	  may be the skip word
	*/
	class FirstLetGenSkp : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- a matching sub-string must be a word
	  (eg, success: AlphaBeta (AB), Beta is word
	                x-AlphaBeta (AB))
	- at least one within word match
	  (eg, fail:    Alpha Beta Word (ABW), Alpha x-Beta x-Word (ABW)
	       success: AlphaBeta Word (ABW), x-AlphaBeta inWord (ABW))
	*/
	class WithinWrdWrd : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- WithinWrdWrd w/ Begin Word Match
	  (success: AlphaBeta x-Word (ABW)
	   fail:    AlphaBeta inWord (ABW))
	*/
	class WithinWrdFWrd : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- WithinWrdFWrd w/ allowing one skip word between tokens (no more than one in row),
	  at least one skip word in total
	  (success: AlphaBeta zzz x-Word zzz (ABW)
	   fail:    AlphaBeta x-Word (ABW), AlphaBeta zzz yyy x-Word (ABW))
	*/
	class WithinWrdFWrdSkp : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- at least one within word match
	  (success: Alpha InXyy (AX), x-Alpha InXyy (AX)
	   fail:    Alpha Xyy (AX), Alpha 1-Xyy (AX))
	*/
	class WithinWrdLet : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- WithinWrdLet w/ Begin Word Match
	  (fail:    Alpha InXyy (AX), x-Alpha InXyy (AX)
	   success: AlphaXyy Word (AXW), x-AlphaXyy 1-Word (AXW))
	*/
	class WithinWrdFLet : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- WithinWrdFLet w/ allowing one skip word between tokens (no more than one in row),
	  at least one skip word in total
	  (success: AlphaXyy zzz Word zzz (AXW)
	   fail:    AlphaXyy Word (AXW), AlphaXyy zzz yyy Word (AXW))
	*/
	class WithinWrdFLetSkp : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- any two consecutive letters matching w/ begin word match
	  eg) ABxxx (AB), 1-ABxxx (AB), ABxxx Cxxx (ABC), Axxx BCxxx (ABC)
	      prolactin (PRL), succinylcholine (SCh)
	*/
	class ContLet : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- ContLet w/ allowing one skip word between tokens (no more than one in row),
	  at least one skip word in total
	*/
	class ContLetSkp : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	/*
	- match can occur anywhere
	- allow one skip word between tokens (no more than one in row)
	  (success: Alpha yXyy (AX), Alpha yXyy word (AX),
	            1-Alpha yXyy word (AX))
	*/
	class AnyLet : public AbbrStra {
	public:
		//sf: short form, str: potential long form text (both C strings)
		virtual long strategy(const char *sf, const char *str);
	};


	class AbbrStra; //re-declaration only, so strat_factory below can name the type

	/*
	Helpers around the strategies: creating a strategy by name, grouping short
	forms, and keeping the strategy sequence of each SF group.
	*/
	class StratUtil {
	public:
		//NOTE(review): returns a raw pointer; whether the caller owns the
		//returned object is not visible in this header -- confirm.
		AbbrStra *strat_factory(std::string name);
		std::vector<std::string> get_strats(std::string s);
		//get the strategy sequence for a given #-ch SF group
		void push_back_strat(std::string sgp, std::string strat);
		bool group_sf(const char *sf, std::string &grp);
		//check if sf is ok and assign a group
		bool group_sf(const char *sf, const char *lf, std::string &grp);
		//add the condition |lf|>|sf|
		void remove_nonAlnum(const char *str1, char *str2);
		//remove non-alnum in str1 and save it to str2
		long exist_upperal(const char *str); //return 1 if exists upper char, 0 ow
		long num_token(const char *str); //return # tokens

		//strategy names per SF group
		//NOTE(review): presumably Al/Num/Spec = alphabetic/numeric/special SFs
		//and the digit = # ch in the SF -- confirm against get_strats()
		std::vector<std::string> Al1, Al2, Al3, Al4, Al5;
		std::vector<std::string> Num2, Num3, Num4, Num5;
		std::vector<std::string> Spec2, Spec3, Spec4, Spec5;
	};


	#endif