steventango
/

GNorm2-docker

Model card Files Files and versions Community

File size: 4,394 Bytes

d5062c8

#ifndef _MPTOK_H
#define _MPTOK_H

#include <stdio.h>

#include <string>
#include <vector>
#include <map>
#include <set>

using namespace std;

#define MPTOK_VERSION 11			// The latest version

// Maximum number of words in a sentence

#define MAX_WORDS 10000

enum { ABB_ABB, ABB_EOS, ABB_NUM };
#define MAX_ABB 100

/*! \brief A class to perform tokenization.
 *
 * The MPtag class can be used to perform tokenization and segmentation
 * of strings into tokens or sentences. It is inherited and used by MPtag
 * so if the user is only interested in tagging, this class does not
 * need to be referenced.
 */

class MPtok
{
public:
	/// \brief A MPtok object, giving the install directory \p idir where data files can be found
	MPtok(string idir = "", const string& cnam = "");
	~MPtok();

	void init();				// Initialize (call only once)
	void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory

	string option_pretag;			// The tag to use on tokens
	int option_segment;			// Segment into sentences
	int option_hyphen;			// Hyphens are separate tokens
	int option_comma;			// Commas are always tokenized
	int option_pretok;			// The text is pre-tokenized
	int option_new;				// Use new algorithms, used in development only
	int option_doteos;			// If " . " occurs, it's an end EOS (new >= 5)

	void set_segment(int i);		///< \brief Sentences are broken up during tokenization (default 1)
	void set_token(int i);			///< \brief Break tokens apart with white space (default 1)
	void set_hyphen(int i);			///< \brief Hyphens are separate tokens (default 0)
	void set_comma(int i);			///< \brief Commas are separate tokens (default 1)
	void set_pretag(char *a);		///< \brief Use this tag on all tokens (default empty string)
	void set_pretok(int i);			///< \brief Assume string is already tokenized using spaces (default 0)
	void set_new(int i);			///< \brief Use a previous algorithm (defaults to most recent)
	void set_doteos(int i);			///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)

	void merge_words(int s, int e);		// merge words between s and e (idiom)
	void split_words(void);			// split all merged words

	string tokenize(const string&);		///< \brief Tokenize, save (in \p word), and return space delimited tokens
	string segment(const string&);		///< \brief Segment, save (in \p sent), and return newline delimited sentences

	string save_string(const string&);	// save a buffer
	string tokenize_nosave(const string&);	// tokenize without saving
	string tokenize(const string&,int);	// do tokenization with or without inserting spaces between them

	void print(int);			///< \brief Print tokens/tags with given verbosity

	vector<string> word;			///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
	vector<string> tag;			///< \brief Vector of tags of most recently tagged (or tokenized) text
	vector<string> sent;			///< \brief Vector of sentences of most recently sentence-segmented text

	char	*text;				// Input text arg
	int	text_len;			// It's length
	int	*tokflag;			// token flags
	int	*endflag;			// end-sentence flags

	string option_cnam;			// A suffix, for opening variant support files
	string option_dir;			// Directory to find things

protected:

	void set_tokflag();
	void set_endflag();
	void set_endflag_01();
	int size_buff();

	void init_pair(const string& file_name);	// read a file of common pairs
	void init_abbr(const string& file_name);	// read a file of abbreviations

	void tok_0();
	void tok_1();
	void tok_2();
	void tok_3();
	void tok_5_6_7();
	void tok_8_9();
	void tok_10();
	void tok_11();
	void tok_12();
	void tok_13();
	void tok_14();
	void tok_15();
	void tok_15_1();
	void tok_16();
	void tok_16_1();
	void tok_17();
	void tok_20();
	void tok_20_1();
	void tok_20_2();
	void tok_21();
	void tok_21a();
	void tok_22();
	void tok_23();
	void tok_24();
	void tok_25();
	void tok_26();
	void tok_27();
	void tok_28();
	void tok_29();
	void tok_29a();
	void tok_30();
	void tok_31();
	void tok_32();
	void tok_33();
	int complex_check();
	void map_escapes();
	void tok_un();

	void append_token(string&, int&, char*, int);
	string token_string();

	set<string> common_pair;
	map<string,int> common_abbr;

private:
	int option_token;			// Output tokenized text (only use internally)
	int tok_initialized;			// is it inited?
};

#endif