Upload folder using huggingface_hub

d5062c8 verified over 1 year ago

4.39 kB

	#ifndef _MPTOK_H
	#define _MPTOK_H

	#include <stdio.h>

	#include <string>
	#include <vector>
	#include <map>
	#include <set>

	// NOTE(review): a using-directive at header scope leaks into every file that
	// includes this header. The declarations below rely on it for the unqualified
	// string/vector/map/set names, so it cannot be removed without qualifying them.
	using namespace std;

	// Version number of the tokenizer; 11 is the latest version.
	// NOTE(review): presumably the value that MPtok::option_new defaults to -- confirm
	// in the implementation file.
	#define MPTOK_VERSION 11 // The latest version

	// Maximum number of words in a sentence

	#define MAX_WORDS 10000

	// Anonymous enum, so the values are plain ints: ABB_ABB = 0, ABB_EOS = 1, ABB_NUM = 2.
	// NOTE(review): presumably the abbreviation categories stored as the int values of
	// MPtok::common_abbr -- confirm against init_abbr() in the implementation file.
	enum { ABB_ABB, ABB_EOS, ABB_NUM };
	// Not referenced anywhere in this header; its meaning is defined by the implementation file.
	#define MAX_ABB 100

	/*! \brief A class to perform tokenization.
	*
	* The MPtok class can be used to perform tokenization and segmentation
	* of strings into tokens or sentences. It is inherited and used by MPtag
	* so if the user is only interested in tagging, this class does not
	* need to be referenced.
	*
	* Results of the most recent call are exposed through the public vectors
	* \p word, \p tag and \p sent.
	*
	* NOTE(review): the class declares a destructor and holds raw pointers
	* (\p text, \p tokflag, \p endflag) but declares no copy constructor or
	* copy assignment. Whether these pointers are owned is not visible in this
	* header -- confirm that copying an MPtok is safe before doing so.
	*/

	class MPtok
	{
	public:
	/// \brief Construct an MPtok object, giving the install directory \p idir where data files can be found
	///
	/// Both arguments default to the empty string.
	/// NOTE(review): \p cnam is presumably the suffix stored in option_cnam -- confirm in the implementation.
	MPtok(string idir = "", const string& cnam = "");
	// NOTE(review): the destructor is not virtual, yet the class comment says MPtag
	// inherits from MPtok; deleting a derived object through an MPtok* would be
	// undefined behavior.
	~MPtok();

	void init(); // Initialize (call only once)
	// Initialize using specified install directory: stores idir in option_dir, then calls init()
	void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory

	// Option values. Each has a matching set_*() member declared below.
	string option_pretag; // The tag to use on tokens
	int option_segment; // Segment into sentences
	int option_hyphen; // Hyphens are separate tokens
	int option_comma; // Commas are always tokenized
	int option_pretok; // The text is pre-tokenized
	int option_new; // Use new algorithms, used in development only
	int option_doteos; // If " . " occurs, it is an end of sentence (EOS) (new >= 5)

	void set_segment(int i); ///< \brief Sentences are broken up during tokenization (default 1)
	void set_token(int i); ///< \brief Break tokens apart with white space (default 1)
	void set_hyphen(int i); ///< \brief Hyphens are separate tokens (default 0)
	void set_comma(int i); ///< \brief Commas are separate tokens (default 1)
	// NOTE(review): takes a non-const char*, so string literals cannot be passed without a cast.
	void set_pretag(char *a); ///< \brief Use this tag on all tokens (default empty string)
	void set_pretok(int i); ///< \brief Assume string is already tokenized using spaces (default 0)
	void set_new(int i); ///< \brief Use a previous algorithm (defaults to most recent)
	void set_doteos(int i); ///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)

	void merge_words(int s, int e); // merge words between s and e (idiom)
	void split_words(void); // split all merged words

	string tokenize(const string&); ///< \brief Tokenize, save (in \p word), and return space delimited tokens
	string segment(const string&); ///< \brief Segment, save (in \p sent), and return newline delimited sentences

	string save_string(const string&); // save a buffer
	string tokenize_nosave(const string&); // tokenize without saving
	string tokenize(const string&,int); // do tokenization with or without inserting spaces between the tokens

	void print(int); ///< \brief Print tokens/tags with given verbosity

	vector<string> word; ///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
	vector<string> tag; ///< \brief Vector of tags of most recently tagged (or tokenized) text
	vector<string> sent; ///< \brief Vector of sentences of most recently sentence-segmented text

	// Working buffers for the text being processed.
	// NOTE(review): allocation and release of these raw pointers happen outside this
	// header; presumably tokflag/endflag hold one flag per character of text -- confirm.
	char *text; // Input text arg
	int text_len; // Its length
	int *tokflag; // token flags
	int *endflag; // end-sentence flags

	string option_cnam; // A suffix, for opening variant support files
	string option_dir; // Directory to find things

	protected:

	// Helpers available to derived classes; all are defined outside this header.
	void set_tokflag();
	void set_endflag();
	void set_endflag_01();
	int size_buff();

	void init_pair(const string& file_name); // read a file of common pairs
	void init_abbr(const string& file_name); // read a file of abbreviations

	// Numbered tokenization helpers.
	// NOTE(review): presumably one tokenization rule or pass each, applied in numeric
	// order -- confirm in the implementation file.
	void tok_0();
	void tok_1();
	void tok_2();
	void tok_3();
	void tok_5_6_7();
	void tok_8_9();
	void tok_10();
	void tok_11();
	void tok_12();
	void tok_13();
	void tok_14();
	void tok_15();
	void tok_15_1();
	void tok_16();
	void tok_16_1();
	void tok_17();
	void tok_20();
	void tok_20_1();
	void tok_20_2();
	void tok_21();
	void tok_21a();
	void tok_22();
	void tok_23();
	void tok_24();
	void tok_25();
	void tok_26();
	void tok_27();
	void tok_28();
	void tok_29();
	void tok_29a();
	void tok_30();
	void tok_31();
	void tok_32();
	void tok_33();
	int complex_check();
	void map_escapes();
	void tok_un();

	void append_token(string&, int&, char*, int);
	string token_string();

	// NOTE(review): presumably filled by init_pair() and init_abbr() respectively, with
	// the int value of common_abbr being one of ABB_ABB/ABB_EOS/ABB_NUM -- confirm.
	set<string> common_pair;
	map<string,int> common_abbr;

	private:
	int option_token; // Output tokenized text (only use internally)
	int tok_initialized; // is it inited?
	};

	#endif