File size: 4,394 Bytes
d5062c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#ifndef _MPTOK_H
#define _MPTOK_H

#include <stdio.h>

#include <string>
#include <vector>
#include <map>
#include <set>

using namespace std;

// NOTE(review): presumably the value option_new defaults to ("defaults to most
// recent" per set_new's documentation) -- confirm in the implementation.
#define MPTOK_VERSION 11			// The latest version

// Maximum number of words in a sentence
// NOTE(review): not referenced anywhere in this header; how the limit is
// enforced has to be checked in the implementation.

#define MAX_WORDS 10000

// Anonymous enum, so the ABB_* names are plain int constants (0, 1, 2).
// NOTE(review): presumably the category codes stored as the int values of
// MPtok::common_abbr -- TODO confirm against init_abbr().
enum { ABB_ABB, ABB_EOS, ABB_NUM };
// NOTE(review): not referenced in this header; presumably an upper bound
// related to abbreviation handling -- confirm in the implementation.
#define MAX_ABB 100

/*! \brief A class to perform tokenization.
 *
 * The MPtok class can be used to perform tokenization and segmentation
 * of strings into tokens or sentences. It is inherited and used by MPtag,
 * so if the user is only interested in tagging, this class does not
 * need to be referenced directly.
 */

class MPtok
{
public:
	/// \brief A MPtok object, giving the install directory \p idir where data files can be found
	// NOTE(review): cnam is presumably stored in option_cnam (the support-file
	// suffix declared below) -- confirm in the constructor definition.
	MPtok(string idir = "", const string& cnam = "");
	~MPtok();

	void init();				// Initialize (call only once)
	void init(const string& idir) { option_dir = idir; init(); } // Set option_dir to idir, then initialize via init()

	// Option fields. They are public; the set_*() methods declared below
	// presumably assign the correspondingly named field -- confirm in the implementation.
	string option_pretag;			// The tag to use on tokens
	int option_segment;			// Segment into sentences
	int option_hyphen;			// Hyphens are separate tokens
	int option_comma;			// Commas are always tokenized
	int option_pretok;			// The text is pre-tokenized
	int option_new;				// Use new algorithms, used in development only
	int option_doteos;			// If " . " occurs, it's an end EOS (new >= 5)

	void set_segment(int i);		///< \brief Sentences are broken up during tokenization (default 1)
	void set_token(int i);			///< \brief Break tokens apart with white space (default 1)
	void set_hyphen(int i);			///< \brief Hyphens are separate tokens (default 0)
	void set_comma(int i);			///< \brief Commas are separate tokens (default 1)
	void set_pretag(char *a);		///< \brief Use this tag on all tokens (default empty string)
	void set_pretok(int i);			///< \brief Assume string is already tokenized using spaces (default 0)
	void set_new(int i);			///< \brief Use a previous algorithm (defaults to most recent)
	void set_doteos(int i);			///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)

	void merge_words(int s, int e);		// merge words between s and e (idiom)
	void split_words(void);			// split all merged words

	string tokenize(const string&);		///< \brief Tokenize, save (in \p word), and return space delimited tokens
	string segment(const string&);		///< \brief Segment, save (in \p sent), and return newline delimited sentences

	string save_string(const string&);	// save a buffer
	string tokenize_nosave(const string&);	// tokenize without saving
	string tokenize(const string&,int);	// do tokenization with or without inserting spaces between them

	void print(int);			///< \brief Print tokens/tags with given verbosity

	vector<string> word;			///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
	vector<string> tag;			///< \brief Vector of tags of most recently tagged (or tokenized) text
	vector<string> sent;			///< \brief Vector of sentences of most recently sentence-segmented text

	// Raw working buffers.
	// NOTE(review): ownership and lifetime are not visible in this header. The
	// class declares a destructor but no copy constructor or copy assignment,
	// so copying an MPtok copies these pointers memberwise -- confirm against
	// the implementation that this cannot lead to a double free.
	char	*text;				// Input text arg
	int	text_len;			// Its length
	int	*tokflag;			// token flags
	int	*endflag;			// end-sentence flags

	string option_cnam;			// A suffix, for opening variant support files
	string option_dir;			// Directory to find things

protected:

	// Helpers whose definitions are outside this header.
	// NOTE(review): presumably these fill the tokflag / endflag arrays declared
	// above -- confirm in the implementation.
	void set_tokflag();
	void set_endflag();
	void set_endflag_01();
	int size_buff();

	void init_pair(const string& file_name);	// read a file of common pairs
	void init_abbr(const string& file_name);	// read a file of abbreviations

	// Numbered tokenization steps; their definitions are not in this header.
	// NOTE(review): presumably individual rules applied in order while
	// tokenizing -- the numbering is not explained here, confirm in the implementation.
	void tok_0();
	void tok_1();
	void tok_2();
	void tok_3();
	void tok_5_6_7();
	void tok_8_9();
	void tok_10();
	void tok_11();
	void tok_12();
	void tok_13();
	void tok_14();
	void tok_15();
	void tok_15_1();
	void tok_16();
	void tok_16_1();
	void tok_17();
	void tok_20();
	void tok_20_1();
	void tok_20_2();
	void tok_21();
	void tok_21a();
	void tok_22();
	void tok_23();
	void tok_24();
	void tok_25();
	void tok_26();
	void tok_27();
	void tok_28();
	void tok_29();
	void tok_29a();
	void tok_30();
	void tok_31();
	void tok_32();
	void tok_33();
	int complex_check();
	void map_escapes();
	void tok_un();

	// NOTE(review): parameter roles are not named in this declaration;
	// presumably these build the token string returned by tokenize() -- confirm.
	void append_token(string&, int&, char*, int);
	string token_string();

	set<string> common_pair;		// presumably filled by init_pair() -- confirm
	map<string,int> common_abbr;		// presumably filled by init_abbr(); int value presumably an ABB_* code -- confirm

private:
	int option_token;			// Output tokenized text (only use internally)
	int tok_initialized;			// is it inited?
};

#endif