File size: 4,394 Bytes
d5062c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#ifndef _MPTOK_H
#define _MPTOK_H
#include <stdio.h>
#include <string>
#include <vector>
#include <map>
#include <set>
using namespace std;
// The latest version
// NOTE(review): presumably the algorithm version that option_new / set_new() default to -- confirm in the implementation
#define MPTOK_VERSION 11 // The latest version
// Maximum number of words in a sentence
#define MAX_WORDS 10000
// Abbreviation category codes (ABB_ABB = 0, ABB_EOS = 1, ABB_NUM = 2).
// NOTE(review): presumably the int values stored in MPtok::common_abbr -- confirm in the implementation
enum { ABB_ABB, ABB_EOS, ABB_NUM };
// NOTE(review): not referenced anywhere in this header; looks like an abbreviation-related size limit -- confirm in the implementation
#define MAX_ABB 100
/*! \brief A class to perform tokenization.
*
* The MPtok class can be used to perform tokenization and segmentation
* of strings into tokens or sentences. It is inherited and used by MPtag,
* so if the user is only interested in tagging, this class does not
* need to be referenced directly.
*/
class MPtok
{
public:
/// \brief A MPtok object, giving the install directory \p idir where data files can be found
/// \param idir Install directory for data files (presumably stored in option_dir -- TODO confirm in the implementation)
/// \param cnam Presumably the support-file suffix stored in option_cnam -- TODO confirm in the implementation
MPtok(string idir = "", const string& cnam = "");
~MPtok();
void init(); // Initialize (call only once)
void init(const string& idir) { option_dir = idir; init(); } // Initialize using specified install directory: stores idir in option_dir, then calls init()

// ---- Tokenizer options (public data members; the set_*() methods below are the documented setters) ----
string option_pretag; // The tag to use on tokens
int option_segment; // Segment into sentences
int option_hyphen; // Hyphens are separate tokens
int option_comma; // Commas are always tokenized
int option_pretok; // The text is pre-tokenized
int option_new; // Use new algorithms, used in development only
int option_doteos; // If " . " occurs, it's an EOS (end of sentence) (new >= 5)

// ---- Option setters ----
void set_segment(int i); ///< \brief Sentences are broken up during tokenization (default 1)
void set_token(int i); ///< \brief Break tokens apart with white space (default 1)
void set_hyphen(int i); ///< \brief Hyphens are separate tokens (default 0)
void set_comma(int i); ///< \brief Commas are separate tokens (default 1)
// NOTE(review): the parameter is a non-const char*, so a string literal cannot be passed
// without a cast in standard C++11 and later; switching to const char* would require the
// same change in the out-of-line definition.
void set_pretag(char *a); ///< \brief Use this tag on all tokens (default empty string)
void set_pretok(int i); ///< \brief Assume string is already tokenized using spaces (default 0)
void set_new(int i); ///< \brief Use a previous algorithm (defaults to most recent)
void set_doteos(int i); ///< \brief Ignore abbreviations, and always assume a period ends a sentence (default 0)

// ---- Word merging ----
void merge_words(int s, int e); // merge words between s and e (idiom)
void split_words(void); // split all merged words

// ---- Tokenization / segmentation entry points ----
string tokenize(const string&); ///< \brief Tokenize, save (in \p word), and return space delimited tokens
string segment(const string&); ///< \brief Segment, save (in \p sent), and return newline delimited sentences
string save_string(const string&); // save a buffer
string tokenize_nosave(const string&); // tokenize without saving
string tokenize(const string&,int); // do tokenization with or without inserting spaces between them
void print(int); ///< \brief Print tokens/tags with given verbosity

// ---- Results of the most recent call ----
vector<string> word; ///< \brief Vector of words (tokens) of most recently tagged (or tokenized) text
vector<string> tag; ///< \brief Vector of tags of most recently tagged (or tokenized) text
vector<string> sent; ///< \brief Vector of sentences of most recently sentence-segmented text

// ---- Working buffers ----
// NOTE(review): these are raw pointers and their ownership is not visible in this header.
// The class declares a destructor but no copy constructor or copy assignment, so if the
// destructor frees these buffers, copying an MPtok would lead to a double free -- confirm
// against the implementation before copying instances.
char *text; // Input text arg
int text_len; // Its length
int *tokflag; // token flags
int *endflag; // end-sentence flags
string option_cnam; // A suffix, for opening variant support files
string option_dir; // Directory to find things
protected:
// ---- Flag and buffer helpers ----
void set_tokflag();
void set_endflag();
void set_endflag_01();
int size_buff();
void init_pair(const string& file_name); // read a file of common pairs
void init_abbr(const string& file_name); // read a file of abbreviations

// ---- Numbered tokenization steps ----
// NOTE(review): the tok_* methods are defined out of line and their behavior is not
// visible here; presumably each is one tokenization rule/pass identified by its
// number -- see the implementation for what each one does and the order they run in.
void tok_0();
void tok_1();
void tok_2();
void tok_3();
void tok_5_6_7();
void tok_8_9();
void tok_10();
void tok_11();
void tok_12();
void tok_13();
void tok_14();
void tok_15();
void tok_15_1();
void tok_16();
void tok_16_1();
void tok_17();
void tok_20();
void tok_20_1();
void tok_20_2();
void tok_21();
void tok_21a();
void tok_22();
void tok_23();
void tok_24();
void tok_25();
void tok_26();
void tok_27();
void tok_28();
void tok_29();
void tok_29a();
void tok_30();
void tok_31();
void tok_32();
void tok_33();
int complex_check();
void map_escapes();
void tok_un();
void append_token(string&, int&, char*, int);
string token_string();

// ---- Lookup tables ----
set<string> common_pair; // presumably filled by init_pair() -- TODO confirm in the implementation
map<string,int> common_abbr; // presumably filled by init_abbr(); values look like the ABB_* codes -- TODO confirm
private:
int option_token; // Output tokenized text (only use internally)
int tok_initialized; // is it inited?
};
#endif
|