// steventango's picture
// Upload folder using huggingface_hub
// d5062c8 verified
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include "MPtok.h"
// These options are probably compile time constants
static char option_tagsep = '_'; // The tagsep character
static char option_replacesep = '-'; // Replace tagsep with this
// Strip trailing newline and carriage-return characters from line, in place.
// line must be a writable, NUL-terminated buffer; an empty string is left
// untouched.

static void chomp(char *line)
{
	int	i;

	i = (int) strlen(line) - 1;

	// The bounds check must guard both character tests, otherwise line[-1]
	// is examined when the string is empty or consists only of line endings.
	while (i >= 0 && (line[i] == '\n' || line[i] == '\r'))
		line[i--] = '\0';
}
// Data structure and algorithm for finding common pairs.
// read a file of pairs into a data structure,
// the file must be sorted first
void MPtok::init_pair(const string& file_name)
filebuf fb;, ios::in);
istream is(&fb);
string pair;
while (1)
getline(is, pair);
if ( break;
if (pair.size() > 0) common_pair.insert(pair);
// List of abbreviations in 3 categories
// ABB = can occur mid sentence
// EOS = can occur at end of sentence
// NUM = only used before numbers
void MPtok::init_abbr(const string& file_name)
filebuf fb;, ios::in);
istream is(&fb);
string typ, abb;
map<string,int> val;
val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;
while (is.good())
is >> typ;
if (val.count(typ))
is >> abb;
if (abb.size() > 0) common_abbr[abb] = val[typ];
// Return the first non-whitespace character of t at or after index i.
// Returns '\0' when only whitespace remains before the terminator.

static char nextchar(const char *t, int i)
{
	// Cast keeps isspace() defined for bytes above 127 on signed-char platforms
	while (isspace((unsigned char) t[i])) i++;
	return t[i];
}
// Look for a token at or prior to the text position
//
// Starting at t[i], whitespace is skipped backwards and then s is compared
// (ignoring case) against the characters ending there, right to left.
// tokflag[x] is non-zero where a token starts at t[x]. The match must not
// cross a token start and must itself begin at a token start.
// Returns the index in t where the match begins, or -1 if there is none.

static int lookbehind(const char *t, int i, const char *s, int *tokflag)
{
	int	k = (int) strlen(s) - 1;

	while (i > 0 && isspace((unsigned char) t[i])) i--;

	while (k >= 0 && i >= 0)
	{
		// A token boundary inside the candidate (not at its first char) stops the match
		if (k > 0 && tokflag[i]) break;

		if (tolower((unsigned char) s[k]) != tolower((unsigned char) t[i]))
			return -1;

		k--;
		i--;
	}

	return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
}
// Look for a token at or following the text position
//
// Starting at t[i], whitespace is skipped forwards and then s is compared
// (ignoring case) against the characters that follow. tokflag[x] is non-zero
// where a token starts at t[x]. The match must not cross a token start and
// must be followed directly by a token start.
// Returns the index in t where the match begins, or -1 if there is none.

static int lookahead(const char *t, int i, const char *s, int *tokflag)
{
	// Lengths are loop invariant; computing them once also avoids the
	// signed/unsigned comparison against strlen()
	const int	slen = (int) strlen(s);
	const int	tlen = (int) strlen(t);
	int	k = 0;

	while (isspace((unsigned char) t[i])) i++;

	while (k < slen && i < tlen)
	{
		// A token boundary inside the candidate (not at its first char) stops the match
		if (k > 0 && tokflag[i]) break;

		if (tolower((unsigned char) s[k]) != tolower((unsigned char) t[i]))
			return -1;

		k++;
		i++;
	}

	return (k == slen && tokflag[i]) ? i - slen : -1;
}
// Set the initial tokens at spaces
void MPtok::tok_0()
int i;
tokflag[0] = 1;
for (i = 1; i < text_len; i++)
tokflag[i] = isspace(text[i]) || (i > 0 && isspace(text[i - 1])) ? 1 : 0;
tokflag[i] = 1;
// Get quotes preceded by open parens
// A double quote, preceded by a space or open bracket is a separate token
void MPtok::tok_1()
for (int i = 1; i < text_len; i++)
if (text[i] == '"' && strchr("([{<", text[i-1]))
tokflag[i] = 1;
if (i + 1 < text_len) tokflag[i+1] = 1;
// Look for ellipses
// Three dots in a row is a separate token
void MPtok::tok_2()
for (int i = 1; i + 2 < text_len; i++)
if (strncmp(&text[i], "...", 3) == 0)
tokflag[i] = 1;
if (i + 3 < text_len) tokflag[i+3] = 1;
// Non-sentence-ending punctuation
// Certain punctuation characters are separate tokens
void MPtok::tok_3()
for (int i = 0; i < text_len; i++)
// If it is a comma and the next char is not a space and option_comma = 0
if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
// do nothing
} else if (strchr(",;:@#$%&", text[i]))
tokflag[i] = 1;
tokflag[i + 1] = 1;
// Separate the slashes
// Slashes are a separate token
// except for +/-, +/+, -/-, -/+, and and/or.
void MPtok::tok_5_6_7()
for (int i = 0; i < text_len; i++)
if (text[i] == '/')
tokflag[i] = 1;
if (i+1 < text_len) tokflag[i+1] = 1;
// Put back +/-, etc, unless option_hyphen is 1
if (i - 1 >= 0
&& i + 1 < text_len
&& ((option_new < 9
&& text[i - 1] == '+' || (text[i - 1] == '-' && option_hyphen == 0)
&& text[i + 1] == '+' || (text[i + 1] == '-' && option_hyphen == 0))
|| (option_new >= 9
&& (text[i - 1] == '+' || text[i - 1] == '-')
&& (text[i + 1] == '+' || text[i + 1] == '-'))))
tokflag[i - 1] = 1;
tokflag[i] = tokflag[i+1] = 0;
tokflag[i + 2] = 1;
// Put back and/or, etc
if (option_new <= 7)
if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
for (int j = 1; j < 5; j++)
tokflag[i - 2 + j] = 0;
} else
if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
for (int j = 1; j < 6; j++)
tokflag[i - 3 + j] = 0;
// All brackets
// Any open or closed bracket is a separate token
// Exclamation and question mark
// Any question or exclamation mark is a separate token
void MPtok::tok_8_9()
for (int i = 0; i < text_len; i++)
if (strchr("[](){}<>", text[i])
|| strchr("?!", text[i]))
tokflag[i] = 1;
if (i + 1 < text_len) tokflag[i+1] = 1;
// Period at the end of a string may be followed by closed-bracket or quote
// A period that is preceded by a non-period
// and optionally followed by a close paren
// and any amount of space at the end of the string
// is a separate token.
//
// The text is scanned backwards from its last character. Whitespace and the
// characters ] ) } > " ' are skipped; any other character that is not a
// period ends the scan. A period whose predecessor is not a period gets a
// token start at itself and at the following position.
//
// NOTE(review): the braces of this function are not present in this listing,
// and a statement ending the scan after the flag assignments may be missing
// as well -- confirm against the upstream source.
void MPtok::tok_10()
for (int i = text_len - 1; i >= 0; i--)
if (isspace(text[i])) continue;
if (strchr("])}>\"'", text[i])) continue;
if (text[i] != '.') break;
if (text[i] == '.' && (i - 1 < 0 || text[i-1] != '.'))
tokflag[i] = 1;
if (i + 1 < text_len) tokflag[i+1] = 1;
// Period followed by a capitalized word
// A period preceded by a character that is not another period and not a space
// and followed by a space then an upper case letter is a separate token
void MPtok::tok_11()
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& (i + 1 < text_len && isspace(text[i+1]))
&& (i - 1 < 0 || text[i - 1] != '.' || isspace(text[i-1]) == 0)
&& isupper(nextchar(text, i + 1)))
tokflag[i] = 1;
// A normal word followed by a period
// A period followed by a space
// and preceded by 2 or more alphabetic characters or hyphens
// is a separate token
void MPtok::tok_12()
int wcnt = 0;
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& tokflag[i + 1]
&& wcnt >= 2)
tokflag[i] = 1;
if (isalpha(text[i]) || text[i] == '-')
wcnt = 0;
// A non-normal token (that has no lower case letters) followed by a period
// A period at the end of a token made of characters excluding lower case
// is a separate token
void MPtok::tok_13()
int stok = 0;
int wcnt = 0;
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& tokflag[i + 1]
&& wcnt >= 2)
tokflag[i] = 1;
if (tokflag[i] == 1) stok = 1;
if (islower(text[i]) || text[i] == '.')
stok = 0;
wcnt = 0;
if (stok)
// put some periods with single-letter abbreviations
// A single alphabetic token followed by a period followed
// by a token that does not begin with an upper case letter
// or number is taken to be an abbreviation and the period
// does not start a new token.
// NOTE: This does not recognize initials in people's names,
// that problem is not simply solved.
void MPtok::tok_14()
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
&& tokflag[i + 1]
&& isupper(nextchar(text, i + 1)) == 0
&& isdigit(nextchar(text, i + 1)) == 0
&& nextchar(text, i + 1) != '('
tokflag[i] = 0;
void MPtok::tok_15()
int i, j, k, a;
char buff[MAX_ABB + 1];
for (i = 0; i < text_len; i++)
// only start at a current token
if (! tokflag[i]) continue;
// find alphabetic followed by period
buff[0] = '\0';
for (k = 0; i + k < text_len && k < MAX_ABB; k++)
buff[k] = text[i+k]; buff[k+1] = '\0';
if (k > 0 && buff[k] == '.') break; // this is good
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
// at this point, buff[k] == '.' add 1 to make it the length
// if not found, try finding a concatenated abbrev
if (! common_abbr.count(buff))
for (; i + k < text_len && k < MAX_ABB; k++)
buff[k] = text[i+k]; buff[k+1] = '\0';
if (k > 0 && buff[k] == '.') break; // this is good
if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
// at this point, buff[k] == '.' add 1 to make it the length
// if not found, give up
if (! common_abbr.count(buff)) continue;
if (common_abbr[buff] == ABB_NUM)
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
if (! isdigit(text[j])) continue; // go to next abbreviation
} else if (common_abbr[buff] == ABB_EOS)
for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
// clear all token flags
for (j = 1; j < k; j++) tokflag[i + j] = 0;
// Check for common pairs that should not be considered sentence breaks
//
// At each token start, up to MAX_ABB characters are copied into buff while
// tnum counts the token starts seen and p records the offset (relative to i)
// of a period. If buff is found in common_pair, a token flag is cleared.
//
// NOTE(review): the braces of this function are not present in this listing,
// so the nesting of the statements below must be confirmed against the
// upstream source.
// NOTE(review): p is an offset into buff (relative to i), yet the final
// statement clears tokflag[p] rather than tokflag[i + p] -- confirm which
// index is intended.
void MPtok::tok_15_1()
int i, j, k, tnum, p;
char buff[MAX_ABB + 1];
for (i = 0; i < text_len; i++)
if (! tokflag[i]) continue;
// must be alphanumeric token followed by period token followed by space followed by alphanumeric token
tnum = 0;
buff[0] = '\0';
for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
buff[k] = text[i+k]; buff[k+1] = '\0';
if (isspace(buff[k]))
if (tnum == 2) break; // this is good
else if (tnum == 1) continue; // ok
else { buff[0] = '\0'; break; } // this shouldn't happen
if (tokflag[i+k])
if (tnum > 2) break; // done
else tnum++;
if (tnum == 1 && buff[k] == '.') p = k;
if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
if (buff[0] == '\0' || i + k == text_len || k == MAX_ABB) continue;
// at this point buff is a potential pair, so untokenize the period, that's all
if (common_pair.count(buff))
tokflag[p] = 0;
// Get cases where a space after a sentence has been omitted
// A period that occurs in a token consisting of alphabetic
// letters with a vowel to the left and the right is a
// separate token.
void MPtok::tok_16()
int j;
int has_vowel;
for (int i = 0; i < text_len; i++)
if (text[i] == '.' && tokflag[i] == 0)
has_vowel = 0;
for (j = i - 1; j >= 0; --j)
if (isalpha(text[j]) == 0)
if (strchr("aeiouAEIOU", text[j]))
has_vowel = 1;
if (tokflag[j])
if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
j = i + 1;
has_vowel = 0;
for (; j < text_len && tokflag[j] == 0; ++j)
if (isalpha(text[j]) == 0)
if (strchr("aeiouAEIOU", text[j]))
has_vowel = 1;
if ((j < text_len && tokflag[j] == 0) || has_vowel == 0)
tokflag[i] = 1;
tokflag[i + 1] = 1;
// Correction to tok_16,
// Don't count if the token before is a single letter
// or the token following is a single letter other than 'a'.
// Also, don't count if the token to the right is gov, com, edu, etc.
// because those are web addresses!

// Table of literal strings compared against the text with strncmp().
// Each row is: flag (COMPLEX_YES or COMPLEX_NOT), offset added to the
// current text index before comparing, the string, and the number of
// characters compared.
//
// NOTE(review): code that walks this table stops at a row whose str is
// null, but no such terminating row (nor the closing of the initializer)
// is present in this listing -- confirm against the upstream source.
// NOTE(review): "-complex" is listed with len 7 although the literal has 8
// characters, so its last character is not compared -- confirm intent.
struct _complex {
int flag;
int offset;
const char *str;
int len;
} complex[] = {
COMPLEX_YES, 0, "complex", 7,
COMPLEX_NOT, 0, "complexi", 8,
COMPLEX_NOT, 0, "complexed", 9,
COMPLEX_NOT, 0, "complexa", 8,
COMPLEX_NOT, 0, "complex-", 8,
COMPLEX_NOT, 0, "complexl", 8,
COMPLEX_NOT, 0, "complexu", 8,
COMPLEX_NOT, -1, "-complex", 7,
COMPLEX_NOT, -2, "nocomplex", 9,
COMPLEX_NOT, -3, "subcomplex", 10,
COMPLEX_YES, 0, "hybrid", 6,
COMPLEX_NOT, 0, "hybridi", 7,
COMPLEX_NOT, 0, "hybrido", 7,
COMPLEX_NOT, 0, "hybrida", 7,
COMPLEX_NOT, 0, "hybrid-", 7,
COMPLEX_NOT, -1, "-hybrid", 7,
COMPLEX_YES, 0, "duplex", 6,
COMPLEX_NOT, -1, "oduplex", 7,
int MPtok::complex_check()
int last_period = -2*COMPLEX_WINDOW;
int last_complex = -2*COMPLEX_WINDOW;
int i, j;
int complex_match;
for (i = 0; i < text_len; i++)
if (text[i] == '.')
if (i - last_complex <= COMPLEX_WINDOW)
return 1;
last_period = i;
complex_match = 0;
for (j = 0; complex[j].str; j++)
if (complex[j].flag == COMPLEX_NOT)
if (i + complex[j].offset >= 0
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
// don't match here
complex_match = 0;
} else if (complex[j].flag == COMPLEX_YES)
if (i + complex[j].offset >= 0
&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
// match here
complex_match = 1;
if (complex_match)
if (i - last_period <= COMPLEX_WINDOW)
return 1;
last_complex = i;
return 0;
// Variant of tok_16: for a period inside a token, the characters to its left
// and right are scanned, recording the last vowel seen on each side (v1, v2)
// and up to three lower-cased characters of the right side in suffix, which
// is compared with "gov", "edu", "org" and "com".
//
// NOTE(review): the braces of this function are not present in this listing.
// The statements that update c1 and c2 and the statements controlled by
// several of the if conditions below are not visible either, so the exact
// control flow must be confirmed against the upstream source.
void MPtok::tok_16_1()
int i, j;
char v1, v2;
int c1, c2;
if (option_new == 3 && strstr(text, "complex"))
if (option_new >= 4 && complex_check())
for (i = 0; i < text_len; i++)
if (text[i] == '.' && tokflag[i] == 0)
char suffix[10];
int s_i;
v1 = '\0';
c1 = 0;
for (j = i - 1; j >= 0; --j)
if (isalpha(text[j]) == 0)
if (strchr("aeiouAEIOU", text[j]))
v1 = tolower(text[j]);
if (tokflag[j])
if ((j >= 0 && tokflag[j] == 0)
|| v1 == '\0'
|| c1 == 1)
j = i + 1;
v2 = '\0';
c2 = 0;
s_i = 0;
for (; j < text_len && tokflag[j] == 0; ++j)
if (isalpha(text[j]) == 0)
if (strchr("aeiouAEIOU", text[j]))
v2 = tolower(text[j]);
if (s_i < 3)
suffix[s_i++] = tolower(text[j]); suffix[s_i] = '\0';
if ((j < text_len && tokflag[j] == 0)
|| v2 == '\0'
|| (c2 == 1 && v2 != 'a')
|| (c2 == 3 && tokflag[j] == 1 && s_i == 3
&& (strcmp(suffix, "gov") == 0
|| strcmp(suffix, "edu") == 0
|| strcmp(suffix, "org") == 0
|| strcmp(suffix, "com") == 0)))
tokflag[i] = 1;
tokflag[i + 1] = 1;
// Numeric endings of sentences
// A period after a numeric token followed by a token that starts
// with an alphabetic character, is a separate token.
// This should be covered already by tok_13
void MPtok::tok_17()
int j;
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& tokflag[i] == 0
&& tokflag[i + 1])
for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
tokflag[i] = 1;
// period at end of string is a token
void MPtok::tok_20()
for (int i = text_len - 1; i >= 0; --i)
if (isspace(text[i]))
if (strchr(".!?", text[i]))
tokflag[i] = 1;
// a period that follows a non-common word, and that is
// followed by a lower case common word is probably not a token
//
// For each tokenized period, the preceding characters are scanned backwards
// (counting in ocnt the characters that are not lower case) and the following
// characters are scanned forwards (counting lower case characters in lcnt and
// all others in ocnt). The period's token flag is cleared when the following
// word has lower case characters only.
//
// NOTE(review): the braces of this function are not present in this listing,
// and statements that end the two inner scans may be missing -- confirm the
// nesting against the upstream source.
void MPtok::tok_20_1()
int j;
for (int i = 0; i < text_len; ++i)
if (text[i] == '.' && tokflag[i] == 1)
int tcnt, lcnt, ocnt;
tcnt = lcnt = ocnt = 0;
// make sure the previous word was *not* common
for (j = i - 1; j >= 0; j--)
if (isspace(text[j])) continue;
if (option_new >= 2)
if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
} else
if (! islower(text[j])) ocnt++;
if (tokflag[j] || j == 0)
if (ocnt == 0)
goto nexti;
tcnt = lcnt = ocnt = 0;
// make sure the next word is common
for (j = i + 1; j < text_len; j++)
if (isspace(text[j])) continue;
if (tokflag[j]) tcnt++;
if (tcnt == 2 || j == text_len - 1)
if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
if (islower(text[j])) lcnt++;
else ocnt++;
nexti: ;
// tokenized period followed by non-space other than close paren
// is not a token
void MPtok::tok_20_2()
int j;
for (int i = 0; i < text_len - 1; ++i)
if (text[i] == '.' && tokflag[i] == 1
&& strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
tokflag[i] = 0;
// long dash
// A pair of hyphens is a complete token
void MPtok::tok_21()
for (int i = 0; i + 1 < text_len; i++)
if (strncmp(&text[i], "--", 2) == 0)
tokflag[i] = 1;
if (i + 2 < text_len)
i += 2;
tokflag[i] = 1;
// hyphens
// If specified as an option, a hyphen between letters is a complete token
void MPtok::tok_21a()
if (option_hyphen == 0) return;
for (int i = 0; i + 1 < text_len; i++)
if (text[i] == '-'
&& (i == 0 || text[i-1] != '-')
&& text[i+1] != '-')
tokflag[i] = 1;
tokflag[i+1] = 1;
// quote
// Any double quote is a separate token
void MPtok::tok_22()
for (int i = 0; i < text_len; i++)
if (text[i] == '"')
tokflag[i] = 1;
if (i + 1 < text_len)
i += 1;
tokflag[i] = 1;
// possessive
// Any single quote at the end of a token that is not
// preceded by a single quote is a separate token
void MPtok::tok_23()
for (int i = 0; i < text_len; i++)
if (text[i] == '\''
&& (i - 1 >= 0 && text[i - 1] != '\'')
&& tokflag[i + 1])
tokflag[i] = 1;
// quote
// If a single quote starts a token, or is preceded by a
// single quote, and followed by a character
// that is not a single quote, then
// the character to it's right is the start of a new token
void MPtok::tok_24()
for (int i = 0; i < text_len; i++)
if (text[i] == '\''
&& (tokflag[i] == 1 || (i - 1 >= 0 && text[i - 1] == '\''))
&& (i + 1 < text_len && text[i + 1] != '\''))
tokflag[i + 1] = 1;
// put back possessive
// A single quote that is a whole token followed by a lower case s
// that is also a whole token (without space between them)
// should be merged into a single token
void MPtok::tok_25()
for (int i = 0; i < text_len; i++)
if (text[i] == '\''
&& tokflag[i] == 1
&& i + 1 < text_len && text[i + 1] == 's'
&& tokflag[i+1] == 1
&& (i + 2 >= text_len || isspace(text[i + 2]) || tokflag[i + 2] == 1))
tokflag[i + 1] = 0;
// quote
// A pair of single quotes is a separate token
void MPtok::tok_26()
for (int i = 0; i < text_len; i++)
if (strncmp(&text[i], "''", 2) == 0
|| strncmp(&text[i], "``", 2) == 0)
tokflag[i] = 1;
if (i + 2 < text_len) tokflag[i + 2] = 1;
// possessive
// A single quote followed by a letter s is a possessive
void MPtok::tok_27()
for (int i = 0; i < text_len; i++)
if (text[i] == '\''
&& i + 1 < text_len
&& tolower(text[i + 1]) == 's'
&& (i + 2 >= text_len || tokflag[i + 2]))
tokflag[i] = 1;
// split "cannot" to "can not"
// A single token that is the word cannot (in any case)
// is split into two words
void MPtok::tok_28()
for (int i = 0; i < text_len; i++)
if ((strncmp(&text[i], "cannot", 6) == 0
|| strncmp(&text[i], "Cannot", 6) == 0)
&& tokflag[i + 6])
tokflag[i + 3] = 1;
// put list item elements back at sentence end
// A period that is preceded by an alphanumeric (no space)
// and any amount of preceding space and an end-mark
// stays with the alphanumeric.
void MPtok::tok_29()
int j;
for (int i = 0; i < text_len; i++)
if (text[i] == '.'
&& tokflag[i] && tokflag[i + 1]
&& i - 1 >= 0 && isalnum(text[i - 1])
&& tokflag[i - 1]
&& ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
|| (j = lookbehind(text, i-2, "?", tokflag)) >= 0
|| (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
&& tokflag[j])
tokflag[i] = 0;
// attach list elements to the beginnings of their sentences
// this means, attach the period to the list element
// a list element is a single letter or a one or two digits
// which is preceded by an end of sentence ".!?;"
// or colon (provided it doesn't belong to a proportion construct)
//
// NOTE(review): the braces of this function are not present in this listing.
// tcnt is tested below but no statement that changes it is visible, and the
// statements ending the backward scan are missing as well -- confirm the
// control flow against the upstream source.
// NOTE(review): p is only assigned when a punctuation character is counted;
// whether it can be read unassigned depends on the missing structure.
void MPtok::tok_29a()
int i, j;
for (i = 0; i < text_len; i++)
if (text[i] == '.' && tokflag[i])
// Look back, make sure the token before the period
// is either single alphanumeric, or at most a two digit number
// and the character before that is a punctuation ".?!:,"
int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
char p;
for (j = i - 1; j >= 0; j--)
if (isspace(text[j])) { scnt++; continue; }
else if (tcnt == 0 && isalpha(text[j])) ++acnt;
else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
else ocnt++;
if (tokflag[j] || j == 0)
if (tcnt == 1 && ocnt == 0 && scnt == 0
&& ((acnt == 1 && dcnt == 0) || (acnt == 0 && dcnt > 0 && dcnt <= 2)))
// This is acceptable
} else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
if (p == ':')
while (--j >= 0 && isspace(text[j]))
if (j >= 0 && isdigit(text[j]))
// It's probably a proportion
// Jackpot
tokflag[i] = 0;
} else
// This is not
scnt = 0;
// list elements at the beginning of a string
// An alphanumeric token followed by a period
// at the beginning of the line stays with the
// alphanumeric
void MPtok::tok_30()
int i = 0;
while (isspace(text[i])) i++;
if (isalnum(text[i])
&& tokflag[i]
&& i + 1 < text_len
&& text[i + 1] == '.'
&& tokflag[i + 1])
tokflag[i + 1] = 0;
// process American style numbers
void MPtok::tok_31()
int j;
for (int i = 0; i < text_len; i++)
if (text[i] == ','
&& i + 3 < text_len
&& tokflag[i] && tokflag[i + 1]
&& isdigit(text[i + 1])
&& isdigit(text[i + 2])
&& isdigit(text[i + 3])
&& i - 1 >= 0 && isdigit(text[i - 1])
tokflag[i] = 0;
tokflag[i + 1] = 0;
// process British style numbers
void MPtok::tok_32()
int j;
for (int i = 0; i < text_len; i++)
if (text[i] == ' '
&& i + 3 < text_len
&& tokflag[i] && tokflag[i + 1]
&& isdigit(text[i + 1])
&& isdigit(text[i + 2])
&& isdigit(text[i + 3])
&& i - 1 >= 0 && isdigit(text[i - 1])
tokflag[i] = 0;
tokflag[i + 1] = 0;
// tokenize unicode escapes
// Added
void MPtok::tok_33()
int j;
for (int i = 0; i < text_len; i++)
if (text[i] == '&')
if (text[i + 1] == '#')
for (j = i + 2; isdigit(text[j]); j++)
} else
for (j = i + 1; isalpha(text[j]); j++)
if (text[j] == ';')
// Tokenize the escape, untokenize everything inside
tokflag[i] = 1;
for (i++; i <= j; i++) tokflag[i] = 0;
tokflag[i] = 1;
// Remove tags if they are present
void MPtok::tok_un()
int untok = 0;
for (int i = 0; text[i]; ++i)
if (isspace(text[i])) untok = 0;
if (text[i] == option_tagsep) untok = 1;
if (untok) text[i] = ' ';
// Driver for the tokenization passes; which passes run depends on option_new.
//
// NOTE(review): the braces of this function and every statement controlled
// by the option_new tests below are not present in this listing (presumably
// the calls to the tok_* passes -- TODO confirm). The order and selection of
// passes must be taken from the upstream source.
void MPtok::set_tokflag()
int i;
// step 4 replaces tag char, this is done at output
if (option_new >= 1)
if (option_new <= 5)
if (option_new < 2)
// steps 18 and 19 recognize periods within parens,
// and this is moved to the segmentation section
if (option_new >= 1)
if (option_new >= 2)
if (option_new >= 6)
if (option_new >= 7)
if (option_new < 1)
if (option_new >= 1)
// Marks possible sentence ends in endflag[]: all entries are cleared, the
// unmatched round parens and square brackets are counted, and a tokenized
// . ! or ? followed by a token start is considered as an end mark depending
// on its paren/bracket nesting and on option_segment. When option_new >= 1
// a second pass looks at quotes that follow an end mark.
//
// NOTE(review): the terminator of the block comment below and the braces of
// this function are not present in this listing; the nesting of the
// statements must be confirmed against the upstream source.
/* set_endflag
** After tokflag has been set, find the possible sentence endings.
void MPtok::set_endflag()
int i;
// The following tests look for end-stops and label them.
// They include steps 18 and 19
for (i = 0; i <= text_len; i++)
endflag[i] = 0;
// Count the number of unmatched parens
int up = 0; // unmatched round parens
int ub = 0; // unmatched brackets
for (i = 0; i < text_len; i++)
if (text[i] == '(') ++up;
if (text[i] == ')') --up;
if (text[i] == '[') ++ub;
if (text[i] == ']') --ub;
if (up < 0) up = 0;
if (ub < 0) ub = 0;
// Now find the end-of-sentence marks
// tok_18: periods within parentheses, allow for nesting
// tok_19: periods within brackets, allow for nesting
// the perl version solves this by putting the period
// back with the previous token, but a better solution
// is to allow it to be tokenized but just don't
// allow it to be an end-of-sentence.
// Therefore, these are moved to the segmentation
// section
int p = 0; // round parens
int b = 0; // brackets
for (i = 0; i < text_len; i++)
if (text[i] == '(') ++p;
if (text[i] == ')') --p;
if (text[i] == '[') ++b;
if (text[i] == ']') --b;
if (p < 0) p = 0;
if (b < 0) b = 0;
if (strchr(".!?", text[i])
&& tokflag[i]
&& tokflag[i + 1])
if (option_segment && p <= up && b <= ub)
endflag[i] = 1;
// This is optional to join periods with
// probable abbreviations
if (p > up || b > ub)
tokflag[i] = 0;
// endtokens followed by a single or double quote, which matches
// a single or double quote in the previous sentence
if (option_new >= 1)
int dquo, squo;
dquo = squo = 0;
for (i = 0; i < text_len; i++)
if (text[i] == '"') dquo = ! dquo;
else if (text[i] == '\'') squo = ! squo;
else if (endflag[i])
if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
endflag[i] = 0;
// But don't end at all if the next token is something
// other than an upper case letter.
if (option_new >= 2)
int j;
int ok = 0;
for (j = i + 2; j < text_len; j++)
if (isspace(text[j])) continue;
// if (isupper(text[j]))
if (isupper(text[j]) || text[j] == '(')
ok = 1;
if (tokflag[j]) break;
if (ok)
endflag[i+1] = 1;
} else
endflag[i+1] = 1;
dquo = squo = 0;
// Marks possible sentence ends in endflag[] using explicit paren matching:
// match[i] holds the index of the partner of a '(' or '[' at i (searched at
// most MAX_MATCH characters ahead), and a tokenized . ! or ? that lies
// before the furthest matching close seen so far is untokenized instead of
// being marked as a sentence end. A second pass looks at quotes that follow
// an end mark.
//
// NOTE(review): the terminator of the block comment below and the braces of
// this function are not present in this listing; the nesting of the
// statements must be confirmed against the upstream source.
// NOTE(review): match is declared with a run-time length, which is a
// compiler extension rather than standard C++ -- consider a heap-allocated
// array.
/* set_endflag_01
** After tokflag has been set, find the possible sentence endings.
** This has improved paren matching.
#define MAX_MATCH 500 // Maximum length to get a paren match
void MPtok::set_endflag_01()
int match[text_len];
int i, j;
// The following tests look for end-stops and label them.
// They include steps 18 and 19
for (i = 0; i <= text_len; i++)
endflag[i] = 0;
for (i = 0; i < text_len; i++)
match[i] = 0;
for (i = text_len - 1; i >= 0; i--)
if (text[i] == '(' || text[i] == '[')
for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
// Skip parens that are already matched
if (match[j] > j)
j = match[j];
// Look for a matching close paren
if (match[j] == 0
&& ((text[i] == '(' && text[j] == ')')
|| (text[i] == '[' && text[j] == ']')))
match[i] = j;
match[j] = i;
int next_match = 0;
for (i = 0; i < text_len; i++)
if (match[i] > next_match)
next_match = match[i];
if (strchr(".!?", text[i])
&& tokflag[i]
&& tokflag[i + 1]
&& (option_new <= 4 || option_doteos == 1 || (i > 0 && isspace(text[i-1]) == 0)))
if (i <= next_match)
tokflag[i] = 0;
else if (option_segment)
endflag[i] = 1;
// endtokens followed by a single or double quote, which matches
// a single or double quote in the previous sentence
int dquo, squo;
dquo = squo = 0;
for (i = 0; i < text_len; i++)
if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
else if (option_new >= 8 && text[i] == '\''
&& tokflag[i] && (tokflag[i+1] || (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
else if (endflag[i])
if ((text[i+1] == '"' && dquo) || (text[i+1] == '\'' && squo))
endflag[i] = 0;
// But don't end at all if the next token is something
// other than an upper case letter.
if (option_new >= 2)
int j;
int ok = 0;
for (j = i + 2; j < text_len; j++)
if (isspace(text[j])) continue;
// if (isupper(text[j]))
if (isupper(text[j]) || text[j] == '(')
ok = 1;
if (tokflag[j]) break;
if (ok)
endflag[i+1] = 1;
} else
endflag[i+1] = 1;
dquo = squo = 0;
// Size buffer: return the size of the buffer required to hold all of the tokenized text.
// It can be simply estimated by a formula that depends only on the length of text and number of tokens.
int MPtok::size_buff()
int size = 1; // Start with null terminator
int t = option_pretag.size(); // for each tag, the length of the UNTAG string
if (t <= 0) t = 1; // Make sure there is at least one
t += 2; // Add one for underscore and one for space
for (int i = 0; i < text_len; i++)
size++; // Count all characters
if (tokflag[i]) size += t; // Count token delimiters (may overcount)
if (endflag[i]) size++; // Add one for newline
return size;
/* append_token
** Save a single token to a buffer.
void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
// Convert tag separator chars and back quotes (?)
for (int i = 0; tok[i]; i++)
if (tok[i] == option_tagsep) tok[i] = option_replacesep;
if (tok[i] == '`') tok[i] = '\'';
// Skip whitespace if tokens are being output
// Otherwise, skip whitespace at the start of a sentence
if (option_token || ! sp) while (isspace(*tok)) ++tok;
// Save the token
if (strlen(tok) > 0)
// Add delimiter if needed
if (option_token && sp) buff += ' ';
// Append token to output
if (option_new < 9)
while (*tok && (! option_token || ! isspace(*tok)))
buff += *(tok++);
} else
while (*tok)
buff += *(tok++);
sp = 1;
// Add tag holders
if (option_token && option_pretag.size() > 0)
buff += option_tagsep;
buff += option_pretag;
// If it was end of sentence, then add newline
if (ef)
buff += '\n';
sp = 0;
// Strip whitespace after sentences
//
// Leading and trailing whitespace is removed, and every run of whitespace
// inside the text is reduced to a single character: a newline if the run
// contains one, otherwise the first character of the run.
// Done in one linear pass instead of repeated erase() calls.

static void adjust_space(std::string& buff)
{
	const std::string::size_type	n = buff.size();
	std::string	out;
	out.reserve(n);

	std::string::size_type	i = 0;
	while (i < n)
	{
		if (! isspace((unsigned char) buff[i]))
		{
			out += buff[i++];
			continue;
		}

		// Measure the whitespace run [i, j) and pick the character to keep

		char	keep = buff[i];
		std::string::size_type	j = i;
		for (; j < n && isspace((unsigned char) buff[j]); j++)
		{
			if (buff[j] == '\n') keep = '\n';
		}

		// Runs touching either end of the text are dropped entirely

		if (i > 0 && j < n) out += keep;
		i = j;
	}

	buff.swap(out);
}
/* token_string
** After the tokflag and endflag have been set, copy the tokens to the buffer.
string MPtok::token_string()
string buff;
int i;
// Move token starts to non-whitespace chars
int last_tok = 0;
for (i = 0; i < text_len; i++)
if (tokflag[i] == 1 && isspace(text[i]))
tokflag[i] = 0;
last_tok = 1;
} else if (isspace(text[i]) == 0 && last_tok)
tokflag[i] = 1;
last_tok = 0;
// Extract the tokens and print them out now
char *tok = new char[text_len + 1];
int pos = 0;
int sp = 0;
int ef = 0;
tok[pos] = '\0';
for (i = 0; i <= text_len; i++)
// The start of a new token
if (tokflag[i])
// Print the current token
append_token(buff, sp, tok, ef);
// Start a new token
pos = 0;
tok[pos] = '\0';
ef = 0;
// Append to the current token
tok[pos++] = text[i];
tok[pos] = '\0';
// If any of the characters in the token are endflagged,
// Then pass this information along for end-of-sentence
if (endflag[i]) ef = 1;
// Print the last token
append_token(buff, sp, tok, ef);
delete[] tok;
// Adjust the end of sentence boundaries
return buff;
void MPtok::map_escapes()
char *s;
int j, k, ch;
char buff[10];
k = 0;
for (int i = 0; text[i]; i++)
if (text[i] == '&' && text[i + 1] == '#')
for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
*s++ = text[i + j];
*s = '\0';
ch = atoi(buff);
if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
text[k] = ch;
if (! text[k]) text[k] = ' ';
i = i + j;
text[k++] = text[i];
text[k] = '\0';
text_len = k;
// Construct a tokenizer with default options.
//
// idir	data directory; when empty it is taken from the MEDPOST_HOME
//	environment variable (the part after an '=' if one is present), then
//	from the first line of the file "path_medpost" in the current
//	directory, and finally from a built-in default path
// cnam	when non-empty, "_" followed by cnam is stored in option_cnam
//
// NOTE(review): the braces of this function are not present in this listing.
// No statement closing fp or stripping the line ending from buff is visible,
// and it cannot be told from here whether the data files are loaded at
// construction -- confirm against the upstream source.
MPtok::MPtok(string idir, const string& cnam)
tok_initialized = 0;
if (idir.size() == 0)
char *p = getenv("MEDPOST_HOME");
if (p && strlen(p))
idir = p;
int found = idir.find("=");
if (found != string::npos)
idir = idir.substr(found + 1);
if (idir.size() == 0)
char buff[1000];
FILE *fp = fopen("path_medpost", "r");
if (fp)
if (fgets(buff, 1000, fp))
idir = &buff[0];
if (idir.size() == 0)
idir = "/home/natxie/CPP64/lib/FIXED_DATA/";
option_dir = idir;
option_token = 1;
option_segment = 1;
option_hyphen = 0;
option_comma = 1;
option_pretok = 0;
option_new = MPTOK_VERSION;
option_doteos = 0;
if (cnam.size() > 0)
option_cnam = "_";
option_cnam += cnam;
void MPtok::init(void)
if (tok_initialized) return;
string fname;
fname = option_dir + "/medpost" + option_cnam + ".pairs";
fname = option_dir + "/medpost" + option_cnam + ".abbr";
tok_initialized = 1;
// Global tokenizer
//
// txt	the text to process
// mt	stored in option_token for the duration of the call
//
// When option_pretok is set the text is handed to save_string() instead.
// An empty txt yields an empty string. Otherwise txt is copied into text,
// the tokflag and endflag arrays are allocated, the result is built by
// token_string(), and the three buffers are released.
//
// NOTE(review): the braces of this function and the statements that fill
// tokflag and endflag (including whatever the option_new < 3 test selects)
// are not present in this listing -- confirm against the upstream source.
// NOTE(review): the second text_len == 0 test returns NULL from a function
// returning string and does not release text; if it can be reached it
// should return an empty string after freeing the buffer.
string MPtok::tokenize(const string& txt, int mt)
if (option_pretok) return save_string(txt);
option_token = mt;
text_len = txt.size();
if (text_len == 0) return string("");
text = new char[text_len + 1];
strcpy(text, txt.c_str());
if (text_len == 0) return NULL;
tokflag = new int[text_len + 1];
endflag = new int[text_len + 1];
if (option_new < 3)
string buff = token_string();
delete[] text; text = NULL;
delete[] tokflag; tokflag = NULL;
delete[] endflag; endflag = NULL;
return buff;
// Tokenize text with token output enabled (same as tokenize(text, 1)).

string MPtok::tokenize(const string& text)
{
	return tokenize(text, 1);
}
// Split text into sentences.
//
// The text is processed by tokenize(text, 0) with option_segment forced on
// (the previous setting is put back afterwards). Each newline-separated
// piece of the result is appended to sent, including the piece after the
// last newline. Returns the processed text, or text itself when the result
// is empty.
//
// NOTE(review): sent is only appended to here; whether it should be emptied
// first cannot be told from this listing.

string MPtok::segment(const string& text)
{
	// tokenize the text

	int save_option_segment = option_segment;
	option_segment = 1;
	string buff = tokenize(text, 0);
	option_segment = save_option_segment;

	if (buff.size() == 0) return text;

	string::size_type pos = 0;

	while (pos < buff.size())
	{
		string::size_type found = buff.find('\n', pos);
		if (found == string::npos)
		{
			// Final piece without a terminating newline
			sent.push_back(buff.substr(pos));
			pos = buff.size();
		} else
		{
			sent.push_back(buff.substr(pos, found - pos));
			pos = found + 1;
		}
	}

	return buff;
}
// Read whitespace-separated words from s; for a word containing '_' the part
// after it is taken as t. The words are joined with single spaces into the
// returned string. Afterwards, an entry of word whose tag ends in '+' is
// merged with the entry that follows it.
//
// NOTE(review): the braces of this function are not present in this listing,
// and no statement that fills word and tag from w and t is visible --
// confirm against the upstream source.
string MPtok::save_string(const string& s)
stringstream ss (stringstream::in | stringstream::out);
string w, t;
int found;
string ret;
ss << s;
while (ss.good())
ss >> w;
if (w.size() == 0) break;
found = w.find('_');
if (found != string::npos)
t = w.substr(found + 1);
} else
if (ret.size() > 0) ret += " ";
ret += w;
// now look for continuation tags...
for (int i = 0; i < word.size(); i++)
int j = tag[i].size() - 1;
if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
word[i] = word[i] + " " + word[i + 1];
tag[i] = tag[i + 1];
word.erase(word.begin() + i + 1, word.begin() + i + 2);
tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
return ret;
// Return the number of space-separated words in s, counted as one more than
// the number of ' ' characters (so an empty string counts as one word).

static int count_words(const char *s)
{
	int	i;

	i = 1;
	for (; *s; ++s)
	{
		if (*s == ' ') ++i;
	}
	return i;
}
// Print the i-th (0-based) space-separated word of s to standard output.
// Nothing is printed when s has fewer than i + 1 words.

static void print_word(const char *s, int i)
{
	// Skip past i spaces to reach the start of the requested word
	for (; i > 0 && *s; ++s) { if (*s == ' ') --i; }

	while (*s && *s != ' ') { printf("%c", *s); ++s; }
}
// Print the stored words and tags to standard output.
//
// how	0 prints each word followed by " tagged " and its tag;
//	2 prints word, "_" and tag, separated by spaces;
//	any other value prints a "not defined" message
// Each word of a multi-word entry is printed separately, and every word
// except the last one of the entry is followed by "+".
//
// NOTE(review): the braces of this function are not present in this listing,
// and the statements following the first and the last if (how ...) tests are
// missing -- confirm against the upstream source.
void MPtok::print(int how)
int i, j, w;
if (how != 0 && how != 2)
printf("print(%d) not defined\n", how);
for (i = 0; i < word.size(); ++i)
// Get the words from an idiom
for (w = 0; w < count_words(word[i].c_str()); ++w)
if (how == 2 && i + w > 0) printf(" ");
print_word(word[i].c_str(), w);
if (how == 0)
printf(" tagged %s", tag[i].c_str());
if (w < count_words(word[i].c_str()) - 1) printf("+");
} else if (how == 2)
printf("%s%s", "_", tag[i].c_str());
if (w < count_words(word[i].c_str()) - 1) printf("+");
if (how == 2)
void MPtok::merge_words(int s, int n)
string tmp = word[s];
for (int i = s + 1; i < s + n; i++)
tmp += " ";
tmp += word[i];
// printf("merging words : '%s' n = %d\n", tmp.c_str(), n);
for (int k = s; k + n < word.size(); k++)
word[k+1] = word[k+n];
tag[k+1] = tag[k+n];
// Fixup the remaining array
word.resize(word.size() - n + 1);
word[s] = tmp;
void MPtok::split_words()
for (int i = 0; i < word.size(); i++)
int found = word[i].find(' ');
if (found != string::npos)
string tmp1(word[i], 0, found);
string tmp2(word[i], found + 1, string::npos);
// Move all the words and tags down
word.resize(word.size() + 1);
tag.resize(tag.size() + 1);
for (int j = word.size() - 1; j > i; j--)
word[j] = word[j - 1];
tag[j] = tag[j - 1];
word[i] = tmp1;
tag[i] = tag[i+1];
tag[i] += "+";
word[i+1] = tmp2;
// Callable functions to set internal options

// Store i in option_segment (consulted when sentence ends are marked).
void MPtok::set_segment(int i)
{
	option_segment = i;
}

// Store i in option_hyphen (consulted by the hyphen and slash passes).
void MPtok::set_hyphen(int i)
{
	option_hyphen = i;
}

// Store i in option_comma (consulted by the punctuation pass).
void MPtok::set_comma(int i)
{
	option_comma = i;
}

// Store a in option_pretag (the tag text appended to each output token).
void MPtok::set_pretag(char *a)
{
	option_pretag = a;
}

// Store i in option_pretok (non-zero makes tokenize() use save_string()).
void MPtok::set_pretok(int i)
{
	option_pretok = i;
}

// Store i in option_new (the behaviour version tested throughout the passes).
void MPtok::set_new(int i)
{
	option_new = i;
}

// Store i in option_doteos (consulted by set_endflag_01()).
void MPtok::set_doteos(int i)
{
	option_doteos = i;
}