steventango
/

GNorm2-docker

Model card Files Files and versions Community

GNorm2-docker / Library /MPtok.C

steventango

Upload folder using huggingface_hub

d5062c8 verified about 1 year ago

raw

history blame contribute delete

38.6 kB

	#include <stdio.h>
	#include <ctype.h>
	#include <string.h>
	#include <stdlib.h>

	#include <string>
	#include <iostream>
	#include <fstream>
	#include <sstream>

	#include "MPtok.h"

	// These options are probably compile time constants

	static char option_tagsep = '_'; // The tagsep character
	static char option_replacesep = '-'; // Replace tagsep with this

	static void chomp(char *line)
	{
	int i;

	i = strlen(line) - 1;
	while (i >= 0 && line[i] == '\n' \|\| line[i] == '\r')
	line[i--] = '\0';
	}

	// Data structure and algorithm for finding common pairs.

	// read a file of pairs into a data structure,
	// the file must be sorted first

	void MPtok::init_pair(const string& file_name)
	{
	filebuf fb;
	fb.open(file_name.c_str(), ios::in);
	istream is(&fb);
	string pair;

	while (1)
	{
	getline(is, pair);
	if (is.fail()) break;
	if (pair.size() > 0) common_pair.insert(pair);
	}

	fb.close();
	}

	// List of abbreviations in 3 categories
	// ABB = can occur mid sentence
	// EOS = can occur at end of sentence
	// NUM = only used before numbers

	void MPtok::init_abbr(const string& file_name)
	{
	filebuf fb;
	fb.open(file_name.c_str(), ios::in);
	istream is(&fb);
	string typ, abb;
	map<string,int> val;
	val["ABB"] = ABB_ABB; val["EOS"] = ABB_EOS; val["NUM"] = ABB_NUM;

	while (is.good())
	{
	is >> typ;
	if (val.count(typ))
	{
	is >> abb;
	if (abb.size() > 0) common_abbr[abb] = val[typ];
	}
	}
	fb.close();
	}

	static char nextchar(const char *t, int i)
	{
	while (isspace(t[i])) i++;
	return t[i];
	}

	// Look for a token at or prior to the text position

	static int lookbehind(const char t, int i, const char s, int *tokflag)
	{
	int k = (int) strlen(s) - 1;

	while (i > 0 && isspace(t[i])) i--;

	while (k >= 0 && i >= 0)
	{
	if (k > 0 && tokflag[i]) break;

	if (tolower(s[k]) != tolower(t[i]))
	return -1;
	k--;
	i--;
	}

	return (k < 0 && tokflag[i+1]) ? i + 1 : -1;
	}

	// Look for a token at or following the text position

	static int lookahead(const char t, int i, const char s, int *tokflag)
	{
	int k = 0;

	while (isspace(t[i])) i++;

	while (k < strlen(s) && i < strlen(t))
	{
	if (k > 0 && tokflag[i]) break;

	if (tolower(s[k]) != tolower(t[i]))
	return -1;
	k++;
	i++;
	}

	return (k == strlen(s) && tokflag[i]) ? i - (int) strlen(s) : -1;
	}

	// Set the initial tokens at spaces

	void MPtok::tok_0()
	{
	int i;

	tokflag[0] = 1;
	for (i = 1; i < text_len; i++)
	{
	tokflag[i] = isspace(text[i]) \|\| (i > 0 && isspace(text[i - 1])) ? 1 : 0;
	}
	tokflag[i] = 1;
	}

	// Get quotes preceded by open parens
	//
	// A double quote, preceded by a space or open bracket is a separate token
	//

	void MPtok::tok_1()
	{
	for (int i = 1; i < text_len; i++)
	{
	if (text[i] == '"' && strchr("([{<", text[i-1]))
	{
	tokflag[i] = 1;
	if (i + 1 < text_len) tokflag[i+1] = 1;
	}
	}
	}

	// Look for ellipses
	//
	// Three dots in a row is a separate token

	void MPtok::tok_2()
	{
	for (int i = 1; i + 2 < text_len; i++)
	{
	if (strncmp(&text[i], "...", 3) == 0)
	{
	tokflag[i] = 1;
	if (i + 3 < text_len) tokflag[i+3] = 1;
	}
	}
	}

	// Non-sentence-ending punctuation
	//
	// Certain punctuation characters are separate tokens

	void MPtok::tok_3()
	{
	for (int i = 0; i < text_len; i++)
	{
	// If it is a comma and the next char is not a space and option_comma = 0

	if (option_comma == 0 && text[i] == ',' && isspace(text[i + 1]) == 0)
	{
	// do nothing
	} else if (strchr(",;:@#$%&", text[i]))
	{
	tokflag[i] = 1;
	tokflag[i + 1] = 1;
	}
	}
	}

	// Separate the slashes
	//
	// Slashes are a separate token
	// except for +/-, +/+, -/-, -/+, and and/or.

	void MPtok::tok_5_6_7()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '/')
	{
	tokflag[i] = 1;
	if (i+1 < text_len) tokflag[i+1] = 1;

	// Put back +/-, etc, unless option_hyphen is 1

	if (i - 1 >= 0
	&& i + 1 < text_len
	&& ((option_new < 9
	&& text[i - 1] == '+' \|\| (text[i - 1] == '-' && option_hyphen == 0)
	&& text[i + 1] == '+' \|\| (text[i + 1] == '-' && option_hyphen == 0))
	\|\| (option_new >= 9
	&& (text[i - 1] == '+' \|\| text[i - 1] == '-')
	&& (text[i + 1] == '+' \|\| text[i + 1] == '-'))))
	{
	tokflag[i - 1] = 1;
	tokflag[i] = tokflag[i+1] = 0;
	tokflag[i + 2] = 1;
	}

	// Put back and/or, etc

	if (option_new <= 7)
	{
	if (i > 5 && strncmp(text + i - 5, " and/or ", 8) == 0)
	{
	for (int j = 1; j < 5; j++)
	tokflag[i - 2 + j] = 0;
	}
	} else
	{
	if (i > 4 && strncmp(text + i - 4, " and/or ", 8) == 0)
	{
	for (int j = 1; j < 6; j++)
	tokflag[i - 3 + j] = 0;
	}
	}
	}
	}
	}

	// All brackets
	//
	// Any open or closed bracket is a separate token
	//
	// Exclamation and question mark
	//
	// Any question or exclamation mark is a separate token

	void MPtok::tok_8_9()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (strchr("[](){}<>", text[i])
	\|\| strchr("?!", text[i]))
	{
	tokflag[i] = 1;
	if (i + 1 < text_len) tokflag[i+1] = 1;
	}
	}
	}

	// Period at the end of a string may be followed by closed-bracket or quote
	//
	// A period that is preceded by a non-period
	// and optionally followed by a close paren
	// and any amount of space at the end of the string
	// is a separate token.

	void MPtok::tok_10()
	{
	for (int i = text_len - 1; i >= 0; i--)
	{
	if (isspace(text[i])) continue;
	if (strchr("])}>\"'", text[i])) continue;
	if (text[i] != '.') break;
	if (text[i] == '.' && (i - 1 < 0 \|\| text[i-1] != '.'))
	{
	tokflag[i] = 1;
	if (i + 1 < text_len) tokflag[i+1] = 1;
	}
	}
	}

	// Period followed by a capitalized word
	//
	// A period preceded by a character that is not another period and not a space
	// and followed by a space then an upper case letter is a separate token

	void MPtok::tok_11()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& (i + 1 < text_len && isspace(text[i+1]))
	&& (i - 1 < 0 \|\| text[i - 1] != '.' \|\| isspace(text[i-1]) == 0)
	&& isupper(nextchar(text, i + 1)))
	tokflag[i] = 1;
	}
	}

	// A normal word followed by a period
	//
	// A period followed by a space
	// and preceded by 2 or more alphabetic characters or hyphens
	// is a separate token

	void MPtok::tok_12()
	{
	int wcnt = 0;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& tokflag[i + 1]
	&& wcnt >= 2)
	tokflag[i] = 1;

	if (isalpha(text[i]) \|\| text[i] == '-')
	++wcnt;
	else
	wcnt = 0;
	}
	}

	// A non-normal token (that has no lower case letters) followed by a period
	//
	// A period at the end of a token made of characters excluding lower case
	// is a separate token

	void MPtok::tok_13()
	{
	int stok = 0;
	int wcnt = 0;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& tokflag[i + 1]
	&& wcnt >= 2)
	tokflag[i] = 1;

	if (tokflag[i] == 1) stok = 1;

	if (islower(text[i]) \|\| text[i] == '.')
	{
	stok = 0;
	wcnt = 0;
	}

	if (stok)
	wcnt++;
	}
	}

	// put some periods with single-letter abbreviations
	//
	// A single alphabetic token followed by a period followed
	// by a token that does not begin with an upper case letter
	// or number is taken to be an abbreviation and the period
	// does not start a new token.
	//
	// NOTE: This does not recognize initials in people's names,
	// that problem is not simply solved.

	void MPtok::tok_14()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& i - 1 >= 0 && isalpha(text[i - 1]) && tokflag[i - 1]
	&& tokflag[i + 1]
	&& isupper(nextchar(text, i + 1)) == 0
	&& isdigit(nextchar(text, i + 1)) == 0
	&& nextchar(text, i + 1) != '('
	)
	{
	tokflag[i] = 0;
	}
	}
	}

	void MPtok::tok_15()
	{
	int i, j, k, a;
	char buff[MAX_ABB + 1];

	for (i = 0; i < text_len; i++)
	{
	// only start at a current token

	if (! tokflag[i]) continue;

	// find alphabetic followed by period

	buff[0] = '\0';
	for (k = 0; i + k < text_len && k < MAX_ABB; k++)
	{
	buff[k] = text[i+k]; buff[k+1] = '\0';
	if (k > 0 && buff[k] == '.') break; // this is good
	if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
	}

	if (buff[0] == '\0' \|\| i + k == text_len \|\| k == MAX_ABB) continue;

	// at this point, buff[k] == '.' add 1 to make it the length

	k++;

	// if not found, try finding a concatenated abbrev

	if (! common_abbr.count(buff))
	{
	for (; i + k < text_len && k < MAX_ABB; k++)
	{
	buff[k] = text[i+k]; buff[k+1] = '\0';
	if (k > 0 && buff[k] == '.') break; // this is good
	if (! isalpha(buff[k])) { buff[0] = '\0'; break; } // this is not good
	}

	if (buff[0] == '\0' \|\| i + k == text_len \|\| k == MAX_ABB) continue;

	// at this point, buff[k] == '.' add 1 to make it the length

	k++;
	}

	// if not found, give up

	if (! common_abbr.count(buff)) continue;

	if (common_abbr[buff] == ABB_NUM)
	{
	for (j = i + k; j < text_len && isspace(text[j]); j++) ; // next must be a number
	if (! isdigit(text[j])) continue; // go to next abbreviation
	} else if (common_abbr[buff] == ABB_EOS)
	{
	for (j = i + k; j < text_len && isspace(text[j]); j++) ; // if next token is upper case letter
	if (isupper(text[j])) tokflag[i + (--k)] = 1; // tokenize the final period of this abbreviation
	}

	// clear all token flags

	for (j = 1; j < k; j++) tokflag[i + j] = 0;
	}
	}

	// Check for common pairs that should not be considered sentence breaks

	void MPtok::tok_15_1()
	{
	int i, j, k, tnum, p;
	char buff[MAX_ABB + 1];

	for (i = 0; i < text_len; i++)
	{
	if (! tokflag[i]) continue;

	// must be alphanumeric token followed by period token followed by space followed by alphanumeric token

	tnum = 0;
	buff[0] = '\0';
	for (p = k = 0; i + k < text_len && k < MAX_ABB; k++)
	{
	buff[k] = text[i+k]; buff[k+1] = '\0';

	if (isspace(buff[k]))
	{
	if (tnum == 2) break; // this is good
	else if (tnum == 1) continue; // ok
	else { buff[0] = '\0'; break; } // this shouldn't happen
	}

	if (tokflag[i+k])
	{
	if (tnum > 2) break; // done
	else tnum++;
	}

	if (tnum == 1 && buff[k] == '.') p = k;
	if (tnum == 1 && buff[k] != '.') { buff[0] = '\0'; break; } // nope
	if (! isalnum(buff[k])) { buff[0] = '\0'; break; } // nope
	}

	if (buff[0] == '\0' \|\| i + k == text_len \|\| k == MAX_ABB) continue;

	// at this point buff is a potential pair, so untokenize the period, that's all

	if (common_pair.count(buff))
	tokflag[p] = 0;
	}
	}

	// Get cases where a space after a sentence has been omitted
	//
	// A period that occurs in a token consisting of alphabetic
	// letters with a vowel to the left and the right is a
	// separate token.

	void MPtok::tok_16()
	{
	int j;
	int has_vowel;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.' && tokflag[i] == 0)
	{
	has_vowel = 0;
	for (j = i - 1; j >= 0; --j)
	{
	if (isalpha(text[j]) == 0)
	break;
	if (strchr("aeiouAEIOU", text[j]))
	has_vowel = 1;
	if (tokflag[j])
	break;
	}
	if ((j >= 0 && tokflag[j] == 0) \|\| has_vowel == 0)
	continue;

	j = i + 1;

	has_vowel = 0;
	for (; j < text_len && tokflag[j] == 0; ++j)
	{
	if (isalpha(text[j]) == 0)
	break;
	if (strchr("aeiouAEIOU", text[j]))
	has_vowel = 1;
	}

	if ((j < text_len && tokflag[j] == 0) \|\| has_vowel == 0)
	continue;

	tokflag[i] = 1;
	tokflag[i + 1] = 1;
	}
	}
	}

	// Correction to tok_16,
	// Don't count if the token before is a single letter
	// or the token following is a single letter other than 'a'.
	// Also, don't count if the token to the right is gov, com, edu, etc.
	// because those are web addresses!

	#define COMPLEX_WINDOW 40

	enum {COMPLEX_NOT = 0, COMPLEX_YES, COMPLEX_DONE};

	struct _complex {
	int flag;
	int offset;
	const char *str;
	int len;
	} complex[] = {
	COMPLEX_YES, 0, "complex", 7,
	COMPLEX_NOT, 0, "complexi", 8,
	COMPLEX_NOT, 0, "complexed", 9,
	COMPLEX_NOT, 0, "complexa", 8,
	COMPLEX_NOT, 0, "complex-", 8,
	COMPLEX_NOT, 0, "complexl", 8,
	COMPLEX_NOT, 0, "complexu", 8,
	COMPLEX_NOT, -1, "-complex", 7,
	COMPLEX_NOT, -2, "nocomplex", 9,
	COMPLEX_NOT, -3, "subcomplex", 10,
	COMPLEX_YES, 0, "hybrid", 6,
	COMPLEX_NOT, 0, "hybridi", 7,
	COMPLEX_NOT, 0, "hybrido", 7,
	COMPLEX_NOT, 0, "hybrida", 7,
	COMPLEX_NOT, 0, "hybrid-", 7,
	COMPLEX_NOT, -1, "-hybrid", 7,
	COMPLEX_YES, 0, "duplex", 6,
	COMPLEX_NOT, -1, "oduplex", 7,
	COMPLEX_DONE, 0, NULL, 0,
	};

	int MPtok::complex_check()
	{
	int last_period = -2*COMPLEX_WINDOW;
	int last_complex = -2*COMPLEX_WINDOW;
	int i, j;
	int complex_match;

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '.')
	{
	if (i - last_complex <= COMPLEX_WINDOW)
	return 1;
	last_period = i;
	}

	complex_match = 0;
	for (j = 0; complex[j].str; j++)
	{
	if (complex[j].flag == COMPLEX_NOT)
	{
	if (i + complex[j].offset >= 0
	&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
	{
	// don't match here
	complex_match = 0;
	}
	} else if (complex[j].flag == COMPLEX_YES)
	{
	if (i + complex[j].offset >= 0
	&& strncmp(text+i+complex[j].offset, complex[j].str, complex[j].len) == 0)
	{
	// match here
	complex_match = 1;
	}
	}
	}

	if (complex_match)
	{
	if (i - last_period <= COMPLEX_WINDOW)
	return 1;
	last_complex = i;
	}
	}
	return 0;
	}

	void MPtok::tok_16_1()
	{
	int i, j;
	char v1, v2;
	int c1, c2;

	if (option_new == 3 && strstr(text, "complex"))
	return;

	if (option_new >= 4 && complex_check())
	return;

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '.' && tokflag[i] == 0)
	{
	char suffix[10];
	int s_i;

	v1 = '\0';
	c1 = 0;
	for (j = i - 1; j >= 0; --j)
	{
	if (isalpha(text[j]) == 0)
	break;
	if (strchr("aeiouAEIOU", text[j]))
	v1 = tolower(text[j]);
	c1++;
	if (tokflag[j])
	break;
	}
	if ((j >= 0 && tokflag[j] == 0)
	\|\| v1 == '\0'
	\|\| c1 == 1)
	continue;

	j = i + 1;

	v2 = '\0';
	c2 = 0;
	s_i = 0;
	for (; j < text_len && tokflag[j] == 0; ++j)
	{
	if (isalpha(text[j]) == 0)
	break;
	if (strchr("aeiouAEIOU", text[j]))
	v2 = tolower(text[j]);
	if (s_i < 3)
	suffix[s_i++] = tolower(text[j]); suffix[s_i] = '\0';
	c2++;
	}

	if ((j < text_len && tokflag[j] == 0)
	\|\| v2 == '\0'
	\|\| (c2 == 1 && v2 != 'a')
	\|\| (c2 == 3 && tokflag[j] == 1 && s_i == 3
	&& (strcmp(suffix, "gov") == 0
	\|\| strcmp(suffix, "edu") == 0
	\|\| strcmp(suffix, "org") == 0
	\|\| strcmp(suffix, "com") == 0)))
	continue;

	tokflag[i] = 1;
	tokflag[i + 1] = 1;
	}
	}
	}


	// Numeric endings of sentences
	//
	// A period after a numeric token followed by a token that starts
	// with an alphabetic character, is a separate token.
	//
	// This should be covered already by tok_13

	void MPtok::tok_17()
	{
	int j;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& tokflag[i] == 0
	&& tokflag[i + 1])
	{
	for (j = i - 1; j >= 0 && isdigit(text[j]) && tokflag[j] == 0; --j)
	;
	if (j >= 0 && j < i - 1 && tokflag[j] && isalpha(nextchar(text, i + 1)))
	tokflag[i] = 1;
	}
	}
	}

	// period at end of string is a token

	void MPtok::tok_20()
	{
	for (int i = text_len - 1; i >= 0; --i)
	{
	if (isspace(text[i]))
	continue;

	if (strchr(".!?", text[i]))
	tokflag[i] = 1;

	break;
	}
	}

	// a period that follows a non-common word, and that is
	// followed by a lower case common word is probably not a token

	void MPtok::tok_20_1()
	{
	int j;

	for (int i = 0; i < text_len; ++i)
	{
	if (text[i] == '.' && tokflag[i] == 1)
	{
	int tcnt, lcnt, ocnt;
	tcnt = lcnt = ocnt = 0;

	// make sure the previous word was not common

	for (j = i - 1; j >= 0; j--)
	{
	if (isspace(text[j])) continue;
	if (option_new >= 2)
	{
	if (islower(text[j]) == 0 && text[j] != '-') ocnt++;
	} else
	{
	if (! islower(text[j])) ocnt++;
	}

	if (tokflag[j] \|\| j == 0)
	{
	if (ocnt == 0)
	{
	goto nexti;
	}
	break;
	}
	}

	tcnt = lcnt = ocnt = 0;

	// make sure the next word is common

	for (j = i + 1; j < text_len; j++)
	{
	if (isspace(text[j])) continue;
	if (tokflag[j]) tcnt++;

	if (tcnt == 2 \|\| j == text_len - 1)
	{
	if (lcnt > 0 && ocnt == 0) tokflag[i] = 0;
	break;
	}

	if (islower(text[j])) lcnt++;
	else ocnt++;
	}
	}
	nexti: ;
	}
	}

	// tokenized period followed by non-space other than close paren
	// is not a token

	void MPtok::tok_20_2()
	{
	int j;

	for (int i = 0; i < text_len - 1; ++i)
	{
	if (text[i] == '.' && tokflag[i] == 1
	&& strchr(" ()[]\"\'\n\t\r", text[i+1]) == 0)
	{
	tokflag[i] = 0;
	}
	}
	}


	// long dash
	//
	// A pair of hyphens is a complete token

	void MPtok::tok_21()
	{
	for (int i = 0; i + 1 < text_len; i++)
	{
	if (strncmp(&text[i], "--", 2) == 0)
	{
	tokflag[i] = 1;
	if (i + 2 < text_len)
	{
	i += 2;
	tokflag[i] = 1;
	}
	}
	}
	}

	// hyphens
	//
	// If specified as an option, a hyphen between letters is a complete token

	void MPtok::tok_21a()
	{
	if (option_hyphen == 0) return;

	for (int i = 0; i + 1 < text_len; i++)
	{
	if (text[i] == '-'
	&& (i == 0 \|\| text[i-1] != '-')
	&& text[i+1] != '-')
	{
	tokflag[i] = 1;
	tokflag[i+1] = 1;
	}
	}
	}


	// quote
	//
	// Any double quote is a separate token

	void MPtok::tok_22()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '"')
	{
	tokflag[i] = 1;
	if (i + 1 < text_len)
	{
	i += 1;
	tokflag[i] = 1;
	}
	}
	}
	}

	// possessive
	//
	// Any single quote at the end of a token that is not
	// preceded by a single quote is a separate token

	void MPtok::tok_23()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '\''
	&& (i - 1 >= 0 && text[i - 1] != '\'')
	&& tokflag[i + 1])
	{
	tokflag[i] = 1;
	}
	}
	}


	// quote
	//
	// If a single quote starts a token, or is preceded by a
	// single quote, and followed by a character
	// that is not a single quote, then
	// the character to it's right is the start of a new token

	void MPtok::tok_24()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '\''
	&& (tokflag[i] == 1 \|\| (i - 1 >= 0 && text[i - 1] == '\''))
	&& (i + 1 < text_len && text[i + 1] != '\''))
	{
	tokflag[i + 1] = 1;
	}
	}
	}

	// put back possessive
	//
	// A single quote that is a whole token followed by a lower case s
	// that is also a whole token (without space between them)
	// should be merged into a single token

	void MPtok::tok_25()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '\''
	&& tokflag[i] == 1
	&& i + 1 < text_len && text[i + 1] == 's'
	&& tokflag[i+1] == 1
	&& (i + 2 >= text_len \|\| isspace(text[i + 2]) \|\| tokflag[i + 2] == 1))
	{
	tokflag[i + 1] = 0;
	}
	}
	}

	// quote
	//
	// A pair of single quotes is a separate token

	void MPtok::tok_26()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (strncmp(&text[i], "''", 2) == 0
	\|\| strncmp(&text[i], "``", 2) == 0)
	{
	tokflag[i] = 1;
	if (i + 2 < text_len) tokflag[i + 2] = 1;
	}
	}
	}

	// possessive
	//
	// A single quote followed by a letter s is a possessive

	void MPtok::tok_27()
	{
	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '\''
	&& i + 1 < text_len
	&& tolower(text[i + 1]) == 's'
	&& (i + 2 >= text_len \|\| tokflag[i + 2]))
	{
	tokflag[i] = 1;
	}
	}
	}

	// split "cannot" to "can not"
	//
	// A single token that is the word cannot (in any case)
	// is split into two words

	void MPtok::tok_28()
	{
	for (int i = 0; i < text_len; i++)
	{
	if ((strncmp(&text[i], "cannot", 6) == 0
	\|\| strncmp(&text[i], "Cannot", 6) == 0)
	&& tokflag[i + 6])
	{
	tokflag[i + 3] = 1;
	}
	}
	}

	// put list item elements back at sentence end
	//
	// A period that is preceded by an alphanumeric (no space)
	// and any amount of preceding space and an end-mark
	// stays with the alphanumeric.

	void MPtok::tok_29()
	{
	int j;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '.'
	&& tokflag[i] && tokflag[i + 1]
	&& i - 1 >= 0 && isalnum(text[i - 1])
	&& tokflag[i - 1]
	&& ((j = lookbehind(text, i-2, ".", tokflag)) >= 0
	\|\| (j = lookbehind(text, i-2, "?", tokflag)) >= 0
	\|\| (j = lookbehind(text, i-2, "!", tokflag)) >= 0)
	&& tokflag[j])
	{
	tokflag[i] = 0;
	}
	}
	}

	// attach list elements to the beginnings of their sentences
	// this means, attach the period to the list element
	//
	// a list element is a single letter or a one or two digits
	// which is preceded by an end of sentence ".!?;"
	// or colon (provided it doesn't belong to a proportion construct)

	void MPtok::tok_29a()
	{
	int i, j;

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '.' && tokflag[i])
	{
	// Look back, make sure the token before the period
	// is either single alphanumeric, or at most a two digit number
	// and the character before that is a punctuation ".?!:,"

	int tcnt, acnt, dcnt, pcnt, ocnt, scnt;
	tcnt = acnt = dcnt = pcnt = ocnt = scnt = 0;
	char p;

	for (j = i - 1; j >= 0; j--)
	{
	if (isspace(text[j])) { scnt++; continue; }
	else if (tcnt == 0 && isalpha(text[j])) ++acnt;
	else if (tcnt == 0 && isdigit(text[j])) ++dcnt;
	else if (tcnt == 1 && strchr(".!?:;,", text[j])) { pcnt++; p = text[j]; }
	else ocnt++;

	if (tokflag[j] \|\| j == 0)
	{
	tcnt++;
	if (tcnt == 1 && ocnt == 0 && scnt == 0
	&& ((acnt == 1 && dcnt == 0) \|\| (acnt == 0 && dcnt > 0 && dcnt <= 2)))
	{
	// This is acceptable
	} else if (tcnt == 2 && pcnt <= 1 && ocnt == 0 && scnt > 0)
	{
	if (p == ':')
	{
	while (--j >= 0 && isspace(text[j]))
	;
	if (j >= 0 && isdigit(text[j]))
	{
	// It's probably a proportion
	break;
	}
	}
	// Jackpot
	tokflag[i] = 0;
	} else
	{
	// This is not
	break;
	}
	scnt = 0;
	}
	}
	}
	}
	}

	// list elements at the beginning of a string
	//
	// An alphanumeric token followed by a period
	// at the beginning of the line stays with the
	// alphanumeric

	void MPtok::tok_30()
	{
	int i = 0;

	while (isspace(text[i])) i++;

	if (isalnum(text[i])
	&& tokflag[i]
	&& i + 1 < text_len
	&& text[i + 1] == '.'
	&& tokflag[i + 1])
	{
	tokflag[i + 1] = 0;
	}
	}

	// process American style numbers

	void MPtok::tok_31()
	{
	int j;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == ','
	&& i + 3 < text_len
	&& tokflag[i] && tokflag[i + 1]
	&& isdigit(text[i + 1])
	&& isdigit(text[i + 2])
	&& isdigit(text[i + 3])
	&& i - 1 >= 0 && isdigit(text[i - 1])
	)
	{
	tokflag[i] = 0;
	tokflag[i + 1] = 0;
	}
	}
	}

	// process British style numbers

	void MPtok::tok_32()
	{
	int j;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == ' '
	&& i + 3 < text_len
	&& tokflag[i] && tokflag[i + 1]
	&& isdigit(text[i + 1])
	&& isdigit(text[i + 2])
	&& isdigit(text[i + 3])
	&& i - 1 >= 0 && isdigit(text[i - 1])
	)
	{
	tokflag[i] = 0;
	tokflag[i + 1] = 0;
	}
	}
	}

	// tokenize unicode escapes
	//
	// Added

	void MPtok::tok_33()
	{
	int j;

	for (int i = 0; i < text_len; i++)
	{
	if (text[i] == '&')
	{
	if (text[i + 1] == '#')
	{
	for (j = i + 2; isdigit(text[j]); j++)
	;
	} else
	{
	for (j = i + 1; isalpha(text[j]); j++)
	;
	}

	if (text[j] == ';')
	{
	// Tokenize the escape, untokenize everything inside

	tokflag[i] = 1;
	for (i++; i <= j; i++) tokflag[i] = 0;
	tokflag[i] = 1;
	}
	}
	}
	}

	// Remove tags if they are present

	void MPtok::tok_un()
	{
	int untok = 0;
	for (int i = 0; text[i]; ++i)
	{
	if (isspace(text[i])) untok = 0;
	if (text[i] == option_tagsep) untok = 1;
	if (untok) text[i] = ' ';
	}
	}


	void MPtok::set_tokflag()
	{
	int i;

	tok_0();
	tok_1();
	tok_2();
	tok_3();

	// step 4 replaces tag char, this is done at output

	tok_5_6_7();
	tok_8_9();

	tok_10();
	tok_11();
	if (option_new >= 1)
	{
	tok_21();
	tok_21a();
	tok_22();
	tok_23();
	tok_24();
	tok_25();
	tok_26();
	tok_27();
	}
	tok_12();
	tok_13();
	tok_14();
	if (option_new <= 5)
	tok_15();
	if (option_new < 2)
	tok_16();
	tok_17();

	// steps 18 and 19 recognize periods within parens,
	// and this is moved to the segmentation section

	tok_20();
	if (option_new >= 1)
	{
	tok_20_1();
	tok_20_2();
	if (option_new >= 2)
	tok_16_1();
	if (option_new >= 6)
	tok_15();
	if (option_new >= 7)
	tok_15_1();
	}
	if (option_new < 1)
	{
	tok_21();
	tok_21a();
	tok_22();
	tok_23();
	tok_24();
	tok_25();
	tok_26();
	tok_27();
	}
	tok_28();
	if (option_new >= 1)
	tok_29a();
	else
	tok_29();
	tok_30();
	tok_31();
	tok_32();

	tok_33();
	}

	/* set_endflag
	**
	** After tokflag has been set, find the possible sentence endings.
	*/

	void MPtok::set_endflag()
	{
	int i;

	// The following tests look for end-stops and label them.
	// They include steps 18 and 19

	for (i = 0; i <= text_len; i++)
	endflag[i] = 0;

	// Count the number of unmatched parens

	int up = 0; // unmatched round parens
	int ub = 0; // unmatched brackets

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '(') ++up;
	if (text[i] == ')') --up;
	if (text[i] == '[') ++ub;
	if (text[i] == ']') --ub;
	if (up < 0) up = 0;
	if (ub < 0) ub = 0;
	}

	// Now find the end-of-sentence marks

	// tok_18: periods within parentheses, allow for nesting
	// tok_19: periods within brackets, allow for nesting
	// the perl version solves this by putting the period
	// back with the previous token, but a better solution
	// is to allow it to be tokenized but just don't
	// allow it to be an end-of-sentence.
	// Therefore, these are moved to the segmentation
	// section

	int p = 0; // round parens
	int b = 0; // brackets

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '(') ++p;
	if (text[i] == ')') --p;
	if (text[i] == '[') ++b;
	if (text[i] == ']') --b;
	if (p < 0) p = 0;
	if (b < 0) b = 0;

	if (strchr(".!?", text[i])
	&& tokflag[i]
	&& tokflag[i + 1])
	{
	if (option_segment && p <= up && b <= ub)
	endflag[i] = 1;

	// This is optional to join periods with
	// probable abbreviations

	if (p > up \|\| b > ub)
	tokflag[i] = 0;
	}
	}

	// endtokens followed by a single or double quote, which matches
	// a single or double quote in the previous sentence

	if (option_new >= 1)
	{
	int dquo, squo;
	dquo = squo = 0;

	for (i = 0; i < text_len; i++)
	{
	if (text[i] == '"') dquo = ! dquo;
	else if (text[i] == '\'') squo = ! squo;
	else if (endflag[i])
	{
	if ((text[i+1] == '"' && dquo) \|\| (text[i+1] == '\'' && squo))
	{
	endflag[i] = 0;

	// But don't end at all if the next token is something
	// other than an upper case letter.

	if (option_new >= 2)
	{
	int j;
	int ok = 0;

	for (j = i + 2; j < text_len; j++)
	{
	if (isspace(text[j])) continue;
	// if (isupper(text[j]))
	if (isupper(text[j]) \|\| text[j] == '(')
	{
	ok = 1;
	break;
	}
	if (tokflag[j]) break;
	}

	if (ok)
	endflag[i+1] = 1;
	} else
	{
	endflag[i+1] = 1;
	}
	}
	dquo = squo = 0;
	}
	}
	}
	}


	/* set_endflag_01
	**
	** After tokflag has been set, find the possible sentence endings.
	** This has improved paren matching.
	*/

	#define MAX_MATCH 500 // Maximum length to get a paren match

	void MPtok::set_endflag_01()
	{
	int match[text_len];
	int i, j;

	// The following tests look for end-stops and label them.
	// They include steps 18 and 19

	for (i = 0; i <= text_len; i++)
	endflag[i] = 0;

	for (i = 0; i < text_len; i++)
	match[i] = 0;

	for (i = text_len - 1; i >= 0; i--)
	{
	if (text[i] == '(' \|\| text[i] == '[')
	{
	for (j = i + 1; text[j] && j - i <= MAX_MATCH; j++)
	{
	// Skip parens that are already matched

	if (match[j] > j)
	{
	j = match[j];
	continue;
	}

	// Look for a matching close paren

	if (match[j] == 0
	&& ((text[i] == '(' && text[j] == ')')
	\|\| (text[i] == '[' && text[j] == ']')))
	{
	match[i] = j;
	match[j] = i;
	break;
	}
	}
	}
	}

	int next_match = 0;
	for (i = 0; i < text_len; i++)
	{
	if (match[i] > next_match)
	next_match = match[i];

	if (strchr(".!?", text[i])
	&& tokflag[i]
	&& tokflag[i + 1]
	&& (option_new <= 4 \|\| option_doteos == 1 \|\| (i > 0 && isspace(text[i-1]) == 0)))
	{
	if (i <= next_match)
	tokflag[i] = 0;
	else if (option_segment)
	endflag[i] = 1;
	}
	}

	// endtokens followed by a single or double quote, which matches
	// a single or double quote in the previous sentence

	int dquo, squo;
	dquo = squo = 0;

	for (i = 0; i < text_len; i++)
	{
	if (option_new <= 7 && text[i] == '"') dquo = ! dquo;
	else if (option_new >= 8 && text[i] == '"' && tokflag[i] && tokflag[i+1]) dquo = ! dquo;
	else if (option_new <= 7 && text[i] == '\'') squo = ! squo;
	else if (option_new >= 8 && text[i] == '\''
	&& tokflag[i] && (tokflag[i+1] \|\| (text[i+1] == '\'' && tokflag[i+2]))) squo = ! squo;
	else if (endflag[i])
	{
	if ((text[i+1] == '"' && dquo) \|\| (text[i+1] == '\'' && squo))
	{
	endflag[i] = 0;

	// But don't end at all if the next token is something
	// other than an upper case letter.

	if (option_new >= 2)
	{
	int j;
	int ok = 0;

	for (j = i + 2; j < text_len; j++)
	{
	if (isspace(text[j])) continue;
	// if (isupper(text[j]))
	if (isupper(text[j]) \|\| text[j] == '(')
	{
	ok = 1;
	break;
	}
	if (tokflag[j]) break;
	}

	if (ok)
	endflag[i+1] = 1;
	} else
	{
	endflag[i+1] = 1;
	}
	}
	dquo = squo = 0;
	}
	}
	}


	// Size buffer: return the size of the buffer required to hold all of the tokenized text.
	// It can be simply estimated by a formula that depends only on the length of text and number of tokens.

	int MPtok::size_buff()
	{
	int size = 1; // Start with null terminator
	int t = option_pretag.size(); // for each tag, the length of the UNTAG string

	if (t <= 0) t = 1; // Make sure there is at least one
	t += 2; // Add one for underscore and one for space

	for (int i = 0; i < text_len; i++)
	{
	size++; // Count all characters
	if (tokflag[i]) size += t; // Count token delimiters (may overcount)
	if (endflag[i]) size++; // Add one for newline
	}
	return size;
	}


	/* append_token
	**
	** Save a single token to a buffer.
	*/

	void MPtok::append_token(string& buff, int& sp, char *tok, int ef)
	{
	// Convert tag separator chars and back quotes (?)

	for (int i = 0; tok[i]; i++)
	{
	if (tok[i] == option_tagsep) tok[i] = option_replacesep;
	if (tok[i] == '`') tok[i] = '\'';
	}

	// Skip whitespace if tokens are being output
	// Otherwise, skip whitespace at the start of a sentence

	if (option_token \|\| ! sp) while (isspace(*tok)) ++tok;

	// Save the token

	if (strlen(tok) > 0)
	{
	// Add delimiter if needed

	if (option_token && sp) buff += ' ';

	// Append token to output

	if (option_new < 9)
	{
	while (tok && (! option_token \|\| ! isspace(tok)))
	buff += *(tok++);
	} else
	{
	while (*tok)
	buff += *(tok++);
	}

	sp = 1;

	// Add tag holders

	if (option_token && option_pretag.size() > 0)
	{
	buff += option_tagsep;
	buff += option_pretag;
	}

	// If it was end of sentence, then add newline

	if (ef)
	{
	buff += '\n';
	sp = 0;
	}
	}
	}

	// Strip whitespace after sentences

	static void adjust_space(string& buff)
	{
	while (buff.size() > 0 && isspace(buff[0])) buff.erase(0, 1);

	// delete two spaces in a row, but keep newlines

	for (int i = 1; i < buff.size(); i++)
	{
	if (isspace(buff[i]) && isspace(buff[i-1]))
	buff.erase((buff[i] == '\n')?(--i):(i--), 1);
	}

	for (int i = buff.size() - 1; i >= 0 && isspace(buff[i]); i--)
	buff.erase(i, 1);
	}

	/* token_string
	**
	** After the tokflag and endflag have been set, copy the tokens to the buffer.
	*/

	string MPtok::token_string()
	{
	string buff;

	int i;

	// Move token starts to non-whitespace chars

	int last_tok = 0;
	for (i = 0; i < text_len; i++)
	{
	if (tokflag[i] == 1 && isspace(text[i]))
	{
	tokflag[i] = 0;
	last_tok = 1;
	} else if (isspace(text[i]) == 0 && last_tok)
	{
	tokflag[i] = 1;
	last_tok = 0;
	}
	}

	// Extract the tokens and print them out now

	char *tok = new char[text_len + 1];
	int pos = 0;
	int sp = 0;
	int ef = 0;

	tok[pos] = '\0';

	for (i = 0; i <= text_len; i++)
	{
	// The start of a new token

	if (tokflag[i])
	{
	// Print the current token

	append_token(buff, sp, tok, ef);

	// Start a new token

	pos = 0;
	tok[pos] = '\0';

	ef = 0;
	}

	// Append to the current token

	tok[pos++] = text[i];
	tok[pos] = '\0';

	// If any of the characters in the token are endflagged,
	// Then pass this information along for end-of-sentence

	if (endflag[i]) ef = 1;
	}

	// Print the last token

	append_token(buff, sp, tok, ef);

	delete[] tok;

	// Adjust the end of sentence boundaries

	adjust_space(buff);

	return buff;
	}

	void MPtok::map_escapes()
	{
	char *s;
	int j, k, ch;
	char buff[10];

	k = 0;
	for (int i = 0; text[i]; i++)
	{
	if (text[i] == '&' && text[i + 1] == '#')
	{
	for (s = &buff[0], j = 2; j <= 4 && i + j < text_len && isdigit(text[i + j]); j++)
	*s++ = text[i + j];
	*s = '\0';
	ch = atoi(buff);
	if (strlen(buff) > 0 && text[i + j] == ';' && ch > 0 && ch <= 256)
	{
	text[k] = ch;
	if (! text[k]) text[k] = ' ';
	k++;
	i = i + j;
	continue;
	}
	}
	text[k++] = text[i];
	}
	text[k] = '\0';
	text_len = k;
	}

	MPtok::MPtok(string idir, const string& cnam)
	{
	tok_initialized = 0;

	if (idir.size() == 0)
	{
	char *p = getenv("MEDPOST_HOME");
	if (p && strlen(p))
	{
	idir = p;

	int found = idir.find("=");
	if (found != string::npos)
	idir = idir.substr(found + 1);
	}
	}


	if (idir.size() == 0)
	{
	char buff[1000];
	FILE *fp = fopen("path_medpost", "r");
	if (fp)
	{
	if (fgets(buff, 1000, fp))
	{
	chomp(buff);
	idir = &buff[0];
	}
	fclose(fp);
	}
	}

	if (idir.size() == 0)
	idir = "/home/natxie/CPP64/lib/FIXED_DATA/";

	option_dir = idir;

	option_token = 1;
	option_segment = 1;
	option_hyphen = 0;
	option_comma = 1;
	option_pretok = 0;
	option_new = MPTOK_VERSION;
	option_doteos = 0;

	if (cnam.size() > 0)
	{
	option_cnam = "_";
	option_cnam += cnam;
	}

	init();
	}

	void MPtok::init(void)
	{
	if (tok_initialized) return;

	string fname;

	fname = option_dir + "/medpost" + option_cnam + ".pairs";
	init_pair(fname);

	fname = option_dir + "/medpost" + option_cnam + ".abbr";
	init_abbr(fname);

	tok_initialized = 1;
	}

	MPtok::~MPtok()
	{
	}

	// Global tokenizer

	string MPtok::tokenize(const string& txt, int mt)
	{
	if (option_pretok) return save_string(txt);

	option_token = mt;
	text_len = txt.size();
	if (text_len == 0) return string("");

	text = new char[text_len + 1];
	strcpy(text, txt.c_str());

	map_escapes();

	if (text_len == 0) return NULL;

	tokflag = new int[text_len + 1];
	endflag = new int[text_len + 1];

	set_tokflag();
	if (option_new < 3)
	set_endflag();
	else
	set_endflag_01();

	string buff = token_string();
	save_string(buff);

	delete[] text; text = NULL;
	delete[] tokflag; tokflag = NULL;
	delete[] endflag; endflag = NULL;

	return buff;
	}

	string MPtok::tokenize(const string& text)
	{
	return tokenize(text, 1);
	}

	string MPtok::segment(const string& text)
	{
	sent.clear();

	// tokenize the text

	int save_option_segment = option_segment;
	option_segment = 1;
	string buff = tokenize(text, 0);
	option_segment = save_option_segment;

	if (buff.size() == 0) return text;

	int found = 0;
	int pos = 0;

	while (pos < buff.size())
	{
	found = buff.find('\n', pos);
	if (found == string::npos)
	{
	sent.push_back(buff.substr(pos));
	pos = buff.size();
	} else
	{
	sent.push_back(buff.substr(pos, found - pos));
	pos = found + 1;
	}
	}

	return buff;
	}

	string MPtok::save_string(const string& s)
	{
	stringstream ss (stringstream::in \| stringstream::out);
	string w, t;
	int found;
	string ret;

	word.clear();
	tag.clear();

	ss << s;
	while (ss.good())
	{
	ss >> w;
	if (w.size() == 0) break;

	found = w.find('_');

	if (found != string::npos)
	{
	t = w.substr(found + 1);
	w.resize(found);
	word.push_back(w);
	tag.push_back(t);
	} else
	{
	word.push_back(w);
	tag.push_back(option_pretag);

	}
	if (ret.size() > 0) ret += " ";
	ret += w;
	}

	// now look for continuation tags...

	for (int i = 0; i < word.size(); i++)
	{
	int j = tag[i].size() - 1;
	if (j >= 0 && tag[i][j] == '+' && i < tag.size() - 1)
	{
	word[i] = word[i] + " " + word[i + 1];
	tag[i] = tag[i + 1];
	word.erase(word.begin() + i + 1, word.begin() + i + 2);
	tag.erase(tag.begin() + i + 1, tag.begin() + i + 2);
	i--;
	}
	}

	return ret;
	}


	static int count_words(const char *s)
	{
	int i;

	i = 1;
	for (; *s; ++s)
	{
	if (*s == ' ') ++i;
	}
	return i;
	}

	static void print_word(const char *s, int i)
	{
	for (; i > 0 && s; ++s) { if (s == ' ') --i; }
	while (s && s != ' ') { printf("%c", *s); ++s; }
	}

	void MPtok::print(int how)
	{
	int i, j, w;

	if (how != 0 && how != 2)
	{
	printf("print(%d) not defined\n", how);
	return;
	}

	for (i = 0; i < word.size(); ++i)
	{
	// Get the words from an idiom

	for (w = 0; w < count_words(word[i].c_str()); ++w)
	{
	if (how == 2 && i + w > 0) printf(" ");

	print_word(word[i].c_str(), w);

	if (how == 0)
	{
	printf(" tagged %s", tag[i].c_str());
	if (w < count_words(word[i].c_str()) - 1) printf("+");
	printf("\n");
	} else if (how == 2)
	{
	printf("%s%s", "_", tag[i].c_str());
	if (w < count_words(word[i].c_str()) - 1) printf("+");
	}
	}
	}
	if (how == 2)
	printf("\n");
	}

	void MPtok::merge_words(int s, int n)
	{
	string tmp = word[s];

	for (int i = s + 1; i < s + n; i++)
	{
	tmp += " ";
	tmp += word[i];
	}

	// printf("merging words : '%s' n = %d\n", tmp.c_str(), n);

	for (int k = s; k + n < word.size(); k++)
	{
	word[k+1] = word[k+n];
	tag[k+1] = tag[k+n];
	}

	// Fixup the remaining array

	word.resize(word.size() - n + 1);
	tag.resize(word.size());

	word[s] = tmp;
	}

	void MPtok::split_words()
	{
	for (int i = 0; i < word.size(); i++)
	{
	int found = word[i].find(' ');

	if (found != string::npos)
	{
	string tmp1(word[i], 0, found);
	string tmp2(word[i], found + 1, string::npos);

	// Move all the words and tags down

	word.resize(word.size() + 1);
	tag.resize(tag.size() + 1);

	for (int j = word.size() - 1; j > i; j--)
	{
	word[j] = word[j - 1];
	tag[j] = tag[j - 1];
	}

	word[i] = tmp1;
	tag[i] = tag[i+1];
	tag[i] += "+";

	word[i+1] = tmp2;
	}
	}
	}

	// Callable functions to set internal options

	void MPtok::set_segment(int i) { option_segment = i; }
	void MPtok::set_hyphen(int i) { option_hyphen = i; }
	void MPtok::set_comma(int i) { option_comma = i; }
	void MPtok::set_pretag(char *a) { option_pretag = a; }
	void MPtok::set_pretok(int i) { option_pretok = i; }
	void MPtok::set_new(int i) { option_new = i; }
	void MPtok::set_doteos(int i) { option_doteos = i; }