Spaces:
Sleeping
Sleeping
// DARTS_THROW() throws a <Darts::Exception> whose message starts with the | |
// file name and the line number. For example, DARTS_THROW("error message") at | |
// line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to | |
// "darts.h:123: exception: error message". The message is available by using | |
// what() as well as that of <std::exception>. | |
namespace Darts { | |
// The following namespace hides the internal types and classes. | |
namespace Details { | |
// This header assumes that <int> and <unsigned int> are 32-bit integer types. | |
// | |
// Darts-clone keeps values associated with keys. The type of the values is | |
// <value_type>. Note that the values must be positive integers because the | |
// most significant bit (MSB) of each value is used to represent whether the | |
// corresponding unit is a leaf or not. Also, the keys are represented by | |
// sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>. | |
typedef char char_type; | |
typedef unsigned char uchar_type; | |
typedef int value_type; | |
// The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the | |
// unit type is actually a wrapper of <id_type>. | |
typedef unsigned int id_type; | |
// <progress_func_type> is the type of callback functions for reporting the | |
// progress of building a dictionary. See also build() of <DoubleArray>. | |
// The 1st argument receives the progress value and the 2nd argument receives | |
// the maximum progress value. A usage example is to show the progress | |
// percentage, 100.0 * (the 1st argument) / (the 2nd argument). | |
typedef int (*progress_func_type)(std::size_t, std::size_t); | |
// <DoubleArrayUnit> is the type of double-array units and it is a wrapper of | |
// <id_type> in practice. | |
class DoubleArrayUnit { | |
public: | |
DoubleArrayUnit() : unit_() {} | |
// has_leaf() returns whether a leaf unit is immediately derived from the | |
// unit (true) or not (false). | |
bool has_leaf() const { | |
return ((unit_ >> 8) & 1) == 1; | |
} | |
// value() returns the value stored in the unit, and thus value() is | |
// available when and only when the unit is a leaf unit. | |
value_type value() const { | |
return static_cast<value_type>(unit_ & ((1U << 31) - 1)); | |
} | |
// label() returns the label associted with the unit. Note that a leaf unit | |
// always returns an invalid label. For this feature, leaf unit's label() | |
// returns an <id_type> that has the MSB of 1. | |
id_type label() const { | |
return unit_ & ((1U << 31) | 0xFF); | |
} | |
// offset() returns the offset from the unit to its derived units. | |
id_type offset() const { | |
return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); | |
} | |
private: | |
id_type unit_; | |
// Copyable. | |
}; | |
// Darts-clone throws an <Exception> for memory allocation failure, invalid | |
// arguments or a too large offset. The last case means that there are too many | |
// keys in the given set of keys. Note that the `msg' of <Exception> must be a | |
// constant or static string because an <Exception> keeps only a pointer to | |
// that string. | |
class Exception : public std::exception { | |
public: | |
explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} | |
Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} | |
virtual ~Exception() throw() {} | |
// <Exception> overrides what() of <std::exception>. | |
virtual const char *what() const throw() { | |
return (msg_ != NULL) ? msg_ : ""; | |
} | |
private: | |
const char *msg_; | |
// Disallows operator=. | |
Exception &operator=(const Exception &); | |
}; | |
} // namespace Details | |
// <DoubleArrayImpl> is the interface of Darts-clone. Note that other | |
// classes should not be accessed from outside. | |
// | |
// <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as | |
// the type of values. Note that the given <T> is used only from outside, and | |
// the internal value type is not changed from <Darts::Details::value_type>. | |
// In build(), given values are casted from <T> to <Darts::Details::value_type> | |
// by using static_cast. On the other hand, values are casted from | |
// <Darts::Details::value_type> to <T> in searching dictionaries. | |
template <typename, typename, typename T, typename> | |
class DoubleArrayImpl { | |
public: | |
// Even if this <value_type> is changed, the internal value type is still | |
// <Darts::Details::value_type>. Other types, such as 64-bit integer types | |
// and floating-point number types, should not be used. | |
typedef T value_type; | |
// A key is reprenseted by a sequence of <key_type>s. For example, | |
// exactMatchSearch() takes a <const key_type *>. | |
typedef Details::char_type key_type; | |
// In searching dictionaries, the values associated with the matched keys are | |
// stored into or returned as <result_type>s. | |
typedef value_type result_type; | |
// <result_pair_type> enables applications to get the lengths of the matched | |
// keys in addition to the values. | |
struct result_pair_type { | |
value_type value; | |
std::size_t length; | |
}; | |
// The constructor initializes member variables with 0 and NULLs. | |
DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} | |
// The destructor frees memory allocated for units and then initializes | |
// member variables with 0 and NULLs. | |
virtual ~DoubleArrayImpl() { | |
clear(); | |
} | |
// <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to | |
// set a value to a <value_type>. The 2nd set_result() is to set a value and | |
// a length to a <result_pair_type>. By using set_result()s, search methods | |
// can return the 2 kinds of results in the same way. | |
// Why the set_result()s are non-static? It is for compatibility. | |
// | |
// The 1st set_result() takes a length as the 3rd argument but it is not | |
// used. If a compiler does a good job, codes for getting the length may be | |
// removed. | |
void set_result(value_type *result, value_type value, std::size_t) const { | |
*result = value; | |
} | |
// The 2nd set_result() uses both `value' and `length'. | |
void set_result(result_pair_type *result, | |
value_type value, std::size_t length) const { | |
result->value = value; | |
result->length = length; | |
} | |
// set_array() calls clear() in order to free memory allocated to the old | |
// array and then sets a new array. This function is useful to set a memory- | |
// mapped array. Note that the array set by set_array() is not freed in | |
// clear() and the destructor of <DoubleArrayImpl>. | |
// set_array() can also set the size of the new array but the size is not | |
// used in search methods. So it works well even if the 2nd argument is 0 or | |
// omitted. Remember that size() and total_size() returns 0 in such a case. | |
void set_array(const void *ptr, std::size_t size = 0) { | |
clear(); | |
array_ = static_cast<const unit_type *>(ptr); | |
size_ = size; | |
} | |
// array() returns a pointer to the array of units. | |
const void *array() const { | |
return array_; | |
} | |
// clear() frees memory allocated to units and then initializes member | |
// variables with 0 and NULLs. Note that clear() does not free memory if the | |
// array of units was set by set_array(). In such a case, `array_' is not | |
// NULL and `buf_' is NULL. | |
void clear() { | |
size_ = 0; | |
array_ = NULL; | |
if (buf_ != NULL) { | |
delete[] buf_; | |
buf_ = NULL; | |
} | |
} | |
// unit_size() returns the size of each unit. The size must be 4 bytes. | |
std::size_t unit_size() const { | |
return sizeof(unit_type); | |
} | |
// size() returns the number of units. It can be 0 if set_array() is used. | |
std::size_t size() const { | |
return size_; | |
} | |
// total_size() returns the number of bytes allocated to the array of units. | |
// It can be 0 if set_array() is used. | |
std::size_t total_size() const { | |
return unit_size() * size(); | |
} | |
// nonzero_size() exists for compatibility. It always returns the number of | |
// units because it takes long time to count the number of non-zero units. | |
std::size_t nonzero_size() const { | |
return size(); | |
} | |
// build() constructs a dictionary from given key-value pairs. If `lengths' | |
// is NULL, `keys' is handled as an array of zero-terminated strings. If | |
// `values' is NULL, the index in `keys' is associated with each key, i.e. | |
// the ith key has (i - 1) as its value. | |
// Note that the key-value pairs must be arranged in key order and the values | |
// must not be negative. Also, if there are duplicate keys, only the first | |
// pair will be stored in the resultant dictionary. | |
// `progress_func' is a pointer to a callback function. If it is not NULL, | |
// it will be called in build() so that the caller can check the progress of | |
// dictionary construction. For details, please see the definition of | |
// <Darts::Details::progress_func_type>. | |
// The return value of build() is 0, and it indicates the success of the | |
// operation. Otherwise, build() throws a <Darts::Exception>, which is a | |
// derived class of <std::exception>. | |
// build() uses another construction algorithm if `values' is not NULL. In | |
// this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead | |
// of a trie because a DAWG is likely to be more compact than a trie. | |
int build(std::size_t num_keys, const key_type * const *keys, | |
const std::size_t *lengths = NULL, const value_type *values = NULL, | |
Details::progress_func_type progress_func = NULL); | |
// open() reads an array of units from the specified file. And if it goes | |
// well, the old array will be freed and replaced with the new array read | |
// from the file. `offset' specifies the number of bytes to be skipped before | |
// reading an array. `size' specifies the number of bytes to be read from the | |
// file. If the `size' is 0, the whole file will be read. | |
// open() returns 0 iff the operation succeeds. Otherwise, it returns a | |
// non-zero value or throws a <Darts::Exception>. The exception is thrown | |
// when and only when a memory allocation fails. | |
int open(const char *file_name, const char *mode = "rb", | |
std::size_t offset = 0, std::size_t size = 0); | |
// save() writes the array of units into the specified file. `offset' | |
// specifies the number of bytes to be skipped before writing the array. | |
// open() returns 0 iff the operation succeeds. Otherwise, it returns a | |
// non-zero value. | |
int save(const char *file_name, const char *mode = "wb", | |
std::size_t offset = 0) const; | |
// The 1st exactMatchSearch() tests whether the given key exists or not, and | |
// if it exists, its value and length are set to `result'. Otherwise, the | |
// value and the length of `result' are set to -1 and 0 respectively. | |
// Note that if `length' is 0, `key' is handled as a zero-terminated string. | |
// `node_pos' specifies the start position of matching. This argument enables | |
// the combination of exactMatchSearch() and traverse(). For example, if you | |
// want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get | |
// the node position corresponding to "xyz" and then you can use | |
// exactMatchSearch() to test "A", "BC", and "DE" from that position. | |
// Note that the length of `result' indicates the length from the `node_pos'. | |
// In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. | |
template <class U> | |
void exactMatchSearch(const key_type *key, U &result, | |
std::size_t length = 0, std::size_t node_pos = 0) const { | |
result = exactMatchSearch<U>(key, length, node_pos); | |
} | |
// The 2nd exactMatchSearch() returns a result instead of updating the 2nd | |
// argument. So, the following exactMatchSearch() has only 3 arguments. | |
template <class U> | |
inline U exactMatchSearch(const key_type *key, std::size_t length = 0, | |
std::size_t node_pos = 0) const; | |
// commonPrefixSearch() searches for keys which match a prefix of the given | |
// string. If `length' is 0, `key' is handled as a zero-terminated string. | |
// The values and the lengths of at most `max_num_results' matched keys are | |
// stored in `results'. commonPrefixSearch() returns the number of matched | |
// keys. Note that the return value can be larger than `max_num_results' if | |
// there are more than `max_num_results' matches. If you want to get all the | |
// results, allocate more spaces and call commonPrefixSearch() again. | |
// `node_pos' works as well as in exactMatchSearch(). | |
template <class U> | |
inline std::size_t commonPrefixSearch(const key_type *key, U *results, | |
std::size_t max_num_results, std::size_t length = 0, | |
std::size_t node_pos = 0) const; | |
// In Darts-clone, a dictionary is a deterministic finite-state automaton | |
// (DFA) and traverse() tests transitions on the DFA. The initial state is | |
// `node_pos' and traverse() chooses transitions labeled key[key_pos], | |
// key[key_pos + 1], ... in order. If there is not a transition labeled | |
// key[key_pos + i], traverse() terminates the transitions at that state and | |
// returns -2. Otherwise, traverse() ends without a termination and returns | |
// -1 or a nonnegative value, -1 indicates that the final state was not an | |
// accept state. When a nonnegative value is returned, it is the value | |
// associated with the final accept state. That is, traverse() returns the | |
// value associated with the given key if it exists. Note that traverse() | |
// updates `node_pos' and `key_pos' after each transition. | |
inline value_type traverse(const key_type *key, std::size_t &node_pos, | |
std::size_t &key_pos, std::size_t length = 0) const; | |
private: | |
typedef Details::uchar_type uchar_type; | |
typedef Details::id_type id_type; | |
typedef Details::DoubleArrayUnit unit_type; | |
std::size_t size_; | |
const unit_type *array_; | |
unit_type *buf_; | |
// Disallows copy and assignment. | |
DoubleArrayImpl(const DoubleArrayImpl &); | |
DoubleArrayImpl &operator=(const DoubleArrayImpl &); | |
}; | |
// <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int> | |
// as the type of values and it is suitable for most cases. | |
typedef DoubleArrayImpl<void, void, int, void> DoubleArray; | |
// The interface section ends here. For using Darts-clone, there is no need | |
// to read the remaining section, which gives the implementation of | |
// Darts-clone. | |
// | |
// Member functions of DoubleArrayImpl (except build()). | |
// | |
template <typename A, typename B, typename T, typename C> | |
int DoubleArrayImpl<A, B, T, C>::open(const char *file_name, | |
const char *mode, std::size_t offset, std::size_t size) { | |
std::FILE *file; | |
if (::fopen_s(&file, file_name, mode) != 0) { | |
return -1; | |
} | |
std::FILE *file = std::fopen(file_name, mode); | |
if (file == NULL) { | |
return -1; | |
} | |
if (size == 0) { | |
if (std::fseek(file, 0, SEEK_END) != 0) { | |
std::fclose(file); | |
return -1; | |
} | |
size = std::ftell(file) - offset; | |
} | |
size /= unit_size(); | |
if (size < 256 || (size & 0xFF) != 0) { | |
std::fclose(file); | |
return -1; | |
} | |
if (std::fseek(file, offset, SEEK_SET) != 0) { | |
std::fclose(file); | |
return -1; | |
} | |
unit_type units[256]; | |
if (std::fread(units, unit_size(), 256, file) != 256) { | |
std::fclose(file); | |
return -1; | |
} | |
if (units[0].label() != '\0' || units[0].has_leaf() || | |
units[0].offset() == 0 || units[0].offset() >= 512) { | |
std::fclose(file); | |
return -1; | |
} | |
for (id_type i = 1; i < 256; ++i) { | |
if (units[i].label() <= 0xFF && units[i].offset() >= size) { | |
std::fclose(file); | |
return -1; | |
} | |
} | |
unit_type *buf; | |
try { | |
buf = new unit_type[size]; | |
for (id_type i = 0; i < 256; ++i) { | |
buf[i] = units[i]; | |
} | |
} catch (const std::bad_alloc &) { | |
std::fclose(file); | |
DARTS_THROW("failed to open double-array: std::bad_alloc"); | |
} | |
if (size > 256) { | |
if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { | |
std::fclose(file); | |
delete[] buf; | |
return -1; | |
} | |
} | |
std::fclose(file); | |
clear(); | |
size_ = size; | |
array_ = buf; | |
buf_ = buf; | |
return 0; | |
} | |
template <typename A, typename B, typename T, typename C> | |
int DoubleArrayImpl<A, B, T, C>::save(const char *file_name, | |
const char *mode, std::size_t) const { | |
if (size() == 0) { | |
return -1; | |
} | |
std::FILE *file; | |
if (::fopen_s(&file, file_name, mode) != 0) { | |
return -1; | |
} | |
std::FILE *file = std::fopen(file_name, mode); | |
if (file == NULL) { | |
return -1; | |
} | |
if (std::fwrite(array_, unit_size(), size(), file) != size()) { | |
std::fclose(file); | |
return -1; | |
} | |
std::fclose(file); | |
return 0; | |
} | |
template <typename A, typename B, typename T, typename C> | |
template <typename U> | |
inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key, | |
std::size_t length, std::size_t node_pos) const { | |
U result; | |
set_result(&result, static_cast<value_type>(-1), 0); | |
unit_type unit = array_[node_pos]; | |
if (length != 0) { | |
for (std::size_t i = 0; i < length; ++i) { | |
node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]); | |
unit = array_[node_pos]; | |
if (unit.label() != static_cast<uchar_type>(key[i])) { | |
return result; | |
} | |
} | |
} else { | |
for ( ; key[length] != '\0'; ++length) { | |
node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]); | |
unit = array_[node_pos]; | |
if (unit.label() != static_cast<uchar_type>(key[length])) { | |
return result; | |
} | |
} | |
} | |
if (!unit.has_leaf()) { | |
return result; | |
} | |
unit = array_[node_pos ^ unit.offset()]; | |
set_result(&result, static_cast<value_type>(unit.value()), length); | |
return result; | |
} | |
template <typename A, typename B, typename T, typename C> | |
template <typename U> | |
inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch( | |
const key_type *key, U *results, std::size_t max_num_results, | |
std::size_t length, std::size_t node_pos) const { | |
std::size_t num_results = 0; | |
unit_type unit = array_[node_pos]; | |
node_pos ^= unit.offset(); | |
if (length != 0) { | |
for (std::size_t i = 0; i < length; ++i) { | |
node_pos ^= static_cast<uchar_type>(key[i]); | |
unit = array_[node_pos]; | |
if (unit.label() != static_cast<uchar_type>(key[i])) { | |
return num_results; | |
} | |
node_pos ^= unit.offset(); | |
if (unit.has_leaf()) { | |
if (num_results < max_num_results) { | |
set_result(&results[num_results], static_cast<value_type>( | |
array_[node_pos].value()), i + 1); | |
} | |
++num_results; | |
} | |
} | |
} else { | |
for ( ; key[length] != '\0'; ++length) { | |
node_pos ^= static_cast<uchar_type>(key[length]); | |
unit = array_[node_pos]; | |
if (unit.label() != static_cast<uchar_type>(key[length])) { | |
return num_results; | |
} | |
node_pos ^= unit.offset(); | |
if (unit.has_leaf()) { | |
if (num_results < max_num_results) { | |
set_result(&results[num_results], static_cast<value_type>( | |
array_[node_pos].value()), length + 1); | |
} | |
++num_results; | |
} | |
} | |
} | |
return num_results; | |
} | |
template <typename A, typename B, typename T, typename C> | |
inline typename DoubleArrayImpl<A, B, T, C>::value_type | |
DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key, | |
std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { | |
id_type id = static_cast<id_type>(node_pos); | |
unit_type unit = array_[id]; | |
if (length != 0) { | |
for ( ; key_pos < length; ++key_pos) { | |
id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); | |
unit = array_[id]; | |
if (unit.label() != static_cast<uchar_type>(key[key_pos])) { | |
return static_cast<value_type>(-2); | |
} | |
node_pos = id; | |
} | |
} else { | |
for ( ; key[key_pos] != '\0'; ++key_pos) { | |
id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); | |
unit = array_[id]; | |
if (unit.label() != static_cast<uchar_type>(key[key_pos])) { | |
return static_cast<value_type>(-2); | |
} | |
node_pos = id; | |
} | |
} | |
if (!unit.has_leaf()) { | |
return static_cast<value_type>(-1); | |
} | |
unit = array_[id ^ unit.offset()]; | |
return static_cast<value_type>(unit.value()); | |
} | |
namespace Details { | |
// | |
// Memory management of array. | |
// | |
template <typename T> | |
class AutoArray { | |
public: | |
explicit AutoArray(T *array = NULL) : array_(array) {} | |
~AutoArray() { | |
clear(); | |
} | |
const T &operator[](std::size_t id) const { | |
return array_[id]; | |
} | |
T &operator[](std::size_t id) { | |
return array_[id]; | |
} | |
bool empty() const { | |
return array_ == NULL; | |
} | |
void clear() { | |
if (array_ != NULL) { | |
delete[] array_; | |
array_ = NULL; | |
} | |
} | |
void swap(AutoArray *array) { | |
T *temp = array_; | |
array_ = array->array_; | |
array->array_ = temp; | |
} | |
void reset(T *array = NULL) { | |
AutoArray(array).swap(this); | |
} | |
private: | |
T *array_; | |
// Disallows copy and assignment. | |
AutoArray(const AutoArray &); | |
AutoArray &operator=(const AutoArray &); | |
}; | |
// | |
// Memory management of resizable array. | |
// | |
template <typename T> | |
class AutoPool { | |
public: | |
AutoPool() : buf_(), size_(0), capacity_(0) {} | |
~AutoPool() { clear(); } | |
const T &operator[](std::size_t id) const { | |
return *(reinterpret_cast<const T *>(&buf_[0]) + id); | |
} | |
T &operator[](std::size_t id) { | |
return *(reinterpret_cast<T *>(&buf_[0]) + id); | |
} | |
bool empty() const { | |
return size_ == 0; | |
} | |
std::size_t size() const { | |
return size_; | |
} | |
void clear() { | |
resize(0); | |
buf_.clear(); | |
size_ = 0; | |
capacity_ = 0; | |
} | |
void push_back(const T &value) { | |
append(value); | |
} | |
void pop_back() { | |
(*this)[--size_].~T(); | |
} | |
void append() { | |
if (size_ == capacity_) | |
resize_buf(size_ + 1); | |
new(&(*this)[size_++]) T; | |
} | |
void append(const T &value) { | |
if (size_ == capacity_) | |
resize_buf(size_ + 1); | |
new(&(*this)[size_++]) T(value); | |
} | |
void resize(std::size_t size) { | |
while (size_ > size) { | |
(*this)[--size_].~T(); | |
} | |
if (size > capacity_) { | |
resize_buf(size); | |
} | |
while (size_ < size) { | |
new(&(*this)[size_++]) T; | |
} | |
} | |
void resize(std::size_t size, const T &value) { | |
while (size_ > size) { | |
(*this)[--size_].~T(); | |
} | |
if (size > capacity_) { | |
resize_buf(size); | |
} | |
while (size_ < size) { | |
new(&(*this)[size_++]) T(value); | |
} | |
} | |
void reserve(std::size_t size) { | |
if (size > capacity_) { | |
resize_buf(size); | |
} | |
} | |
private: | |
AutoArray<char> buf_; | |
std::size_t size_; | |
std::size_t capacity_; | |
// Disallows copy and assignment. | |
AutoPool(const AutoPool &); | |
AutoPool &operator=(const AutoPool &); | |
void resize_buf(std::size_t size); | |
}; | |
template <typename T> | |
void AutoPool<T>::resize_buf(std::size_t size) { | |
std::size_t capacity; | |
if (size >= capacity_ * 2) { | |
capacity = size; | |
} else { | |
capacity = 1; | |
while (capacity < size) { | |
capacity <<= 1; | |
} | |
} | |
AutoArray<char> buf; | |
try { | |
buf.reset(new char[sizeof(T) * capacity]); | |
} catch (const std::bad_alloc &) { | |
DARTS_THROW("failed to resize pool: std::bad_alloc"); | |
} | |
if (size_ > 0) { | |
T *src = reinterpret_cast<T *>(&buf_[0]); | |
T *dest = reinterpret_cast<T *>(&buf[0]); | |
for (std::size_t i = 0; i < size_; ++i) { | |
new(&dest[i]) T(src[i]); | |
src[i].~T(); | |
} | |
} | |
buf_.swap(&buf); | |
capacity_ = capacity; | |
} | |
// | |
// Memory management of stack. | |
// | |
template <typename T> | |
class AutoStack { | |
public: | |
AutoStack() : pool_() {} | |
~AutoStack() { | |
clear(); | |
} | |
const T &top() const { | |
return pool_[size() - 1]; | |
} | |
T &top() { | |
return pool_[size() - 1]; | |
} | |
bool empty() const { | |
return pool_.empty(); | |
} | |
std::size_t size() const { | |
return pool_.size(); | |
} | |
void push(const T &value) { | |
pool_.push_back(value); | |
} | |
void pop() { | |
pool_.pop_back(); | |
} | |
void clear() { | |
pool_.clear(); | |
} | |
private: | |
AutoPool<T> pool_; | |
// Disallows copy and assignment. | |
AutoStack(const AutoStack &); | |
AutoStack &operator=(const AutoStack &); | |
}; | |
// | |
// Succinct bit vector. | |
// | |
class BitVector { | |
public: | |
BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} | |
~BitVector() { | |
clear(); | |
} | |
bool operator[](std::size_t id) const { | |
return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; | |
} | |
id_type rank(std::size_t id) const { | |
std::size_t unit_id = id / UNIT_SIZE; | |
return ranks_[unit_id] + pop_count(units_[unit_id] | |
& (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); | |
} | |
void set(std::size_t id, bool bit) { | |
if (bit) { | |
units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); | |
} else { | |
units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); | |
} | |
} | |
bool empty() const { | |
return units_.empty(); | |
} | |
std::size_t num_ones() const { | |
return num_ones_; | |
} | |
std::size_t size() const { | |
return size_; | |
} | |
void append() { | |
if ((size_ % UNIT_SIZE) == 0) { | |
units_.append(0); | |
} | |
++size_; | |
} | |
void build(); | |
void clear() { | |
units_.clear(); | |
ranks_.clear(); | |
} | |
private: | |
enum { UNIT_SIZE = sizeof(id_type) * 8 }; | |
AutoPool<id_type> units_; | |
AutoArray<id_type> ranks_; | |
std::size_t num_ones_; | |
std::size_t size_; | |
// Disallows copy and assignment. | |
BitVector(const BitVector &); | |
BitVector &operator=(const BitVector &); | |
static id_type pop_count(id_type unit) { | |
unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); | |
unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); | |
unit = ((unit >> 4) + unit) & 0x0F0F0F0F; | |
unit += unit >> 8; | |
unit += unit >> 16; | |
return unit & 0xFF; | |
} | |
}; | |
inline void BitVector::build() { | |
try { | |
ranks_.reset(new id_type[units_.size()]); | |
} catch (const std::bad_alloc &) { | |
DARTS_THROW("failed to build rank index: std::bad_alloc"); | |
} | |
num_ones_ = 0; | |
for (std::size_t i = 0; i < units_.size(); ++i) { | |
ranks_[i] = num_ones_; | |
num_ones_ += pop_count(units_[i]); | |
} | |
} | |
// | |
// Keyset. | |
// | |
template <typename T> | |
class Keyset { | |
public: | |
Keyset(std::size_t num_keys, const char_type * const *keys, | |
const std::size_t *lengths, const T *values) : | |
num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} | |
std::size_t num_keys() const { | |
return num_keys_; | |
} | |
const char_type *keys(std::size_t id) const { | |
return keys_[id]; | |
} | |
uchar_type keys(std::size_t key_id, std::size_t char_id) const { | |
if (has_lengths() && char_id >= lengths_[key_id]) | |
return '\0'; | |
return keys_[key_id][char_id]; | |
} | |
bool has_lengths() const { | |
return lengths_ != NULL; | |
} | |
std::size_t lengths(std::size_t id) const { | |
if (has_lengths()) { | |
return lengths_[id]; | |
} | |
std::size_t length = 0; | |
while (keys_[id][length] != '\0') { | |
++length; | |
} | |
return length; | |
} | |
bool has_values() const { | |
return values_ != NULL; | |
} | |
const value_type values(std::size_t id) const { | |
if (has_values()) { | |
return static_cast<value_type>(values_[id]); | |
} | |
return static_cast<value_type>(id); | |
} | |
private: | |
std::size_t num_keys_; | |
const char_type * const * keys_; | |
const std::size_t *lengths_; | |
const T *values_; | |
// Disallows copy and assignment. | |
Keyset(const Keyset &); | |
Keyset &operator=(const Keyset &); | |
}; | |
// | |
// Node of Directed Acyclic Word Graph (DAWG). | |
// | |
class DawgNode { | |
public: | |
DawgNode() : child_(0), sibling_(0), label_('\0'), | |
is_state_(false), has_sibling_(false) {} | |
void set_child(id_type child) { | |
child_ = child; | |
} | |
void set_sibling(id_type sibling) { | |
sibling_ = sibling; | |
} | |
void set_value(value_type value) { | |
child_ = value; | |
} | |
void set_label(uchar_type label) { | |
label_ = label; | |
} | |
void set_is_state(bool is_state) { | |
is_state_ = is_state; | |
} | |
void set_has_sibling(bool has_sibling) { | |
has_sibling_ = has_sibling; | |
} | |
id_type child() const { | |
return child_; | |
} | |
id_type sibling() const { | |
return sibling_; | |
} | |
value_type value() const { | |
return static_cast<value_type>(child_); | |
} | |
uchar_type label() const { | |
return label_; | |
} | |
bool is_state() const { | |
return is_state_; | |
} | |
bool has_sibling() const { | |
return has_sibling_; | |
} | |
id_type unit() const { | |
if (label_ == '\0') { | |
return (child_ << 1) | (has_sibling_ ? 1 : 0); | |
} | |
return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); | |
} | |
private: | |
id_type child_; | |
id_type sibling_; | |
uchar_type label_; | |
bool is_state_; | |
bool has_sibling_; | |
// Copyable. | |
}; | |
// | |
// Fixed unit of Directed Acyclic Word Graph (DAWG). | |
// | |
class DawgUnit { | |
public: | |
explicit DawgUnit(id_type unit = 0) : unit_(unit) {} | |
DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} | |
DawgUnit &operator=(id_type unit) { | |
unit_ = unit; | |
return *this; | |
} | |
id_type unit() const { | |
return unit_; | |
} | |
id_type child() const { | |
return unit_ >> 2; | |
} | |
bool has_sibling() const { | |
return (unit_ & 1) == 1; | |
} | |
value_type value() const { | |
return static_cast<value_type>(unit_ >> 1); | |
} | |
bool is_state() const { | |
return (unit_ & 2) == 2; | |
} | |
private: | |
id_type unit_; | |
// Copyable. | |
}; | |
// | |
// Directed Acyclic Word Graph (DAWG) builder. | |
// | |
class DawgBuilder { | |
public: | |
DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), | |
table_(), node_stack_(), recycle_bin_(), num_states_(0) {} | |
~DawgBuilder() { | |
clear(); | |
} | |
id_type root() const { | |
return 0; | |
} | |
id_type child(id_type id) const { | |
return units_[id].child(); | |
} | |
id_type sibling(id_type id) const { | |
return units_[id].has_sibling() ? (id + 1) : 0; | |
} | |
int value(id_type id) const { | |
return units_[id].value(); | |
} | |
bool is_leaf(id_type id) const { | |
return label(id) == '\0'; | |
} | |
uchar_type label(id_type id) const { | |
return labels_[id]; | |
} | |
bool is_intersection(id_type id) const { | |
return is_intersections_[id]; | |
} | |
id_type intersection_id(id_type id) const { | |
return is_intersections_.rank(id) - 1; | |
} | |
std::size_t num_intersections() const { | |
return is_intersections_.num_ones(); | |
} | |
std::size_t size() const { | |
return units_.size(); | |
} | |
void init(); | |
void finish(); | |
void insert(const char *key, std::size_t length, value_type value); | |
void clear(); | |
private: | |
enum { INITIAL_TABLE_SIZE = 1 << 10 }; | |
AutoPool<DawgNode> nodes_; | |
AutoPool<DawgUnit> units_; | |
AutoPool<uchar_type> labels_; | |
BitVector is_intersections_; | |
AutoPool<id_type> table_; | |
AutoStack<id_type> node_stack_; | |
AutoStack<id_type> recycle_bin_; | |
std::size_t num_states_; | |
// Disallows copy and assignment. | |
DawgBuilder(const DawgBuilder &); | |
DawgBuilder &operator=(const DawgBuilder &); | |
void flush(id_type id); | |
void expand_table(); | |
id_type find_unit(id_type id, id_type *hash_id) const; | |
id_type find_node(id_type node_id, id_type *hash_id) const; | |
bool are_equal(id_type node_id, id_type unit_id) const; | |
id_type hash_unit(id_type id) const; | |
id_type hash_node(id_type id) const; | |
id_type append_node(); | |
id_type append_unit(); | |
void free_node(id_type id) { | |
recycle_bin_.push(id); | |
} | |
static id_type hash(id_type key) { | |
key = ~key + (key << 15); // key = (key << 15) - key - 1; | |
key = key ^ (key >> 12); | |
key = key + (key << 2); | |
key = key ^ (key >> 4); | |
key = key * 2057; // key = (key + (key << 3)) + (key << 11); | |
key = key ^ (key >> 16); | |
return key; | |
} | |
}; | |
inline void DawgBuilder::init() { | |
table_.resize(INITIAL_TABLE_SIZE, 0); | |
append_node(); | |
append_unit(); | |
num_states_ = 1; | |
nodes_[0].set_label(0xFF); | |
node_stack_.push(0); | |
} | |
inline void DawgBuilder::finish() { | |
flush(0); | |
units_[0] = nodes_[0].unit(); | |
labels_[0] = nodes_[0].label(); | |
nodes_.clear(); | |
table_.clear(); | |
node_stack_.clear(); | |
recycle_bin_.clear(); | |
is_intersections_.build(); | |
} | |
inline void DawgBuilder::insert(const char *key, std::size_t length, | |
value_type value) { | |
if (value < 0) { | |
DARTS_THROW("failed to insert key: negative value"); | |
} else if (length == 0) { | |
DARTS_THROW("failed to insert key: zero-length key"); | |
} | |
id_type id = 0; | |
std::size_t key_pos = 0; | |
for ( ; key_pos <= length; ++key_pos) { | |
id_type child_id = nodes_[id].child(); | |
if (child_id == 0) { | |
break; | |
} | |
uchar_type key_label = static_cast<uchar_type>(key[key_pos]); | |
if (key_pos < length && key_label == '\0') { | |
DARTS_THROW("failed to insert key: invalid null character"); | |
} | |
uchar_type unit_label = nodes_[child_id].label(); | |
if (key_label < unit_label) { | |
DARTS_THROW("failed to insert key: wrong key order"); | |
} else if (key_label > unit_label) { | |
nodes_[child_id].set_has_sibling(true); | |
flush(child_id); | |
break; | |
} | |
id = child_id; | |
} | |
if (key_pos > length) { | |
return; | |
} | |
for ( ; key_pos <= length; ++key_pos) { | |
uchar_type key_label = static_cast<uchar_type>( | |
(key_pos < length) ? key[key_pos] : '\0'); | |
id_type child_id = append_node(); | |
if (nodes_[id].child() == 0) { | |
nodes_[child_id].set_is_state(true); | |
} | |
nodes_[child_id].set_sibling(nodes_[id].child()); | |
nodes_[child_id].set_label(key_label); | |
nodes_[id].set_child(child_id); | |
node_stack_.push(child_id); | |
id = child_id; | |
} | |
nodes_[id].set_value(value); | |
} | |
inline void DawgBuilder::clear() { | |
nodes_.clear(); | |
units_.clear(); | |
labels_.clear(); | |
is_intersections_.clear(); | |
table_.clear(); | |
node_stack_.clear(); | |
recycle_bin_.clear(); | |
num_states_ = 0; | |
} | |
inline void DawgBuilder::flush(id_type id) { | |
while (node_stack_.top() != id) { | |
id_type node_id = node_stack_.top(); | |
node_stack_.pop(); | |
if (num_states_ >= table_.size() - (table_.size() >> 2)) { | |
expand_table(); | |
} | |
id_type num_siblings = 0; | |
for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { | |
++num_siblings; | |
} | |
id_type hash_id; | |
id_type match_id = find_node(node_id, &hash_id); | |
if (match_id != 0) { | |
is_intersections_.set(match_id, true); | |
} else { | |
id_type unit_id = 0; | |
for (id_type i = 0; i < num_siblings; ++i) { | |
unit_id = append_unit(); | |
} | |
for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { | |
units_[unit_id] = nodes_[i].unit(); | |
labels_[unit_id] = nodes_[i].label(); | |
--unit_id; | |
} | |
match_id = unit_id + 1; | |
table_[hash_id] = match_id; | |
++num_states_; | |
} | |
for (id_type i = node_id, next; i != 0; i = next) { | |
next = nodes_[i].sibling(); | |
free_node(i); | |
} | |
nodes_[node_stack_.top()].set_child(match_id); | |
} | |
node_stack_.pop(); | |
} | |
inline void DawgBuilder::expand_table() { | |
std::size_t table_size = table_.size() << 1; | |
table_.clear(); | |
table_.resize(table_size, 0); | |
for (std::size_t i = 1; i < units_.size(); ++i) { | |
id_type id = static_cast<id_type>(i); | |
if (labels_[id] == '\0' || units_[id].is_state()) { | |
id_type hash_id; | |
find_unit(id, &hash_id); | |
table_[hash_id] = id; | |
} | |
} | |
} | |
inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { | |
*hash_id = hash_unit(id) % table_.size(); | |
for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { | |
id_type unit_id = table_[*hash_id]; | |
if (unit_id == 0) { | |
break; | |
} | |
// There must not be the same unit. | |
} | |
return 0; | |
} | |
inline id_type DawgBuilder::find_node(id_type node_id, | |
id_type *hash_id) const { | |
*hash_id = hash_node(node_id) % table_.size(); | |
for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { | |
id_type unit_id = table_[*hash_id]; | |
if (unit_id == 0) { | |
break; | |
} | |
if (are_equal(node_id, unit_id)) { | |
return unit_id; | |
} | |
} | |
return 0; | |
} | |
inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { | |
for (id_type i = nodes_[node_id].sibling(); i != 0; | |
i = nodes_[i].sibling()) { | |
if (units_[unit_id].has_sibling() == false) { | |
return false; | |
} | |
++unit_id; | |
} | |
if (units_[unit_id].has_sibling() == true) { | |
return false; | |
} | |
for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { | |
if (nodes_[i].unit() != units_[unit_id].unit() || | |
nodes_[i].label() != labels_[unit_id]) { | |
return false; | |
} | |
} | |
return true; | |
} | |
inline id_type DawgBuilder::hash_unit(id_type id) const { | |
id_type hash_value = 0; | |
for ( ; id != 0; ++id) { | |
id_type unit = units_[id].unit(); | |
uchar_type label = labels_[id]; | |
hash_value ^= hash((label << 24) ^ unit); | |
if (units_[id].has_sibling() == false) { | |
break; | |
} | |
} | |
return hash_value; | |
} | |
inline id_type DawgBuilder::hash_node(id_type id) const { | |
id_type hash_value = 0; | |
for ( ; id != 0; id = nodes_[id].sibling()) { | |
id_type unit = nodes_[id].unit(); | |
uchar_type label = nodes_[id].label(); | |
hash_value ^= hash((label << 24) ^ unit); | |
} | |
return hash_value; | |
} | |
inline id_type DawgBuilder::append_unit() { | |
is_intersections_.append(); | |
units_.append(); | |
labels_.append(); | |
return static_cast<id_type>(is_intersections_.size() - 1); | |
} | |
inline id_type DawgBuilder::append_node() { | |
id_type id; | |
if (recycle_bin_.empty()) { | |
id = static_cast<id_type>(nodes_.size()); | |
nodes_.append(); | |
} else { | |
id = recycle_bin_.top(); | |
nodes_[id] = DawgNode(); | |
recycle_bin_.pop(); | |
} | |
return id; | |
} | |
// | |
// Unit of double-array builder. | |
// | |
class DoubleArrayBuilderUnit { | |
public: | |
DoubleArrayBuilderUnit() : unit_(0) {} | |
void set_has_leaf(bool has_leaf) { | |
if (has_leaf) { | |
unit_ |= 1U << 8; | |
} else { | |
unit_ &= ~(1U << 8); | |
} | |
} | |
void set_value(value_type value) { | |
unit_ = value | (1U << 31); | |
} | |
void set_label(uchar_type label) { | |
unit_ = (unit_ & ~0xFFU) | label; | |
} | |
void set_offset(id_type offset) { | |
if (offset >= 1U << 29) { | |
DARTS_THROW("failed to modify unit: too large offset"); | |
} | |
unit_ &= (1U << 31) | (1U << 8) | 0xFF; | |
if (offset < 1U << 21) { | |
unit_ |= (offset << 10); | |
} else { | |
unit_ |= (offset << 2) | (1U << 9); | |
} | |
} | |
private: | |
id_type unit_; | |
// Copyable. | |
}; | |
// | |
// Extra unit of double-array builder. | |
// | |
class DoubleArrayBuilderExtraUnit { | |
public: | |
DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), | |
is_fixed_(false), is_used_(false) {} | |
void set_prev(id_type prev) { | |
prev_ = prev; | |
} | |
void set_next(id_type next) { | |
next_ = next; | |
} | |
void set_is_fixed(bool is_fixed) { | |
is_fixed_ = is_fixed; | |
} | |
void set_is_used(bool is_used) { | |
is_used_ = is_used; | |
} | |
id_type prev() const { | |
return prev_; | |
} | |
id_type next() const { | |
return next_; | |
} | |
bool is_fixed() const { | |
return is_fixed_; | |
} | |
bool is_used() const { | |
return is_used_; | |
} | |
private: | |
id_type prev_; | |
id_type next_; | |
bool is_fixed_; | |
bool is_used_; | |
// Copyable. | |
}; | |
// | |
// DAWG -> double-array converter. | |
// | |
class DoubleArrayBuilder { | |
public: | |
explicit DoubleArrayBuilder(progress_func_type progress_func) | |
: progress_func_(progress_func), units_(), extras_(), labels_(), | |
table_(), extras_head_(0) {} | |
~DoubleArrayBuilder() { | |
clear(); | |
} | |
template <typename T> | |
void build(const Keyset<T> &keyset); | |
void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; | |
void clear(); | |
private: | |
enum { BLOCK_SIZE = 256 }; | |
enum { NUM_EXTRA_BLOCKS = 16 }; | |
enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS }; | |
enum { UPPER_MASK = 0xFF << 21 }; | |
enum { LOWER_MASK = 0xFF }; | |
typedef DoubleArrayBuilderUnit unit_type; | |
typedef DoubleArrayBuilderExtraUnit extra_type; | |
progress_func_type progress_func_; | |
AutoPool<unit_type> units_; | |
AutoArray<extra_type> extras_; | |
AutoPool<uchar_type> labels_; | |
AutoArray<id_type> table_; | |
id_type extras_head_; | |
// Disallows copy and assignment. | |
DoubleArrayBuilder(const DoubleArrayBuilder &); | |
DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); | |
std::size_t num_blocks() const { | |
return units_.size() / BLOCK_SIZE; | |
} | |
const extra_type &extras(id_type id) const { | |
return extras_[id % NUM_EXTRAS]; | |
} | |
extra_type &extras(id_type id) { | |
return extras_[id % NUM_EXTRAS]; | |
} | |
template <typename T> | |
void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder); | |
void build_from_dawg(const DawgBuilder &dawg); | |
void build_from_dawg(const DawgBuilder &dawg, | |
id_type dawg_id, id_type dic_id); | |
id_type arrange_from_dawg(const DawgBuilder &dawg, | |
id_type dawg_id, id_type dic_id); | |
template <typename T> | |
void build_from_keyset(const Keyset<T> &keyset); | |
template <typename T> | |
void build_from_keyset(const Keyset<T> &keyset, std::size_t begin, | |
std::size_t end, std::size_t depth, id_type dic_id); | |
template <typename T> | |
id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin, | |
std::size_t end, std::size_t depth, id_type dic_id); | |
id_type find_valid_offset(id_type id) const; | |
bool is_valid_offset(id_type id, id_type offset) const; | |
void reserve_id(id_type id); | |
void expand_units(); | |
void fix_all_blocks(); | |
void fix_block(id_type block_id); | |
}; | |
template <typename T> | |
void DoubleArrayBuilder::build(const Keyset<T> &keyset) { | |
if (keyset.has_values()) { | |
Details::DawgBuilder dawg_builder; | |
build_dawg(keyset, &dawg_builder); | |
build_from_dawg(dawg_builder); | |
dawg_builder.clear(); | |
} else { | |
build_from_keyset(keyset); | |
} | |
} | |
inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, | |
DoubleArrayUnit **buf_ptr) const { | |
if (size_ptr != NULL) { | |
*size_ptr = units_.size(); | |
} | |
if (buf_ptr != NULL) { | |
*buf_ptr = new DoubleArrayUnit[units_.size()]; | |
unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr); | |
for (std::size_t i = 0; i < units_.size(); ++i) { | |
units[i] = units_[i]; | |
} | |
} | |
} | |
inline void DoubleArrayBuilder::clear() { | |
units_.clear(); | |
extras_.clear(); | |
labels_.clear(); | |
table_.clear(); | |
extras_head_ = 0; | |
} | |
template <typename T> | |
void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset, | |
DawgBuilder *dawg_builder) { | |
dawg_builder->init(); | |
for (std::size_t i = 0; i < keyset.num_keys(); ++i) { | |
dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); | |
if (progress_func_ != NULL) { | |
progress_func_(i + 1, keyset.num_keys() + 1); | |
} | |
} | |
dawg_builder->finish(); | |
} | |
inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { | |
std::size_t num_units = 1; | |
while (num_units < dawg.size()) { | |
num_units <<= 1; | |
} | |
units_.reserve(num_units); | |
table_.reset(new id_type[dawg.num_intersections()]); | |
for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { | |
table_[i] = 0; | |
} | |
extras_.reset(new extra_type[NUM_EXTRAS]); | |
reserve_id(0); | |
extras(0).set_is_used(true); | |
units_[0].set_offset(1); | |
units_[0].set_label('\0'); | |
if (dawg.child(dawg.root()) != 0) { | |
build_from_dawg(dawg, dawg.root(), 0); | |
} | |
fix_all_blocks(); | |
extras_.clear(); | |
labels_.clear(); | |
table_.clear(); | |
} | |
inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, | |
id_type dawg_id, id_type dic_id) { | |
id_type dawg_child_id = dawg.child(dawg_id); | |
if (dawg.is_intersection(dawg_child_id)) { | |
id_type intersection_id = dawg.intersection_id(dawg_child_id); | |
id_type offset = table_[intersection_id]; | |
if (offset != 0) { | |
offset ^= dic_id; | |
if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { | |
if (dawg.is_leaf(dawg_child_id)) { | |
units_[dic_id].set_has_leaf(true); | |
} | |
units_[dic_id].set_offset(offset); | |
return; | |
} | |
} | |
} | |
id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); | |
if (dawg.is_intersection(dawg_child_id)) { | |
table_[dawg.intersection_id(dawg_child_id)] = offset; | |
} | |
do { | |
uchar_type child_label = dawg.label(dawg_child_id); | |
id_type dic_child_id = offset ^ child_label; | |
if (child_label != '\0') { | |
build_from_dawg(dawg, dawg_child_id, dic_child_id); | |
} | |
dawg_child_id = dawg.sibling(dawg_child_id); | |
} while (dawg_child_id != 0); | |
} | |
inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, | |
id_type dawg_id, id_type dic_id) { | |
labels_.resize(0); | |
id_type dawg_child_id = dawg.child(dawg_id); | |
while (dawg_child_id != 0) { | |
labels_.append(dawg.label(dawg_child_id)); | |
dawg_child_id = dawg.sibling(dawg_child_id); | |
} | |
id_type offset = find_valid_offset(dic_id); | |
units_[dic_id].set_offset(dic_id ^ offset); | |
dawg_child_id = dawg.child(dawg_id); | |
for (std::size_t i = 0; i < labels_.size(); ++i) { | |
id_type dic_child_id = offset ^ labels_[i]; | |
reserve_id(dic_child_id); | |
if (dawg.is_leaf(dawg_child_id)) { | |
units_[dic_id].set_has_leaf(true); | |
units_[dic_child_id].set_value(dawg.value(dawg_child_id)); | |
} else { | |
units_[dic_child_id].set_label(labels_[i]); | |
} | |
dawg_child_id = dawg.sibling(dawg_child_id); | |
} | |
extras(offset).set_is_used(true); | |
return offset; | |
} | |
template <typename T> | |
void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) { | |
std::size_t num_units = 1; | |
while (num_units < keyset.num_keys()) { | |
num_units <<= 1; | |
} | |
units_.reserve(num_units); | |
extras_.reset(new extra_type[NUM_EXTRAS]); | |
reserve_id(0); | |
extras(0).set_is_used(true); | |
units_[0].set_offset(1); | |
units_[0].set_label('\0'); | |
if (keyset.num_keys() > 0) { | |
build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); | |
} | |
fix_all_blocks(); | |
extras_.clear(); | |
labels_.clear(); | |
} | |
template <typename T> | |
void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset, | |
std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { | |
id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); | |
while (begin < end) { | |
if (keyset.keys(begin, depth) != '\0') { | |
break; | |
} | |
++begin; | |
} | |
if (begin == end) { | |
return; | |
} | |
std::size_t last_begin = begin; | |
uchar_type last_label = keyset.keys(begin, depth); | |
while (++begin < end) { | |
uchar_type label = keyset.keys(begin, depth); | |
if (label != last_label) { | |
build_from_keyset(keyset, last_begin, begin, | |
depth + 1, offset ^ last_label); | |
last_begin = begin; | |
last_label = keyset.keys(begin, depth); | |
} | |
} | |
build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); | |
} | |
template <typename T> | |
id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset, | |
std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { | |
labels_.resize(0); | |
value_type value = -1; | |
for (std::size_t i = begin; i < end; ++i) { | |
uchar_type label = keyset.keys(i, depth); | |
if (label == '\0') { | |
if (keyset.has_lengths() && depth < keyset.lengths(i)) { | |
DARTS_THROW("failed to build double-array: " | |
"invalid null character"); | |
} else if (keyset.values(i) < 0) { | |
DARTS_THROW("failed to build double-array: negative value"); | |
} | |
if (value == -1) { | |
value = keyset.values(i); | |
} | |
if (progress_func_ != NULL) { | |
progress_func_(i + 1, keyset.num_keys() + 1); | |
} | |
} | |
if (labels_.empty()) { | |
labels_.append(label); | |
} else if (label != labels_[labels_.size() - 1]) { | |
if (label < labels_[labels_.size() - 1]) { | |
DARTS_THROW("failed to build double-array: wrong key order"); | |
} | |
labels_.append(label); | |
} | |
} | |
id_type offset = find_valid_offset(dic_id); | |
units_[dic_id].set_offset(dic_id ^ offset); | |
for (std::size_t i = 0; i < labels_.size(); ++i) { | |
id_type dic_child_id = offset ^ labels_[i]; | |
reserve_id(dic_child_id); | |
if (labels_[i] == '\0') { | |
units_[dic_id].set_has_leaf(true); | |
units_[dic_child_id].set_value(value); | |
} else { | |
units_[dic_child_id].set_label(labels_[i]); | |
} | |
} | |
extras(offset).set_is_used(true); | |
return offset; | |
} | |
inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { | |
if (extras_head_ >= units_.size()) { | |
return units_.size() | (id & LOWER_MASK); | |
} | |
id_type unfixed_id = extras_head_; | |
do { | |
id_type offset = unfixed_id ^ labels_[0]; | |
if (is_valid_offset(id, offset)) { | |
return offset; | |
} | |
unfixed_id = extras(unfixed_id).next(); | |
} while (unfixed_id != extras_head_); | |
return units_.size() | (id & LOWER_MASK); | |
} | |
inline bool DoubleArrayBuilder::is_valid_offset(id_type id, | |
id_type offset) const { | |
if (extras(offset).is_used()) { | |
return false; | |
} | |
id_type rel_offset = id ^ offset; | |
if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { | |
return false; | |
} | |
for (std::size_t i = 1; i < labels_.size(); ++i) { | |
if (extras(offset ^ labels_[i]).is_fixed()) { | |
return false; | |
} | |
} | |
return true; | |
} | |
inline void DoubleArrayBuilder::reserve_id(id_type id) { | |
if (id >= units_.size()) { | |
expand_units(); | |
} | |
if (id == extras_head_) { | |
extras_head_ = extras(id).next(); | |
if (extras_head_ == id) { | |
extras_head_ = units_.size(); | |
} | |
} | |
extras(extras(id).prev()).set_next(extras(id).next()); | |
extras(extras(id).next()).set_prev(extras(id).prev()); | |
extras(id).set_is_fixed(true); | |
} | |
inline void DoubleArrayBuilder::expand_units() { | |
id_type src_num_units = units_.size(); | |
id_type src_num_blocks = num_blocks(); | |
id_type dest_num_units = src_num_units + BLOCK_SIZE; | |
id_type dest_num_blocks = src_num_blocks + 1; | |
if (dest_num_blocks > NUM_EXTRA_BLOCKS) { | |
fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); | |
} | |
units_.resize(dest_num_units); | |
if (dest_num_blocks > NUM_EXTRA_BLOCKS) { | |
for (std::size_t id = src_num_units; id < dest_num_units; ++id) { | |
extras(id).set_is_used(false); | |
extras(id).set_is_fixed(false); | |
} | |
} | |
for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { | |
extras(i - 1).set_next(i); | |
extras(i).set_prev(i - 1); | |
} | |
extras(src_num_units).set_prev(dest_num_units - 1); | |
extras(dest_num_units - 1).set_next(src_num_units); | |
extras(src_num_units).set_prev(extras(extras_head_).prev()); | |
extras(dest_num_units - 1).set_next(extras_head_); | |
extras(extras(extras_head_).prev()).set_next(src_num_units); | |
extras(extras_head_).set_prev(dest_num_units - 1); | |
} | |
inline void DoubleArrayBuilder::fix_all_blocks() { | |
id_type begin = 0; | |
if (num_blocks() > NUM_EXTRA_BLOCKS) { | |
begin = num_blocks() - NUM_EXTRA_BLOCKS; | |
} | |
id_type end = num_blocks(); | |
for (id_type block_id = begin; block_id != end; ++block_id) { | |
fix_block(block_id); | |
} | |
} | |
inline void DoubleArrayBuilder::fix_block(id_type block_id) { | |
id_type begin = block_id * BLOCK_SIZE; | |
id_type end = begin + BLOCK_SIZE; | |
id_type unused_offset = 0; | |
for (id_type offset = begin; offset != end; ++offset) { | |
if (!extras(offset).is_used()) { | |
unused_offset = offset; | |
break; | |
} | |
} | |
for (id_type id = begin; id != end; ++id) { | |
if (!extras(id).is_fixed()) { | |
reserve_id(id); | |
units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset)); | |
} | |
} | |
} | |
} // namespace Details | |
// | |
// Member function build() of DoubleArrayImpl. | |
// | |
template <typename A, typename B, typename T, typename C> | |
int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys, | |
const key_type * const *keys, const std::size_t *lengths, | |
const value_type *values, Details::progress_func_type progress_func) { | |
Details::Keyset<value_type> keyset(num_keys, keys, lengths, values); | |
Details::DoubleArrayBuilder builder(progress_func); | |
builder.build(keyset); | |
std::size_t size = 0; | |
unit_type *buf = NULL; | |
builder.copy(&size, &buf); | |
clear(); | |
size_ = size; | |
array_ = buf; | |
buf_ = buf; | |
if (progress_func != NULL) { | |
progress_func(num_keys + 1, num_keys + 1); | |
} | |
return 0; | |
} | |
} // namespace Darts | |