// // CRF++ -- Yet Another CRF toolkit // // $Id: feature.cpp 1595 2007-02-24 10:18:32Z taku $; // // Copyright(C) 2005-2007 Taku Kudo // #include "feature_index.h" #include "common.h" #include "node.h" #include "path.h" #include "tagger.h" namespace CRFPP { static const char *BOS[4] = { "_B-1", "_B-2", "_B-3", "_B-4"}; static const char *EOS[4] = { "_B+1", "_B+2", "_B+3", "_B+4"}; const char *FeatureIndex::get_index(char *&p, size_t pos, const TaggerImpl &tagger) { if (*p++ !='[') return 0; int col = 0; int row = 0; int neg = 1; if (*p++ == '-') neg = -1; else --p; for (; *p; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': row = 10 * row +(*p - '0'); break; case ',': ++p; goto NEXT1; default: return 0; } } NEXT1: for (; *p; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': col = 10 * col +(*p - '0'); break; case ']': goto NEXT2; default: return 0; } } NEXT2: row *= neg; if (row < -4 || row > 4 || col < 0 || col >= static_cast(tagger.xsize())) return 0; max_xsize_ = _max(max_xsize_, static_cast(col + 1)); int idx = pos + row; if (idx < 0) return BOS[-idx-1]; if (idx >= static_cast(tagger.size())) return EOS[idx - tagger.size()]; return tagger.x(idx, col); } bool FeatureIndex::apply_rule(string_buffer *os, char *p, size_t pos, const TaggerImpl& tagger) { os->assign(""); // clear const char *r; for (; *p; p++) { switch (*p) { default: *os << *p; break; case '%': switch (*++p) { case 'x': ++p; r = get_index(p, pos, tagger); if (!r) return false; *os << r; break; default: return false; } break; } } *os << '\0'; return true; } void FeatureIndex::rebuildFeatures(TaggerImpl *tagger) { size_t fid = tagger->feature_id(); unsigned short thread_id = tagger->thread_id(); path_freelist_[thread_id].free(); node_freelist_[thread_id].free(); for (size_t cur = 0; cur < tagger->size(); ++cur) { int *f = feature_cache_[fid++]; for (size_t i = 0; i < y_.size(); ++i) { Node *n = node_freelist_[thread_id].alloc(); n->clear(); n->x = cur; n->y = i; n->fvector = f; tagger->set_node(n, cur, i); } } for (size_t cur = 1; cur < tagger->size(); ++cur) { int *f = feature_cache_[fid++]; for (size_t j = 0; j < y_.size(); ++j) { for (size_t i = 0; i < y_.size(); ++i) { Path *p = path_freelist_[thread_id].alloc(); p->clear(); p->add(tagger->node(cur-1, j), tagger->node(cur, i)); p->fvector = f; } } } } #define ADD { int id = this->getID(os.c_str()); \ if (id != -1) feature.push_back(id); } while (0) bool FeatureIndex::buildFeatures(TaggerImpl *tagger) { string_buffer os; std::vector feature; tagger->set_feature_id(feature_cache_.size()); for (size_t cur = 0; cur < tagger->size(); ++cur) { for (std::vector::const_iterator it = unigram_templs_.begin(); it != unigram_templs_.end(); ++it) { CHECK_FALSE(apply_rule(&os, *it, cur, *tagger)) << " format error: " << *it; ADD; } feature_cache_.add(feature); feature.clear(); } for (size_t cur = 1; cur < tagger->size(); ++cur) { for (std::vector::const_iterator it = bigram_templs_.begin(); it != bigram_templs_.end(); ++it) { CHECK_FALSE(apply_rule(&os, *it, cur, *tagger)) << "format error: " << *it; ADD; } feature_cache_.add(feature); feature.clear(); } return true; } #undef ADD }