ICU 4.8.1.1
4.8.1.1
|
00001 /* 00002 ******************************************************************************* 00003 * Copyright (C) 2010-2011, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ******************************************************************************* 00006 * file name: bytestrie.h 00007 * encoding: US-ASCII 00008 * tab size: 8 (not used) 00009 * indentation:4 00010 * 00011 * created on: 2010sep25 00012 * created by: Markus W. Scherer 00013 */ 00014 00015 #ifndef __BYTESTRIE_H__ 00016 #define __BYTESTRIE_H__ 00017 00023 #include "unicode/utypes.h" 00024 #include "unicode/stringpiece.h" 00025 #include "unicode/uobject.h" 00026 #include "unicode/ustringtrie.h" 00027 00028 U_NAMESPACE_BEGIN 00029 00030 class ByteSink; 00031 class BytesTrieBuilder; 00032 class CharString; 00033 class UVector32; 00034 00048 class U_COMMON_API BytesTrie : public UMemory { 00049 public: 00064 BytesTrie(const void *trieBytes) 00065 : ownedArray_(NULL), bytes_(reinterpret_cast<const uint8_t *>(trieBytes)), 00066 pos_(bytes_), remainingMatchLength_(-1) {} 00067 00072 ~BytesTrie(); 00073 00080 BytesTrie(const BytesTrie &other) 00081 : ownedArray_(NULL), bytes_(other.bytes_), 00082 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 00083 00089 BytesTrie &reset() { 00090 pos_=bytes_; 00091 remainingMatchLength_=-1; 00092 return *this; 00093 } 00094 00100 class State : public UMemory { 00101 public: 00106 State() { bytes=NULL; } 00107 private: 00108 friend class BytesTrie; 00109 00110 const uint8_t *bytes; 00111 const uint8_t *pos; 00112 int32_t remainingMatchLength; 00113 }; 00114 00122 const BytesTrie &saveState(State &state) const { 00123 state.bytes=bytes_; 00124 state.pos=pos_; 00125 state.remainingMatchLength=remainingMatchLength_; 00126 return *this; 00127 } 00128 00139 BytesTrie &resetToState(const State &state) { 00140 if(bytes_==state.bytes && bytes_!=NULL) { 00141 pos_=state.pos; 00142 remainingMatchLength_=state.remainingMatchLength; 00143 } 00144 return *this; 00145 } 00146 00153 UStringTrieResult current() const; 00154 00163 inline UStringTrieResult first(int32_t inByte) { 00164 remainingMatchLength_=-1; 00165 if(inByte<0) { 00166 inByte+=0x100; 00167 } 00168 return nextImpl(bytes_, inByte); 00169 } 00170 00178 UStringTrieResult next(int32_t inByte); 00179 00195 UStringTrieResult next(const char *s, int32_t length); 00196 00206 inline int32_t getValue() const { 00207 const uint8_t *pos=pos_; 00208 int32_t leadByte=*pos++; 00209 // U_ASSERT(leadByte>=kMinValueLead); 00210 return readValue(pos, leadByte>>1); 00211 } 00212 00222 inline UBool hasUniqueValue(int32_t &uniqueValue) const { 00223 const uint8_t *pos=pos_; 00224 // Skip the rest of a pending linear-match node. 00225 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 00226 } 00227 00236 int32_t getNextBytes(ByteSink &out) const; 00237 00242 class U_COMMON_API Iterator : public UMemory { 00243 public: 00255 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 00256 00268 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 00269 00274 ~Iterator(); 00275 00281 Iterator &reset(); 00282 00287 UBool hasNext() const; 00288 00303 UBool next(UErrorCode &errorCode); 00304 00309 const StringPiece &getString() const { return sp_; } 00314 int32_t getValue() const { return value_; } 00315 00316 private: 00317 UBool truncateAndStop(); 00318 00319 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 00320 00321 const uint8_t *bytes_; 00322 const uint8_t *pos_; 00323 const uint8_t *initialPos_; 00324 int32_t remainingMatchLength_; 00325 int32_t initialRemainingMatchLength_; 00326 00327 CharString *str_; 00328 StringPiece sp_; 00329 int32_t maxLength_; 00330 int32_t value_; 00331 00332 // The stack stores pairs of integers for backtracking to another 00333 // outbound edge of a branch node. 00334 // The first integer is an offset from bytes_. 00335 // The second integer has the str_->length() from before the node in bits 15..0, 00336 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 00337 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 00338 // but the code looks more confusing that way.) 00339 UVector32 *stack_; 00340 }; 00341 00342 private: 00343 friend class BytesTrieBuilder; 00344 00351 BytesTrie(void *adoptBytes, const void *trieBytes) 00352 : ownedArray_(reinterpret_cast<uint8_t *>(adoptBytes)), 00353 bytes_(reinterpret_cast<const uint8_t *>(trieBytes)), 00354 pos_(bytes_), remainingMatchLength_(-1) {} 00355 00356 // No assignment operator. 00357 BytesTrie &operator=(const BytesTrie &other); 00358 00359 inline void stop() { 00360 pos_=NULL; 00361 } 00362 00363 // Reads a compact 32-bit integer. 00364 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 00365 static int32_t readValue(const uint8_t *pos, int32_t leadByte); 00366 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 00367 // U_ASSERT(leadByte>=kMinValueLead); 00368 if(leadByte>=(kMinTwoByteValueLead<<1)) { 00369 if(leadByte<(kMinThreeByteValueLead<<1)) { 00370 ++pos; 00371 } else if(leadByte<(kFourByteValueLead<<1)) { 00372 pos+=2; 00373 } else { 00374 pos+=3+((leadByte>>1)&1); 00375 } 00376 } 00377 return pos; 00378 } 00379 static inline const uint8_t *skipValue(const uint8_t *pos) { 00380 int32_t leadByte=*pos++; 00381 return skipValue(pos, leadByte); 00382 } 00383 00384 // Reads a jump delta and jumps. 00385 static const uint8_t *jumpByDelta(const uint8_t *pos); 00386 00387 static inline const uint8_t *skipDelta(const uint8_t *pos) { 00388 int32_t delta=*pos++; 00389 if(delta>=kMinTwoByteDeltaLead) { 00390 if(delta<kMinThreeByteDeltaLead) { 00391 ++pos; 00392 } else if(delta<kFourByteDeltaLead) { 00393 pos+=2; 00394 } else { 00395 pos+=3+(delta&1); 00396 } 00397 } 00398 return pos; 00399 } 00400 00401 static inline UStringTrieResult valueResult(int32_t node) { 00402 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 00403 } 00404 00405 // Handles a branch node for both next(byte) and next(string). 00406 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 00407 00408 // Requires remainingLength_<0. 00409 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 00410 00411 // Helper functions for hasUniqueValue(). 00412 // Recursively finds a unique value (or whether there is not a unique one) 00413 // from a branch. 00414 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 00415 UBool haveUniqueValue, int32_t &uniqueValue); 00416 // Recursively finds a unique value (or whether there is not a unique one) 00417 // starting from a position on a node lead byte. 00418 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 00419 00420 // Helper functions for getNextBytes(). 00421 // getNextBytes() when pos is on a branch node. 00422 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 00423 static void append(ByteSink &out, int c); 00424 00425 // BytesTrie data structure 00426 // 00427 // The trie consists of a series of byte-serialized nodes for incremental 00428 // string/byte sequence matching. The root node is at the beginning of the trie data. 00429 // 00430 // Types of nodes are distinguished by their node lead byte ranges. 00431 // After each node, except a final-value node, another node follows to 00432 // encode match values or continue matching further bytes. 00433 // 00434 // Node types: 00435 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 00436 // The value is for the string/byte sequence so far. 00437 // One node bit indicates whether the value is final or whether 00438 // matching continues with the next node. 00439 // - Linear-match node: Matches a number of bytes. 00440 // - Branch node: Branches to other nodes according to the current input byte. 00441 // The node byte is the length of the branch (number of bytes to select from) 00442 // minus 1. It is followed by a sub-node: 00443 // - If the length is at most kMaxBranchLinearSubNodeLength, then 00444 // there are length-1 (key, value) pairs and then one more comparison byte. 00445 // If one of the key bytes matches, then the value is either a final value for 00446 // the string/byte sequence so far, or a "jump" delta to the next node. 00447 // If the last byte matches, then matching continues with the next node. 00448 // (Values have the same encoding as value nodes.) 00449 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 00450 // there is one byte and one "jump" delta. 00451 // If the input byte is less than the sub-node byte, then "jump" by delta to 00452 // the next sub-node which will have a length of length/2. 00453 // (The delta has its own compact encoding.) 00454 // Otherwise, skip the "jump" delta to the next sub-node 00455 // which will have a length of length-length/2. 00456 00457 // Node lead byte values. 00458 00459 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 00460 // the length is one more than the next byte. 00461 00462 // For a branch sub-node with at most this many entries, we drop down 00463 // to a linear search. 00464 static const int32_t kMaxBranchLinearSubNodeLength=5; 00465 00466 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 00467 static const int32_t kMinLinearMatch=0x10; 00468 static const int32_t kMaxLinearMatchLength=0x10; 00469 00470 // 20..ff: Variable-length value node. 00471 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 00472 // Then shift-right by 1 bit. 00473 // The remaining lead byte value indicates the number of following bytes (0..4) 00474 // and contains the value's top bits. 00475 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 00476 // It is a final value if bit 0 is set. 00477 static const int32_t kValueIsFinal=1; 00478 00479 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 00480 static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 00481 static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 00482 00483 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 00484 static const int32_t kMaxTwoByteValue=0x1aff; 00485 00486 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 00487 static const int32_t kFourByteValueLead=0x7e; 00488 00489 // A little more than Unicode code points. (0x11ffff) 00490 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 00491 00492 static const int32_t kFiveByteValueLead=0x7f; 00493 00494 // Compact delta integers. 00495 static const int32_t kMaxOneByteDelta=0xbf; 00496 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 00497 static const int32_t kMinThreeByteDeltaLead=0xf0; 00498 static const int32_t kFourByteDeltaLead=0xfe; 00499 static const int32_t kFiveByteDeltaLead=0xff; 00500 00501 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 00502 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 00503 00504 uint8_t *ownedArray_; 00505 00506 // Fixed value referencing the BytesTrie bytes. 00507 const uint8_t *bytes_; 00508 00509 // Iterator variables. 00510 00511 // Pointer to next trie byte to read. NULL if no more matches. 00512 const uint8_t *pos_; 00513 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 00514 int32_t remainingMatchLength_; 00515 }; 00516 00517 U_NAMESPACE_END 00518 00519 #endif // __BYTESTRIE_H__