ICU 4.8.1.1  4.8.1.1
bytestrie.h
Go to the documentation of this file.
00001 /*
00002 *******************************************************************************
00003 *   Copyright (C) 2010-2011, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *******************************************************************************
00006 *   file name:  bytestrie.h
00007 *   encoding:   US-ASCII
00008 *   tab size:   8 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2010sep25
00012 *   created by: Markus W. Scherer
00013 */
00014 
00015 #ifndef __BYTESTRIE_H__
00016 #define __BYTESTRIE_H__
00017 
00023 #include "unicode/utypes.h"
00024 #include "unicode/stringpiece.h"
00025 #include "unicode/uobject.h"
00026 #include "unicode/ustringtrie.h"
00027 
00028 U_NAMESPACE_BEGIN
00029 
00030 class ByteSink;
00031 class BytesTrieBuilder;
00032 class CharString;
00033 class UVector32;
00034 
00048 class U_COMMON_API BytesTrie : public UMemory {
00049 public:
00064     BytesTrie(const void *trieBytes)
00065             : ownedArray_(NULL), bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
00066               pos_(bytes_), remainingMatchLength_(-1) {}
00067 
00072     ~BytesTrie();
00073 
00080     BytesTrie(const BytesTrie &other)
00081             : ownedArray_(NULL), bytes_(other.bytes_),
00082               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
00083 
00089     BytesTrie &reset() {
00090         pos_=bytes_;
00091         remainingMatchLength_=-1;
00092         return *this;
00093     }
00094 
00100     class State : public UMemory {
00101     public:
00106         State() { bytes=NULL; }
00107     private:
00108         friend class BytesTrie;
00109 
00110         const uint8_t *bytes;
00111         const uint8_t *pos;
00112         int32_t remainingMatchLength;
00113     };
00114 
00122     const BytesTrie &saveState(State &state) const {
00123         state.bytes=bytes_;
00124         state.pos=pos_;
00125         state.remainingMatchLength=remainingMatchLength_;
00126         return *this;
00127     }
00128 
00139     BytesTrie &resetToState(const State &state) {
00140         if(bytes_==state.bytes && bytes_!=NULL) {
00141             pos_=state.pos;
00142             remainingMatchLength_=state.remainingMatchLength;
00143         }
00144         return *this;
00145     }
00146 
00153     UStringTrieResult current() const;
00154 
00163     inline UStringTrieResult first(int32_t inByte) {
00164         remainingMatchLength_=-1;
00165         if(inByte<0) {
00166             inByte+=0x100;
00167         }
00168         return nextImpl(bytes_, inByte);
00169     }
00170 
00178     UStringTrieResult next(int32_t inByte);
00179 
00195     UStringTrieResult next(const char *s, int32_t length);
00196 
00206     inline int32_t getValue() const {
00207         const uint8_t *pos=pos_;
00208         int32_t leadByte=*pos++;
00209         // U_ASSERT(leadByte>=kMinValueLead);
00210         return readValue(pos, leadByte>>1);
00211     }
00212 
00222     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
00223         const uint8_t *pos=pos_;
00224         // Skip the rest of a pending linear-match node.
00225         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
00226     }
00227 
00236     int32_t getNextBytes(ByteSink &out) const;
00237 
00242     class U_COMMON_API Iterator : public UMemory {
00243     public:
00255         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
00256 
00268         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
00269 
00274         ~Iterator();
00275 
00281         Iterator &reset();
00282 
00287         UBool hasNext() const;
00288 
00303         UBool next(UErrorCode &errorCode);
00304 
00309         const StringPiece &getString() const { return sp_; }
00314         int32_t getValue() const { return value_; }
00315 
00316     private:
00317         UBool truncateAndStop();
00318 
00319         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
00320 
00321         const uint8_t *bytes_;
00322         const uint8_t *pos_;
00323         const uint8_t *initialPos_;
00324         int32_t remainingMatchLength_;
00325         int32_t initialRemainingMatchLength_;
00326 
00327         CharString *str_;
00328         StringPiece sp_;
00329         int32_t maxLength_;
00330         int32_t value_;
00331 
00332         // The stack stores pairs of integers for backtracking to another
00333         // outbound edge of a branch node.
00334         // The first integer is an offset from bytes_.
00335         // The second integer has the str_->length() from before the node in bits 15..0,
00336         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
00337         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
00338         // but the code looks more confusing that way.)
00339         UVector32 *stack_;
00340     };
00341 
00342 private:
00343     friend class BytesTrieBuilder;
00344 
00351     BytesTrie(void *adoptBytes, const void *trieBytes)
00352             : ownedArray_(reinterpret_cast<uint8_t *>(adoptBytes)),
00353               bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
00354               pos_(bytes_), remainingMatchLength_(-1) {}
00355 
00356     // No assignment operator.
00357     BytesTrie &operator=(const BytesTrie &other);
00358 
00359     inline void stop() {
00360         pos_=NULL;
00361     }
00362 
00363     // Reads a compact 32-bit integer.
00364     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
00365     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
00366     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
00367         // U_ASSERT(leadByte>=kMinValueLead);
00368         if(leadByte>=(kMinTwoByteValueLead<<1)) {
00369             if(leadByte<(kMinThreeByteValueLead<<1)) {
00370                 ++pos;
00371             } else if(leadByte<(kFourByteValueLead<<1)) {
00372                 pos+=2;
00373             } else {
00374                 pos+=3+((leadByte>>1)&1);
00375             }
00376         }
00377         return pos;
00378     }
00379     static inline const uint8_t *skipValue(const uint8_t *pos) {
00380         int32_t leadByte=*pos++;
00381         return skipValue(pos, leadByte);
00382     }
00383 
00384     // Reads a jump delta and jumps.
00385     static const uint8_t *jumpByDelta(const uint8_t *pos);
00386 
00387     static inline const uint8_t *skipDelta(const uint8_t *pos) {
00388         int32_t delta=*pos++;
00389         if(delta>=kMinTwoByteDeltaLead) {
00390             if(delta<kMinThreeByteDeltaLead) {
00391                 ++pos;
00392             } else if(delta<kFourByteDeltaLead) {
00393                 pos+=2;
00394             } else {
00395                 pos+=3+(delta&1);
00396             }
00397         }
00398         return pos;
00399     }
00400 
00401     static inline UStringTrieResult valueResult(int32_t node) {
00402         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
00403     }
00404 
00405     // Handles a branch node for both next(byte) and next(string).
00406     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
00407 
00408     // Requires remainingLength_<0.
00409     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
00410 
00411     // Helper functions for hasUniqueValue().
00412     // Recursively finds a unique value (or whether there is not a unique one)
00413     // from a branch.
00414     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
00415                                                     UBool haveUniqueValue, int32_t &uniqueValue);
00416     // Recursively finds a unique value (or whether there is not a unique one)
00417     // starting from a position on a node lead byte.
00418     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
00419 
00420     // Helper functions for getNextBytes().
00421     // getNextBytes() when pos is on a branch node.
00422     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
00423     static void append(ByteSink &out, int c);
00424 
00425     // BytesTrie data structure
00426     //
00427     // The trie consists of a series of byte-serialized nodes for incremental
00428     // string/byte sequence matching. The root node is at the beginning of the trie data.
00429     //
00430     // Types of nodes are distinguished by their node lead byte ranges.
00431     // After each node, except a final-value node, another node follows to
00432     // encode match values or continue matching further bytes.
00433     //
00434     // Node types:
00435     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
00436     //    The value is for the string/byte sequence so far.
00437     //    One node bit indicates whether the value is final or whether
00438     //    matching continues with the next node.
00439     //  - Linear-match node: Matches a number of bytes.
00440     //  - Branch node: Branches to other nodes according to the current input byte.
00441     //    The node byte is the length of the branch (number of bytes to select from)
00442     //    minus 1. It is followed by a sub-node:
00443     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
00444     //      there are length-1 (key, value) pairs and then one more comparison byte.
00445     //      If one of the key bytes matches, then the value is either a final value for
00446     //      the string/byte sequence so far, or a "jump" delta to the next node.
00447     //      If the last byte matches, then matching continues with the next node.
00448     //      (Values have the same encoding as value nodes.)
00449     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
00450     //      there is one byte and one "jump" delta.
00451     //      If the input byte is less than the sub-node byte, then "jump" by delta to
00452     //      the next sub-node which will have a length of length/2.
00453     //      (The delta has its own compact encoding.)
00454     //      Otherwise, skip the "jump" delta to the next sub-node
00455     //      which will have a length of length-length/2.
00456 
00457     // Node lead byte values.
00458 
00459     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
00460     // the length is one more than the next byte.
00461 
00462     // For a branch sub-node with at most this many entries, we drop down
00463     // to a linear search.
00464     static const int32_t kMaxBranchLinearSubNodeLength=5;
00465 
00466     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
00467     static const int32_t kMinLinearMatch=0x10;
00468     static const int32_t kMaxLinearMatchLength=0x10;
00469 
00470     // 20..ff: Variable-length value node.
00471     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
00472     // Then shift-right by 1 bit.
00473     // The remaining lead byte value indicates the number of following bytes (0..4)
00474     // and contains the value's top bits.
00475     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
00476     // It is a final value if bit 0 is set.
00477     static const int32_t kValueIsFinal=1;
00478 
00479     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
00480     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
00481     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
00482 
00483     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
00484     static const int32_t kMaxTwoByteValue=0x1aff;
00485 
00486     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
00487     static const int32_t kFourByteValueLead=0x7e;
00488 
00489     // A little more than Unicode code points. (0x11ffff)
00490     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
00491 
00492     static const int32_t kFiveByteValueLead=0x7f;
00493 
00494     // Compact delta integers.
00495     static const int32_t kMaxOneByteDelta=0xbf;
00496     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
00497     static const int32_t kMinThreeByteDeltaLead=0xf0;
00498     static const int32_t kFourByteDeltaLead=0xfe;
00499     static const int32_t kFiveByteDeltaLead=0xff;
00500 
00501     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
00502     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
00503 
00504     uint8_t *ownedArray_;
00505 
00506     // Fixed value referencing the BytesTrie bytes.
00507     const uint8_t *bytes_;
00508 
00509     // Iterator variables.
00510 
00511     // Pointer to next trie byte to read. NULL if no more matches.
00512     const uint8_t *pos_;
00513     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
00514     int32_t remainingMatchLength_;
00515 };
00516 
00517 U_NAMESPACE_END
00518 
00519 #endif  // __BYTESTRIE_H__
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines