ICU 4.8.1.1
4.8.1.1
|
00001 /* 00002 ******************************************************************************* 00003 * Copyright (C) 2010-2011, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ******************************************************************************* 00006 * file name: stringtriebuilder.h 00007 * encoding: US-ASCII 00008 * tab size: 8 (not used) 00009 * indentation:4 00010 * 00011 * created on: 2010dec24 00012 * created by: Markus W. Scherer 00013 */ 00014 00015 #ifndef __STRINGTRIEBUILDER_H__ 00016 #define __STRINGTRIEBUILDER_H__ 00017 00018 #include "unicode/utypes.h" 00019 #include "unicode/uobject.h" 00020 00021 // Forward declaration. 00022 struct UHashtable; 00023 typedef struct UHashtable UHashtable; 00024 00029 enum UStringTrieBuildOption { 00034 USTRINGTRIE_BUILD_FAST, 00045 USTRINGTRIE_BUILD_SMALL 00046 }; 00047 00048 U_NAMESPACE_BEGIN 00049 00056 class U_COMMON_API StringTrieBuilder : public UObject { 00057 public: 00059 static UBool hashNode(const void *node); 00061 static UBool equalNodes(const void *left, const void *right); 00062 00063 protected: 00065 StringTrieBuilder(); 00067 virtual ~StringTrieBuilder(); 00068 00070 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 00072 void deleteCompactBuilder(); 00073 00075 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 00076 00078 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 00080 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 00081 00082 class Node; 00083 00085 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 00087 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 00088 int32_t length, UErrorCode &errorCode); 00089 00091 virtual int32_t getElementStringLength(int32_t i) const = 0; 00093 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; 00095 virtual int32_t getElementValue(int32_t i) const = 0; 00096 00097 // Finds the first unit index after this one where 00098 // the first and last element have different units again. 00100 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 00101 00102 // Number of different units at unitIndex. 00104 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 00106 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 00108 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; 00109 00111 virtual UBool matchNodesCanHaveValues() const = 0; 00112 00114 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 00116 virtual int32_t getMinLinearMatch() const = 0; 00118 virtual int32_t getMaxLinearMatchLength() const = 0; 00119 00120 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 00122 static const int32_t kMaxBranchLinearSubNodeLength=5; 00123 00124 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. 00125 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 00127 static const int32_t kMaxSplitBranchLevels=14; 00128 00139 Node *registerNode(Node *newNode, UErrorCode &errorCode); 00150 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 00151 00152 /* 00153 * C++ note: 00154 * registerNode() and registerFinalValue() take ownership of their input nodes, 00155 * and only return owned nodes. 00156 * If they see a failure UErrorCode, they will delete the input node. 00157 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 00158 * If there is a failure, they return NULL. 00159 * 00160 * NULL Node pointers can be safely passed into other Nodes because 00161 * they call the static Node::hashCode() which checks for a NULL pointer first. 00162 * 00163 * Therefore, as long as builder functions register a new node, 00164 * they need to check for failures only before explicitly dereferencing 00165 * a Node pointer, or before setting a new UErrorCode. 00166 */ 00167 00168 // Hash set of nodes, maps from nodes to integer 1. 00170 UHashtable *nodes; 00171 00173 class Node : public UObject { 00174 public: 00175 Node(int32_t initialHash) : hash(initialHash), offset(0) {} 00176 inline int32_t hashCode() const { return hash; } 00177 // Handles node==NULL. 00178 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 00179 // Base class operator==() compares the actual class types. 00180 virtual UBool operator==(const Node &other) const; 00181 inline UBool operator!=(const Node &other) const { return !operator==(other); } 00209 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00210 // write() must set the offset to a positive value. 00211 virtual void write(StringTrieBuilder &builder) = 0; 00212 // See markRightEdgesFirst. 00213 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 00214 StringTrieBuilder &builder) { 00215 // Note: Edge numbers are negative, lastRight<=firstRight. 00216 // If offset>0 then this node and its sub-nodes have been written already 00217 // and we need not write them again. 00218 // If this node is part of the unwritten right branch edge, 00219 // then we wait until that is written. 00220 if(offset<0 && (offset<lastRight || firstRight<offset)) { 00221 write(builder); 00222 } 00223 } 00224 inline int32_t getOffset() const { return offset; } 00225 protected: 00226 int32_t hash; 00227 int32_t offset; 00228 private: 00229 // No ICU "poor man's RTTI" for this class nor its subclasses. 00230 virtual UClassID getDynamicClassID() const; 00231 }; 00232 00233 // This class should not be overridden because 00234 // registerFinalValue() compares a stack-allocated FinalValueNode 00235 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 00236 // with the input node, and the 00237 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 00238 // will be false if the typeid's are different. 00240 class FinalValueNode : public Node { 00241 public: 00242 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} 00243 virtual UBool operator==(const Node &other) const; 00244 virtual void write(StringTrieBuilder &builder); 00245 protected: 00246 int32_t value; 00247 }; 00248 00250 class ValueNode : public Node { 00251 public: 00252 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 00253 virtual UBool operator==(const Node &other) const; 00254 void setValue(int32_t v) { 00255 hasValue=TRUE; 00256 value=v; 00257 hash=hash*37+v; 00258 } 00259 protected: 00260 UBool hasValue; 00261 int32_t value; 00262 }; 00263 00265 class IntermediateValueNode : public ValueNode { 00266 public: 00267 IntermediateValueNode(int32_t v, Node *nextNode) 00268 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } 00269 virtual UBool operator==(const Node &other) const; 00270 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00271 virtual void write(StringTrieBuilder &builder); 00272 protected: 00273 Node *next; 00274 }; 00275 00277 class LinearMatchNode : public ValueNode { 00278 public: 00279 LinearMatchNode(int32_t len, Node *nextNode) 00280 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), 00281 length(len), next(nextNode) {} 00282 virtual UBool operator==(const Node &other) const; 00283 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00284 protected: 00285 int32_t length; 00286 Node *next; 00287 }; 00288 00290 class BranchNode : public Node { 00291 public: 00292 BranchNode(int32_t initialHash) : Node(initialHash) {} 00293 protected: 00294 int32_t firstEdgeNumber; 00295 }; 00296 00298 class ListBranchNode : public BranchNode { 00299 public: 00300 ListBranchNode() : BranchNode(0x444444), length(0) {} 00301 virtual UBool operator==(const Node &other) const; 00302 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00303 virtual void write(StringTrieBuilder &builder); 00304 // Adds a unit with a final value. 00305 void add(int32_t c, int32_t value) { 00306 units[length]=(UChar)c; 00307 equal[length]=NULL; 00308 values[length]=value; 00309 ++length; 00310 hash=(hash*37+c)*37+value; 00311 } 00312 // Adds a unit which leads to another match node. 00313 void add(int32_t c, Node *node) { 00314 units[length]=(UChar)c; 00315 equal[length]=node; 00316 values[length]=0; 00317 ++length; 00318 hash=(hash*37+c)*37+hashCode(node); 00319 } 00320 protected: 00321 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 00322 int32_t length; 00323 int32_t values[kMaxBranchLinearSubNodeLength]; 00324 UChar units[kMaxBranchLinearSubNodeLength]; 00325 }; 00326 00328 class SplitBranchNode : public BranchNode { 00329 public: 00330 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 00331 : BranchNode(((0x555555*37+middleUnit)*37+ 00332 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), 00333 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 00334 virtual UBool operator==(const Node &other) const; 00335 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00336 virtual void write(StringTrieBuilder &builder); 00337 protected: 00338 UChar unit; 00339 Node *lessThan; 00340 Node *greaterOrEqual; 00341 }; 00342 00343 // Branch head node, for writing the actual node lead unit. 00345 class BranchHeadNode : public ValueNode { 00346 public: 00347 BranchHeadNode(int32_t len, Node *subNode) 00348 : ValueNode((0x666666*37+len)*37+hashCode(subNode)), 00349 length(len), next(subNode) {} 00350 virtual UBool operator==(const Node &other) const; 00351 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 00352 virtual void write(StringTrieBuilder &builder); 00353 protected: 00354 int32_t length; 00355 Node *next; // A branch sub-node. 00356 }; 00357 00359 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 00360 Node *nextNode) const = 0; 00361 00363 virtual int32_t write(int32_t unit) = 0; 00365 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 00367 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 00369 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 00371 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 00372 00373 private: 00374 // No ICU "poor man's RTTI" for this class nor its subclasses. 00375 virtual UClassID getDynamicClassID() const; 00376 }; 00377 00378 U_NAMESPACE_END 00379 00380 #endif // __STRINGTRIEBUILDER_H__