ICU 4.8.1.1  4.8.1.1
stringtriebuilder.h
00001 /*
00002 *******************************************************************************
00003 *   Copyright (C) 2010-2011, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *******************************************************************************
00006 *   file name:  stringtriebuilder.h
00007 *   encoding:   US-ASCII
00008 *   tab size:   8 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2010dec24
00012 *   created by: Markus W. Scherer
00013 */
00014 
00015 #ifndef __STRINGTRIEBUILDER_H__
00016 #define __STRINGTRIEBUILDER_H__
00017 
00018 #include "unicode/utypes.h"
00019 #include "unicode/uobject.h"
00020 
00021 // Forward declaration.
00022 struct UHashtable;
00023 typedef struct UHashtable UHashtable;
00024 
00029 enum UStringTrieBuildOption {
00034     USTRINGTRIE_BUILD_FAST,
00045     USTRINGTRIE_BUILD_SMALL
00046 };
00047 
00048 U_NAMESPACE_BEGIN
00049 
00056 class U_COMMON_API StringTrieBuilder : public UObject {
00057 public:
00059     static UBool hashNode(const void *node);
00061     static UBool equalNodes(const void *left, const void *right);
00062 
00063 protected:
00065     StringTrieBuilder();
00067     virtual ~StringTrieBuilder();
00068 
00070     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
00072     void deleteCompactBuilder();
00073 
00075     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
00076 
00078     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
00080     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
00081 
00082     class Node;
00083 
00085     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
00087     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
00088                             int32_t length, UErrorCode &errorCode);
00089 
00091     virtual int32_t getElementStringLength(int32_t i) const = 0;
00093     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
00095     virtual int32_t getElementValue(int32_t i) const = 0;
00096 
00097     // Finds the first unit index after this one where
00098     // the first and last element have different units again.
00100     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
00101 
00102     // Number of different units at unitIndex.
00104     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
00106     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
00108     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
00109 
00111     virtual UBool matchNodesCanHaveValues() const = 0;
00112 
00114     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
00116     virtual int32_t getMinLinearMatch() const = 0;
00118     virtual int32_t getMaxLinearMatchLength() const = 0;
00119 
00120     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
00122     static const int32_t kMaxBranchLinearSubNodeLength=5;
00123 
00124     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
00125     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
00127     static const int32_t kMaxSplitBranchLevels=14;
00128 
00139     Node *registerNode(Node *newNode, UErrorCode &errorCode);
00150     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
00151 
00152     /*
00153      * C++ note:
00154      * registerNode() and registerFinalValue() take ownership of their input nodes,
00155      * and only return owned nodes.
00156      * If they see a failure UErrorCode, they will delete the input node.
00157      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
00158      * If there is a failure, they return NULL.
00159      *
00160      * NULL Node pointers can be safely passed into other Nodes because
00161      * they call the static Node::hashCode() which checks for a NULL pointer first.
00162      *
00163      * Therefore, as long as builder functions register a new node,
00164      * they need to check for failures only before explicitly dereferencing
00165      * a Node pointer, or before setting a new UErrorCode.
00166      */
00167 
00168     // Hash set of nodes, maps from nodes to integer 1.
00170     UHashtable *nodes;
00171 
00173     class Node : public UObject {
00174     public:
00175         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
00176         inline int32_t hashCode() const { return hash; }
00177         // Handles node==NULL.
00178         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
00179         // Base class operator==() compares the actual class types.
00180         virtual UBool operator==(const Node &other) const;
00181         inline UBool operator!=(const Node &other) const { return !operator==(other); }
00209         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00210         // write() must set the offset to a positive value.
00211         virtual void write(StringTrieBuilder &builder) = 0;
00212         // See markRightEdgesFirst.
00213         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
00214                                                StringTrieBuilder &builder) {
00215             // Note: Edge numbers are negative, lastRight<=firstRight.
00216             // If offset>0 then this node and its sub-nodes have been written already
00217             // and we need not write them again.
00218             // If this node is part of the unwritten right branch edge,
00219             // then we wait until that is written.
00220             if(offset<0 && (offset<lastRight || firstRight<offset)) {
00221                 write(builder);
00222             }
00223         }
00224         inline int32_t getOffset() const { return offset; }
00225     protected:
00226         int32_t hash;
00227         int32_t offset;
00228     private:
00229         // No ICU "poor man's RTTI" for this class nor its subclasses.
00230         virtual UClassID getDynamicClassID() const;
00231     };
00232 
00233     // This class should not be overridden because
00234     // registerFinalValue() compares a stack-allocated FinalValueNode
00235     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
00236     // with the input node, and the
00237     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
00238     // will be false if the typeid's are different.
00240     class FinalValueNode : public Node {
00241     public:
00242         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
00243         virtual UBool operator==(const Node &other) const;
00244         virtual void write(StringTrieBuilder &builder);
00245     protected:
00246         int32_t value;
00247     };
00248 
00250     class ValueNode : public Node {
00251     public:
00252         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
00253         virtual UBool operator==(const Node &other) const;
00254         void setValue(int32_t v) {
00255             hasValue=TRUE;
00256             value=v;
00257             hash=hash*37+v;
00258         }
00259     protected:
00260         UBool hasValue;
00261         int32_t value;
00262     };
00263 
00265     class IntermediateValueNode : public ValueNode {
00266     public:
00267         IntermediateValueNode(int32_t v, Node *nextNode)
00268                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
00269         virtual UBool operator==(const Node &other) const;
00270         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00271         virtual void write(StringTrieBuilder &builder);
00272     protected:
00273         Node *next;
00274     };
00275 
00277     class LinearMatchNode : public ValueNode {
00278     public:
00279         LinearMatchNode(int32_t len, Node *nextNode)
00280                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
00281                   length(len), next(nextNode) {}
00282         virtual UBool operator==(const Node &other) const;
00283         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00284     protected:
00285         int32_t length;
00286         Node *next;
00287     };
00288 
00290     class BranchNode : public Node {
00291     public:
00292         BranchNode(int32_t initialHash) : Node(initialHash) {}
00293     protected:
00294         int32_t firstEdgeNumber;
00295     };
00296 
00298     class ListBranchNode : public BranchNode {
00299     public:
00300         ListBranchNode() : BranchNode(0x444444), length(0) {}
00301         virtual UBool operator==(const Node &other) const;
00302         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00303         virtual void write(StringTrieBuilder &builder);
00304         // Adds a unit with a final value.
00305         void add(int32_t c, int32_t value) {
00306             units[length]=(UChar)c;
00307             equal[length]=NULL;
00308             values[length]=value;
00309             ++length;
00310             hash=(hash*37+c)*37+value;
00311         }
00312         // Adds a unit which leads to another match node.
00313         void add(int32_t c, Node *node) {
00314             units[length]=(UChar)c;
00315             equal[length]=node;
00316             values[length]=0;
00317             ++length;
00318             hash=(hash*37+c)*37+hashCode(node);
00319         }
00320     protected:
00321         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
00322         int32_t length;
00323         int32_t values[kMaxBranchLinearSubNodeLength];
00324         UChar units[kMaxBranchLinearSubNodeLength];
00325     };
00326 
00328     class SplitBranchNode : public BranchNode {
00329     public:
00330         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
00331                 : BranchNode(((0x555555*37+middleUnit)*37+
00332                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
00333                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
00334         virtual UBool operator==(const Node &other) const;
00335         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00336         virtual void write(StringTrieBuilder &builder);
00337     protected:
00338         UChar unit;
00339         Node *lessThan;
00340         Node *greaterOrEqual;
00341     };
00342 
00343     // Branch head node, for writing the actual node lead unit.
00345     class BranchHeadNode : public ValueNode {
00346     public:
00347         BranchHeadNode(int32_t len, Node *subNode)
00348                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
00349                   length(len), next(subNode) {}
00350         virtual UBool operator==(const Node &other) const;
00351         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00352         virtual void write(StringTrieBuilder &builder);
00353     protected:
00354         int32_t length;
00355         Node *next;  // A branch sub-node.
00356     };
00357 
00359     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
00360                                         Node *nextNode) const = 0;
00361 
00363     virtual int32_t write(int32_t unit) = 0;
00365     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
00367     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
00369     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
00371     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
00372 
00373 private:
00374     // No ICU "poor man's RTTI" for this class nor its subclasses.
00375     virtual UClassID getDynamicClassID() const;
00376 };
00377 
00378 U_NAMESPACE_END
00379 
00380 #endif  // __STRINGTRIEBUILDER_H__
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines