ICU 4.8.1.1  4.8.1.1
uniset.h
Go to the documentation of this file.
00001 /*
00002 ***************************************************************************
00003 * Copyright (C) 1999-2011, International Business Machines Corporation
00004 * and others. All Rights Reserved.
00005 ***************************************************************************
00006 *   Date        Name        Description
00007 *   10/20/99    alan        Creation.
00008 ***************************************************************************
00009 */
00010 
00011 #ifndef UNICODESET_H
00012 #define UNICODESET_H
00013 
00014 #include "unicode/unifilt.h"
00015 #include "unicode/unistr.h"
00016 #include "unicode/uset.h"
00017 
00023 U_NAMESPACE_BEGIN
00024 
00025 class BMPSet;                 // referenced below only as BMPSet* (member bmpSet)
00026 class ParsePosition;          // referenced below only by reference (pattern-parsing parameters)
00027 class SymbolTable;            // referenced below only as const SymbolTable* parameters
00028 class UnicodeSetStringSpan;   // referenced below only as UnicodeSetStringSpan* (member stringSpan)
00029 class UVector;                // referenced below only as UVector* (member strings)
00030 class RuleCharacterIterator;  // referenced below only by reference (private pattern parsing)
00031 
00272 class U_COMMON_API UnicodeSet : public UnicodeFilter {
      // UnicodeSet is a UnicodeFilter subclass that holds code points in a
      // UChar32 array (`list`) and multi-character strings in a UVector
      // (`strings`).  The declarations below cover construction from ranges and
      // patterns, membership tests, span scanning, set algebra, range
      // enumeration and serialization.
      //
      // NOTE(review): this text comes from a line-numbered source listing in
      // which the original API documentation blocks are absent.  The comments
      // added here are derived from the declarations and from the inline
      // definitions that follow the class; anything worded "presumably" is an
      // inference from a name or signature and should be confirmed against the
      // implementation.
00273 
          // ---- Data members -------------------------------------------------
          // Member order determines object layout; objects of this class are
          // reinterpret_cast to/from USet* by fromUSet()/toUSet().
00274     int32_t len; // length of list used; 0 <= len <= capacity
00275     int32_t capacity; // capacity of list
00276     UChar32* list; // MUST be terminated with HIGH
00277     BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
00278     UChar32* buffer; // internal buffer, may be NULL
00279     int32_t bufferCapacity; // capacity of buffer
          // presumably the length of `pat` -- TODO confirm against setPattern()
00280     int32_t patLen;
00281 
          // Pattern text storage; presumably written by setPattern() and freed
          // by releasePattern() (both private, declared near the end of the
          // class) -- confirm in the implementation.
00291     UChar *pat;
00292     UVector* strings; // maintained in sorted order
          // Non-NULL on a frozen set (see the bmpSet comment and isFrozen()).
00293     UnicodeSetStringSpan *stringSpan;
00294 
00295 private:
00296     enum { // constants
00297         kIsBogus = 1       // This set is bogus (i.e. not valid)
00298     };
00299     uint8_t fFlags;         // Bit flag (see constants above)
00300 public:
          // Returns the kIsBogus bit of fFlags (inline definition follows the
          // class).
00310     inline UBool isBogus(void) const;
00311     
          // Presumably marks this set as bogus by setting kIsBogus; the body is
          // not in this header -- confirm.
00328     void setToBogus();
00329 
00330 public:
00331 
          // Code point bounds: 0 and 0x10ffff (the highest Unicode code point).
00332     enum {
00337         MIN_VALUE = 0,
00338 
00343         MAX_VALUE = 0x10ffff
00344     };
00345 
00346     //----------------------------------------------------------------
00347     // Constructors &c
00348     //----------------------------------------------------------------
00349 
00350 public:
00351 
          // Default constructor; presumably yields an empty set -- confirm.
00356     UnicodeSet();
00357 
          // Constructs from a code point range; presumably start..end
          // inclusive -- confirm.
00366     UnicodeSet(UChar32 start, UChar32 end);
00367 
          // Constructs from a pattern string; failures are reported through
          // `status`.
00376     UnicodeSet(const UnicodeString& pattern,
00377                UErrorCode& status);
00378 
          // As above, with parsing `options` and an optional SymbolTable
          // (pointer parameter; presumably may be NULL -- confirm).
00391     UnicodeSet(const UnicodeString& pattern,
00392                uint32_t options,
00393                const SymbolTable* symbols,
00394                UErrorCode& status);
00395 
          // As above, but parsing is tracked through the in/out ParsePosition
          // `pos` (presumably start offset in, end offset out -- confirm).
00409     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
00410                uint32_t options,
00411                const SymbolTable* symbols,
00412                UErrorCode& status);
00413 
          // Copy constructor.
00418     UnicodeSet(const UnicodeSet& o);
00419 
          // Virtual destructor.
00424     virtual ~UnicodeSet();
00425 
          // Copy assignment.
00431     UnicodeSet& operator=(const UnicodeSet& o);
00432 
          // Equality comparison (virtual); operator!= below is defined inline
          // as its negation.
00444     virtual UBool operator==(const UnicodeSet& o) const;
00445 
00451     UBool operator!=(const UnicodeSet& o) const;
00452 
          // Returns a copy typed as UnicodeFunctor*; presumably heap-allocated
          // and owned by the caller -- confirm.
00462     virtual UnicodeFunctor* clone() const;
00463 
          // Hash value for this set.
00471     virtual int32_t hashCode(void) const;
00472 
          // fromUSet()/toUSet(): pointer conversions between the C handle type
          // USet and UnicodeSet.  The inline definitions after the class are
          // plain reinterpret_casts; no object is created or copied.
00481     inline static UnicodeSet *fromUSet(USet *uset);
00482 
00491     inline static const UnicodeSet *fromUSet(const USet *uset);
00492     
00500     inline USet *toUSet();
00501 
00502 
00510     inline const USet * toUSet() const;
00511 
00512 
00513     //----------------------------------------------------------------
00514     // Freezable API
00515     //----------------------------------------------------------------
00516 
          // TRUE iff bmpSet or stringSpan is non-NULL (inline definition
          // follows the class).
00525     inline UBool isFrozen() const;
00526 
          // Presumably makes the set immutable and returns this object as a
          // UnicodeFunctor* -- confirm.
00540     UnicodeFunctor *freeze();
00541 
          // Presumably returns a new, modifiable copy -- confirm ownership.
00550     UnicodeFunctor *cloneAsThawed() const;
00551 
00552     //----------------------------------------------------------------
00553     // Public API
00554     //----------------------------------------------------------------
00555 
          // Presumably replaces the contents with the range start..end;
          // returns *this-style reference for chaining -- confirm.
00566     UnicodeSet& set(UChar32 start, UChar32 end);
00567 
          // Static check of whether `pattern` at offset `pos` looks like a set
          // pattern (exact criteria are in the implementation).
00573     static UBool resemblesPattern(const UnicodeString& pattern,
00574                                   int32_t pos);
00575 
          // applyPattern overloads: same parameter shapes as the pattern
          // constructors above; each returns a UnicodeSet& and reports errors
          // through `status`.
00588     UnicodeSet& applyPattern(const UnicodeString& pattern,
00589                              UErrorCode& status);
00590 
00607     UnicodeSet& applyPattern(const UnicodeString& pattern,
00608                              uint32_t options,
00609                              const SymbolTable* symbols,
00610                              UErrorCode& status);
00611 
00643     UnicodeSet& applyPattern(const UnicodeString& pattern,
00644                              ParsePosition& pos,
00645                              uint32_t options,
00646                              const SymbolTable* symbols,
00647                              UErrorCode& status);
00648 
          // Writes a pattern representation into `result` and returns a
          // UnicodeString&; escapeUnprintable defaults to FALSE.
00662     virtual UnicodeString& toPattern(UnicodeString& result,
00663                              UBool escapeUnprintable = FALSE) const;
00664 
          // Property-based population: by (UProperty, integer value) or by
          // property/value alias strings; errors are reported through `ec`.
00687     UnicodeSet& applyIntPropertyValue(UProperty prop,
00688                                       int32_t value,
00689                                       UErrorCode& ec);
00690 
00720     UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
00721                                    const UnicodeString& value,
00722                                    UErrorCode& ec);
00723 
          // Presumably the number of elements in the set -- confirm whether
          // strings are counted.
00732     virtual int32_t size(void) const;
00733 
00740     virtual UBool isEmpty(void) const;
00741 
          // Membership tests for a single code point, a code point range, and
          // a string.
00749     virtual UBool contains(UChar32 c) const;
00750 
00759     virtual UBool contains(UChar32 start, UChar32 end) const;
00760 
00768     UBool contains(const UnicodeString& s) const;
00769 
          // containsAll / containsNone against another set, a string, or (for
          // containsNone) a code point range.
00777     virtual UBool containsAll(const UnicodeSet& c) const;
00778 
00786     UBool containsAll(const UnicodeString& s) const;
00787 
00796     UBool containsNone(UChar32 start, UChar32 end) const;
00797 
00805     UBool containsNone(const UnicodeSet& c) const;
00806 
00814     UBool containsNone(const UnicodeString& s) const;
00815 
          // containsSome overloads: each is defined inline after the class as
          // the negation of the matching containsNone overload.
00824     inline UBool containsSome(UChar32 start, UChar32 end) const;
00825 
00833     inline UBool containsSome(const UnicodeSet& s) const;
00834 
00842     inline UBool containsSome(const UnicodeString& s) const;
00843 
          // span/spanBack over a UChar buffer of `length` units under the given
          // USetSpanCondition; return value is an int32_t offset/length
          // (precise meaning is defined by the implementation).
00862     int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00863 
          // UnicodeString overload: clamps `start` to [0, s.length()], calls the
          // UChar* overload on the remainder, and returns start + that result
          // (inline definition follows the class).
00876     inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
00877 
00895     int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
00896 
          // UnicodeString overload: clamps `limit` to [0, s.length()] and calls
          // the UChar* overload with it as the length (inline definition
          // follows the class).
00910     inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
00911 
          // Same operations over a char buffer; per the names, the text is
          // UTF-8.
00930     int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00931 
00949     int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00950 
          // Matching entry point against Replaceable text; `offset` is in/out.
          // Presumably implements the UnicodeMatcher protocol inherited via
          // UnicodeFilter -- confirm.
00955     virtual UMatchDegree matches(const Replaceable& text,
00956                          int32_t& offset,
00957                          int32_t limit,
00958                          UBool incremental);
00959 
00960 private:
          // Private helper for matching string `s` against `text` between
          // start and limit; return convention is in the implementation.
00983     static int32_t matchRest(const Replaceable& text,
00984                              int32_t start, int32_t limit,
00985                              const UnicodeString& s);
00986 
          // Private lookup of code point `c`; presumably returns an index into
          // `list` -- confirm.
00996     int32_t findCodePoint(UChar32 c) const;
00997 
00998 public:
00999 
          // Presumably unions the characters this set can match into
          // `toUnionTo` -- confirm.
01007     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
01008 
          // Index <-> code point mapping; presumably indexOf() and charAt() are
          // inverses over the set's code points -- confirm error returns.
01017     int32_t indexOf(UChar32 c) const;
01018 
01028     UChar32 charAt(int32_t index) const;
01029 
          // add overloads: a code point range, a single code point, a string.
          // Each returns a UnicodeSet& (presumably *this, for chaining).
01044     virtual UnicodeSet& add(UChar32 start, UChar32 end);
01045 
01053     UnicodeSet& add(UChar32 c);
01054 
01066     UnicodeSet& add(const UnicodeString& s);
01067 
01068  private:
          // Private helpers used by the string-taking operations; presumably
          // getSingleCP() distinguishes a one-code-point string from a longer
          // one -- confirm the return convention.
01074     static int32_t getSingleCP(const UnicodeString& s);
01075 
01076     void _add(const UnicodeString& s);
01077 
01078  public:
          // *All(const UnicodeString&) variants; presumably they operate on
          // each code point of `s` rather than on `s` as one string -- confirm.
01087     UnicodeSet& addAll(const UnicodeString& s);
01088 
01097     UnicodeSet& retainAll(const UnicodeString& s);
01098 
01107     UnicodeSet& complementAll(const UnicodeString& s);
01108 
01117     UnicodeSet& removeAll(const UnicodeString& s);
01118 
          // Static factories returning UnicodeSet*; presumably heap-allocated
          // and owned by the caller -- confirm.
01127     static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
01128 
01129 
01137     static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
01138 
          // retain / remove / complement overloads mirror the add overloads
          // above (range, single code point, and for remove/complement a
          // string); complement(void) takes no argument.
01152     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
01153 
01154 
01160     UnicodeSet& retain(UChar32 c);
01161 
01175     virtual UnicodeSet& remove(UChar32 start, UChar32 end);
01176 
01184     UnicodeSet& remove(UChar32 c);
01185 
01195     UnicodeSet& remove(const UnicodeString& s);
01196 
01204     virtual UnicodeSet& complement(void);
01205 
01220     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
01221 
01229     UnicodeSet& complement(UChar32 c);
01230 
01241     UnicodeSet& complement(const UnicodeString& s);
01242 
          // Set-with-set operations.
01255     virtual UnicodeSet& addAll(const UnicodeSet& c);
01256 
01268     virtual UnicodeSet& retainAll(const UnicodeSet& c);
01269 
01281     virtual UnicodeSet& removeAll(const UnicodeSet& c);
01282 
01293     virtual UnicodeSet& complementAll(const UnicodeSet& c);
01294 
01301     virtual UnicodeSet& clear(void);
01302 
          // `attribute` is an int32_t selector; presumably a bit mask of USet
          // closure options from unicode/uset.h -- confirm.
01328     UnicodeSet& closeOver(int32_t attribute);
01329 
01336     virtual UnicodeSet &removeAllStrings();
01337 
          // Range enumeration: count, then start/end of the range at `index`.
01345     virtual int32_t getRangeCount(void) const;
01346 
01354     virtual UChar32 getRangeStart(int32_t index) const;
01355 
01363     virtual UChar32 getRangeEnd(int32_t index) const;
01364 
          // Writes a uint16_t representation into dest (capacity destCapacity);
          // errors via `ec`.  The format and the meaning of the return value
          // are defined by the implementation.
01413     int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
01414 
          // Presumably releases unused storage -- confirm.
01421     virtual UnicodeSet& compact();
01422 
          // Class-ID accessors (static and per-object virtual).
01434     static UClassID U_EXPORT2 getStaticClassID(void);
01435 
01444     virtual UClassID getDynamicClassID(void) const;
01445 
01446 private:
01447 
01448     // Private API for the USet API
01449 
01450     friend class USetAccess;
01451 
01452     int32_t getStringCount() const;
01453 
01454     const UnicodeString* getString(int32_t index) const;
01455 
01456     //----------------------------------------------------------------
01457     // RuleBasedTransliterator support
01458     //----------------------------------------------------------------
01459 
01460 private:
01461 
01467     virtual UBool matchesIndexValue(uint8_t v) const;
01468 
01469 private:
01470 
01471     //----------------------------------------------------------------
01472     // Implementation: Clone as thawed (see ICU4J Freezable)
01473     //----------------------------------------------------------------
01474 
          // The UBool parameter is unnamed in this declaration; it appears to
          // serve only to select this overload over the public copy
          // constructor -- confirm in the implementation.
01475     UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
01476 
01477     //----------------------------------------------------------------
01478     // Implementation: Pattern parsing
01479     //----------------------------------------------------------------
01480 
01481     void applyPattern(RuleCharacterIterator& chars,
01482                       const SymbolTable* symbols,
01483                       UnicodeString& rebuiltPat,
01484                       uint32_t options,
01485                       UErrorCode& ec);
01486 
01487     //----------------------------------------------------------------
01488     // Implementation: Utility methods
01489     //----------------------------------------------------------------
01490 
          // Capacity helpers for `list` and `buffer` respectively (per their
          // names); allocation failure is presumably reported via `ec`.
01491     void ensureCapacity(int32_t newLen, UErrorCode& ec);
01492 
01493     void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
01494 
01495     void swapBuffers(void);
01496 
01497     UBool allocateStrings(UErrorCode &status);
01498 
01499     UnicodeString& _toPattern(UnicodeString& result,
01500                               UBool escapeUnprintable) const;
01501 
01502     UnicodeString& _generatePattern(UnicodeString& result,
01503                                     UBool escapeUnprintable) const;
01504 
01505     static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
01506 
01507     static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
01508 
01509     //----------------------------------------------------------------
01510     // Implementation: Fundamental operators
01511     //----------------------------------------------------------------
01512 
          // Each takes a raw UChar32 array with its length plus an int8_t
          // `polarity`; the encoding of polarity is defined in the
          // implementation.
01513     void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
01514 
01515     void add(const UChar32* other, int32_t otherLen, int8_t polarity);
01516 
01517     void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
01518 
01524     static UBool resemblesPropertyPattern(const UnicodeString& pattern,
01525                                           int32_t pos);
01526 
01527     static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
01528                                           int32_t iterOpts);
01529 
01569     UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
01570                                      ParsePosition& ppos,
01571                                      UErrorCode &ec);
01572 
01573     void applyPropertyPattern(RuleCharacterIterator& chars,
01574                               UnicodeString& rebuiltPat,
01575                               UErrorCode& ec);
01576 
01577     static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
01578 
          // Predicate callback type: receives a code point and an opaque
          // context pointer, returns UBool.
01583     typedef UBool (*Filter)(UChar32 codePoint, void* context);
01584 
          // Takes a Filter, its context, a source selector `src` (same
          // parameter name as getInclusions) and a status code; presumably
          // rebuilds the set from code points accepted by `filter` -- confirm.
01594     void applyFilter(Filter filter,
01595                      void* context,
01596                      int32_t src,
01597                      UErrorCode &status);
01598 
          // Pattern-storage helpers; presumably they manage `pat`/`patLen` --
          // confirm.
01602     void setPattern(const UnicodeString& newPat);
01606     void releasePattern();
01607 
01608     friend class UnicodeSetIterator;
01609 };
01610 
01611 
01612 
01613 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
01614     return !operator==(o);
01615 }
01616 
01617 inline UBool UnicodeSet::isFrozen() const {
01618     return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
01619 }
01620 
01621 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
01622     return !containsNone(start, end);
01623 }
01624 
01625 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
01626     return !containsNone(s);
01627 }
01628 
01629 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
01630     return !containsNone(s);
01631 }
01632 
01633 inline UBool UnicodeSet::isBogus() const {
01634     return (UBool)(fFlags & kIsBogus);
01635 }
01636 
01637 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
01638     return reinterpret_cast<UnicodeSet *>(uset);
01639 }
01640 
01641 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
01642     return reinterpret_cast<const UnicodeSet *>(uset);
01643 }
01644 
01645 inline USet *UnicodeSet::toUSet() {
01646     return reinterpret_cast<USet *>(this);
01647 }
01648 
01649 inline const USet *UnicodeSet::toUSet() const {
01650     return reinterpret_cast<const USet *>(this);
01651 }
01652 
01653 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
01654     int32_t sLength=s.length();
01655     if(start<0) {
01656         start=0;
01657     } else if(start>sLength) {
01658         start=sLength;
01659     }
01660     return start+span(s.getBuffer()+start, sLength-start, spanCondition);
01661 }
01662 
01663 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
01664     int32_t sLength=s.length();
01665     if(limit<0) {
01666         limit=0;
01667     } else if(limit>sLength) {
01668         limit=sLength;
01669     }
01670     return spanBack(s.getBuffer(), limit, spanCondition);
01671 }
01672 
01673 U_NAMESPACE_END
01674 
01675 #endif
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines