ICU 4.8.1.1
4.8.1.1
|
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2011, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 //#define REGEX_DEBUG 00020 00045 #include "unicode/utypes.h" 00046 00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00048 00049 #include "unicode/uobject.h" 00050 #include "unicode/unistr.h" 00051 #include "unicode/utext.h" 00052 #include "unicode/parseerr.h" 00053 00054 #include "unicode/uregex.h" 00055 00056 U_NAMESPACE_BEGIN 00057 00058 00059 // Forward Declarations... 00060 00061 class RegexMatcher; 00062 class RegexPattern; 00063 class UVector; 00064 class UVector32; 00065 class UVector64; 00066 class UnicodeSet; 00067 struct REStackFrame; 00068 struct Regex8BitSet; 00069 class RuleBasedBreakIterator; 00070 class RegexCImpl; 00071 00072 00073 00074 00079 #ifdef REGEX_DEBUG 00080 U_INTERNAL void U_EXPORT2 00081 RegexPatternDump(const RegexPattern *pat); 00082 #else 00083 #undef RegexPatternDump 00084 #define RegexPatternDump(pat) 00085 #endif 00086 00087 00088 00100 class U_I18N_API RegexPattern: public UObject { 00101 public: 00102 00110 RegexPattern(); 00111 00118 RegexPattern(const RegexPattern &source); 00119 00125 virtual ~RegexPattern(); 00126 00135 UBool operator==(const RegexPattern& that) const; 00136 00145 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} 00146 00152 RegexPattern &operator =(const RegexPattern &source); 00153 00161 virtual RegexPattern *clone() const; 00162 00163 00188 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00189 UParseError &pe, 00190 UErrorCode &status); 00191 00192 00219 static RegexPattern * U_EXPORT2 compile( UText *regex, 00220 UParseError &pe, 00221 UErrorCode &status); 00222 00247 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00248 uint32_t flags, 00249 UParseError &pe, 00250 UErrorCode &status); 00251 00252 00279 static RegexPattern * U_EXPORT2 compile( UText *regex, 00280 uint32_t flags, 00281 UParseError &pe, 00282 UErrorCode &status); 00283 00284 00307 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00308 uint32_t flags, 00309 UErrorCode &status); 00310 00311 00336 static RegexPattern * U_EXPORT2 compile( UText *regex, 00337 uint32_t flags, 00338 UErrorCode &status); 00339 00340 00346 virtual uint32_t flags() const; 00347 00365 virtual RegexMatcher *matcher(const UnicodeString &input, 00366 UErrorCode &status) const; 00367 00368 private: 00382 RegexMatcher *matcher(const UChar *input, 00383 UErrorCode &status) const; 00384 public: 00385 00386 00398 virtual RegexMatcher *matcher(UErrorCode &status) const; 00399 00400 00415 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 00416 const UnicodeString &input, 00417 UParseError &pe, 00418 UErrorCode &status); 00419 00420 00435 static UBool U_EXPORT2 matches(UText *regex, 00436 UText *input, 00437 UParseError &pe, 00438 UErrorCode &status); 00439 00440 00449 virtual UnicodeString pattern() const; 00450 00451 00462 virtual UText *patternText(UErrorCode &status) const; 00463 00464 00503 virtual int32_t split(const UnicodeString &input, 00504 UnicodeString dest[], 00505 int32_t destCapacity, 00506 UErrorCode &status) const; 00507 00508 00547 virtual int32_t split(UText *input, 00548 UText *dest[], 00549 int32_t destCapacity, 00550 UErrorCode &status) const; 00551 00552 00558 virtual UClassID getDynamicClassID() const; 00559 00565 static UClassID U_EXPORT2 getStaticClassID(); 00566 00567 private: 00568 // 00569 // Implementation Data 00570 // 00571 UText *fPattern; // The original pattern string. 00572 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 00573 uint32_t fFlags; // The flags used when compiling the pattern. 00574 // 00575 UVector64 *fCompiledPat; // The compiled pattern p-code. 00576 UnicodeString fLiteralText; // Any literal string data from the pattern, 00577 // after un-escaping, for use during the match. 00578 00579 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00580 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00581 00582 00583 UErrorCode fDeferredStatus; // status if some prior error has left this 00584 // RegexPattern in an unusable state. 00585 00586 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00587 // >= this value. For some patterns, this calculated 00588 // value may be less than the true shortest 00589 // possible match. 00590 00591 int32_t fFrameSize; // Size of a state stack frame in the 00592 // execution engine. 00593 00594 int32_t fDataSize; // The size of the data needed by the pattern that 00595 // does not go on the state stack, but has just 00596 // a single copy per matcher. 00597 00598 UVector32 *fGroupMap; // Map from capture group number to position of 00599 // the group's variables in the matcher stack frame. 00600 00601 int32_t fMaxCaptureDigits; 00602 00603 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00604 // regex character classes, e.g. Word. 00605 00606 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00607 // sets for predefined regex classes. 00608 00609 int32_t fStartType; // Info on how a match must start. 00610 int32_t fInitialStringIdx; // 00611 int32_t fInitialStringLen; 00612 UnicodeSet *fInitialChars; 00613 UChar32 fInitialChar; 00614 Regex8BitSet *fInitialChars8; 00615 UBool fNeedsAltInput; 00616 00617 friend class RegexCompile; 00618 friend class RegexMatcher; 00619 friend class RegexCImpl; 00620 00621 // 00622 // Implementation Methods 00623 // 00624 void init(); // Common initialization, for use by constructors. 00625 void zap(); // Common cleanup 00626 #ifdef REGEX_DEBUG 00627 void dumpOp(int32_t index) const; 00628 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 00629 #endif 00630 00631 }; 00632 00633 00634 00644 class U_I18N_API RegexMatcher: public UObject { 00645 public: 00646 00661 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 00662 00678 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 00679 00701 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 00702 uint32_t flags, UErrorCode &status); 00703 00725 RegexMatcher(UText *regexp, UText *input, 00726 uint32_t flags, UErrorCode &status); 00727 00728 private: 00742 RegexMatcher(const UnicodeString ®exp, const UChar *input, 00743 uint32_t flags, UErrorCode &status); 00744 public: 00745 00746 00752 virtual ~RegexMatcher(); 00753 00754 00761 virtual UBool matches(UErrorCode &status); 00762 00763 00774 virtual UBool matches(int64_t startIndex, UErrorCode &status); 00775 00776 00790 virtual UBool lookingAt(UErrorCode &status); 00791 00792 00806 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); 00807 00808 00821 virtual UBool find(); 00822 00823 00833 virtual UBool find(int64_t start, UErrorCode &status); 00834 00835 00845 virtual UnicodeString group(UErrorCode &status) const; 00846 00847 00860 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00861 00862 00868 virtual int32_t groupCount() const; 00869 00870 00885 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 00886 00902 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; 00903 00919 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 00920 00921 00929 virtual int32_t start(UErrorCode &status) const; 00930 00938 virtual int64_t start64(UErrorCode &status) const; 00939 00940 00954 virtual int32_t start(int32_t group, UErrorCode &status) const; 00955 00969 virtual int64_t start64(int32_t group, UErrorCode &status) const; 00970 00971 00985 virtual int32_t end(UErrorCode &status) const; 00986 01000 virtual int64_t end64(UErrorCode &status) const; 01001 01002 01020 virtual int32_t end(int32_t group, UErrorCode &status) const; 01021 01039 virtual int64_t end64(int32_t group, UErrorCode &status) const; 01040 01041 01050 virtual RegexMatcher &reset(); 01051 01052 01068 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); 01069 01070 01088 virtual RegexMatcher &reset(const UnicodeString &input); 01089 01090 01104 virtual RegexMatcher &reset(UText *input); 01105 01106 01131 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status); 01132 01133 private: 01147 RegexMatcher &reset(const UChar *input); 01148 public: 01149 01157 virtual const UnicodeString &input() const; 01158 01167 virtual UText *inputText() const; 01168 01179 virtual UText *getInput(UText *dest, UErrorCode &status) const; 01180 01181 01200 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status); 01201 01213 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); 01214 01223 virtual int32_t regionStart() const; 01224 01233 virtual int64_t regionStart64() const; 01234 01235 01244 virtual int32_t regionEnd() const; 01245 01254 virtual int64_t regionEnd64() const; 01255 01264 virtual UBool hasTransparentBounds() const; 01265 01284 virtual RegexMatcher &useTransparentBounds(UBool b); 01285 01286 01294 virtual UBool hasAnchoringBounds() const; 01295 01296 01309 virtual RegexMatcher &useAnchoringBounds(UBool b); 01310 01311 01324 virtual UBool hitEnd() const; 01325 01335 virtual UBool requireEnd() const; 01336 01337 01343 virtual const RegexPattern &pattern() const; 01344 01345 01362 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 01363 01364 01385 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 01386 01387 01408 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 01409 01410 01435 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 01436 01437 01465 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 01466 const UnicodeString &replacement, UErrorCode &status); 01467 01468 01496 virtual RegexMatcher &appendReplacement(UText *dest, 01497 UText *replacement, UErrorCode &status); 01498 01499 01510 virtual UnicodeString &appendTail(UnicodeString &dest); 01511 01512 01526 virtual UText *appendTail(UText *dest, UErrorCode &status); 01527 01528 01552 virtual int32_t split(const UnicodeString &input, 01553 UnicodeString dest[], 01554 int32_t destCapacity, 01555 UErrorCode &status); 01556 01557 01581 virtual int32_t split(UText *input, 01582 UText *dest[], 01583 int32_t destCapacity, 01584 UErrorCode &status); 01585 01607 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 01608 01615 virtual int32_t getTimeLimit() const; 01616 01638 virtual void setStackLimit(int32_t limit, UErrorCode &status); 01639 01647 virtual int32_t getStackLimit() const; 01648 01649 01663 virtual void setMatchCallback(URegexMatchCallback *callback, 01664 const void *context, 01665 UErrorCode &status); 01666 01667 01678 virtual void getMatchCallback(URegexMatchCallback *&callback, 01679 const void *&context, 01680 UErrorCode &status); 01681 01682 01696 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, 01697 const void *context, 01698 UErrorCode &status); 01699 01700 01711 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, 01712 const void *&context, 01713 UErrorCode &status); 01714 01715 01721 void setTrace(UBool state); 01722 01723 01729 static UClassID U_EXPORT2 getStaticClassID(); 01730 01736 virtual UClassID getDynamicClassID() const; 01737 01738 private: 01739 // Constructors and other object boilerplate are private. 01740 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 01741 RegexMatcher(); // default constructor not implemented 01742 RegexMatcher(const RegexPattern *pat); 01743 RegexMatcher(const RegexMatcher &other); 01744 RegexMatcher &operator =(const RegexMatcher &rhs); 01745 void init(UErrorCode &status); // Common initialization 01746 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 01747 01748 friend class RegexPattern; 01749 friend class RegexCImpl; 01750 public: 01752 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 01753 private: 01754 01755 // 01756 // MatchAt This is the internal interface to the match engine itself. 01757 // Match status comes back in matcher member variables. 01758 // 01759 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 01760 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 01761 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 01762 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 01763 REStackFrame *resetStack(); 01764 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 01765 void IncrementTime(UErrorCode &status); 01766 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); 01767 01768 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 01769 01770 UBool findUsingChunk(); 01771 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 01772 UBool isChunkWordBoundary(int32_t pos); 01773 01774 const RegexPattern *fPattern; 01775 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 01776 // should delete it when through. 01777 01778 const UnicodeString *fInput; // The string being matched. Only used for input() 01779 UText *fInputText; // The text being matched. Is never NULL. 01780 UText *fAltInputText; // A shallow copy of the text being matched. 01781 // Only created if the pattern contains backreferences. 01782 int64_t fInputLength; // Full length of the input text. 01783 int32_t fFrameSize; // The size of a frame in the backtrack stack. 01784 01785 int64_t fRegionStart; // Start of the input region, default = 0. 01786 int64_t fRegionLimit; // End of input region, default to input.length. 01787 01788 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 01789 int64_t fAnchorLimit; // See useAnchoringBounds 01790 01791 int64_t fLookStart; // Region bounds for look-ahead/behind and 01792 int64_t fLookLimit; // and other boundary tests. See 01793 // useTransparentBounds 01794 01795 int64_t fActiveStart; // Currently active bounds for matching. 01796 int64_t fActiveLimit; // Usually is the same as region, but 01797 // is changed to fLookStart/Limit when 01798 // entering look around regions. 01799 01800 UBool fTransparentBounds; // True if using transparent bounds. 01801 UBool fAnchoringBounds; // True if using anchoring bounds. 01802 01803 UBool fMatch; // True if the last attempted match was successful. 01804 int64_t fMatchStart; // Position of the start of the most recent match 01805 int64_t fMatchEnd; // First position after the end of the most recent match 01806 // Zero if no previous match, even when a region 01807 // is active. 01808 int64_t fLastMatchEnd; // First position after the end of the previous match, 01809 // or -1 if there was no previous match. 01810 int64_t fAppendPosition; // First position after the end of the previous 01811 // appendReplacement(). As described by the 01812 // JavaDoc for Java Matcher, where it is called 01813 // "append position" 01814 UBool fHitEnd; // True if the last match touched the end of input. 01815 UBool fRequireEnd; // True if the last match required end-of-input 01816 // (matched $ or Z) 01817 01818 UVector64 *fStack; 01819 REStackFrame *fFrame; // After finding a match, the last active stack frame, 01820 // which will contain the capture group results. 01821 // NOT valid while match engine is running. 01822 01823 int64_t *fData; // Data area for use by the compiled pattern. 01824 int64_t fSmallData[8]; // Use this for data if it's enough. 01825 01826 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 01827 // match engine run. Zero for unlimited. 01828 01829 int32_t fTime; // Match time, accumulates while matching. 01830 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 01831 // Kept separately from fTime to keep as much 01832 // code as possible out of the inline 01833 // StateSave function. 01834 01835 int32_t fStackLimit; // Maximum memory size to use for the backtrack 01836 // stack, in bytes. Zero for unlimited. 01837 01838 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 01839 // NULL if there is no callback. 01840 const void *fCallbackContext; // User Context ptr for callback function. 01841 01842 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to match progress callback funct. 01843 // NULL if there is no callback. 01844 const void *fFindProgressCallbackContext; // User Context ptr for callback function. 01845 01846 01847 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 01848 01849 UBool fTraceDebug; // Set true for debug tracing of match engine. 01850 01851 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 01852 // reported, or that permanently disables this matcher. 01853 01854 RuleBasedBreakIterator *fWordBreakItr; 01855 01856 01857 }; 01858 01859 U_NAMESPACE_END 01860 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 01861 #endif