ICU 4.8.1.1  4.8.1.1
regex.h
Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2011, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053 
00054 #include "unicode/uregex.h"
00055 
00056 U_NAMESPACE_BEGIN
00057 
00058 
00059 // Forward declarations.
      //   Every type below is used in this header only through a pointer, a
      //   reference or a friend declaration, so an incomplete type is enough
      //   and the defining headers do not have to be included here.
00060 
00061 class RegexMatcher;
00062 class RegexPattern;
00063 class UVector;
00064 class UVector32;
00065 class UVector64;
00066 class UnicodeSet;
00067 struct REStackFrame;
00068 struct Regex8BitSet;
00069 class  RuleBasedBreakIterator;
00070 class  RegexCImpl;
00071 
00072 
00073 
00074 
00079 #ifdef REGEX_DEBUG
      // Debug builds: declare the pattern dump helper.  It is only declared
      // here; the definition is not part of this header.
00080 U_INTERNAL void U_EXPORT2
00081     RegexPatternDump(const RegexPattern *pat);
00082 #else
      // Non-debug builds: RegexPatternDump(pat) is a macro that expands to
      // nothing, so uses of it in the sources need no #ifdef of their own.
00083     #undef RegexPatternDump
00084     #define RegexPatternDump(pat)
00085 #endif
00086 
00087 
00088 
00100 class U_I18N_API RegexPattern: public UObject {
          // RegexPattern holds a regular expression in compiled form, together
          // with the original pattern text and the flags it was compiled with
          // (see the private data members below).  The static compile()
          // overloads return RegexPattern pointers, and matcher() returns a
          // RegexMatcher pointer.
00101 public:
00102 
          // Default constructor.
00110     RegexPattern();
00111 
          // Copy constructor.
00118     RegexPattern(const RegexPattern &source);
00119 
          // Virtual destructor.
00125     virtual ~RegexPattern();
00126 
          // Equality comparison of two patterns.  The comparison criteria are
          // defined by the implementation, which is not part of this header.
00135     UBool           operator==(const RegexPattern& that) const;
00136 
          // Inequality, defined inline as the logical negation of operator==.
00145     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);}
00146 
          // Copy assignment.
00152     RegexPattern  &operator =(const RegexPattern &source);
00153 
          // Returns a pointer to a RegexPattern.
          // NOTE(review): presumably a heap-allocated copy of this pattern that
          // the caller must delete -- confirm against the ICU API docs.
00161     virtual RegexPattern  *clone() const;
00162 
00163 
          // compile(): static overloads that take pattern text and return a
          // RegexPattern pointer.  The pattern is passed either as a
          // const UnicodeString & or as a UText *, and for each of those there
          // are three parameter sets:
          //     (pe, status), (flags, pe, status) and (flags, status).
          // pe is a UParseError reference and status a UErrorCode reference,
          // both non-const, so they can be written by the callee.
          // NOTE(review): ownership of the returned pointer, the return value
          // on failure and the meaning of the individual flag bits are not
          // visible in this header -- see the ICU API docs.
          //
          // UnicodeString pattern, no flags, with parse error details.
00188     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00189         UParseError          &pe,
00190         UErrorCode           &status);
00191 
00192 
          // UText pattern, no flags, with parse error details.
00219     static RegexPattern * U_EXPORT2 compile( UText *regex,
00220         UParseError          &pe,
00221         UErrorCode           &status);
00222 
          // UnicodeString pattern, with flags and parse error details.
00247     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00248         uint32_t             flags,
00249         UParseError          &pe,
00250         UErrorCode           &status);
00251         
00252         
          // UText pattern, with flags and parse error details.
00279     static RegexPattern * U_EXPORT2 compile( UText *regex,
00280         uint32_t             flags,
00281         UParseError          &pe,
00282         UErrorCode           &status);
00283     
00284 
          // UnicodeString pattern, with flags, without parse error details.
00307     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00308         uint32_t             flags,
00309         UErrorCode           &status);
00310 
00311 
          // UText pattern, with flags, without parse error details.
00336     static RegexPattern * U_EXPORT2 compile( UText *regex,
00337         uint32_t             flags,
00338         UErrorCode           &status);
00339     
00340 
          // Returns a uint32_t flags value.
          // NOTE(review): presumably fFlags, the flags used when compiling the
          // pattern -- confirm.
00346     virtual uint32_t flags() const;
00347 
          // Returns a RegexMatcher pointer; the input text is passed as a
          // UnicodeString reference.
          // NOTE(review): ownership of the matcher and the required lifetime of
          // `input` are not visible here -- confirm before relying on either.
00365     virtual RegexMatcher *matcher(const UnicodeString &input,
00366         UErrorCode          &status) const;
00367         
00368 private:
          // Private overload taking a raw UChar pointer; being private, it
          // cannot be called by client code.
          // NOTE(review): presumably declared so that passing a UChar * fails
          // to compile instead of being implicitly converted to a temporary
          // UnicodeString -- confirm.
00382     RegexMatcher *matcher(const UChar *input,
00383         UErrorCode          &status) const;
00384 public:
00385 
00386 
          // Returns a RegexMatcher pointer without taking any input text.
          // NOTE(review): input is presumably supplied later through
          // RegexMatcher::reset(...) -- confirm.
00398     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00399 
00400 
          // Static convenience overloads taking both a pattern and an input
          // (as UnicodeString or UText) and returning a UBool.
          // NOTE(review): presumably compiles `regex` and reports whether it
          // matches `input` -- confirm the exact matching rule in the API docs.
00415     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00416         const UnicodeString   &input,
00417               UParseError     &pe,
00418               UErrorCode      &status);
00419 
00420 
00435     static UBool U_EXPORT2 matches(UText *regex,
00436         UText           *input,
00437         UParseError     &pe,
00438         UErrorCode      &status);
00439 
00440 
          // Returns a UnicodeString by value.
          // NOTE(review): presumably the original pattern text -- confirm.
00449     virtual UnicodeString pattern() const;
00450     
00451     
          // Returns a UText pointer.
          // NOTE(review): presumably the original pattern text (fPattern);
          // whether the caller may modify or close it is not visible here.
00462     virtual UText *patternText(UErrorCode      &status) const;
00463 
00464 
          // split(): takes an input text and an output array `dest` whose
          // capacity is given by destCapacity, and returns an int32_t.
          // NOTE(review): presumably splits `input` around matches of this
          // pattern and returns the number of fields stored -- confirm.
00503     virtual int32_t  split(const UnicodeString &input,
00504         UnicodeString    dest[],
00505         int32_t          destCapacity,
00506         UErrorCode       &status) const;
00507 
00508 
          // UText variant of split(); `dest` is an array of UText pointers.
00547     virtual int32_t  split(UText *input,
00548         UText            *dest[],
00549         int32_t          destCapacity,
00550         UErrorCode       &status) const;
00551 
00552 
          // Class ID accessors, one per object (virtual) and one per class
          // (static), both returning a UClassID.
00558     virtual UClassID getDynamicClassID() const;
00559 
00565     static UClassID U_EXPORT2 getStaticClassID();
00566 
00567 private:
00568     //
00569     //  Implementation Data
00570     //
00571     UText          *fPattern;      // The original pattern string.
00572     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
00573     uint32_t        fFlags;        // The flags used when compiling the pattern.
00574                                    //
00575     UVector64       *fCompiledPat; // The compiled pattern p-code.
00576     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00577                                    //   after un-escaping, for use during the match.
00578 
00579     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00580     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00581 
00582 
00583     UErrorCode      fDeferredStatus; // status if some prior error has left this
00584                                    //  RegexPattern in an unusable state.
00585 
00586     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00587                                    //   >= this value.  For some patterns, this calculated
00588                                    //   value may be less than the true shortest
00589                                    //   possible match.
00590     
00591     int32_t         fFrameSize;    // Size of a state stack frame in the
00592                                    //   execution engine.
00593 
00594     int32_t         fDataSize;     // The size of the data needed by the pattern that
00595                                    //   does not go on the state stack, but has just
00596                                    //   a single copy per matcher.
00597 
00598     UVector32       *fGroupMap;    // Map from capture group number to position of
00599                                    //   the group's variables in the matcher stack frame.
00600 
00601     int32_t         fMaxCaptureDigits;
00602 
00603     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00604                                    //   regex character classes, e.g. Word.
00605 
00606     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00607                                    //  sets for predefined regex classes.
00608 
00609     int32_t         fStartType;    // Info on how a match must start.
00610     int32_t         fInitialStringIdx;     //
00611     int32_t         fInitialStringLen;
00612     UnicodeSet     *fInitialChars;
00613     UChar32         fInitialChar;
00614     Regex8BitSet   *fInitialChars8;
00615     UBool           fNeedsAltInput;
00616 
          // These classes get direct access to the private data above.
00617     friend class RegexCompile;
00618     friend class RegexMatcher;
00619     friend class RegexCImpl;
00620 
00621     //
00622     //  Implementation Methods
00623     //
00624     void        init();            // Common initialization, for use by constructors.
00625     void        zap();             // Common cleanup
00626 #ifdef REGEX_DEBUG
          // Debug-only helpers; compiled in only when REGEX_DEBUG is defined.
00627     void        dumpOp(int32_t index) const;
00628     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00629 #endif
00630 
00631 };
00632 
00633 
00634 
00644 class U_I18N_API RegexMatcher: public UObject {
          // RegexMatcher applies a RegexPattern to input text.  As the private
          // data below shows, it keeps the input, the region and its bounds,
          // the result of the most recent match, the backtracking stack and the
          // configured limits and callbacks.  Copy construction and assignment
          // are declared private, so matchers cannot be copied or assigned.
          // NOTE(review): several method names mirror java.util.regex.Matcher
          // (the private data comments refer to it); where a comment below says
          // "presumably", the exact semantics must be confirmed in the API docs.
00645 public:
00646 
          // Constructs a matcher from pattern text (UnicodeString) and flags,
          // with no input.
          // NOTE(review): presumably compiles the pattern internally and owns
          // it (see fPatternOwned) -- confirm.
00661     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00662 
          // Same, with the pattern text supplied as a UText.
00678     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00679     
          // Constructs a matcher from pattern text and an input UnicodeString.
          // NOTE(review): `input` is taken by reference; its required lifetime
          // is not visible here -- confirm.
00701     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00702         uint32_t flags, UErrorCode &status);
00703 
          // Same, with both pattern and input supplied as UText.
00725     RegexMatcher(UText *regexp, UText *input,
00726         uint32_t flags, UErrorCode &status);
00727 
00728 private:
          // Private overload taking the input as a raw UChar pointer; being
          // private, it cannot be called by client code.
          // NOTE(review): presumably declared so that passing a UChar * fails
          // to compile instead of being implicitly converted to a temporary
          // UnicodeString -- confirm.
00742     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00743         uint32_t flags, UErrorCode &status);
00744 public:
00745 
00746 
          // Virtual destructor.
00752     virtual ~RegexMatcher();
00753 
00754 
          // matches(), lookingAt() and find() run a match attempt and return a
          // UBool.  The overloads with an int64_t argument take a start
          // position in the input.
          // NOTE(review): presumably matches() requires the whole region to
          // match, lookingAt() a match beginning at the start, and find() the
          // next match anywhere -- confirm.
00761     virtual UBool matches(UErrorCode &status);
00762 
00763 
00774     virtual UBool matches(int64_t startIndex, UErrorCode &status);
00775 
00776 
00790     virtual UBool lookingAt(UErrorCode &status);
00791 
00792 
00806     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00807 
00808 
          // Unlike the other match functions, this overload has no UErrorCode
          // parameter.
          // NOTE(review): errors are presumably kept in fDeferredStatus --
          // confirm.
00821     virtual UBool find();
00822 
00823 
00833     virtual UBool find(int64_t start, UErrorCode &status);
00834 
00835 
          // group(): access to matched text, returned as a UnicodeString by
          // value.  The overload with groupNum selects a capture group.
          // NOTE(review): presumably refers to the most recent match, with
          // group 0 being the whole match -- confirm.
00845     virtual UnicodeString group(UErrorCode &status) const;
00846 
00847 
00860     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00861 
00862 
          // Returns an int32_t count.
          // NOTE(review): presumably the number of capture groups in the
          // pattern -- confirm.
00868     virtual int32_t groupCount() const;
00869 
00870 
          // UText variants of group().  They take a destination UText pointer
          // and return a UText pointer; two of them also write an int64_t
          // length through group_len.
          // NOTE(review): the three overloads may differ in how `dest` is used
          // and in what the returned UText refers to -- confirm each one in the
          // API docs before use.
00885     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
00886 
00902     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00903 
00919     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00920 
00921 
          // start()/end(): positions of a match or of a capture group.  The
          // plain versions return int32_t, the *64 versions return int64_t
          // (match positions are stored as int64_t, see fMatchStart/fMatchEnd).
          // NOTE(review): behaviour of the 32-bit versions for positions that
          // do not fit in int32_t is not visible here -- confirm.
00929     virtual int32_t start(UErrorCode &status) const;
00930 
00938     virtual int64_t start64(UErrorCode &status) const;
00939 
00940 
00954     virtual int32_t start(int32_t group, UErrorCode &status) const;
00955 
00969     virtual int64_t start64(int32_t group, UErrorCode &status) const;
00970 
00971 
00985     virtual int32_t end(UErrorCode &status) const;
00986 
01000     virtual int64_t end64(UErrorCode &status) const;
01001 
01002 
01020     virtual int32_t end(int32_t group, UErrorCode &status) const;
01021 
01039     virtual int64_t end64(int32_t group, UErrorCode &status) const;
01040 
01041 
          // reset(): overloads that reset the matcher and return *this by
          // reference, so calls can be chained.  Variants take nothing, a
          // position (index), or new input as UnicodeString or UText.
          // NOTE(review): which parts of the state (region, bounds, limits) are
          // cleared by each overload is not visible here -- confirm.
01050     virtual RegexMatcher &reset();
01051 
01052 
01068     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01069 
01070 
01088     virtual RegexMatcher &reset(const UnicodeString &input);
01089 
01090 
01104     virtual RegexMatcher &reset(UText *input);
01105 
01106 
          // Takes a UText and returns *this by reference.
          // NOTE(review): presumably replaces the input UText without resetting
          // match state, for text that has moved but not changed -- confirm.
01131     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
01132 
01133 private:
          // Private overload taking a raw UChar pointer; being private, it
          // cannot be called by client code (same pattern as the private
          // constructor above).
01147     RegexMatcher &reset(const UChar *input);
01148 public:
01149 
          // Access to the input: as a UnicodeString reference, as the UText
          // pointer, or through a caller-supplied destination UText.
          // NOTE(review): fInput is documented below as "Only used for
          // input()"; what input() returns when the input was supplied as a
          // UText is not visible here -- confirm.
01157     virtual const UnicodeString &input() const;
01158     
01167     virtual UText *inputText() const;
01168     
01179     virtual UText *getInput(UText *dest, UErrorCode &status) const;
01180     
01181 
          // region(): sets the region of the input with int64_t bounds and
          // returns *this by reference.  The second overload additionally takes
          // a startIndex.
          // NOTE(review): whether `limit` is inclusive or exclusive, and the
          // valid range of the arguments, are not visible here -- confirm.
01200      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
01201 
01213      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01214 
          // Region start and end, in int32_t and int64_t versions (the region
          // is stored as int64_t in fRegionStart / fRegionLimit).
01223      virtual int32_t regionStart() const;
01224 
01233      virtual int64_t regionStart64() const;
01234 
01235 
01244       virtual int32_t regionEnd() const;
01245 
01254       virtual int64_t regionEnd64() const;
01255 
          // Transparent bounds: query and set.  See fTransparentBounds and
          // fLookStart / fLookLimit below, which hold the bounds used for
          // look-ahead/behind and other boundary tests.
01264       virtual UBool hasTransparentBounds() const;
01265 
01284       virtual RegexMatcher &useTransparentBounds(UBool b);
01285 
01286      
          // Anchoring bounds: query and set.  See fAnchoringBounds and
          // fAnchorStart / fAnchorLimit below, which hold the bounds used for
          // anchoring operations (^ or $).
01294       virtual UBool hasAnchoringBounds() const;
01295 
01296 
01309       virtual RegexMatcher &useAnchoringBounds(UBool b);
01310 
01311 
          // NOTE(review): presumably return fHitEnd (last match touched the end
          // of input) and fRequireEnd (last match required end-of-input) --
          // confirm.
01324       virtual UBool hitEnd() const;
01325 
01335       virtual UBool requireEnd() const;
01336 
01337 
          // Returns a const reference to a RegexPattern.
          // NOTE(review): presumably the pattern this matcher uses (fPattern)
          // -- confirm.
01343     virtual const RegexPattern &pattern() const;
01344 
01345 
          // replaceAll() / replaceFirst(): take replacement text and return the
          // result as a UnicodeString by value, or, in the UText variants,
          // through a destination UText.
          // NOTE(review): the replacement-string syntax (group references,
          // escapes) and the handling of a NULL `dest` are not visible here --
          // confirm.
01362     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01363 
01364 
01385     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01386     
01387 
01408     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01409     
01410 
01435     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01436     
01437     
          // appendReplacement() / appendTail(): incremental replacement into
          // `dest`.  The "append position" they work with is kept in
          // fAppendPosition, which is described below by reference to the
          // JavaDoc for Java Matcher.
01465     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01466         const UnicodeString &replacement, UErrorCode &status);
01467     
01468     
01496     virtual RegexMatcher &appendReplacement(UText *dest,
01497         UText *replacement, UErrorCode &status);
01498 
01499 
01510     virtual UnicodeString &appendTail(UnicodeString &dest);
01511 
01512 
01526     virtual UText *appendTail(UText *dest, UErrorCode &status);
01527 
01528 
          // split(): same parameter shape as RegexPattern::split(), but
          // non-const because it operates on this matcher.
          // NOTE(review): presumably returns the number of fields stored in
          // dest[] -- confirm.
01552     virtual int32_t  split(const UnicodeString &input,
01553         UnicodeString    dest[],
01554         int32_t          destCapacity,
01555         UErrorCode       &status);
01556 
01557 
01581     virtual int32_t  split(UText *input,
01582         UText           *dest[],
01583         int32_t          destCapacity,
01584         UErrorCode       &status);
01585     
          // Time limit for match operations.  The backing member fTimeLimit is
          // documented below as a maximum in arbitrary steps, with zero meaning
          // unlimited.
01607     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01608 
01615     virtual int32_t getTimeLimit() const;
01616 
          // Limit for the backtrack stack.  The backing member fStackLimit is
          // documented below as a maximum memory size in bytes, with zero
          // meaning unlimited.
01638     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
01639     
01647     virtual int32_t  getStackLimit() const;
01648 
01649 
          // Match progress callback: set and get the callback function pointer
          // together with an opaque user context pointer.  Per the comment on
          // fCallbackFn, a NULL function pointer means there is no callback.
          // The getter returns both values through reference parameters.
01663     virtual void setMatchCallback(URegexMatchCallback     *callback,
01664                                   const void              *context,
01665                                   UErrorCode              &status);
01666 
01667 
01678     virtual void getMatchCallback(URegexMatchCallback     *&callback,
01679                                   const void              *&context,
01680                                   UErrorCode              &status);
01681 
01682 
          // Find progress callback: same set/get shape as the match callback,
          // using the URegexFindProgressCallback type.
01696     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
01697                                               const void                              *context,
01698                                               UErrorCode                              &status);
01699 
01700 
01711     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
01712                                               const void                      *&context,
01713                                               UErrorCode                      &status);
01714 
01715 
          // Debug tracing switch; see fTraceDebug below.
01721     void setTrace(UBool state);
01722 
01723 
          // Class ID accessors, one per class (static) and one per object
          // (virtual), both returning a UClassID.
01729     static UClassID U_EXPORT2 getStaticClassID();
01730 
01736     virtual UClassID getDynamicClassID() const;
01737 
01738 private:
01739     // Constructors and other object boilerplate are private.
01740     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01741     RegexMatcher();                  // default constructor not implemented
01742     RegexMatcher(const RegexPattern *pat);
01743     RegexMatcher(const RegexMatcher &other);
01744     RegexMatcher &operator =(const RegexMatcher &rhs);
01745     void init(UErrorCode &status);                      // Common initialization
01746     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
01747 
          // RegexPattern and RegexCImpl may use the private constructors and
          // data of this class.
01748     friend class RegexPattern;
01749     friend class RegexCImpl;
01750 public:
01752     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01753 private:
01754 
01755     //
01756     //  MatchAt   This is the internal interface to the match engine itself.
01757     //            Match status comes back in matcher member variables.
01758     //
01759     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01760     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
01761     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
01762     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
01763     REStackFrame        *resetStack();
01764     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01765     void                 IncrementTime(UErrorCode &status);
01766     UBool                ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01767     
01768     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01769     
          // "Chunk" counterparts of find(), MatchAt() and isWordBoundary(); note
          // that they take int32_t positions where the functions above take
          // int64_t.
01770     UBool                findUsingChunk();
01771     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01772     UBool                isChunkWordBoundary(int32_t pos);
01773 
01774     const RegexPattern  *fPattern;
01775     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01776                                            //   should delete it when through.
01777 
01778     const UnicodeString *fInput;           // The string being matched. Only used for input()
01779     UText               *fInputText;       // The text being matched. Is never NULL.
01780     UText               *fAltInputText;    // A shallow copy of the text being matched.
01781                                            //   Only created if the pattern contains backreferences.
01782     int64_t              fInputLength;     // Full length of the input text.
01783     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01784     
01785     int64_t              fRegionStart;     // Start of the input region, default = 0.
01786     int64_t              fRegionLimit;     // End of input region, default to input.length.
01787     
01788     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01789     int64_t              fAnchorLimit;     //   See useAnchoringBounds
01790     
01791     int64_t              fLookStart;       // Region bounds for look-ahead/behind
01792     int64_t              fLookLimit;       //   and other boundary tests.  See
01793                                            //   useTransparentBounds
01794 
01795     int64_t              fActiveStart;     // Currently active bounds for matching.
01796     int64_t              fActiveLimit;     //   Usually is the same as region, but
01797                                            //   is changed to fLookStart/Limit when
01798                                            //   entering look around regions.
01799 
01800     UBool                fTransparentBounds;  // True if using transparent bounds.
01801     UBool                fAnchoringBounds; // True if using anchoring bounds.
01802 
01803     UBool                fMatch;           // True if the last attempted match was successful.
01804     int64_t              fMatchStart;      // Position of the start of the most recent match
01805     int64_t              fMatchEnd;        // First position after the end of the most recent match
01806                                            //   Zero if no previous match, even when a region
01807                                            //   is active.
01808     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
01809                                            //   or -1 if there was no previous match.
01810     int64_t              fAppendPosition;  // First position after the end of the previous
01811                                            //   appendReplacement().  As described by the
01812                                            //   JavaDoc for Java Matcher, where it is called 
01813                                            //   "append position"
01814     UBool                fHitEnd;          // True if the last match touched the end of input.
01815     UBool                fRequireEnd;      // True if the last match required end-of-input
01816                                            //    (matched $ or Z)
01817 
01818     UVector64           *fStack;
01819     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01820                                            //   which will contain the capture group results.
01821                                            //   NOT valid while match engine is running.
01822 
01823     int64_t             *fData;            // Data area for use by the compiled pattern.
01824     int64_t             fSmallData[8];     //   Use this for data if it's enough.
01825 
01826     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01827                                            //   match engine run.  Zero for unlimited.
01828     
01829     int32_t             fTime;             // Match time, accumulates while matching.
01830     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01831                                            //   Kept separately from fTime to keep as much
01832                                            //   code as possible out of the inline
01833                                            //   StateSave function.
01834 
01835     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01836                                            //   stack, in bytes.  Zero for unlimited.
01837 
01838     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
01839                                            //   NULL if there is no callback.
01840     const void         *fCallbackContext;  // User Context ptr for callback function.
01841 
01842     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback funct.
01843                                                            //   NULL if there is no callback.
01844     const void         *fFindProgressCallbackContext;      // User Context ptr for callback function.
01845 
01846 
01847     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
01848 
01849     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01850 
01851     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01852                                            //   reported, or that permanently disables this matcher.
01853 
01854     RuleBasedBreakIterator  *fWordBreakItr;
01855 
01856 
01857 };
01858 
01859 U_NAMESPACE_END
01860 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01861 #endif
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines