regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2006, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/parseerr.h"
00052 
00053 #include "unicode/uregex.h"
00054 
00055 U_NAMESPACE_BEGIN
00056 
00057 
00058 // Forward declarations.  RegexPattern and RegexMatcher are defined below in this header;
00059 //   the other types are referenced here only through pointers or friend declarations.
00060 class RegexMatcher;
00061 class RegexPattern;
00062 class UVector;
00063 class UVector32;
00064 class UnicodeSet;
00065 struct REStackFrame;
00066 struct Regex8BitSet;
00067 class  RuleBasedBreakIterator;
00068 class  RegexCImpl;
00069 
00070 
00071 // RegexPatternDump(): declared as a real function only when REGEX_DEBUG is defined;
00072 //   otherwise it is a macro with an empty expansion, so uses of it expand to nothing.
00077 #ifdef REGEX_DEBUG
00078 U_INTERNAL void U_EXPORT2
00079     RegexPatternDump(const RegexPattern *pat);
00080 #else
00081     #define RegexPatternDump(pat)
00082 #endif
00083 
00084 // RegexPattern: holds a regular expression's original text, its compile flags and its compiled
00085 //   form (see "Implementation Data" below).  matcher() hands out RegexMatcher objects for it.
00097 class U_I18N_API RegexPattern: public UObject {
00098 public:
00099     // Default constructor.
00107     RegexPattern();
00108     // Copy constructor.
00115     RegexPattern(const RegexPattern &source);
00116     // Virtual destructor.
00122     virtual ~RegexPattern();
00123     // Equality test against another pattern; declared const with a const reference argument.
00132     UBool           operator==(const RegexPattern& that) const;
00133     // Inequality, defined inline as the negation of operator==.  (The ';' after the body is redundant but legal.)
00142     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00143     // Assignment from another pattern.
00149     RegexPattern  &operator =(const RegexPattern &source);
00150     // Virtual copy; returns a RegexPattern pointer.  NOTE(review): presumably heap-allocated and caller-owned -- confirm.
00158     virtual RegexPattern  *clone() const;
00159     // compile(): static factories that take the pattern text and return a RegexPattern pointer.
00160     //   This overload takes no flags.  pe/status are non-const references, assumed to receive parse-error detail and the error code.
00185     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00186         UParseError          &pe,
00187         UErrorCode           &status);
00188     // compile() overload that additionally takes flags (uint32_t; presumably the option bits from uregex.h -- confirm).
00213     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00214         uint32_t             flags,
00215         UParseError          &pe,
00216         UErrorCode           &status);
00217 
00218     // compile() overload with flags but no UParseError argument.
00241     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00242         uint32_t             flags,
00243         UErrorCode           &status);
00244 
00245     // Returns a uint32_t; presumably the flags this pattern was compiled with (cf. fFlags) -- confirm.
00251     virtual uint32_t flags() const;
00252     // Returns a RegexMatcher pointer for this pattern and the given input.  NOTE(review): ownership of the result is not visible here.
00270     virtual RegexMatcher *matcher(const UnicodeString &input,
00271         UErrorCode          &status) const;
00272     // The UChar* overload below is private, so code outside this class and its friends cannot call it.
00273 private:
00285     RegexMatcher *matcher(const UChar *input,
00286         UErrorCode          &status) const;
00287 public:
00288 
00289     // matcher() overload that takes no input string.
00301     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00302 
00303     // Static convenience taking both pattern text and input text; returns a UBool (presumably whether the input matches -- confirm).
00318     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00319         const UnicodeString   &input,
00320         UParseError     &pe,
00321         UErrorCode      &status);
00322 
00323     // Returns a UnicodeString by value; presumably the original pattern text (cf. fPattern) -- confirm.
00328     virtual UnicodeString pattern() const;
00329     // split(): takes the input plus a caller-supplied dest[] array and destCapacity (assumed to be its element count).
00330     //   Returns an int32_t, presumably the number of dest[] entries filled -- confirm.
00356     virtual int32_t  split(const UnicodeString &input,
00357         UnicodeString    dest[],
00358         int32_t          destCapacity,
00359         UErrorCode       &status) const;
00360 
00361     // ICU class-ID accessor for an object (virtual).
00367     virtual UClassID getDynamicClassID() const;
00368     // ICU class-ID accessor for the class (static).
00374     static UClassID U_EXPORT2 getStaticClassID();
00375 
00376 private:
00377     //
00378     //  Implementation Data
00379     //
00380     UnicodeString   fPattern;      // The original pattern string.
00381     uint32_t        fFlags;        // The flags used when compiling the pattern.
00382                                    //
00383     UVector32       *fCompiledPat; // The compiled pattern p-code.
00384     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00385                                    //   after un-escaping, for use during the match.
00386 
00387     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00388     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00389 
00390 
00391     UErrorCode      fDeferredStatus; // status if some prior error has left this
00392                                    //  RegexPattern in an unusable state.
00393 
00394     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00395                                    //   >= this value.  For some patterns, this calculated
00396                                    //   value may be less than the true shortest
00397                                    //   possible match.
00398 
00399     int32_t         fFrameSize;    // Size of a state stack frame in the
00400                                    //   execution engine.
00401 
00402     int32_t         fDataSize;     // The size of the data needed by the pattern that
00403                                    //   does not go on the state stack, but has just
00404                                    //   a single copy per matcher.
00405 
00406     UVector32       *fGroupMap;    // Map from capture group number to position of
00407                                    //   the group's variables in the matcher stack frame.
00408     // NOTE(review): fMaxCaptureDigits is undocumented; the name suggests the max digits in a capture group number -- confirm.
00409     int32_t         fMaxCaptureDigits;
00410 
00411     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00412                                    //   regex character classes, e.g. Word.
00413 
00414     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00415                                    //  sets for predefined regex classes.
00416 
00417     int32_t         fStartType;    // Info on how a match must start.
00418     int32_t         fInitialStringIdx;     // NOTE(review): the fInitial* fields are undocumented; presumably data for fStartType -- confirm.
00419     int32_t         fInitialStringLen;
00420     UnicodeSet     *fInitialChars;
00421     UChar32         fInitialChar;
00422     Regex8BitSet   *fInitialChars8;
00423     // Friend classes: these get access to the private data and methods of RegexPattern.
00424     friend class RegexCompile;
00425     friend class RegexMatcher;
00426     friend class RegexCImpl;
00427 
00428     //
00429     //  Implementation Methods
00430     //
00431     void        init();            // Common initialization, for use by constructors.
00432     void        zap();             // Common cleanup
00433 #ifdef REGEX_DEBUG  // The two declarations below exist only in REGEX_DEBUG builds.
00434     void        dumpOp(int32_t index) const;
00435     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00436 #endif
00437 
00438 };
00439 
00440 // RegexMatcher: applies a RegexPattern (fPattern) to an input UnicodeString (fInput) and records
00441 //   the most recent match (fMatch, fMatchStart, fMatchEnd).  Copy construction and assignment are private.
00451 class U_I18N_API RegexMatcher: public UObject {
00452 public:
00453     // Constructs a matcher from pattern text and flags.  NOTE(review): presumably compiles and owns the pattern (cf. fPatternOwned) -- confirm.
00468     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00469     // As above, additionally taking the input string.
00491     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00492         uint32_t flags, UErrorCode &status);
00493     // The UChar* input overload below is private, so it cannot be called from outside this class and its friends.
00494 private:
00506     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00507         uint32_t flags, UErrorCode &status);
00508 public:
00509 
00510     // Virtual destructor.
00516     virtual ~RegexMatcher();
00517 
00518     // matches(): returns a UBool.  NOTE(review): presumably true when the entire input matches -- confirm against the ICU API docs.
00525     virtual UBool matches(UErrorCode &status);
00526     // matches() overload taking a start index.
00535     virtual UBool matches(int32_t startIndex, UErrorCode &status);
00536 
00537 
00538     // lookingAt(): returns a UBool.  NOTE(review): presumably a match anchored at the start that need not
00539     //   consume the whole input -- confirm against the ICU API docs.
00552     virtual UBool lookingAt(UErrorCode &status);
00553 
00554     // lookingAt() overload taking a start index.
00568     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00569     // find(): returns a UBool and takes no UErrorCode.  NOTE(review): presumably finds the next match; errors may be held in fDeferredStatus -- confirm.
00582     virtual UBool find();
00583 
00584     // find() overload taking a start position and a UErrorCode.
00594     virtual UBool find(int32_t start, UErrorCode &status);
00595 
00596     // group(): returns a UnicodeString by value; presumably the text of the most recent match -- confirm.
00606     virtual UnicodeString group(UErrorCode &status) const;
00607 
00608     // group() overload taking a group number.
00621     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00622 
00623     // Returns an int32_t; presumably the number of capture groups in the pattern -- confirm.
00629     virtual int32_t groupCount() const;
00630 
00631     // start(): returns an int32_t; presumably the start index of the most recent match (cf. fMatchStart) -- confirm.
00639     virtual int32_t start(UErrorCode &status) const;
00640 
00641     // start() overload taking a group number.
00655     virtual int32_t start(int32_t group, UErrorCode &status) const;
00656 
00657     // end(): returns an int32_t; presumably the index just past the most recent match (cf. fMatchEnd) -- confirm.
00667     virtual int32_t end(UErrorCode &status) const;
00668 
00669     // end() overload taking a group number.
00683     virtual int32_t end(int32_t group, UErrorCode &status) const;
00684 
00685     // reset(): the three public overloads return a RegexMatcher reference (presumably *this -- confirm).
00694     virtual RegexMatcher &reset();
00695 
00696     // reset() overload taking an index and a UErrorCode.
00706     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00707 
00708     // reset() overload taking an input string.  NOTE(review): fInput is a pointer, so the string presumably must outlive its use here -- confirm.
00722     virtual RegexMatcher &reset(const UnicodeString &input);
00723     // The UChar* overload of reset() below is private (and virtual).
00724 private:
00736     virtual RegexMatcher &reset(const UChar *input);
00737 public:
00738     // Returns a const reference to a UnicodeString; presumably the current input (cf. fInput) -- confirm.
00745     virtual const UnicodeString &input() const;
00746 
00747     // Returns a const reference to a RegexPattern; presumably the pattern in use (cf. fPattern) -- confirm.
00753     virtual const RegexPattern &pattern() const;
00754 
00755     // replaceAll(): takes replacement text and returns a UnicodeString by value.
00772     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
00773 
00774     // replaceFirst(): same signature as replaceAll().
00795     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
00796     // appendReplacement(): dest is a non-const reference (presumably appended to); returns a RegexMatcher reference.
00824     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
00825         const UnicodeString &replacement, UErrorCode &status);
00826 
00827     // appendTail(): takes dest by non-const reference and returns a UnicodeString reference (presumably dest -- confirm).
00838     virtual UnicodeString &appendTail(UnicodeString &dest);
00839 
00840 
00841     // split(): same parameter list as RegexPattern::split(), but not declared const.
00866     virtual int32_t  split(const UnicodeString &input,
00867         UnicodeString    dest[],
00868         int32_t          destCapacity,
00869         UErrorCode       &status);
00870 
00871 
00872     // Takes a UBool; presumably switches match-engine debug tracing (cf. fTraceDebug) -- confirm.
00878     void setTrace(UBool state);
00879 
00880     // ICU class-ID accessors: per-class (static) and per-object (virtual).
00886     static UClassID U_EXPORT2 getStaticClassID();
00887 
00893     virtual UClassID getDynamicClassID() const;
00894 
00895 private:
00896     // Constructors and other object boilerplate are private.
00897     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
00898     RegexMatcher(); // default constructor not implemented
00899     RegexMatcher(const RegexPattern *pat);
00900     RegexMatcher(const RegexMatcher &other);
00901     RegexMatcher &operator =(const RegexMatcher &rhs);
00902     friend class RegexPattern;
00903     friend class RegexCImpl;
00904 
00905     //
00906     //  MatchAt   This is the internal interface to the match engine itself.
00907     //            Match status comes back in matcher member variables.
00908     //
00909     void                 MatchAt(int32_t startIdx, UErrorCode &status);
00910     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
00911     UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
00912     UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
00913     REStackFrame        *resetStack();
00914     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
00915                                    int32_t frameSize, UErrorCode &status);
00916 
00917     // fPattern is a pointer to const; fPatternOwned is a separate non-const pointer used for ownership.
00918     const RegexPattern  *fPattern;
00919     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
00920                                            //   should delete it when through.
00921     const UnicodeString *fInput;           // Pointer to the input string (held by pointer, not as a copy).
00922 
00923     UBool                fMatch;           // True if the last match was successful.
00924     int32_t              fMatchStart;      // Position of the start of the most recent match
00925     int32_t              fMatchEnd;        // First position after the end of the most recent match
00926     int32_t              fLastMatchEnd;    // First position after the end of the previous match,
00927                                            //   or -1 if there was no previous match.
00928     int32_t              fLastReplaceEnd;  // First position after the end of the previous appendReplacement();
00929     // NOTE(review): fStack is undocumented; presumably storage for REStackFrame state (cf. resetStack/StateSave) -- confirm.
00930     UVector32           *fStack;
00931     REStackFrame        *fFrame;           // After finding a match, the last active stack
00932                                            //   frame, which will contain the capture group results.
00933                                            //   NOT valid while match engine is running.
00934 
00935     int32_t             *fData;            // Data area for use by the compiled pattern.
00936     int32_t             fSmallData[8];     //   Use this for data if it's enough.
00937 
00938     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
00939 
00940     UErrorCode          fDeferredStatus;   // Save error state if that cannot be immediately
00941                                            //   reported, or that permanently disables this matcher.
00942     // NOTE(review): fWordBreakItr is undocumented; presumably used by isUWordBoundary() (RBBI based \b test) -- confirm.
00943     RuleBasedBreakIterator  *fWordBreakItr;
00944 
00945 
00946 };
00947 
00948 U_NAMESPACE_END
00949 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
00950 #endif

Generated on Mon Aug 13 07:17:24 2007 for ICU 3.6 by  doxygen 1.5.2