"Fossies" - the Fresh Open Source Software Archive

Member "regex_8h_source.html" (3 Oct 2019, 105616 Bytes) of package /linux/misc/icu4c-65_1-docs.zip:


Caution: In this restricted "Fossies" environment the current HTML page may not be correctly presented and may have some non-functional links. Alternatively, you can try to browse the pure source code or just view or download the uninterpreted raw source code. If the rendering is insufficient you may try to find and view the page on the project site itself.

ICU 65.1  65.1
regex.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: regex.h
9 * encoding: UTF-8
10 * indentation:4
11 *
12 * created on: 2002oct22
13 * created by: Andy Heninger
14 *
15 * ICU Regular Expressions, API for C++
16 */
17 
18 #ifndef REGEX_H
19 #define REGEX_H
20 
21 //#define REGEX_DEBUG
22 
45 #include "unicode/utypes.h"
46 
47 #if U_SHOW_CPLUSPLUS_API
48 
49 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
50 
51 #include "unicode/uobject.h"
52 #include "unicode/unistr.h"
53 #include "unicode/utext.h"
54 #include "unicode/parseerr.h"
55 
56 #include "unicode/uregex.h"
57 
58 // Forward Declarations
59 
60 struct UHashtable;
61 
62 U_NAMESPACE_BEGIN
63 
64 struct Regex8BitSet;
65 class RegexCImpl;
66 class RegexMatcher;
67 class RegexPattern;
68 struct REStackFrame;
69 class RuleBasedBreakIterator;
70 class UnicodeSet;
71 class UVector;
72 class UVector32;
73 class UVector64;
74 
75 
88 public:
89 
97  RegexPattern();
98 
105  RegexPattern(const RegexPattern &source);
106 
112  virtual ~RegexPattern();
113 
122  UBool operator==(const RegexPattern& that) const;
123 
132  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
133 
139  RegexPattern &operator =(const RegexPattern &source);
140 
148  virtual RegexPattern *clone() const;
149 
150 
175  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
176  UParseError &pe,
177  UErrorCode &status);
178 
205  static RegexPattern * U_EXPORT2 compile( UText *regex,
206  UParseError &pe,
207  UErrorCode &status);
208 
233  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
234  uint32_t flags,
235  UParseError &pe,
236  UErrorCode &status);
237 
264  static RegexPattern * U_EXPORT2 compile( UText *regex,
265  uint32_t flags,
266  UParseError &pe,
267  UErrorCode &status);
268 
291  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
292  uint32_t flags,
293  UErrorCode &status);
294 
319  static RegexPattern * U_EXPORT2 compile( UText *regex,
320  uint32_t flags,
321  UErrorCode &status);
322 
328  virtual uint32_t flags() const;
329 
347  virtual RegexMatcher *matcher(const UnicodeString &input,
348  UErrorCode &status) const;
349 
350 private:
363  RegexMatcher *matcher(const char16_t *input,
364  UErrorCode &status) const;
365 public:
366 
367 
379  virtual RegexMatcher *matcher(UErrorCode &status) const;
380 
381 
396  static UBool U_EXPORT2 matches(const UnicodeString &regex,
397  const UnicodeString &input,
398  UParseError &pe,
399  UErrorCode &status);
400 
415  static UBool U_EXPORT2 matches(UText *regex,
416  UText *input,
417  UParseError &pe,
418  UErrorCode &status);
419 
428  virtual UnicodeString pattern() const;
429 
430 
441  virtual UText *patternText(UErrorCode &status) const;
442 
443 
457  virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
458 
459 
476  virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
477 
478 
517  virtual int32_t split(const UnicodeString &input,
518  UnicodeString dest[],
519  int32_t destCapacity,
520  UErrorCode &status) const;
521 
522 
561  virtual int32_t split(UText *input,
562  UText *dest[],
563  int32_t destCapacity,
564  UErrorCode &status) const;
565 
566 
572  virtual UClassID getDynamicClassID() const;
573 
579  static UClassID U_EXPORT2 getStaticClassID();
580 
581 private:
582  //
583  // Implementation Data
584  //
585  UText *fPattern; // The original pattern string.
586  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
587  uint32_t fFlags; // The flags used when compiling the pattern.
588  //
589  UVector64 *fCompiledPat; // The compiled pattern p-code.
590  UnicodeString fLiteralText; // Any literal string data from the pattern,
591  // after un-escaping, for use during the match.
592 
593  UVector *fSets; // Any UnicodeSets referenced from the pattern.
594  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
595 
596 
597  UErrorCode fDeferredStatus; // status if some prior error has left this
598  // RegexPattern in an unusable state.
599 
600  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
601  // >= this value. For some patterns, this calculated
602  // value may be less than the true shortest
603  // possible match.
604 
605  int32_t fFrameSize; // Size of a state stack frame in the
606  // execution engine.
607 
608  int32_t fDataSize; // The size of the data needed by the pattern that
609  // does not go on the state stack, but has just
610  // a single copy per matcher.
611 
612  UVector32 *fGroupMap; // Map from capture group number to position of
613  // the group's variables in the matcher stack frame.
614 
615  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
616  // regex character classes, e.g. Word.
617 
618  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
619  // sets for predefined regex classes.
620 
621  int32_t fStartType; // Info on how a match must start.
622  int32_t fInitialStringIdx; //
623  int32_t fInitialStringLen;
624  UnicodeSet *fInitialChars;
625  UChar32 fInitialChar;
626  Regex8BitSet *fInitialChars8;
627  UBool fNeedsAltInput;
628 
629  UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
630 
631  friend class RegexCompile;
632  friend class RegexMatcher;
633  friend class RegexCImpl;
634 
635  //
636  // Implementation Methods
637  //
638  void init(); // Common initialization, for use by constructors.
639  void zap(); // Common cleanup
640 
641  void dumpOp(int32_t index) const;
642 
643  public:
644 #ifndef U_HIDE_INTERNAL_API
645 
649  void dumpPattern() const;
650 #endif /* U_HIDE_INTERNAL_API */
651 };
652 
653 
654 
665 public:
666 
680  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
681 
696  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
697 
718  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
719  uint32_t flags, UErrorCode &status);
720 
741  RegexMatcher(UText *regexp, UText *input,
742  uint32_t flags, UErrorCode &status);
743 
744 private:
756  RegexMatcher(const UnicodeString &regexp, const char16_t *input,
757  uint32_t flags, UErrorCode &status);
758 public:
759 
760 
766  virtual ~RegexMatcher();
767 
768 
775  virtual UBool matches(UErrorCode &status);
776 
777 
788  virtual UBool matches(int64_t startIndex, UErrorCode &status);
789 
790 
804  virtual UBool lookingAt(UErrorCode &status);
805 
806 
820  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
821 
822 
835  virtual UBool find();
836 
837 
852  virtual UBool find(UErrorCode &status);
853 
863  virtual UBool find(int64_t start, UErrorCode &status);
864 
865 
875  virtual UnicodeString group(UErrorCode &status) const;
876 
877 
895  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
896 
902  virtual int32_t groupCount() const;
903 
904 
919  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
920 
941  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
942 
950  virtual int32_t start(UErrorCode &status) const;
951 
959  virtual int64_t start64(UErrorCode &status) const;
960 
961 
975  virtual int32_t start(int32_t group, UErrorCode &status) const;
976 
990  virtual int64_t start64(int32_t group, UErrorCode &status) const;
991 
1005  virtual int32_t end(UErrorCode &status) const;
1006 
1020  virtual int64_t end64(UErrorCode &status) const;
1021 
1022 
1040  virtual int32_t end(int32_t group, UErrorCode &status) const;
1041 
1059  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1060 
1069  virtual RegexMatcher &reset();
1070 
1071 
1087  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1088 
1089 
1107  virtual RegexMatcher &reset(const UnicodeString &input);
1108 
1109 
1123  virtual RegexMatcher &reset(UText *input);
1124 
1125 
1150  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1151 
1152 private:
1165  RegexMatcher &reset(const char16_t *input);
1166 public:
1167 
1175  virtual const UnicodeString &input() const;
1176 
1185  virtual UText *inputText() const;
1186 
1197  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1198 
1199 
1218  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1219 
1231  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1232 
1241  virtual int32_t regionStart() const;
1242 
1251  virtual int64_t regionStart64() const;
1252 
1253 
1262  virtual int32_t regionEnd() const;
1263 
1272  virtual int64_t regionEnd64() const;
1273 
1282  virtual UBool hasTransparentBounds() const;
1283 
1302  virtual RegexMatcher &useTransparentBounds(UBool b);
1303 
1304 
1312  virtual UBool hasAnchoringBounds() const;
1313 
1314 
1327  virtual RegexMatcher &useAnchoringBounds(UBool b);
1328 
1329 
1342  virtual UBool hitEnd() const;
1343 
1353  virtual UBool requireEnd() const;
1354 
1355 
1361  virtual const RegexPattern &pattern() const;
1362 
1363 
1380  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1381 
1382 
1403  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1404 
1405 
1426  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1427 
1428 
1453  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1454 
1455 
1483  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1484  const UnicodeString &replacement, UErrorCode &status);
1485 
1486 
1514  virtual RegexMatcher &appendReplacement(UText *dest,
1515  UText *replacement, UErrorCode &status);
1516 
1517 
1528  virtual UnicodeString &appendTail(UnicodeString &dest);
1529 
1530 
1544  virtual UText *appendTail(UText *dest, UErrorCode &status);
1545 
1546 
1570  virtual int32_t split(const UnicodeString &input,
1571  UnicodeString dest[],
1572  int32_t destCapacity,
1573  UErrorCode &status);
1574 
1575 
1599  virtual int32_t split(UText *input,
1600  UText *dest[],
1601  int32_t destCapacity,
1602  UErrorCode &status);
1603 
1625  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1626 
1633  virtual int32_t getTimeLimit() const;
1634 
1656  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1657 
1665  virtual int32_t getStackLimit() const;
1666 
1667 
1681  virtual void setMatchCallback(URegexMatchCallback *callback,
1682  const void *context,
1683  UErrorCode &status);
1684 
1685 
1696  virtual void getMatchCallback(URegexMatchCallback *&callback,
1697  const void *&context,
1698  UErrorCode &status);
1699 
1700 
1714  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1715  const void *context,
1716  UErrorCode &status);
1717 
1718 
1729  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1730  const void *&context,
1731  UErrorCode &status);
1732 
1733 #ifndef U_HIDE_INTERNAL_API
1734 
1739  void setTrace(UBool state);
1740 #endif /* U_HIDE_INTERNAL_API */
1741 
1747  static UClassID U_EXPORT2 getStaticClassID();
1748 
1754  virtual UClassID getDynamicClassID() const;
1755 
1756 private:
1757  // Constructors and other object boilerplate are private.
1758  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1759  RegexMatcher(); // default constructor not implemented
1760  RegexMatcher(const RegexPattern *pat);
1761  RegexMatcher(const RegexMatcher &other);
1762  RegexMatcher &operator =(const RegexMatcher &rhs);
1763  void init(UErrorCode &status); // Common initialization
1764  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1765 
1766  friend class RegexPattern;
1767  friend class RegexCImpl;
1768 public:
1769 #ifndef U_HIDE_INTERNAL_API
1770 
1771  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1772 #endif /* U_HIDE_INTERNAL_API */
1773 private:
1774 
1775  //
1776  // MatchAt This is the internal interface to the match engine itself.
1777  // Match status comes back in matcher member variables.
1778  //
1779  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1780  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1781  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1782  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1783  REStackFrame *resetStack();
1784  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1785  void IncrementTime(UErrorCode &status);
1786 
1787  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1788  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1789 
1790  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1791 
1792  UBool findUsingChunk(UErrorCode &status);
1793  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1794  UBool isChunkWordBoundary(int32_t pos);
1795 
1796  const RegexPattern *fPattern;
1797  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1798  // should delete it when through.
1799 
1800  const UnicodeString *fInput; // The string being matched. Only used for input()
1801  UText *fInputText; // The text being matched. Is never NULL.
1802  UText *fAltInputText; // A shallow copy of the text being matched.
1803  // Only created if the pattern contains backreferences.
1804  int64_t fInputLength; // Full length of the input text.
1805  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1806 
1807  int64_t fRegionStart; // Start of the input region, default = 0.
1808  int64_t fRegionLimit; // End of input region, default to input.length.
1809 
1810  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1811  int64_t fAnchorLimit; // See useAnchoringBounds
1812 
1813  int64_t fLookStart; // Region bounds for look-ahead/behind and
1814  int64_t fLookLimit; // other boundary tests. See
1815  // useTransparentBounds
1816 
1817  int64_t fActiveStart; // Currently active bounds for matching.
1818  int64_t fActiveLimit; // Usually is the same as region, but
1819  // is changed to fLookStart/Limit when
1820  // entering look around regions.
1821 
1822  UBool fTransparentBounds; // True if using transparent bounds.
1823  UBool fAnchoringBounds; // True if using anchoring bounds.
1824 
1825  UBool fMatch; // True if the last attempted match was successful.
1826  int64_t fMatchStart; // Position of the start of the most recent match
1827  int64_t fMatchEnd; // First position after the end of the most recent match
1828  // Zero if no previous match, even when a region
1829  // is active.
1830  int64_t fLastMatchEnd; // First position after the end of the previous match,
1831  // or -1 if there was no previous match.
1832  int64_t fAppendPosition; // First position after the end of the previous
1833  // appendReplacement(). As described by the
1834  // JavaDoc for Java Matcher, where it is called
1835  // "append position"
1836  UBool fHitEnd; // True if the last match touched the end of input.
1837  UBool fRequireEnd; // True if the last match required end-of-input
1838  // (matched $ or \Z)
1839 
1840  UVector64 *fStack;
1841  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1842  // which will contain the capture group results.
1843  // NOT valid while match engine is running.
1844 
1845  int64_t *fData; // Data area for use by the compiled pattern.
1846  int64_t fSmallData[8]; // Use this for data if it's enough.
1847 
1848  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1849  // match engine run. Zero for unlimited.
1850 
1851  int32_t fTime; // Match time, accumulates while matching.
1852  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1853  // Kept separately from fTime to keep as much
1854  // code as possible out of the inline
1855  // StateSave function.
1856 
1857  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1858  // stack, in bytes. Zero for unlimited.
1859 
1860  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1861  // NULL if there is no callback.
1862  const void *fCallbackContext; // User Context ptr for callback function.
1863 
1864  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1865  // NULL if there is no callback.
1866  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1867 
1868 
1869  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1870 
1871  UBool fTraceDebug; // Set true for debug tracing of match engine.
1872 
1873  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1874  // reported, or that permanently disables this matcher.
1875 
1876  RuleBasedBreakIterator *fWordBreakItr;
1877 };
1878 
1879 U_NAMESPACE_END
1880 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1881 
1882 #endif /* U_SHOW_CPLUSPLUS_API */
1883 
1884 #endif
struct UHashtable UHashtable
Definition: msgfmt.h:43
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:87
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1573
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:664
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:132
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:301
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:281
C++ API: Common ICU base class UObject.
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1499
C API: Parse Error Information.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
UText struct.
Definition: utext.h:1347
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:55
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261