"Fossies" - the Fresh Open Source Software Archive

Member "ucharstrie_8h_source.html" (3 Oct 2019, 79267 Bytes) of package /linux/misc/icu4c-65_1-docs.zip:


Caution: In this restricted "Fossies" environment the current HTML page may not be correctly presentated and may have some non-functional links. You can here alternatively try to browse the pure source code or just view or download the uninterpreted raw source code. If the rendering is insufficient you may try to find and view the page on the project site itself.

ICU 65.1  65.1
ucharstrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __UCHARSTRIE_H__
18 #define __UCHARSTRIE_H__
19 
26 #include "unicode/utypes.h"
27 
28 #if U_SHOW_CPLUSPLUS_API
29 
30 #include "unicode/unistr.h"
31 #include "unicode/uobject.h"
32 #include "unicode/ustringtrie.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 class Appendable;
37 class UCharsTrieBuilder;
38 class UVector32;
39 
54 public:
70  : ownedArray_(NULL), uchars_(trieUChars),
71  pos_(uchars_), remainingMatchLength_(-1) {}
72 
77  ~UCharsTrie();
78 
85  UCharsTrie(const UCharsTrie &other)
86  : ownedArray_(NULL), uchars_(other.uchars_),
87  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
88 
95  pos_=uchars_;
96  remainingMatchLength_=-1;
97  return *this;
98  }
99 
100 #ifndef U_HIDE_DRAFT_API
101 
109  uint64_t getState64() const {
110  return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
111  (uint64_t)(pos_ - uchars_);
112  }
113 
128  UCharsTrie &resetToState64(uint64_t state) {
129  remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
130  pos_ = uchars_ + (state & kState64PosMask);
131  return *this;
132  }
133 #endif /* U_HIDE_DRAFT_API */
134 
140  class State : public UMemory {
141  public:
146  State() { uchars=NULL; }
147  private:
148  friend class UCharsTrie;
149 
150  const char16_t *uchars;
151  const char16_t *pos;
152  int32_t remainingMatchLength;
153  };
154 
162  const UCharsTrie &saveState(State &state) const {
163  state.uchars=uchars_;
164  state.pos=pos_;
165  state.remainingMatchLength=remainingMatchLength_;
166  return *this;
167  }
168 
179  UCharsTrie &resetToState(const State &state) {
180  if(uchars_==state.uchars && uchars_!=NULL) {
181  pos_=state.pos;
182  remainingMatchLength_=state.remainingMatchLength;
183  }
184  return *this;
185  }
186 
193  UStringTrieResult current() const;
194 
202  inline UStringTrieResult first(int32_t uchar) {
203  remainingMatchLength_=-1;
204  return nextImpl(uchars_, uchar);
205  }
206 
215  UStringTrieResult firstForCodePoint(UChar32 cp);
216 
223  UStringTrieResult next(int32_t uchar);
224 
232  UStringTrieResult nextForCodePoint(UChar32 cp);
233 
249  UStringTrieResult next(ConstChar16Ptr s, int32_t length);
250 
260  inline int32_t getValue() const {
261  const char16_t *pos=pos_;
262  int32_t leadUnit=*pos++;
263  // U_ASSERT(leadUnit>=kMinValueLead);
264  return leadUnit&kValueIsFinal ?
265  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
266  }
267 
277  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
278  const char16_t *pos=pos_;
279  // Skip the rest of a pending linear-match node.
280  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
281  }
282 
290  int32_t getNextUChars(Appendable &out) const;
291 
296  class U_COMMON_API Iterator : public UMemory {
297  public:
309  Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
310 
322  Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
323 
328  ~Iterator();
329 
335  Iterator &reset();
336 
341  UBool hasNext() const;
342 
357  UBool next(UErrorCode &errorCode);
358 
363  const UnicodeString &getString() const { return str_; }
368  int32_t getValue() const { return value_; }
369 
370  private:
371  UBool truncateAndStop() {
372  pos_=NULL;
373  value_=-1; // no real value for str
374  return TRUE;
375  }
376 
377  const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
378 
379  const char16_t *uchars_;
380  const char16_t *pos_;
381  const char16_t *initialPos_;
382  int32_t remainingMatchLength_;
383  int32_t initialRemainingMatchLength_;
384  UBool skipValue_; // Skip intermediate value which was already delivered.
385 
386  UnicodeString str_;
387  int32_t maxLength_;
388  int32_t value_;
389 
390  // The stack stores pairs of integers for backtracking to another
391  // outbound edge of a branch node.
392  // The first integer is an offset from uchars_.
393  // The second integer has the str_.length() from before the node in bits 15..0,
394  // and the remaining branch length in bits 31..16.
395  // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
396  // but the code looks more confusing that way.)
397  UVector32 *stack_;
398  };
399 
400 private:
401  friend class UCharsTrieBuilder;
402 
409  UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
410  : ownedArray_(adoptUChars), uchars_(trieUChars),
411  pos_(uchars_), remainingMatchLength_(-1) {}
412 
413  // No assignment operator.
414  UCharsTrie &operator=(const UCharsTrie &other);
415 
416  inline void stop() {
417  pos_=NULL;
418  }
419 
420  // Reads a compact 32-bit integer.
421  // pos is already after the leadUnit, and the lead unit has bit 15 reset.
422  static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
423  int32_t value;
424  if(leadUnit<kMinTwoUnitValueLead) {
425  value=leadUnit;
426  } else if(leadUnit<kThreeUnitValueLead) {
427  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
428  } else {
429  value=(pos[0]<<16)|pos[1];
430  }
431  return value;
432  }
433  static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
434  if(leadUnit>=kMinTwoUnitValueLead) {
435  if(leadUnit<kThreeUnitValueLead) {
436  ++pos;
437  } else {
438  pos+=2;
439  }
440  }
441  return pos;
442  }
443  static inline const char16_t *skipValue(const char16_t *pos) {
444  int32_t leadUnit=*pos++;
445  return skipValue(pos, leadUnit&0x7fff);
446  }
447 
448  static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
449  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
450  int32_t value;
451  if(leadUnit<kMinTwoUnitNodeValueLead) {
452  value=(leadUnit>>6)-1;
453  } else if(leadUnit<kThreeUnitNodeValueLead) {
454  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
455  } else {
456  value=(pos[0]<<16)|pos[1];
457  }
458  return value;
459  }
460  static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
461  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
462  if(leadUnit>=kMinTwoUnitNodeValueLead) {
463  if(leadUnit<kThreeUnitNodeValueLead) {
464  ++pos;
465  } else {
466  pos+=2;
467  }
468  }
469  return pos;
470  }
471 
472  static inline const char16_t *jumpByDelta(const char16_t *pos) {
473  int32_t delta=*pos++;
474  if(delta>=kMinTwoUnitDeltaLead) {
475  if(delta==kThreeUnitDeltaLead) {
476  delta=(pos[0]<<16)|pos[1];
477  pos+=2;
478  } else {
479  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
480  }
481  }
482  return pos+delta;
483  }
484 
485  static const char16_t *skipDelta(const char16_t *pos) {
486  int32_t delta=*pos++;
487  if(delta>=kMinTwoUnitDeltaLead) {
488  if(delta==kThreeUnitDeltaLead) {
489  pos+=2;
490  } else {
491  ++pos;
492  }
493  }
494  return pos;
495  }
496 
497  static inline UStringTrieResult valueResult(int32_t node) {
499  }
500 
501  // Handles a branch node for both next(uchar) and next(string).
502  UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
503 
504  // Requires remainingLength_<0.
505  UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
506 
507  // Helper functions for hasUniqueValue().
508  // Recursively finds a unique value (or whether there is not a unique one)
509  // from a branch.
510  static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
511  UBool haveUniqueValue, int32_t &uniqueValue);
512  // Recursively finds a unique value (or whether there is not a unique one)
513  // starting from a position on a node lead unit.
514  static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
515 
516  // Helper functions for getNextUChars().
517  // getNextUChars() when pos is on a branch node.
518  static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
519 
520  // UCharsTrie data structure
521  //
522  // The trie consists of a series of char16_t-serialized nodes for incremental
523  // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
524  // The root node is at the beginning of the trie data.
525  //
526  // Types of nodes are distinguished by their node lead unit ranges.
527  // After each node, except a final-value node, another node follows to
528  // encode match values or continue matching further units.
529  //
530  // Node types:
531  // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
532  // The value is for the string/char16_t sequence so far.
533  // - Match node, optionally with an intermediate value in a different compact format.
534  // The value, if present, is for the string/char16_t sequence so far.
535  //
536  // Aside from the value, which uses the node lead unit's high bits:
537  //
538  // - Linear-match node: Matches a number of units.
539  // - Branch node: Branches to other nodes according to the current input unit.
540  // The node unit is the length of the branch (number of units to select from)
541  // minus 1. It is followed by a sub-node:
542  // - If the length is at most kMaxBranchLinearSubNodeLength, then
543  // there are length-1 (key, value) pairs and then one more comparison unit.
544  // If one of the key units matches, then the value is either a final value for
545  // the string so far, or a "jump" delta to the next node.
546  // If the last unit matches, then matching continues with the next node.
547  // (Values have the same encoding as final-value nodes.)
548  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
549  // there is one unit and one "jump" delta.
550  // If the input unit is less than the sub-node unit, then "jump" by delta to
551  // the next sub-node which will have a length of length/2.
552  // (The delta has its own compact encoding.)
553  // Otherwise, skip the "jump" delta to the next sub-node
554  // which will have a length of length-length/2.
555 
556  // Match-node lead unit values, after masking off intermediate-value bits:
557 
558  // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
559  // the length is one more than the next unit.
560 
561  // For a branch sub-node with at most this many entries, we drop down
562  // to a linear search.
563  static const int32_t kMaxBranchLinearSubNodeLength=5;
564 
565  // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
566  static const int32_t kMinLinearMatch=0x30;
567  static const int32_t kMaxLinearMatchLength=0x10;
568 
569  // Match-node lead unit bits 14..6 for the optional intermediate value.
570  // If these bits are 0, then there is no intermediate value.
571  // Otherwise, see the *NodeValue* constants below.
572  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
573  static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
574 
575  // A final-value node has bit 15 set.
576  static const int32_t kValueIsFinal=0x8000;
577 
578  // Compact value: After testing and masking off bit 15, use the following thresholds.
579  static const int32_t kMaxOneUnitValue=0x3fff;
580 
581  static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
582  static const int32_t kThreeUnitValueLead=0x7fff;
583 
584  static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
585 
586  // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
587  static const int32_t kMaxOneUnitNodeValue=0xff;
588  static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
589  static const int32_t kThreeUnitNodeValueLead=0x7fc0;
590 
591  static const int32_t kMaxTwoUnitNodeValue=
592  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
593 
594  // Compact delta integers.
595  static const int32_t kMaxOneUnitDelta=0xfbff;
596  static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
597  static const int32_t kThreeUnitDeltaLead=0xffff;
598 
599  static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
600 
601  // For getState64():
602  // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
603  // so we need at least 5 bits for that.
604  // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
605  static constexpr int32_t kState64RemainingShift = 59;
606  static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
607 
608  char16_t *ownedArray_;
609 
610  // Fixed value referencing the UCharsTrie words.
611  const char16_t *uchars_;
612 
613  // Iterator variables.
614 
615  // Pointer to next trie unit to read. NULL if no more matches.
616  const char16_t *pos_;
617  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
618  int32_t remainingMatchLength_;
619 };
620 
621 U_NAMESPACE_END
622 
623 #endif /* U_SHOW_CPLUSPLUS_API */
624 
625 #endif // __UCHARSTRIE_H__
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: ucharstrie.h:179
int32_t getValue() const
Returns a matching string&#39;s value if called immediately after current()/first()/next() returned USTRI...
Definition: ucharstrie.h:260
C++ API: Unicode String.
int32_t getValue() const
Definition: ucharstrie.h:368
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:35
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition: ucharstrie.h:85
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition: ucharstrie.h:277
const UnicodeString & getString() const
Definition: ucharstrie.h:363
UCharsTrie & reset()
Resets this trie to its initial state.
Definition: ucharstrie.h:94
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition: ucharstrie.h:296
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:425
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
Builder class for UCharsTrie.
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:265
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition: ucharstrie.h:162
C++ API: Common ICU base class UObject.
State()
Constructs an empty State.
Definition: ucharstrie.h:146
UCharsTrie & resetToState64(uint64_t state)
Resets this trie to the saved state.
Definition: ucharstrie.h:128
uint64_t getState64() const
Returns the state of this trie as a 64-bit integer.
Definition: ucharstrie.h:109
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
UCharsTrie state object, for saving a trie&#39;s current state and resetting the trie back to this state ...
Definition: ucharstrie.h:140
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types...
Definition: char16ptr.h:149
C API: Helper definitions for dictionary trie APIs.
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition: ucharstrie.h:69
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:269
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:66
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition: ucharstrie.h:202
#define UINT64_C(c)
Provides a platform independent way to specify an unsigned 64-bit integer constant.
Definition: umachine.h:240
UMemory is the common ICU base class.
Definition: uobject.h:115
Light-weight, non-const reader class for a UCharsTrie.
Definition: ucharstrie.h:53
int8_t UBool
The ICU boolean type.
Definition: umachine.h:261
Base class for objects to which Unicode characters and strings can be appended.
Definition: appendable.h:54