"Fossies" - the Fresh Open Source Software Archive

Member "etcd-3.4.13/vendor/golang.org/x/text/unicode/norm/normalize.go" (24 Aug 2020, 15245 Bytes) of package /linux/misc/etcd-3.4.13.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Go source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. See also the latest Fossies "Diffs" side-by-side code changes report for "normalize.go": 3.4.12_vs_3.4.13.

    1 // Copyright 2011 The Go Authors. All rights reserved.
    2 // Use of this source code is governed by a BSD-style
    3 // license that can be found in the LICENSE file.
    4 
    5 // Note: the file data_test.go that is generated should not be checked in.
    6 //go:generate go run maketables.go triegen.go
    7 //go:generate go test -tags test
    8 
    9 // Package norm contains types and functions for normalizing Unicode strings.
   10 package norm // import "golang.org/x/text/unicode/norm"
   11 
   12 import (
   13     "unicode/utf8"
   14 
   15     "golang.org/x/text/transform"
   16 )
   17 
   18 // A Form denotes a canonical representation of Unicode code points.
   19 // The Unicode-defined normalization and equivalence forms are:
   20 //
   21 //   NFC   Unicode Normalization Form C
   22 //   NFD   Unicode Normalization Form D
   23 //   NFKC  Unicode Normalization Form KC
   24 //   NFKD  Unicode Normalization Form KD
   25 //
   26 // For a Form f, this documentation uses the notation f(x) to mean
   27 // the bytes or string x converted to the given form.
   28 // A position n in x is called a boundary if conversion to the form can
   29 // proceed independently on both sides:
   30 //   f(x) == append(f(x[0:n]), f(x[n:])...)
   31 //
   32 // References: https://unicode.org/reports/tr15/ and
   33 // https://unicode.org/notes/tn5/.
   34 type Form int
   35 
   36 const (
   37     NFC Form = iota
   38     NFD
   39     NFKC
   40     NFKD
   41 )
   42 
   43 // Bytes returns f(b). May return b if f(b) = b.
   44 func (f Form) Bytes(b []byte) []byte {
   45     src := inputBytes(b)
   46     ft := formTable[f]
   47     n, ok := ft.quickSpan(src, 0, len(b), true)
   48     if ok {
   49         return b
   50     }
   51     out := make([]byte, n, len(b))
   52     copy(out, b[0:n])
   53     rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
   54     return doAppendInner(&rb, n)
   55 }
   56 
   57 // String returns f(s).
   58 func (f Form) String(s string) string {
   59     src := inputString(s)
   60     ft := formTable[f]
   61     n, ok := ft.quickSpan(src, 0, len(s), true)
   62     if ok {
   63         return s
   64     }
   65     out := make([]byte, n, len(s))
   66     copy(out, s[0:n])
   67     rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
   68     return string(doAppendInner(&rb, n))
   69 }
   70 
   71 // IsNormal returns true if b == f(b).
   72 func (f Form) IsNormal(b []byte) bool {
   73     src := inputBytes(b)
   74     ft := formTable[f]
   75     bp, ok := ft.quickSpan(src, 0, len(b), true)
   76     if ok {
   77         return true
   78     }
   79     rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
   80     rb.setFlusher(nil, cmpNormalBytes)
   81     for bp < len(b) {
   82         rb.out = b[bp:]
   83         if bp = decomposeSegment(&rb, bp, true); bp < 0 {
   84             return false
   85         }
   86         bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
   87     }
   88     return true
   89 }
   90 
   91 func cmpNormalBytes(rb *reorderBuffer) bool {
   92     b := rb.out
   93     for i := 0; i < rb.nrune; i++ {
   94         info := rb.rune[i]
   95         if int(info.size) > len(b) {
   96             return false
   97         }
   98         p := info.pos
   99         pe := p + info.size
  100         for ; p < pe; p++ {
  101             if b[0] != rb.byte[p] {
  102                 return false
  103             }
  104             b = b[1:]
  105         }
  106     }
  107     return true
  108 }
  109 
  110 // IsNormalString returns true if s == f(s).
  111 func (f Form) IsNormalString(s string) bool {
  112     src := inputString(s)
  113     ft := formTable[f]
  114     bp, ok := ft.quickSpan(src, 0, len(s), true)
  115     if ok {
  116         return true
  117     }
  118     rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
  119     rb.setFlusher(nil, func(rb *reorderBuffer) bool {
  120         for i := 0; i < rb.nrune; i++ {
  121             info := rb.rune[i]
  122             if bp+int(info.size) > len(s) {
  123                 return false
  124             }
  125             p := info.pos
  126             pe := p + info.size
  127             for ; p < pe; p++ {
  128                 if s[bp] != rb.byte[p] {
  129                     return false
  130                 }
  131                 bp++
  132             }
  133         }
  134         return true
  135     })
  136     for bp < len(s) {
  137         if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  138             return false
  139         }
  140         bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
  141     }
  142     return true
  143 }
  144 
  145 // patchTail fixes a case where a rune may be incorrectly normalized
  146 // if it is followed by illegal continuation bytes. It returns the
  147 // patched buffer and whether the decomposition is still in progress.
  148 func patchTail(rb *reorderBuffer) bool {
  149     info, p := lastRuneStart(&rb.f, rb.out)
  150     if p == -1 || info.size == 0 {
  151         return true
  152     }
  153     end := p + int(info.size)
  154     extra := len(rb.out) - end
  155     if extra > 0 {
  156         // Potentially allocating memory. However, this only
  157         // happens with ill-formed UTF-8.
  158         x := make([]byte, 0)
  159         x = append(x, rb.out[len(rb.out)-extra:]...)
  160         rb.out = rb.out[:end]
  161         decomposeToLastBoundary(rb)
  162         rb.doFlush()
  163         rb.out = append(rb.out, x...)
  164         return false
  165     }
  166     buf := rb.out[p:]
  167     rb.out = rb.out[:p]
  168     decomposeToLastBoundary(rb)
  169     if s := rb.ss.next(info); s == ssStarter {
  170         rb.doFlush()
  171         rb.ss.first(info)
  172     } else if s == ssOverflow {
  173         rb.doFlush()
  174         rb.insertCGJ()
  175         rb.ss = 0
  176     }
  177     rb.insertUnsafe(inputBytes(buf), 0, info)
  178     return true
  179 }
  180 
  181 func appendQuick(rb *reorderBuffer, i int) int {
  182     if rb.nsrc == i {
  183         return i
  184     }
  185     end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
  186     rb.out = rb.src.appendSlice(rb.out, i, end)
  187     return end
  188 }
  189 
  190 // Append returns f(append(out, b...)).
  191 // The buffer out must be nil, empty, or equal to f(out).
  192 func (f Form) Append(out []byte, src ...byte) []byte {
  193     return f.doAppend(out, inputBytes(src), len(src))
  194 }
  195 
  196 func (f Form) doAppend(out []byte, src input, n int) []byte {
  197     if n == 0 {
  198         return out
  199     }
  200     ft := formTable[f]
  201     // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
  202     if len(out) == 0 {
  203         p, _ := ft.quickSpan(src, 0, n, true)
  204         out = src.appendSlice(out, 0, p)
  205         if p == n {
  206             return out
  207         }
  208         rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
  209         return doAppendInner(&rb, p)
  210     }
  211     rb := reorderBuffer{f: *ft, src: src, nsrc: n}
  212     return doAppend(&rb, out, 0)
  213 }
  214 
  215 func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
  216     rb.setFlusher(out, appendFlush)
  217     src, n := rb.src, rb.nsrc
  218     doMerge := len(out) > 0
  219     if q := src.skipContinuationBytes(p); q > p {
  220         // Move leading non-starters to destination.
  221         rb.out = src.appendSlice(rb.out, p, q)
  222         p = q
  223         doMerge = patchTail(rb)
  224     }
  225     fd := &rb.f
  226     if doMerge {
  227         var info Properties
  228         if p < n {
  229             info = fd.info(src, p)
  230             if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
  231                 if p == 0 {
  232                     decomposeToLastBoundary(rb)
  233                 }
  234                 p = decomposeSegment(rb, p, true)
  235             }
  236         }
  237         if info.size == 0 {
  238             rb.doFlush()
  239             // Append incomplete UTF-8 encoding.
  240             return src.appendSlice(rb.out, p, n)
  241         }
  242         if rb.nrune > 0 {
  243             return doAppendInner(rb, p)
  244         }
  245     }
  246     p = appendQuick(rb, p)
  247     return doAppendInner(rb, p)
  248 }
  249 
  250 func doAppendInner(rb *reorderBuffer, p int) []byte {
  251     for n := rb.nsrc; p < n; {
  252         p = decomposeSegment(rb, p, true)
  253         p = appendQuick(rb, p)
  254     }
  255     return rb.out
  256 }
  257 
  258 // AppendString returns f(append(out, []byte(s))).
  259 // The buffer out must be nil, empty, or equal to f(out).
  260 func (f Form) AppendString(out []byte, src string) []byte {
  261     return f.doAppend(out, inputString(src), len(src))
  262 }
  263 
  264 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
  265 // It is not guaranteed to return the largest such n.
  266 func (f Form) QuickSpan(b []byte) int {
  267     n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
  268     return n
  269 }
  270 
  271 // Span implements transform.SpanningTransformer. It returns a boundary n such
  272 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
  273 func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
  274     n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
  275     if n < len(b) {
  276         if !ok {
  277             err = transform.ErrEndOfSpan
  278         } else {
  279             err = transform.ErrShortSrc
  280         }
  281     }
  282     return n, err
  283 }
  284 
  285 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
  286 // It is not guaranteed to return the largest such n.
  287 func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
  288     n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
  289     if n < len(s) {
  290         if !ok {
  291             err = transform.ErrEndOfSpan
  292         } else {
  293             err = transform.ErrShortSrc
  294         }
  295     }
  296     return n, err
  297 }
  298 
  299 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
  300 // whether any non-normalized parts were found. If atEOF is false, n will
  301 // not point past the last segment if this segment might be become
  302 // non-normalized by appending other runes.
  303 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
  304     var lastCC uint8
  305     ss := streamSafe(0)
  306     lastSegStart := i
  307     for n = end; i < n; {
  308         if j := src.skipASCII(i, n); i != j {
  309             i = j
  310             lastSegStart = i - 1
  311             lastCC = 0
  312             ss = 0
  313             continue
  314         }
  315         info := f.info(src, i)
  316         if info.size == 0 {
  317             if atEOF {
  318                 // include incomplete runes
  319                 return n, true
  320             }
  321             return lastSegStart, true
  322         }
  323         // This block needs to be before the next, because it is possible to
  324         // have an overflow for runes that are starters (e.g. with U+FF9E).
  325         switch ss.next(info) {
  326         case ssStarter:
  327             lastSegStart = i
  328         case ssOverflow:
  329             return lastSegStart, false
  330         case ssSuccess:
  331             if lastCC > info.ccc {
  332                 return lastSegStart, false
  333             }
  334         }
  335         if f.composing {
  336             if !info.isYesC() {
  337                 break
  338             }
  339         } else {
  340             if !info.isYesD() {
  341                 break
  342             }
  343         }
  344         lastCC = info.ccc
  345         i += int(info.size)
  346     }
  347     if i == n {
  348         if !atEOF {
  349             n = lastSegStart
  350         }
  351         return n, true
  352     }
  353     return lastSegStart, false
  354 }
  355 
  356 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
  357 // It is not guaranteed to return the largest such n.
  358 func (f Form) QuickSpanString(s string) int {
  359     n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
  360     return n
  361 }
  362 
  363 // FirstBoundary returns the position i of the first boundary in b
  364 // or -1 if b contains no boundary.
  365 func (f Form) FirstBoundary(b []byte) int {
  366     return f.firstBoundary(inputBytes(b), len(b))
  367 }
  368 
  369 func (f Form) firstBoundary(src input, nsrc int) int {
  370     i := src.skipContinuationBytes(0)
  371     if i >= nsrc {
  372         return -1
  373     }
  374     fd := formTable[f]
  375     ss := streamSafe(0)
  376     // We should call ss.first here, but we can't as the first rune is
  377     // skipped already. This means FirstBoundary can't really determine
  378     // CGJ insertion points correctly. Luckily it doesn't have to.
  379     for {
  380         info := fd.info(src, i)
  381         if info.size == 0 {
  382             return -1
  383         }
  384         if s := ss.next(info); s != ssSuccess {
  385             return i
  386         }
  387         i += int(info.size)
  388         if i >= nsrc {
  389             if !info.BoundaryAfter() && !ss.isMax() {
  390                 return -1
  391             }
  392             return nsrc
  393         }
  394     }
  395 }
  396 
  397 // FirstBoundaryInString returns the position i of the first boundary in s
  398 // or -1 if s contains no boundary.
  399 func (f Form) FirstBoundaryInString(s string) int {
  400     return f.firstBoundary(inputString(s), len(s))
  401 }
  402 
  403 // NextBoundary reports the index of the boundary between the first and next
  404 // segment in b or -1 if atEOF is false and there are not enough bytes to
  405 // determine this boundary.
  406 func (f Form) NextBoundary(b []byte, atEOF bool) int {
  407     return f.nextBoundary(inputBytes(b), len(b), atEOF)
  408 }
  409 
  410 // NextBoundaryInString reports the index of the boundary between the first and
  411 // next segment in b or -1 if atEOF is false and there are not enough bytes to
  412 // determine this boundary.
  413 func (f Form) NextBoundaryInString(s string, atEOF bool) int {
  414     return f.nextBoundary(inputString(s), len(s), atEOF)
  415 }
  416 
  417 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
  418     if nsrc == 0 {
  419         if atEOF {
  420             return 0
  421         }
  422         return -1
  423     }
  424     fd := formTable[f]
  425     info := fd.info(src, 0)
  426     if info.size == 0 {
  427         if atEOF {
  428             return 1
  429         }
  430         return -1
  431     }
  432     ss := streamSafe(0)
  433     ss.first(info)
  434 
  435     for i := int(info.size); i < nsrc; i += int(info.size) {
  436         info = fd.info(src, i)
  437         if info.size == 0 {
  438             if atEOF {
  439                 return i
  440             }
  441             return -1
  442         }
  443         // TODO: Using streamSafe to determine the boundary isn't the same as
  444         // using BoundaryBefore. Determine which should be used.
  445         if s := ss.next(info); s != ssSuccess {
  446             return i
  447         }
  448     }
  449     if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
  450         return -1
  451     }
  452     return nsrc
  453 }
  454 
  455 // LastBoundary returns the position i of the last boundary in b
  456 // or -1 if b contains no boundary.
  457 func (f Form) LastBoundary(b []byte) int {
  458     return lastBoundary(formTable[f], b)
  459 }
  460 
  461 func lastBoundary(fd *formInfo, b []byte) int {
  462     i := len(b)
  463     info, p := lastRuneStart(fd, b)
  464     if p == -1 {
  465         return -1
  466     }
  467     if info.size == 0 { // ends with incomplete rune
  468         if p == 0 { // starts with incomplete rune
  469             return -1
  470         }
  471         i = p
  472         info, p = lastRuneStart(fd, b[:i])
  473         if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
  474             return i
  475         }
  476     }
  477     if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
  478         return i
  479     }
  480     if info.BoundaryAfter() {
  481         return i
  482     }
  483     ss := streamSafe(0)
  484     v := ss.backwards(info)
  485     for i = p; i >= 0 && v != ssStarter; i = p {
  486         info, p = lastRuneStart(fd, b[:i])
  487         if v = ss.backwards(info); v == ssOverflow {
  488             break
  489         }
  490         if p+int(info.size) != i {
  491             if p == -1 { // no boundary found
  492                 return -1
  493             }
  494             return i // boundary after an illegal UTF-8 encoding
  495         }
  496     }
  497     return i
  498 }
  499 
  500 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
  501 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
  502 // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
  503 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
  504     // Force one character to be consumed.
  505     info := rb.f.info(rb.src, sp)
  506     if info.size == 0 {
  507         return 0
  508     }
  509     if s := rb.ss.next(info); s == ssStarter {
  510         // TODO: this could be removed if we don't support merging.
  511         if rb.nrune > 0 {
  512             goto end
  513         }
  514     } else if s == ssOverflow {
  515         rb.insertCGJ()
  516         goto end
  517     }
  518     if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
  519         return int(err)
  520     }
  521     for {
  522         sp += int(info.size)
  523         if sp >= rb.nsrc {
  524             if !atEOF && !info.BoundaryAfter() {
  525                 return int(iShortSrc)
  526             }
  527             break
  528         }
  529         info = rb.f.info(rb.src, sp)
  530         if info.size == 0 {
  531             if !atEOF {
  532                 return int(iShortSrc)
  533             }
  534             break
  535         }
  536         if s := rb.ss.next(info); s == ssStarter {
  537             break
  538         } else if s == ssOverflow {
  539             rb.insertCGJ()
  540             break
  541         }
  542         if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
  543             return int(err)
  544         }
  545     }
  546 end:
  547     if !rb.doFlush() {
  548         return int(iShortDst)
  549     }
  550     return sp
  551 }
  552 
  553 // lastRuneStart returns the runeInfo and position of the last
  554 // rune in buf or the zero runeInfo and -1 if no rune was found.
  555 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
  556     p := len(buf) - 1
  557     for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
  558     }
  559     if p < 0 {
  560         return Properties{}, -1
  561     }
  562     return fd.info(inputBytes(buf), p), p
  563 }
  564 
  565 // decomposeToLastBoundary finds an open segment at the end of the buffer
  566 // and scans it into rb. Returns the buffer minus the last segment.
  567 func decomposeToLastBoundary(rb *reorderBuffer) {
  568     fd := &rb.f
  569     info, i := lastRuneStart(fd, rb.out)
  570     if int(info.size) != len(rb.out)-i {
  571         // illegal trailing continuation bytes
  572         return
  573     }
  574     if info.BoundaryAfter() {
  575         return
  576     }
  577     var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
  578     padd := 0
  579     ss := streamSafe(0)
  580     p := len(rb.out)
  581     for {
  582         add[padd] = info
  583         v := ss.backwards(info)
  584         if v == ssOverflow {
  585             // Note that if we have an overflow, it the string we are appending to
  586             // is not correctly normalized. In this case the behavior is undefined.
  587             break
  588         }
  589         padd++
  590         p -= int(info.size)
  591         if v == ssStarter || p < 0 {
  592             break
  593         }
  594         info, i = lastRuneStart(fd, rb.out[:p])
  595         if int(info.size) != p-i {
  596             break
  597         }
  598     }
  599     rb.ss = ss
  600     // Copy bytes for insertion as we may need to overwrite rb.out.
  601     var buf [maxBufferSize * utf8.UTFMax]byte
  602     cp := buf[:copy(buf[:], rb.out[p:])]
  603     rb.out = rb.out[:p]
  604     for padd--; padd >= 0; padd-- {
  605         info = add[padd]
  606         rb.insertUnsafe(inputBytes(cp), 0, info)
  607         cp = cp[info.size:]
  608     }
  609 }