"Fossies" - the Fresh Open Source Software Archive

Member "install-tl-20200916/tlpkg/tlperl/lib/Encode/encode.h" (7 Mar 2020, 49158 Bytes) of package /windows/misc/install-tl.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 #ifndef ENCODE_H
    2 #define ENCODE_H
    3 
    4 #ifndef H_PERL
    5 /* check whether we're "in perl" so that we can do data parts without
    6    getting extern references to the code parts
    7 */
    8 typedef unsigned char U8;
    9 #endif
   10 
   11 typedef struct encpage_s encpage_t;
   12 
   13 struct encpage_s
   14 {
   15     /* fields ordered to pack nicely on 32-bit machines */
   16     const U8 *const seq;   /* Packed output sequences we generate 
   17                   if we match */
   18     const encpage_t *const next;      /* Page to go to if we match */
   19     const U8   min;        /* Min value of octet to match this entry */
   20     const U8   max;        /* Max value of octet to match this entry */
   21     const U8   dlen;       /* destination length - 
   22                   size of entries in seq */
   23     const U8   slen;       /* source length - 
   24                   number of source octets needed */
   25 };
   26 
   27 /*
   28   At any point in a translation there is a page pointer which points
   29   at an array of the above structures.
   30 
   31   Basic operation :
   32   get octet from source stream.
   33   if (octet >= min && octet < max) {
   34     if slen is 0 then we cannot represent this character.
   35     if we have less than slen octets (including this one) then 
   36       we have a partial character.
   37     otherwise
   38       copy dlen octets from seq + dlen*(octet-min) to output
   39       (dlen may be zero if we don't know yet.)
   40       load page pointer with next to continue.
   41       (is slen is one this is end of a character)
   42       get next octet.
   43   }
   44   else {
   45     increment the page pointer to look at next slot in the array
   46   }
   47 
   48   arrays SHALL be constructed so there is an entry which matches
   49   ..0xFF at the end, and either maps it or indicates no
   50   representation.
   51 
   52   if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
   53 
   54 */
   55 
   56 
   57 typedef struct encode_s encode_t;
   58 struct encode_s
   59 {
   60     const encpage_t *const t_utf8;  /* Starting table for translation from 
   61                        the encoding to UTF-8 form */
   62     const encpage_t *const f_utf8;  /* Starting table for translation 
   63                        from UTF-8 to the encoding */
   64     const U8 *const rep;            /* Replacement character in this
   65                        encoding e.g. "?" */
   66     int        replen;              /* Number of octets in rep */
   67     U8         min_el;              /* Minimum octets to represent a
   68                        character */
   69     U8         max_el;              /* Maximum octets to represent a
   70                        character */
   71     const char *const name[2];      /* name(s) of this encoding */
   72 };
   73 
   74 #ifdef H_PERL
   75 /* See comment at top of file for deviousness */
   76 
   77 extern int do_encode(const encpage_t *enc, const U8 *src, STRLEN *slen,
   78                      U8 *dst, STRLEN dlen, STRLEN *dout, int approx,
   79              const U8 *term, STRLEN tlen);
   80 
   81 extern void Encode_DefineEncoding(encode_t *enc);
   82 
   83 #endif /* H_PERL */
   84 
   85 #define ENCODE_NOSPACE  1
   86 #define ENCODE_PARTIAL  2
   87 #define ENCODE_NOREP    3
   88 #define ENCODE_FALLBACK 4
   89 #define ENCODE_FOUND_TERM 5
   90 
   91 /* Use the perl core value if available; it is portable to EBCDIC */
   92 #ifdef REPLACEMENT_CHARACTER_UTF8
   93 #  define FBCHAR_UTF8       REPLACEMENT_CHARACTER_UTF8
   94 #else
   95 #  define FBCHAR_UTF8           "\xEF\xBF\xBD"
   96 #endif
   97 
   98 #define  ENCODE_DIE_ON_ERR     0x0001 /* croaks immediately */
   99 #define  ENCODE_WARN_ON_ERR    0x0002 /* warn on error; may proceed */
  100 #define  ENCODE_RETURN_ON_ERR  0x0004 /* immediately returns on NOREP */
  101 #define  ENCODE_LEAVE_SRC      0x0008 /* $src updated unless set */
  102 #define  ENCODE_ONLY_PRAGMA_WARNINGS 0x0010 /* when enabled report only warnings configured by pragma warnings, otherwise report all warnings; no effect without ENCODE_WARN_ON_ERR */
  103 #define  ENCODE_PERLQQ         0x0100 /* perlqq fallback string */
  104 #define  ENCODE_HTMLCREF       0x0200 /* HTML character ref. fb mode */
  105 #define  ENCODE_XMLCREF        0x0400 /* XML  character ref. fb mode */
  106 #define  ENCODE_STOP_AT_PARTIAL 0x0800 /* stop at partial explicitly */
  107 
  108 #define  ENCODE_FB_DEFAULT     0x0000
  109 #define  ENCODE_FB_CROAK       0x0001
  110 #define  ENCODE_FB_QUIET       ENCODE_RETURN_ON_ERR
  111 #define  ENCODE_FB_WARN        (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR)
  112 #define  ENCODE_FB_PERLQQ      (ENCODE_PERLQQ|ENCODE_LEAVE_SRC)
  113 #define  ENCODE_FB_HTMLCREF    (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
  114 #define  ENCODE_FB_XMLCREF     (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
  115 
  116 #define encode_ckWARN(c, w) ((c & ENCODE_WARN_ON_ERR)                         \
  117                         && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || ckWARN(w)))
  118 
  119 #ifdef UTF8SKIP
  120 #  ifdef EBCDIC   /* The value on early perls is wrong */
  121 #    undef UTF8_MAXBYTES 
  122 #    define UTF8_MAXBYTES 14
  123 #  endif
  124 #  ifndef UNLIKELY
  125 #    define UNLIKELY(x) (x)
  126 #  endif
  127 #  ifndef LIKELY
  128 #    define LIKELY(x) (x)
  129 #  endif
  130 
  131 /* EBCDIC requires a later perl to work, so the next two definitions are for
  132  * ASCII machines only */
  133 #  ifndef NATIVE_UTF8_TO_I8
  134 #    define NATIVE_UTF8_TO_I8(x) (x)
  135 #  endif
  136 #  ifndef I8_TO_NATIVE_UTF8
  137 #    define I8_TO_NATIVE_UTF8(x)  (x)
  138 #  endif
  139 #  ifndef OFFUNISKIP
  140 #    define OFFUNISKIP(x)  UNISKIP(x)
  141 #  endif
  142 #  ifndef uvoffuni_to_utf8_flags
  143 #    define uvoffuni_to_utf8_flags(a,b,c) uvuni_to_utf8_flags(a,b,c)
  144 #  endif
  145 #  ifndef WARN_SURROGATE    /* Use the overarching category if these
  146                                subcategories are missing */
  147 #    define WARN_SURROGATE WARN_UTF8
  148 #    define WARN_NONCHAR WARN_UTF8
  149 #    define WARN_NON_UNICODE WARN_UTF8
  150      /* If there's only one possible category, then packing is a no-op */
  151 #    define encode_ckWARN_packed(c, w) encode_ckWARN(c, w)
  152 #  else
  153 #    define encode_ckWARN_packed(c, w)                                      \
  154             ((c & ENCODE_WARN_ON_ERR)                                       \
  155         && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || Perl_ckwarn(aTHX_ w)))
  156 #  endif
  157 
  158 /* All these formats take a single UV code point argument */
  159 static const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
  160 static const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
  161                                    " is not recommended for open interchange";
  162 static const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
  163                                    " may not be portable";
  164 
  165 /* If the perl doesn't have the 5.28 functions, this file includes
  166  * stripped-down versions of them but containing enough functionality to be
  167  * suitable for Encode's needs.  Many of the comments have been removed.  But
  168  * you can inspect the 5.28 source if you get stuck.
  169  *
  170  * These could be put in Devel::PPPort, but Encode is likely the only user */
  171 
  172 #if    (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))                     \
  173   && (! defined(utf8n_to_uvchr_msgs) && ! defined(uvchr_to_utf8_flags_msgs))
  174 
  175 #  ifndef hv_stores
  176 #    define hv_stores(hv, key, val) hv_store((hv), ("" key ""), (sizeof(key)-1), (val), 0)
  177 #  endif
  178 
  179 static HV *
  180 S_new_msg_hv(const char * const message, /* The message text */
  181                    U32 categories)  /* Packed warning categories */
  182 {
  183     /* Creates, populates, and returns an HV* that describes an error message
  184      * for the translators between UTF8 and code point */
  185 
  186     dTHX;
  187     SV* msg_sv = newSVpv(message, 0);
  188     SV* category_sv = newSVuv(categories);
  189 
  190     HV* msg_hv = newHV();
  191 
  192     (void) hv_stores(msg_hv, "text", msg_sv);
  193     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
  194 
  195     return msg_hv;
  196 }
  197 
  198 #endif
  199 
  200 #if ! defined(utf8n_to_uvchr_msgs)                      \
  201   && (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))
  202 
  203 #  undef utf8n_to_uvchr     /* Don't use an earlier version: use the version
  204                                defined in this file */
  205 #  define utf8n_to_uvchr(a,b,c,d) utf8n_to_uvchr_msgs(a, b, c, d, 0, NULL)
  206 
  207 #  undef UTF8_IS_START      /* Early perls wrongly accepted C0 and C1 */
  208 #  define UTF8_IS_START(c)  (((U8)(c)) >= 0xc2)
  209 #  ifndef isUTF8_POSSIBLY_PROBLEMATIC
  210 #    ifdef EBCDIC
  211 #      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c > ' ')
  212 #    else
  213 #      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
  214 #    endif
  215 #  endif
  216 #  ifndef UTF8_ALLOW_OVERFLOW
  217 #    define UTF8_ALLOW_OVERFLOW (1U<<31)    /* Choose highest bit to avoid
  218                                                potential conflicts */
  219 #    define UTF8_GOT_OVERFLOW           UTF8_ALLOW_OVERFLOW
  220 #  endif
  221 #  undef UTF8_ALLOW_ANY     /* Early perl definitions don't work properly with
  222                                the code in this file */
  223 #  define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                              \
  224                           |UTF8_ALLOW_NON_CONTINUATION                          \
  225                           |UTF8_ALLOW_SHORT                                     \
  226                           |UTF8_ALLOW_LONG                                      \
  227                           |UTF8_ALLOW_OVERFLOW)
  228 
  229 /* The meanings of these were complemented at some point, but the functions
  230  * bundled in this file use the complemented meanings */
  231 #  ifndef UTF8_DISALLOW_SURROGATE
  232 #    define UTF8_DISALLOW_SURROGATE     UTF8_ALLOW_SURROGATE
  233 #    define UTF8_DISALLOW_NONCHAR       UTF8_ALLOW_FFFF
  234 #    define UTF8_DISALLOW_SUPER         UTF8_ALLOW_FE_FF
  235 
  236      /* In the stripped-down implementation in this file, disallowing is not
  237       * independent of warning */
  238 #    define UTF8_WARN_SURROGATE     UTF8_DISALLOW_SURROGATE
  239 #    define UTF8_WARN_NONCHAR       UTF8_DISALLOW_NONCHAR
  240 #    define UTF8_WARN_SUPER         UTF8_DISALLOW_SUPER
  241 #  endif
  242 #  ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
  243 #    define UTF8_DISALLOW_ILLEGAL_INTERCHANGE                                   \
  244      (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_NONCHAR)
  245 #  endif
  246 #  ifndef UTF8_WARN_ILLEGAL_INTERCHANGE
  247 #    define UTF8_WARN_ILLEGAL_INTERCHANGE                                       \
  248          (UTF8_WARN_SUPER|UTF8_WARN_SURROGATE|UTF8_WARN_NONCHAR)
  249 #  endif
  250 #  ifndef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
  251 #    ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
  252 #      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
  253 #      define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
  254 
  255 #      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)     ((s0) == 0xF1            \
  256                                               && ((s1) & 0xFE ) == 0xB6)
  257 #    else
  258 #      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  259 #      define IS_UTF8_2_BYTE_SUPER(s0, s1)       ((s0) == 0xF4 && (s1) >= 0x90)
  260 #      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)   ((s0) == 0xED && (s1) >= 0xA0)
  261 #    endif
  262 #    ifndef HIGHEST_REPRESENTABLE_UTF8
  263 #      if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
  264 #        ifdef EBCDIC     /* Actually is I8 */
  265 #          define HIGHEST_REPRESENTABLE_UTF8                                    \
  266                    "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
  267 #        else
  268 #          define HIGHEST_REPRESENTABLE_UTF8                                    \
  269                    "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
  270 #        endif
  271 #      endif
  272 #    endif
  273 #  endif
  274 
  275 #  ifndef Newx
  276 #    define Newx(v,n,t) New(0,v,n,t)
  277 #  endif
  278 
  279 #  ifndef PERL_UNUSED_ARG
  280 #    define PERL_UNUSED_ARG(x) ((void)x)
  281 #  endif
  282 
  283 static const char malformed_text[] = "Malformed UTF-8 character";
  284 
  285 static char *
  286 _byte_dump_string(const U8 * const start, const STRLEN len)
  287 {
  288     /* Returns a mortalized C string that is a displayable copy of the 'len' */
  289 
  290     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
  291                                                trailing NUL */
  292     const U8 * s = start;
  293     const U8 * const e = start + len;
  294     char * output;
  295     char * d;
  296     dTHX;
  297 
  298     Newx(output, output_len, char);
  299     SAVEFREEPV(output);
  300 
  301     d = output;
  302     for (s = start; s < e; s++) {
  303         const unsigned high_nibble = (*s & 0xF0) >> 4;
  304         const unsigned low_nibble =  (*s & 0x0F);
  305 
  306         *d++ = '\\';
  307         *d++ = 'x';
  308 
  309         if (high_nibble < 10) {
  310             *d++ = high_nibble + '0';
  311         }
  312         else {
  313             *d++ = high_nibble - 10 + 'a';
  314         }
  315 
  316         if (low_nibble < 10) {
  317             *d++ = low_nibble + '0';
  318         }
  319         else {
  320             *d++ = low_nibble - 10 + 'a';
  321         }
  322     }
  323 
  324     *d = '\0';
  325     return output;
  326 }
  327 
  328 static char *
  329 S_unexpected_non_continuation_text(const U8 * const s,
  330 
  331                                          /* Max number of bytes to print */
  332                                          STRLEN print_len,
  333 
  334                                          /* Which one is the non-continuation */
  335                                          const STRLEN non_cont_byte_pos,
  336 
  337                                          /* How many bytes should there be? */
  338                                          const STRLEN expect_len)
  339 {
  340     /* Return the malformation warning text for an unexpected continuation
  341      * byte. */
  342 
  343     dTHX;
  344     const char * const where = (non_cont_byte_pos == 1)
  345                                ? "immediately"
  346                                : Perl_form(aTHX_ "%d bytes",
  347                                                  (int) non_cont_byte_pos);
  348     const U8 * x = s + non_cont_byte_pos;
  349     const U8 * e = s + print_len;
  350 
  351     /* We don't need to pass this parameter, but since it has already been
  352      * calculated, it's likely faster to pass it; verify under DEBUGGING */
  353     assert(expect_len == UTF8SKIP(s));
  354 
  355     /* As a defensive coding measure, don't output anything past a NUL.  Such
  356      * bytes shouldn't be in the middle of a malformation, and could mark the
  357      * end of the allocated string, and what comes after is undefined */
  358     for (; x < e; x++) {
  359         if (*x == '\0') {
  360             x++;            /* Output this particular NUL */
  361             break;
  362         }
  363     }
  364 
  365     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
  366                            " %s after start byte 0x%02x; need %d bytes, got %d)",
  367                            malformed_text,
  368                            _byte_dump_string(s, x - s),
  369                            *(s + non_cont_byte_pos),
  370                            where,
  371                            *s,
  372                            (int) expect_len,
  373                            (int) non_cont_byte_pos);
  374 }
  375 
  376 static int
  377 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len);
  378 
  379 static int
  380 S_does_utf8_overflow(const U8 * const s,
  381                        const U8 * e,
  382                        const bool consider_overlongs)
  383 {
  384     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
  385      * 'e' - 1 would overflow an IV on this platform. */
  386 
  387 #  if ! defined(UV_IS_QUAD)
  388 
  389     const STRLEN len = e - s;
  390     int is_overlong;
  391 
  392     assert(s <= e && s + UTF8SKIP(s) >= e);
  393     assert(! UTF8_IS_INVARIANT(*s) && e > s);
  394 
  395 #    ifdef EBCDIC
  396 
  397     PERL_UNUSED_ARG(consider_overlongs);
  398 
  399     if (*s != 0xFE) {
  400         return 0;
  401     }
  402 
  403     if (len == 1) {
  404         return -1;
  405     }
  406 
  407 #    else
  408 
  409     if (LIKELY(*s < 0xFE)) {
  410         return 0;
  411     }
  412 
  413     if (! consider_overlongs) {
  414         return 1;
  415     }
  416 
  417     if (len == 1) {
  418         return -1;
  419     }
  420 
  421     is_overlong = S_is_utf8_overlong_given_start_byte_ok(s, len);
  422 
  423     if (is_overlong == 0) {
  424         return 1;
  425     }
  426 
  427     if (is_overlong < 0) {
  428         return -1;
  429     }
  430 
  431     if (*s == 0xFE) {
  432         return 0;
  433     }
  434 
  435 #    endif
  436 
  437     /* Here, ASCII and EBCDIC rejoin:
  438     *  On ASCII:   We have an overlong sequence starting with FF
  439     *  On EBCDIC:  We have a sequence starting with FE. */
  440 
  441     {   /* For C89, use a block so the declaration can be close to its use */
  442 
  443 #    ifdef EBCDIC
  444         const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
  445 #    else
  446         const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
  447 #    endif
  448         const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
  449         const STRLEN cmp_len = MIN(conts_len, len - 1);
  450 
  451         if (cmp_len >= conts_len || memNE(s + 1,
  452                                           conts_for_highest_30_bit,
  453                                           cmp_len))
  454         {
  455             return memGT(s + 1, conts_for_highest_30_bit, cmp_len);
  456         }
  457 
  458         return -1;
  459     }
  460 
  461 #  else /* Below is 64-bit word */
  462 
  463     PERL_UNUSED_ARG(consider_overlongs);
  464 
  465     {
  466         const STRLEN len = e - s;
  467         const U8 *x;
  468         const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
  469 
  470         for (x = s; x < e; x++, y++) {
  471 
  472             if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
  473                 continue;
  474             }
  475             return NATIVE_UTF8_TO_I8(*x) > *y;
  476         }
  477 
  478         if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
  479             return -1;
  480         }
  481 
  482         return 0;
  483     }
  484 
  485 #  endif
  486 
  487 }
  488 
  489 static int
  490 S_isFF_OVERLONG(const U8 * const s, const STRLEN len);
  491 
  492 static int
  493 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  494 {
  495     const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
  496     const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
  497 
  498     assert(len > 1 && UTF8_IS_START(*s));
  499 
  500 #         ifdef EBCDIC
  501 #             define F0_ABOVE_OVERLONG 0xB0
  502 #             define F8_ABOVE_OVERLONG 0xA8
  503 #             define FC_ABOVE_OVERLONG 0xA4
  504 #             define FE_ABOVE_OVERLONG 0xA2
  505 #             define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
  506 #         else
  507 
  508     if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
  509         return 1;
  510     }
  511 
  512 #             define F0_ABOVE_OVERLONG 0x90
  513 #             define F8_ABOVE_OVERLONG 0x88
  514 #             define FC_ABOVE_OVERLONG 0x84
  515 #             define FE_ABOVE_OVERLONG 0x82
  516 #             define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
  517 #         endif
  518 
  519     if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
  520         || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
  521         || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
  522         || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
  523     {
  524         return 1;
  525     }
  526 
  527     /* Check for the FF overlong */
  528     return S_isFF_OVERLONG(s, len);
  529 }
  530 
  531 int
  532 S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
  533 {
  534     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
  535                      MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
  536     {
  537         return 0;
  538     }
  539 
  540     if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
  541         return 1;
  542     }
  543 
  544     return -1;
  545 }
  546 
  547 #  ifndef UTF8_GOT_CONTINUATION
  548 #    define UTF8_GOT_CONTINUATION       UTF8_ALLOW_CONTINUATION
  549 #    define UTF8_GOT_EMPTY              UTF8_ALLOW_EMPTY
  550 #    define UTF8_GOT_LONG               UTF8_ALLOW_LONG
  551 #    define UTF8_GOT_NON_CONTINUATION   UTF8_ALLOW_NON_CONTINUATION
  552 #    define UTF8_GOT_SHORT              UTF8_ALLOW_SHORT
  553 #    define UTF8_GOT_SURROGATE          UTF8_DISALLOW_SURROGATE
  554 #    define UTF8_GOT_NONCHAR            UTF8_DISALLOW_NONCHAR
  555 #    define UTF8_GOT_SUPER              UTF8_DISALLOW_SUPER
  556 #  endif
  557 
  558 #  ifndef UNICODE_IS_SUPER
  559 #    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
  560 #  endif
  561 #  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
  562 #    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)      ((UV) (uv) >= 0xFDD0   \
  563                                                    && (UV) (uv) <= 0xFDEF)
  564 #  endif
  565 #  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
  566 #    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                  \
  567                                           (((UV) (uv) & 0xFFFE) == 0xFFFE)
  568 #  endif
  569 #  ifndef is_NONCHAR_utf8_safe
  570 #    define is_NONCHAR_utf8_safe(s,e)     /*** GENERATED CODE ***/            \
  571 ( ( ( LIKELY((e) > (s)) ) && ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ) ? ( ( 0xEF == ((const U8*)s)[0] ) ?\
  572         ( ( 0xB7 == ((const U8*)s)[1] ) ?                               \
  573         ( ( 0x90 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xAF ) ? 3 : 0 )\
  574         : ( ( 0xBF == ((const U8*)s)[1] ) && ( ( ((const U8*)s)[2] & 0xFE ) == 0xBE ) ) ? 3 : 0 )\
  575     : ( 0xF0 == ((const U8*)s)[0] ) ?                                   \
  576         ( ( ( ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
  577     : ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ?      \
  578         ( ( ( ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
  579     : ( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( 0x8F == ((const U8*)s)[1] ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 ) : 0 )
  580 #  endif
  581 
  582 #  ifndef UTF8_IS_NONCHAR
  583 #    define UTF8_IS_NONCHAR(s, e) (is_NONCHAR_utf8_safe(s,e) > 0)
  584 #  endif
  585 #  ifndef UNICODE_IS_NONCHAR
  586 #    define UNICODE_IS_NONCHAR(uv)                                    \
  587     (   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)                       \
  588      || (   LIKELY( ! UNICODE_IS_SUPER(uv))                         \
  589          && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
  590 #  endif
  591 
  592 #  ifndef UTF8_MAXBYTES
  593 #    define UTF8_MAXBYTES UTF8_MAXLEN
  594 #  endif
  595 
  596 static UV
  597 utf8n_to_uvchr_msgs(const U8 *s,
  598                     STRLEN curlen,
  599                     STRLEN *retlen,
  600                     const U32 flags,
  601                     U32 * errors,
  602                     AV ** msgs)
  603 {
  604     const U8 * const s0 = s;
  605     const U8 * send = NULL;
  606     U32 possible_problems = 0;
  607     UV uv = *s;
  608     STRLEN expectlen   = 0;
  609     U8 * adjusted_s0 = (U8 *) s0;
  610     U8 temp_char_buf[UTF8_MAXBYTES + 1];
  611     UV uv_so_far = 0;
  612     dTHX;
  613 
  614     assert(errors == NULL); /* This functionality has been stripped */
  615 
  616     if (UNLIKELY(curlen == 0)) {
  617         possible_problems |= UTF8_GOT_EMPTY;
  618         curlen = 0;
  619         uv = UNICODE_REPLACEMENT;
  620     goto ready_to_handle_errors;
  621     }
  622 
  623     expectlen = UTF8SKIP(s);
  624 
  625     if (retlen) {
  626     *retlen = expectlen;
  627     }
  628 
  629     if (UTF8_IS_INVARIANT(uv)) {
  630     return uv;
  631     }
  632 
  633     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
  634     possible_problems |= UTF8_GOT_CONTINUATION;
  635         curlen = 1;
  636         uv = UNICODE_REPLACEMENT;
  637     goto ready_to_handle_errors;
  638     }
  639 
  640     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
  641 
  642     send = (U8*) s0;
  643     if (UNLIKELY(curlen < expectlen)) {
  644         possible_problems |= UTF8_GOT_SHORT;
  645         send += curlen;
  646     }
  647     else {
  648         send += expectlen;
  649     }
  650 
  651     for (s = s0 + 1; s < send; s++) {
  652     if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
  653         uv = UTF8_ACCUMULATE(uv, *s);
  654             continue;
  655         }
  656 
  657         possible_problems |= UTF8_GOT_NON_CONTINUATION;
  658         break;
  659     } /* End of loop through the character's bytes */
  660 
  661     curlen = s - s0;
  662 
  663 #     define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
  664 
  665     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
  666         uv_so_far = uv;
  667         uv = UNICODE_REPLACEMENT;
  668     }
  669 
  670     if (UNLIKELY(0 < S_does_utf8_overflow(s0, s, 1))) {
  671         possible_problems |= UTF8_GOT_OVERFLOW;
  672         uv = UNICODE_REPLACEMENT;
  673     }
  674 
  675     if (     (   LIKELY(! possible_problems)
  676               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
  677         || (       UNLIKELY(possible_problems)
  678             && (   UNLIKELY(! UTF8_IS_START(*s0))
  679                 || (   curlen > 1
  680                     && UNLIKELY(0 < S_is_utf8_overlong_given_start_byte_ok(s0,
  681                                                                 s - s0))))))
  682     {
  683         possible_problems |= UTF8_GOT_LONG;
  684 
  685         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
  686             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
  687         {
  688             UV min_uv = uv_so_far;
  689             STRLEN i;
  690 
  691             for (i = curlen; i < expectlen; i++) {
  692                 min_uv = UTF8_ACCUMULATE(min_uv,
  693                                      I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
  694             }
  695 
  696             adjusted_s0 = temp_char_buf;
  697             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
  698         }
  699     }
  700 
  701     /* Here, we have found all the possible problems, except for when the input
  702      * is for a problematic code point not allowed by the input parameters. */
  703 
  704                                 /* uv is valid for overlongs */
  705     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
  706                    && uv >= UNICODE_SURROGATE_FIRST)
  707             || (   UNLIKELY(possible_problems)
  708                 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
  709     && ((flags & ( UTF8_DISALLOW_NONCHAR
  710                       |UTF8_DISALLOW_SURROGATE
  711                       |UTF8_DISALLOW_SUPER))))
  712     {
  713         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
  714             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
  715                 possible_problems |= UTF8_GOT_SURROGATE;
  716             }
  717             else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
  718                 possible_problems |= UTF8_GOT_SUPER;
  719             }
  720             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
  721                 possible_problems |= UTF8_GOT_NONCHAR;
  722             }
  723         }
  724         else {
  725             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
  726                                 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
  727             {
  728                 possible_problems |= UTF8_GOT_SUPER;
  729             }
  730             else if (curlen > 1) {
  731                 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
  732                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
  733                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
  734                 {
  735                     possible_problems |= UTF8_GOT_SUPER;
  736                 }
  737                 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
  738                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
  739                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
  740                 {
  741                     possible_problems |= UTF8_GOT_SURROGATE;
  742                 }
  743             }
  744         }
  745     }
  746 
  747   ready_to_handle_errors:
  748 
  749     if (UNLIKELY(possible_problems)) {
  750         bool disallowed = FALSE;
  751         const U32 orig_problems = possible_problems;
  752 
  753         if (msgs) {
  754             *msgs = NULL;
  755         }
  756 
  757         while (possible_problems) { /* Handle each possible problem */
  758             UV pack_warn = 0;
  759             char * message = NULL;
  760             U32 this_flag_bit = 0;
  761 
  762             /* Each 'if' clause handles one problem.  They are ordered so that
  763              * the first ones' messages will be displayed before the later
  764              * ones; this is kinda in decreasing severity order.  But the
  765              * overlong must come last, as it changes 'uv' looked at by the
  766              * others */
  767             if (possible_problems & UTF8_GOT_OVERFLOW) {
  768 
  769                 /* Overflow means also got a super; we handle both here */
  770                 possible_problems
  771                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER);
  772 
  773                 /* Disallow if any of the categories say to */
  774                 if ( ! (flags &  UTF8_ALLOW_OVERFLOW)
  775                     || (flags &  UTF8_DISALLOW_SUPER))
  776                 {
  777                     disallowed = TRUE;
  778                 }
  779 
  780                 /* Likewise, warn if any say to */
  781                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)) {
  782 
  783                     /* The warnings code explicitly says it doesn't handle the
  784                      * case of packWARN2 and two categories which have
  785                      * parent-child relationship.  Even if it works now to
  786                      * raise the warning if either is enabled, it wouldn't
  787                      * necessarily do so in the future.  We output (only) the
  788                      * most dire warning */
  789                     if (! (flags & UTF8_CHECK_ONLY)) {
  790                         if (msgs || ckWARN_d(WARN_UTF8)) {
  791                             pack_warn = packWARN(WARN_UTF8);
  792                         }
  793                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
  794                             pack_warn = packWARN(WARN_NON_UNICODE);
  795                         }
  796                         if (pack_warn) {
  797                             message = Perl_form(aTHX_ "%s: %s (overflows)",
  798                                             malformed_text,
  799                                             _byte_dump_string(s0, curlen));
  800                             this_flag_bit = UTF8_GOT_OVERFLOW;
  801                         }
  802                     }
  803                 }
  804             }
  805             else if (possible_problems & UTF8_GOT_EMPTY) {
  806                 possible_problems &= ~UTF8_GOT_EMPTY;
  807 
  808                 if (! (flags & UTF8_ALLOW_EMPTY)) {
  809                     disallowed = TRUE;
  810                     if (  (msgs
  811                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
  812                     {
  813                         pack_warn = packWARN(WARN_UTF8);
  814                         message = Perl_form(aTHX_ "%s (empty string)",
  815                                                    malformed_text);
  816                         this_flag_bit = UTF8_GOT_EMPTY;
  817                     }
  818                 }
  819             }
  820             else if (possible_problems & UTF8_GOT_CONTINUATION) {
  821                 possible_problems &= ~UTF8_GOT_CONTINUATION;
  822 
  823                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
  824                     disallowed = TRUE;
  825                     if ((   msgs
  826                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
  827                     {
  828                         pack_warn = packWARN(WARN_UTF8);
  829                         message = Perl_form(aTHX_
  830                                 "%s: %s (unexpected continuation byte 0x%02x,"
  831                                 " with no preceding start byte)",
  832                                 malformed_text,
  833                                 _byte_dump_string(s0, 1), *s0);
  834                         this_flag_bit = UTF8_GOT_CONTINUATION;
  835                     }
  836                 }
  837             }
  838             else if (possible_problems & UTF8_GOT_SHORT) {
  839                 possible_problems &= ~UTF8_GOT_SHORT;
  840 
  841                 if (! (flags & UTF8_ALLOW_SHORT)) {
  842                     disallowed = TRUE;
  843                     if ((   msgs
  844                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
  845                     {
  846                         pack_warn = packWARN(WARN_UTF8);
  847                         message = Perl_form(aTHX_
  848                              "%s: %s (too short; %d byte%s available, need %d)",
  849                              malformed_text,
  850                              _byte_dump_string(s0, send - s0),
  851                              (int)curlen,
  852                              curlen == 1 ? "" : "s",
  853                              (int)expectlen);
  854                         this_flag_bit = UTF8_GOT_SHORT;
  855                     }
  856                 }
  857 
  858             }
  859             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
  860                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
  861 
  862                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
  863                     disallowed = TRUE;
  864                     if ((   msgs
  865                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
  866                     {
  867                         int printlen = s - s0;
  868                         pack_warn = packWARN(WARN_UTF8);
  869                         message = Perl_form(aTHX_ "%s",
  870                             S_unexpected_non_continuation_text(s0,
  871                                                             printlen,
  872                                                             s - s0,
  873                                                             (int) expectlen));
  874                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
  875                     }
  876                 }
  877             }
  878             else if (possible_problems & UTF8_GOT_SURROGATE) {
  879                 possible_problems &= ~UTF8_GOT_SURROGATE;
  880 
  881                 if (flags & UTF8_WARN_SURROGATE) {
  882 
  883                     if (   ! (flags & UTF8_CHECK_ONLY)
  884                         && (msgs || ckWARN_d(WARN_SURROGATE)))
  885                     {
  886                         pack_warn = packWARN(WARN_SURROGATE);
  887 
  888                         /* These are the only errors that can occur with a
  889                         * surrogate when the 'uv' isn't valid */
  890                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
  891                             message = Perl_form(aTHX_
  892                                     "UTF-16 surrogate (any UTF-8 sequence that"
  893                                     " starts with \"%s\" is for a surrogate)",
  894                                     _byte_dump_string(s0, curlen));
  895                         }
  896                         else {
  897                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
  898                         }
  899                         this_flag_bit = UTF8_GOT_SURROGATE;
  900                     }
  901                 }
  902 
  903                 if (flags & UTF8_DISALLOW_SURROGATE) {
  904                     disallowed = TRUE;
  905                 }
  906             }
  907             else if (possible_problems & UTF8_GOT_SUPER) {
  908                 possible_problems &= ~UTF8_GOT_SUPER;
  909 
  910                 if (flags & UTF8_WARN_SUPER) {
  911 
  912                     if (   ! (flags & UTF8_CHECK_ONLY)
  913                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
  914                     {
  915                         pack_warn = packWARN(WARN_NON_UNICODE);
  916 
  917                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
  918                             message = Perl_form(aTHX_
  919                                     "Any UTF-8 sequence that starts with"
  920                                     " \"%s\" is for a non-Unicode code point,"
  921                                     " may not be portable",
  922                                     _byte_dump_string(s0, curlen));
  923                         }
  924                         else {
  925                             message = Perl_form(aTHX_ super_cp_format, uv);
  926                         }
  927                         this_flag_bit = UTF8_GOT_SUPER;
  928                     }
  929                 }
  930 
  931                 if (flags & UTF8_DISALLOW_SUPER) {
  932                     disallowed = TRUE;
  933                 }
  934             }
  935             else if (possible_problems & UTF8_GOT_NONCHAR) {
  936                 possible_problems &= ~UTF8_GOT_NONCHAR;
  937 
  938                 if (flags & UTF8_WARN_NONCHAR) {
  939 
  940                     if (  ! (flags & UTF8_CHECK_ONLY)
  941                         && (msgs || ckWARN_d(WARN_NONCHAR)))
  942                     {
  943                         /* The code above should have guaranteed that we don't
  944                          * get here with errors other than overlong */
  945                         assert (! (orig_problems
  946                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
  947 
  948                         pack_warn = packWARN(WARN_NONCHAR);
  949                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
  950                         this_flag_bit = UTF8_GOT_NONCHAR;
  951                     }
  952                 }
  953 
  954                 if (flags & UTF8_DISALLOW_NONCHAR) {
  955                     disallowed = TRUE;
  956                 }
  957             }
  958             else if (possible_problems & UTF8_GOT_LONG) {
  959                 possible_problems &= ~UTF8_GOT_LONG;
  960 
  961                 if (flags & UTF8_ALLOW_LONG) {
  962                     uv = UNICODE_REPLACEMENT;
  963                 }
  964                 else {
  965                     disallowed = TRUE;
  966 
  967                     if ((   msgs
  968                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
  969                     {
  970                         pack_warn = packWARN(WARN_UTF8);
  971 
  972                         /* These error types cause 'uv' to be something that
  973                          * isn't what was intended, so can't use it in the
  974                          * message.  The other error types either can't
  975                          * generate an overlong, or else the 'uv' is valid */
  976                         if (orig_problems &
  977                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
  978                         {
  979                             message = Perl_form(aTHX_
  980                                     "%s: %s (any UTF-8 sequence that starts"
  981                                     " with \"%s\" is overlong which can and"
  982                                     " should be represented with a"
  983                                     " different, shorter sequence)",
  984                                     malformed_text,
  985                                     _byte_dump_string(s0, send - s0),
  986                                     _byte_dump_string(s0, curlen));
  987                         }
  988                         else {
  989                             U8 tmpbuf[UTF8_MAXBYTES+1];
  990                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
  991                                                                         uv, 0);
  992                             /* Don't use U+ for non-Unicode code points, which
  993                              * includes those in the Latin1 range */
  994                             const char * preface = (    uv > PERL_UNICODE_MAX
  995 #  ifdef EBCDIC
  996                                                      || uv <= 0xFF
  997 #  endif
  998                                                     )
  999                                                    ? "0x"
 1000                                                    : "U+";
 1001                             message = Perl_form(aTHX_
 1002                                 "%s: %s (overlong; instead use %s to represent"
 1003                                 " %s%0*" UVXf ")",
 1004                                 malformed_text,
 1005                                 _byte_dump_string(s0, send - s0),
 1006                                 _byte_dump_string(tmpbuf, e - tmpbuf),
 1007                                 preface,
 1008                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
 1009                                                          small code points */
 1010                                 UNI_TO_NATIVE(uv));
 1011                         }
 1012                         this_flag_bit = UTF8_GOT_LONG;
 1013                     }
 1014                 }
 1015             } /* End of looking through the possible flags */
 1016 
 1017             /* Display the message (if any) for the problem being handled in
 1018              * this iteration of the loop */
 1019             if (message) {
 1020                 if (msgs) {
 1021                     assert(this_flag_bit);
 1022 
 1023                     if (*msgs == NULL) {
 1024                         *msgs = newAV();
 1025                     }
 1026 
 1027                     av_push(*msgs, newRV_noinc((SV*) S_new_msg_hv(message,
 1028                                                                 pack_warn)));
 1029                 }
 1030                 else if (PL_op)
 1031                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
 1032                                                  OP_DESC(PL_op));
 1033                 else
 1034                     Perl_warner(aTHX_ pack_warn, "%s", message);
 1035             }
 1036         }   /* End of 'while (possible_problems)' */
 1037 
 1038         if (retlen) {
 1039             *retlen = curlen;
 1040         }
 1041 
 1042         if (disallowed) {
 1043             if (flags & UTF8_CHECK_ONLY && retlen) {
 1044                 *retlen = ((STRLEN) -1);
 1045             }
 1046             return 0;
 1047         }
 1048     }
 1049 
 1050     return UNI_TO_NATIVE(uv);
 1051 }
 1052 
 1053 static STRLEN
 1054 S_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 1055 {
 1056     STRLEN len;
 1057     const U8 *x;
 1058 
 1059     assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
 1060     assert(! UTF8_IS_INVARIANT(*s));
 1061 
 1062     if (UNLIKELY(! UTF8_IS_START(*s))) {
 1063         return 0;
 1064     }
 1065 
 1066     /* Examine a maximum of a single whole code point */
 1067     if (e - s > UTF8SKIP(s)) {
 1068         e = s + UTF8SKIP(s);
 1069     }
 1070 
 1071     len = e - s;
 1072 
 1073     if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
 1074         const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
 1075 
 1076         if (  (flags & UTF8_DISALLOW_SUPER)
 1077             && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
 1078         {
 1079             return 0;           /* Above Unicode */
 1080         }
 1081 
 1082         if (len > 1) {
 1083             const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
 1084 
 1085             if (   (flags & UTF8_DISALLOW_SUPER)
 1086                 &&  UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
 1087             {
 1088                 return 0;       /* Above Unicode */
 1089             }
 1090 
 1091             if (   (flags & UTF8_DISALLOW_SURROGATE)
 1092                 &&  UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
 1093             {
 1094                 return 0;       /* Surrogate */
 1095             }
 1096 
 1097             if (  (flags & UTF8_DISALLOW_NONCHAR)
 1098                 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
 1099             {
 1100                 return 0;       /* Noncharacter code point */
 1101             }
 1102         }
 1103     }
 1104 
 1105     for (x = s + 1; x < e; x++) {
 1106         if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
 1107             return 0;
 1108         }
 1109     }
 1110 
 1111     if (len > 1 && S_is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
 1112         return 0;
 1113     }
 1114 
 1115     if (0 < S_does_utf8_overflow(s, e, 0)) {
 1116         return 0;
 1117     }
 1118 
 1119     return UTF8SKIP(s);
 1120 }
 1121 
 1122 #  undef is_utf8_valid_partial_char_flags
 1123 
 1124 static bool
 1125 is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
 1126 {
 1127 
 1128     return S_is_utf8_char_helper(s, e, flags) > 0;
 1129 }
 1130 
 1131 #  undef is_utf8_string_loc_flags
 1132 
 1133 static bool
 1134 is_utf8_string_loc_flags(const U8 *s, STRLEN len, const U8 **ep, const U32 flags)
 1135 {
 1136     const U8* send = s + len;
 1137 
 1138     assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
 1139 
 1140     while (s < send) {
 1141         if (UTF8_IS_INVARIANT(*s)) {
 1142             s++;
 1143         }
 1144         else if (     UNLIKELY(send - s < UTF8SKIP(s))
 1145                  || ! S_is_utf8_char_helper(s, send, flags))
 1146         {
 1147             *ep = s;
 1148             return 0;
 1149         }
 1150         else {
 1151             s += UTF8SKIP(s);
 1152         }
 1153     }
 1154 
 1155     *ep = send;
 1156 
 1157     return 1;
 1158 }
 1159 
 1160 #endif
 1161 
 1162 #if defined(IN_UNICODE_XS) && ! defined(uvchr_to_utf8_flags_msgs)
 1163 
 1164 #  define MY_SHIFT   UTF_ACCUMULATION_SHIFT
 1165 #  define MY_MARK    UTF_CONTINUATION_MARK
 1166 #  define MY_MASK    UTF_CONTINUATION_MASK
 1167 
 1168 static const char cp_above_legal_max[] =
 1169                         "Use of code point 0x%" UVXf " is not allowed; the"
 1170                         " permissible max is 0x%" UVXf;
 1171 
 1172 /* These two can be dummys, as they are not looked at by the function, which
 1173  * has hard-coded into it what flags it is expecting are */
 1174 #  ifndef UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
 1175 #    define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 0
 1176 #  endif
 1177 #  ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
 1178 #    define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
 1179 #  endif
 1180 
 1181 #  ifndef OFFUNI_IS_INVARIANT
 1182 #    define OFFUNI_IS_INVARIANT(cp) UNI_IS_INVARIANT(cp)
 1183 #  endif
 1184 #  ifndef MAX_EXTERNALLY_LEGAL_CP
 1185 #    define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
 1186 #  endif
 1187 #  ifndef LATIN1_TO_NATIVE
 1188 #    define LATIN1_TO_NATIVE(a) ASCII_TO_NATIVE(a)
 1189 #  endif
 1190 #  ifndef I8_TO_NATIVE_UTF8
 1191 #    define I8_TO_NATIVE_UTF8(a) NATIVE_TO_UTF(a)
 1192 #  endif
 1193 #  ifndef MAX_UTF8_TWO_BYTE
 1194 #    define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
 1195 #  endif
 1196 #  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
 1197 #    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)    ((UV) (uv) >= 0xFDD0   \
 1198                                                  && (UV) (uv) <= 0xFDEF)
 1199 #  endif
 1200 #  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
 1201 #    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                \
 1202                                           (((UV) (uv) & 0xFFFE) == 0xFFFE)
 1203 #  endif
 1204 #  ifndef UNICODE_IS_SUPER
 1205 #    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
 1206 #  endif
 1207 #  ifndef OFFUNISKIP
 1208 #    define OFFUNISKIP(cp)    UNISKIP(NATIVE_TO_UNI(cp))
 1209 #  endif
 1210 
 1211 #  define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                 \
 1212     STMT_START {                                                    \
 1213         U32 category = packWARN(WARN_SURROGATE);                    \
 1214         const char * format = surrogate_cp_format;                  \
 1215         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
 1216                                  category);                         \
 1217         return NULL;                                                \
 1218     } STMT_END;
 1219 
 1220 #  define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                   \
 1221     STMT_START {                                                    \
 1222         U32 category = packWARN(WARN_NONCHAR);                      \
 1223         const char * format = nonchar_cp_format;                    \
 1224         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
 1225                                  category);                         \
 1226         return NULL;                                                \
 1227     } STMT_END;
 1228 
 1229 static U8 *
 1230 uvchr_to_utf8_flags_msgs(U8 *d, UV uv, const UV flags, HV** msgs)
 1231 {
 1232     dTHX;
 1233 
 1234     assert(msgs);
 1235 
 1236     PERL_UNUSED_ARG(flags);
 1237 
 1238     uv = NATIVE_TO_UNI(uv);
 1239 
 1240     *msgs = NULL;
 1241 
 1242     if (OFFUNI_IS_INVARIANT(uv)) {
 1243     *d++ = LATIN1_TO_NATIVE(uv);
 1244     return d;
 1245     }
 1246 
 1247     if (uv <= MAX_UTF8_TWO_BYTE) {
 1248         *d++ = I8_TO_NATIVE_UTF8(( uv >> MY_SHIFT) | UTF_START_MARK(2));
 1249         *d++ = I8_TO_NATIVE_UTF8(( uv   & MY_MASK) | MY_MARK);
 1250         return d;
 1251     }
 1252 
 1253     /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
 1254      * below, the 16 is for start bytes E0-EF (which are all the possible ones
 1255      * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
 1256      * contribute MY_SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
 1257      * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
 1258      * 0x800-0xFFFF on ASCII */
 1259     if (uv < (16 * (1U << (2 * MY_SHIFT)))) {
 1260     *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * MY_SHIFT)) | UTF_START_MARK(3));
 1261     *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
 1262     *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
 1263 
 1264 #ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
 1265                    aren't tested here */
 1266         /* The most likely code points in this range are below the surrogates.
 1267          * Do an extra test to quickly exclude those. */
 1268         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
 1269             if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
 1270                          || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
 1271             {
 1272                 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 1273             }
 1274             else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 1275                 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
 1276             }
 1277         }
 1278 #endif
 1279     return d;
 1280     }
 1281 
 1282     /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
 1283      * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
 1284      * happen starting with 4-byte characters on ASCII platforms.  We unify the
 1285      * code for these with EBCDIC, even though some of them require 5-bytes on
 1286      * those, because khw believes the code saving is worth the very slight
 1287      * performance hit on these high EBCDIC code points. */
 1288 
 1289     if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
 1290         const char * format = super_cp_format;
 1291         U32 category = packWARN(WARN_NON_UNICODE);
 1292         if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
 1293             Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
 1294         }
 1295         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), category);
 1296         return NULL;
 1297     }
 1298     else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
 1299         HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 1300     }
 1301 
 1302     /* Test for and handle 4-byte result.   In the test immediately below, the
 1303      * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
 1304      * characters).  The 3 is for 3 continuation bytes; these each contribute
 1305      * MY_SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
 1306      * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
 1307      * 0x1_0000-0x1F_FFFF on ASCII */
 1308     if (uv < (8 * (1U << (3 * MY_SHIFT)))) {
 1309     *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * MY_SHIFT)) | UTF_START_MARK(4));
 1310     *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
 1311     *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
 1312     *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
 1313 
 1314 #ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
 1315                    characters.  The end-plane non-characters for EBCDIC were
 1316                    handled just above */
 1317         if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
 1318             HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 1319         }
 1320         else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 1321             HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
 1322         }
 1323 #endif
 1324 
 1325     return d;
 1326     }
 1327 
 1328     /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
 1329      * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
 1330      * format.  The unrolled version above turns out to not save all that much
 1331      * time, and at these high code points (well above the legal Unicode range
 1332      * on ASCII platforms, and well above anything in common use in EBCDIC),
 1333      * khw believes that less code outweighs slight performance gains. */
 1334 
 1335     {
 1336     STRLEN len  = OFFUNISKIP(uv);
 1337     U8 *p = d+len-1;
 1338     while (p > d) {
 1339         *p-- = I8_TO_NATIVE_UTF8((uv & MY_MASK) | MY_MARK);
 1340         uv >>= MY_SHIFT;
 1341     }
 1342     *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 1343     return d+len;
 1344     }
 1345 }
 1346 
 1347 #endif  /* End of defining our own uvchr_to_utf8_flags_msgs() */
 1348 #endif  /* End of UTF8SKIP */
 1349 
 1350 #endif /* ENCODE_H */