"Fossies" - the Fresh Open Source Software Archive

Member "utf8proc-2.4.0/utf8proc.h" (11 May 2019, 30620 Bytes) of package /linux/privat/utf8proc-2.4.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "utf8proc.h", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.3.0_vs_2.4.0.

    1 /*
    2  * Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
    3  * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
    4  *
    5  * Permission is hereby granted, free of charge, to any person obtaining a
    6  * copy of this software and associated documentation files (the "Software"),
    7  * to deal in the Software without restriction, including without limitation
    8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    9  * and/or sell copies of the Software, and to permit persons to whom the
   10  * Software is furnished to do so, subject to the following conditions:
   11  *
   12  * The above copyright notice and this permission notice shall be included in
   13  * all copies or substantial portions of the Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   21  * DEALINGS IN THE SOFTWARE.
   22  */
   23 
   24 
   25 /**
   26  * @mainpage
   27  *
   28  * utf8proc is a free/open-source (MIT/expat licensed) C library
   29  * providing Unicode normalization, case-folding, and other operations
   30  * for strings in the UTF-8 encoding; the supported Unicode version is
   31  * reported by @ref utf8proc_unicode_version.  See the utf8proc home page (http://julialang.org/utf8proc/)
   32  * for downloads and other information, or the source code on github
   33  * (https://github.com/JuliaLang/utf8proc).
   34  *
   35  * For the utf8proc API documentation, see: @ref utf8proc.h
   36  *
   37  * The features of utf8proc include:
   38  *
   39  * - Transformation of strings (@ref utf8proc_map) to:
   40  *    - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
   41  *    - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
   42  *    - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
   43  *    - case-folding (@ref UTF8PROC_CASEFOLD)
   44  * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
   45  * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
   46  * - Character-width computation: @ref utf8proc_charwidth
   47  * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
   48  * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
   49  */
   50 
   51 /** @file */
   52 
   53 #ifndef UTF8PROC_H
   54 #define UTF8PROC_H
   55 
   56 /** @name API version
   57  *
   58  * The utf8proc API version MAJOR.MINOR.PATCH, following
   59  * semantic-versioning rules (http://semver.org) based on API
   60  * compatibility.
   61  *
   62  * This is also returned at runtime by @ref utf8proc_version; however, the
   63  * runtime version may append a string like "-dev" to the version number
   64  * for prerelease versions.
   65  *
   66  * @note The shared-library version number in the Makefile
   67  *       (and CMakeLists.txt, and MANIFEST) may be different,
   68  *       being based on ABI compatibility rather than API compatibility.
   69  */
   70 /** @{ */
   71 /** The MAJOR version number (increased when backwards API compatibility is broken). */
   72 #define UTF8PROC_VERSION_MAJOR 2
   73 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
   74 #define UTF8PROC_VERSION_MINOR 4
   75 /** The PATCH version number (increased for fixes that do not change the API). */
   76 #define UTF8PROC_VERSION_PATCH 0
   77 /** @} */
   78 
   79 #include <stdlib.h>
   80 
   81 #if defined(_MSC_VER) && _MSC_VER < 1800
   82 /* MSVC prior to 2013 (_MSC_VER < 1800) lacked stdbool.h and inttypes.h, so the fixed-width types are spelled out by hand. */
   83 typedef signed char utf8proc_int8_t;
   84 typedef unsigned char utf8proc_uint8_t;
   85 typedef short utf8proc_int16_t;
   86 typedef unsigned short utf8proc_uint16_t;
   87 typedef int utf8proc_int32_t;
   88 typedef unsigned int utf8proc_uint32_t;
   89 #  ifdef _WIN64 /* 64-bit Windows: the size types are 64 bits wide */
   90 typedef __int64 utf8proc_ssize_t;
   91 typedef unsigned __int64 utf8proc_size_t;
   92 #  else /* otherwise the size types are plain int / unsigned int */
   93 typedef int utf8proc_ssize_t;
   94 typedef unsigned int utf8proc_size_t;
   95 #  endif /* _WIN64 */
   96 #  ifndef __cplusplus
   97 /* emulate C99 bool; true/false are only defined if no one else has defined them yet */
   98 typedef unsigned char utf8proc_bool;
   99 #    ifndef __bool_true_false_are_defined
  100 #      define false 0
  101 #      define true 1
  102 #      define __bool_true_false_are_defined 1
  103 #    endif /* __bool_true_false_are_defined */
  104 #  else /* C++ already provides a built-in bool */
  105 typedef bool utf8proc_bool;
  106 #  endif /* __cplusplus */
  107 #else /* every other compiler: take the types from <stddef.h>, <stdbool.h> and <inttypes.h> */
  108 #  include <stddef.h>
  109 #  include <stdbool.h>
  110 #  include <inttypes.h>
  111 typedef int8_t utf8proc_int8_t;
  112 typedef uint8_t utf8proc_uint8_t;
  113 typedef int16_t utf8proc_int16_t;
  114 typedef uint16_t utf8proc_uint16_t;
  115 typedef int32_t utf8proc_int32_t;
  116 typedef uint32_t utf8proc_uint32_t;
  117 typedef size_t utf8proc_size_t;
  118 typedef ptrdiff_t utf8proc_ssize_t;
  119 typedef bool utf8proc_bool;
  120 #endif /* defined(_MSC_VER) && _MSC_VER < 1800 */
  121 #include <limits.h>
  122 
  123 #ifdef UTF8PROC_STATIC /* UTF8PROC_DLLEXPORT expands to nothing when UTF8PROC_STATIC is defined */
  124 #  define UTF8PROC_DLLEXPORT
  125 #else
  126 #  ifdef _WIN32
  127 #    ifdef UTF8PROC_EXPORTS /* presumably defined only while building the DLL itself; confirm against the build files */
  128 #      define UTF8PROC_DLLEXPORT __declspec(dllexport)
  129 #    else /* everyone else imports the symbols */
  130 #      define UTF8PROC_DLLEXPORT __declspec(dllimport)
  131 #    endif /* UTF8PROC_EXPORTS */
  132 #  elif __GNUC__ >= 4 /* an undefined __GNUC__ evaluates to 0 in #if/#elif, so non-GNU compilers take the empty definition below */
  133 #    define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
  134 #  else
  135 #    define UTF8PROC_DLLEXPORT
  136 #  endif /* _WIN32 / __GNUC__ */
  137 #endif /* UTF8PROC_STATIC */
  138 
  139 #ifdef __cplusplus
  140 extern "C" {
  141 #endif
  142 
  143 /**
  144  * Option flags used by several functions in the library (combine them with bitwise or, `|`).
  145  */
  146 typedef enum {
  147   /** The given UTF-8 input is NUL-terminated (processing stops at the first zero byte). */
  148   UTF8PROC_NULLTERM  = (1<<0),
  149   /** Unicode Versioning Stability has to be respected. */
  150   UTF8PROC_STABLE    = (1<<1),
  151   /** Compatibility decomposition (i.e. formatting information is lost). */
  152   UTF8PROC_COMPAT    = (1<<2),
  153   /** Return a result with composed characters. */
  154   UTF8PROC_COMPOSE   = (1<<3),
  155   /** Return a result with decomposed characters. */
  156   UTF8PROC_DECOMPOSE = (1<<4),
  157   /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
  158   UTF8PROC_IGNORE    = (1<<5),
  159   /** Return an error, if the input contains unassigned codepoints. */
  160   UTF8PROC_REJECTNA  = (1<<6),
  161   /**
  162    * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a
  163    * line break, and should be converted to the codepoint for line
  164    * separation (LS).
  165    */
  166   UTF8PROC_NLF2LS    = (1<<7),
  167   /**
  168    * Indicating that NLF-sequences are representing a paragraph break, and
  169    * should be converted to the codepoint for paragraph separation
  170    * (PS).
  171    */
  172   UTF8PROC_NLF2PS    = (1<<8),
  173   /** Indicating that the meaning of NLF-sequences is unknown (both NLF2LS and NLF2PS bits set). */
  174   UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
  175   /** Strips and/or converts control characters.
  176    *
  177    * NLF-sequences are transformed into space, except if one of the
  178    * NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF)
  179    * are treated as a NLF-sequence in this case.  All other control
  180    * characters are simply removed.
  181    */
  182   UTF8PROC_STRIPCC   = (1<<9),
  183   /**
  184    * Performs unicode case folding, to be able to do a case-insensitive
  185    * string comparison.
  186    */
  187   UTF8PROC_CASEFOLD  = (1<<10),
  188   /**
  189    * Inserts 0xFF bytes at the beginning of each sequence which is
  190    * representing a single grapheme cluster (see UAX#29).
  191    */
  192   UTF8PROC_CHARBOUND = (1<<11),
  193   /** Lumps certain characters together.
  194    *
  195    * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.
  196    *
  197    * If NLF2LF is set, this includes a transformation of paragraph and
  198    * line separators to ASCII line-feed (LF).
  199    */
  200   UTF8PROC_LUMP      = (1<<12),
  201   /** Strips all character markings.
  202    *
  203    * This includes non-spacing, spacing and enclosing (i.e. accents).
  204    * @note This option works only with @ref UTF8PROC_COMPOSE or
  205    *       @ref UTF8PROC_DECOMPOSE
  206    */
  207   UTF8PROC_STRIPMARK = (1<<13),
  208   /**
  209    * Strip unassigned codepoints.
  210    */
  211   UTF8PROC_STRIPNA    = (1<<14),
  212 } utf8proc_option_t;
  213 
  214 /** @name Error codes
  215  * Negative error codes returned by almost all functions (see @ref utf8proc_errmsg for a textual description).
  216  */
  217 /** @{ */
  218 /** Memory could not be allocated. */
  219 #define UTF8PROC_ERROR_NOMEM -1
  220 /** The given string is too long to be processed. */
  221 #define UTF8PROC_ERROR_OVERFLOW -2
  222 /** The given string is not a legal UTF-8 string. */
  223 #define UTF8PROC_ERROR_INVALIDUTF8 -3
  224 /** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
  225 #define UTF8PROC_ERROR_NOTASSIGNED -4
  226 /** Invalid options have been used. */
  227 #define UTF8PROC_ERROR_INVALIDOPTS -5
  228 /** @} */
  229 
  230 /* @name Types */
  231 
  232 /** Holds the value of a property (a signed 16-bit integer). */
  233 typedef utf8proc_int16_t utf8proc_propval_t;
  234 
  235 /** Struct containing information about a codepoint (see @ref utf8proc_get_property). */
  236 typedef struct utf8proc_property_struct {
  237   /**
  238    * Unicode category.
  239    * @see utf8proc_category_t.
  240    */
  241   utf8proc_propval_t category;
  242   utf8proc_propval_t combining_class;
  243   /**
  244    * Bidirectional class.
  245    * @see utf8proc_bidi_class_t.
  246    */
  247   utf8proc_propval_t bidi_class;
  248   /**
  249    * Decomposition type.
  250    * @see utf8proc_decomp_type_t.
  251    */
  252   utf8proc_propval_t decomp_type;
  253   utf8proc_uint16_t decomp_seqindex;
  254   utf8proc_uint16_t casefold_seqindex;
  255   utf8proc_uint16_t uppercase_seqindex;
  256   utf8proc_uint16_t lowercase_seqindex;
  257   utf8proc_uint16_t titlecase_seqindex;
  258   utf8proc_uint16_t comb_index;
  259   unsigned bidi_mirrored:1;
  260   unsigned comp_exclusion:1;
  261   /**
  262    * Can this codepoint be ignored?
  263    *
  264    * Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is
  265    * passed as an option.
  266    */
  267   unsigned ignorable:1;
  268   unsigned control_boundary:1;
  269   /** The width of the codepoint (see @ref utf8proc_charwidth). */
  270   unsigned charwidth:2;
  271   unsigned pad:2;
  272   /**
  273    * Boundclass.
  274    * @see utf8proc_boundclass_t.
  275    */
  276   unsigned boundclass:8;
  277 } utf8proc_property_t;
  278 
  279 /** Unicode categories (the values stored in utf8proc_property_t::category and returned by @ref utf8proc_category). */
  280 typedef enum {
  281   UTF8PROC_CATEGORY_CN  = 0, /**< Other, not assigned */
  282   UTF8PROC_CATEGORY_LU  = 1, /**< Letter, uppercase */
  283   UTF8PROC_CATEGORY_LL  = 2, /**< Letter, lowercase */
  284   UTF8PROC_CATEGORY_LT  = 3, /**< Letter, titlecase */
  285   UTF8PROC_CATEGORY_LM  = 4, /**< Letter, modifier */
  286   UTF8PROC_CATEGORY_LO  = 5, /**< Letter, other */
  287   UTF8PROC_CATEGORY_MN  = 6, /**< Mark, nonspacing */
  288   UTF8PROC_CATEGORY_MC  = 7, /**< Mark, spacing combining */
  289   UTF8PROC_CATEGORY_ME  = 8, /**< Mark, enclosing */
  290   UTF8PROC_CATEGORY_ND  = 9, /**< Number, decimal digit */
  291   UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */
  292   UTF8PROC_CATEGORY_NO = 11, /**< Number, other */
  293   UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */
  294   UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */
  295   UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */
  296   UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */
  297   UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */
  298   UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */
  299   UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */
  300   UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */
  301   UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */
  302   UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */
  303   UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */
  304   UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */
  305   UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */
  306   UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */
  307   UTF8PROC_CATEGORY_CC = 26, /**< Other, control */
  308   UTF8PROC_CATEGORY_CF = 27, /**< Other, format */
  309   UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */
  310   UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */
  311 } utf8proc_category_t;
  312 
  313 /** Bidirectional character classes (the values stored in utf8proc_property_t::bidi_class). */
  314 typedef enum {
  315   UTF8PROC_BIDI_CLASS_L     = 1, /**< Left-to-Right */
  316   UTF8PROC_BIDI_CLASS_LRE   = 2, /**< Left-to-Right Embedding */
  317   UTF8PROC_BIDI_CLASS_LRO   = 3, /**< Left-to-Right Override */
  318   UTF8PROC_BIDI_CLASS_R     = 4, /**< Right-to-Left */
  319   UTF8PROC_BIDI_CLASS_AL    = 5, /**< Right-to-Left Arabic */
  320   UTF8PROC_BIDI_CLASS_RLE   = 6, /**< Right-to-Left Embedding */
  321   UTF8PROC_BIDI_CLASS_RLO   = 7, /**< Right-to-Left Override */
  322   UTF8PROC_BIDI_CLASS_PDF   = 8, /**< Pop Directional Format */
  323   UTF8PROC_BIDI_CLASS_EN    = 9, /**< European Number */
  324   UTF8PROC_BIDI_CLASS_ES   = 10, /**< European Separator */
  325   UTF8PROC_BIDI_CLASS_ET   = 11, /**< European Number Terminator */
  326   UTF8PROC_BIDI_CLASS_AN   = 12, /**< Arabic Number */
  327   UTF8PROC_BIDI_CLASS_CS   = 13, /**< Common Number Separator */
  328   UTF8PROC_BIDI_CLASS_NSM  = 14, /**< Nonspacing Mark */
  329   UTF8PROC_BIDI_CLASS_BN   = 15, /**< Boundary Neutral */
  330   UTF8PROC_BIDI_CLASS_B    = 16, /**< Paragraph Separator */
  331   UTF8PROC_BIDI_CLASS_S    = 17, /**< Segment Separator */
  332   UTF8PROC_BIDI_CLASS_WS   = 18, /**< Whitespace */
  333   UTF8PROC_BIDI_CLASS_ON   = 19, /**< Other Neutrals */
  334   UTF8PROC_BIDI_CLASS_LRI  = 20, /**< Left-to-Right Isolate */
  335   UTF8PROC_BIDI_CLASS_RLI  = 21, /**< Right-to-Left Isolate */
  336   UTF8PROC_BIDI_CLASS_FSI  = 22, /**< First Strong Isolate */
  337   UTF8PROC_BIDI_CLASS_PDI  = 23, /**< Pop Directional Isolate */
  338 } utf8proc_bidi_class_t;
  339 
  340 /** Decomposition types (the values stored in utf8proc_property_t::decomp_type). */
  341 typedef enum {
  342   UTF8PROC_DECOMP_TYPE_FONT      = 1, /**< Font */
  343   UTF8PROC_DECOMP_TYPE_NOBREAK   = 2, /**< Nobreak */
  344   UTF8PROC_DECOMP_TYPE_INITIAL   = 3, /**< Initial */
  345   UTF8PROC_DECOMP_TYPE_MEDIAL    = 4, /**< Medial */
  346   UTF8PROC_DECOMP_TYPE_FINAL     = 5, /**< Final */
  347   UTF8PROC_DECOMP_TYPE_ISOLATED  = 6, /**< Isolated */
  348   UTF8PROC_DECOMP_TYPE_CIRCLE    = 7, /**< Circle */
  349   UTF8PROC_DECOMP_TYPE_SUPER     = 8, /**< Super */
  350   UTF8PROC_DECOMP_TYPE_SUB       = 9, /**< Sub */
  351   UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */
  352   UTF8PROC_DECOMP_TYPE_WIDE     = 11, /**< Wide */
  353   UTF8PROC_DECOMP_TYPE_NARROW   = 12, /**< Narrow */
  354   UTF8PROC_DECOMP_TYPE_SMALL    = 13, /**< Small */
  355   UTF8PROC_DECOMP_TYPE_SQUARE   = 14, /**< Square */
  356   UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
  357   UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
  358 } utf8proc_decomp_type_t;
  359 
  360 /** Boundclass property (TR29); the values stored in utf8proc_property_t::boundclass. */
  361 typedef enum {
  362   UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
  363   UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
  364   UTF8PROC_BOUNDCLASS_CR                 =  2, /**< Cr */
  365   UTF8PROC_BOUNDCLASS_LF                 =  3, /**< Lf */
  366   UTF8PROC_BOUNDCLASS_CONTROL            =  4, /**< Control */
  367   UTF8PROC_BOUNDCLASS_EXTEND             =  5, /**< Extend */
  368   UTF8PROC_BOUNDCLASS_L                  =  6, /**< L */
  369   UTF8PROC_BOUNDCLASS_V                  =  7, /**< V */
  370   UTF8PROC_BOUNDCLASS_T                  =  8, /**< T */
  371   UTF8PROC_BOUNDCLASS_LV                 =  9, /**< Lv */
  372   UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
  373   UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
  374   UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
  375   UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
  376   UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
  377 
  378   /* the following are no longer used in Unicode 11, but we keep
  379      the constants here for backward compatibility */
  380   UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
  381   UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
  382   UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
  383   UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
  384 
  385   /* the Extended_Pictographic property is used in the Unicode 11
  386      grapheme-boundary rules, so we store it in the boundclass field */
  387   UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19, /**< Extended_Pictographic */
  388   UTF8PROC_BOUNDCLASS_E_ZWG = 20, /**< UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
  389 } utf8proc_boundclass_t;
  390 
  391 /**
  392  * Function pointer type passed to @ref utf8proc_map_custom and
  393  * @ref utf8proc_decompose_custom, which is used to specify a user-defined
  394  * mapping of codepoints to be applied in conjunction with other mappings.  It receives a codepoint plus the caller-supplied `data` pointer and returns a codepoint.
  395  */
  396 typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
  397 
  398 /**
  399  * Array containing the byte lengths of a UTF-8 encoded codepoint,
  400  * indexed by the first byte of the encoded sequence (256 entries, one per byte value).
  401  */
  402 UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
  403 
  404 /**
  405  * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
  406  * (http://semver.org format), possibly with a "-dev" suffix for
  407  * development versions.  See also UTF8PROC_VERSION_MAJOR, UTF8PROC_VERSION_MINOR and UTF8PROC_VERSION_PATCH.
  408  */
  409 UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
  410 
  411 /**
  412  * Returns the Unicode version supported by utf8proc as a string MAJOR.MINOR.PATCH.
  413  */
  414 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void);
  415 
  416 /**
  417  * Returns an informative error string for the given utf8proc error code
  418  * (e.g. one of the negative `UTF8PROC_ERROR_*` codes returned by @ref utf8proc_map).
  419  */
  420 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
  421 
  422 /**
  423  * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
  424  * The maximum number of bytes read is `strlen`, unless `strlen` is
  425  * negative (in which case up to 4 bytes are read).
  426  *
  427  * If a valid codepoint could be read, it is stored in the variable
  428  * pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
  429  * In case of success, the number of bytes read is returned; otherwise, a
  430  * negative error code is returned (@ref utf8proc_errmsg).
  431  */
  432 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
  433 
  434 /**
  435  * Check if a codepoint is valid (regardless of whether it has been
  436  * assigned a value by the current Unicode standard).
  437  *
  438  * @return 1 (true) if the given `codepoint` is valid and otherwise return 0 (false).
  439  */
  440 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
  441 
  442 /**
  443  * Encodes the codepoint as a UTF-8 string in the byte array pointed
  444  * to by `dst`. This array must be at least 4 bytes long.
  445  *
  446  * In case of success the number of bytes written is returned, and
  447  * otherwise 0 is returned.
  448  *
  449  * This function does not check whether `codepoint` is valid Unicode (see @ref utf8proc_codepoint_valid).
  450  */
  451 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
  452 
  453 /**
  454  * Look up the properties for a given codepoint.
  455  *
  456  * @param codepoint The Unicode codepoint.
  457  *
  458  * @returns
  459  * A pointer to a (constant) @ref utf8proc_property_t struct containing information about
  460  * the codepoint.
  461  * @par
  462  * If the codepoint is unassigned or invalid, a pointer to a special struct is
  463  * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
  464  */
  465 UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
  466 
  467 /** Decompose a codepoint into an array of codepoints.
  468  *
  469  * @param codepoint the codepoint.
  470  * @param dst the destination buffer.
  471  * @param bufsize the size of the destination buffer.
  472  * @param options one or more of the following flags:
  473  * - @ref UTF8PROC_REJECTNA  - return an error if `codepoint` is unassigned
  474  * - @ref UTF8PROC_IGNORE    - strip "default ignorable" codepoints
  475  * - @ref UTF8PROC_CASEFOLD  - apply Unicode casefolding
  476  * - @ref UTF8PROC_COMPAT    - replace certain codepoints with their
  477  *                             compatibility decomposition
  478  * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  479  * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
  480  * - @ref UTF8PROC_STRIPMARK - remove all character marks
  481  * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
  482  * @param last_boundclass
  483  * Pointer to an integer variable containing
  484  * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
  485  * option is used.  Otherwise, this parameter is ignored.
  486  *
  487  * @return
  488  * In case of success, the number of codepoints written is returned; in case
  489  * of an error, a negative error code is returned (@ref utf8proc_errmsg).
  490  * @par
  491  * If the number of written codepoints would be bigger than `bufsize`, the
  492  * required buffer size is returned, while the buffer will be overwritten with
  493  * undefined data.
  494  */
  495 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
  496   utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
  497   utf8proc_option_t options, int *last_boundclass
  498 );
  499 
  500 /**
  501  * The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8
  502  * string and orders the decomposed sequences correctly.
  503  *
  504  * If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing
  505  * will be stopped when a NULL byte is encountered; otherwise `strlen`
  506  * bytes are processed.  The result (in the form of 32-bit unicode
  507  * codepoints) is written into the buffer being pointed to by
  508  * `buffer` (which must contain at least `bufsize` entries).  In case of
  509  * success, the number of codepoints written is returned; in case of an
  510  * error, a negative error code is returned (@ref utf8proc_errmsg).
  511  * See @ref utf8proc_decompose_custom to supply additional transformations.
  512  *
  513  * If the number of written codepoints would be bigger than `bufsize`, the
  514  * required buffer size is returned, while the buffer will be overwritten with
  515  * undefined data.
  516  */
  517 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  518   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  519   utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
  520 );
  521 
  522 /**
  523  * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
  524  * (see @ref utf8proc_custom_func) that is called on each codepoint in `str` before any other transformations
  525  * (along with a `custom_data` pointer that is passed through to `custom_func`).
  526  * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
  527  */
  528 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  529   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  530   utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  531   utf8proc_custom_func custom_func, void *custom_data
  532 );
  533 
  534 /**
  535  * Normalizes the sequence of `length` codepoints pointed to by `buffer`
  536  * in-place (i.e., the result is also stored in `buffer`).
  537  *
  538  * @param buffer the (native-endian UTF-32) unicode codepoints to normalize.
  539  * @param length the length (in codepoints) of the buffer.
  540  * @param options a bitwise or (`|`) of one or more of the following flags:
  541  * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
  542  * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
  543  * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
  544  * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
  545  * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
  546  *                           codepoints
  547  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
  548  *                           the unicode versioning stability
  549  *
  550  * @return
  551  * In case of success, the length (in codepoints) of the normalized UTF-32 string is
  552  * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
  553  *
  554  * @warning The entries of the array pointed to by `buffer` have to be in the
  555  *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
  556  */
  557 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
  558 
  559 /**
  560  * Reencodes the sequence of `length` codepoints pointed to by `buffer` as
  561  * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
  562  * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
  563  *
  564  * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
  565  * @param length the length (in codepoints) of the buffer.
  566  * @param options a bitwise or (`|`) of one or more of the following flags:
  567  * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
  568  * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
  569  * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
  570  * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
  571  * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
  572  *                           codepoints
  573  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
  574  *                           the unicode versioning stability
  575  * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  576  *
  577  * @return
  578  * In case of success, the length (in bytes) of the resulting nul-terminated
  579  * UTF-8 string is returned; otherwise, a negative error code is returned
  580  * (@ref utf8proc_errmsg).
  581  *
  582  * @warning The amount of free space pointed to by `buffer` must
  583  *          exceed the amount of the input data by one byte, and the
  584  *          entries of the array pointed to by `buffer` have to be in the
  585  *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
  586  */
  587 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
  588 
  589 /**
  590  * Given a pair of consecutive codepoints, return whether a grapheme break is
  591  * permitted between them (as defined by the extended grapheme clusters in UAX#29).
  592  *
  593  * @param codepoint1 The first codepoint.
  594  * @param codepoint2 The second codepoint, occurring consecutively after `codepoint1`.
  595  * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
  596  *              state to break graphemes. This state can be passed in as a pointer
  597  *              in the `state` argument and should initially be set to 0. If the
  598  *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
  599  *              GB10/12/13 which require this state will not be applied, essentially
  600  *              matching the rules in Unicode 8.0.0.
  601  *
  602  * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
  603  *          be called IN ORDER on ALL potential breaks in a string.  However, it
  604  *          is safe to reset the state to zero after a grapheme break.
  605  * @return true if a grapheme break is permitted between the two codepoints, false otherwise. */
  606 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
  607     utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
  608 
  609 /**
  610  * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
  611  * Unicode 9 additions to the algorithm. Supported for legacy reasons; new code should prefer the stateful variant.
  612  */
  613 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
  614     utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
  615 
  616 
  617 /**
  618  * Given a codepoint `c`, return the codepoint of the corresponding
  619  * lower-case character, if any; otherwise (if there is no lower-case
  620  * variant, or if `c` is not a valid codepoint) return `c` unchanged.
  621  */
  622 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
  623 
  624 /**
  625  * Given a codepoint `c`, return the codepoint of the corresponding
  626  * upper-case character, if any; otherwise (if there is no upper-case
  627  * variant, or if `c` is not a valid codepoint) return `c` unchanged.
  628  */
  629 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
  630 
  631 /**
  632  * Given a codepoint `c`, return the codepoint of the corresponding
  633  * title-case character, if any; otherwise (if there is no title-case
  634  * variant, or if `c` is not a valid codepoint) return `c` unchanged.
  635  */
  636 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
  637 
  638 /**
  639  * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
  640  * except that a width of 0 is returned for non-printable codepoints
  641  * instead of -1 as in `wcwidth`.
  642  *
  643  * @note
  644  * If you want to check for particular types of non-printable characters
  645  * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
  646 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
  647 
  648 /**
  649  * Return the Unicode category for the codepoint (one of the
  650  * @ref utf8proc_category_t constants).
  651  */
  652 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
  653 
  654 /**
  655  * Return the two-letter (nul-terminated) Unicode category string for
  656  * the codepoint (e.g. `"Lu"` or `"Co"`).  See @ref utf8proc_category for the numeric form.
  657  */
  658 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
  659 
  660 /**
  661  * Maps the given UTF-8 string pointed to by `str` to a new UTF-8
  662  * string, allocated dynamically by `malloc` and returned via `dstptr`.
  663  *
  664  * If the @ref UTF8PROC_NULLTERM flag in the `options` field is set,
  665  * the length is determined by a NULL terminator, otherwise the
  666  * parameter `strlen` is evaluated to determine the string length, but
  667  * in any case the result will be NULL terminated (though it might
  668  * contain NULL characters within the string if `str` contained NULL
  669  * characters). Other flags in the `options` field are passed to the
  670  * functions defined above, and regarded as described.  See also
  671  * @ref utf8proc_map_custom to supply a custom codepoint transformation.
  672  *
  673  * In case of success the length of the new string is returned,
  674  * otherwise a negative error code is returned (@ref utf8proc_errmsg).
  675  *
  676  * @note The memory of the new UTF-8 string will have been allocated
  677  * with `malloc`, and should therefore be deallocated with `free`.
  678  */
  679 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  680   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
  681 );
  682 
  683 /**
  684  * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
  685  * (see @ref utf8proc_custom_func) that is called on each codepoint in `str` before any other transformations
  686  * (along with a `custom_data` pointer that is passed through to `custom_func`).
  687  * The `custom_func` argument is ignored if it is `NULL`.
  688  */
  689 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  690   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  691   utf8proc_custom_func custom_func, void *custom_data
  692 );
  693 
  694 /** @name Unicode normalization
  695  *
  696  * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
  697  * NFKC_Casefold normalized version of the null-terminated string `str`.  These
  698  * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
  699  * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.  As documented for @ref utf8proc_map, the result is allocated with `malloc` and should be deallocated with `free`.
  700  */
  701 /** @{ */
  702 /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
  703 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
  704 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
  705 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
  706 /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
  707 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
  708 /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
  709 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
  710 /**
  711  * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
  712  * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
  713  **/
  714 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
  715 /** @} */
  716 
  717 #ifdef __cplusplus
  718 } /* extern "C" */
  719 #endif
  720 
  721 #endif /* UTF8PROC_H */