Unicode 9 updates (#70)

* Updates for Unicode 9.0.0 TR29 changes
  - New rules GB10/(12/13) are used to combine emoji-zwj sequences/(force grapheme breaks every two RI codepoints). Unfortunately this breaks statelessness of grapheme-boundary determination. Deal with this by ignoring the problem in utf8proc_grapheme_break, and by hacking in a special case in decompose.
  - ZWJ moved to its own boundclass; update what is now GB9 accordingly.
  - Add comments to indicate which rule a given case implements.
  - The number of bound classes now exceeds 4 bits; expand to 8 and reorganize fields.
* Import Unicode 9 data
* Update grapheme break API to expose state override
* Bump MAJOR version
2016-06-28 16:04:25 -04:00
parent 3d0576a9b9
commit 41c6b23aab
7 changed files with 11517 additions and 11113 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,9 +7,9 @@ disallow_intree_builds()
 project (utf8proc C)
 # Be sure to also update these in Makefile!
-set(SO_MAJOR 2)
+set(SO_MAJOR 3)
 set(SO_MINOR 0)
-set(SO_PATCH 1)
+set(SO_PATCH 0)
 add_definitions (
  -DUTF8PROC_EXPORTS
--- a/MANIFEST
+++ b/MANIFEST
@@ -2,6 +2,6 @@ include/
 include/utf8proc.h
 lib/
 lib/libutf8proc.a
-lib/libutf8proc.so -> libutf8proc.so.2.0.1
+lib/libutf8proc.so -> libutf8proc.so.3.0.0
-lib/libutf8proc.so.2 -> libutf8proc.so.2.0.1
+lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
-lib/libutf8proc.so.2.0.1
+lib/libutf8proc.so.3.0.0
--- a/Makefile
+++ b/Makefile
@@ -19,9 +19,9 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
 # not API compatibility: MAJOR should be incremented whenever *binary*
 # compatibility is broken, even if the API is backward-compatible
 # Be sure to also update these in MANIFEST and CMakeLists.txt!
-MAJOR=2
+MAJOR=3
 MINOR=0
-PATCH=1
+PATCH=0
 OS := $(shell uname)
 ifeq ($(OS),Darwin) # MacOS X
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -182,8 +182,8 @@ class UnicodeChar
    "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
    "#{$ignorable.include?(code)}, " <<
    "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
-    "#{$grapheme_boundclass[code]}, " <<
+    "#{$charwidth[code]}, 0, " <<
-    "#{$charwidth[code]}},\n"
+    "#{$grapheme_boundclass[code]}},\n"
  end
 end
@@ -306,7 +306,7 @@ end
 $stdout << "};\n\n"
 $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
+$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
 properties.each { |line|
  $stdout << line
 }
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -233,36 +233,87 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
 }
-/* return whether there is a grapheme break between boundclasses lbc and tbc */
+/* return whether there is a grapheme break between boundclasses lbc and tbc
-static utf8proc_bool grapheme_break(int lbc, int tbc) {
+   (according to the definition of extended grapheme clusters)
  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
  http://www.unicode.org/reports/tr29/tr29-29.html
  CAVEATS:
   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
   and GB 12/13 (regional indicator code points) require knowledge of previous characters
   and are thus not handled by this function. This may result in an incorrect break before
   an E_Modifier class codepoint and an incorrectly missing break between two
   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
 */
 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
  // Rules are tested top to bottom and the first one that matches decides the
  // result: true means a break is permitted, false means it is not.
  return
-    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       // GB1
-    (lbc == UTF8PROC_BOUNDCLASS_CR &&
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 // GB3
-     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         // ---
    // NOTE(review): the range tests for GB4/GB5 assume CR..CONTROL are adjacent
    // values in utf8proc_boundclass_t -- confirm against the enum.
-    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
-    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
-    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
+    (lbc == UTF8PROC_BOUNDCLASS_L &&                  // GB6
-    (lbc == UTF8PROC_BOUNDCLASS_L &&
+     (tbc == UTF8PROC_BOUNDCLASS_L ||                 // ---
-     (tbc == UTF8PROC_BOUNDCLASS_L ||
+      tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
-      tbc == UTF8PROC_BOUNDCLASS_V ||
+      tbc == UTF8PROC_BOUNDCLASS_LV ||                // ---
-      tbc == UTF8PROC_BOUNDCLASS_LV ||
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      // ---
-      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
+    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                // GB7
-    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
+      lbc == UTF8PROC_BOUNDCLASS_V) &&                // ---
-      lbc == UTF8PROC_BOUNDCLASS_V) &&
+     (tbc == UTF8PROC_BOUNDCLASS_V ||                 // ---
-     (tbc == UTF8PROC_BOUNDCLASS_V ||
+      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        // ---
-      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
+    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               // GB8
-    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
+      lbc == UTF8PROC_BOUNDCLASS_T) &&                // ---
-      lbc == UTF8PROC_BOUNDCLASS_T) &&
+     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          // ---
-     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
+    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             // GB9
-    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
+     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                // ---
-     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
+     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        // GB9a
-    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    // GB9b
    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            // GB10 (requires additional handling below)
      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       // ----
     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         // GB11
     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             // ----
      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        // ----
    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          // GB12/13 (requires additional handling below)
     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  // ----
    true; // GB999 (no earlier rule matched: break is permitted)
 }
-/* return whether there is a grapheme break between codepoints c1 and c2 */
+/* Stateful wrapper around grapheme_break_simple.
+   lbc and tbc are the boundclasses of the left and right codepoints. When
+   state is non-NULL and *state is not UTF8PROC_BOUNDCLASS_START, *state is
+   used as the left boundclass in place of lbc, and *state is then updated for
+   the next call. When state is NULL no state is read or written.
+   Returns whether a grapheme break is permitted between the two codepoints. */
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
+{
-  return grapheme_break(utf8proc_get_property(c1)->boundclass,
+  int lbc_override = lbc;
-                        utf8proc_get_property(c2)->boundclass);
+  if (state && *state != UTF8PROC_BOUNDCLASS_START)
    lbc_override = *state;
  // Test against lbc_override, not lbc: the state may hold a rewritten left
  // boundclass (see the two special cases below), and that rewritten value is
  // what the rules have to see for GB10 and GB12/13 to take effect.
  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
  if (state) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
    // after that character according to GB999 (unless of course such a break is
    // forbidden by a different rule such as GB9).
    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
      *state = UTF8PROC_BOUNDCLASS_OTHER;
    // Special support for GB10. Fold any EXTEND codepoints into the previous
    // boundclass if we're dealing with an emoji base boundclass.
    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
      *state = UTF8PROC_BOUNDCLASS_E_BASE;
    else
      *state = tbc;
  }
  return break_permitted;
 }
 /* Public entry point: look up the boundclass of each codepoint and defer to
    grapheme_break_extended, passing the caller's state pointer through. */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const int left_class = utf8proc_get_property(c1)->boundclass;
  const int right_class = utf8proc_get_property(c2)->boundclass;
  return grapheme_break_extended(left_class, right_class, state);
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
@@ -388,8 +439,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
  if (options & UTF8PROC_CHARBOUND) {
    utf8proc_bool boundary;
    int tbc = property->boundclass;
-    boundary = grapheme_break(*last_boundclass, tbc);
+    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
-    *last_boundclass = tbc;
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
      if (bufsize >= 2) dst[1] = uc;
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -68,9 +68,9 @@
 */
 /** @{ */
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 1
+#define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 3
+#define UTF8PROC_VERSION_MINOR 0
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */
@@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
   */
  unsigned ignorable:1;
  unsigned control_boundary:1;
+  /** The width of the codepoint. */
+  unsigned charwidth:2;
+  unsigned pad:2;
  /**
   * Boundclass.
   * @see utf8proc_boundclass_t.
   */
-  unsigned boundclass:4;
+  unsigned boundclass:8;
-  /** The width of the codepoint. */
-  unsigned charwidth:2;
 } utf8proc_property_t;
 /** Unicode categories. */
@@ -349,7 +350,7 @@ typedef enum {
  UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
 } utf8proc_decomp_type_t;
-/** Boundclass property. */
+/** Boundclass property. (TR29) */
 typedef enum {
  UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
  UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
@@ -364,6 +365,12 @@ typedef enum {
  UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
  UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
 } utf8proc_boundclass_t;
 /**
@@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 /**
 * Given a pair of consecutive codepoints, return whether a grapheme break is
 * permitted between them (as defined by the extended grapheme clusters in UAX#29).
 *
 * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
 *              state to break graphemes. This state can be passed in as a pointer
 *              in the `state` argument and should initially be set to 0. If the
 *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
 *              GB10/12/13 which require this state will not be applied, essentially
 *              matching the rules in Unicode 8.0.0.
 *
 * @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
 *          IN ORDER on ALL potential breaks in a string.
 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
 /**
--- a/utf8proc_data.c
+++ b/utf8proc_data.c