Unicode 9 updates (#70)

* Updates for Unicode 9.0.0 TR29 Changes

- New rules GB10 and GB12/13 are used, respectively, to combine emoji-zwj
  sequences and to force grapheme breaks every two RI codepoints.
  Unfortunately this breaks statelessness of grapheme-boundary
  determination. Deal with this by ignoring the problem in
  utf8proc_grapheme_break, and by hacking in a special case in decompose.

- ZWJ moved to its own boundclass, update what is now GB9 accordingly.

- Add comments to indicate which rule a given case implements

- The number of bound classes now exceeds what fits in 4 bits; expand the
  field to 8 bits and reorganize fields

* Import Unicode 9 data

* Update Grapheme break API to expose state override

* Bump MAJOR version
This commit is contained in:
Keno Fischer 2016-06-28 16:04:25 -04:00 committed by Steven G. Johnson
parent 3d0576a9b9
commit 41c6b23aab
7 changed files with 11517 additions and 11113 deletions

View File

@ -7,9 +7,9 @@ disallow_intree_builds()
project (utf8proc C) project (utf8proc C)
# Be sure to also update these in Makefile! # Be sure to also update these in Makefile!
set(SO_MAJOR 2) set(SO_MAJOR 3)
set(SO_MINOR 0) set(SO_MINOR 0)
set(SO_PATCH 1) set(SO_PATCH 0)
add_definitions ( add_definitions (
-DUTF8PROC_EXPORTS -DUTF8PROC_EXPORTS

View File

@ -2,6 +2,6 @@ include/
include/utf8proc.h include/utf8proc.h
lib/ lib/
lib/libutf8proc.a lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.0.1 lib/libutf8proc.so -> libutf8proc.so.3.0.0
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.1 lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
lib/libutf8proc.so.2.0.1 lib/libutf8proc.so.3.0.0

View File

@ -19,9 +19,9 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# not API compatibility: MAJOR should be incremented whenever *binary* # not API compatibility: MAJOR should be incremented whenever *binary*
# compatibility is broken, even if the API is backward-compatible # compatibility is broken, even if the API is backward-compatible
# Be sure to also update these in MANIFEST and CMakeLists.txt! # Be sure to also update these in MANIFEST and CMakeLists.txt!
MAJOR=2 MAJOR=3
MINOR=0 MINOR=0
PATCH=1 PATCH=0
OS := $(shell uname) OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X ifeq ($(OS),Darwin) # MacOS X

View File

@ -182,8 +182,8 @@ class UnicodeChar
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " << "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " << "#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}, " << "#{$charwidth[code]}, 0, " <<
"#{$charwidth[code]}},\n" "#{$grapheme_boundclass[code]}},\n"
end end
end end
@ -306,7 +306,7 @@ end
$stdout << "};\n\n" $stdout << "};\n\n"
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n" $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n" $stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line| properties.each { |line|
$stdout << line $stdout << line
} }

View File

@ -233,36 +233,87 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
} }
/* return whether there is a grapheme break between boundclasses lbc and tbc */ /* return whether there is a grapheme break between boundclasses lbc and tbc
static utf8proc_bool grapheme_break(int lbc, int tbc) { (according to the definition of extended grapheme clusters)
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
http://www.unicode.org/reports/tr29/tr29-29.html
CAVEATS:
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
and GB12/13 (regional indicator code points) requires knowledge of previous characters
and is thus not handled by this function. This may result in an incorrect break before
an E_Modifier class code point and an incorrectly missing break between two
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
See the special support in grapheme_break_extended for the bookkeeping required of the caller.
*/
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
return return
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
(lbc == UTF8PROC_BOUNDCLASS_CR && (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : (lbc == UTF8PROC_BOUNDCLASS_L && // GB6
(lbc == UTF8PROC_BOUNDCLASS_L && (tbc == UTF8PROC_BOUNDCLASS_L || // ---
(tbc == UTF8PROC_BOUNDCLASS_L || tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_V || tbc == UTF8PROC_BOUNDCLASS_LV || // ---
tbc == UTF8PROC_BOUNDCLASS_LV || tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
((lbc == UTF8PROC_BOUNDCLASS_LV || lbc == UTF8PROC_BOUNDCLASS_V) && // ---
lbc == UTF8PROC_BOUNDCLASS_V) && (tbc == UTF8PROC_BOUNDCLASS_V || // ---
(tbc == UTF8PROC_BOUNDCLASS_V || tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
((lbc == UTF8PROC_BOUNDCLASS_LVT || lbc == UTF8PROC_BOUNDCLASS_T) && // ---
lbc == UTF8PROC_BOUNDCLASS_T) && tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
tbc == UTF8PROC_BOUNDCLASS_T) ? false : (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK); lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
} }
/* return whether there is a grapheme break between codepoints c1 and c2 */ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) { {
return grapheme_break(utf8proc_get_property(c1)->boundclass, int lbc_override = lbc;
utf8proc_get_property(c2)->boundclass); if (state && *state != UTF8PROC_BOUNDCLASS_START)
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc, tbc);
if (state) {
// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB10. Fold any EXTEND codepoints into the previous
// boundclass if we're dealing with an emoji base boundclass.
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
*state = UTF8PROC_BOUNDCLASS_E_BASE;
else
*state = tbc;
}
return break_permitted;
}
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
state);
} }
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
@ -388,8 +439,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_CHARBOUND) { if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary; utf8proc_bool boundary;
int tbc = property->boundclass; int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc); boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
*last_boundclass = tbc;
if (boundary) { if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF; if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 2) dst[1] = uc; if (bufsize >= 2) dst[1] = uc;

View File

@ -68,9 +68,9 @@
*/ */
/** @{ */ /** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */ /** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1 #define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3 #define UTF8PROC_VERSION_MINOR 0
/** The PATCH version (increased for fixes that do not change the API). */ /** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0 #define UTF8PROC_VERSION_PATCH 0
/** @} */ /** @} */
@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
*/ */
unsigned ignorable:1; unsigned ignorable:1;
unsigned control_boundary:1; unsigned control_boundary:1;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned pad:2;
/** /**
* Boundclass. * Boundclass.
* @see utf8proc_boundclass_t. * @see utf8proc_boundclass_t.
*/ */
unsigned boundclass:4; unsigned boundclass:8;
/** The width of the codepoint. */
unsigned charwidth:2;
} utf8proc_property_t; } utf8proc_property_t;
/** Unicode categories. */ /** Unicode categories. */
@ -349,7 +350,7 @@ typedef enum {
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */ UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
} utf8proc_decomp_type_t; } utf8proc_decomp_type_t;
/** Boundclass property. */ /** Boundclass property. (TR29) */
typedef enum { typedef enum {
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */ UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */ UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
@ -364,6 +365,12 @@ typedef enum {
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */ UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */ UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */ UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
} utf8proc_boundclass_t; } utf8proc_boundclass_t;
/** /**
@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
/** /**
* Given a pair of consecutive codepoints, return whether a grapheme break is * Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29). * permitted between them (as defined by the extended grapheme clusters in UAX#29).
*
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
* state to break graphemes. This state can be passed in as a pointer
* in the `state` argument and should initially be set to 0. If the
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
* GB10/12/13 which require this state will not be applied, essentially
* matching the rules in Unicode 8.0.0.
*
* @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
* IN ORDER on ALL potential breaks in a string.
*/ */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
/** /**

File diff suppressed because it is too large Load Diff