Unicode 9 updates (#70)

* Updates for Unicode 9.0.0 TR29 Changes

- New rules GB10/(12/13) are used to combine emoji-zwj sequences/
  (force grapheme breaks every two RI codepoints). Unfortunately this
  breaks statelessness of grapheme-boundary determination. Deal with
  this by ignoring the problem in utf8proc_grapheme_break, and by
  hacking in a special case in decompose

- ZWJ moved to its own boundclass, update what is now GB9 accordingly.

- Add comments to indicate which rule a given case implements

- The Number of bound classes Now exceeds 4 bits, expand to 8 and
  reorganize fields

* Import Unicode 9 data

* Update Grapheme break API to expose state override

* Bump MAJOR version
This commit is contained in:
Keno Fischer 2016-06-28 16:04:25 -04:00 committed by Steven G. Johnson
parent 3d0576a9b9
commit 41c6b23aab
7 changed files with 11517 additions and 11113 deletions

View File

@ -7,9 +7,9 @@ disallow_intree_builds()
project (utf8proc C)
# Be sure to also update these in Makefile!
set(SO_MAJOR 2)
set(SO_MAJOR 3)
set(SO_MINOR 0)
set(SO_PATCH 1)
set(SO_PATCH 0)
add_definitions (
-DUTF8PROC_EXPORTS

View File

@ -2,6 +2,6 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.0.1
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.1
lib/libutf8proc.so.2.0.1
lib/libutf8proc.so -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3.0.0

View File

@ -19,9 +19,9 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# not API compatibility: MAJOR should be incremented whenever *binary*
# compatibility is broken, even if the API is backward-compatible
# Be sure to also update these in MANIFEST and CMakeLists.txt!
MAJOR=2
MAJOR=3
MINOR=0
PATCH=1
PATCH=0
OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X

View File

@ -182,8 +182,8 @@ class UnicodeChar
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
"#{$charwidth[code]}, 0, " <<
"#{$grapheme_boundclass[code]}},\n"
end
end
@ -306,7 +306,7 @@ end
$stdout << "};\n\n"
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}

View File

@ -233,36 +233,87 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
}
/* return whether there is a grapheme break between boundclasses lbc and tbc */
static utf8proc_bool grapheme_break(int lbc, int tbc) {
/* return whether there is a grapheme break between boundclasses lbc and tbc
(according to the definition of extended grapheme clusters)
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
http://www.unicode.org/reports/tr29/tr29-29.html
CAVEATS:
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
and GB 12/13 (regional indicator code points) require knowledge of previous characters
and are thus not handled by this function. This may result in an incorrect break before
an E_Modifier class codepoint and an incorrectly missing break between two
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
See the special support in grapheme_break_extended, for required bookkeeping by the caller.
*/
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
return
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
(lbc == UTF8PROC_BOUNDCLASS_CR &&
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
(lbc == UTF8PROC_BOUNDCLASS_L &&
(tbc == UTF8PROC_BOUNDCLASS_L ||
tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_LV ||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LV ||
lbc == UTF8PROC_BOUNDCLASS_V) &&
(tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
lbc == UTF8PROC_BOUNDCLASS_T) &&
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
}
/* return whether there is a grapheme break between codepoints c1 and c2 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
return grapheme_break(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass);
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
int lbc_override = lbc;
if (state && *state != UTF8PROC_BOUNDCLASS_START)
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc, tbc);
if (state) {
// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB10. Fold any EXTEND codepoints into the previous
// boundclass if we're dealing with an emoji base boundclass.
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
*state = UTF8PROC_BOUNDCLASS_E_BASE;
else
*state = tbc;
}
return break_permitted;
}
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
state);
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
@ -388,8 +439,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc);
*last_boundclass = tbc;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 2) dst[1] = uc;

View File

@ -68,9 +68,9 @@
*/
/** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3
#define UTF8PROC_VERSION_MINOR 0
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
*/
unsigned ignorable:1;
unsigned control_boundary:1;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned pad:2;
/**
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:4;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned boundclass:8;
} utf8proc_property_t;
/** Unicode categories. */
@ -349,7 +350,7 @@ typedef enum {
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
} utf8proc_decomp_type_t;
/** Boundclass property. */
/** Boundclass property. (TR29) */
typedef enum {
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
@ -364,6 +365,12 @@ typedef enum {
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
} utf8proc_boundclass_t;
/**
@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
/**
* Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
* state to break graphemes. This state can be passed in as a pointer
* in the `state` argument and should initially be set to 0. If the
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
* GB10/12/13 which require this state will not be applied, essentially
* matching the rules in Unicode 8.0.0.
*
* @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
* IN ORDER on ALL potential breaks in a string.
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
/**

File diff suppressed because it is too large Load Diff