Unicode 9 updates (#70)
* Updates for Unicode 9.0.0 TR29 changes - New rules GB10 and GB12/13 are used, respectively, to combine emoji-zwj sequences and to force grapheme breaks every two RI codepoints. Unfortunately this breaks the statelessness of grapheme-boundary determination. Deal with this by ignoring the problem in utf8proc_grapheme_break, and by hacking in a special case in decompose - ZWJ moved to its own boundclass; update what is now GB9 accordingly - Add comments to indicate which rule a given case implements - The number of bound classes now exceeds 4 bits; expand the field to 8 bits and reorganize the fields * Import Unicode 9 data * Update grapheme break API to expose state override * Bump MAJOR version
This commit is contained in:
parent
3d0576a9b9
commit
41c6b23aab
@ -7,9 +7,9 @@ disallow_intree_builds()
|
|||||||
project (utf8proc C)
|
project (utf8proc C)
|
||||||
|
|
||||||
# Be sure to also update these in Makefile!
|
# Be sure to also update these in Makefile!
|
||||||
set(SO_MAJOR 2)
|
set(SO_MAJOR 3)
|
||||||
set(SO_MINOR 0)
|
set(SO_MINOR 0)
|
||||||
set(SO_PATCH 1)
|
set(SO_PATCH 0)
|
||||||
|
|
||||||
add_definitions (
|
add_definitions (
|
||||||
-DUTF8PROC_EXPORTS
|
-DUTF8PROC_EXPORTS
|
||||||
|
|||||||
6
MANIFEST
6
MANIFEST
@ -2,6 +2,6 @@ include/
|
|||||||
include/utf8proc.h
|
include/utf8proc.h
|
||||||
lib/
|
lib/
|
||||||
lib/libutf8proc.a
|
lib/libutf8proc.a
|
||||||
lib/libutf8proc.so -> libutf8proc.so.2.0.1
|
lib/libutf8proc.so -> libutf8proc.so.3.0.0
|
||||||
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.1
|
lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
|
||||||
lib/libutf8proc.so.2.0.1
|
lib/libutf8proc.so.3.0.0
|
||||||
|
|||||||
4
Makefile
4
Makefile
@ -19,9 +19,9 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
|
|||||||
# not API compatibility: MAJOR should be incremented whenever *binary*
|
# not API compatibility: MAJOR should be incremented whenever *binary*
|
||||||
# compatibility is broken, even if the API is backward-compatible
|
# compatibility is broken, even if the API is backward-compatible
|
||||||
# Be sure to also update these in MANIFEST and CMakeLists.txt!
|
# Be sure to also update these in MANIFEST and CMakeLists.txt!
|
||||||
MAJOR=2
|
MAJOR=3
|
||||||
MINOR=0
|
MINOR=0
|
||||||
PATCH=1
|
PATCH=0
|
||||||
|
|
||||||
OS := $(shell uname)
|
OS := $(shell uname)
|
||||||
ifeq ($(OS),Darwin) # MacOS X
|
ifeq ($(OS),Darwin) # MacOS X
|
||||||
|
|||||||
@ -182,8 +182,8 @@ class UnicodeChar
|
|||||||
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
||||||
"#{$ignorable.include?(code)}, " <<
|
"#{$ignorable.include?(code)}, " <<
|
||||||
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
|
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
|
||||||
"#{$grapheme_boundclass[code]}, " <<
|
"#{$charwidth[code]}, 0, " <<
|
||||||
"#{$charwidth[code]}},\n"
|
"#{$grapheme_boundclass[code]}},\n"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -306,7 +306,7 @@ end
|
|||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
|
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||||
properties.each { |line|
|
properties.each { |line|
|
||||||
$stdout << line
|
$stdout << line
|
||||||
}
|
}
|
||||||
|
|||||||
110
utf8proc.c
110
utf8proc.c
@ -233,36 +233,87 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
|
|||||||
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return whether there is a grapheme break between boundclasses lbc and tbc */
|
/* return whether there is a grapheme break between boundclasses lbc and tbc
|
||||||
static utf8proc_bool grapheme_break(int lbc, int tbc) {
|
(according to the definition of extended grapheme clusters)
|
||||||
return
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
http://www.unicode.org/reports/tr29/tr29-29.html
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
|
||||||
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
CAVEATS:
|
||||||
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
and GB 12/13 (regional indicator code points) require knowledge of previous characters
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
and are thus not handled by this function. This may result in an incorrect break before
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
an E_Modifier class codepoint and an incorrectly missing break between two
|
||||||
tbc == UTF8PROC_BOUNDCLASS_V ||
|
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
See the special support in grapheme_break_extended, for required bookkeeping by the caller.
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
*/
|
||||||
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
return
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
|
||||||
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
|
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
|
||||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
|
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
|
||||||
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
|
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||||
|
true; // GB999
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
/* Stateful grapheme-break test: grapheme_break_simple plus the bookkeeping
   needed for rules GB10 and GB12/13, which depend on more context than the
   two adjacent boundclasses.

   lbc, tbc: boundclasses of the codepoints to the left and right of the
             candidate break.
   state:    optional. When non-NULL, *state carries the effective boundclass
             of the left-hand codepoint from one call to the next and is
             updated before returning. UTF8PROC_BOUNDCLASS_START means that
             nothing has been recorded yet; in that case *state is seeded
             from lbc. When NULL, only the stateless rules are evaluated.

   Returns true if a break is permitted between the two codepoints. */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
  int effective_lbc = lbc;
  utf8proc_bool break_permitted;

  if (state) {
    if (*state == UTF8PROC_BOUNDCLASS_START) {
      /* First pair seen with this state: remember the left-hand class so
         that the GB10 and GB12/13 checks below can take it into account. */
      *state = lbc;
    } else {
      /* A previous call may have replaced the real class of the left-hand
         codepoint (see below); the recorded value takes precedence. */
      effective_lbc = *state;
    }
  }

  /* The decision has to be made with the effective left-hand class,
     otherwise the overrides recorded in *state would never take effect. */
  break_permitted = grapheme_break_simple(effective_lbc, tbc);

  if (state) {
    // Special support for GB 12/13 made possible by GB999. After two RI
    // class codepoints we want to force a break. Do this by resetting the
    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
    // after that character according to GB999 (unless of course such a break is
    // forbidden by a different rule such as GB9).
    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
      *state = UTF8PROC_BOUNDCLASS_OTHER;
    }
    // Special support for GB10. Fold any EXTEND codepoints into the previous
    // boundclass if we're dealing with an emoji base boundclass.
    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
      *state = UTF8PROC_BOUNDCLASS_E_BASE;
    }
    else {
      *state = tbc;
    }
  }

  return break_permitted;
}
|
||||||
|
|
||||||
|
/* Public entry point: looks up the boundclass of each codepoint and passes
   both, together with the caller's optional state pointer, to
   grapheme_break_extended, returning its result. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const utf8proc_property_t *left = utf8proc_get_property(c1);
  const utf8proc_property_t *right = utf8proc_get_property(c2);

  return grapheme_break_extended(left->boundclass, right->boundclass, state);
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
||||||
@ -388,8 +439,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
if (options & UTF8PROC_CHARBOUND) {
|
if (options & UTF8PROC_CHARBOUND) {
|
||||||
utf8proc_bool boundary;
|
utf8proc_bool boundary;
|
||||||
int tbc = property->boundclass;
|
int tbc = property->boundclass;
|
||||||
boundary = grapheme_break(*last_boundclass, tbc);
|
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
|
||||||
*last_boundclass = tbc;
|
|
||||||
if (boundary) {
|
if (boundary) {
|
||||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||||
if (bufsize >= 2) dst[1] = uc;
|
if (bufsize >= 2) dst[1] = uc;
|
||||||
|
|||||||
32
utf8proc.h
32
utf8proc.h
@ -68,9 +68,9 @@
|
|||||||
*/
|
*/
|
||||||
/** @{ */
|
/** @{ */
|
||||||
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
||||||
#define UTF8PROC_VERSION_MAJOR 1
|
#define UTF8PROC_VERSION_MAJOR 2
|
||||||
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
||||||
#define UTF8PROC_VERSION_MINOR 3
|
#define UTF8PROC_VERSION_MINOR 0
|
||||||
/** The PATCH version (increased for fixes that do not change the API). */
|
/** The PATCH version (increased for fixes that do not change the API). */
|
||||||
#define UTF8PROC_VERSION_PATCH 0
|
#define UTF8PROC_VERSION_PATCH 0
|
||||||
/** @} */
|
/** @} */
|
||||||
@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
|
|||||||
*/
|
*/
|
||||||
unsigned ignorable:1;
|
unsigned ignorable:1;
|
||||||
unsigned control_boundary:1;
|
unsigned control_boundary:1;
|
||||||
|
/** The width of the codepoint. */
|
||||||
|
unsigned charwidth:2;
|
||||||
|
unsigned pad:2;
|
||||||
/**
|
/**
|
||||||
* Boundclass.
|
* Boundclass.
|
||||||
* @see utf8proc_boundclass_t.
|
* @see utf8proc_boundclass_t.
|
||||||
*/
|
*/
|
||||||
unsigned boundclass:4;
|
unsigned boundclass:8;
|
||||||
/** The width of the codepoint. */
|
|
||||||
unsigned charwidth:2;
|
|
||||||
} utf8proc_property_t;
|
} utf8proc_property_t;
|
||||||
|
|
||||||
/** Unicode categories. */
|
/** Unicode categories. */
|
||||||
@ -349,7 +350,7 @@ typedef enum {
|
|||||||
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
||||||
} utf8proc_decomp_type_t;
|
} utf8proc_decomp_type_t;
|
||||||
|
|
||||||
/** Boundclass property. */
|
/** Boundclass property. (TR29) */
|
||||||
typedef enum {
|
typedef enum {
|
||||||
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
||||||
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
||||||
@ -364,6 +365,12 @@ typedef enum {
|
|||||||
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
||||||
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
||||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||||
|
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||||
|
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||||
|
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||||
|
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
|
||||||
} utf8proc_boundclass_t;
|
} utf8proc_boundclass_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
/**
|
/**
|
||||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
|
*
|
||||||
|
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
||||||
|
* state to break graphemes. This state can be passed in as a pointer
|
||||||
|
* in the `state` argument and should initially be set to 0. If the
|
||||||
|
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
|
||||||
|
* GB10/12/13 which require this state will not be applied, essentially
|
||||||
|
* matching the rules in Unicode 8.0.0.
|
||||||
|
*
|
||||||
|
* @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
|
||||||
|
* IN ORDER on ALL potential breaks in a string.
|
||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
|
||||||
|
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
22468
utf8proc_data.c
22468
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user