Unicode 9 updates (#70)
* Updates for Unicode 9.0.0 TR29 Changes - New rules GB10/(12/13) are used to combine emoji-zwj sequences/ (force grapheme breaks every two RI codepoints). Unfortunately this breaks statelessness of grapheme-boundary determination. Deal with this by ignoring the problem in utf8proc_grapheme_break, and by hacking in a special case in decompose - ZWJ moved to its own boundclass, update what is now GB9 accordingly. - Add comments to indicate which rule a given case implements - The Number of bound classes Now exceeds 4 bits, expand to 8 and reorganize fields * Import Unicode 9 data * Update Grapheme break API to expose state override * Bump MAJOR version
This commit is contained in:
committed by
Steven G. Johnson
parent
3d0576a9b9
commit
41c6b23aab
@@ -182,8 +182,8 @@ class UnicodeChar
|
||||
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
||||
"#{$ignorable.include?(code)}, " <<
|
||||
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
|
||||
"#{$grapheme_boundclass[code]}, " <<
|
||||
"#{$charwidth[code]}},\n"
|
||||
"#{$charwidth[code]}, 0, " <<
|
||||
"#{$grapheme_boundclass[code]}},\n"
|
||||
end
|
||||
end
|
||||
|
||||
@@ -306,7 +306,7 @@ end
|
||||
$stdout << "};\n\n"
|
||||
|
||||
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
|
||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||
properties.each { |line|
|
||||
$stdout << line
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user