Merge branch 'master' of https://github.com/JuliaLang/utf8proc
This commit is contained in:
commit
e0295be467
@ -40,7 +40,7 @@ The C library is found in this directory after successful compilation
|
|||||||
and is named `libutf8proc.a` (for the static library) and
|
and is named `libutf8proc.a` (for the static library) and
|
||||||
`libutf8proc.so` (for the dynamic library).
|
`libutf8proc.so` (for the dynamic library).
|
||||||
|
|
||||||
The Unicode version supported is 10.0.0.
|
The Unicode version supported is 11.0.0.
|
||||||
|
|
||||||
For Unicode normalizations, the following options are used:
|
For Unicode normalizations, the following options are used:
|
||||||
|
|
||||||
|
|||||||
@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location
|
|||||||
|
|
||||||
.DELETE_ON_ERROR:
|
.DELETE_ON_ERROR:
|
||||||
|
|
||||||
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
|
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
|
||||||
$(RUBY) data_generator.rb < UnicodeData.txt > $@
|
$(RUBY) data_generator.rb < UnicodeData.txt > $@
|
||||||
|
|
||||||
# GNU Unifont version for font metric calculations:
|
# GNU Unifont version for font metric calculations:
|
||||||
UNIFONT_VERSION=10.0.07
|
UNIFONT_VERSION=11.0.01
|
||||||
|
|
||||||
unifont.ttf:
|
unifont.ttf:
|
||||||
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
|
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
|
||||||
@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
|
|||||||
$(JULIA) charwidths.jl > $@
|
$(JULIA) charwidths.jl > $@
|
||||||
|
|
||||||
# Unicode data version
|
# Unicode data version
|
||||||
UNICODE_VERSION=10.0.0
|
UNICODE_VERSION=11.0.0
|
||||||
|
|
||||||
UnicodeData.txt:
|
UnicodeData.txt:
|
||||||
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
|
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
|
||||||
@ -61,6 +61,9 @@ NormalizationTest.txt:
|
|||||||
GraphemeBreakTest.txt:
|
GraphemeBreakTest.txt:
|
||||||
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
||||||
|
|
||||||
|
emoji-data.txt:
|
||||||
|
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
|
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
|
||||||
rm -f utf8proc_data.c.new
|
rm -f utf8proc_data.c.new
|
||||||
|
|||||||
@ -87,6 +87,19 @@ $grapheme_boundclass_list.each_line do |entry|
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
$emoji_data_list = File.read("emoji-data.txt")
|
||||||
|
$emoji_data_list.each_line do |entry|
|
||||||
|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
|
||||||
|
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
|
||||||
|
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
|
||||||
|
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
|
||||||
|
elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
|
||||||
|
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
$charwidth_list = File.read("CharWidths.txt")
|
$charwidth_list = File.read("CharWidths.txt")
|
||||||
$charwidth = Hash.new(0)
|
$charwidth = Hash.new(0)
|
||||||
$charwidth_list.each_line do |entry|
|
$charwidth_list.each_line do |entry|
|
||||||
|
|||||||
@ -19,7 +19,9 @@ int main(int argc, char **argv)
|
|||||||
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
|
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
|
||||||
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
|
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
|
||||||
|
|
||||||
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t)) {
|
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
|
||||||
|
/* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */
|
||||||
|
!(((c >= 0x10d0 && c <= 0x10fa) || (c >= 0x10fd && c <= 0x10ff)) && l != u)) {
|
||||||
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
|
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
|
||||||
++error;
|
++error;
|
||||||
}
|
}
|
||||||
|
|||||||
23
utf8proc.c
23
utf8proc.c
@ -271,12 +271,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
|||||||
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||||
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||||
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
(lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
|
||||||
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
|
||||||
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||||
true; // GB999
|
true; // GB999
|
||||||
@ -295,12 +291,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
|
|||||||
// forbidden by a different rule such as GB9).
|
// forbidden by a different rule such as GB9).
|
||||||
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
|
||||||
*state = UTF8PROC_BOUNDCLASS_OTHER;
|
*state = UTF8PROC_BOUNDCLASS_OTHER;
|
||||||
// Special support for GB10. Fold any EXTEND codepoints into the previous
|
// Special support for GB11 (emoji extend* zwj / emoji)
|
||||||
// boundclass if we're dealing with an emoji base boundclass.
|
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
|
||||||
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
|
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
|
||||||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
|
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
|
||||||
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
|
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
|
||||||
*state = UTF8PROC_BOUNDCLASS_E_BASE;
|
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
|
||||||
|
else
|
||||||
|
*state = tbc;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
*state = tbc;
|
*state = tbc;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -382,10 +382,18 @@ typedef enum {
|
|||||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||||
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||||
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||||
|
|
||||||
|
/* the following are no longer used in Unicode 11, but we keep
|
||||||
|
the constants here for backward compatibility */
|
||||||
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||||
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||||
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||||
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
||||||
|
|
||||||
|
/* the Extended_Pictographic property is used in the Unicode 11
|
||||||
|
grapheme-boundary rules, so we store it in the boundclass field */
|
||||||
|
UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
|
||||||
|
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
|
||||||
} utf8proc_boundclass_t;
|
} utf8proc_boundclass_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
20423
utf8proc_data.c
20423
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user