uppercase mapping ß (U+00df) to ẞ (U+1E9E) (#134)

* uppercase(0x00df) = 0x1e9e

* tests for titlecase and U+00DF uppercase

* update NEWS, add another test
This commit is contained in:
Steven G. Johnson 2018-05-02 14:18:26 -04:00 committed by GitHub
parent 8639450134
commit d81308faba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1312 additions and 1276 deletions

View File

@ -14,6 +14,9 @@
- `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]). - `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]).
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
case-folding still yields the standard "ss" mapping.
## Version 2.1.1 ## ## Version 2.1.1 ##
2018-04-27 2018-04-27
@ -332,3 +335,4 @@ Release of version 1.0.1
[#128]: https://github.com/JuliaLang/utf8proc/issues/128 [#128]: https://github.com/JuliaLang/utf8proc/issues/128
[#132]: https://github.com/JuliaLang/utf8proc/issues/132 [#132]: https://github.com/JuliaLang/utf8proc/issues/132
[#133]: https://github.com/JuliaLang/utf8proc/issues/133 [#133]: https://github.com/JuliaLang/utf8proc/issues/133
[#134]: https://github.com/JuliaLang/utf8proc/issues/134

View File

@ -137,13 +137,13 @@ def cpary2utf16encoded(array)
end end
def cpary2c(array) def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0 return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ... lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array) array = cpary2utf16encoded(array)
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions) if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
array = [lencode] + array array = [lencode] + array
lencode = 7 lencode = 7
end end
idx = pushary(array) idx = pushary(array)
raise "Array index out of bound" if idx > 0x1FFF raise "Array index out of bound" if idx > 0x1FFF
return "#{idx | (lencode << 13)}" return "#{idx | (lencode << 13)}"
end end
@ -188,9 +188,10 @@ class UnicodeChar
@decomp_mapping = ($8=='') ? nil : @decomp_mapping = ($8=='') ? nil :
$8.split.collect { |element| element.hex } $8.split.collect { |element| element.hex }
@bidi_mirrored = ($13=='Y') ? true : false @bidi_mirrored = ($13=='Y') ? true : false
@uppercase_mapping = ($16=='') ? nil : $16.hex # issue #130: use nonstandard uppercase ß -> ẞ
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
@lowercase_mapping = ($17=='') ? nil : $17.hex @lowercase_mapping = ($17=='') ? nil : $17.hex
@titlecase_mapping = ($18=='') ? nil : $18.hex @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
end end
def case_folding def case_folding
$case_folding[code] $case_folding[code]
@ -260,17 +261,17 @@ chars.each do |char|
end end
unless comb2nd_indicies[dm1] unless comb2nd_indicies[dm1]
comb2nd_indicies_sorted_keys << dm1 comb2nd_indicies_sorted_keys << dm1
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
end end
comb_array[comb1st_indicies[dm0]] ||= [] comb_array[comb1st_indicies[dm0]] ||= []
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
end end
char.c_decomp_mapping = cpary2c(char.decomp_mapping) char.c_decomp_mapping = cpary2c(char.decomp_mapping)
char.c_case_folding = cpary2c(char.case_folding) char.c_case_folding = cpary2c(char.case_folding)
end end
comb_indicies = {} comb_indicies = {}
cumoffset = 0 cumoffset = 0
@ -281,7 +282,7 @@ comb1st_indicies.each do |dm0, index|
last = nil last = nil
offset = 0 offset = 0
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b| comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
if comb_array[index][b] if comb_array[index][b]
first = offset unless first first = offset unless first
last = offset last = offset
last += 1 if comb2nd_indicies_nonbasic[dm1] last += 1 if comb2nd_indicies_nonbasic[dm1]
@ -391,7 +392,7 @@ comb1st_indicies.keys.each_index do |a|
offset = 0 offset = 0
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", " $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b| comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
break if offset > comb1st_indicies_lastoffsets[a] break if offset > comb1st_indicies_lastoffsets[a]
if offset >= comb1st_indicies_firstoffsets[a] if offset >= comb1st_indicies_firstoffsets[a]
i += 1 i += 1
if i == 8 if i == 8
@ -403,9 +404,8 @@ comb1st_indicies.keys.each_index do |a|
$stdout << (v & 0xFFFF) << ", " $stdout << (v & 0xFFFF) << ", "
end end
offset += 1 offset += 1
offset += 1 if comb2nd_indicies_nonbasic[dm1] offset += 1 if comb2nd_indicies_nonbasic[dm1]
end end
$stdout << "\n" $stdout << "\n"
end end
$stdout << "};\n\n" $stdout << "};\n\n"

View File

@ -13,13 +13,20 @@ int main(int argc, char **argv)
for (c = 0; c <= 0x110000; ++c) { for (c = 0; c <= 0x110000; ++c) {
utf8proc_int32_t l = utf8proc_tolower(c); utf8proc_int32_t l = utf8proc_tolower(c);
utf8proc_int32_t u = utf8proc_toupper(c); utf8proc_int32_t u = utf8proc_toupper(c);
utf8proc_int32_t t = utf8proc_totitle(c);
check(l == c || utf8proc_codepoint_valid(l), "invalid tolower"); check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper"); check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t)) {
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
++error;
}
if (sizeof(wint_t) > 2 || c < (1<<16)) { if (sizeof(wint_t) > 2 || c < (1<<16)) {
wint_t l0 = towlower(c), u0 = towupper(c); wint_t l0 = towlower(c), u0 = towupper(c);
/* OS unicode tables may be out of date. But if they /* OS unicode tables may be out of date. But if they
do have a lower/uppercase mapping, hopefully it do have a lower/uppercase mapping, hopefully it
is correct? */ is correct? */
@ -44,6 +51,20 @@ int main(int argc, char **argv)
} }
} }
check(!error, "utf8proc case conversion FAILED %d tests.", error); check(!error, "utf8proc case conversion FAILED %d tests.", error);
/* issue #130 */
check(utf8proc_toupper(0x00df) == 0x1e9e &&
utf8proc_totitle(0x00df) == 0x1e9e &&
utf8proc_tolower(0x00df) == 0x00df &&
utf8proc_tolower(0x1e9e) == 0x00df &&
utf8proc_toupper(0x1e9e) == 0x1e9e,
"incorrect 0x00df/0x1e9e case conversions");
utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
!strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
"incorrect 0x00df/0x1e9e casefold normalization");
printf("More up-to-date than OS unicode tables for %d tests.\n", better); printf("More up-to-date than OS unicode tables for %d tests.\n", better);
printf("utf8proc case conversion tests SUCCEEDED.\n"); printf("utf8proc case conversion tests SUCCEEDED.\n");
return 0; return 0;

View File

@ -4,46 +4,57 @@
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int i; int i;
for (i = 1; i < argc; ++i) { for (i = 1; i < argc; ++i) {
unsigned int c; utf8proc_uint8_t cstr[16], *map;
if (!strcmp(argv[i], "-V")) { unsigned int c;
printf("utf8proc version %s\n", utf8proc_version()); if (!strcmp(argv[i], "-V")) {
continue; printf("utf8proc version %s\n", utf8proc_version());
} continue;
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]); }
const utf8proc_property_t *p = utf8proc_get_property(c); check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
printf("U+%s:\n" const utf8proc_property_t *p = utf8proc_get_property(c);
" category = %s\n"
" combining_class = %d\n" if (utf8proc_codepoint_valid(c))
" bidi_class = %d\n" cstr[utf8proc_encode_char(c, cstr)] = 0;
" decomp_type = %d\n" else
" uppercase_mapping = %x\n" strcat((char*)cstr, "N/A");
" lowercase_mapping = %x\n" utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
" titlecase_mapping = %x\n"
" comb_index = %d\n" printf("U+%s: %s\n"
" bidi_mirrored = %d\n" " category = %s\n"
" comp_exclusion = %d\n" " combining_class = %d\n"
" ignorable = %d\n" " bidi_class = %d\n"
" control_boundary = %d\n" " decomp_type = %d\n"
" boundclass = %d\n" " uppercase_mapping = %x\n"
" charwidth = %d\n", " lowercase_mapping = %x\n"
argv[i], " titlecase_mapping = %x\n"
utf8proc_category_string(c), " casefold = %s\n"
p->combining_class, " comb_index = %d\n"
p->bidi_class, " bidi_mirrored = %d\n"
p->decomp_type, " comp_exclusion = %d\n"
utf8proc_toupper(c), " ignorable = %d\n"
utf8proc_tolower(c), " control_boundary = %d\n"
utf8proc_totitle(c), " boundclass = %d\n"
p->comb_index, " charwidth = %d\n",
p->bidi_mirrored, argv[i], (char*) cstr,
p->comp_exclusion, utf8proc_category_string(c),
p->ignorable, p->combining_class,
p->control_boundary, p->bidi_class,
p->boundclass, p->decomp_type,
utf8proc_charwidth(c)); utf8proc_toupper(c),
} utf8proc_tolower(c),
return 0; utf8proc_totitle(c),
(char *) map,
p->comb_index,
p->bidi_mirrored,
p->comp_exclusion,
p->ignorable,
p->control_boundary,
p->boundclass,
utf8proc_charwidth(c));
free(map);
}
return 0;
} }

File diff suppressed because it is too large Load Diff