charwidth=1 for soft hyphen and unassigned codepoints (#135)

* use width=1 for soft hyphen and for unassigned/PUA codepoints

* don't count unassigned codepoints when comparing with system wcwidth

* more tests

* indentation fixes

* NEWS for 135

* remove special-casing for arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not

* regenerate
This commit is contained in:
Steven G. Johnson 2018-07-24 10:45:02 -04:00 committed by GitHub
parent 0975bf9b6d
commit 02f4e1890c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1897 additions and 1882 deletions

View File

@ -17,6 +17,9 @@
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
case-folding still yields the standard "ss" mapping.
- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
for unassigned/PUA codepoints ([#135]).
## Version 2.1.1 ##
2018-04-27
@ -336,3 +339,4 @@ Release of version 1.0.1
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
[#135]: https://github.com/JuliaLang/utf8proc/issues/135

View File

@ -20,12 +20,12 @@ import Base.UTF8proc
#############################################################################
# Use a default width of 1 for all character categories that are
# letter/symbol/number-like. This can be overridden by Unifont or UAX 11
# letter/symbol/number-like, as well as for unassigned/private-use chars.
# This can be overridden by Unifont or UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.
zerowidth = Set{Int}() # categories that may contain zero-width chars
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
for c in 0x0000:0x110000
if catcode(c) ∉ zerowidth
CharWidths[c] = 1
@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
elseif width=="Na"|| width=="H" # narrow or half
elseif width=="Na"|| width=="H"
CharWidths[c]=1
end
end
@ -115,9 +114,11 @@ end
for c in keys(CharWidths)
cat = catcode(c)
# make sure format control characters (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
# make sure format control characters (category Cf) have width 0
# (some of these, like U+0601, can have a width in some cases
# but normally act like prepended combining marks. U+fff9 etc
# are also odd, but have zero width in typical terminal contexts)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
CharWidths[c]=0
end
@ -128,11 +129,12 @@ for c in keys(CharWidths)
CharWidths[c]=0
end
# We also assign width of zero to unassigned and private-use
# We also assign width of one to unassigned and private-use
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
# but since these are nonstandard it seems questionable to recognize them).
# but since these are nonstandard it seems questionable to use Unifont metrics;
# if they are printed as the replacement character U+FFFD they will have width 1).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
CharWidths[c]=0
CharWidths[c]=1
end
# for some reason, Unifont has width-2 glyphs for ASCII control chars
@ -141,6 +143,9 @@ for c in keys(CharWidths)
end
end
#Soft hyphen is typically printed as a hyphen (-) in terminals.
CharWidths[0x00ad]=1
#By definition, should have zero width (on the same line)
#0x002028 '' category: Zl name: LINE SEPARATOR/
#0x002029 '' category: Zp name: PARAGRAPH SEPARATOR/
@ -158,8 +163,8 @@ CharWidths[0x2001]=2
CharWidths[0x2003]=2
#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.
# Output (to a file or pipe) for processing by data_generator.rb,
# encoded as a sequence of intervals.
firstc = 0x000000
lastv = 0

View File

@ -378,7 +378,7 @@ end
$stdout << "};\n\n"
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}

View File

@ -2,10 +2,16 @@
#include <ctype.h>
#include <wchar.h>
/* Report whether codepoint c has general category Cn or Co.
 * Returns 1 when the category reported by utf8proc_get_property() is
 * UTF8PROC_CATEGORY_CN or UTF8PROC_CATEGORY_CO, and 0 otherwise. */
static int my_unassigned(int c) {
    const int category = utf8proc_get_property(c)->category;
    if (category == UTF8PROC_CATEGORY_CN)
        return 1;
    return category == UTF8PROC_CATEGORY_CO;
}
static int my_isprint(int c) {
int cat = utf8proc_get_property(c)->category;
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
}
int main(int argc, char **argv)
@ -19,31 +25,36 @@ int main(int argc, char **argv)
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
w > 0) {
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error = 1;
error += 1;
}
if (w == 0 &&
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
fprintf(stderr, "zero width for symbol-like char %x\n", c);
error = 1;
error += 1;
}
if (c <= 127 && ((!isprint(c) && w > 0) ||
(isprint(c) && wcwidth(c) != w))) {
if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
wcwidth(c), w,
isprint(c) ? "printable" : "non-printable", c);
error = 1;
error += 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error = 1;
error += 1;
}
if (my_unassigned(c) && w != 1) {
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
error += 1;
}
}
check(!error, "utf8proc_charwidth FAILED tests.");
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
/* print some other information by comparing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");
@ -52,19 +63,14 @@ int main(int argc, char **argv)
int wc = wcwidth(c);
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
/* lots of these errors for out-of-date system unicode tables */
if (wc == -1 && my_isprint(c) && w > 0) {
if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
updates += 1;
#if 0
printf(" wcwidth(%x) = -1 for printable char\n", c);
#endif
}
if (wc == -1 && !my_isprint(c) && w > 0)
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
if (wc >= 0 && wc != w)
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
}
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n",
updates);
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
printf("Character-width tests SUCCEEDED.\n");
return 0;

File diff suppressed because it is too large Load Diff