charwidth=1 for soft hyphen and unassigned codepoints (#135)
* use width=1 for soft hyphen and for unassigned/PUA codepoints
* don't count unassigned codepoints when comparing with system wcwidth
* more tests
* indentation fixes
* NEWS for 135
* remove special-casing for arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not
* regenerate
This commit is contained in:
parent
0975bf9b6d
commit
02f4e1890c
4
NEWS.md
4
NEWS.md
@ -17,6 +17,9 @@
|
|||||||
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
|
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
|
||||||
case-folding still yields the standard "ss" mapping.
|
case-folding still yields the standard "ss" mapping.
|
||||||
|
|
||||||
|
- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
|
||||||
|
for unassigned/PUA codepoints ([#135]).
|
||||||
|
|
||||||
## Version 2.1.1 ##
|
## Version 2.1.1 ##
|
||||||
|
|
||||||
2018-04-27
|
2018-04-27
|
||||||
@ -336,3 +339,4 @@ Release of version 1.0.1
|
|||||||
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
|
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
|
||||||
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
|
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
|
||||||
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
|
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
|
||||||
|
[#135]: https://github.com/JuliaLang/utf8proc/issues/135
|
||||||
|
|||||||
@ -20,12 +20,12 @@ import Base.UTF8proc
|
|||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# Use a default width of 1 for all character categories that are
|
# Use a default width of 1 for all character categories that are
|
||||||
# letter/symbol/number-like. This can be overridden by Unifont or UAX 11
|
# letter/symbol/number-like, as well as for unassigned/private-use chars.
|
||||||
|
# This can be overridden by Unifont or UAX 11
|
||||||
# below, but provides a useful nonzero fallback for new codepoints when
|
# below, but provides a useful nonzero fallback for new codepoints when
|
||||||
# a new Unicode version has been released but Unifont hasn't been updated yet.
|
# a new Unicode version has been released but Unifont hasn't been updated yet.
|
||||||
|
|
||||||
zerowidth = Set{Int}() # categories that may contain zero-width chars
|
zerowidth = Set{Int}() # categories that may contain zero-width chars
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
|
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
|
||||||
@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
|
|||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
|
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
|
||||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
|
|
||||||
for c in 0x0000:0x110000
|
for c in 0x0000:0x110000
|
||||||
if catcode(c) ∉ zerowidth
|
if catcode(c) ∉ zerowidth
|
||||||
CharWidths[c] = 1
|
CharWidths[c] = 1
|
||||||
@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
|
|||||||
for c in charstart:charend
|
for c in charstart:charend
|
||||||
if width=="W" || width=="F" # wide or full
|
if width=="W" || width=="F" # wide or full
|
||||||
CharWidths[c]=2
|
CharWidths[c]=2
|
||||||
elseif width=="Na"|| width=="H" # narrow or half
|
elseif width=="Na"|| width=="H"
|
||||||
CharWidths[c]=1
|
CharWidths[c]=1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -115,9 +114,11 @@ end
|
|||||||
for c in keys(CharWidths)
|
for c in keys(CharWidths)
|
||||||
cat = catcode(c)
|
cat = catcode(c)
|
||||||
|
|
||||||
# make sure format control characters (category Cf) have width 0,
|
# make sure format control characters (category Cf) have width 0
|
||||||
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
|
# (some of these, like U+0601, can have a width in some cases
|
||||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
|
# but normally act like prepended combining marks. U+fff9 etc
|
||||||
|
# are also odd, but have zero width in typical terminal contexts)
|
||||||
|
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
|
||||||
CharWidths[c]=0
|
CharWidths[c]=0
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -128,11 +129,12 @@ for c in keys(CharWidths)
|
|||||||
CharWidths[c]=0
|
CharWidths[c]=0
|
||||||
end
|
end
|
||||||
|
|
||||||
# We also assign width of zero to unassigned and private-use
|
# We also assign width of one to unassigned and private-use
|
||||||
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
|
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
|
||||||
# but since these are nonstandard it seems questionable to recognize them).
|
# but since these are nonstandard it seems questionable to use Unifont metrics;
|
||||||
|
# if they are printed as the replacement character U+FFFD they will have width 1).
|
||||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
|
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
|
||||||
CharWidths[c]=0
|
CharWidths[c]=1
|
||||||
end
|
end
|
||||||
|
|
||||||
# for some reason, Unifont has width-2 glyphs for ASCII control chars
|
# for some reason, Unifont has width-2 glyphs for ASCII control chars
|
||||||
@ -141,6 +143,9 @@ for c in keys(CharWidths)
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
#Soft hyphen is typically printed as a hyphen (-) in terminals.
|
||||||
|
CharWidths[0x00ad]=1
|
||||||
|
|
||||||
#By definition, should have zero width (on the same line)
|
#By definition, should have zero width (on the same line)
|
||||||
#0x002028 '
' category: Zl name: LINE SEPARATOR/
|
#0x002028 '
' category: Zl name: LINE SEPARATOR/
|
||||||
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
|
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
|
||||||
@ -158,8 +163,8 @@ CharWidths[0x2001]=2
|
|||||||
CharWidths[0x2003]=2
|
CharWidths[0x2003]=2
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# Output (to a file or pipe) for processing by data_generator.rb
|
# Output (to a file or pipe) for processing by data_generator.rb,
|
||||||
# ... don't bother to output zero widths since that will be the default.
|
# encoded as a sequence of intervals.
|
||||||
|
|
||||||
firstc = 0x000000
|
firstc = 0x000000
|
||||||
lastv = 0
|
lastv = 0
|
||||||
|
|||||||
@ -378,7 +378,7 @@ end
|
|||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
|
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||||
properties.each { |line|
|
properties.each { |line|
|
||||||
$stdout << line
|
$stdout << line
|
||||||
}
|
}
|
||||||
|
|||||||
120
test/charwidth.c
120
test/charwidth.c
@ -2,70 +2,76 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <wchar.h>
|
#include <wchar.h>
|
||||||
|
|
||||||
|
static int my_unassigned(int c) {
|
||||||
|
int cat = utf8proc_get_property(c)->category;
|
||||||
|
return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
|
||||||
|
}
|
||||||
|
|
||||||
static int my_isprint(int c) {
|
static int my_isprint(int c) {
|
||||||
int cat = utf8proc_get_property(c)->category;
|
int cat = utf8proc_get_property(c)->category;
|
||||||
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
|
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
|
||||||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
|
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
|
||||||
|
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
int c, error = 0, updates = 0;
|
int c, error = 0, updates = 0;
|
||||||
|
|
||||||
(void) argc; /* unused */
|
(void) argc; /* unused */
|
||||||
(void) argv; /* unused */
|
(void) argv; /* unused */
|
||||||
|
|
||||||
/* some simple sanity tests of the character widths */
|
/* some simple sanity tests of the character widths */
|
||||||
for (c = 0; c <= 0x110000; ++c) {
|
for (c = 0; c <= 0x110000; ++c) {
|
||||||
int cat = utf8proc_get_property(c)->category;
|
int cat = utf8proc_get_property(c)->category;
|
||||||
int w = utf8proc_charwidth(c);
|
int w = utf8proc_charwidth(c);
|
||||||
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
|
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
|
||||||
w > 0) {
|
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
|
||||||
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
|
error += 1;
|
||||||
error = 1;
|
}
|
||||||
}
|
if (w == 0 &&
|
||||||
if (w == 0 &&
|
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
|
||||||
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
|
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
|
||||||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
|
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
|
||||||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
|
fprintf(stderr, "zero width for symbol-like char %x\n", c);
|
||||||
fprintf(stderr, "zero width for symbol-like char %x\n", c);
|
error += 1;
|
||||||
error = 1;
|
}
|
||||||
}
|
if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
|
||||||
if (c <= 127 && ((!isprint(c) && w > 0) ||
|
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
|
||||||
(isprint(c) && wcwidth(c) != w))) {
|
wcwidth(c), w,
|
||||||
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
|
isprint(c) ? "printable" : "non-printable", c);
|
||||||
wcwidth(c), w,
|
error += 1;
|
||||||
isprint(c) ? "printable" : "non-printable", c);
|
}
|
||||||
error = 1;
|
if (!my_isprint(c) && w > 0) {
|
||||||
}
|
fprintf(stderr, "non-printing %x had width %d\n", c, w);
|
||||||
if (!my_isprint(c) && w > 0) {
|
error += 1;
|
||||||
fprintf(stderr, "non-printing %x had width %d\n", c, w);
|
}
|
||||||
error = 1;
|
if (my_unassigned(c) && w != 1) {
|
||||||
}
|
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
|
||||||
}
|
error += 1;
|
||||||
check(!error, "utf8proc_charwidth FAILED tests.");
|
}
|
||||||
|
}
|
||||||
|
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
|
||||||
|
|
||||||
/* print some other information by comparing with system wcwidth */
|
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
|
||||||
printf("Mismatches with system wcwidth (not necessarily errors):\n");
|
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
|
||||||
for (c = 0; c <= 0x110000; ++c) {
|
|
||||||
int w = utf8proc_charwidth(c);
|
|
||||||
int wc = wcwidth(c);
|
|
||||||
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
|
|
||||||
/* lots of these errors for out-of-date system unicode tables */
|
|
||||||
if (wc == -1 && my_isprint(c) && w > 0) {
|
|
||||||
updates += 1;
|
|
||||||
#if 0
|
|
||||||
printf(" wcwidth(%x) = -1 for printable char\n", c);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
if (wc == -1 && !my_isprint(c) && w > 0)
|
|
||||||
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
|
|
||||||
if (wc >= 0 && wc != w)
|
|
||||||
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
|
|
||||||
}
|
|
||||||
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n",
|
|
||||||
updates);
|
|
||||||
printf("Character-width tests SUCCEEDED.\n");
|
|
||||||
|
|
||||||
return 0;
|
/* print some other information by comparing with system wcwidth */
|
||||||
|
printf("Mismatches with system wcwidth (not necessarily errors):\n");
|
||||||
|
for (c = 0; c <= 0x110000; ++c) {
|
||||||
|
int w = utf8proc_charwidth(c);
|
||||||
|
int wc = wcwidth(c);
|
||||||
|
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
|
||||||
|
/* lots of these errors for out-of-date system unicode tables */
|
||||||
|
if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
|
||||||
|
updates += 1;
|
||||||
|
if (wc == -1 && !my_isprint(c) && w > 0)
|
||||||
|
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
|
||||||
|
if (wc >= 0 && wc != w)
|
||||||
|
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
|
||||||
|
}
|
||||||
|
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
|
||||||
|
printf("Character-width tests SUCCEEDED.\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
3624
utf8proc_data.c
3624
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user