charwidth=1 for soft hyphen and unassigned codepoints (#135)
* use width=1 for soft hyphen and for unassigned/PUA codepoints * don't count unassigned codepoints when comparing with system wcwidth * more tests * indentation fixes * NEWS for 135 * remove special-casing for arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not * regenerate
This commit is contained in:
committed by
GitHub
parent
0975bf9b6d
commit
02f4e1890c
@@ -20,12 +20,12 @@ import Base.UTF8proc
|
||||
|
||||
#############################################################################
|
||||
# Use a default width of 1 for all character categories that are
|
||||
# letter/symbol/number-like. This can be overriden by Unifont or UAX 11
|
||||
# letter/symbol/number-like, as well as for unassigned/private-use chars.
|
||||
# This can be overriden by Unifont or UAX 11
|
||||
# below, but provides a useful nonzero fallback for new codepoints when
|
||||
# a new Unicode version has been released but Unifont hasn't been updated yet.
|
||||
|
||||
zerowidth = Set{Int}() # categories that may contain zero-width chars
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
|
||||
@@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
|
||||
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
|
||||
for c in 0x0000:0x110000
|
||||
if catcode(c) ∉ zerowidth
|
||||
CharWidths[c] = 1
|
||||
@@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
|
||||
for c in charstart:charend
|
||||
if width=="W" || width=="F" # wide or full
|
||||
CharWidths[c]=2
|
||||
elseif width=="Na"|| width=="H" # narrow or half
|
||||
elseif width=="Na"|| width=="H"
|
||||
CharWidths[c]=1
|
||||
end
|
||||
end
|
||||
@@ -115,9 +114,11 @@ end
|
||||
for c in keys(CharWidths)
|
||||
cat = catcode(c)
|
||||
|
||||
# make sure format control character (category Cf) have width 0,
|
||||
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
|
||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
|
||||
# make sure format control character (category Cf) have width 0
|
||||
# (some of these, like U+0601, can have a width in some cases
|
||||
# but normally act like prepended combining marks. U+fff9 etc
|
||||
# are also odd, but have zero width in typical terminal contexts)
|
||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
|
||||
CharWidths[c]=0
|
||||
end
|
||||
|
||||
@@ -128,11 +129,12 @@ for c in keys(CharWidths)
|
||||
CharWidths[c]=0
|
||||
end
|
||||
|
||||
# We also assign width of zero to unassigned and private-use
|
||||
# We also assign width of one to unassigned and private-use
|
||||
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
|
||||
# but since these are nonstandard it seems questionable to recognize them).
|
||||
# but since these are nonstandard it seems questionable to use Unifont metrics;
|
||||
# if they are printed as the replacement character U+FFFD they will have width 1).
|
||||
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
|
||||
CharWidths[c]=0
|
||||
CharWidths[c]=1
|
||||
end
|
||||
|
||||
# for some reason, Unifont has width-2 glyphs for ASCII control chars
|
||||
@@ -141,6 +143,9 @@ for c in keys(CharWidths)
|
||||
end
|
||||
end
|
||||
|
||||
#Soft hyphen is typically printed as a hyphen (-) in terminals.
|
||||
CharWidths[0x00ad]=1
|
||||
|
||||
#By definition, should have zero width (on the same line)
|
||||
#0x002028 '
' category: Zl name: LINE SEPARATOR/
|
||||
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
|
||||
@@ -158,8 +163,8 @@ CharWidths[0x2001]=2
|
||||
CharWidths[0x2003]=2
|
||||
|
||||
#############################################################################
|
||||
# Output (to a file or pipe) for processing by data_generator.rb
|
||||
# ... don't bother to output zero widths since that will be the default.
|
||||
# Output (to a file or pipe) for processing by data_generator.rb,
|
||||
# encoded as a sequence of intervals.
|
||||
|
||||
firstc = 0x000000
|
||||
lastv = 0
|
||||
|
||||
@@ -378,7 +378,7 @@ end
|
||||
$stdout << "};\n\n"
|
||||
|
||||
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||
properties.each { |line|
|
||||
$stdout << line
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user