Smaller tables (#68)
* convert sequences to utf-16 (saves 25kb) * store sequence length in properties instead of using -1 termination (saves 10kb) * cache index for slightly faster data creation * store lower/upper/title mapping in sequence array (saves 25kb). Add utf8proc_totitle, as title_mapping cannot be used to get the title codepoint anymore. Rename xxx_mapping to xxx_seqindex, so programs assuming a value with the old meaning fail at compile time * change combination array data type to uint16 (saves 40kb) * merge 1st and 2nd comb index (saves 50kb) * kill empty prefix/suffix in combination array (saves 50kb) * there was no need to have a separate combination start array; it can be merged into a single array * some fixes * mark the table as const again * and regen
This commit is contained in:
parent
9a0b87b57e
commit
eeebf70bcf
@ -115,22 +115,52 @@ def str2c(string, prefix)
|
|||||||
return "0" if string.nil?
|
return "0" if string.nil?
|
||||||
return "UTF8PROC_#{prefix}_#{string.upcase}"
|
return "UTF8PROC_#{prefix}_#{string.upcase}"
|
||||||
end
|
end
|
||||||
def ary2c(array)
|
def pushary(array)
|
||||||
return "UINT16_MAX" if array.nil?
|
idx = $int_array_indicies[array]
|
||||||
unless $int_array_indicies[array]
|
unless idx
|
||||||
$int_array_indicies[array] = $int_array.length
|
$int_array_indicies[array] = $int_array.length
|
||||||
|
idx = $int_array.length
|
||||||
array.each { |entry| $int_array << entry }
|
array.each { |entry| $int_array << entry }
|
||||||
$int_array << -1
|
|
||||||
end
|
end
|
||||||
raise "Array index out of bound" if $int_array_indicies[array] >= 65535
|
return idx
|
||||||
return "#{$int_array_indicies[array]}"
|
end
|
||||||
|
def cpary2utf16encoded(array)
|
||||||
|
return array.flat_map { |cp|
|
||||||
|
if (cp <= 0xFFFF)
|
||||||
|
raise "utf-16 code: #{cp}" if cp & 0b1111100000000000 == 0b1101100000000000
|
||||||
|
cp
|
||||||
|
else
|
||||||
|
temp = cp - 0x10000
|
||||||
|
[(temp >> 10) | 0b1101100000000000, (temp & 0b0000001111111111) | 0b1101110000000000]
|
||||||
|
end
|
||||||
|
}
|
||||||
|
end
|
||||||
|
def cpary2c(array)
|
||||||
|
return "UINT16_MAX" if array.nil? || array.length == 0
|
||||||
|
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
|
||||||
|
array = cpary2utf16encoded(array)
|
||||||
|
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
|
||||||
|
array = [lencode] + array
|
||||||
|
lencode = 7
|
||||||
|
end
|
||||||
|
idx = pushary(array)
|
||||||
|
raise "Array index out of bound" if idx > 0x1FFF
|
||||||
|
return "#{idx | (lencode << 13)}"
|
||||||
|
end
|
||||||
|
def singlecpmap(cp)
|
||||||
|
return "UINT16_MAX" if cp == nil
|
||||||
|
idx = pushary(cpary2utf16encoded([cp]))
|
||||||
|
raise "Array index out of bound" if idx > 0xFFFF
|
||||||
|
return "#{idx}"
|
||||||
end
|
end
|
||||||
|
|
||||||
class UnicodeChar
|
class UnicodeChar
|
||||||
attr_accessor :code, :name, :category, :combining_class, :bidi_class,
|
attr_accessor :code, :name, :category, :combining_class, :bidi_class,
|
||||||
:decomp_type, :decomp_mapping,
|
:decomp_type, :decomp_mapping,
|
||||||
:bidi_mirrored,
|
:bidi_mirrored,
|
||||||
:uppercase_mapping, :lowercase_mapping, :titlecase_mapping
|
:uppercase_mapping, :lowercase_mapping, :titlecase_mapping,
|
||||||
|
#caches:
|
||||||
|
:c_entry_index, :c_decomp_mapping, :c_case_folding
|
||||||
def initialize(line)
|
def initialize(line)
|
||||||
raise "Could not parse input." unless line =~ /^
|
raise "Could not parse input." unless line =~ /^
|
||||||
([0-9A-F]+); # code
|
([0-9A-F]+); # code
|
||||||
@ -165,19 +195,17 @@ class UnicodeChar
|
|||||||
def case_folding
|
def case_folding
|
||||||
$case_folding[code]
|
$case_folding[code]
|
||||||
end
|
end
|
||||||
def c_entry(comb1_indicies, comb2_indicies)
|
def c_entry(comb_indicies)
|
||||||
" " <<
|
" " <<
|
||||||
"{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
|
"{#{str2c category, 'CATEGORY'}, #{combining_class}, " <<
|
||||||
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
|
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
|
||||||
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
|
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
|
||||||
"#{ary2c decomp_mapping}, " <<
|
"#{c_decomp_mapping}, " <<
|
||||||
"#{ary2c case_folding}, " <<
|
"#{c_case_folding}, " <<
|
||||||
"#{uppercase_mapping or -1}, " <<
|
"#{singlecpmap uppercase_mapping }, " <<
|
||||||
"#{lowercase_mapping or -1}, " <<
|
"#{singlecpmap lowercase_mapping }, " <<
|
||||||
"#{titlecase_mapping or -1}, " <<
|
"#{singlecpmap titlecase_mapping }, " <<
|
||||||
"#{comb1_indicies[code] ?
|
"#{comb_indicies[code] ? comb_indicies[code]: 'UINT16_MAX'}, " <<
|
||||||
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
|
|
||||||
}, #{comb2_indicies[code] or -1}, " <<
|
|
||||||
"#{bidi_mirrored}, " <<
|
"#{bidi_mirrored}, " <<
|
||||||
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
||||||
"#{$ignorable.include?(code)}, " <<
|
"#{$ignorable.include?(code)}, " <<
|
||||||
@ -215,6 +243,8 @@ end
|
|||||||
|
|
||||||
comb1st_indicies = {}
|
comb1st_indicies = {}
|
||||||
comb2nd_indicies = {}
|
comb2nd_indicies = {}
|
||||||
|
comb2nd_indicies_sorted_keys = []
|
||||||
|
comb2nd_indicies_nonbasic = {}
|
||||||
comb_array = []
|
comb_array = []
|
||||||
|
|
||||||
chars.each do |char|
|
chars.each do |char|
|
||||||
@ -222,27 +252,69 @@ chars.each do |char|
|
|||||||
char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? and
|
char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? and
|
||||||
char_hash[char.decomp_mapping[0]].combining_class == 0 and
|
char_hash[char.decomp_mapping[0]].combining_class == 0 and
|
||||||
not $exclusions.include?(char.code)
|
not $exclusions.include?(char.code)
|
||||||
unless comb1st_indicies[char.decomp_mapping[0]]
|
|
||||||
comb1st_indicies[char.decomp_mapping[0]] = comb1st_indicies.keys.length
|
dm0 = char.decomp_mapping[0]
|
||||||
|
dm1 = char.decomp_mapping[1]
|
||||||
|
unless comb1st_indicies[dm0]
|
||||||
|
comb1st_indicies[dm0] = comb1st_indicies.keys.length
|
||||||
end
|
end
|
||||||
unless comb2nd_indicies[char.decomp_mapping[1]]
|
unless comb2nd_indicies[dm1]
|
||||||
comb2nd_indicies[char.decomp_mapping[1]] = comb2nd_indicies.keys.length
|
comb2nd_indicies_sorted_keys << dm1
|
||||||
|
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
|
||||||
end
|
end
|
||||||
comb_array[comb1st_indicies[char.decomp_mapping[0]]] ||= []
|
comb_array[comb1st_indicies[dm0]] ||= []
|
||||||
raise "Duplicate canonical mapping" if
|
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
|
||||||
comb_array[comb1st_indicies[char.decomp_mapping[0]]][
|
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
|
||||||
comb2nd_indicies[char.decomp_mapping[1]]]
|
|
||||||
comb_array[comb1st_indicies[char.decomp_mapping[0]]][
|
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
|
||||||
comb2nd_indicies[char.decomp_mapping[1]]] = char.code
|
end
|
||||||
|
char.c_decomp_mapping = cpary2c(char.decomp_mapping)
|
||||||
|
char.c_case_folding = cpary2c(char.case_folding)
|
||||||
|
end
|
||||||
|
|
||||||
|
comb_indicies = {}
|
||||||
|
cumoffset = 0
|
||||||
|
comb1st_indicies_lastoffsets = []
|
||||||
|
comb1st_indicies_firstoffsets = []
|
||||||
|
comb1st_indicies.each do |dm0, index|
|
||||||
|
first = nil
|
||||||
|
last = nil
|
||||||
|
offset = 0
|
||||||
|
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
||||||
|
if comb_array[index][b]
|
||||||
|
first = offset unless first
|
||||||
|
last = offset
|
||||||
|
last += 1 if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
end
|
||||||
|
offset += 1
|
||||||
|
offset += 1 if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
end
|
||||||
|
comb1st_indicies_firstoffsets[index] = first
|
||||||
|
comb1st_indicies_lastoffsets[index] = last
|
||||||
|
raise "double index" if comb_indicies[dm0]
|
||||||
|
comb_indicies[dm0] = cumoffset
|
||||||
|
cumoffset += last - first + 1 + 2
|
||||||
|
end
|
||||||
|
|
||||||
|
offset = 0
|
||||||
|
comb2nd_indicies_sorted_keys.each do |dm1|
|
||||||
|
raise "double index" if comb_indicies[dm1]
|
||||||
|
comb_indicies[dm1] = 0x8000 | (comb2nd_indicies[dm1] + offset)
|
||||||
|
raise "too large comb index" if comb2nd_indicies[dm1] + offset > 0x4000
|
||||||
|
if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
comb_indicies[dm1] = comb_indicies[dm1] | 0x4000
|
||||||
|
offset += 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
properties_indicies = {}
|
properties_indicies = {}
|
||||||
properties = []
|
properties = []
|
||||||
chars.each do |char|
|
chars.each do |char|
|
||||||
c_entry = char.c_entry(comb1st_indicies, comb2nd_indicies)
|
c_entry = char.c_entry(comb_indicies)
|
||||||
unless properties_indicies[c_entry]
|
char.c_entry_index = properties_indicies[c_entry]
|
||||||
|
unless char.c_entry_index
|
||||||
properties_indicies[c_entry] = properties.length
|
properties_indicies[c_entry] = properties.length
|
||||||
|
char.c_entry_index = properties.length
|
||||||
properties << c_entry
|
properties << c_entry
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -254,8 +326,7 @@ for code in 0...0x110000
|
|||||||
stage2_entry = []
|
stage2_entry = []
|
||||||
for code2 in code...(code+0x100)
|
for code2 in code...(code+0x100)
|
||||||
if char_hash[code2]
|
if char_hash[code2]
|
||||||
stage2_entry << (properties_indicies[char_hash[code2].c_entry(
|
stage2_entry << (char_hash[code2].c_entry_index + 1)
|
||||||
comb1st_indicies, comb2nd_indicies)] + 1)
|
|
||||||
else
|
else
|
||||||
stage2_entry << 0
|
stage2_entry << 0
|
||||||
end
|
end
|
||||||
@ -269,7 +340,7 @@ for code in 0...0x110000
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
$stdout << "const utf8proc_int32_t utf8proc_sequences[] = {\n "
|
$stdout << "const utf8proc_uint16_t utf8proc_sequences[] = {\n "
|
||||||
i = 0
|
i = 0
|
||||||
$int_array.each do |entry|
|
$int_array.each do |entry|
|
||||||
i += 1
|
i += 1
|
||||||
@ -306,23 +377,35 @@ end
|
|||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||||
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
|
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||||
properties.each { |line|
|
properties.each { |line|
|
||||||
$stdout << line
|
$stdout << line
|
||||||
}
|
}
|
||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "const utf8proc_int32_t utf8proc_combinations[] = {\n "
|
|
||||||
|
|
||||||
|
$stdout << "const utf8proc_uint16_t utf8proc_combinations[] = {\n "
|
||||||
i = 0
|
i = 0
|
||||||
comb1st_indicies.keys.sort.each_index do |a|
|
comb1st_indicies.keys.each_index do |a|
|
||||||
comb2nd_indicies.keys.sort.each_index do |b|
|
offset = 0
|
||||||
|
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
|
||||||
|
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
|
||||||
|
break if offset > comb1st_indicies_lastoffsets[a]
|
||||||
|
if offset >= comb1st_indicies_firstoffsets[a]
|
||||||
i += 1
|
i += 1
|
||||||
if i == 8
|
if i == 8
|
||||||
i = 0
|
i = 0
|
||||||
$stdout << "\n "
|
$stdout << "\n "
|
||||||
end
|
end
|
||||||
$stdout << ( comb_array[a][b] or -1 ) << ", "
|
v = comb_array[a][b] ? comb_array[a][b] : 0
|
||||||
|
$stdout << (( v & 0xFFFF0000 ) >> 16) << ", " if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
$stdout << (v & 0xFFFF) << ", "
|
||||||
end
|
end
|
||||||
|
offset += 1
|
||||||
|
offset += 1 if comb2nd_indicies_nonbasic[dm1]
|
||||||
|
end
|
||||||
|
$stdout << "\n"
|
||||||
end
|
end
|
||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
|
|||||||
@ -22,8 +22,7 @@ int main(int argc, char **argv)
|
|||||||
" uppercase_mapping = %x\n"
|
" uppercase_mapping = %x\n"
|
||||||
" lowercase_mapping = %x\n"
|
" lowercase_mapping = %x\n"
|
||||||
" titlecase_mapping = %x\n"
|
" titlecase_mapping = %x\n"
|
||||||
" comb1st_index = %d\n"
|
" comb_index = %d\n"
|
||||||
" comb2nd_index = %d\n"
|
|
||||||
" bidi_mirrored = %d\n"
|
" bidi_mirrored = %d\n"
|
||||||
" comp_exclusion = %d\n"
|
" comp_exclusion = %d\n"
|
||||||
" ignorable = %d\n"
|
" ignorable = %d\n"
|
||||||
@ -35,11 +34,10 @@ int main(int argc, char **argv)
|
|||||||
p->combining_class,
|
p->combining_class,
|
||||||
p->bidi_class,
|
p->bidi_class,
|
||||||
p->decomp_type,
|
p->decomp_type,
|
||||||
p->uppercase_mapping,
|
utf8proc_toupper(c),
|
||||||
p->lowercase_mapping,
|
utf8proc_tolower(c),
|
||||||
p->titlecase_mapping,
|
utf8proc_totitle(c),
|
||||||
p->comb1st_index,
|
p->comb_index,
|
||||||
p->comb2nd_index,
|
|
||||||
p->bidi_mirrored,
|
p->bidi_mirrored,
|
||||||
p->comp_exclusion,
|
p->comp_exclusion,
|
||||||
p->ignorable,
|
p->ignorable,
|
||||||
|
|||||||
99
utf8proc.c
99
utf8proc.c
@ -316,16 +316,58 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
|
|||||||
state);
|
state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
|
||||||
|
{
|
||||||
|
utf8proc_int32_t entry_cp = **entry;
|
||||||
|
if ((entry_cp & 0xF800) == 0xD800) {
|
||||||
|
*entry = *entry + 1;
|
||||||
|
entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
|
||||||
|
entry_cp += 0x10000;
|
||||||
|
}
|
||||||
|
return entry_cp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
|
||||||
|
{
|
||||||
|
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
|
||||||
|
return seqindex_decode_entry(&entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
||||||
|
utf8proc_ssize_t written = 0;
|
||||||
|
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
|
||||||
|
int len = seqindex >> 13;
|
||||||
|
if (len >= 7) {
|
||||||
|
len = *entry;
|
||||||
|
entry++;
|
||||||
|
}
|
||||||
|
for (; len >= 0; entry++, len--) {
|
||||||
|
utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
|
||||||
|
|
||||||
|
written += utf8proc_decompose_char(entry_cp, dst+written,
|
||||||
|
(bufsize > written) ? (bufsize - written) : 0, options,
|
||||||
|
last_boundclass);
|
||||||
|
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
||||||
|
}
|
||||||
|
return written;
|
||||||
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
|
||||||
{
|
{
|
||||||
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
|
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
|
||||||
return cl >= 0 ? cl : c;
|
return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
|
||||||
{
|
{
|
||||||
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
|
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
|
||||||
return cu >= 0 ? cu : c;
|
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
|
||||||
|
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
|
||||||
|
{
|
||||||
|
utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
|
||||||
|
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* return a character width analogous to wcwidth (except portable and
|
/* return a character width analogous to wcwidth (except portable and
|
||||||
@ -343,6 +385,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
|
|||||||
return s[utf8proc_category(c)];
|
return s[utf8proc_category(c)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define utf8proc_decompose_lump(replacement_uc) \
|
#define utf8proc_decompose_lump(replacement_uc) \
|
||||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||||
@ -408,32 +452,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
category == UTF8PROC_CATEGORY_ME) return 0;
|
category == UTF8PROC_CATEGORY_ME) return 0;
|
||||||
}
|
}
|
||||||
if (options & UTF8PROC_CASEFOLD) {
|
if (options & UTF8PROC_CASEFOLD) {
|
||||||
if (property->casefold_mapping != UINT16_MAX) {
|
if (property->casefold_seqindex != UINT16_MAX) {
|
||||||
const utf8proc_int32_t *casefold_entry;
|
return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
|
||||||
utf8proc_ssize_t written = 0;
|
|
||||||
for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
|
|
||||||
*casefold_entry >= 0; casefold_entry++) {
|
|
||||||
written += utf8proc_decompose_char(*casefold_entry, dst+written,
|
|
||||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
|
||||||
last_boundclass);
|
|
||||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
|
||||||
}
|
|
||||||
return written;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||||
if (property->decomp_mapping != UINT16_MAX &&
|
if (property->decomp_seqindex != UINT16_MAX &&
|
||||||
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
||||||
const utf8proc_int32_t *decomp_entry;
|
return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
|
||||||
utf8proc_ssize_t written = 0;
|
|
||||||
for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
|
|
||||||
*decomp_entry >= 0; decomp_entry++) {
|
|
||||||
written += utf8proc_decompose_char(*decomp_entry, dst+written,
|
|
||||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
|
||||||
last_boundclass);
|
|
||||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
|
||||||
}
|
|
||||||
return written;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (options & UTF8PROC_CHARBOUND) {
|
if (options & UTF8PROC_CHARBOUND) {
|
||||||
@ -588,13 +614,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
if (!starter_property) {
|
if (!starter_property) {
|
||||||
starter_property = unsafe_get_property(*starter);
|
starter_property = unsafe_get_property(*starter);
|
||||||
}
|
}
|
||||||
if (starter_property->comb1st_index >= 0 &&
|
if (starter_property->comb_index < 0x8000 &&
|
||||||
current_property->comb2nd_index >= 0) {
|
current_property->comb_index != UINT16_MAX &&
|
||||||
composition = utf8proc_combinations[
|
current_property->comb_index >= 0x8000) {
|
||||||
starter_property->comb1st_index +
|
int sidx = starter_property->comb_index;
|
||||||
current_property->comb2nd_index
|
int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
|
||||||
];
|
if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
|
||||||
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
idx += sidx + 2;
|
||||||
|
if (current_property->comb_index & 0x4000) {
|
||||||
|
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
|
||||||
|
} else
|
||||||
|
composition = utf8proc_combinations[idx];
|
||||||
|
|
||||||
|
if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
|
||||||
!(unsafe_get_property(composition)->comp_exclusion))) {
|
!(unsafe_get_property(composition)->comp_exclusion))) {
|
||||||
*starter = composition;
|
*starter = composition;
|
||||||
starter_property = NULL;
|
starter_property = NULL;
|
||||||
@ -602,6 +634,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
buffer[wpos] = current_char;
|
buffer[wpos] = current_char;
|
||||||
if (current_property->combining_class) {
|
if (current_property->combining_class) {
|
||||||
if (current_property->combining_class > max_combining_class) {
|
if (current_property->combining_class > max_combining_class) {
|
||||||
|
|||||||
20
utf8proc.h
20
utf8proc.h
@ -242,13 +242,12 @@ typedef struct utf8proc_property_struct {
|
|||||||
* @see utf8proc_decomp_type_t.
|
* @see utf8proc_decomp_type_t.
|
||||||
*/
|
*/
|
||||||
utf8proc_propval_t decomp_type;
|
utf8proc_propval_t decomp_type;
|
||||||
utf8proc_uint16_t decomp_mapping;
|
utf8proc_uint16_t decomp_seqindex;
|
||||||
utf8proc_uint16_t casefold_mapping;
|
utf8proc_uint16_t casefold_seqindex;
|
||||||
utf8proc_int32_t uppercase_mapping;
|
utf8proc_uint16_t uppercase_seqindex;
|
||||||
utf8proc_int32_t lowercase_mapping;
|
utf8proc_uint16_t lowercase_seqindex;
|
||||||
utf8proc_int32_t titlecase_mapping;
|
utf8proc_uint16_t titlecase_seqindex;
|
||||||
utf8proc_int32_t comb1st_index;
|
utf8proc_uint16_t comb_index;
|
||||||
utf8proc_int32_t comb2nd_index;
|
|
||||||
unsigned bidi_mirrored:1;
|
unsigned bidi_mirrored:1;
|
||||||
unsigned comp_exclusion:1;
|
unsigned comp_exclusion:1;
|
||||||
/**
|
/**
|
||||||
@ -549,6 +548,13 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
|||||||
*/
|
*/
|
||||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||||
|
* title-case character, if any; otherwise (if there is no title-case
|
||||||
|
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||||
* except that a width of 0 is returned for non-printable codepoints
|
* except that a width of 0 is returned for non-printable codepoints
|
||||||
|
|||||||
20635
utf8proc_data.c
20635
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user