update graphemes for Unicode 7, add utf8proc_grapheme_break function
This commit is contained in:
parent
539d2cc202
commit
397a1eabea
2
.gitignore
vendored
2
.gitignore
vendored
@ -15,3 +15,5 @@ bench/icu
|
|||||||
bench/unistring
|
bench/unistring
|
||||||
normtest
|
normtest
|
||||||
graphemetest
|
graphemetest
|
||||||
|
utf8proc_data.c.new
|
||||||
|
printproperty
|
||||||
|
|||||||
14
Makefile
14
Makefile
@ -29,16 +29,19 @@ clean:
|
|||||||
$(MAKE) -C bench clean
|
$(MAKE) -C bench clean
|
||||||
|
|
||||||
update: utf8proc_data.c.new
|
update: utf8proc_data.c.new
|
||||||
|
cp -f utf8proc_data.c.new utf8proc_data.c
|
||||||
|
|
||||||
# real targets
|
# real targets
|
||||||
|
|
||||||
utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
|
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
|
||||||
$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new
|
$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new
|
||||||
|
|
||||||
UnicodeData.txt:
|
UnicodeData.txt:
|
||||||
|
|
||||||
$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
|
$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||||
|
|
||||||
|
GraphemeBreakProperty.txt:
|
||||||
|
$(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
||||||
|
|
||||||
DerivedCoreProperties.txt:
|
DerivedCoreProperties.txt:
|
||||||
$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
|
$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
|
||||||
|
|
||||||
@ -72,10 +75,13 @@ GraphemeBreakTest.txt:
|
|||||||
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
||||||
|
|
||||||
normtest: normtest.c utf8proc.o mojibake.h tests.h
|
normtest: normtest.c utf8proc.o mojibake.h tests.h
|
||||||
$(cc) normtest.c utf8proc.o -o normtest
|
$(cc) normtest.c utf8proc.o -o $@
|
||||||
|
|
||||||
graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
|
graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
|
||||||
$(cc) graphemetest.c utf8proc.o -o graphemetest
|
$(cc) graphemetest.c utf8proc.o -o $@
|
||||||
|
|
||||||
|
printproperty: printproperty.c utf8proc.o mojibake.h tests.h
|
||||||
|
$(cc) printproperty.c utf8proc.o -o $@
|
||||||
|
|
||||||
check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
|
check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
|
||||||
./normtest
|
./normtest
|
||||||
|
|||||||
@ -75,13 +75,13 @@ $ignorable_list.each_line do |entry|
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
|
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
|
||||||
$grapheme_extend = []
|
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
|
||||||
$grapheme_extend_list.each_line do |entry|
|
$grapheme_boundclass_list.each_line do |entry|
|
||||||
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
|
||||||
$1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
|
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
|
||||||
elsif entry =~ /^[0-9A-F]+/
|
elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
|
||||||
$grapheme_extend << $&.hex
|
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -161,18 +161,18 @@ class UnicodeChar
|
|||||||
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
|
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
|
||||||
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
|
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
|
||||||
"#{ary2c decomp_mapping}, " <<
|
"#{ary2c decomp_mapping}, " <<
|
||||||
"#{bidi_mirrored}, " <<
|
"#{ary2c case_folding}, " <<
|
||||||
"#{uppercase_mapping or -1}, " <<
|
"#{uppercase_mapping or -1}, " <<
|
||||||
"#{lowercase_mapping or -1}, " <<
|
"#{lowercase_mapping or -1}, " <<
|
||||||
"#{titlecase_mapping or -1}, " <<
|
"#{titlecase_mapping or -1}, " <<
|
||||||
"#{comb1_indicies[code] ?
|
"#{comb1_indicies[code] ?
|
||||||
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
|
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
|
||||||
}, #{comb2_indicies[code] or -1}, " <<
|
}, #{comb2_indicies[code] or -1}, " <<
|
||||||
|
"#{bidi_mirrored}, " <<
|
||||||
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
|
||||||
"#{$ignorable.include?(code)}, " <<
|
"#{$ignorable.include?(code)}, " <<
|
||||||
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
|
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
|
||||||
"#{$grapheme_extend.include?(code)}, " <<
|
"#{$grapheme_boundclass[code]}},\n"
|
||||||
"#{ary2c case_folding}},\n"
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -295,7 +295,7 @@ end
|
|||||||
$stdout << "};\n\n"
|
$stdout << "};\n\n"
|
||||||
|
|
||||||
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
|
||||||
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
|
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
|
||||||
properties.each { |line|
|
properties.each { |line|
|
||||||
$stdout << line
|
$stdout << line
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,7 +7,7 @@ int main(void)
|
|||||||
FILE *f = fopen("GraphemeBreakTest.txt", "r");
|
FILE *f = fopen("GraphemeBreakTest.txt", "r");
|
||||||
uint8_t src[1024];
|
uint8_t src[1024];
|
||||||
|
|
||||||
check(f != NULL, "error opening NormalizationTest.txt");
|
check(f != NULL, "error opening GraphemeBreakTest.txt");
|
||||||
while (getline(&buf, &bufsize, f) > 0) {
|
while (getline(&buf, &bufsize, f) > 0) {
|
||||||
size_t bi = 0, si = 0;
|
size_t bi = 0, si = 0;
|
||||||
lineno += 1;
|
lineno += 1;
|
||||||
@ -20,7 +20,7 @@ int main(void)
|
|||||||
while (buf[bi]) {
|
while (buf[bi]) {
|
||||||
bi = skipspaces(buf, bi);
|
bi = skipspaces(buf, bi);
|
||||||
if (buf[bi] == '/') { /* grapheme break */
|
if (buf[bi] == '/') { /* grapheme break */
|
||||||
src[si++] = 0xff;
|
src[si++] = '/';
|
||||||
bi++;
|
bi++;
|
||||||
}
|
}
|
||||||
else if (buf[bi] == '+') { /* no break */
|
else if (buf[bi] == '+') { /* no break */
|
||||||
@ -34,8 +34,8 @@ int main(void)
|
|||||||
while (src[si]) ++si; /* advance to NUL termination */
|
while (src[si]) ++si; /* advance to NUL termination */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (si && src[si-1] == 0xff)
|
if (si && src[si-1] == '/')
|
||||||
--si; /* no 0xff after final grapheme */
|
--si; /* no break after final grapheme */
|
||||||
src[si] = 0; /* NUL-terminate */
|
src[si] = 0; /* NUL-terminate */
|
||||||
|
|
||||||
if (si) {
|
if (si) {
|
||||||
@ -44,16 +44,27 @@ int main(void)
|
|||||||
ssize_t glen;
|
ssize_t glen;
|
||||||
uint8_t *g; /* utf8proc_map grapheme results */
|
uint8_t *g; /* utf8proc_map grapheme results */
|
||||||
while (i < si) {
|
while (i < si) {
|
||||||
if (src[i] != 0xff)
|
if (src[i] != '/')
|
||||||
utf8[j++] = src[i++];
|
utf8[j++] = src[i++];
|
||||||
else
|
else
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
|
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
|
||||||
|
if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
|
||||||
|
/* the test file contains surrogate codepoints, which are only for UTF-16 */
|
||||||
|
printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
|
||||||
|
}
|
||||||
|
else {
|
||||||
check(glen >= 0, "utf8proc_map error = %s",
|
check(glen >= 0, "utf8proc_map error = %s",
|
||||||
utf8proc_errmsg(glen));
|
utf8proc_errmsg(glen));
|
||||||
|
for (i = 0; i <= glen; ++i)
|
||||||
|
if (g[i] == 0xff)
|
||||||
|
g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
|
||||||
|
printf("line %zd\n", lineno);
|
||||||
check(!strcmp((char*)g, (char*)src),
|
check(!strcmp((char*)g, (char*)src),
|
||||||
"grapheme mismatch: %s vs. %s", (char*)g, (char*)src);
|
"grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
|
||||||
|
}
|
||||||
|
free(g);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fclose(f);
|
fclose(f);
|
||||||
|
|||||||
27
mojibake.h
27
mojibake.h
@ -170,17 +170,17 @@ typedef struct utf8proc_property_struct {
|
|||||||
utf8proc_propval_t bidi_class;
|
utf8proc_propval_t bidi_class;
|
||||||
utf8proc_propval_t decomp_type;
|
utf8proc_propval_t decomp_type;
|
||||||
const int32_t *decomp_mapping;
|
const int32_t *decomp_mapping;
|
||||||
unsigned bidi_mirrored:1;
|
const int32_t *casefold_mapping;
|
||||||
int32_t uppercase_mapping;
|
int32_t uppercase_mapping;
|
||||||
int32_t lowercase_mapping;
|
int32_t lowercase_mapping;
|
||||||
int32_t titlecase_mapping;
|
int32_t titlecase_mapping;
|
||||||
int32_t comb1st_index;
|
int32_t comb1st_index;
|
||||||
int32_t comb2nd_index;
|
int32_t comb2nd_index;
|
||||||
|
unsigned bidi_mirrored:1;
|
||||||
unsigned comp_exclusion:1;
|
unsigned comp_exclusion:1;
|
||||||
unsigned ignorable:1;
|
unsigned ignorable:1;
|
||||||
unsigned control_boundary:1;
|
unsigned control_boundary:1;
|
||||||
unsigned extend:1;
|
unsigned boundclass:4;
|
||||||
const int32_t *casefold_mapping;
|
|
||||||
} utf8proc_property_t;
|
} utf8proc_property_t;
|
||||||
|
|
||||||
#define UTF8PROC_CATEGORY_LU 1
|
#define UTF8PROC_CATEGORY_LU 1
|
||||||
@ -253,6 +253,21 @@ typedef struct utf8proc_property_struct {
|
|||||||
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
|
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
|
||||||
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
|
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
|
||||||
|
|
||||||
|
/* values for boundclass property: */
|
||||||
|
#define UTF8PROC_BOUNDCLASS_START 0
|
||||||
|
#define UTF8PROC_BOUNDCLASS_OTHER 1
|
||||||
|
#define UTF8PROC_BOUNDCLASS_CR 2
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LF 3
|
||||||
|
#define UTF8PROC_BOUNDCLASS_CONTROL 4
|
||||||
|
#define UTF8PROC_BOUNDCLASS_EXTEND 5
|
||||||
|
#define UTF8PROC_BOUNDCLASS_L 6
|
||||||
|
#define UTF8PROC_BOUNDCLASS_V 7
|
||||||
|
#define UTF8PROC_BOUNDCLASS_T 8
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LV 9
|
||||||
|
#define UTF8PROC_BOUNDCLASS_LVT 10
|
||||||
|
#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
|
||||||
|
#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12
|
||||||
|
|
||||||
DLLEXPORT extern const int8_t utf8proc_utf8class[256];
|
DLLEXPORT extern const int8_t utf8proc_utf8class[256];
|
||||||
|
|
||||||
DLLEXPORT const char *utf8proc_version(void);
|
DLLEXPORT const char *utf8proc_version(void);
|
||||||
@ -367,6 +382,12 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options
|
|||||||
* crash!
|
* crash!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
|
||||||
|
/*
|
||||||
|
* Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
|
||||||
|
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||||
|
*/
|
||||||
|
|
||||||
DLLEXPORT ssize_t utf8proc_map(
|
DLLEXPORT ssize_t utf8proc_map(
|
||||||
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
||||||
);
|
);
|
||||||
|
|||||||
45
printproperty.c
Normal file
45
printproperty.c
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/* simple test program to print out the utf8proc properties for a codepoint */
|
||||||
|
|
||||||
|
#include "tests.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 1; i < argc; ++i) {
|
||||||
|
int c;
|
||||||
|
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
|
||||||
|
const utf8proc_property_t *p = utf8proc_get_property(c);
|
||||||
|
printf("U+%s:\n"
|
||||||
|
" category = %d\n"
|
||||||
|
" combining_class = %d\n"
|
||||||
|
" bidi_class = %d\n"
|
||||||
|
" decomp_type = %d\n"
|
||||||
|
" uppercase_mapping = %x\n"
|
||||||
|
" lowercase_mapping = %x\n"
|
||||||
|
" titlecase_mapping = %x\n"
|
||||||
|
" comb1st_index = %d\n"
|
||||||
|
" comb2nd_index = %d\n"
|
||||||
|
" bidi_mirrored = %d\n"
|
||||||
|
" comp_exclusion = %d\n"
|
||||||
|
" ignorable = %d\n"
|
||||||
|
" control_boundary = %d\n"
|
||||||
|
" boundclass = %d\n",
|
||||||
|
argv[i],
|
||||||
|
p->category,
|
||||||
|
p->combining_class,
|
||||||
|
p->bidi_class,
|
||||||
|
p->decomp_type,
|
||||||
|
p->uppercase_mapping,
|
||||||
|
p->lowercase_mapping,
|
||||||
|
p->titlecase_mapping,
|
||||||
|
p->comb1st_index,
|
||||||
|
p->comb2nd_index,
|
||||||
|
p->bidi_mirrored,
|
||||||
|
p->comp_exclusion,
|
||||||
|
p->ignorable,
|
||||||
|
p->control_boundary,
|
||||||
|
p->boundclass);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
89
utf8proc.c
89
utf8proc.c
@ -81,19 +81,6 @@ DLLEXPORT const int8_t utf8proc_utf8class[256] = {
|
|||||||
#define UTF8PROC_HANGUL_S_START 0xAC00
|
#define UTF8PROC_HANGUL_S_START 0xAC00
|
||||||
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
||||||
|
|
||||||
|
|
||||||
#define UTF8PROC_BOUNDCLASS_START 0
|
|
||||||
#define UTF8PROC_BOUNDCLASS_OTHER 1
|
|
||||||
#define UTF8PROC_BOUNDCLASS_CR 2
|
|
||||||
#define UTF8PROC_BOUNDCLASS_LF 3
|
|
||||||
#define UTF8PROC_BOUNDCLASS_CONTROL 4
|
|
||||||
#define UTF8PROC_BOUNDCLASS_EXTEND 5
|
|
||||||
#define UTF8PROC_BOUNDCLASS_L 6
|
|
||||||
#define UTF8PROC_BOUNDCLASS_V 7
|
|
||||||
#define UTF8PROC_BOUNDCLASS_T 8
|
|
||||||
#define UTF8PROC_BOUNDCLASS_LV 9
|
|
||||||
#define UTF8PROC_BOUNDCLASS_LVT 10
|
|
||||||
|
|
||||||
/* in libmojibake, we append "m" to whatever version of utf8proc
|
/* in libmojibake, we append "m" to whatever version of utf8proc
|
||||||
we have merged with most recently + whatever increment would
|
we have merged with most recently + whatever increment would
|
||||||
correspond to semantic versioning rules. Currently, we use 1.2m
|
correspond to semantic versioning rules. Currently, we use 1.2m
|
||||||
@ -206,6 +193,38 @@ DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* return whether there is a grapheme break between boundclasses lbc and tbc */
|
||||||
|
static bool grapheme_break(int lbc, int tbc) {
|
||||||
|
return
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
||||||
|
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||||
|
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
||||||
|
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
||||||
|
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
||||||
|
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
||||||
|
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
|
||||||
|
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
|
||||||
|
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
||||||
|
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
|
||||||
|
return grapheme_break(utf8proc_get_property(c1)->boundclass,
|
||||||
|
utf8proc_get_property(c2)->boundclass);
|
||||||
|
}
|
||||||
|
|
||||||
#define utf8proc_decompose_lump(replacement_uc) \
|
#define utf8proc_decompose_lump(replacement_uc) \
|
||||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||||
@ -302,48 +321,8 @@ DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufs
|
|||||||
}
|
}
|
||||||
if (options & UTF8PROC_CHARBOUND) {
|
if (options & UTF8PROC_CHARBOUND) {
|
||||||
bool boundary;
|
bool boundary;
|
||||||
int tbc, lbc;
|
int tbc = property->boundclass;
|
||||||
tbc =
|
boundary = grapheme_break(*last_boundclass, tbc);
|
||||||
(uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
|
|
||||||
(uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
|
|
||||||
((category == UTF8PROC_CATEGORY_ZL ||
|
|
||||||
category == UTF8PROC_CATEGORY_ZP ||
|
|
||||||
category == UTF8PROC_CATEGORY_CC ||
|
|
||||||
category == UTF8PROC_CATEGORY_CF) &&
|
|
||||||
!(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
|
|
||||||
property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
|
|
||||||
((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
|
|
||||||
uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
|
|
||||||
(uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_V :
|
|
||||||
(uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_T :
|
|
||||||
(uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
|
|
||||||
((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
|
|
||||||
UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
|
|
||||||
) :
|
|
||||||
UTF8PROC_BOUNDCLASS_OTHER;
|
|
||||||
lbc = *last_boundclass;
|
|
||||||
boundary =
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
|
||||||
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_V ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
|
||||||
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
|
||||||
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
|
||||||
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
|
||||||
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
|
||||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
|
||||||
true;
|
|
||||||
*last_boundclass = tbc;
|
*last_boundclass = tbc;
|
||||||
if (boundary) {
|
if (boundary) {
|
||||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||||
|
|||||||
20856
utf8proc_data.c
20856
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user