Merge pull request #20 from JuliaLang/graphemes

Update graphemes for Unicode 7
This commit is contained in:
Steven G. Johnson 2014-12-14 08:47:06 -05:00
commit 4f70bbe780
10 changed files with 10809 additions and 10430 deletions

3
.gitignore vendored
View File

@ -14,3 +14,6 @@ bench/bench
bench/icu
bench/unistring
normtest
graphemetest
utf8proc_data.c.new
printproperty

View File

@ -2,6 +2,7 @@
CURL=curl
RUBY=ruby
PERL=perl
MAKE=make
# settings
@ -24,20 +25,23 @@ all: c-library
c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
clean:
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
$(MAKE) -C bench clean
update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c
# real targets
utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new
UnicodeData.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
GraphemeBreakProperty.txt:
$(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
DerivedCoreProperties.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
@ -67,8 +71,18 @@ libmojibake.dylib: utf8proc.o
NormalizationTest.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
normtest: normtest.c utf8proc.o mojibake.h
$(cc) normtest.c utf8proc.o -o normtest
# Download the grapheme break test data and rewrite its non-ASCII markers
# into the ASCII ones that graphemetest.c parses: the division sign becomes
# '/' (break) and the multiplication sign becomes '+' (no break).
#
# The download and the rewrite are separate steps on purpose: in a
# `curl | perl > $@` pipeline the recipe's exit status is perl's, so a failed
# download would leave an empty or truncated target that looks up to date.
# `-f` makes curl fail on HTTP errors instead of saving the error page, and
# the final `mv` only installs the target once the rewrite has succeeded.
GraphemeBreakTest.txt:
	$(CURL) -f -o $@.raw http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
	$(PERL) -pe 's,÷,/,g;s,×,+,g' $@.raw > $@.tmp
	mv -f $@.tmp $@
	rm -f $@.raw
check: normtest NormalizationTest.txt
normtest: normtest.c utf8proc.o mojibake.h tests.h
$(cc) normtest.c utf8proc.o -o $@
graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
$(cc) graphemetest.c utf8proc.o -o $@
printproperty: printproperty.c utf8proc.o mojibake.h tests.h
$(cc) printproperty.c utf8proc.o -o $@
check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
./normtest
./graphemetest

View File

@ -75,13 +75,13 @@ $ignorable_list.each_line do |entry|
end
end
$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
$grapheme_extend = []
$grapheme_extend_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
$1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
elsif entry =~ /^[0-9A-F]+/
$grapheme_extend << $&.hex
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
end
end
@ -161,18 +161,18 @@ class UnicodeChar
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
"#{ary2c decomp_mapping}, " <<
"#{bidi_mirrored}, " <<
"#{ary2c case_folding}, " <<
"#{uppercase_mapping or -1}, " <<
"#{lowercase_mapping or -1}, " <<
"#{titlecase_mapping or -1}, " <<
"#{comb1_indicies[code] ?
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
}, #{comb2_indicies[code] or -1}, " <<
"#{bidi_mirrored}, " <<
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_extend.include?(code)}, " <<
"#{ary2c case_folding}},\n"
"#{$grapheme_boundclass[code]}},\n"
end
end
@ -295,7 +295,7 @@ end
$stdout << "};\n\n"
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}

73
graphemetest.c Normal file
View File

@ -0,0 +1,73 @@
#include "tests.h"
/* Grapheme-break test driver.
 *
 * Reads GraphemeBreakTest.txt (preprocessed so that '/' marks a grapheme
 * break and '+' marks "no break"), and for every data line:
 *   1. builds `src`, the expected UTF-8 string with '/' between graphemes;
 *   2. strips the '/' markers and runs the result through utf8proc_map()
 *      with UTF8PROC_CHARBOUND;
 *   3. rewrites the 0xff bytes in the result as '/' and compares the result
 *      with `src`.
 * Any mismatch or malformed input terminates the program through check().
 * Returns 0 when every line passes.
 */
int main(void)
{
    char *buf = NULL;       /* line buffer, allocated and grown by getline() */
    size_t bufsize = 0;
    FILE *f = fopen("GraphemeBreakTest.txt", "r");
    uint8_t src[1024];      /* expected output: UTF-8 with '/' separators */

    check(f != NULL, "error opening GraphemeBreakTest.txt");
    while (getline(&buf, &bufsize, f) > 0) {
        size_t bi = 0, si = 0;
        lineno += 1;

        if (lineno % 100 == 0)
            printf("checking line %zu...\n", lineno);

        if (buf[0] == '#') continue;

        /* Only the part before the trailing comment is parsed.  Every '/'
           marker and every hex digit produces at most one byte of output,
           so this bound keeps all writes (plus the NUL) inside src[]. */
        check(strcspn(buf, "#") < sizeof(src),
              "test data too long for %zu-byte buffer", sizeof(src));

        while (buf[bi]) {
            bi = skipspaces(buf, bi);
            if (!buf[bi]) break;            /* only trailing whitespace left */
            if (buf[bi] == '/') {           /* grapheme break */
                src[si++] = '/';
                bi++;
            }
            else if (buf[bi] == '+') {      /* no break */
                bi++;
            }
            else if (buf[bi] == '#') {      /* start of comments */
                break;
            }
            else {                          /* hex-encoded codepoint */
                size_t used = encode((char*) (src + si), buf + bi) - 1;
                /* encode() consumes nothing when the next character is not
                   a hex digit; without this check the loop would spin
                   forever on unexpected input (e.g. an unconverted file) */
                check(used > 0, "unexpected character in test data: %s",
                      buf + bi);
                bi += used;
                while (src[si]) ++si;       /* advance to NUL termination */
            }
        }
        if (si && src[si-1] == '/')
            --si;                           /* no break after final grapheme */
        src[si] = 0;                        /* NUL-terminate */

        if (si) {
            uint8_t utf8[1024];     /* copy of src without '/' separators */
            size_t i = 0, j = 0;
            ssize_t glen;
            uint8_t *g = NULL;      /* utf8proc_map grapheme results;
                                       NULL so free() below is always safe */
            while (i < si) {
                if (src[i] != '/')
                    utf8[j++] = src[i++];
                else
                    i++;
            }
            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
                /* the test file contains surrogate codepoints, which are only for UTF-16 */
                printf("line %zu: ignoring invalid UTF-8 codepoints\n", lineno);
            }
            else {
                check(glen >= 0, "utf8proc_map error = %s",
                      utf8proc_errmsg(glen));
                /* glen >= 0 was just checked, so the cast is safe; the loop
                   deliberately includes index glen (the terminator) */
                for (i = 0; i <= (size_t) glen; ++i)
                    if (g[i] == 0xff)
                        g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
                printf("line %zu\n", lineno);
                check(!strcmp((char*)g, (char*)src),
                      "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
            }
            free(g);
        }
    }
    free(buf);              /* getline() buffer is owned by the caller */
    fclose(f);
    printf("Passed tests after %zu lines!\n", lineno);
    return 0;
}

View File

@ -170,17 +170,17 @@ typedef struct utf8proc_property_struct {
utf8proc_propval_t bidi_class;
utf8proc_propval_t decomp_type;
const int32_t *decomp_mapping;
unsigned bidi_mirrored:1;
const int32_t *casefold_mapping;
int32_t uppercase_mapping;
int32_t lowercase_mapping;
int32_t titlecase_mapping;
int32_t comb1st_index;
int32_t comb2nd_index;
unsigned bidi_mirrored:1;
unsigned comp_exclusion:1;
unsigned ignorable:1;
unsigned control_boundary:1;
unsigned extend:1;
const int32_t *casefold_mapping;
unsigned boundclass:4;
} utf8proc_property_t;
#define UTF8PROC_CATEGORY_CN 0
@ -253,6 +253,21 @@ typedef struct utf8proc_property_struct {
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
/* values for boundclass property: */
#define UTF8PROC_BOUNDCLASS_START 0
#define UTF8PROC_BOUNDCLASS_OTHER 1
#define UTF8PROC_BOUNDCLASS_CR 2
#define UTF8PROC_BOUNDCLASS_LF 3
#define UTF8PROC_BOUNDCLASS_CONTROL 4
#define UTF8PROC_BOUNDCLASS_EXTEND 5
#define UTF8PROC_BOUNDCLASS_L 6
#define UTF8PROC_BOUNDCLASS_V 7
#define UTF8PROC_BOUNDCLASS_T 8
#define UTF8PROC_BOUNDCLASS_LV 9
#define UTF8PROC_BOUNDCLASS_LVT 10
#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12
DLLEXPORT extern const int8_t utf8proc_utf8class[256];
DLLEXPORT const char *utf8proc_version(void);
@ -367,6 +382,12 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options
* crash!
*/
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
/*
* Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*/
DLLEXPORT ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);

View File

@ -1,47 +1,4 @@
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include "mojibake.h"
/* Number of the input line currently being processed; used as a prefix in
   the error messages printed by check(). */
size_t lineno = 0;

/* Assert-like helper for the test programs.
 *
 * If `cond` is zero, prints "line <lineno>: " followed by the printf-style
 * message built from `format` and the variadic arguments to stderr, then
 * terminates the program with exit status 1.  If `cond` is non-zero it does
 * nothing.
 */
void check(int cond, const char *format, ...)
{
    va_list args;

    if (cond)
        return;

    /* lineno is a size_t, so the matching conversion is %zu (not %zd) */
    fprintf(stderr, "line %zu: ", lineno);
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fprintf(stderr, "\n");
    exit(1);
}
/* Convert a run of hexadecimal codepoints into a UTF-8 string.
 *
 * `buf` holds codepoints written as hexadecimal numbers separated by
 * whitespace; the run ends at the first character that is neither a hex
 * digit nor whitespace (or at the end of the string).  Each codepoint is
 * encoded with utf8proc_encode_char() and appended to `dest`, which is
 * NUL-terminated on return.
 *
 * Returns one more than the index in `buf` of the character that ended the
 * run.
 */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;

    for (;;) {
        unsigned int c;     /* %x stores into an unsigned int */

        /* <ctype.h> functions require an unsigned char value (or EOF);
           a plain char may be negative for bytes >= 0x80 */
        while (isspace((unsigned char) buf[i])) ++i;    /* skip whitespace */

        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
            ;                                           /* find end of hex input */

        if (j == i) {       /* no codepoint found */
            dest[d] = 0;    /* NUL-terminate destination string */
            return i + 1;
        }

        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j;              /* skip to char after hex input */
        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
    }
}
#include "tests.h"
#define CHECK_NORM(NRM, norm, src) { \
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \

45
printproperty.c Normal file
View File

@ -0,0 +1,45 @@
/* simple test program to print out the utf8proc properties for a codepoint */
#include "tests.h"
/* Print the utf8proc property record of every codepoint named on the
 * command line.
 *
 * Each argument is parsed as a hexadecimal codepoint; an argument that does
 * not parse terminates the program through check().  Returns 0 on success.
 */
int main(int argc, char **argv)
{
    int i;

    for (i = 1; i < argc; ++i) {
        unsigned int c;     /* %x stores into an unsigned int */
        const utf8proc_property_t *p;

        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
        p = utf8proc_get_property((int32_t) c);

        /* The case mappings are int32_t values, so they are converted to
           unsigned for the %x conversions; the printed text is the same
           as before. */
        printf("U+%s:\n"
               "  category = %d\n"
               "  combining_class = %d\n"
               "  bidi_class = %d\n"
               "  decomp_type = %d\n"
               "  uppercase_mapping = %x\n"
               "  lowercase_mapping = %x\n"
               "  titlecase_mapping = %x\n"
               "  comb1st_index = %d\n"
               "  comb2nd_index = %d\n"
               "  bidi_mirrored = %d\n"
               "  comp_exclusion = %d\n"
               "  ignorable = %d\n"
               "  control_boundary = %d\n"
               "  boundclass = %d\n",
               argv[i],
               p->category,
               p->combining_class,
               p->bidi_class,
               p->decomp_type,
               (unsigned int) p->uppercase_mapping,
               (unsigned int) p->lowercase_mapping,
               (unsigned int) p->titlecase_mapping,
               p->comb1st_index,
               p->comb2nd_index,
               p->bidi_mirrored,
               p->comp_exclusion,
               p->ignorable,
               p->control_boundary,
               p->boundclass);
    }
    return 0;
}

53
tests.h Normal file
View File

@ -0,0 +1,53 @@
/* Common functions and includes for our test programs. */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include "mojibake.h"
/* Number of the input line currently being processed by a test program;
   check() prefixes its error messages with it. */
size_t lineno = 0;

/* Assert-like helper shared by the test programs.
 *
 * When `cond` is non-zero this is a no-op.  Otherwise it writes
 * "line <lineno>: ", the printf-style message described by `format` and the
 * variadic arguments, and a newline to stderr, then exits with status 1.
 */
void check(int cond, const char *format, ...)
{
    va_list args;

    if (cond)
        return;

    /* lineno is a size_t, so the matching conversion is %zu (not %zd) */
    fprintf(stderr, "line %zu: ", lineno);
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fprintf(stderr, "\n");
    exit(1);
}
/* Return the index of the first non-whitespace character of `buf` at or
 * after index `i`.  If buf[i] is already not whitespace, `i` itself is
 * returned; the terminating NUL is never skipped.
 */
size_t skipspaces(const char *buf, size_t i)
{
    /* isspace() is only defined for unsigned char values and EOF; a plain
       char can be negative for bytes >= 0x80 (e.g. UTF-8 data), so the
       value is converted first */
    while (isspace((unsigned char) buf[i]))
        ++i;
    return i;
}
/* Convert a run of hexadecimal codepoints into a UTF-8 string.
 *
 * `buf` holds codepoints written as hexadecimal numbers separated by
 * whitespace; the run ends at the first character that is neither a hex
 * digit nor whitespace (or at the end of the string).  Each codepoint is
 * encoded with utf8proc_encode_char() and appended to `dest`, which is
 * NUL-terminated on return.
 *
 * Returns one more than the index in `buf` of the character that ended the
 * run.
 */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;

    for (;;) {
        unsigned int c;     /* %x stores into an unsigned int */

        i = skipspaces(buf, i);

        /* isxdigit() needs an unsigned char value (a plain char may be
           negative); it is false for NUL, so the scan stops at the end
           of the string too */
        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
            ;                                   /* find end of hex input */

        if (j == i) {       /* no codepoint found */
            dest[d] = 0;    /* NUL-terminate destination string */
            return i + 1;
        }

        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j;              /* skip to char after hex input */
        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
    }
}

View File

@ -81,19 +81,6 @@ DLLEXPORT const int8_t utf8proc_utf8class[256] = {
#define UTF8PROC_HANGUL_S_START 0xAC00
#define UTF8PROC_HANGUL_S_END 0xD7A4
#define UTF8PROC_BOUNDCLASS_START 0
#define UTF8PROC_BOUNDCLASS_OTHER 1
#define UTF8PROC_BOUNDCLASS_CR 2
#define UTF8PROC_BOUNDCLASS_LF 3
#define UTF8PROC_BOUNDCLASS_CONTROL 4
#define UTF8PROC_BOUNDCLASS_EXTEND 5
#define UTF8PROC_BOUNDCLASS_L 6
#define UTF8PROC_BOUNDCLASS_V 7
#define UTF8PROC_BOUNDCLASS_T 8
#define UTF8PROC_BOUNDCLASS_LV 9
#define UTF8PROC_BOUNDCLASS_LVT 10
/* in libmojibake, we append "m" to whatever version of utf8proc
we have merged with most recently + whatever increment would
correspond to semantic versioning rules. Currently, we use 1.2m
@ -206,6 +193,38 @@ DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
);
}
/* Decide whether a grapheme break occurs between two adjacent characters,
 * given only their boundclass values.
 *
 *   lbc - boundclass of the preceding character, or
 *         UTF8PROC_BOUNDCLASS_START when there is none
 *   tbc - boundclass of the following character
 *
 * Returns true when there is a break between them, false otherwise.  The
 * tests are applied in order and the first one that matches decides.
 */
static bool grapheme_break(int lbc, int tbc) {
  /* nothing precedes: always a break */
  if (lbc == UTF8PROC_BOUNDCLASS_START)
    return true;

  /* CR LF stays together */
  if (lbc == UTF8PROC_BOUNDCLASS_CR && tbc == UTF8PROC_BOUNDCLASS_LF)
    return false;

  /* break after and before CR, LF and CONTROL; the range tests cover
     exactly these three consecutively numbered classes */
  if (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL)
    return true;
  if (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL)
    return true;

  /* never break before an EXTEND character */
  if (tbc == UTF8PROC_BOUNDCLASS_EXTEND)
    return false;

  /* pairs that stay together depending on the left-hand class */
  switch (lbc) {
  case UTF8PROC_BOUNDCLASS_L:
    if (tbc == UTF8PROC_BOUNDCLASS_L ||
        tbc == UTF8PROC_BOUNDCLASS_V ||
        tbc == UTF8PROC_BOUNDCLASS_LV ||
        tbc == UTF8PROC_BOUNDCLASS_LVT)
      return false;
    break;
  case UTF8PROC_BOUNDCLASS_LV:
  case UTF8PROC_BOUNDCLASS_V:
    if (tbc == UTF8PROC_BOUNDCLASS_V ||
        tbc == UTF8PROC_BOUNDCLASS_T)
      return false;
    break;
  case UTF8PROC_BOUNDCLASS_LVT:
  case UTF8PROC_BOUNDCLASS_T:
    if (tbc == UTF8PROC_BOUNDCLASS_T)
      return false;
    break;
  case UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR:
    if (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
      return false;
    break;
  default:
    break;
  }

  /* otherwise break everywhere except before a SPACINGMARK */
  return tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK;
}
/* return whether there is a grapheme break between codepoints c1 and c2 */
DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
return grapheme_break(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass);
}
#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
@ -302,48 +321,8 @@ DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufs
}
if (options & UTF8PROC_CHARBOUND) {
bool boundary;
int tbc, lbc;
tbc =
(uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
(uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
((category == UTF8PROC_CATEGORY_ZL ||
category == UTF8PROC_CATEGORY_ZP ||
category == UTF8PROC_CATEGORY_CC ||
category == UTF8PROC_CATEGORY_CF) &&
!(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
(uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
UTF8PROC_BOUNDCLASS_V :
(uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
UTF8PROC_BOUNDCLASS_T :
(uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
) :
UTF8PROC_BOUNDCLASS_OTHER;
lbc = *last_boundclass;
boundary =
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
(lbc == UTF8PROC_BOUNDCLASS_CR &&
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
(lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(lbc == UTF8PROC_BOUNDCLASS_L &&
(tbc == UTF8PROC_BOUNDCLASS_L ||
tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_LV ||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LV ||
lbc == UTF8PROC_BOUNDCLASS_V) &&
(tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
lbc == UTF8PROC_BOUNDCLASS_T) &&
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
true;
int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc);
*last_boundclass = tbc;
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;

File diff suppressed because it is too large Load Diff