grapheme test for UAX#29
This commit is contained in:
parent
1b3992ebe5
commit
539d2cc202
1
.gitignore
vendored
1
.gitignore
vendored
@ -14,3 +14,4 @@ bench/bench
|
|||||||
bench/icu
|
bench/icu
|
||||||
bench/unistring
|
bench/unistring
|
||||||
normtest
|
normtest
|
||||||
|
graphemetest
|
||||||
|
|||||||
14
Makefile
14
Makefile
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
CURL=curl
|
CURL=curl
|
||||||
RUBY=ruby
|
RUBY=ruby
|
||||||
|
PERL=perl
|
||||||
MAKE=make
|
MAKE=make
|
||||||
|
|
||||||
# settings
|
# settings
|
||||||
@ -24,7 +25,7 @@ all: c-library
|
|||||||
c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
|
c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
|
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
|
||||||
$(MAKE) -C bench clean
|
$(MAKE) -C bench clean
|
||||||
|
|
||||||
update: utf8proc_data.c.new
|
update: utf8proc_data.c.new
|
||||||
@ -67,8 +68,15 @@ libmojibake.dylib: utf8proc.o
|
|||||||
NormalizationTest.txt:
|
NormalizationTest.txt:
|
||||||
$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
|
$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
|
||||||
|
|
||||||
normtest: normtest.c utf8proc.o mojibake.h
|
GraphemeBreakTest.txt:
|
||||||
|
$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
|
||||||
|
|
||||||
|
normtest: normtest.c utf8proc.o mojibake.h tests.h
|
||||||
$(cc) normtest.c utf8proc.o -o normtest
|
$(cc) normtest.c utf8proc.o -o normtest
|
||||||
|
|
||||||
check: normtest NormalizationTest.txt
|
graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
|
||||||
|
$(cc) graphemetest.c utf8proc.o -o graphemetest
|
||||||
|
|
||||||
|
check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
|
||||||
./normtest
|
./normtest
|
||||||
|
./graphemetest
|
||||||
|
|||||||
62
graphemetest.c
Normal file
62
graphemetest.c
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
#include "tests.h"
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
char *buf = NULL;
|
||||||
|
size_t bufsize = 0;
|
||||||
|
FILE *f = fopen("GraphemeBreakTest.txt", "r");
|
||||||
|
uint8_t src[1024];
|
||||||
|
|
||||||
|
check(f != NULL, "error opening NormalizationTest.txt");
|
||||||
|
while (getline(&buf, &bufsize, f) > 0) {
|
||||||
|
size_t bi = 0, si = 0;
|
||||||
|
lineno += 1;
|
||||||
|
|
||||||
|
if (lineno % 100 == 0)
|
||||||
|
printf("checking line %zd...\n", lineno);
|
||||||
|
|
||||||
|
if (buf[0] == '#') continue;
|
||||||
|
|
||||||
|
while (buf[bi]) {
|
||||||
|
bi = skipspaces(buf, bi);
|
||||||
|
if (buf[bi] == '/') { /* grapheme break */
|
||||||
|
src[si++] = 0xff;
|
||||||
|
bi++;
|
||||||
|
}
|
||||||
|
else if (buf[bi] == '+') { /* no break */
|
||||||
|
bi++;
|
||||||
|
}
|
||||||
|
else if (buf[bi] == '#') { /* start of comments */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else { /* hex-encoded codepoint */
|
||||||
|
bi += encode((char*) (src + si), buf + bi) - 1;
|
||||||
|
while (src[si]) ++si; /* advance to NUL termination */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (si && src[si-1] == 0xff)
|
||||||
|
--si; /* no 0xff after final grapheme */
|
||||||
|
src[si] = 0; /* NUL-terminate */
|
||||||
|
|
||||||
|
if (si) {
|
||||||
|
uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
|
||||||
|
size_t i = 0, j = 0;
|
||||||
|
ssize_t glen;
|
||||||
|
uint8_t *g; /* utf8proc_map grapheme results */
|
||||||
|
while (i < si) {
|
||||||
|
if (src[i] != 0xff)
|
||||||
|
utf8[j++] = src[i++];
|
||||||
|
else
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
|
||||||
|
check(glen >= 0, "utf8proc_map error = %s",
|
||||||
|
utf8proc_errmsg(glen));
|
||||||
|
check(!strcmp((char*)g, (char*)src),
|
||||||
|
"grapheme mismatch: %s vs. %s", (char*)g, (char*)src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fclose(f);
|
||||||
|
printf("Passed tests after %zd lines!\n", lineno);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
45
normtest.c
45
normtest.c
@ -1,47 +1,4 @@
|
|||||||
#include <stdio.h>
|
#include "tests.h"
|
||||||
#include <stdlib.h>
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdarg.h>
|
|
||||||
|
|
||||||
#include "mojibake.h"
|
|
||||||
|
|
||||||
/* Counter printed as the "line N: " prefix of check() failure messages. */
size_t lineno = 0;

/* Test assertion helper.  When cond is zero, prints "line N: " followed by
   the printf-style message built from format and its arguments, then a
   newline, to stderr, and terminates the program with exit status 1.
   Does nothing when cond is non-zero. */
void check(int cond, const char *format, ...)
{
    va_list args;

    if (cond)
        return;

    fprintf(stderr, "line %zu: ", lineno);  /* %zu: lineno is a size_t */
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fprintf(stderr, "\n");
    exit(1);
}
|
|
||||||
|
|
||||||
/* Converts a run of hexadecimal codepoints to UTF-8.

   buf holds codepoints written as hexadecimal strings separated by
   whitespace; the run ends at the first character that is neither a hex
   digit [0-9a-fA-F] nor whitespace.  The UTF-8 encoding of the codepoints
   is stored, NUL-terminated, in dest.

   Returns the number of bytes read from buf, counting the terminating
   character (so the result is 1 when no codepoint was found). */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;

    for (;;) {
        unsigned int c;     /* %x stores through an unsigned int pointer */

        /* <ctype.h> functions need a value representable as unsigned char;
           a plain char holding a byte >= 0x80 may be negative. */
        while (isspace((unsigned char) buf[i]))
            ++i;            /* skip whitespace */

        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
            ;               /* find end of hex input */

        if (j == i) {       /* no codepoint found */
            dest[d] = 0;    /* NUL-terminate destination string */
            return i + 1;
        }

        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j;              /* skip to char after hex input */
        d += utf8proc_encode_char((int) c, (uint8_t *) (dest + d));
    }
}
|
|
||||||
|
|
||||||
#define CHECK_NORM(NRM, norm, src) { \
|
#define CHECK_NORM(NRM, norm, src) { \
|
||||||
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
|
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
|
||||||
|
|||||||
53
tests.h
Normal file
53
tests.h
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
/* Common functions and includes for our test programs. */
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
|
||||||
|
#include "mojibake.h"
|
||||||
|
|
||||||
|
/* Counter printed as the "line N: " prefix of check() failure messages;
   test programs advance it as they read their input file. */
size_t lineno = 0;

/* Test assertion helper.  When cond is zero, prints "line N: " followed by
   the printf-style message built from format and its arguments, then a
   newline, to stderr, and terminates the program with exit status 1.
   Does nothing when cond is non-zero. */
void check(int cond, const char *format, ...)
{
    va_list args;

    if (cond)
        return;

    fprintf(stderr, "line %zu: ", lineno);  /* %zu: lineno is a size_t */
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fprintf(stderr, "\n");
    exit(1);
}
|
||||||
|
|
||||||
|
/* Returns the index of the first character at or after position i in buf
   that isspace() does not classify as whitespace.  The terminating NUL is
   not whitespace, so the scan stops there at the latest. */
size_t skipspaces(const char *buf, size_t i)
{
    /* <ctype.h> functions need a value representable as unsigned char;
       a plain char holding a byte >= 0x80 may be negative. */
    while (isspace((unsigned char) buf[i]))
        ++i;
    return i;
}
|
||||||
|
|
||||||
|
/* Converts a run of hexadecimal codepoints to UTF-8.

   buf holds codepoints written as hexadecimal strings separated by
   whitespace; the run ends at the first character that is neither a hex
   digit [0-9a-fA-F] nor whitespace.  The UTF-8 encoding of the codepoints
   is stored, NUL-terminated, in dest.

   Returns the number of bytes read from buf, counting the terminating
   character (so the result is 1 when no codepoint was found). */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;

    for (;;) {
        unsigned int c;     /* %x stores through an unsigned int pointer */

        i = skipspaces(buf, i);

        /* <ctype.h> functions need a value representable as unsigned char;
           a plain char holding a byte >= 0x80 may be negative. */
        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
            ;               /* find end of hex input */

        if (j == i) {       /* no codepoint found */
            dest[d] = 0;    /* NUL-terminate destination string */
            return i + 1;
        }

        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j;              /* skip to char after hex input */
        d += utf8proc_encode_char((int) c, (uint8_t *) (dest + d));
    }
}
|
||||||
|
|
||||||
Loading…
Reference in New Issue
Block a user