added normalization and encoding test for #13
This commit is contained in:
parent
af06f858e1
commit
91a77d7588
3
.gitignore
vendored
3
.gitignore
vendored
@ -13,5 +13,4 @@
|
|||||||
bench/bench
|
bench/bench
|
||||||
bench/icu
|
bench/icu
|
||||||
bench/unistring
|
bench/unistring
|
||||||
|
normtest
|
||||||
|
|
||||||
|
|||||||
13
Makefile
13
Makefile
@ -45,7 +45,6 @@ CompositionExclusions.txt:
|
|||||||
CaseFolding.txt:
|
CaseFolding.txt:
|
||||||
$(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
|
$(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
|
||||||
|
|
||||||
|
|
||||||
utf8proc.o: mojibake.h utf8proc.c utf8proc_data.c
|
utf8proc.o: mojibake.h utf8proc.c utf8proc_data.c
|
||||||
$(cc) -c -o utf8proc.o utf8proc.c
|
$(cc) -c -o utf8proc.o utf8proc.c
|
||||||
|
|
||||||
@ -59,3 +58,15 @@ libmojibake.so: utf8proc.o
|
|||||||
|
|
||||||
libmojibake.dylib: utf8proc.o
|
libmojibake.dylib: utf8proc.o
|
||||||
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@
|
$(cc) -dynamiclib -o $@ $^ -install_name $(libdir)/$@
|
||||||
|
|
||||||
|
|
||||||
|
# Test programs

# Unicode conformance data consumed by normtest; downloaded on demand.
NormalizationTest.txt:
	$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt

# Normalization/encoding test driver, linked against the library object.
normtest: normtest.c utf8proc.o mojibake.h
	$(cc) normtest.c utf8proc.o -o normtest

# "check" names an action, not a file: declare it phony so that a stray file
# called "check" can never make the test run look up to date.
.PHONY: check
check: normtest NormalizationTest.txt
	./normtest
|
||||||
|
|||||||
107
normtest.c
Normal file
107
normtest.c
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
|
||||||
|
#include "mojibake.h"
|
||||||
|
|
||||||
|
/* Number of the input line currently being processed; it prefixes every
   diagnostic printed by check(). */
size_t lineno = 0;

/* Assertion helper for the test driver.

   If cond is non-zero, does nothing.  Otherwise prints "line <lineno>: ",
   the printf-style message described by format and the variadic arguments,
   and a newline to stderr, then terminates the process with exit status 1. */
void check(int cond, const char *format, ...)
{
    va_list args;

    if (cond)
        return;

    /* lineno is a size_t, so the matching conversion is %zu (not %zd) */
    fprintf(stderr, "line %zu: ", lineno);
    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    fprintf(stderr, "\n");
    exit(1);
}
|
||||||
|
|
||||||
|
/* Convert one field of hexadecimal codepoints to UTF-8.

   buf points to zero or more codepoints written as hexadecimal numbers,
   separated by whitespace and terminated by any character that is neither
   whitespace nor in [0-9a-fA-F], or by the end of the string.  The
   corresponding UTF-8 string is stored, NUL-terminated, in dest.

   Returns the number of bytes consumed from buf.  The terminating character
   is counted, except when it is the string's own NUL terminator, so that
   buf + <return value> never points past the end of the string.

   dest is not bounds-checked: the caller must supply a buffer large enough
   for the whole field.  A hex number that is malformed or larger than
   0x10FFFF aborts the program through check(). */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;

    for (;;) {
        unsigned long c;
        char *end;

        /* skip whitespace; the unsigned char cast keeps bytes >= 0x80 from
           reaching <ctype.h> as negative values, which is undefined */
        while (isspace((unsigned char) buf[i]))
            ++i;

        /* find end of hex input */
        for (j = i; isxdigit((unsigned char) buf[j]); ++j)
            ;

        if (j == i) { /* no codepoint found */
            dest[d] = 0; /* NUL-terminate destination string */
            /* step over the field terminator, but never over the end of
               the string: a later call on buf + offset must stay in bounds */
            return buf[i] ? i + 1 : i;
        }

        /* the digits parsed must be exactly the run buf[i..j) found above
           (this rejects e.g. a "0x" prefix), and the value must be
           representable as a Unicode codepoint */
        c = strtoul(buf + i, &end, 16);
        check(end == buf + j && c <= 0x10FFFF,
              "invalid hex input %s", buf + i);

        i = j; /* skip to char after hex input */
        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
    }
}
|
||||||
|
|
||||||
|
/* CHECK_NORM(NRM, norm, src): normalize the UTF-8 string src by calling
   utf8proc_<NRM>() and abort through check() unless the result compares
   equal to the expected string norm.  The returned string is released
   with free().

   A NULL result is reported as a failure instead of being handed to
   strcmp() (assumes utf8proc_<NRM> returns NULL on error -- confirm
   against mojibake.h).

   Wrapped in do { ... } while (0) so that "CHECK_NORM(...);" behaves as a
   single statement in every context; src and norm are evaluated more than
   once, so pass plain variables. */
#define CHECK_NORM(NRM, norm, src) do {                                 \
    char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) (src));        \
    check(src_norm != NULL,                                             \
          "normalization failed for %s -> %s", (src), (norm));          \
    check(!strcmp((norm), src_norm),                                    \
          "normalization failed for %s -> %s", (src), (norm));          \
    free(src_norm);                                                     \
} while (0)
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
char *buf = NULL;
|
||||||
|
size_t bufsize = 0;
|
||||||
|
FILE *f = fopen("NormalizationTest.txt", "r");
|
||||||
|
char source[1024], NFC[1024], NFD[1024], NFKC[1024], NFKD[1024];
|
||||||
|
|
||||||
|
check(f != NULL, "error opening NormalizationTest.txt");
|
||||||
|
while (getline(&buf, &bufsize, f) > 0) {
|
||||||
|
size_t offset;
|
||||||
|
lineno += 1;
|
||||||
|
|
||||||
|
if (buf[0] == '@') {
|
||||||
|
printf("line %zd: %s", lineno, buf + 1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (lineno % 1000 == 0)
|
||||||
|
printf("checking line %zd...\n", lineno);
|
||||||
|
|
||||||
|
if (buf[0] == '#') continue;
|
||||||
|
|
||||||
|
offset = encode(source, buf);
|
||||||
|
offset += encode(NFC, buf + offset);
|
||||||
|
offset += encode(NFD, buf + offset);
|
||||||
|
offset += encode(NFKC, buf + offset);
|
||||||
|
offset += encode(NFKD, buf + offset);
|
||||||
|
|
||||||
|
CHECK_NORM(NFC, NFC, source);
|
||||||
|
CHECK_NORM(NFC, NFC, NFC);
|
||||||
|
CHECK_NORM(NFC, NFC, NFD);
|
||||||
|
CHECK_NORM(NFC, NFKC, NFKC);
|
||||||
|
CHECK_NORM(NFC, NFKC, NFKD);
|
||||||
|
|
||||||
|
CHECK_NORM(NFD, NFD, source);
|
||||||
|
CHECK_NORM(NFD, NFD, NFC);
|
||||||
|
CHECK_NORM(NFD, NFD, NFD);
|
||||||
|
CHECK_NORM(NFD, NFKD, NFKC);
|
||||||
|
CHECK_NORM(NFD, NFKD, NFKD);
|
||||||
|
|
||||||
|
CHECK_NORM(NFKC, NFKC, source);
|
||||||
|
CHECK_NORM(NFKC, NFKC, NFC);
|
||||||
|
CHECK_NORM(NFKC, NFKC, NFD);
|
||||||
|
CHECK_NORM(NFKC, NFKC, NFKC);
|
||||||
|
CHECK_NORM(NFKC, NFKC, NFKD);
|
||||||
|
|
||||||
|
CHECK_NORM(NFKD, NFKD, source);
|
||||||
|
CHECK_NORM(NFKD, NFKD, NFC);
|
||||||
|
CHECK_NORM(NFKD, NFKD, NFD);
|
||||||
|
CHECK_NORM(NFKD, NFKD, NFKC);
|
||||||
|
CHECK_NORM(NFKD, NFKD, NFKD);
|
||||||
|
}
|
||||||
|
fclose(f);
|
||||||
|
printf("Passed tests after %zd lines!\n", lineno);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user