utf8proc/test/graphemetest.c
Mike Glorioso 610730f231
Fix Sign-Conversion warnings in library and test code (#214)
* JuliaStrings#169 turn on sign-conversion warnings

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>

* JuliaStrings#169 fix sign-conversion warnings for utf8proc.c

fix sign-conversion warnings for utf8proc_iterate
uc requires at most 21 bits to identify a unicode codepoint, so there is no need for it to be unsigned
multiple locations use, modify, or store uc with a signed value
the only exception is line 137 where uc is compared with an unsigned value

fix sign-conversion warnings for utf8proc_tolower, utf8proc_toupper, utf8proc_totitle
all three methods have sign conversion warnings when calling seqindex_decode_index
seqindex_decode_index uses the passed value as an index to an array utf8proc_sequences
as utf8proc_sequences is hard-coded and smaller than 2^31 - 1 we can safely cast to unsigned

fix sign-conversion warnings for utf8proc_decompose_char
lines with this warning use the defined function utf8proc_decompose_lump
in the function, a hardcoded unsigned value (1<<12) is complemented then cast as a signed value
as the intent is to remove the 12th bit flag from options, a signed value, and explicit cast is safe

fix sign-conversion warnings for utf8proc_map_custom
result is declared as signed, but is only expected to contain values between 0 and 4
sizeof returns an unsigned value. result must be cast to unsigned

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>

* JuliaStrings#169 fix sign-conversion warnings for test/*

fix sign-conversion warnings for test/tests.c encode
change type for d to match return value of utf8proc_encode_char

fix sign-conversion warnings for test/graphemetest.c checkline
si, i, and j are unsigned size types, utf8proc_map and utf8proc_iterate accept and return signed size types
utf8proc_map treats negative strlen values as 0. the strlen used by the test must be similarly limited
utf8proc_iterate treats negative strlen values as 4 which will be less than the unsigned size
fix unused-but-set-variable warning by checking the glen value

fix sign-conversion warnings for test/case.c main
the if block ensures that tested codepoint fits in wint_t, but needs to include u and l as well
c, u, and l can be safely cast to wint_t

fix sign-conversion warnings for test/iterate.c
all values used for len are below 8, so an explicit cast is safe
updated types for more portable test code

fix sign-conversion warnings for test/printproperty.c main
change type of c to signed to resolve all sign-conversion warnings.
replace sscanf(... &c) with sscanf(... &x) followed by explicit sign conversion

Signed-off-by: Mike Glorioso <mike.glorioso@gmail.com>
2021-01-14 12:59:49 -05:00

129 lines
4.9 KiB
C

#include "tests.h"
#include <string.h>
/* check one line in the format of GraphemeBreakTest.txt */
void checkline(const char *_buf, bool verbose) {
size_t bi = 0, si = 0;
utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */
const unsigned char *buf = (const unsigned char *) _buf;
while (buf[bi]) {
bi = skipspaces(buf, bi);
if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
src[si++] = '/';
bi += 2;
}
else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
bi += 2;
}
else if (buf[bi] == '#') { /* start of comments */
break;
}
else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
src[si++] = '/';
bi += 1;
}
else { /* hex-encoded codepoint */
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
while (src[si]) ++si; /* advance to NUL termination */
bi += len;
}
}
if (si && src[si-1] == '/')
--si; /* no break after final grapheme */
src[si] = 0; /* NUL-terminate */
if (si) { /* test utf8proc_map */
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0;
utf8proc_ssize_t glen, k;
utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
while (i < si) {
if (src[i] != '/')
utf8[j++] = src[i++];
else
i++;
}
glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND);
if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
/* the test file contains surrogate codepoints, which are only for UTF-16 */
printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
}
else {
check(glen >= 0, "utf8proc_map error = %s",
utf8proc_errmsg(glen));
for (k = 0; k <= glen; ++k)
if (g[k] == 0xff)
g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
check(!strcmp((char*)g, (char*)src),
"grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
}
free(g);
}
if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
utf8proc_int32_t state = 0, prev_codepoint = 0;
size_t i = 0;
utf8proc_bool expectbreak = false;
do {
utf8proc_int32_t codepoint;
i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint);
check(codepoint >= 0, "invalid UTF-8 data");
if (codepoint == 0x002F)
expectbreak = true;
else {
if (prev_codepoint != 0) {
check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
"grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
}
expectbreak = false;
prev_codepoint = codepoint;
}
} while (i < si);
}
if (verbose)
printf("passed grapheme test: \"%s\"\n", (char*) src);
}
int main(int argc, char **argv)
{
unsigned char buf[8192];
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
check(f != NULL, "error opening GraphemeBreakTest.txt");
while (simple_getline(buf, f) > 0) {
if ((++lineno) % 100 == 0)
printf("checking line %zd...\n", lineno);
if (buf[0] == '#') continue;
checkline((char *) buf, false);
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);
printf("Performing regression tests...\n");
/* issue 144 */
{
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
utf8proc_ssize_t glen;
utf8proc_uint8_t *g;
glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks");
free(g);
};
/* https://github.com/JuliaLang/julia/issues/37680 */
checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
printf("Passed regression tests!\n");
return 0;
}