Fix grapheme breaks on string-initial (#205)
* Fix extended emoji + zwj combo * Patch initial repeated regional flags and extended+zwj emoji * Merge conditions for setting breaks bt region * updated fix * perform tests for both utf8proc_map and manual calls to utf8proc_grapheme_break_stateful * consolidate tests Co-authored-by: Thomas Marks <marksta@umich.edu>
This commit is contained in:
parent
6f7d73071a
commit
0643a64479
@ -1,20 +1,10 @@
|
|||||||
#include "tests.h"
|
#include "tests.h"
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
/* check one line in the format of GraphemeBreakTest.txt */
|
||||||
{
|
void checkline(const char *_buf, bool verbose) {
|
||||||
unsigned char buf[8192];
|
|
||||||
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
|
|
||||||
utf8proc_uint8_t src[1024];
|
|
||||||
|
|
||||||
check(f != NULL, "error opening GraphemeBreakTest.txt");
|
|
||||||
while (simple_getline(buf, f) > 0) {
|
|
||||||
size_t bi = 0, si = 0;
|
size_t bi = 0, si = 0;
|
||||||
lineno += 1;
|
utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */
|
||||||
|
const unsigned char *buf = (const unsigned char *) _buf;
|
||||||
if (lineno % 100 == 0)
|
|
||||||
printf("checking line %zd...\n", lineno);
|
|
||||||
|
|
||||||
if (buf[0] == '#') continue;
|
|
||||||
|
|
||||||
while (buf[bi]) {
|
while (buf[bi]) {
|
||||||
bi = skipspaces(buf, bi);
|
bi = skipspaces(buf, bi);
|
||||||
@ -28,6 +18,10 @@ int main(int argc, char **argv)
|
|||||||
else if (buf[bi] == '#') { /* start of comments */
|
else if (buf[bi] == '#') { /* start of comments */
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
|
||||||
|
src[si++] = '/';
|
||||||
|
bi += 1;
|
||||||
|
}
|
||||||
else { /* hex-encoded codepoint */
|
else { /* hex-encoded codepoint */
|
||||||
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
|
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
|
||||||
while (src[si]) ++si; /* advance to NUL termination */
|
while (src[si]) ++si; /* advance to NUL termination */
|
||||||
@ -38,7 +32,7 @@ int main(int argc, char **argv)
|
|||||||
--si; /* no break after final grapheme */
|
--si; /* no break after final grapheme */
|
||||||
src[si] = 0; /* NUL-terminate */
|
src[si] = 0; /* NUL-terminate */
|
||||||
|
|
||||||
if (si) {
|
if (si) { /* test utf8proc_map */
|
||||||
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
|
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
|
||||||
size_t i = 0, j = 0;
|
size_t i = 0, j = 0;
|
||||||
utf8proc_ssize_t glen, k;
|
utf8proc_ssize_t glen, k;
|
||||||
@ -65,10 +59,49 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
free(g);
|
free(g);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
|
||||||
|
utf8proc_int32_t state = 0, prev_codepoint = 0;
|
||||||
|
size_t i = 0;
|
||||||
|
utf8proc_bool expectbreak = false;
|
||||||
|
do {
|
||||||
|
utf8proc_int32_t codepoint;
|
||||||
|
i += utf8proc_iterate(src + i, si - i, &codepoint);
|
||||||
|
check(codepoint >= 0, "invalid UTF-8 data");
|
||||||
|
if (codepoint == 0x002F)
|
||||||
|
expectbreak = true;
|
||||||
|
else {
|
||||||
|
if (prev_codepoint != 0) {
|
||||||
|
check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
|
||||||
|
"grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
|
||||||
|
}
|
||||||
|
expectbreak = false;
|
||||||
|
prev_codepoint = codepoint;
|
||||||
|
}
|
||||||
|
} while (i < si);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
printf("passed grapheme test: \"%s\"\n", (char*) src);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
unsigned char buf[8192];
|
||||||
|
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
|
||||||
|
|
||||||
|
check(f != NULL, "error opening GraphemeBreakTest.txt");
|
||||||
|
while (simple_getline(buf, f) > 0) {
|
||||||
|
if ((++lineno) % 100 == 0)
|
||||||
|
printf("checking line %zd...\n", lineno);
|
||||||
|
if (buf[0] == '#') continue;
|
||||||
|
checkline((char *) buf, false);
|
||||||
}
|
}
|
||||||
fclose(f);
|
fclose(f);
|
||||||
printf("Passed tests after %zd lines!\n", lineno);
|
printf("Passed tests after %zd lines!\n", lineno);
|
||||||
|
|
||||||
|
printf("Performing regression tests...\n");
|
||||||
|
|
||||||
/* issue 144 */
|
/* issue 144 */
|
||||||
{
|
{
|
||||||
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
|
utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
|
||||||
@ -80,5 +113,12 @@ int main(int argc, char **argv)
|
|||||||
free(g);
|
free(g);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* https://github.com/JuliaLang/julia/issues/37680 */
|
||||||
|
checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
|
||||||
|
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
|
||||||
|
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
|
||||||
|
|
||||||
|
printf("Passed regression tests!\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -290,8 +290,11 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
|||||||
|
|
||||||
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
|
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
|
||||||
{
|
{
|
||||||
int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
|
int lbc_override;
|
||||||
? *state : lbc);
|
if (*state == UTF8PROC_BOUNDCLASS_START)
|
||||||
|
*state = lbc_override = lbc;
|
||||||
|
else
|
||||||
|
lbc_override = *state;
|
||||||
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
|
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
|
||||||
if (state) {
|
if (state) {
|
||||||
// Special support for GB 12/13 made possible by GB999. After two RI
|
// Special support for GB 12/13 made possible by GB999. After two RI
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user