Case folding fixes (#133)

* Fixes allowing for “Full” folding and NFKC_CaseFold compliance.

* Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive.
* Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF.

* Document the changes to UTF8PROC_IGNORE in header.

* Add NFKC_CF helper function with documentation.

* restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test

* success message

* test that IGNORE does not strip NA

* data update

* NFKC_Casefold shouldn't strip NA
This commit is contained in:
Steven G. Johnson
2018-05-02 08:15:02 -04:00
committed by GitHub
parent 48949bd3eb
commit bdc8b9e4b2
6 changed files with 4223 additions and 4183 deletions

View File

@@ -23,5 +23,6 @@ int main(void)
check(strlen((char*) output) == 6, "incorrect output length");
check(!memcmp(correct, output, 7), "incorrect output data");
free(output);
printf("map_custom tests SUCCEEDED.\n");
return 0;
}

View File

@@ -19,9 +19,28 @@ static void issue128(void) /* #128 */
free(nfd_out); free(nfc_out);
}
/*
 * Regression test for issue #102: NFKC_Casefold handling.
 *
 * Maps the same input twice and compares each result with a fixed
 * expected byte sequence:
 *   1. utf8proc_map() with the casefold option set plus UTF8PROC_STRIPNA,
 *      expected to equal `stripna`;
 *   2. utf8proc_NFKC_Casefold(), expected to equal `correct`, which still
 *      contains U+2065.
 * Both output buffers are released before returning.
 */
static void issue102(void) /* #102 */
{
    utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
    utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
    utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
    utf8proc_uint8_t *output = NULL;

    /* A negative return value signals failure; validate before `output`
       is handed to printf/strlen. */
    check(utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
                       UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA) >= 0,
          "utf8proc_map failed");
    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
    check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
    /* Compare one byte past the text so the NUL terminator is verified too. */
    check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
    free(output);

    output = utf8proc_NFKC_Casefold(input);
    /* The helper reports failure with NULL, which must not reach "%s". */
    check(output != NULL, "utf8proc_NFKC_Casefold failed");
    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
    check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
    check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
    free(output); /* previously leaked: this buffer is owned by the caller */
}
/* Test driver: runs the issue regression cases in order, then prints a
   completion message. */
int main(void)
{
    issue128();
    issue102();

    fputs("Misc tests SUCCEEDED.\n", stdout);
    return 0;
}