Case folding fixes (#133)

* Fixes allowing for “Full” folding and NFKC_CaseFold compliance.

* Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive.
* Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF.

* Document the changes to UTF8PROC_IGNORE in header.

* Add NFKC_CF helper function with documentation.

* restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test

* success message

* test that IGNORE does not strip NA

* data update

* NFKC_Casefold shouldn't strip NA
This commit is contained in:
Steven G. Johnson 2018-05-02 08:15:02 -04:00 committed by GitHub
parent 48949bd3eb
commit bdc8b9e4b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 4223 additions and 4183 deletions

View File

@ -104,7 +104,7 @@ $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read $case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {} $case_folding = {}
$case_folding_string.chomp.split("\n").each do |line| $case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
$case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex } $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
end end

View File

@ -23,5 +23,6 @@ int main(void)
check(strlen((char*) output) == 6, "incorrect output length"); check(strlen((char*) output) == 6, "incorrect output length");
check(!memcmp(correct, output, 7), "incorrect output data"); check(!memcmp(correct, output, 7), "incorrect output data");
free(output); free(output);
printf("map_custom tests SUCCEEDED.\n");
return 0; return 0;
} }

View File

@ -19,9 +19,28 @@ static void issue128(void) /* #128 */
free(nfd_out); free(nfc_out); free(nfd_out); free(nfc_out);
} }
static void issue102(void) /* #128 */
{
utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
utf8proc_uint8_t *output;
utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
free(output);
output = utf8proc_NFKC_Casefold(input);
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
}
int main(void) int main(void)
{ {
issue128(); issue128();
issue102();
printf("Misc tests SUCCEEDED.\n"); printf("Misc tests SUCCEEDED.\n");
return 0; return 0;
} }

View File

@ -423,6 +423,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_IGNORE) { if (options & UTF8PROC_IGNORE) {
if (property->ignorable) return 0; if (property->ignorable) return 0;
} }
if (options & UTF8PROC_STRIPNA) {
if (!category) return 0;
}
if (options & UTF8PROC_LUMP) { if (options & UTF8PROC_LUMP) {
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
@ -752,3 +755,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
UTF8PROC_COMPOSE | UTF8PROC_COMPAT); UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval; return retval;
} }
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
utf8proc_uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
return retval;
}

View File

@ -213,6 +213,10 @@ typedef enum {
* @ref UTF8PROC_DECOMPOSE * @ref UTF8PROC_DECOMPOSE
*/ */
UTF8PROC_STRIPMARK = (1<<13), UTF8PROC_STRIPMARK = (1<<13),
/**
* Strip unassigned codepoints.
*/
UTF8PROC_STRIPNA = (1<<14),
} utf8proc_option_t; } utf8proc_option_t;
/** @name Error codes /** @name Error codes
@ -469,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
* - @ref UTF8PROC_LUMP - lump certain different codepoints together * - @ref UTF8PROC_LUMP - lump certain different codepoints together
* - @ref UTF8PROC_STRIPMARK - remove all character marks * - @ref UTF8PROC_STRIPMARK - remove all character marks
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
* @param last_boundclass * @param last_boundclass
* Pointer to an integer variable containing * Pointer to an integer variable containing
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
@ -680,8 +685,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
/** @name Unicode normalization /** @name Unicode normalization
* *
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
* normalized version of the null-terminated string `str`. These * NFKC_Casefold normalized version of the null-terminated string `str`. These
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization. * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
*/ */
@ -694,6 +699,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
/**
* NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
* and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
**/
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
/** @} */ /** @} */
#ifdef __cplusplus #ifdef __cplusplus

File diff suppressed because it is too large Load Diff