Case folding fixes (#133)

* Fixes allowing for “Full” folding and NFKC_CaseFold compliance.
* Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive.
* Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF.
* Document the changes to UTF8PROC_IGNORE in header.
* Add NFKC_CF helper function with documentation.
* restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test
* success message
* test that IGNORE does not strip NA
* data update
* NFKC_Casefold shouldn't strip NA
2018-05-02 08:15:02 -04:00
parent 48949bd3eb
commit bdc8b9e4b2
6 changed files with 4223 additions and 4183 deletions
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -104,7 +104,7 @@ $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
 $case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
 $case_folding = {}
 $case_folding_string.chomp.split("\n").each do |line|
-  next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
+  next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
  $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
 end

--- a/test/custom.c
+++ b/test/custom.c
@@ -23,5 +23,6 @@ int main(void)
    check(strlen((char*) output) == 6, "incorrect output length");
    check(!memcmp(correct, output, 7), "incorrect output data");
    free(output);
+    printf("map_custom tests SUCCEEDED.\n");
    return 0;
 }
--- a/test/misc.c
+++ b/test/misc.c
@@ -19,9 +19,28 @@ static void issue128(void) /* #128 */
    free(nfd_out); free(nfc_out);
 }

+static void issue102(void) /* #102 */
+{
+    utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
+    utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
+    utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
+    utf8proc_uint8_t *output;
+    utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+        UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
+    check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
+    check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
+    free(output);
+    output = utf8proc_NFKC_Casefold(input);
+    printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
+    check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
+    check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
+}
+
 int main(void)
 {
    issue128();
+    issue102();
    printf("Misc tests SUCCEEDED.\n");
    return 0;
 }
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -423,6 +423,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
  if (options & UTF8PROC_IGNORE) {
    if (property->ignorable) return 0;
  }
+  if (options & UTF8PROC_STRIPNA) {
+    if (!category) return 0;
+  }
  if (options & UTF8PROC_LUMP) {
    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
@@ -752,3 +755,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return retval;
 }
+
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
+  utf8proc_uint8_t *retval;
+  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
+    UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
+  return retval;
+}
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -213,6 +213,10 @@ typedef enum {
   *       @ref UTF8PROC_DECOMPOSE
   */
  UTF8PROC_STRIPMARK = (1<<13),
+  /**
+   * Strip unassigned codepoints.
+   */
+  UTF8PROC_STRIPNA    = (1<<14),
 } utf8proc_option_t;

 /** @name Error codes
@@ -469,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
 * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
 * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
 * - @ref UTF8PROC_STRIPMARK - remove all character marks
+ * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
 * @param last_boundclass
 * Pointer to an integer variable containing
 * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
@@ -680,8 +685,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(

 /** @name Unicode normalization
 *
- * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
- * normalized version of the null-terminated string `str`.  These
+ * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
+ * NFKC_Casefold normalized version of the null-terminated string `str`.  These
 * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
 * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
 */
@@ -694,6 +699,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
 /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
+/**
+ * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
+ * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
+ **/
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
 /** @} */

 #ifdef __cplusplus
--- a/utf8proc_data.c
+++ b/utf8proc_data.c