uppercase mapping ß (U+00df) to ẞ (U+1E9E) (#134)

* uppercase(0x00df) = 0x1e9e
* tests for titlecase and u+00df uppercase
* NEWS, another test
2018-05-02 14:18:26 -04:00
parent 8639450134
commit d81308faba
5 changed files with 1312 additions and 1276 deletions
--- a/NEWS.md
+++ b/NEWS.md
@@ -14,6 +14,9 @@
 - `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]).
 - `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
  case-folding still yields the standard "ss" mapping.
 ## Version 2.1.1 ##
 2018-04-27
@@ -332,3 +335,4 @@ Release of version 1.0.1
 [#128]: https://github.com/JuliaLang/utf8proc/issues/128
 [#132]: https://github.com/JuliaLang/utf8proc/issues/132
 [#133]: https://github.com/JuliaLang/utf8proc/issues/133
 [#134]: https://github.com/JuliaLang/utf8proc/issues/134
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -137,13 +137,13 @@ def cpary2utf16encoded(array)
 end
 def cpary2c(array)
  return "UINT16_MAX" if array.nil? || array.length == 0
-  lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ... 
+  lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
  array = cpary2utf16encoded(array)
  if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
-    array = [lencode] + array 
+    array = [lencode] + array
    lencode = 7
-  end  
+  end
-  idx = pushary(array) 
+  idx = pushary(array)
  raise "Array index out of bound" if idx > 0x1FFF
  return "#{idx | (lencode << 13)}"
 end
@@ -188,9 +188,10 @@ class UnicodeChar
    @decomp_mapping    = ($8=='') ? nil :
                         $8.split.collect { |element| element.hex }
    @bidi_mirrored     = ($13=='Y') ? true : false
-    @uppercase_mapping = ($16=='') ? nil : $16.hex
+    # issue #130: use nonstandard uppercase ß -> ẞ
    @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
    @lowercase_mapping = ($17=='') ? nil : $17.hex
-    @titlecase_mapping = ($18=='') ? nil : $18.hex
+    @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
  end
  def case_folding
    $case_folding[code]
@@ -260,17 +261,17 @@ chars.each do |char|
    end
    unless comb2nd_indicies[dm1]
      comb2nd_indicies_sorted_keys << dm1
-      comb2nd_indicies[dm1] = comb2nd_indicies.keys.length 
+      comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
    end
    comb_array[comb1st_indicies[dm0]] ||= []
    raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
    comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
-    
+
    comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
  end
  char.c_decomp_mapping = cpary2c(char.decomp_mapping)
  char.c_case_folding = cpary2c(char.case_folding)
-end 
+end
 comb_indicies = {}
 cumoffset = 0
@@ -281,7 +282,7 @@ comb1st_indicies.each do |dm0, index|
  last = nil
  offset = 0
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
-    if comb_array[index][b] 
+    if comb_array[index][b]
      first = offset unless first
      last = offset
      last += 1 if comb2nd_indicies_nonbasic[dm1]
@@ -391,7 +392,7 @@ comb1st_indicies.keys.each_index do |a|
  offset = 0
  $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
-    break if offset > comb1st_indicies_lastoffsets[a] 
+    break if offset > comb1st_indicies_lastoffsets[a]
    if offset >= comb1st_indicies_firstoffsets[a]
      i += 1
      if i == 8
@@ -403,9 +404,8 @@ comb1st_indicies.keys.each_index do |a|
      $stdout << (v & 0xFFFF) << ", "
    end
    offset += 1
-    offset += 1 if comb2nd_indicies_nonbasic[dm1]    
+    offset += 1 if comb2nd_indicies_nonbasic[dm1]
  end
  $stdout  << "\n"
 end
 $stdout << "};\n\n"
--- a/test/case.c
+++ b/test/case.c
@@ -13,13 +13,20 @@ int main(int argc, char **argv)
     for (c = 0; c <= 0x110000; ++c) {
          utf8proc_int32_t l = utf8proc_tolower(c);
          utf8proc_int32_t u = utf8proc_toupper(c);
          utf8proc_int32_t t = utf8proc_totitle(c);
          check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
          check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
          check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
          if (utf8proc_codepoint_valid(c) && (l == u) != (l == t)) {
               fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
               ++error;
          }
          if (sizeof(wint_t) > 2 || c < (1<<16)) {
               wint_t l0 = towlower(c), u0 = towupper(c);
-               
+
               /* OS unicode tables may be out of date.  But if they
                  do have a lower/uppercase mapping, hopefully it
                  is correct? */
@@ -44,6 +51,20 @@ int main(int argc, char **argv)
          }
     }
     check(!error, "utf8proc case conversion FAILED %d tests.", error);
     /* issue #130 */
     check(utf8proc_toupper(0x00df) == 0x1e9e &&
           utf8proc_totitle(0x00df) == 0x1e9e &&
           utf8proc_tolower(0x00df) == 0x00df &&
           utf8proc_tolower(0x1e9e) == 0x00df &&
           utf8proc_toupper(0x1e9e) == 0x1e9e,
           "incorrect 0x00df/0x1e9e case conversions");
     utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
     utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
     check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
           !strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
           "incorrect 0x00df/0x1e9e casefold normalization");
     printf("More up-to-date than OS unicode tables for %d tests.\n", better);
     printf("utf8proc case conversion tests SUCCEEDED.\n");
     return 0;
--- a/test/printproperty.c
+++ b/test/printproperty.c
@@ -4,46 +4,57 @@
 int main(int argc, char **argv)
 {
-     int i;
+    int i;
-     for (i = 1; i < argc; ++i) {
+    for (i = 1; i < argc; ++i) {
-          unsigned int c;
+        utf8proc_uint8_t cstr[16], *map;
-          if (!strcmp(argv[i], "-V")) {
+        unsigned int c;
-               printf("utf8proc version %s\n", utf8proc_version());
+        if (!strcmp(argv[i], "-V")) {
-               continue;
+            printf("utf8proc version %s\n", utf8proc_version());
-          }
+            continue;
-          check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+        }
-          const utf8proc_property_t *p = utf8proc_get_property(c);
+        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
-          printf("U+%s:\n"
+        const utf8proc_property_t *p = utf8proc_get_property(c);
-                 "  category = %s\n"
+
-                 "  combining_class = %d\n"
+        if (utf8proc_codepoint_valid(c))
-                 "  bidi_class = %d\n"
+            cstr[utf8proc_encode_char(c, cstr)] = 0;
-                 "  decomp_type = %d\n"
+        else
-                 "  uppercase_mapping = %x\n"
+            strcat((char*)cstr, "N/A");
-                 "  lowercase_mapping = %x\n"
+        utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
-                 "  titlecase_mapping = %x\n"
+
-                 "  comb_index = %d\n"
+        printf("U+%s: %s\n"
-                 "  bidi_mirrored = %d\n"
+            "  category = %s\n"
-                 "  comp_exclusion = %d\n"
+            "  combining_class = %d\n"
-                 "  ignorable = %d\n"
+            "  bidi_class = %d\n"
-                 "  control_boundary = %d\n"
+            "  decomp_type = %d\n"
-                 "  boundclass = %d\n"
+            "  uppercase_mapping = %x\n"
-                 "  charwidth = %d\n",
+            "  lowercase_mapping = %x\n"
-                 argv[i],
+            "  titlecase_mapping = %x\n"
-                 utf8proc_category_string(c),
+            "  casefold = %s\n"
-                 p->combining_class,
+            "  comb_index = %d\n"
-                 p->bidi_class,
+            "  bidi_mirrored = %d\n"
-                 p->decomp_type,
+            "  comp_exclusion = %d\n"
-                 utf8proc_toupper(c),
+            "  ignorable = %d\n"
-                 utf8proc_tolower(c),
+            "  control_boundary = %d\n"
-                 utf8proc_totitle(c),
+            "  boundclass = %d\n"
-                 p->comb_index,
+            "  charwidth = %d\n",
-                 p->bidi_mirrored,
+        argv[i], (char*) cstr,
-                 p->comp_exclusion,
+        utf8proc_category_string(c),
-                 p->ignorable,
+        p->combining_class,
-                 p->control_boundary,
+        p->bidi_class,
-                 p->boundclass,
+        p->decomp_type,
-                 utf8proc_charwidth(c));
+        utf8proc_toupper(c),
-     }
+        utf8proc_tolower(c),
-     return 0;
+        utf8proc_totitle(c),
        (char *) map,
        p->comb_index,
        p->bidi_mirrored,
        p->comp_exclusion,
        p->ignorable,
        p->control_boundary,
        p->boundclass,
        utf8proc_charwidth(c));
        free(map);
    }
    return 0;
 }
--- a/utf8proc_data.c
+++ b/utf8proc_data.c