add toupper/tolower functions (for JuliaLang/julia#11471)

2015-05-29 13:52:48 -04:00
parent 35ec8e32e7
commit a8fb4b1772
5 changed files with 84 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ utf8proc_data.c.new
 printproperty
 charwidth
 valid
-iterate
+iterate
 case
--- a/6
+++ b/6
@@ -111,10 +111,14 @@ test/valid: test/valid.c utf8proc.o utf8proc.h test/tests.h
 test/iterate: test/iterate.c utf8proc.o utf8proc.h test/tests.h
 	$(cc) test/iterate.c utf8proc.o -o $@
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
 	$(cc) test/case.c utf8proc.o -o $@
 check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
 	test/charwidth
 	test/valid
 	test/iterate
 	test/case
--- a/test/case.c
+++ b/test/case.c
@@ -0,0 +1,50 @@
 #include "tests.h"
 #include <wctype.h>
 int main(int argc, char **argv)
 {
     int error = 0, better = 0;
     utf8proc_int32_t c;
     (void) argc; /* unused */
     (void) argv; /* unused */
     /* some simple sanity tests of the character widths */
     for (c = 0; c <= 0x110000; ++c) {
          utf8proc_int32_t l = utf8proc_tolower(c);
          utf8proc_int32_t u = utf8proc_toupper(c);
          check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
          check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
          if (sizeof(wint_t) > 2 || c < (1<<16)) {
               wint_t l0 = towlower(c), u0 = towupper(c);
               /* OS unicode tables may be out of date.  But if they
                  do have a lower/uppercase mapping, hopefully it
                  is correct? */
               if (l0 != c && l0 != l) {
                    fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
                            l, c, l0);
                    ++error;
               }
               else if (l0 != l) { /* often true for out-of-date OS unicode */
                    ++better;
                    /* printf("%x != towlower(%x) == %x\n", l, c, l0); */
               }
               if (u0 != c && u0 != u) {
                    fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
                            u, c, u0);
                    ++error;
               }
               else if (u0 != u) { /* often true for out-of-date OS unicode */
                    ++better;
                    /* printf("%x != towupper(%x) == %x\n", u, c, u0); */
               }
          }
     }
     check(!error, "utf8proc case conversion FAILED %d tests.", error);
     printf("More up-to-date than OS unicode tables for %d tests.\n", better);
     printf("utf8proc case conversion tests SUCCEEDED.\n");
     return 0;
 }
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -264,6 +264,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut
                        utf8proc_get_property(c2)->boundclass);
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
  return cl >= 0 ? cl : c;
 }
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
  return cu >= 0 ? cu : c;
 }
 /* return a character width analogous to wcwidth (except portable and
   hopefully less buggy than most system wcwidth functions). */
 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -511,6 +511,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
 */
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 /**
 * Given a codepoint `c`, return the codepoint of the corresponding
 * lower-case character, if any; otherwise (if there is no lower-case
 * variant, or if `c` is not a valid codepoint) return `c`.
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
 /**
 * Given a codepoint `c`, return the codepoint of the corresponding
 * upper-case character, if any; otherwise (if there is no upper-case
 * variant, or if `c` is not a valid codepoint) return `c`.
 */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
 /**
 * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
 * except that a width of 0 is returned for non-printable codepoints