add islower/isupper functions (#196)
* add islower/isupper functions * added test * more tests + bugfix * Makefile fix * rm iscase test on make clean
This commit is contained in:
parent
08f9999a06
commit
5622a0a51b
1
.gitignore
vendored
1
.gitignore
vendored
@ -26,6 +26,7 @@
|
||||
/test/valid
|
||||
/test/iterate
|
||||
/test/case
|
||||
/test/iscase
|
||||
/test/custom
|
||||
/tmp/
|
||||
/mingw_static/
|
||||
|
||||
14
Makefile
14
Makefile
@ -56,7 +56,7 @@ clean:
|
||||
ifneq ($(OS),Darwin)
|
||||
rm -f libutf8proc.so.$(MAJOR)
|
||||
endif
|
||||
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
|
||||
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
|
||||
rm -rf MANIFEST.new tmp
|
||||
$(MAKE) -C bench clean
|
||||
$(MAKE) -C data clean
|
||||
@ -129,6 +129,12 @@ data/NormalizationTest.txt:
|
||||
# Delegate to the sub-make in the target's directory, asking it for the
# file of the same name: $(@D) is "data", $(@F) is "GraphemeBreakTest.txt".
data/GraphemeBreakTest.txt:
	$(MAKE) -C $(@D) $(@F)
|
||||
|
||||
# Delegate to the sub-make in the target's directory, asking it for the
# file of the same name: $(@D) is "data", $(@F) is "Lowercase.txt".
data/Lowercase.txt:
	$(MAKE) -C $(@D) $(@F)
|
||||
|
||||
# Delegate to the sub-make in the target's directory, asking it for the
# file of the same name: $(@D) is "data", $(@F) is "Uppercase.txt".
data/Uppercase.txt:
	$(MAKE) -C $(@D) $(@F)
|
||||
|
||||
# Compile the shared test helpers. $< is the first prerequisite
# (test/tests.c) and $@ is the object file being built.
test/tests.o: test/tests.c test/tests.h utf8proc.h
	$(CC) $(UCFLAGS) -c -o $@ $<
|
||||
|
||||
@ -150,6 +156,9 @@ test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||
# Build the iterate test program. Headers are prerequisites only so that a
# header change triggers a rebuild; they are filtered out of the compiler
# command line, leaving test/iterate.c test/tests.o utf8proc.o in order.
test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) $(filter-out %.h,$^) -o $@
|
||||
|
||||
# Build the islower/isupper test program. Headers are prerequisites only so
# that a header change triggers a rebuild; they are filtered out of the
# compiler command line, leaving test/iscase.c test/tests.o utf8proc.o.
test/iscase: test/iscase.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) $(filter-out %.h,$^) -o $@
|
||||
|
||||
# Build the case test program. Headers are prerequisites only so that a
# header change triggers a rebuild; they are filtered out of the compiler
# command line, leaving test/case.c test/tests.o utf8proc.o in order.
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) $(filter-out %.h,$^) -o $@
|
||||
|
||||
@ -159,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||
# Build the misc test program. UNICODE_VERSION is passed as a C string
# literal; its value is whatever follows "UNICODE_VERSION=" in data/Makefile,
# extracted by the perl one-liner ($$' reaches perl as the post-match $').
# Headers are filtered out of the prerequisite list for the command line.
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' $(filter-out %.h,$^) -o $@
|
||||
|
||||
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
||||
check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
||||
$(MAKE) -C bench
|
||||
test/normtest data/NormalizationTest.txt
|
||||
test/graphemetest data/GraphemeBreakTest.txt
|
||||
@ -168,4 +177,5 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
|
||||
test/valid
|
||||
test/iterate
|
||||
test/case
|
||||
test/iscase data/Lowercase.txt data/Uppercase.txt
|
||||
test/custom
|
||||
|
||||
@ -51,6 +51,13 @@ GraphemeBreakTest.txt:
|
||||
# Download emoji-data.txt for the configured UNICODE_VERSION.
# $(URLCACHE) is prepended verbatim to the unicode.org URL.
emoji-data.txt:
	$(CURL) $(CURLFLAGS) -o $@ \
		$(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
|
||||
|
||||
# Extract the "Derived Property: Uppercase" section (up to its
# "Total code points:" trailer) from DerivedCoreProperties.txt.
#
# The shell creates $@ before ruby runs, so a failing or missing ruby would
# otherwise leave an empty file that looks up to date. Abort if the section
# is absent, and remove the partial target on any failure.
Uppercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 's = File.read("$<")[/# Derived Property: Uppercase.*?# Total code points:/m] or abort("$@: no Uppercase section found in $<"); puts s' > $@ || { rm -f $@; exit 1; }
|
||||
|
||||
# Extract the "Derived Property: Lowercase" section (up to its
# "Total code points:" trailer) from DerivedCoreProperties.txt.
#
# The shell creates $@ before ruby runs, so a failing or missing ruby would
# otherwise leave an empty file that looks up to date. Abort if the section
# is absent, and remove the partial target on any failure.
Lowercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 's = File.read("$<")[/# Derived Property: Lowercase.*?# Total code points:/m] or abort("$@: no Lowercase section found in $<"); puts s' > $@ || { rm -f $@; exit 1; }
|
||||
|
||||
clean:
|
||||
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
|
||||
rm -f Uppercase.txt Lowercase.txt
|
||||
rm -f utf8proc_data.c.new
|
||||
|
||||
@ -77,6 +77,26 @@ $ignorable_list.each_line do |entry|
|
||||
end
|
||||
end
|
||||
|
||||
# Set gives constant-time include? checks; these collections are queried once
# per character when case mappings are computed.
require 'set'

# Returns the text of the "# Derived Property: <name>" section of
# DerivedCoreProperties.txt, up to and including its "# Total code points:"
# trailer. Raises if the section is missing, rather than failing later with a
# NoMethodError on nil.
def derived_property_section(text, name)
  section = text[/# Derived Property: #{Regexp.escape(name)}.*?# Total code points:/m]
  raise "DerivedCoreProperties.txt: no section for derived property #{name}" if section.nil?
  section
end

# Collects every code point listed in a derived-property section. Lines start
# with either a range "XXXX..YYYY" (inclusive) or a single code point "XXXX";
# all other lines (comments, blanks) are ignored.
def derived_property_codepoints(section)
  codepoints = Set.new
  section.each_line do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      $1.hex.upto($2.hex) { |e2| codepoints << e2 }
    elsif entry =~ /^[0-9A-F]+/
      codepoints << $&.hex
    end
  end
  codepoints
end

# Read the file once and derive both properties from the same text.
derived_core_properties = File.read("DerivedCoreProperties.txt")

$uppercase_list = derived_property_section(derived_core_properties, "Uppercase")
$uppercase = derived_property_codepoints($uppercase_list)

$lowercase_list = derived_property_section(derived_core_properties, "Lowercase")
$lowercase = derived_property_codepoints($lowercase_list)
|
||||
|
||||
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
|
||||
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
|
||||
$grapheme_boundclass_list.each_line do |entry|
|
||||
@ -204,8 +224,10 @@ class UnicodeChar
|
||||
$8.split.collect { |element| element.hex }
|
||||
@bidi_mirrored = ($13=='Y') ? true : false
|
||||
# issue #130: use nonstandard uppercase ß -> ẞ
|
||||
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
|
||||
@lowercase_mapping = ($17=='') ? nil : $17.hex
|
||||
# issue #195: if character is uppercase but has no lowercase mapping,
|
||||
# then make lowercase mapping = itself (vice versa for lowercase)
|
||||
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
|
||||
@lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
|
||||
@titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
|
||||
end
|
||||
def case_folding
|
||||
|
||||
62
test/iscase.c
Normal file
62
test/iscase.c
Normal file
@ -0,0 +1,62 @@
|
||||
#include "tests.h"
|
||||
|
||||
/* Read the next line of a data file and parse the code point, or code point
 * range "XXXX..YYYY", at its start.
 *
 *   f     - open data file
 *   start - receives the first code point
 *   end   - receives the last code point (same as *start when the line holds
 *           a single code point rather than a range)
 *
 * Returns 0, leaving *start and *end untouched, when nothing follows the
 * leading whitespace or the line begins with '#'; returns 1 otherwise. */
int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
{
    unsigned char line[8192];
    unsigned char utf8[16];
    size_t linelen = simple_getline(line, f);
    size_t i = skipspaces(line, 0);

    if (i == linelen || line[i] == '#')
        return 0;

    /* NOTE(review): encode() presumably converts the hex text to UTF-8 and
       returns one more than the number of characters consumed, hence the -1
       to land on the character that follows the number -- confirm in tests.c */
    i += encode(utf8, line + i) - 1;
    check(utf8[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, start);

    /* No ".." after the first number: a single code point. */
    if (line[i] != '.' || line[i+1] != '.') {
        *end = *start;
        return 1;
    }

    encode(utf8, line + i + 2);
    check(utf8[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, end);
    return 1;
}
|
||||
|
||||
int test_iscase(const char *fname, int (*iscase)(utf8proc_int32_t),
|
||||
utf8proc_int32_t (*thatcase)(utf8proc_int32_t))
|
||||
{
|
||||
FILE *f = fopen(fname, "r");
|
||||
int lines = 0, tests = 0, success = 1;
|
||||
utf8proc_int32_t c = 0;
|
||||
|
||||
check(f != NULL, "error opening data file \"%s\"\n", fname);
|
||||
|
||||
while (success && !feof(f)) {
|
||||
utf8proc_int32_t start, end;
|
||||
if (read_range(f, &start, &end)) {
|
||||
for (; c < start; ++c) {
|
||||
check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname);
|
||||
}
|
||||
for (; c <= end; ++c) {
|
||||
check(iscase(c), "failed iscase(%04x) in %s\n", c, fname);
|
||||
check(thatcase(c) == c, "inconsistent thatcase(%04x) in %s\n", c, fname);
|
||||
++tests;
|
||||
}
|
||||
}
|
||||
++lines;
|
||||
}
|
||||
for (; c <= 0x110000; ++c) {
|
||||
check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname);
|
||||
}
|
||||
|
||||
printf("Checked %d characters from %d lines of %s\n", tests, lines, fname);
|
||||
fclose(f);
|
||||
return success;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
check(argc == 3, "Expected Lowercase.txt and Uppercase.txt as arguments");
|
||||
check(test_iscase(argv[1], utf8proc_islower, utf8proc_tolower), "Lowercase tests failed");
|
||||
check(test_iscase(argv[2], utf8proc_isupper, utf8proc_toupper), "Uppercase tests failed");
|
||||
printf("utf8proc iscase tests SUCCEEDED.\n");
|
||||
return 0;
|
||||
}
|
||||
@ -27,9 +27,9 @@ int main(int argc, char **argv)
|
||||
" combining_class = %d\n"
|
||||
" bidi_class = %d\n"
|
||||
" decomp_type = %d\n"
|
||||
" uppercase_mapping = %x\n"
|
||||
" lowercase_mapping = %x\n"
|
||||
" titlecase_mapping = %x\n"
|
||||
" uppercase_mapping = %04x (seqindex %04x)%s\n"
|
||||
" lowercase_mapping = %04x (seqindex %04x)%s\n"
|
||||
" titlecase_mapping = %04x (seqindex %04x)\n"
|
||||
" casefold = %s\n"
|
||||
" comb_index = %d\n"
|
||||
" bidi_mirrored = %d\n"
|
||||
@ -43,9 +43,9 @@ int main(int argc, char **argv)
|
||||
p->combining_class,
|
||||
p->bidi_class,
|
||||
p->decomp_type,
|
||||
utf8proc_toupper(c),
|
||||
utf8proc_tolower(c),
|
||||
utf8proc_totitle(c),
|
||||
utf8proc_toupper(c), p->uppercase_seqindex, utf8proc_isupper(c) ? " (isupper)" : "",
|
||||
utf8proc_tolower(c), p->lowercase_seqindex, utf8proc_islower(c) ? " (islower)" : "",
|
||||
utf8proc_totitle(c), p->titlecase_seqindex,
|
||||
(char *) map,
|
||||
p->comb_index,
|
||||
p->bidi_mirrored,
|
||||
|
||||
12
utf8proc.c
12
utf8proc.c
@ -384,6 +384,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
|
||||
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
|
||||
}
|
||||
|
||||
/* Return 1 if `c` counts as lower-case, 0 otherwise: its property record has
   no lowercase mapping (lowercase_seqindex == UINT16_MAX) but does have an
   uppercase mapping. */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  if (prop->lowercase_seqindex != UINT16_MAX)
    return 0;
  /* lowercase is UINT16_MAX here, so "differs from uppercase" reduces to this */
  return prop->uppercase_seqindex != UINT16_MAX;
}
|
||||
|
||||
/* Return 1 if `c` counts as upper-case, 0 otherwise: its property record has
   no uppercase mapping (uppercase_seqindex == UINT16_MAX) but does have a
   lowercase mapping, and its category is not UTF8PROC_CATEGORY_LT
   (presumably titlecase letters, which would otherwise qualify). */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  if (prop->uppercase_seqindex != UINT16_MAX)
    return 0;
  /* uppercase is UINT16_MAX here, so "differs from lowercase" reduces to this */
  if (prop->lowercase_seqindex == UINT16_MAX)
    return 0;
  return prop->category != UTF8PROC_CATEGORY_LT;
}
|
||||
|
||||
/* return a character width analogous to wcwidth (except portable and
|
||||
hopefully less buggy than most system wcwidth functions). */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
|
||||
|
||||
12
utf8proc.h
12
utf8proc.h
@ -635,6 +635,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return `1` if the codepoint corresponds to a lower-case character
|
||||
* and `0` otherwise.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return `1` if the codepoint corresponds to an upper-case character
|
||||
* and `0` otherwise.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||
* except that a width of 0 is returned for non-printable codepoints
|
||||
|
||||
13145
utf8proc_data.c
13145
utf8proc_data.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user