Prefix other C99 typedefs with utf8proc_

This commit is contained in:
Tony Kelman 2015-04-06 22:36:33 -07:00
parent ad27722923
commit 0a818c7003
7 changed files with 102 additions and 94 deletions

View File

@ -268,7 +268,7 @@ for code in 0...0x110000
end end
end end
$stdout << "const int32_t utf8proc_sequences[] = {\n " $stdout << "const utf8proc_int32_t utf8proc_sequences[] = {\n "
i = 0 i = 0
$int_array.each do |entry| $int_array.each do |entry|
i += 1 i += 1
@ -280,7 +280,7 @@ $int_array.each do |entry|
end end
$stdout << "};\n\n" $stdout << "};\n\n"
$stdout << "const uint16_t utf8proc_stage1table[] = {\n " $stdout << "const utf8proc_uint16_t utf8proc_stage1table[] = {\n "
i = 0 i = 0
stage1.each do |entry| stage1.each do |entry|
i += 1 i += 1
@ -292,7 +292,7 @@ stage1.each do |entry|
end end
$stdout << "};\n\n" $stdout << "};\n\n"
$stdout << "const uint16_t utf8proc_stage2table[] = {\n " $stdout << "const utf8proc_uint16_t utf8proc_stage2table[] = {\n "
i = 0 i = 0
stage2.flatten.each do |entry| stage2.flatten.each do |entry|
i += 1 i += 1
@ -311,7 +311,7 @@ properties.each { |line|
} }
$stdout << "};\n\n" $stdout << "};\n\n"
$stdout << "const int32_t utf8proc_combinations[] = {\n " $stdout << "const utf8proc_int32_t utf8proc_combinations[] = {\n "
i = 0 i = 0
comb1st_indicies.keys.each_index do |a| comb1st_indicies.keys.each_index do |a|
comb2nd_indicies.keys.each_index do |b| comb2nd_indicies.keys.each_index do |b|

View File

@ -5,7 +5,7 @@ int main(int argc, char **argv)
char *buf = NULL; char *buf = NULL;
size_t bufsize = 0; size_t bufsize = 0;
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
uint8_t src[1024]; utf8proc_uint8_t src[1024];
check(f != NULL, "error opening GraphemeBreakTest.txt"); check(f != NULL, "error opening GraphemeBreakTest.txt");
while (getline(&buf, &bufsize, f) > 0) { while (getline(&buf, &bufsize, f) > 0) {
@ -39,10 +39,10 @@ int main(int argc, char **argv)
src[si] = 0; /* NUL-terminate */ src[si] = 0; /* NUL-terminate */
if (si) { if (si) {
uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0; size_t i = 0, j = 0;
utf8proc_ssize_t glen; utf8proc_ssize_t glen;
uint8_t *g; /* utf8proc_map grapheme results */ utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
while (i < si) { while (i < si) {
if (src[i] != '/') if (src[i] != '/')
utf8[j++] = src[i++]; utf8[j++] = src[i++];

View File

@ -1,7 +1,7 @@
#include "tests.h" #include "tests.h"
#define CHECK_NORM(NRM, norm, src) { \ #define CHECK_NORM(NRM, norm, src) { \
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \ char *src_norm = (char*) utf8proc_ ## NRM((utf8proc_uint8_t*) src); \
check(!strcmp(norm, src_norm), \ check(!strcmp(norm, src_norm), \
"normalization failed for %s -> %s", src, norm); \ "normalization failed for %s -> %s", src, norm); \
free(src_norm); \ free(src_norm); \

View File

@ -47,7 +47,7 @@ size_t encode(char *dest, const char *buf)
} }
check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i); check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
i = j; /* skip to char after hex input */ i = j; /* skip to char after hex input */
d += utf8proc_encode_char(c, (uint8_t *) (dest + d)); d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d));
} while (1); } while (1);
} }

View File

@ -44,7 +44,7 @@
#include "utf8proc_data.c" #include "utf8proc_data.c"
UTF8PROC_DLLEXPORT const int8_t utf8proc_utf8class[256] = { UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -109,11 +109,11 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
} }
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
const uint8_t *str, utf8proc_ssize_t strlen, int32_t *dst const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) { ) {
int length; int length;
int i; int i;
int32_t uc = -1; utf8proc_int32_t uc = -1;
*dst = -1; *dst = -1;
if (!strlen) return 0; if (!strlen) return 0;
length = utf8proc_utf8class[str[0]]; length = utf8proc_utf8class[str[0]];
@ -148,14 +148,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
return length; return length;
} }
/**
 * Report whether `uc` passes the library's codepoint validity check.
 *
 * A value is rejected when any of the following holds:
 *  - it lies outside 0 .. 0x10FFFF,
 *  - its low 16 bits are 0xFFFE or 0xFFFF,
 *  - it lies in 0xD800 .. 0xDFFF (the UTF-16 surrogate range),
 *  - it lies in 0xFDD0 .. 0xFDEF.
 *
 * @param uc codepoint to test.
 * @return true when none of the exclusions apply, false otherwise.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  if (uc < 0 || uc >= 0x110000 ||
    ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
    (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
  else return true;
}
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) { if (uc < 0x00) {
return 0; return 0;
} else if (uc < 0x80) { } else if (uc < 0x80) {
@ -186,7 +186,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t uc, uint8_t *ds
} }
/* internal "unsafe" version that does not check whether uc is in range */ /* internal "unsafe" version that does not check whether uc is in range */
static const utf8proc_property_t *get_property(int32_t uc) { static const utf8proc_property_t *get_property(utf8proc_int32_t uc) {
/* ASSERT: uc >= 0 && uc < 0x110000 */ /* ASSERT: uc >= 0 && uc < 0x110000 */
return utf8proc_properties + ( return utf8proc_properties + (
utf8proc_stage2table[ utf8proc_stage2table[
@ -195,12 +195,12 @@ static const utf8proc_property_t *get_property(int32_t uc) {
); );
} }
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc); return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
} }
/* return whether there is a grapheme break between boundclasses lbc and tbc */ /* return whether there is a grapheme break between boundclasses lbc and tbc */
static bool grapheme_break(int lbc, int tbc) { static utf8proc_bool grapheme_break(int lbc, int tbc) {
return return
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
(lbc == UTF8PROC_BOUNDCLASS_CR && (lbc == UTF8PROC_BOUNDCLASS_CR &&
@ -226,22 +226,22 @@ static bool grapheme_break(int lbc, int tbc) {
} }
/**
 * Return whether there is a grapheme break between codepoints c1 and c2.
 *
 * Both codepoints go through the range-checked utf8proc_get_property(), and
 * their `boundclass` fields are handed to the internal grapheme_break().
 *
 * @param c1 the codepoint on the left of the candidate boundary.
 * @param c2 the codepoint on the right of the candidate boundary.
 * @return the value computed by grapheme_break() for the two boundclasses.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
  return grapheme_break(utf8proc_get_property(c1)->boundclass,
                        utf8proc_get_property(c2)->boundclass);
}
/* return a character width analogous to wcwidth (except portable and /* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */ hopefully less buggy than most system wcwidth functions). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t c) { UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
return utf8proc_get_property(c)->charwidth; return utf8proc_get_property(c)->charwidth;
} }
/**
 * Return the `category` field of the property record for `c`.
 *
 * The lookup goes through the range-checked utf8proc_get_property().
 *
 * @param c codepoint to classify.
 * @return the stored category, as a utf8proc_category_t.
 */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  return utf8proc_get_property(c)->category;
}
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) { UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
return s[utf8proc_category(c)]; return s[utf8proc_category(c)];
} }
@ -250,17 +250,17 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) {
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass) options & ~UTF8PROC_LUMP, last_boundclass)
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
const utf8proc_property_t *property; const utf8proc_property_t *property;
utf8proc_propval_t category; utf8proc_propval_t category;
int32_t hangul_sindex; utf8proc_int32_t hangul_sindex;
if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
property = get_property(uc); property = get_property(uc);
category = property->category; category = property->category;
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
int32_t hangul_tindex; utf8proc_int32_t hangul_tindex;
if (bufsize >= 1) { if (bufsize >= 1) {
dst[0] = UTF8PROC_HANGUL_LBASE + dst[0] = UTF8PROC_HANGUL_LBASE +
hangul_sindex / UTF8PROC_HANGUL_NCOUNT; hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
@ -312,7 +312,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
} }
if (options & UTF8PROC_CASEFOLD) { if (options & UTF8PROC_CASEFOLD) {
if (property->casefold_mapping) { if (property->casefold_mapping) {
const int32_t *casefold_entry; const utf8proc_int32_t *casefold_entry;
utf8proc_ssize_t written = 0; utf8proc_ssize_t written = 0;
for (casefold_entry = property->casefold_mapping; for (casefold_entry = property->casefold_mapping;
*casefold_entry >= 0; casefold_entry++) { *casefold_entry >= 0; casefold_entry++) {
@ -327,7 +327,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
if (property->decomp_mapping && if (property->decomp_mapping &&
(!property->decomp_type || (options & UTF8PROC_COMPAT))) { (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
const int32_t *decomp_entry; const utf8proc_int32_t *decomp_entry;
utf8proc_ssize_t written = 0; utf8proc_ssize_t written = 0;
for (decomp_entry = property->decomp_mapping; for (decomp_entry = property->decomp_mapping;
*decomp_entry >= 0; decomp_entry++) { *decomp_entry >= 0; decomp_entry++) {
@ -340,7 +340,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
} }
} }
if (options & UTF8PROC_CHARBOUND) { if (options & UTF8PROC_CHARBOUND) {
bool boundary; utf8proc_bool boundary;
int tbc = property->boundclass; int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc); boundary = grapheme_break(*last_boundclass, tbc);
*last_boundclass = tbc; *last_boundclass = tbc;
@ -355,8 +355,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
} }
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
const uint8_t *str, utf8proc_ssize_t strlen, const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) { ) {
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
utf8proc_ssize_t wpos = 0; utf8proc_ssize_t wpos = 0;
@ -366,7 +366,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
return UTF8PROC_ERROR_INVALIDOPTS; return UTF8PROC_ERROR_INVALIDOPTS;
{ {
int32_t uc; utf8proc_int32_t uc;
utf8proc_ssize_t rpos = 0; utf8proc_ssize_t rpos = 0;
utf8proc_ssize_t decomp_result; utf8proc_ssize_t decomp_result;
int boundclass = UTF8PROC_BOUNDCLASS_START; int boundclass = UTF8PROC_BOUNDCLASS_START;
@ -390,14 +390,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
if (decomp_result < 0) return decomp_result; if (decomp_result < 0) return decomp_result;
wpos += decomp_result; wpos += decomp_result;
/* prohibiting integer overflows due to too long strings: */ /* prohibiting integer overflows due to too long strings: */
if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) if (wpos < 0 || wpos > SSIZE_MAX/sizeof(utf8proc_int32_t)/2)
return UTF8PROC_ERROR_OVERFLOW; return UTF8PROC_ERROR_OVERFLOW;
} }
} }
if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
utf8proc_ssize_t pos = 0; utf8proc_ssize_t pos = 0;
while (pos < wpos-1) { while (pos < wpos-1) {
int32_t uc1, uc2; utf8proc_int32_t uc1, uc2;
const utf8proc_property_t *property1, *property2; const utf8proc_property_t *property1, *property2;
uc1 = buffer[pos]; uc1 = buffer[pos];
uc2 = buffer[pos+1]; uc2 = buffer[pos+1];
@ -416,13 +416,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
return wpos; return wpos;
} }
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
ASSERT: 'buffer' has one spare byte of free space at the end! */ ASSERT: 'buffer' has one spare byte of free space at the end! */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
utf8proc_ssize_t rpos; utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0; utf8proc_ssize_t wpos = 0;
int32_t uc; utf8proc_int32_t uc;
for (rpos = 0; rpos < length; rpos++) { for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos]; uc = buffer[rpos];
if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
@ -451,23 +451,23 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
length = wpos; length = wpos;
} }
if (options & UTF8PROC_COMPOSE) { if (options & UTF8PROC_COMPOSE) {
int32_t *starter = NULL; utf8proc_int32_t *starter = NULL;
int32_t current_char; utf8proc_int32_t current_char;
const utf8proc_property_t *starter_property = NULL, *current_property; const utf8proc_property_t *starter_property = NULL, *current_property;
utf8proc_propval_t max_combining_class = -1; utf8proc_propval_t max_combining_class = -1;
utf8proc_ssize_t rpos; utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0; utf8proc_ssize_t wpos = 0;
int32_t composition; utf8proc_int32_t composition;
for (rpos = 0; rpos < length; rpos++) { for (rpos = 0; rpos < length; rpos++) {
current_char = buffer[rpos]; current_char = buffer[rpos];
current_property = get_property(current_char); current_property = get_property(current_char);
if (starter && current_property->combining_class > max_combining_class) { if (starter && current_property->combining_class > max_combining_class) {
/* combination perhaps possible */ /* combination perhaps possible */
int32_t hangul_lindex; utf8proc_int32_t hangul_lindex;
int32_t hangul_sindex; utf8proc_int32_t hangul_sindex;
hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
int32_t hangul_vindex; utf8proc_int32_t hangul_vindex;
hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
*starter = UTF8PROC_HANGUL_SBASE + *starter = UTF8PROC_HANGUL_SBASE +
@ -480,7 +480,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
int32_t hangul_tindex; utf8proc_int32_t hangul_tindex;
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
*starter += hangul_tindex; *starter += hangul_tindex;
@ -521,25 +521,25 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
} }
{ {
utf8proc_ssize_t rpos, wpos = 0; utf8proc_ssize_t rpos, wpos = 0;
int32_t uc; utf8proc_int32_t uc;
for (rpos = 0; rpos < length; rpos++) { for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos]; uc = buffer[rpos];
wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
} }
((uint8_t *)buffer)[wpos] = 0; ((utf8proc_uint8_t *)buffer)[wpos] = 0;
return wpos; return wpos;
} }
} }
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const uint8_t *str, utf8proc_ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) { ) {
int32_t *buffer; utf8proc_int32_t *buffer;
utf8proc_ssize_t result; utf8proc_ssize_t result;
*dstptr = NULL; *dstptr = NULL;
result = utf8proc_decompose(str, strlen, NULL, 0, options); result = utf8proc_decompose(str, strlen, NULL, 0, options);
if (result < 0) return result; if (result < 0) return result;
buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1); buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM; if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose(str, strlen, buffer, result, options); result = utf8proc_decompose(str, strlen, buffer, result, options);
if (result < 0) { if (result < 0) {
@ -552,37 +552,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
return result; return result;
} }
{ {
int32_t *newptr; utf8proc_int32_t *newptr;
newptr = (int32_t *) realloc(buffer, (size_t)result+1); newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
if (newptr) buffer = newptr; if (newptr) buffer = newptr;
} }
*dstptr = (uint8_t *)buffer; *dstptr = (utf8proc_uint8_t *)buffer;
return result; return result;
} }
/**
 * NFD convenience wrapper around utf8proc_map().
 *
 * Calls utf8proc_map() with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_DECOMPOSE. The length argument is passed as 0 because the
 * UTF8PROC_NULLTERM option is set.
 *
 * The status code of utf8proc_map() is discarded; the only failure signal
 * available to the caller is the returned pointer, which utf8proc_map()
 * resets to NULL on entry.
 *
 * @param str NUL-terminated input string.
 * @return the pointer utf8proc_map() stored in its output argument.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE);
  return retval;
}
/**
 * NFC convenience wrapper around utf8proc_map().
 *
 * Calls utf8proc_map() with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_COMPOSE. The length argument is passed as 0 because the
 * UTF8PROC_NULLTERM option is set.
 *
 * The status code of utf8proc_map() is discarded; the only failure signal
 * available to the caller is the returned pointer, which utf8proc_map()
 * resets to NULL on entry.
 *
 * @param str NUL-terminated input string.
 * @return the pointer utf8proc_map() stored in its output argument.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_COMPOSE);
  return retval;
}
/**
 * NFKD convenience wrapper around utf8proc_map().
 *
 * Calls utf8proc_map() with UTF8PROC_NULLTERM | UTF8PROC_STABLE |
 * UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT. The length argument is passed as 0
 * because the UTF8PROC_NULLTERM option is set.
 *
 * The status code of utf8proc_map() is discarded; the only failure signal
 * available to the caller is the returned pointer, which utf8proc_map()
 * resets to NULL on entry.
 *
 * @param str NUL-terminated input string.
 * @return the pointer utf8proc_map() stored in its output argument.
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *retval;
  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  return retval;
}
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) { UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
uint8_t *retval; utf8proc_uint8_t *retval;
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT); UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval; return retval;

View File

@ -77,24 +77,32 @@
#include <stdlib.h> #include <stdlib.h>
#include <sys/types.h> #include <sys/types.h>
/*
 * Library-prefixed integer, size and boolean typedefs.
 *
 * All public declarations use the utf8proc_* names so that the header does
 * not have to introduce the unprefixed C99 names itself.
 */
#ifdef _MSC_VER
/* MSVC branch: the fixed-width types are spelled out by hand. */
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t;
typedef unsigned short utf8proc_uint16_t;
typedef int utf8proc_int32_t;
#  ifdef _WIN64
typedef __int64 utf8proc_ssize_t;
#  else
typedef int utf8proc_ssize_t;
#  endif
#  ifndef __cplusplus
/* C build: provide a boolean type plus the false/true constants. */
typedef unsigned char utf8proc_bool;
enum {false, true};
#  else
/* C++ build: bool is built in. */
typedef bool utf8proc_bool;
#  endif
#else
/* Every other compiler: alias the standard C99 / POSIX types. */
#  include <stdbool.h>
#  include <inttypes.h>
typedef int8_t utf8proc_int8_t;
typedef uint8_t utf8proc_uint8_t;
typedef int16_t utf8proc_int16_t;
typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t;
typedef ssize_t utf8proc_ssize_t;
typedef bool utf8proc_bool;
#endif
#include <limits.h> #include <limits.h>
@ -204,7 +212,7 @@ typedef enum {
/* @name Types */ /* @name Types */
/** Holds the value of a property. */ /** Holds the value of a property. */
typedef int16_t utf8proc_propval_t; typedef utf8proc_int16_t utf8proc_propval_t;
/** Struct containing information about a codepoint. */ /** Struct containing information about a codepoint. */
typedef struct utf8proc_property_struct { typedef struct utf8proc_property_struct {
@ -224,13 +232,13 @@ typedef struct utf8proc_property_struct {
* @see utf8proc_decomp_type_t. * @see utf8proc_decomp_type_t.
*/ */
utf8proc_propval_t decomp_type; utf8proc_propval_t decomp_type;
const int32_t *decomp_mapping; const utf8proc_int32_t *decomp_mapping;
const int32_t *casefold_mapping; const utf8proc_int32_t *casefold_mapping;
int32_t uppercase_mapping; utf8proc_int32_t uppercase_mapping;
int32_t lowercase_mapping; utf8proc_int32_t lowercase_mapping;
int32_t titlecase_mapping; utf8proc_int32_t titlecase_mapping;
int32_t comb1st_index; utf8proc_int32_t comb1st_index;
int32_t comb2nd_index; utf8proc_int32_t comb2nd_index;
unsigned bidi_mirrored:1; unsigned bidi_mirrored:1;
unsigned comp_exclusion:1; unsigned comp_exclusion:1;
/** /**
@ -352,7 +360,7 @@ typedef enum {
* Array containing the byte lengths of a UTF-8 encoded codepoint based * Array containing the byte lengths of a UTF-8 encoded codepoint based
* on the first byte. * on the first byte.
*/ */
UTF8PROC_DLLEXPORT extern const int8_t utf8proc_utf8class[256]; UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
/** /**
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
@ -377,7 +385,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
* In case of success, the number of bytes read is returned; otherwise, a * In case of success, the number of bytes read is returned; otherwise, a
* negative error code is returned. * negative error code is returned.
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const uint8_t *str, utf8proc_ssize_t strlen, int32_t *codepoint_ref); UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
/** /**
* Check if a codepoint is valid (regardless of whether it has been * Check if a codepoint is valid (regardless of whether it has been
@ -385,7 +393,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const uint8_t *str, utf8pro
* *
* @return 1 if the given `codepoint` is valid and otherwise return 0. * @return 1 if the given `codepoint` is valid and otherwise return 0.
*/ */
UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint); UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
/** /**
* Encodes the codepoint as an UTF-8 string in the byte array pointed * Encodes the codepoint as an UTF-8 string in the byte array pointed
@ -396,7 +404,7 @@ UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint);
* *
* This function does not check whether `codepoint` is valid Unicode. * This function does not check whether `codepoint` is valid Unicode.
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t codepoint, uint8_t *dst); UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
/** /**
* Look up the properties for a given codepoint. * Look up the properties for a given codepoint.
@ -410,7 +418,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t codepoint, uint
* If the codepoint is unassigned or invalid, a pointer to a special struct is * If the codepoint is unassigned or invalid, a pointer to a special struct is
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN). * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
*/ */
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t codepoint); UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
/** Decompose a codepoint into an array of codepoints. /** Decompose a codepoint into an array of codepoints.
* *
@ -440,7 +448,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t code
* undefined data. * undefined data.
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
int32_t codepoint, int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
utf8proc_option_t options, int *last_boundclass utf8proc_option_t options, int *last_boundclass
); );
@ -461,8 +469,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
* undefined data. * undefined data.
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
const uint8_t *str, utf8proc_ssize_t strlen, const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
); );
/** /**
@ -490,13 +498,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
* entries of the array pointed to by `str` have to be in the * entries of the array pointed to by `str` have to be in the
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
/** /**
* Given a pair of consecutive codepoints, return whether a grapheme break is * Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29). * permitted between them (as defined by the extended grapheme clusters in UAX#29).
*/ */
UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t codepoint2); UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
/** /**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
@ -506,19 +514,19 @@ UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t code
* @note * @note
* If you want to check for particular types of non-printable characters, * If you want to check for particular types of non-printable characters,
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t codepoint); UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
/** /**
* Return the Unicode category for the codepoint (one of the * Return the Unicode category for the codepoint (one of the
* @ref utf8proc_category_t constants.) * @ref utf8proc_category_t constants.)
*/ */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(int32_t codepoint); UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
/** /**
* Return the two-letter (nul-terminated) Unicode category string for * Return the two-letter (nul-terminated) Unicode category string for
* the codepoint (e.g. `"Lu"` or `"Co"`). * the codepoint (e.g. `"Lu"` or `"Co"`).
*/ */
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint); UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
/** /**
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8 * Maps the given UTF-8 string pointed to by `str` to a new UTF-8
@ -539,7 +547,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint);
* with `malloc`, and should therefore be deallocated with `free`. * with `malloc`, and should therefore be deallocated with `free`.
*/ */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const uint8_t *str, utf8proc_ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
); );
/** @name Unicode normalization /** @name Unicode normalization
@ -551,13 +559,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
*/ */
/** @{ */ /** @{ */
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
/** NFC normalization (@ref UTF8PROC_COMPOSE). */ /** NFC normalization (@ref UTF8PROC_COMPOSE). */
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
/** @} */ /** @} */
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -1,4 +1,4 @@
const int32_t utf8proc_sequences[] = { const utf8proc_int32_t utf8proc_sequences[] = {
97, -1, 98, -1, 99, -1, 100, 97, -1, 98, -1, 99, -1, 100,
-1, 101, -1, 102, -1, 103, -1, 104, -1, 101, -1, 102, -1, 103, -1, 104,
-1, 105, -1, 106, -1, 107, -1, 108, -1, 105, -1, 106, -1, 107, -1, 108,
@ -1523,7 +1523,7 @@ const int32_t utf8proc_sequences[] = {
172689, -1, 19798, -1, 40702, -1, 40709, -1, 172689, -1, 19798, -1, 40702, -1, 40709, -1,
40719, -1, 40726, -1, 173568, -1, }; 40719, -1, 40726, -1, 173568, -1, };
const uint16_t utf8proc_stage1table[] = { const utf8proc_uint16_t utf8proc_stage1table[] = {
0, 256, 512, 768, 1024, 1280, 1536, 0, 256, 512, 768, 1024, 1280, 1536,
1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584,
3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632, 3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632,
@ -2070,7 +2070,7 @@ const uint16_t utf8proc_stage1table[] = {
18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432,
35584, }; 35584, };
const uint16_t utf8proc_stage2table[] = { const utf8proc_uint16_t utf8proc_stage2table[] = {
1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
2, 2, 3, 4, 3, 5, 6, 2, 2, 2, 3, 4, 3, 5, 6, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@ -13003,7 +13003,7 @@ const utf8proc_property_t utf8proc_properties[] = {
{UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, 0, utf8proc_sequences + 12179, NULL, -1, -1, -1, -1, -1, false, false, false, false, UTF8PROC_BOUNDCLASS_OTHER, 2}, {UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, 0, utf8proc_sequences + 12179, NULL, -1, -1, -1, -1, -1, false, false, false, false, UTF8PROC_BOUNDCLASS_OTHER, 2},
}; };
const int32_t utf8proc_combinations[] = { const utf8proc_int32_t utf8proc_combinations[] = {
192, 193, 194, 195, 196, 197, -1, 192, 193, 194, 195, 196, 197, -1,
256, 258, 260, 550, 461, -1, -1, 512, 256, 258, 260, 550, 461, -1, -1, 512,
514, -1, -1, -1, -1, -1, -1, -1, 514, -1, -1, -1, -1, -1, -1, -1,