Prefix other C99 typedefs with utf8proc_
This commit is contained in:
parent
ad27722923
commit
0a818c7003
@ -268,7 +268,7 @@ for code in 0...0x110000
|
||||
end
|
||||
end
|
||||
|
||||
$stdout << "const int32_t utf8proc_sequences[] = {\n "
|
||||
$stdout << "const utf8proc_int32_t utf8proc_sequences[] = {\n "
|
||||
i = 0
|
||||
$int_array.each do |entry|
|
||||
i += 1
|
||||
@ -280,7 +280,7 @@ $int_array.each do |entry|
|
||||
end
|
||||
$stdout << "};\n\n"
|
||||
|
||||
$stdout << "const uint16_t utf8proc_stage1table[] = {\n "
|
||||
$stdout << "const utf8proc_uint16_t utf8proc_stage1table[] = {\n "
|
||||
i = 0
|
||||
stage1.each do |entry|
|
||||
i += 1
|
||||
@ -292,7 +292,7 @@ stage1.each do |entry|
|
||||
end
|
||||
$stdout << "};\n\n"
|
||||
|
||||
$stdout << "const uint16_t utf8proc_stage2table[] = {\n "
|
||||
$stdout << "const utf8proc_uint16_t utf8proc_stage2table[] = {\n "
|
||||
i = 0
|
||||
stage2.flatten.each do |entry|
|
||||
i += 1
|
||||
@ -311,7 +311,7 @@ properties.each { |line|
|
||||
}
|
||||
$stdout << "};\n\n"
|
||||
|
||||
$stdout << "const int32_t utf8proc_combinations[] = {\n "
|
||||
$stdout << "const utf8proc_int32_t utf8proc_combinations[] = {\n "
|
||||
i = 0
|
||||
comb1st_indicies.keys.each_index do |a|
|
||||
comb2nd_indicies.keys.each_index do |b|
|
||||
|
||||
@ -5,7 +5,7 @@ int main(int argc, char **argv)
|
||||
char *buf = NULL;
|
||||
size_t bufsize = 0;
|
||||
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
|
||||
uint8_t src[1024];
|
||||
utf8proc_uint8_t src[1024];
|
||||
|
||||
check(f != NULL, "error opening GraphemeBreakTest.txt");
|
||||
while (getline(&buf, &bufsize, f) > 0) {
|
||||
@ -39,10 +39,10 @@ int main(int argc, char **argv)
|
||||
src[si] = 0; /* NUL-terminate */
|
||||
|
||||
if (si) {
|
||||
uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
|
||||
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
|
||||
size_t i = 0, j = 0;
|
||||
utf8proc_ssize_t glen;
|
||||
uint8_t *g; /* utf8proc_map grapheme results */
|
||||
utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
|
||||
while (i < si) {
|
||||
if (src[i] != '/')
|
||||
utf8[j++] = src[i++];
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#include "tests.h"
|
||||
|
||||
#define CHECK_NORM(NRM, norm, src) { \
|
||||
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
|
||||
char *src_norm = (char*) utf8proc_ ## NRM((utf8proc_uint8_t*) src); \
|
||||
check(!strcmp(norm, src_norm), \
|
||||
"normalization failed for %s -> %s", src, norm); \
|
||||
free(src_norm); \
|
||||
|
||||
@ -47,7 +47,7 @@ size_t encode(char *dest, const char *buf)
|
||||
}
|
||||
check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
|
||||
i = j; /* skip to char after hex input */
|
||||
d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
|
||||
d += utf8proc_encode_char(c, (utf8proc_uint8_t *) (dest + d));
|
||||
} while (1);
|
||||
}
|
||||
|
||||
|
||||
98
utf8proc.c
98
utf8proc.c
@ -44,7 +44,7 @@
|
||||
#include "utf8proc_data.c"
|
||||
|
||||
|
||||
UTF8PROC_DLLEXPORT const int8_t utf8proc_utf8class[256] = {
|
||||
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
@ -109,11 +109,11 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
||||
const uint8_t *str, utf8proc_ssize_t strlen, int32_t *dst
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
|
||||
) {
|
||||
int length;
|
||||
int i;
|
||||
int32_t uc = -1;
|
||||
utf8proc_int32_t uc = -1;
|
||||
*dst = -1;
|
||||
if (!strlen) return 0;
|
||||
length = utf8proc_utf8class[str[0]];
|
||||
@ -148,14 +148,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
||||
return length;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t uc) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
|
||||
if (uc < 0 || uc >= 0x110000 ||
|
||||
((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
|
||||
(uc >= 0xFDD0 && uc < 0xFDF0)) return false;
|
||||
else return true;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
|
||||
if (uc < 0x00) {
|
||||
return 0;
|
||||
} else if (uc < 0x80) {
|
||||
@ -186,7 +186,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t uc, uint8_t *ds
|
||||
}
|
||||
|
||||
/* internal "unsafe" version that does not check whether uc is in range */
|
||||
static const utf8proc_property_t *get_property(int32_t uc) {
|
||||
static const utf8proc_property_t *get_property(utf8proc_int32_t uc) {
|
||||
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
||||
return utf8proc_properties + (
|
||||
utf8proc_stage2table[
|
||||
@ -195,12 +195,12 @@ static const utf8proc_property_t *get_property(int32_t uc) {
|
||||
);
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
|
||||
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc);
|
||||
}
|
||||
|
||||
/* return whether there is a grapheme break between boundclasses lbc and tbc */
|
||||
static bool grapheme_break(int lbc, int tbc) {
|
||||
static utf8proc_bool grapheme_break(int lbc, int tbc) {
|
||||
return
|
||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
||||
@ -226,22 +226,22 @@ static bool grapheme_break(int lbc, int tbc) {
|
||||
}
|
||||
|
||||
/* return whether there is a grapheme break between codepoints c1 and c2 */
|
||||
UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
|
||||
return grapheme_break(utf8proc_get_property(c1)->boundclass,
|
||||
utf8proc_get_property(c2)->boundclass);
|
||||
}
|
||||
|
||||
/* return a character width analogous to wcwidth (except portable and
|
||||
hopefully less buggy than most system wcwidth functions). */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t c) {
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
|
||||
return utf8proc_get_property(c)->charwidth;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(int32_t c) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
|
||||
return utf8proc_get_property(c)->category;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) {
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
|
||||
static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
|
||||
return s[utf8proc_category(c)];
|
||||
}
|
||||
@ -250,17 +250,17 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t c) {
|
||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
||||
const utf8proc_property_t *property;
|
||||
utf8proc_propval_t category;
|
||||
int32_t hangul_sindex;
|
||||
utf8proc_int32_t hangul_sindex;
|
||||
if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
|
||||
property = get_property(uc);
|
||||
category = property->category;
|
||||
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
|
||||
int32_t hangul_tindex;
|
||||
utf8proc_int32_t hangul_tindex;
|
||||
if (bufsize >= 1) {
|
||||
dst[0] = UTF8PROC_HANGUL_LBASE +
|
||||
hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
|
||||
@ -312,7 +312,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
|
||||
}
|
||||
if (options & UTF8PROC_CASEFOLD) {
|
||||
if (property->casefold_mapping) {
|
||||
const int32_t *casefold_entry;
|
||||
const utf8proc_int32_t *casefold_entry;
|
||||
utf8proc_ssize_t written = 0;
|
||||
for (casefold_entry = property->casefold_mapping;
|
||||
*casefold_entry >= 0; casefold_entry++) {
|
||||
@ -327,7 +327,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (property->decomp_mapping &&
|
||||
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
||||
const int32_t *decomp_entry;
|
||||
const utf8proc_int32_t *decomp_entry;
|
||||
utf8proc_ssize_t written = 0;
|
||||
for (decomp_entry = property->decomp_mapping;
|
||||
*decomp_entry >= 0; decomp_entry++) {
|
||||
@ -340,7 +340,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_CHARBOUND) {
|
||||
bool boundary;
|
||||
utf8proc_bool boundary;
|
||||
int tbc = property->boundclass;
|
||||
boundary = grapheme_break(*last_boundclass, tbc);
|
||||
*last_boundclass = tbc;
|
||||
@ -355,8 +355,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(int32_t uc, int32_t
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
const uint8_t *str, utf8proc_ssize_t strlen,
|
||||
int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
) {
|
||||
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
||||
utf8proc_ssize_t wpos = 0;
|
||||
@ -366,7 +366,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
|
||||
return UTF8PROC_ERROR_INVALIDOPTS;
|
||||
{
|
||||
int32_t uc;
|
||||
utf8proc_int32_t uc;
|
||||
utf8proc_ssize_t rpos = 0;
|
||||
utf8proc_ssize_t decomp_result;
|
||||
int boundclass = UTF8PROC_BOUNDCLASS_START;
|
||||
@ -390,14 +390,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
if (decomp_result < 0) return decomp_result;
|
||||
wpos += decomp_result;
|
||||
/* prohibiting integer overflows due to too long strings: */
|
||||
if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
|
||||
if (wpos < 0 || wpos > SSIZE_MAX/sizeof(utf8proc_int32_t)/2)
|
||||
return UTF8PROC_ERROR_OVERFLOW;
|
||||
}
|
||||
}
|
||||
if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
|
||||
utf8proc_ssize_t pos = 0;
|
||||
while (pos < wpos-1) {
|
||||
int32_t uc1, uc2;
|
||||
utf8proc_int32_t uc1, uc2;
|
||||
const utf8proc_property_t *property1, *property2;
|
||||
uc1 = buffer[pos];
|
||||
uc2 = buffer[pos+1];
|
||||
@ -416,13 +416,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
return wpos;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||
utf8proc_ssize_t rpos;
|
||||
utf8proc_ssize_t wpos = 0;
|
||||
int32_t uc;
|
||||
utf8proc_int32_t uc;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
uc = buffer[rpos];
|
||||
if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
|
||||
@ -451,23 +451,23 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
|
||||
length = wpos;
|
||||
}
|
||||
if (options & UTF8PROC_COMPOSE) {
|
||||
int32_t *starter = NULL;
|
||||
int32_t current_char;
|
||||
utf8proc_int32_t *starter = NULL;
|
||||
utf8proc_int32_t current_char;
|
||||
const utf8proc_property_t *starter_property = NULL, *current_property;
|
||||
utf8proc_propval_t max_combining_class = -1;
|
||||
utf8proc_ssize_t rpos;
|
||||
utf8proc_ssize_t wpos = 0;
|
||||
int32_t composition;
|
||||
utf8proc_int32_t composition;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
current_char = buffer[rpos];
|
||||
current_property = get_property(current_char);
|
||||
if (starter && current_property->combining_class > max_combining_class) {
|
||||
/* combination perhaps possible */
|
||||
int32_t hangul_lindex;
|
||||
int32_t hangul_sindex;
|
||||
utf8proc_int32_t hangul_lindex;
|
||||
utf8proc_int32_t hangul_sindex;
|
||||
hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
|
||||
if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
|
||||
int32_t hangul_vindex;
|
||||
utf8proc_int32_t hangul_vindex;
|
||||
hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
|
||||
if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
|
||||
*starter = UTF8PROC_HANGUL_SBASE +
|
||||
@ -480,7 +480,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
|
||||
hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
|
||||
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
|
||||
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
|
||||
int32_t hangul_tindex;
|
||||
utf8proc_int32_t hangul_tindex;
|
||||
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
|
||||
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
|
||||
*starter += hangul_tindex;
|
||||
@ -521,25 +521,25 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_
|
||||
}
|
||||
{
|
||||
utf8proc_ssize_t rpos, wpos = 0;
|
||||
int32_t uc;
|
||||
utf8proc_int32_t uc;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
uc = buffer[rpos];
|
||||
wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
|
||||
wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
|
||||
}
|
||||
((uint8_t *)buffer)[wpos] = 0;
|
||||
((utf8proc_uint8_t *)buffer)[wpos] = 0;
|
||||
return wpos;
|
||||
}
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
const uint8_t *str, utf8proc_ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||
) {
|
||||
int32_t *buffer;
|
||||
utf8proc_int32_t *buffer;
|
||||
utf8proc_ssize_t result;
|
||||
*dstptr = NULL;
|
||||
result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
||||
if (result < 0) return result;
|
||||
buffer = (int32_t *) malloc(result * sizeof(int32_t) + 1);
|
||||
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
|
||||
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
||||
result = utf8proc_decompose(str, strlen, buffer, result, options);
|
||||
if (result < 0) {
|
||||
@ -552,37 +552,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
return result;
|
||||
}
|
||||
{
|
||||
int32_t *newptr;
|
||||
newptr = (int32_t *) realloc(buffer, (size_t)result+1);
|
||||
utf8proc_int32_t *newptr;
|
||||
newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
|
||||
if (newptr) buffer = newptr;
|
||||
}
|
||||
*dstptr = (uint8_t *)buffer;
|
||||
*dstptr = (utf8proc_uint8_t *)buffer;
|
||||
return result;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
|
||||
utf8proc_uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_DECOMPOSE);
|
||||
return retval;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
|
||||
utf8proc_uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_COMPOSE);
|
||||
return retval;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
|
||||
utf8proc_uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
||||
return retval;
|
||||
}
|
||||
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
|
||||
utf8proc_uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||
return retval;
|
||||
|
||||
72
utf8proc.h
72
utf8proc.h
@ -77,24 +77,32 @@
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#ifdef _MSC_VER
|
||||
typedef signed char int8_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef signed char utf8proc_int8_t;
|
||||
typedef unsigned char utf8proc_uint8_t;
|
||||
typedef short utf8proc_int16_t;
|
||||
typedef unsigned short utf8proc_uint16_t;
|
||||
typedef int utf8proc_int32_t;
|
||||
# ifdef _WIN64
|
||||
typedef __int64 utf8proc_ssize_t;
|
||||
# else
|
||||
typedef int utf8proc_ssize_t;
|
||||
# endif
|
||||
# ifndef __cplusplus
|
||||
typedef unsigned char bool;
|
||||
typedef unsigned char utf8proc_bool;
|
||||
enum {false, true};
|
||||
# else
|
||||
typedef bool utf8proc_bool;
|
||||
# endif
|
||||
#else
|
||||
# include <stdbool.h>
|
||||
# include <inttypes.h>
|
||||
typedef int8_t utf8proc_int8_t;
|
||||
typedef uint8_t utf8proc_uint8_t;
|
||||
typedef int16_t utf8proc_int16_t;
|
||||
typedef uint16_t utf8proc_uint16_t;
|
||||
typedef int32_t utf8proc_int32_t;
|
||||
typedef ssize_t utf8proc_ssize_t;
|
||||
typedef bool utf8proc_bool;
|
||||
#endif
|
||||
#include <limits.h>
|
||||
|
||||
@ -204,7 +212,7 @@ typedef enum {
|
||||
/* @name Types */
|
||||
|
||||
/** Holds the value of a property. */
|
||||
typedef int16_t utf8proc_propval_t;
|
||||
typedef utf8proc_int16_t utf8proc_propval_t;
|
||||
|
||||
/** Struct containing information about a codepoint. */
|
||||
typedef struct utf8proc_property_struct {
|
||||
@ -224,13 +232,13 @@ typedef struct utf8proc_property_struct {
|
||||
* @see utf8proc_decomp_type_t.
|
||||
*/
|
||||
utf8proc_propval_t decomp_type;
|
||||
const int32_t *decomp_mapping;
|
||||
const int32_t *casefold_mapping;
|
||||
int32_t uppercase_mapping;
|
||||
int32_t lowercase_mapping;
|
||||
int32_t titlecase_mapping;
|
||||
int32_t comb1st_index;
|
||||
int32_t comb2nd_index;
|
||||
const utf8proc_int32_t *decomp_mapping;
|
||||
const utf8proc_int32_t *casefold_mapping;
|
||||
utf8proc_int32_t uppercase_mapping;
|
||||
utf8proc_int32_t lowercase_mapping;
|
||||
utf8proc_int32_t titlecase_mapping;
|
||||
utf8proc_int32_t comb1st_index;
|
||||
utf8proc_int32_t comb2nd_index;
|
||||
unsigned bidi_mirrored:1;
|
||||
unsigned comp_exclusion:1;
|
||||
/**
|
||||
@ -352,7 +360,7 @@ typedef enum {
|
||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||
* on the first byte.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT extern const int8_t utf8proc_utf8class[256];
|
||||
UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
|
||||
|
||||
/**
|
||||
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
|
||||
@ -377,7 +385,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
||||
* In case of success, the number of bytes read is returned; otherwise, a
|
||||
* negative error code is returned.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const uint8_t *str, utf8proc_ssize_t strlen, int32_t *codepoint_ref);
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
|
||||
|
||||
/**
|
||||
* Check if a codepoint is valid (regardless of whether it has been
|
||||
@ -385,7 +393,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const uint8_t *str, utf8pro
|
||||
*
|
||||
* @return 1 if the given `codepoint` is valid and otherwise return 0.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint);
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Encodes the codepoint as a UTF-8 string in the byte array pointed
|
||||
@ -396,7 +404,7 @@ UTF8PROC_DLLEXPORT bool utf8proc_codepoint_valid(int32_t codepoint);
|
||||
*
|
||||
* This function does not check whether `codepoint` is valid Unicode.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t codepoint, uint8_t *dst);
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
|
||||
|
||||
/**
|
||||
* Look up the properties for a given codepoint.
|
||||
@ -410,7 +418,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(int32_t codepoint, uint
|
||||
* If the codepoint is unassigned or invalid, a pointer to a special struct is
|
||||
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t codepoint);
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
|
||||
|
||||
/** Decompose a codepoint into an array of codepoints.
|
||||
*
|
||||
@ -440,7 +448,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t code
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
||||
int32_t codepoint, int32_t *dst, utf8proc_ssize_t bufsize,
|
||||
utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
|
||||
utf8proc_option_t options, int *last_boundclass
|
||||
);
|
||||
|
||||
@ -461,8 +469,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
const uint8_t *str, utf8proc_ssize_t strlen,
|
||||
int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/**
|
||||
@ -490,13 +498,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
* entries of the array pointed to by `str` have to be in the
|
||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
|
||||
/**
|
||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t codepoint2);
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
||||
|
||||
/**
|
||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||
@ -506,19 +514,19 @@ UTF8PROC_DLLEXPORT bool utf8proc_grapheme_break(int32_t codepoint1, int32_t code
|
||||
* @note
|
||||
* If you want to check for particular types of non-printable characters,
|
||||
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(int32_t codepoint);
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the Unicode category for the codepoint (one of the
|
||||
* @ref utf8proc_category_t constants.)
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(int32_t codepoint);
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the two-letter (nul-terminated) Unicode category string for
|
||||
* the codepoint (e.g. `"Lu"` or `"Co"`).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint);
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8
|
||||
@ -539,7 +547,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(int32_t codepoint);
|
||||
* with `malloc`, and should therefore be deallocated with `free`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
const uint8_t *str, utf8proc_ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/** @name Unicode normalization
|
||||
@ -551,13 +559,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
*/
|
||||
/** @{ */
|
||||
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str);
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
||||
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str);
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str);
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str);
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
const int32_t utf8proc_sequences[] = {
|
||||
const utf8proc_int32_t utf8proc_sequences[] = {
|
||||
97, -1, 98, -1, 99, -1, 100,
|
||||
-1, 101, -1, 102, -1, 103, -1, 104,
|
||||
-1, 105, -1, 106, -1, 107, -1, 108,
|
||||
@ -1523,7 +1523,7 @@ const int32_t utf8proc_sequences[] = {
|
||||
172689, -1, 19798, -1, 40702, -1, 40709, -1,
|
||||
40719, -1, 40726, -1, 173568, -1, };
|
||||
|
||||
const uint16_t utf8proc_stage1table[] = {
|
||||
const utf8proc_uint16_t utf8proc_stage1table[] = {
|
||||
0, 256, 512, 768, 1024, 1280, 1536,
|
||||
1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584,
|
||||
3840, 4096, 4352, 4608, 4864, 5120, 5376, 5632,
|
||||
@ -2070,7 +2070,7 @@ const uint16_t utf8proc_stage1table[] = {
|
||||
18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432,
|
||||
35584, };
|
||||
|
||||
const uint16_t utf8proc_stage2table[] = {
|
||||
const utf8proc_uint16_t utf8proc_stage2table[] = {
|
||||
1, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 3, 4, 3, 5, 6, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2,
|
||||
@ -13003,7 +13003,7 @@ const utf8proc_property_t utf8proc_properties[] = {
|
||||
{UTF8PROC_CATEGORY_LO, 0, UTF8PROC_BIDI_CLASS_L, 0, utf8proc_sequences + 12179, NULL, -1, -1, -1, -1, -1, false, false, false, false, UTF8PROC_BOUNDCLASS_OTHER, 2},
|
||||
};
|
||||
|
||||
const int32_t utf8proc_combinations[] = {
|
||||
const utf8proc_int32_t utf8proc_combinations[] = {
|
||||
192, 193, 194, 195, 196, 197, -1,
|
||||
256, 258, 260, 550, 461, -1, -1, 512,
|
||||
514, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user