new utf8proc_map_custom for hooking in user-defined custom mappings (#89)
* new utf8proc_map_custom for hooking in user-defined custom mappings
* whoops, add test program
* NEWS, version bump for 2.1
* change test functions to static so that gcc doesn't complain about missing prototypes
This commit is contained in:
parent
8da37e2892
commit
b4621f43c3
@ -10,8 +10,8 @@ project (utf8proc C)
|
|||||||
# API version number (defined in utf8proc.h).
|
# API version number (defined in utf8proc.h).
|
||||||
# Be sure to also update these in Makefile and MANIFEST!
|
# Be sure to also update these in Makefile and MANIFEST!
|
||||||
set(SO_MAJOR 2)
|
set(SO_MAJOR 2)
|
||||||
set(SO_MINOR 0)
|
set(SO_MINOR 1)
|
||||||
set(SO_PATCH 2)
|
set(SO_PATCH 0)
|
||||||
|
|
||||||
add_definitions (
|
add_definitions (
|
||||||
-DUTF8PROC_EXPORTS
|
-DUTF8PROC_EXPORTS
|
||||||
|
|||||||
6
MANIFEST
6
MANIFEST
@ -2,6 +2,6 @@ include/
|
|||||||
include/utf8proc.h
|
include/utf8proc.h
|
||||||
lib/
|
lib/
|
||||||
lib/libutf8proc.a
|
lib/libutf8proc.a
|
||||||
lib/libutf8proc.so -> libutf8proc.so.2.0.2
|
lib/libutf8proc.so -> libutf8proc.so.2.1.0
|
||||||
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2
|
lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
|
||||||
lib/libutf8proc.so.2.0.2
|
lib/libutf8proc.so.2.1.0
|
||||||
|
|||||||
12
Makefile
12
Makefile
@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
|
|||||||
# The API version number is defined in utf8proc.h.
|
# The API version number is defined in utf8proc.h.
|
||||||
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
|
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
|
||||||
MAJOR=2
|
MAJOR=2
|
||||||
MINOR=0
|
MINOR=1
|
||||||
PATCH=2
|
PATCH=0
|
||||||
|
|
||||||
OS := $(shell uname)
|
OS := $(shell uname)
|
||||||
ifeq ($(OS),Darwin) # MacOS X
|
ifeq ($(OS),Darwin) # MacOS X
|
||||||
@ -49,7 +49,7 @@ clean:
|
|||||||
ifneq ($(OS),Darwin)
|
ifneq ($(OS),Darwin)
|
||||||
rm -f libutf8proc.so.$(MAJOR)
|
rm -f libutf8proc.so.$(MAJOR)
|
||||||
endif
|
endif
|
||||||
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case
|
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
|
||||||
rm -rf MANIFEST.new tmp
|
rm -rf MANIFEST.new tmp
|
||||||
$(MAKE) -C bench clean
|
$(MAKE) -C bench clean
|
||||||
$(MAKE) -C data clean
|
$(MAKE) -C data clean
|
||||||
@ -136,7 +136,10 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
|||||||
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||||
$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
|
$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
|
||||||
|
|
||||||
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
|
||||||
|
$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
|
||||||
|
|
||||||
|
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
|
||||||
$(MAKE) -C bench
|
$(MAKE) -C bench
|
||||||
test/normtest data/NormalizationTest.txt
|
test/normtest data/NormalizationTest.txt
|
||||||
test/graphemetest data/GraphemeBreakTest.txt
|
test/graphemetest data/GraphemeBreakTest.txt
|
||||||
@ -144,3 +147,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
|
|||||||
test/valid
|
test/valid
|
||||||
test/iterate
|
test/iterate
|
||||||
test/case
|
test/case
|
||||||
|
test/custom
|
||||||
|
|||||||
15
NEWS.md
15
NEWS.md
@ -1,5 +1,17 @@
|
|||||||
# utf8proc release history #
|
# utf8proc release history #
|
||||||
|
|
||||||
|
## Version 2.1 (not yet released) ##
|
||||||
|
|
||||||
|
- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom`
|
||||||
|
to allow user-supplied transformations of codepoints, in conjunction
|
||||||
|
with other transformations ([#89]).
|
||||||
|
|
||||||
|
- New function `utf8proc_normalize_utf32` to apply normalizations
|
||||||
|
directly to UTF-32 data (not just UTF-8) ([#88]).
|
||||||
|
|
||||||
|
- Fixed stack overflow that could occur due to incorrect definition
|
||||||
|
of `UINT16_MAX` with some compilers ([#84]).
|
||||||
|
|
||||||
## Version 2.0.2 ##
|
## Version 2.0.2 ##
|
||||||
|
|
||||||
2016-07-27:
|
2016-07-27:
|
||||||
@ -279,3 +291,6 @@ Release of version 1.0.1
|
|||||||
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
|
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
|
||||||
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
|
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
|
||||||
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
|
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
|
||||||
|
[#84]: https://github.com/JuliaLang/utf8proc/pull/84
|
||||||
|
[#88]: https://github.com/JuliaLang/utf8proc/pull/88
|
||||||
|
[#89]: https://github.com/JuliaLang/utf8proc/pull/89
|
||||||
|
|||||||
27
test/custom.c
Normal file
27
test/custom.c
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#include "tests.h"
|
||||||
|
|
||||||
|
static int thunk_test = 1;
|
||||||
|
|
||||||
|
static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk)
|
||||||
|
{
|
||||||
|
check(((int *) thunk) == &thunk_test, "unexpected thunk passed");
|
||||||
|
if (codepoint == 'a')
|
||||||
|
return 'b';
|
||||||
|
if (codepoint == 'S')
|
||||||
|
return 0x00df; /* ß */
|
||||||
|
return codepoint;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */
|
||||||
|
utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */
|
||||||
|
utf8proc_uint8_t *output;
|
||||||
|
utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM,
|
||||||
|
custom, &thunk_test);
|
||||||
|
printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output);
|
||||||
|
check(strlen((char*) output) == 6, "incorrect output length");
|
||||||
|
check(!memcmp(correct, output, 7), "incorrect output data");
|
||||||
|
free(output);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
24
utf8proc.c
24
utf8proc.c
@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
|
|||||||
return s[utf8proc_category(c)];
|
return s[utf8proc_category(c)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define utf8proc_decompose_lump(replacement_uc) \
|
#define utf8proc_decompose_lump(replacement_uc) \
|
||||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||||
@ -485,6 +483,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
|
|||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||||
|
) {
|
||||||
|
return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
) {
|
) {
|
||||||
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
|
||||||
utf8proc_ssize_t wpos = 0;
|
utf8proc_ssize_t wpos = 0;
|
||||||
@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
|
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
|
||||||
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||||
}
|
}
|
||||||
|
if (custom_func != NULL) {
|
||||||
|
uc = custom_func(uc, custom_data); /* user-specified custom mapping */
|
||||||
|
}
|
||||||
decomp_result = utf8proc_decompose_char(
|
decomp_result = utf8proc_decompose_char(
|
||||||
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
||||||
&boundclass
|
&boundclass
|
||||||
@ -683,15 +692,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||||
|
) {
|
||||||
|
return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
) {
|
) {
|
||||||
utf8proc_int32_t *buffer;
|
utf8proc_int32_t *buffer;
|
||||||
utf8proc_ssize_t result;
|
utf8proc_ssize_t result;
|
||||||
*dstptr = NULL;
|
*dstptr = NULL;
|
||||||
result = utf8proc_decompose(str, strlen, NULL, 0, options);
|
result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
|
||||||
if (result < 0) return result;
|
if (result < 0) return result;
|
||||||
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
|
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
|
||||||
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
||||||
result = utf8proc_decompose(str, strlen, buffer, result, options);
|
result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
|
||||||
if (result < 0) {
|
if (result < 0) {
|
||||||
free(buffer);
|
free(buffer);
|
||||||
return result;
|
return result;
|
||||||
|
|||||||
38
utf8proc.h
38
utf8proc.h
@ -71,9 +71,9 @@
|
|||||||
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
||||||
#define UTF8PROC_VERSION_MAJOR 2
|
#define UTF8PROC_VERSION_MAJOR 2
|
||||||
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
||||||
#define UTF8PROC_VERSION_MINOR 0
|
#define UTF8PROC_VERSION_MINOR 1
|
||||||
/** The PATCH version (increased for fixes that do not change the API). */
|
/** The PATCH version (increased for fixes that do not change the API). */
|
||||||
#define UTF8PROC_VERSION_PATCH 2
|
#define UTF8PROC_VERSION_PATCH 0
|
||||||
/** @} */
|
/** @} */
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -373,6 +373,13 @@ typedef enum {
|
|||||||
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
|
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
|
||||||
} utf8proc_boundclass_t;
|
} utf8proc_boundclass_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function pointer type passed to @ref utf8proc_map_custom and
|
||||||
|
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
|
||||||
|
* mapping of codepoints to be applied in conjunction with other mappings.
|
||||||
|
*/
|
||||||
|
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||||
* on the first byte.
|
* on the first byte.
|
||||||
@ -480,6 +487,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
|||||||
* `buffer` (which must contain at least `bufsize` entries). In case of
|
* `buffer` (which must contain at least `bufsize` entries). In case of
|
||||||
* success, the number of codepoints written is returned; in case of an
|
* success, the number of codepoints written is returned; in case of an
|
||||||
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||||
|
* See @ref utf8proc_decompose_custom to supply additional transformations.
|
||||||
*
|
*
|
||||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||||
* required buffer size is returned, while the buffer will be overwritten with
|
* required buffer size is returned, while the buffer will be overwritten with
|
||||||
@ -490,6 +498,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
|
||||||
|
* that is called on each codepoint in `str` before any other transformations
|
||||||
|
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||||
|
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||||
|
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
||||||
* in-place (i.e., the result is also stored in `buffer`).
|
* in-place (i.e., the result is also stored in `buffer`).
|
||||||
@ -623,7 +643,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
|
|||||||
* in any case the result will be NULL terminated (though it might
|
* in any case the result will be NULL terminated (though it might
|
||||||
* contain NULL characters within the string if `str` contained NULL
|
* contain NULL characters within the string if `str` contained NULL
|
||||||
* characters). Other flags in the `options` field are passed to the
|
* characters). Other flags in the `options` field are passed to the
|
||||||
* functions defined above, and regarded as described.
|
* functions defined above, and regarded as described. See also
|
||||||
|
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
|
||||||
*
|
*
|
||||||
* In case of success the length of the new string is returned,
|
* In case of success the length of the new string is returned,
|
||||||
* otherwise a negative error code is returned.
|
* otherwise a negative error code is returned.
|
||||||
@ -635,6 +656,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
|||||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
|
||||||
|
* that is called on each codepoint in `str` before any other transformations
|
||||||
|
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||||
|
* The `custom_func` argument is ignored if it is `NULL`.
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||||
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||||
|
utf8proc_custom_func custom_func, void *custom_data
|
||||||
|
);
|
||||||
|
|
||||||
/** @name Unicode normalization
|
/** @name Unicode normalization
|
||||||
*
|
*
|
||||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user