Tlsa/ucs4 normalize (#88)
* Split codepoint sequence normalisation out into separate function.

  This creates utf8proc_normalize_utf32(), which takes and returns a UTF-32 string, applying the following options:
  - UTF8PROC_NLF2LS
  - UTF8PROC_NLF2PS
  - UTF8PROC_NLF2LF
  - UTF8PROC_STRIPCC
  - UTF8PROC_COMPOSE
  - UTF8PROC_STABLE

  The utf8proc_reencode() function has been updated to call the new utf8proc_normalize_utf32().

* Update code documentation: utf8proc_reencode handles UTF8PROC_CHARBOUND.
This commit is contained in:
parent
caef918abd
commit
70bbed8626
13
utf8proc.c
13
utf8proc.c
@ -545,9 +545,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
return wpos;
|
return wpos;
|
||||||
}
|
}
|
||||||
|
|
||||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
|
||||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
|
||||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||||
utf8proc_ssize_t rpos;
|
utf8proc_ssize_t rpos;
|
||||||
utf8proc_ssize_t wpos = 0;
|
utf8proc_ssize_t wpos = 0;
|
||||||
@ -655,6 +654,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
|
|||||||
}
|
}
|
||||||
length = wpos;
|
length = wpos;
|
||||||
}
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
|
||||||
|
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||||
|
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||||
|
length = utf8proc_normalize_utf32(buffer, length, options);
|
||||||
|
if (length < 0) return length;
|
||||||
{
|
{
|
||||||
utf8proc_ssize_t rpos, wpos = 0;
|
utf8proc_ssize_t rpos, wpos = 0;
|
||||||
utf8proc_int32_t uc;
|
utf8proc_int32_t uc;
|
||||||
|
|||||||
34
utf8proc.h
34
utf8proc.h
@ -491,8 +491,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
);
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
|
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
||||||
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
* in-place (i.e., the result is also stored in `buffer`).
|
||||||
*
|
*
|
||||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
* @param buffer the (native-endian UTF-32) unicode codepoints to normalize.
|
||||||
* @param length the length (in codepoints) of the buffer.
|
* @param length the length (in codepoints) of the buffer.
|
||||||
@ -507,9 +507,37 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
|||||||
* the unicode versioning stability
|
* the unicode versioning stability
|
||||||
*
|
*
|
||||||
* @return
|
* @return
|
||||||
* In case of success, the length (in bytes) of the resulting UTF-8 string is
|
* In case of success, the length (in codepoints) of the normalized UTF-32 string is
|
||||||
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
||||||
*
|
*
|
||||||
|
* @warning The entries of the array pointed to by `buffer` have to be in the
|
||||||
|
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||||
|
*/
|
||||||
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
|
||||||
|
* as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
||||||
|
* Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
|
||||||
|
*
|
||||||
|
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||||
|
* @param length the length (in codepoints) of the buffer.
|
||||||
|
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||||
|
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||||
|
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||||
|
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||||
|
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||||
|
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||||
|
* codepoints
|
||||||
|
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||||
|
* the unicode versioning stability
|
||||||
|
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* In case of success, the length (in bytes) of the resulting nul-terminated
|
||||||
|
* UTF-8 string is returned; otherwise, a negative error code is returned
|
||||||
|
* (@ref utf8proc_errmsg).
|
||||||
|
*
|
||||||
* @warning The amount of free space pointed to by `buffer` must
|
* @warning The amount of free space pointed to by `buffer` must
|
||||||
* exceed the amount of the input data by one byte, and the
|
* exceed the amount of the input data by one byte, and the
|
||||||
* entries of the array pointed to by `buffer` have to be in the
|
* entries of the array pointed to by `buffer` have to be in the
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user