rm ruby and pgsql plugins: keep libutf8proc repo focused exclusively on the C library
This commit is contained in:
parent
ab9520d188
commit
c0f2b512a0
53
README
53
README
@ -45,59 +45,6 @@ The documentation for the C library is found in the utf8proc.h header file.
|
|||||||
strings, unless you want to allocate memory yourself.
|
strings, unless you want to allocate memory yourself.
|
||||||
|
|
||||||
|
|
||||||
*** RUBY API ***
|
|
||||||
|
|
||||||
The ruby library adds the methods "utf8map" and "utf8map!" to the String
|
|
||||||
class, and the method "utf8" to the Integer class.
|
|
||||||
|
|
||||||
The String#utf8map method does the same as the "utf8proc_map" C function.
|
|
||||||
Options for the mapping procedure are passed as symbols, e.g.:
|
|
||||||
"Hello".utf8map(:casefold) => "hello"
|
|
||||||
|
|
||||||
The descriptions of all options are found in the C header file
|
|
||||||
"utf8proc.h". Please note that the corresponding symbols in Ruby are all
|
|
||||||
lowercase.
|
|
||||||
|
|
||||||
String#utf8map! is the destructive variant, meaning that the string
|
|
||||||
is replaced by the result.
|
|
||||||
|
|
||||||
There are shortcuts for the 4 normalization forms specified by Unicode:
|
|
||||||
String#utf8nfd, String#utf8nfd!,
|
|
||||||
String#utf8nfc, String#utf8nfc!,
|
|
||||||
String#utf8nfkd, String#utf8nfkd!,
|
|
||||||
String#utf8nfkc, String#utf8nfkc!
|
|
||||||
|
|
||||||
The method Integer#utf8 returns a UTF-8 string containing the
|
|
||||||
Unicode character given by the code point.
|
|
||||||
0x000A.utf8 => "\n"
|
|
||||||
0x2028.utf8 => "\342\200\250"
|
|
||||||
|
|
||||||
|
|
||||||
*** POSTGRESQL API ***
|
|
||||||
|
|
||||||
For PostgreSQL there are two SQL functions supplied named "unifold" and
|
|
||||||
"unistrip". These functions can be used to prepare index fields in
|
|
||||||
order to be folded in a way where string-comparisons make more sense, e.g.
|
|
||||||
where "bathtub" == "bath<soft hyphen>tub"
|
|
||||||
or "Hello World" == "hello world".
|
|
||||||
|
|
||||||
CREATE TABLE people (
|
|
||||||
id serial8 primary key,
|
|
||||||
name text,
|
|
||||||
CHECK (unifold(name) NOTNULL)
|
|
||||||
);
|
|
||||||
CREATE INDEX name_idx ON people (unifold(name));
|
|
||||||
SELECT * FROM people WHERE unifold(name) = unifold('John Doe');
|
|
||||||
|
|
||||||
The function "unistrip" removes character marks like accents or diaeresis,
|
|
||||||
while "unifold" keeps them.
|
|
||||||
|
|
||||||
NOTICE: The outputs of the function can change between releases, as
|
|
||||||
utf8proc does not follow a versioning stability policy. You have to
|
|
||||||
rebuild your database indices if you upgrade to a newer version
|
|
||||||
of utf8proc.
|
|
||||||
|
|
||||||
|
|
||||||
*** TODO ***
|
*** TODO ***
|
||||||
|
|
||||||
- detect stable code points and process segments independently in order to
|
- detect stable code points and process segments independently in order to
|
||||||
|
|||||||
@ -1,10 +0,0 @@
|
|||||||
utf8proc_pgsql.so: utf8proc_pgsql.o
|
|
||||||
ld -shared -o utf8proc_pgsql.so utf8proc_pgsql.o
|
|
||||||
|
|
||||||
utf8proc_pgsql.o: utf8proc_pgsql.c
|
|
||||||
gcc -Wall -fpic -c -I`pg_config --includedir-server` \
|
|
||||||
-o utf8proc_pgsql.o utf8proc_pgsql.c
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -f *.o *.so
|
|
||||||
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
CREATE OR REPLACE FUNCTION unifold (text) RETURNS text
|
|
||||||
LANGUAGE 'c' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so',
|
|
||||||
'utf8proc_pgsql_unifold';
|
|
||||||
CREATE OR REPLACE FUNCTION unistrip (text) RETURNS text
|
|
||||||
LANGUAGE 'c' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so',
|
|
||||||
'utf8proc_pgsql_unistrip';
|
|
||||||
@ -1,139 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) Public Software Group e. V., Berlin, Germany
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
* copy of this software and associated documentation files (the "Software"),
|
|
||||||
* to deal in the Software without restriction, including without limitation
|
|
||||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
* and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in
|
|
||||||
* all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
||||||
* DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* File name: pgsql/utf8proc_pgsql.c
|
|
||||||
*
|
|
||||||
* Description:
|
|
||||||
* PostgreSQL extension to provide two functions 'unifold' and 'unistrip',
|
|
||||||
* which can be used to case-fold and normalize index fields and
|
|
||||||
* optionally strip marks (e.g. accents) from strings.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
#include "../utf8proc.c"
|
|
||||||
|
|
||||||
#include <postgres.h>
|
|
||||||
#include <utils/elog.h>
|
|
||||||
#include <fmgr.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <utils/builtins.h>
|
|
||||||
|
|
||||||
#ifdef PG_MODULE_MAGIC
|
|
||||||
PG_MODULE_MAGIC;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define UTF8PROC_PGSQL_FOLD_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
|
|
||||||
UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \
|
|
||||||
UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP )
|
|
||||||
#define UTF8PROC_PGSQL_STRIP_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \
|
|
||||||
UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \
|
|
||||||
UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP | UTF8PROC_STRIPMARK )
|
|
||||||
|
|
||||||
/*
 * Runs the payload of a PostgreSQL text value through utf8proc_decompose
 * and utf8proc_reencode using the given option flags.
 *
 * input_string       text value whose data bytes are processed
 * output_string_ptr  receives the newly palloc'ed result buffer; it is set
 *                    as soon as the buffer has been allocated, so it may be
 *                    non-NULL even when a negative value is returned
 * options            utf8proc option flags forwarded unchanged
 *
 * Returns the value of utf8proc_reencode on success (used as the payload
 * size of the output text), or a negative UTF8PROC_ERROR_* code.
 */
ssize_t utf8proc_pgsql_utf8map(
  text *input_string, text **output_string_ptr, int options
) {
  text *mapped;
  ssize_t count;

  /* Sizing pass: no output buffer; the result is the slot count used below. */
  count = utf8proc_decompose(
    VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
    NULL, 0, options
  );
  if (count < 0) {
    return count;
  }
  /* Refuse sizes for which the allocation arithmetic below would wrap. */
  if (count > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t)) {
    return UTF8PROC_ERROR_OVERFLOW;
  }

  /* reserve one extra byte for termination */
  mapped = palloc(count * sizeof(int32_t) + 1 + VARHDRSZ);
  *output_string_ptr = mapped;
  if (!mapped) {
    return UTF8PROC_ERROR_NOMEM;
  }

  /* Second pass: write the int32_t sequence into the data area. */
  count = utf8proc_decompose(
    VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ,
    (int32_t *)VARDATA(mapped), count, options
  );
  if (count < 0) {
    return count;
  }

  /* Re-encode in place, then record the final size in the varlena header. */
  count = utf8proc_reencode(
    (int32_t *)VARDATA(mapped), count, options
  );
  if (count >= 0) {
    SET_VARSIZE(mapped, count + VARHDRSZ);
  }
  return count;
}
|
|
||||||
|
|
||||||
/*
 * Post-processes the status returned by utf8proc_pgsql_utf8map.
 *
 * Non-negative results are left alone.  For a negative result the output
 * buffer (if any) is released; UTF8PROC_ERROR_INVALIDUTF8 and
 * UTF8PROC_ERROR_NOTASSIGNED then return quietly to the caller, while every
 * other error code is reported through ereport(ERROR, ...).
 */
void utf8proc_pgsql_utf8map_errchk(ssize_t result, text *output_string) {
  int sqlerrcode;

  if (result >= 0) {
    return;
  }

  if (output_string) {
    pfree(output_string);
  }

  /* Bad input data is not raised as an SQL error; the caller decides. */
  if (result == UTF8PROC_ERROR_INVALIDUTF8 ||
      result == UTF8PROC_ERROR_NOTASSIGNED) {
    return;
  }

  if (result == UTF8PROC_ERROR_NOMEM) {
    sqlerrcode = ERRCODE_OUT_OF_MEMORY;
  } else if (result == UTF8PROC_ERROR_OVERFLOW) {
    sqlerrcode = ERRCODE_PROGRAM_LIMIT_EXCEEDED;
  } else {
    sqlerrcode = ERRCODE_INTERNAL_ERROR;
  }

  ereport(ERROR, (
    errcode(sqlerrcode),
    errmsg("%s", utf8proc_errmsg(result))
  ));
}
|
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold);
|
|
||||||
Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
|
|
||||||
text *input_string;
|
|
||||||
text *output_string = NULL;
|
|
||||||
ssize_t result;
|
|
||||||
input_string = PG_GETARG_TEXT_P(0);
|
|
||||||
result = utf8proc_pgsql_utf8map(
|
|
||||||
input_string, &output_string, UTF8PROC_PGSQL_FOLD_OPTS
|
|
||||||
);
|
|
||||||
PG_FREE_IF_COPY(input_string, 0);
|
|
||||||
utf8proc_pgsql_utf8map_errchk(result, output_string);
|
|
||||||
if (result >= 0) {
|
|
||||||
PG_RETURN_TEXT_P(output_string);
|
|
||||||
} else {
|
|
||||||
PG_RETURN_NULL();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(utf8proc_pgsql_unistrip);
|
|
||||||
Datum utf8proc_pgsql_unistrip(PG_FUNCTION_ARGS) {
|
|
||||||
text *input_string;
|
|
||||||
text *output_string = NULL;
|
|
||||||
ssize_t result;
|
|
||||||
input_string = PG_GETARG_TEXT_P(0);
|
|
||||||
result = utf8proc_pgsql_utf8map(
|
|
||||||
input_string, &output_string, UTF8PROC_PGSQL_STRIP_OPTS
|
|
||||||
);
|
|
||||||
PG_FREE_IF_COPY(input_string, 0);
|
|
||||||
utf8proc_pgsql_utf8map_errchk(result, output_string);
|
|
||||||
if (result >= 0) {
|
|
||||||
PG_RETURN_TEXT_P(output_string);
|
|
||||||
} else {
|
|
||||||
PG_RETURN_NULL();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
require 'mkmf'
|
|
||||||
create_makefile("utf8proc_native")
|
|
||||||
@ -1,64 +0,0 @@
|
|||||||
|
|
||||||
Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
copy of this software and associated documentation files (the "Software"),
|
|
||||||
to deal in the Software without restriction, including without limitation
|
|
||||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
Software is furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in
|
|
||||||
all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
||||||
DEALINGS IN THE SOFTWARE.
|
|
||||||
|
|
||||||
|
|
||||||
This software distribution contains derived data from a modified version of
|
|
||||||
the Unicode data files. The following license applies to that data:
|
|
||||||
|
|
||||||
COPYRIGHT AND PERMISSION NOTICE
|
|
||||||
|
|
||||||
Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
|
|
||||||
under the Terms of Use in http://www.unicode.org/copyright.html.
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
copy of the Unicode data files and any associated documentation (the "Data
|
|
||||||
Files") or Unicode software and any associated documentation (the
|
|
||||||
"Software") to deal in the Data Files or Software without restriction,
|
|
||||||
including without limitation the rights to use, copy, modify, merge,
|
|
||||||
publish, distribute, and/or sell copies of the Data Files or Software, and
|
|
||||||
to permit persons to whom the Data Files or Software are furnished to do
|
|
||||||
so, provided that (a) the above copyright notice(s) and this permission
|
|
||||||
notice appear with all copies of the Data Files or Software, (b) both the
|
|
||||||
above copyright notice(s) and this permission notice appear in associated
|
|
||||||
documentation, and (c) there is clear notice in each modified Data File or
|
|
||||||
in the Software as well as in the documentation associated with the Data
|
|
||||||
File(s) or Software that the data or software has been modified.
|
|
||||||
|
|
||||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
|
||||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
|
|
||||||
THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
|
|
||||||
INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
|
|
||||||
CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
|
|
||||||
USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
||||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
||||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
|
||||||
|
|
||||||
Except as contained in this notice, the name of a copyright holder shall
|
|
||||||
not be used in advertising or otherwise to promote the sale, use or other
|
|
||||||
dealings in these Data Files or Software without prior written
|
|
||||||
authorization of the copyright holder.
|
|
||||||
|
|
||||||
|
|
||||||
Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
|
|
||||||
registered in some jurisdictions. All other trademarks and registered
|
|
||||||
trademarks mentioned herein are the property of their respective owners.
|
|
||||||
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
require 'rubygems'
|
|
||||||
SPEC = Gem::Specification.new do |s|
|
|
||||||
s.name = 'utf8proc'
|
|
||||||
s.version = '1.1.6'
|
|
||||||
s.author = 'Public Software Group e. V., Berlin, Germany'
|
|
||||||
s.homepage = 'http://www.public-software-group.org/utf8proc'
|
|
||||||
s.summary = 'UTF-8 Unicode string processing'
|
|
||||||
s.files = ['LICENSE', 'lib/utf8proc.rb', 'ext/utf8proc_native.c']
|
|
||||||
s.require_path = 'lib/'
|
|
||||||
s.extensions = ['ext/extconf.rb']
|
|
||||||
s.has_rdoc = false
|
|
||||||
end
|
|
||||||
@ -1,98 +0,0 @@
|
|||||||
# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
|
||||||
#
|
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
# copy of this software and associated documentation files (the "Software"),
|
|
||||||
# to deal in the Software without restriction, including without limitation
|
|
||||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
# and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
# Software is furnished to do so, subject to the following conditions:
|
|
||||||
#
|
|
||||||
# The above copyright notice and this permission notice shall be included in
|
|
||||||
# all copies or substantial portions of the Software.
|
|
||||||
#
|
|
||||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
||||||
# DEALINGS IN THE SOFTWARE.
|
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# File name: ruby/utf8proc.rb
|
|
||||||
#
|
|
||||||
# Description:
|
|
||||||
# Part of the ruby wrapper for libutf8proc, which is written in ruby.
|
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
require 'utf8proc_native'
|
|
||||||
|
|
||||||
|
|
||||||
module Utf8Proc

  # Byte sequences for a set of control and separator characters, keyed by
  # symbol.
  SpecialChars = {
    :HT => "\x09",
    :LF => "\x0A",
    :VT => "\x0B",
    :FF => "\x0C",
    :CR => "\x0D",
    :FS => "\x1C",
    :GS => "\x1D",
    :RS => "\x1E",
    :US => "\x1F",
    :LS => "\xE2\x80\xA8",
    :PS => "\xE2\x80\xA9",
  }

  # Methods mixed into String.
  module StringExtensions

    # Looks every given option symbol up in Utf8Proc::Options, ORs the
    # resulting flags together and passes the string plus the combined
    # flags to Utf8Proc::utf8map.  Raises ArgumentError for an option that
    # has no entry in Utf8Proc::Options.
    def utf8map(*option_array)
      options = option_array.inject(0) { |bits, option|
        flag = Utf8Proc::Options[option]
        unless flag
          raise ArgumentError, "Unknown argument given to String#utf8map."
        end
        bits | flag
      }
      Utf8Proc::utf8map(self, options)
    end

    # Like #utf8map, but replaces the receiver's content with the result.
    def utf8map!(*option_array)
      replace(utf8map(*option_array))
    end

    # Shortcuts for fixed option combinations; the "!" variants modify the
    # receiver in place.
    def utf8nfd
      utf8map(:stable, :decompose)
    end

    def utf8nfd!
      utf8map!(:stable, :decompose)
    end

    def utf8nfc
      utf8map(:stable, :compose)
    end

    def utf8nfc!
      utf8map!(:stable, :compose)
    end

    def utf8nfkd
      utf8map(:stable, :decompose, :compat)
    end

    def utf8nfkd!
      utf8map!(:stable, :decompose, :compat)
    end

    def utf8nfkc
      utf8map(:stable, :compose, :compat)
    end

    def utf8nfkc!
      utf8map!(:stable, :compose, :compat)
    end

    # Maps the string with :charbound and splits the result at every "\377"
    # byte, dropping an empty leading element if there is one.
    def utf8chars
      pieces = utf8map(:charbound).split("\377")
      pieces.shift if pieces.first == ""
      pieces
    end

    # Deprecated, use String#utf8chars instead.
    def char_ary
      utf8chars
    end

  end

  # Methods mixed into Integer.
  module IntegerExtensions

    # Passes the integer to Utf8Proc::utf8char and returns its result.
    def utf8
      Utf8Proc::utf8char(self)
    end

  end

end
|
|
||||||
|
|
||||||
|
|
||||||
# Make the utf8proc string helpers available on every String.
class String
  include Utf8Proc::StringExtensions
end
|
|
||||||
|
|
||||||
# Make the utf8proc integer helper available on every Integer.
class Integer
  include Utf8Proc::IntegerExtensions
end
|
|
||||||
|
|
||||||
@ -1,160 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
* copy of this software and associated documentation files (the "Software"),
|
|
||||||
* to deal in the Software without restriction, including without limitation
|
|
||||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
||||||
* and/or sell copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in
|
|
||||||
* all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
||||||
* DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* File name: ruby/utf8proc_native.c
|
|
||||||
*
|
|
||||||
* Description:
|
|
||||||
* Native part of the ruby wrapper for libutf8proc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
#include "../utf8proc.c"
|
|
||||||
#include "ruby.h"
|
|
||||||
|
|
||||||
#ifndef RSTRING_PTR
|
|
||||||
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
|
||||||
#endif
|
|
||||||
#ifndef RSTRING_LEN
|
|
||||||
#define RSTRING_LEN(s) (RSTRING(s)->len)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * Holder for the temporary code point buffer used by utf8proc_ruby_map.
 * It is wrapped in a Ruby data object so that the buffer is released by
 * utf8proc_ruby_mapenv_free when that object is collected.
 */
typedef struct utf8proc_ruby_mapenv_struct {
  int32_t *buffer;  /* NULL whenever no buffer is currently owned */
} utf8proc_ruby_mapenv_t;

/*
 * Free function registered with Data_Make_Struct.
 *
 * Both the struct (allocated by Data_Make_Struct) and its buffer (allocated
 * with ALLOC_N) come from Ruby's allocator, so they are released with xfree
 * rather than the C library's free.  xfree accepts a NULL buffer.
 */
void utf8proc_ruby_mapenv_free(utf8proc_ruby_mapenv_t *env) {
  xfree(env->buffer);
  xfree(env);
}
|
|
||||||
|
|
||||||
VALUE utf8proc_ruby_module;
|
|
||||||
VALUE utf8proc_ruby_options;
|
|
||||||
VALUE utf8proc_ruby_eUnicodeError;
|
|
||||||
VALUE utf8proc_ruby_eInvalidUtf8Error;
|
|
||||||
VALUE utf8proc_ruby_eCodeNotAssignedError;
|
|
||||||
|
|
||||||
VALUE utf8proc_ruby_map_error(ssize_t result) {
|
|
||||||
VALUE excpt_class;
|
|
||||||
switch (result) {
|
|
||||||
case UTF8PROC_ERROR_NOMEM:
|
|
||||||
excpt_class = rb_eNoMemError; break;
|
|
||||||
case UTF8PROC_ERROR_OVERFLOW:
|
|
||||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
|
||||||
excpt_class = rb_eArgError; break;
|
|
||||||
case UTF8PROC_ERROR_INVALIDUTF8:
|
|
||||||
excpt_class = utf8proc_ruby_eInvalidUtf8Error; break;
|
|
||||||
case UTF8PROC_ERROR_NOTASSIGNED:
|
|
||||||
excpt_class = utf8proc_ruby_eCodeNotAssignedError; break;
|
|
||||||
default:
|
|
||||||
excpt_class = rb_eRuntimeError;
|
|
||||||
}
|
|
||||||
rb_raise(excpt_class, "%s", utf8proc_errmsg(result));
|
|
||||||
return Qnil;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Utf8Proc.utf8map(str, options): maps a Ruby string through
 * utf8proc_decompose and utf8proc_reencode.
 *
 * str_param      value converted with StringValue
 * options_param  integer option flags; UTF8PROC_NULLTERM is always cleared
 *                because the string length is passed explicitly
 *
 * Returns a new Ruby string built from the re-encoded buffer.  On a
 * negative utf8proc result the matching exception is raised through
 * utf8proc_ruby_map_error.
 *
 * The work buffer is allocated with ALLOC_N, i.e. by Ruby's allocator, and
 * is therefore released with xfree instead of the C library's free.
 */
VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) {
  VALUE str;
  int options;
  VALUE env_obj;
  utf8proc_ruby_mapenv_t *env;
  ssize_t result;
  VALUE retval;

  str = StringValue(str_param);
  options = NUM2INT(options_param) & ~UTF8PROC_NULLTERM;

  /*
   * The buffer lives inside a Ruby data object whose free function is
   * utf8proc_ruby_mapenv_free.  NOTE(review): presumably this lets the GC
   * reclaim the buffer if an allocation below raises - confirm.
   */
  env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL,
    utf8proc_ruby_mapenv_free, env);

  /* Sizing pass without an output buffer. */
  result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str),
    NULL, 0, options);
  if (result >= 0) {
    env->buffer = ALLOC_N(int32_t, result+1);
    result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str),
      env->buffer, result, options);
  }
  if (result >= 0) {
    result = utf8proc_reencode(env->buffer, result, options);
  }

  if (result < 0) {
    /* Single cleanup path; the buffer is still NULL if sizing failed. */
    xfree(env->buffer);
    env->buffer = 0;
    utf8proc_ruby_map_error(result);
    return Qnil;  /* needed to prevent problems with optimization */
  }

  retval = rb_str_new((char *)env->buffer, result);
  xfree(env->buffer);
  env->buffer = 0;
  return retval;
}
|
|
||||||
|
|
||||||
/*
 * Utf8Proc.utf8char(code): encodes one code point with
 * utf8proc_encode_char and returns the bytes as a new Ruby string.
 * Raises ArgumentError when utf8proc_codepoint_valid rejects the value.
 */
static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) {
  char encoded[4];
  ssize_t length;
  int codepoint = NUM2INT(code_param);

  if (!utf8proc_codepoint_valid(codepoint)) {
    rb_raise(rb_eArgError, "Invalid Unicode code point");
  }

  length = utf8proc_encode_char(codepoint, encoded);
  return rb_str_new(encoded, length);
}
|
|
||||||
|
|
||||||
#define register_utf8proc_option(sym, field) \
|
|
||||||
rb_hash_aset(utf8proc_ruby_options, ID2SYM(rb_intern(sym)), INT2FIX(field))
|
|
||||||
|
|
||||||
void Init_utf8proc_native() {
|
|
||||||
utf8proc_ruby_module = rb_define_module("Utf8Proc");
|
|
||||||
rb_define_module_function(utf8proc_ruby_module, "utf8map",
|
|
||||||
utf8proc_ruby_map, 2);
|
|
||||||
rb_define_module_function(utf8proc_ruby_module, "utf8char",
|
|
||||||
utf8proc_ruby_char, 1);
|
|
||||||
utf8proc_ruby_eUnicodeError = rb_define_class_under(utf8proc_ruby_module,
|
|
||||||
"UnicodeError", rb_eStandardError);
|
|
||||||
utf8proc_ruby_eInvalidUtf8Error = rb_define_class_under(
|
|
||||||
utf8proc_ruby_module, "InvalidUtf8Error", utf8proc_ruby_eUnicodeError);
|
|
||||||
utf8proc_ruby_eCodeNotAssignedError = rb_define_class_under(
|
|
||||||
utf8proc_ruby_module, "CodeNotAssignedError",
|
|
||||||
utf8proc_ruby_eUnicodeError);
|
|
||||||
utf8proc_ruby_options = rb_hash_new();
|
|
||||||
register_utf8proc_option("stable", UTF8PROC_STABLE);
|
|
||||||
register_utf8proc_option("compat", UTF8PROC_COMPAT);
|
|
||||||
register_utf8proc_option("compose", UTF8PROC_COMPOSE);
|
|
||||||
register_utf8proc_option("decompose", UTF8PROC_DECOMPOSE);
|
|
||||||
register_utf8proc_option("ignore", UTF8PROC_IGNORE);
|
|
||||||
register_utf8proc_option("rejectna", UTF8PROC_REJECTNA);
|
|
||||||
register_utf8proc_option("nlf2ls", UTF8PROC_NLF2LS);
|
|
||||||
register_utf8proc_option("nlf2ps", UTF8PROC_NLF2PS);
|
|
||||||
register_utf8proc_option("nlf2lf", UTF8PROC_NLF2LF);
|
|
||||||
register_utf8proc_option("stripcc", UTF8PROC_STRIPCC);
|
|
||||||
register_utf8proc_option("casefold", UTF8PROC_CASEFOLD);
|
|
||||||
register_utf8proc_option("charbound", UTF8PROC_CHARBOUND);
|
|
||||||
register_utf8proc_option("lump", UTF8PROC_LUMP);
|
|
||||||
register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK);
|
|
||||||
OBJ_FREEZE(utf8proc_ruby_options);
|
|
||||||
rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options);
|
|
||||||
}
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue
Block a user