rm ruby and pgsql plugins: keep libutf8proc repo focused exclusively on the C library
This commit is contained in:
parent
ab9520d188
commit
c0f2b512a0
53
README
53
README
@ -45,59 +45,6 @@ The documentation for the C library is found in the utf8proc.h header file.
|
||||
strings, unless you want to allocate memory yourself.
|
||||
|
||||
|
||||
*** RUBY API ***
|
||||
|
||||
The ruby library adds the methods "utf8map" and "utf8map!" to the String
|
||||
class, and the method "utf8" to the Integer class.
|
||||
|
||||
The String#utf8map method does the same as the "utf8proc_map" C function.
|
||||
Options for the mapping procedure are passed as symbols, e.g.:
|
||||
"Hello".utf8map(:casefold) => "hello"
|
||||
|
||||
The descriptions of all options are found in the C header file
|
||||
"utf8proc.h". Please note that the corresponding symbols in Ruby are all
|
||||
lowercase.
|
||||
|
||||
String#utf8map! is the destructive variant, meaning that the string
|
||||
is replaced by the result.
|
||||
|
||||
There are shortcuts for the 4 normalization forms specified by Unicode:
|
||||
String#utf8nfd, String#utf8nfd!,
|
||||
String#utf8nfc, String#utf8nfc!,
|
||||
String#utf8nfkd, String#utf8nfkd!,
|
||||
String#utf8nfkc, String#utf8nfkc!
|
||||
|
||||
The method Integer#utf8 returns a UTF-8 string containing the
|
||||
unicode char given by the code point.
|
||||
0x000A.utf8 => "\n"
|
||||
0x2028.utf8 => "\342\200\250"
|
||||
|
||||
|
||||
*** POSTGRESQL API ***
|
||||
|
||||
For PostgreSQL there are two SQL functions supplied named "unifold" and
|
||||
"unistrip". These functions can be used to prepare index fields in
|
||||
order to be folded in a way where string-comparisons make more sense, e.g.
|
||||
where "bathtub" == "bath<soft hyphen>tub"
|
||||
or "Hello World" == "hello world".
|
||||
|
||||
CREATE TABLE people (
|
||||
id serial8 primary key,
|
||||
name text,
|
||||
CHECK (unifold(name) NOTNULL)
|
||||
);
|
||||
CREATE INDEX name_idx ON people (unifold(name));
|
||||
SELECT * FROM people WHERE unifold(name) = unifold('John Doe');
|
||||
|
||||
The function "unistrip" removes character marks like accents or diaeresis,
|
||||
while "unifold" keeps them.
|
||||
|
||||
NOTICE: The outputs of the function can change between releases, as
|
||||
utf8proc does not follow a versioning stability policy. You have to
|
||||
rebuild your database indices if you upgrade to a newer version
|
||||
of utf8proc.
|
||||
|
||||
|
||||
*** TODO ***
|
||||
|
||||
- detect stable code points and process segments independently in order to
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
utf8proc_pgsql.so: utf8proc_pgsql.o
|
||||
ld -shared -o utf8proc_pgsql.so utf8proc_pgsql.o
|
||||
|
||||
utf8proc_pgsql.o: utf8proc_pgsql.c
|
||||
gcc -Wall -fpic -c -I`pg_config --includedir-server` \
|
||||
-o utf8proc_pgsql.o utf8proc_pgsql.c
|
||||
|
||||
clean:
|
||||
rm -f *.o *.so
|
||||
|
||||
@ -1,6 +0,0 @@
|
||||
CREATE OR REPLACE FUNCTION unifold (text) RETURNS text
|
||||
LANGUAGE 'c' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so',
|
||||
'utf8proc_pgsql_unifold';
|
||||
CREATE OR REPLACE FUNCTION unistrip (text) RETURNS text
|
||||
LANGUAGE 'c' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so',
|
||||
'utf8proc_pgsql_unistrip';
|
||||
@ -1,139 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: pgsql/utf8proc_pgsql.c
|
||||
*
|
||||
* Description:
|
||||
* PostgreSQL extension to provide two functions 'unifold' and 'unistrip',
|
||||
* which can be used to case-fold and normalize index fields and
|
||||
* optionally strip marks (e.g. accents) from strings.
|
||||
*/
|
||||
|
||||
|
||||
#include "../utf8proc.c"
|
||||
|
||||
#include <postgres.h>
|
||||
#include <utils/elog.h>
|
||||
#include <fmgr.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <utils/builtins.h>
|
||||
|
||||
#ifdef PG_MODULE_MAGIC
|
||||
PG_MODULE_MAGIC;
|
||||
#endif
|
||||
|
||||
/*
 * utf8proc option sets used by the two SQL-callable functions in this file.
 * The "strip" set is the "fold" set with UTF8PROC_STRIPMARK added.
 */
#define UTF8PROC_PGSQL_FOLD_OPTS ( \
  UTF8PROC_REJECTNA | UTF8PROC_COMPAT   | UTF8PROC_COMPOSE | \
  UTF8PROC_STABLE   | UTF8PROC_IGNORE   | UTF8PROC_STRIPCC | \
  UTF8PROC_NLF2LF   | UTF8PROC_CASEFOLD | UTF8PROC_LUMP )

#define UTF8PROC_PGSQL_STRIP_OPTS \
  ( UTF8PROC_PGSQL_FOLD_OPTS | UTF8PROC_STRIPMARK )
|
||||
|
||||
/*
 * Map a PostgreSQL text value through utf8proc with the given options.
 *
 * input_string       text value to map; only read.
 * output_string_ptr  receives the palloc'ed result buffer as soon as it has
 *                    been allocated.  It is left untouched when the function
 *                    fails before allocating.
 * options            bitwise OR of UTF8PROC_* option flags.
 *
 * Returns the byte length of the mapped string (>= 0) on success, in which
 * case the varlena size of the output has been set.  Returns a negative
 * UTF8PROC_ERROR_* code on failure; a buffer already stored in
 * *output_string_ptr is NOT released here.
 */
ssize_t utf8proc_pgsql_utf8map(
  text *input_string, text **output_string_ptr, int options
) {
  char *src = VARDATA(input_string);
  ssize_t src_len = VARSIZE(input_string) - VARHDRSZ;
  text *dest;
  ssize_t count;

  /* First pass without a buffer: only determines how many code points
     the decomposition produces. */
  count = utf8proc_decompose(src, src_len, NULL, 0, options);
  if (count < 0) {
    return count;
  }

  /* Refuse sizes whose byte count would overflow size_t below. */
  if ((size_t)count > (SIZE_MAX - 1 - VARHDRSZ) / sizeof(int32_t)) {
    return UTF8PROC_ERROR_OVERFLOW;
  }

  /* One int32_t per code point, one extra byte for termination, plus the
     varlena header. */
  dest = palloc(count * sizeof(int32_t) + 1 + VARHDRSZ);
  *output_string_ptr = dest;
  if (!dest) {
    return UTF8PROC_ERROR_NOMEM;
  }

  /* Second pass: decompose into the payload area of the output value. */
  count = utf8proc_decompose(
    src, src_len, (int32_t *)VARDATA(dest), count, options
  );
  if (count < 0) {
    return count;
  }

  /* Re-encode the code points in place; the result is a byte count. */
  count = utf8proc_reencode((int32_t *)VARDATA(dest), count, options);
  if (count >= 0) {
    SET_VARSIZE(dest, count + VARHDRSZ);
  }
  return count;
}
|
||||
|
||||
/*
 * Turn a utf8proc_pgsql_utf8map() result into PostgreSQL error handling.
 *
 * result         return value of utf8proc_pgsql_utf8map().
 * output_string  buffer produced by that call, or NULL if none.
 *
 * Does nothing for non-negative results.  On any error the output buffer
 * (if present) is released.  Invalid UTF-8 and unassigned code points are
 * then reported by returning silently; every other error is raised via
 * ereport(ERROR, ...) with the utf8proc error message.
 */
void utf8proc_pgsql_utf8map_errchk(ssize_t result, text *output_string) {
  int sqlerrcode;

  if (result >= 0) {
    return;
  }

  if (output_string) {
    pfree(output_string);
  }

  /* Data problems are not raised as SQL errors; the caller decides what
     to return for them. */
  if (result == UTF8PROC_ERROR_INVALIDUTF8 ||
      result == UTF8PROC_ERROR_NOTASSIGNED) {
    return;
  }

  if (result == UTF8PROC_ERROR_NOMEM) {
    sqlerrcode = ERRCODE_OUT_OF_MEMORY;
  } else if (result == UTF8PROC_ERROR_OVERFLOW) {
    sqlerrcode = ERRCODE_PROGRAM_LIMIT_EXCEEDED;
  } else {
    sqlerrcode = ERRCODE_INTERNAL_ERROR;
  }

  ereport(ERROR, (
    errcode(sqlerrcode),
    errmsg("%s", utf8proc_errmsg(result))
  ));
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold);
|
||||
Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) {
|
||||
text *input_string;
|
||||
text *output_string = NULL;
|
||||
ssize_t result;
|
||||
input_string = PG_GETARG_TEXT_P(0);
|
||||
result = utf8proc_pgsql_utf8map(
|
||||
input_string, &output_string, UTF8PROC_PGSQL_FOLD_OPTS
|
||||
);
|
||||
PG_FREE_IF_COPY(input_string, 0);
|
||||
utf8proc_pgsql_utf8map_errchk(result, output_string);
|
||||
if (result >= 0) {
|
||||
PG_RETURN_TEXT_P(output_string);
|
||||
} else {
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(utf8proc_pgsql_unistrip);
|
||||
Datum utf8proc_pgsql_unistrip(PG_FUNCTION_ARGS) {
|
||||
text *input_string;
|
||||
text *output_string = NULL;
|
||||
ssize_t result;
|
||||
input_string = PG_GETARG_TEXT_P(0);
|
||||
result = utf8proc_pgsql_utf8map(
|
||||
input_string, &output_string, UTF8PROC_PGSQL_STRIP_OPTS
|
||||
);
|
||||
PG_FREE_IF_COPY(input_string, 0);
|
||||
utf8proc_pgsql_utf8map_errchk(result, output_string);
|
||||
if (result >= 0) {
|
||||
PG_RETURN_TEXT_P(output_string);
|
||||
} else {
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,2 +0,0 @@
|
||||
require 'mkmf'
|
||||
create_makefile("utf8proc_native")
|
||||
@ -1,64 +0,0 @@
|
||||
|
||||
Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
This software distribution contains derived data from a modified version of
|
||||
the Unicode data files. The following license applies to that data:
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
|
||||
under the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of the Unicode data files and any associated documentation (the "Data
|
||||
Files") or Unicode software and any associated documentation (the
|
||||
"Software") to deal in the Data Files or Software without restriction,
|
||||
including without limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, and/or sell copies of the Data Files or Software, and
|
||||
to permit persons to whom the Data Files or Software are furnished to do
|
||||
so, provided that (a) the above copyright notice(s) and this permission
|
||||
notice appear with all copies of the Data Files or Software, (b) both the
|
||||
above copyright notice(s) and this permission notice appear in associated
|
||||
documentation, and (c) there is clear notice in each modified Data File or
|
||||
in the Software as well as in the documentation associated with the Data
|
||||
File(s) or Software that the data or software has been modified.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
|
||||
THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
|
||||
INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
|
||||
CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
|
||||
USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder shall
|
||||
not be used in advertising or otherwise to promote the sale, use or other
|
||||
dealings in these Data Files or Software without prior written
|
||||
authorization of the copyright holder.
|
||||
|
||||
|
||||
Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
|
||||
registered in some jurisdictions. All other trademarks and registered
|
||||
trademarks mentioned herein are the property of their respective owners.
|
||||
|
||||
@ -1,12 +0,0 @@
|
||||
require 'rubygems'
|
||||
SPEC = Gem::Specification.new do |s|
|
||||
s.name = 'utf8proc'
|
||||
s.version = '1.1.6'
|
||||
s.author = 'Public Software Group e. V., Berlin, Germany'
|
||||
s.homepage = 'http://www.public-software-group.org/utf8proc'
|
||||
s.summary = 'UTF-8 Unicode string processing'
|
||||
s.files = ['LICENSE', 'lib/utf8proc.rb', 'ext/utf8proc_native.c']
|
||||
s.require_path = 'lib/'
|
||||
s.extensions = ['ext/extconf.rb']
|
||||
s.has_rdoc = false
|
||||
end
|
||||
@ -1,98 +0,0 @@
|
||||
# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
#
|
||||
# File name: ruby/utf8proc.rb
|
||||
#
|
||||
# Description:
|
||||
# Part of the ruby wrapper for libutf8proc, which is written in ruby.
|
||||
#
|
||||
|
||||
|
||||
require 'utf8proc_native'
|
||||
|
||||
|
||||
module Utf8Proc

  # Frequently needed control and separator characters as byte strings.
  # LS and PS are the UTF-8 encodings of U+2028 and U+2029.
  SpecialChars = {
    :HT => "\x09",
    :LF => "\x0A",
    :VT => "\x0B",
    :FF => "\x0C",
    :CR => "\x0D",
    :FS => "\x1C",
    :GS => "\x1D",
    :RS => "\x1E",
    :US => "\x1F",
    :LS => "\xE2\x80\xA8",
    :PS => "\xE2\x80\xA9",
  }

  # Methods mixed into String.
  module StringExtensions

    # Maps the string through Utf8Proc::utf8map and returns the result.
    # Each argument is a key of Utf8Proc::Options; the looked-up flags are
    # OR-ed together. Raises ArgumentError for a key that is not found.
    def utf8map(*option_array)
      options = option_array.inject(0) do |bits, option|
        flag = Utf8Proc::Options[option]
        unless flag
          raise ArgumentError, "Unknown argument given to String#utf8map."
        end
        bits | flag
      end
      Utf8Proc::utf8map(self, options)
    end

    # Like #utf8map, but replaces the receiver's contents with the result.
    def utf8map!(*option_array)
      replace(utf8map(*option_array))
    end

    # Shortcuts for the four normalization forms (NFD, NFC, NFKD, NFKC),
    # each with an in-place "!" variant.
    def utf8nfd
      utf8map(:stable, :decompose)
    end

    def utf8nfd!
      utf8map!(:stable, :decompose)
    end

    def utf8nfc
      utf8map(:stable, :compose)
    end

    def utf8nfc!
      utf8map!(:stable, :compose)
    end

    def utf8nfkd
      utf8map(:stable, :decompose, :compat)
    end

    def utf8nfkd!
      utf8map!(:stable, :decompose, :compat)
    end

    def utf8nfkc
      utf8map(:stable, :compose, :compat)
    end

    def utf8nfkc!
      utf8map!(:stable, :compose, :compat)
    end

    # Returns an array of the pieces obtained by mapping with :charbound
    # and splitting the result on the 0xFF byte. A leading empty piece is
    # dropped.
    def utf8chars
      pieces = utf8map(:charbound).split("\377")
      pieces.shift if pieces.first == ""
      pieces
    end

    # Deprecated alias; use String#utf8chars instead.
    def char_ary
      utf8chars
    end
  end

  # Methods mixed into Integer.
  module IntegerExtensions

    # Returns the result of Utf8Proc::utf8char for this integer.
    def utf8
      Utf8Proc::utf8char(self)
    end
  end

end
|
||||
|
||||
|
||||
# Make the utf8proc string helpers available on every String.
String.send(:include, Utf8Proc::StringExtensions)
|
||||
|
||||
# Make the utf8proc integer helper available on every Integer.
Integer.send(:include, Utf8Proc::IntegerExtensions)
|
||||
|
||||
@ -1,160 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: ruby/utf8proc_native.c
|
||||
*
|
||||
* Description:
|
||||
* Native part of the ruby wrapper for libutf8proc.
|
||||
*/
|
||||
|
||||
|
||||
#include "../utf8proc.c"
|
||||
#include "ruby.h"
|
||||
|
||||
#ifndef RSTRING_PTR
|
||||
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
||||
#endif
|
||||
#ifndef RSTRING_LEN
|
||||
#define RSTRING_LEN(s) (RSTRING(s)->len)
|
||||
#endif
|
||||
|
||||
/*
 * Per-call scratch state for utf8proc_ruby_map().  It is wrapped in a Ruby
 * data object so that the buffer can be reclaimed by the garbage collector
 * if the mapping is abandoned before the buffer is released explicitly.
 */
typedef struct utf8proc_ruby_mapenv_struct {
  int32_t *buffer;  /* code point buffer; NULL/0 when nothing is allocated */
} utf8proc_ruby_mapenv_t;

/*
 * Free function registered with Data_Make_Struct for the wrapper object.
 *
 * Both the struct (created by Data_Make_Struct) and its buffer (created by
 * ALLOC_N) come from Ruby's allocation wrappers, so they are released with
 * xfree() rather than the C library's free().  xfree() is expected to
 * accept a NULL buffer -- NOTE(review): confirm for the oldest Ruby
 * version this extension must support.
 */
void utf8proc_ruby_mapenv_free(utf8proc_ruby_mapenv_t *env) {
  xfree(env->buffer);
  xfree(env);
}
|
||||
|
||||
/* Handles to the Ruby objects created in Init_utf8proc_native(). */

/* The "Utf8Proc" module. */
VALUE utf8proc_ruby_module;

/* Hash from option symbol to UTF8PROC_* flag, exposed as Utf8Proc::Options. */
VALUE utf8proc_ruby_options;

/* Exception classes defined under the module: UnicodeError is the parent
   of InvalidUtf8Error and CodeNotAssignedError. */
VALUE utf8proc_ruby_eUnicodeError;
VALUE utf8proc_ruby_eInvalidUtf8Error;
VALUE utf8proc_ruby_eCodeNotAssignedError;
|
||||
|
||||
VALUE utf8proc_ruby_map_error(ssize_t result) {
|
||||
VALUE excpt_class;
|
||||
switch (result) {
|
||||
case UTF8PROC_ERROR_NOMEM:
|
||||
excpt_class = rb_eNoMemError; break;
|
||||
case UTF8PROC_ERROR_OVERFLOW:
|
||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
||||
excpt_class = rb_eArgError; break;
|
||||
case UTF8PROC_ERROR_INVALIDUTF8:
|
||||
excpt_class = utf8proc_ruby_eInvalidUtf8Error; break;
|
||||
case UTF8PROC_ERROR_NOTASSIGNED:
|
||||
excpt_class = utf8proc_ruby_eCodeNotAssignedError; break;
|
||||
default:
|
||||
excpt_class = rb_eRuntimeError;
|
||||
}
|
||||
rb_raise(excpt_class, "%s", utf8proc_errmsg(result));
|
||||
return Qnil;
|
||||
}
|
||||
|
||||
VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) {
|
||||
VALUE str;
|
||||
int options;
|
||||
VALUE env_obj;
|
||||
utf8proc_ruby_mapenv_t *env;
|
||||
ssize_t result;
|
||||
VALUE retval;
|
||||
str = StringValue(str_param);
|
||||
options = NUM2INT(options_param) & ~UTF8PROC_NULLTERM;
|
||||
env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL,
|
||||
utf8proc_ruby_mapenv_free, env);
|
||||
result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str),
|
||||
NULL, 0, options);
|
||||
if (result < 0) {
|
||||
utf8proc_ruby_map_error(result);
|
||||
return Qnil; /* needed to prevent problems with optimization */
|
||||
}
|
||||
env->buffer = ALLOC_N(int32_t, result+1);
|
||||
result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str),
|
||||
env->buffer, result, options);
|
||||
if (result < 0) {
|
||||
free(env->buffer);
|
||||
env->buffer = 0;
|
||||
utf8proc_ruby_map_error(result);
|
||||
return Qnil; /* needed to prevent problems with optimization */
|
||||
}
|
||||
result = utf8proc_reencode(env->buffer, result, options);
|
||||
if (result < 0) {
|
||||
free(env->buffer);
|
||||
env->buffer = 0;
|
||||
utf8proc_ruby_map_error(result);
|
||||
return Qnil; /* needed to prevent problems with optimization */
|
||||
}
|
||||
retval = rb_str_new((char *)env->buffer, result);
|
||||
free(env->buffer);
|
||||
env->buffer = 0;
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
 * Utf8Proc.utf8char(code): encode one code point as a string.
 *
 * self        receiver (unused).
 * code_param  integer code point.
 *
 * Raises ArgumentError when utf8proc_codepoint_valid() rejects the value;
 * otherwise returns a new Ruby string with the bytes written by
 * utf8proc_encode_char().
 */
static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) {
  char utf8_bytes[4];
  ssize_t length;
  int codepoint = NUM2INT(code_param);

  if (!utf8proc_codepoint_valid(codepoint)) {
    rb_raise(rb_eArgError, "Invalid Unicode code point");
  }

  length = utf8proc_encode_char(codepoint, utf8_bytes);
  return rb_str_new(utf8_bytes, length);
}
|
||||
|
||||
#define register_utf8proc_option(sym, field) \
|
||||
rb_hash_aset(utf8proc_ruby_options, ID2SYM(rb_intern(sym)), INT2FIX(field))
|
||||
|
||||
void Init_utf8proc_native() {
|
||||
utf8proc_ruby_module = rb_define_module("Utf8Proc");
|
||||
rb_define_module_function(utf8proc_ruby_module, "utf8map",
|
||||
utf8proc_ruby_map, 2);
|
||||
rb_define_module_function(utf8proc_ruby_module, "utf8char",
|
||||
utf8proc_ruby_char, 1);
|
||||
utf8proc_ruby_eUnicodeError = rb_define_class_under(utf8proc_ruby_module,
|
||||
"UnicodeError", rb_eStandardError);
|
||||
utf8proc_ruby_eInvalidUtf8Error = rb_define_class_under(
|
||||
utf8proc_ruby_module, "InvalidUtf8Error", utf8proc_ruby_eUnicodeError);
|
||||
utf8proc_ruby_eCodeNotAssignedError = rb_define_class_under(
|
||||
utf8proc_ruby_module, "CodeNotAssignedError",
|
||||
utf8proc_ruby_eUnicodeError);
|
||||
utf8proc_ruby_options = rb_hash_new();
|
||||
register_utf8proc_option("stable", UTF8PROC_STABLE);
|
||||
register_utf8proc_option("compat", UTF8PROC_COMPAT);
|
||||
register_utf8proc_option("compose", UTF8PROC_COMPOSE);
|
||||
register_utf8proc_option("decompose", UTF8PROC_DECOMPOSE);
|
||||
register_utf8proc_option("ignore", UTF8PROC_IGNORE);
|
||||
register_utf8proc_option("rejectna", UTF8PROC_REJECTNA);
|
||||
register_utf8proc_option("nlf2ls", UTF8PROC_NLF2LS);
|
||||
register_utf8proc_option("nlf2ps", UTF8PROC_NLF2PS);
|
||||
register_utf8proc_option("nlf2lf", UTF8PROC_NLF2LF);
|
||||
register_utf8proc_option("stripcc", UTF8PROC_STRIPCC);
|
||||
register_utf8proc_option("casefold", UTF8PROC_CASEFOLD);
|
||||
register_utf8proc_option("charbound", UTF8PROC_CHARBOUND);
|
||||
register_utf8proc_option("lump", UTF8PROC_LUMP);
|
||||
register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK);
|
||||
OBJ_FREEZE(utf8proc_ruby_options);
|
||||
rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options);
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user