diff --git a/Makefile b/Makefile index 60f0423..a9d1aaa 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,7 @@ # libmojibake Makefile +CURL=curl +RUBY=ruby # settings @@ -16,8 +18,26 @@ c-library: libmojibake.a libmojibake.so clean: rm -f utf8proc.o libmojibake.a libmojibake.so +update: utf8proc_data.c.new + # real targets +utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt + $(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new + +UnicodeData.txt: + $(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt + +DerivedCoreProperties.txt: + $(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt + +CompositionExclusions.txt: + $(CURL) -O http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt + +CaseFolding.txt: + $(CURL) -O http://www.unicode.org/Public/UNIDATA/CaseFolding.txt + + utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c $(cc) -c -o utf8proc.o utf8proc.c diff --git a/data_generator.rb b/data_generator.rb index 0db0331..f0e7aa5 100644 --- a/data_generator.rb +++ b/data_generator.rb @@ -1,4 +1,4 @@ -#!/usr/pkg/bin/ruby +#!/usr/bin/env ruby # This file was used to generate the 'unicode_data.c' file by parsing the # Unicode data file 'UnicodeData.txt' of the Unicode Character Database. @@ -65,42 +65,9 @@ # authorization of the copyright holder. - -$ignorable_list = <.. -000E..001F ; Default_Ignorable_Code_Point # Cc [18] .. -007F..0084 ; Default_Ignorable_Code_Point # Cc [6] .. -0086..009F ; Default_Ignorable_Code_Point # Cc [26] .. -00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN -034F ; Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER -0600..0603 ; Default_Ignorable_Code_Point # Cf [4] ARABIC NUMBER SIGN..ARABIC SIGN SAFHA -06DD ; Default_Ignorable_Code_Point # Cf ARABIC END OF AYAH -070F ; Default_Ignorable_Code_Point # Cf SYRIAC ABBREVIATION MARK -115F..1160 ; Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER -17B4..17B5 ; Default_Ignorable_Code_Point # Cf [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA -180B..180D ; Default_Ignorable_Code_Point # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE -200B..200F ; Default_Ignorable_Code_Point # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK -202A..202E ; Default_Ignorable_Code_Point # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE -2060..2063 ; Default_Ignorable_Code_Point # Cf [4] WORD JOINER..INVISIBLE SEPARATOR -2064..2069 ; Default_Ignorable_Code_Point # Cn [6] .. -206A..206F ; Default_Ignorable_Code_Point # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES -3164 ; Default_Ignorable_Code_Point # Lo HANGUL FILLER -D800..DFFF ; Default_Ignorable_Code_Point # Cs [2048] .. -FE00..FE0F ; Default_Ignorable_Code_Point # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 -FEFF ; Default_Ignorable_Code_Point # Cf ZERO WIDTH NO-BREAK SPACE -FFA0 ; Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER -FFF0..FFF8 ; Default_Ignorable_Code_Point # Cn [9] .. -1D173..1D17A ; Default_Ignorable_Code_Point # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE -E0001 ; Default_Ignorable_Code_Point # Cf LANGUAGE TAG -E0002..E001F ; Default_Ignorable_Code_Point # Cn [30] .. -E0020..E007F ; Default_Ignorable_Code_Point # Cf [96] TAG SPACE..CANCEL TAG -E0080..E00FF ; Default_Ignorable_Code_Point # Cn [128] .. -E0100..E01EF ; Default_Ignorable_Code_Point # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] .. -END_OF_LIST - +$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m] $ignorable = [] -$ignorable_list.each do |entry| +$ignorable_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ $1.hex.upto($2.hex) { |e2| $ignorable << e2 } elsif entry =~ /^[0-9A-F]+/ @@ -108,162 +75,9 @@ $ignorable_list.each do |entry| end end -$grapheme_extend_list = <