utf8proc/data_generator.rb

#!/usr/pkg/bin/ruby

#  This file was used to generate the 'unicode_data.c' file by parsing the
#  Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
#  It is included for informational purposes only and not intended for
#  production use.


#  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
#
#  Permission is hereby granted, free of charge, to any person obtaining a
#  copy of this software and associated documentation files (the "Software"),
#  to deal in the Software without restriction, including without limitation
#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
#  and/or sell copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in
#  all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
#  DEALINGS IN THE SOFTWARE.


#  This file contains derived data from a modified version of the
#  Unicode data files. The following license applies to that data:
#
#  COPYRIGHT AND PERMISSION NOTICE
#
#  Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
#  under the Terms of Use in http://www.unicode.org/copyright.html.
#
#  Permission is hereby granted, free of charge, to any person obtaining a
#  copy of the Unicode data files and any associated documentation (the "Data
#  Files") or Unicode software and any associated documentation (the
#  "Software") to deal in the Data Files or Software without restriction,
#  including without limitation the rights to use, copy, modify, merge,
#  publish, distribute, and/or sell copies of the Data Files or Software, and
#  to permit persons to whom the Data Files or Software are furnished to do
#  so, provided that (a) the above copyright notice(s) and this permission
#  notice appear with all copies of the Data Files or Software, (b) both the
#  above copyright notice(s) and this permission notice appear in associated
#  documentation, and (c) there is clear notice in each modified Data File or
#  in the Software as well as in the documentation associated with the Data
#  File(s) or Software that the data or software has been modified.
#
#  THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
#  KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
#  THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
#  INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
#  CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
#  USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
#  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
#  PERFORMANCE OF THE DATA FILES OR SOFTWARE.
#
#  Except as contained in this notice, the name of a copyright holder shall
#  not be used in advertising or otherwise to promote the sale, use or other
#  dealings in these Data Files or Software without prior written
#  authorization of the copyright holder.


$ignorable_list = <<END_OF_LIST
#From:
#    http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
#Section:
#    Derived Property: Default_Ignorable_Code_Point
END_OF_LIST

# Expands a block of text into a flat array of code points.
#
# Every line that starts with "XXXX..YYYY" (upper-case hex) contributes the
# whole inclusive range; a line that starts with a single hex number
# contributes that one value; any other line (comments, blanks) is skipped.
#
# text:: String holding one entry per line.
# Returns an Array of Integers in the order the entries appear.
#
# String#each_line is used because String#each no longer exists since
# Ruby 1.9; each_line behaves the same on old and new interpreters.
def parse_code_point_list(text)
  code_points = []
  text.each_line do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      code_points.concat(($1.hex..$2.hex).to_a)
    elsif entry =~ /^[0-9A-F]+/
      code_points << $&.hex
    end
  end
  code_points
end

# Code points listed in $ignorable_list, expanded to individual integers.
$ignorable = parse_code_point_list($ignorable_list)

$grapheme_extend_list = <<END_OF_LIST
#From:
#    http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
#Section:
#    Derived Property: Grapheme_Extend_List
END_OF_LIST

# Code points listed in $grapheme_extend_list, expanded to individual integers.
$grapheme_extend = parse_code_point_list($grapheme_extend_list)

# Code points that are skipped when the combination table is built and that
# are flagged in the emitted property rows. Each row starts with the hex
# code point; String#hex stops at the first non-hex character, so the
# trailing "#  NAME" part is ignored.
exclusion_rows = <<END_OF_LIST
0958    #  DEVANAGARI LETTER QA
0959    #  DEVANAGARI LETTER KHHA
095A    #  DEVANAGARI LETTER GHHA
095B    #  DEVANAGARI LETTER ZA
095C    #  DEVANAGARI LETTER DDDHA
095D    #  DEVANAGARI LETTER RHA
095E    #  DEVANAGARI LETTER FA
095F    #  DEVANAGARI LETTER YYA
09DC    #  BENGALI LETTER RRA
09DD    #  BENGALI LETTER RHA
09DF    #  BENGALI LETTER YYA
0A33    #  GURMUKHI LETTER LLA
0A36    #  GURMUKHI LETTER SHA
0A59    #  GURMUKHI LETTER KHHA
0A5A    #  GURMUKHI LETTER GHHA
0A5B    #  GURMUKHI LETTER ZA
0A5E    #  GURMUKHI LETTER FA
0B5C    #  ORIYA LETTER RRA
0B5D    #  ORIYA LETTER RHA
0F43    #  TIBETAN LETTER GHA
0F4D    #  TIBETAN LETTER DDHA
0F52    #  TIBETAN LETTER DHA
0F57    #  TIBETAN LETTER BHA
0F5C    #  TIBETAN LETTER DZHA
0F69    #  TIBETAN LETTER KSSA
0F76    #  TIBETAN VOWEL SIGN VOCALIC R
0F78    #  TIBETAN VOWEL SIGN VOCALIC L
0F93    #  TIBETAN SUBJOINED LETTER GHA
0F9D    #  TIBETAN SUBJOINED LETTER DDHA
0FA2    #  TIBETAN SUBJOINED LETTER DHA
0FA7    #  TIBETAN SUBJOINED LETTER BHA
0FAC    #  TIBETAN SUBJOINED LETTER DZHA
0FB9    #  TIBETAN SUBJOINED LETTER KSSA
FB1D    #  HEBREW LETTER YOD WITH HIRIQ
FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH
FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT
FB2B    #  HEBREW LETTER SHIN WITH SIN DOT
FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
FB2E    #  HEBREW LETTER ALEF WITH PATAH
FB2F    #  HEBREW LETTER ALEF WITH QAMATS
FB30    #  HEBREW LETTER ALEF WITH MAPIQ
FB31    #  HEBREW LETTER BET WITH DAGESH
FB32    #  HEBREW LETTER GIMEL WITH DAGESH
FB33    #  HEBREW LETTER DALET WITH DAGESH
FB34    #  HEBREW LETTER HE WITH MAPIQ
FB35    #  HEBREW LETTER VAV WITH DAGESH
FB36    #  HEBREW LETTER ZAYIN WITH DAGESH
FB38    #  HEBREW LETTER TET WITH DAGESH
FB39    #  HEBREW LETTER YOD WITH DAGESH
FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH
FB3B    #  HEBREW LETTER KAF WITH DAGESH
FB3C    #  HEBREW LETTER LAMED WITH DAGESH
FB3E    #  HEBREW LETTER MEM WITH DAGESH
FB40    #  HEBREW LETTER NUN WITH DAGESH
FB41    #  HEBREW LETTER SAMEKH WITH DAGESH
FB43    #  HEBREW LETTER FINAL PE WITH DAGESH
FB44    #  HEBREW LETTER PE WITH DAGESH
FB46    #  HEBREW LETTER TSADI WITH DAGESH
FB47    #  HEBREW LETTER QOF WITH DAGESH
FB48    #  HEBREW LETTER RESH WITH DAGESH
FB49    #  HEBREW LETTER SHIN WITH DAGESH
FB4A    #  HEBREW LETTER TAV WITH DAGESH
FB4B    #  HEBREW LETTER VAV WITH HOLAM
FB4C    #  HEBREW LETTER BET WITH RAFE
FB4D    #  HEBREW LETTER KAF WITH RAFE
FB4E    #  HEBREW LETTER PE WITH RAFE
END_OF_LIST
$exclusions = exclusion_rows.lines.map { |row| row.hex }

# Second exclusion list; a code point found here is flagged in the emitted
# property row exactly like one found in $exclusions. Each row starts with
# the hex code point, the rest of the row is ignored by String#hex.
excl_version_rows = <<END_OF_LIST
2ADC    #  FORKING
1D15E   #  MUSICAL SYMBOL HALF NOTE
1D15F   #  MUSICAL SYMBOL QUARTER NOTE
1D160   #  MUSICAL SYMBOL EIGHTH NOTE
1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE
1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE
1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE
1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
1D1BB   #  MUSICAL SYMBOL MINIMA
1D1BC   #  MUSICAL SYMBOL MINIMA BLACK
1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE
1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK
1D1BF   #  MUSICAL SYMBOL FUSA WHITE
1D1C0   #  MUSICAL SYMBOL FUSA BLACK
END_OF_LIST
$excl_version = excl_version_rows.lines.map { |row| row.hex }

$case_folding_string = <<END_OF_LIST
#XXX THE NONEMPTY, NON-COMMENT LINES OF
#XXX http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
#XXX GO HERE
END_OF_LIST

# Maps a source code point to the array of code points it folds to.
# Only rows whose status letter is C, F or S are taken; every other row
# (including the placeholder comment rows above) is ignored.
$case_folding = {}
$case_folding_string.each_line do |row|
  folding = /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i.match(row)
  if folding
    $case_folding[folding[1].hex] = folding[2].split(" ").map { |digits| digits.hex }
  end
end

# Flat pool of integers filled by ary2c: every stored sequence is appended
# here and followed by a -1 terminator. Printed later as utf8proc_sequences.
$int_array = []
# Maps a sequence already stored by ary2c to its start offset in $int_array,
# so that equal sequences are stored only once.
$int_array_indicies = {}

# Turns a property value into the name of a C constant.
#
# string:: property value, or nil when the property is absent.
# prefix:: middle part of the constant name (for example 'CATEGORY').
# Returns "0" for nil, otherwise "UTF8PROC_<prefix>_<STRING IN UPPER CASE>".
def str2c(string, prefix)
  if string.nil?
    "0"
  else
    "UTF8PROC_#{prefix}_#{string.upcase}"
  end
end
# Turns an integer sequence into a C pointer expression into
# utf8proc_sequences.
#
# array:: Array of Integers, or nil.
# Returns "NULL" for nil. Otherwise the sequence is stored in $int_array
# (followed by a -1 terminator) the first time it is seen, its start offset
# is remembered in $int_array_indicies, and "utf8proc_sequences + <offset>"
# is returned. Equal sequences share one offset.
def ary2c(array)
  if array.nil?
    "NULL"
  else
    offset = $int_array_indicies.fetch(array) do
      start = $int_array.length
      $int_array.concat(array).push(-1)
      $int_array_indicies[array] = start
    end
    "utf8proc_sequences + #{offset}"
  end
end

# One record of UnicodeData.txt, plus the logic that renders the record as a
# single row of the generated utf8proc_properties C table.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping

  # General categories whose members get the control-boundary flag.
  CONTROL_BOUNDARY_CATEGORIES = %w[Zl Zp Cc Cf]
  # Code points of category Cf that must NOT get the control-boundary flag:
  # U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
  CONTROL_BOUNDARY_EXCEPTIONS = [0x200C, 0x200D]

  # Parses one semicolon-separated line of UnicodeData.txt.
  #
  # line:: the raw line (a trailing newline is tolerated).
  # Raises RuntimeError "Could not parse input." when the line does not have
  # the expected fifteen-field layout (this includes a nil line).
  def initialize(line)
    raise "Could not parse input." unless line =~ /^
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$/ix     # simple titlecase mapping
    @code              = $1.hex
    @name              = $2
    @category          = $3
    @combining_class   = Integer($4)
    @bidi_class        = $5
    # $6 is the bracketed "<type>" form; $7 is the bare type name inside it.
    @decomp_type       = $7
    @decomp_mapping    = ($8 == '') ? nil :
                         $8.split.collect { |element| element.hex }
    @bidi_mirrored     = ($13 == 'Y')
    @uppercase_mapping = ($16 == '') ? nil : $16.hex
    @lowercase_mapping = ($17 == '') ? nil : $17.hex
    @titlecase_mapping = ($18 == '') ? nil : $18.hex
  end

  # Case folding of this code point as an Array of Integers, or nil when
  # $case_folding has no entry for it.
  def case_folding
    $case_folding[code]
  end

  # True when the general category is Zl, Zp, Cc or Cf and the code point is
  # not one of the two exceptions U+200C / U+200D.
  #
  # The exception has to be tested against the code point. The earlier
  # version tested the category string against the integer list, which can
  # never match, so U+200C and U+200D were wrongly flagged.
  def control_boundary?
    CONTROL_BOUNDARY_CATEGORIES.include?(category) &&
      !CONTROL_BOUNDARY_EXCEPTIONS.include?(code)
  end

  # Renders this character as one C initializer row ("  {...},\n").
  #
  # comb1_indicies:: Hash code point => index as first element of a pair.
  # comb2_indicies:: Hash code point => index as second element of a pair.
  #
  # The row holds sixteen comma-separated values in this order: category
  # constant, combining class, bidi class constant, decomposition type
  # constant, decomposition sequence pointer, bidi mirrored flag, upper/
  # lower/title case mappings (-1 when absent), first-element index
  # multiplied by the number of second elements (-1 when absent),
  # second-element index (-1 when absent), exclusion flag, ignorable flag,
  # control-boundary flag, grapheme-extend flag, case folding pointer.
  #
  # Note that ary2c may append to $int_array; the decomposition mapping is
  # always registered before the case folding.
  def c_entry(comb1_indicies, comb2_indicies)
    first_index = comb1_indicies[code]
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      ary2c(decomp_mapping),
      bidi_mirrored,
      uppercase_mapping || -1,
      lowercase_mapping || -1,
      titlecase_mapping || -1,
      first_index ? (first_index * comb2_indicies.keys.length) : -1,
      comb2_indicies[code] || -1,
      $exclusions.include?(code) || $excl_version.include?(code),
      $ignorable.include?(code),
      control_boundary?,
      $grapheme_extend.include?(code),
      ary2c(case_folding)
    ]
    "  {#{fields.join(', ')}},\n"
  end
end

# All parsed characters in input order, and the same objects keyed by code
# point.
chars = []
char_hash = {}

# Read UnicodeData.txt line by line. A "<..., First>" line opens a range:
# the line that follows it is parsed as the template for the range and must
# be the matching "<..., Last>" line; every code point of the range gets its
# own copy of the template, named "<range name>".
while (line = gets)
  opener = /^([0-9A-F]+);<[^;>,]+, First>;/i.match(line)
  if opener
    range_start = opener[1].hex
    closing_line = gets
    # Parsing comes first, so a missing or malformed line fails here.
    template = UnicodeChar.new(closing_line)
    closer = /^([0-9A-F]+);<([^;>,]+), Last>;/i.match(closing_line)
    raise "No last character of sequence found." unless closer
    range_end = closer[1].hex
    range_name = "<#{closer[2]}>"
    (range_start..range_end).each do |code_point|
      member = template.clone
      member.code = code_point
      member.name = range_name
      char_hash[code_point] = member
      chars << member
    end
  else
    single = UnicodeChar.new(line)
    char_hash[single.code] = single
    chars << single
  end
end

# comb1st_indicies / comb2nd_indicies number the code points that occur as
# first / second element of a two-element decomposition, in order of first
# appearance. comb_array[first index][second index] holds the code point
# that decomposes into that pair.
comb1st_indicies = {}
comb2nd_indicies = {}
comb_array = []

chars.each do |char|
  mapping = char.decomp_mapping
  # Only untyped decompositions into exactly two code points qualify.
  next unless char.decomp_type.nil? && mapping && mapping.length == 2
  # The first element must have combining class 0.
  next unless char_hash[mapping[0]].combining_class == 0
  next if $exclusions.include?(char.code)

  base, mark = mapping
  comb1st_indicies[base] ||= comb1st_indicies.keys.length
  comb2nd_indicies[mark] ||= comb2nd_indicies.keys.length

  row = (comb_array[comb1st_indicies[base]] ||= [])
  column = comb2nd_indicies[mark]
  raise "Duplicate canonical mapping" if row[column]
  row[column] = char.code
end

# Collect the distinct property rows: `properties` keeps each different row
# text once, in order of first appearance, and `properties_indicies` maps a
# row text to its position in that list.
properties_indicies = {}
properties = []
chars.each do |unicode_char|
  row = unicode_char.c_entry(comb1st_indicies, comb2nd_indicies)
  next if properties_indicies.key?(row)
  properties_indicies[row] = properties.length
  properties << row
end

# Build the two lookup stages. The code space 0...0x110000 is cut into
# blocks of 0x100 code points. A block is the list of property indices of
# its code points (row position + 1, or 0 for a code point that is not in
# char_hash). Identical blocks are stored once in stage2; stage1 holds, per
# block, the offset (block position * 0x100) of its data in flattened stage2.
stage1 = []
stage2 = []
0.step(0x10FF00, 0x100) do |block_start|
  block = (block_start...(block_start + 0x100)).map do |code_point|
    known = char_hash[code_point]
    if known
      properties_indicies[known.c_entry(comb1st_indicies, comb2nd_indicies)] + 1
    else
      0
    end
  end
  position = stage2.index(block)
  if position.nil?
    position = stage2.length
    stage2 << block
  end
  stage1 << (position * 0x100)
end

# Writes one C array of integers to $stdout.
#
# header:: opening text of the array, written verbatim.
# values:: the integers to print, each followed by ", ".
#
# A line break plus two spaces is written before every 8th value (the 8th,
# 16th, ...), so the first line carries seven values and later lines eight.
# The array is closed with "};" and a blank line.
def write_wrapped_values(header, values)
  $stdout << header
  values.each_with_index do |value, position|
    $stdout << "\n  " if position % 8 == 7
    $stdout << value << ", "
  end
  $stdout << "};\n\n"
end

write_wrapped_values("const int32_t utf8proc_sequences[] = {\n  ", $int_array)
write_wrapped_values("const uint16_t utf8proc_stage1table[] = {\n  ", stage1)
write_wrapped_values("const uint16_t utf8proc_stage2table[] = {\n  ", stage2.flatten)

# Property rows: a fixed first row, then the distinct rows collected earlier.
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << "  {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
properties.each { |row| $stdout << row }
$stdout << "};\n\n"

# Combination table in row-major order: one cell per (first index, second
# index) pair, -1 where no character decomposes into that pair.
combination_cells = []
comb1st_indicies.keys.length.times do |a|
  comb2nd_indicies.keys.length.times do |b|
    combination_cells << (comb_array[a][b] || -1)
  end
end
write_wrapped_values("const int32_t utf8proc_combinations[] = {\n  ", combination_cells)