diff options
Diffstat (limited to 'libjava/chartables.pl')
-rw-r--r-- | libjava/chartables.pl | 965 |
1 files changed, 0 insertions, 965 deletions
diff --git a/libjava/chartables.pl b/libjava/chartables.pl deleted file mode 100644 index fef7d8742cf..00000000000 --- a/libjava/chartables.pl +++ /dev/null @@ -1,965 +0,0 @@ -# chartables.pl - A perl program to generate tables for use by the -# Character class. - -# Copyright (C) 1998, 1999 Red Hat, Inc. -# -# This file is part of libjava. -# -# This software is copyrighted work licensed under the terms of the -# Libjava License. Please consult the file "LIBJAVA_LICENSE" for -# details. - -# This program requires a `unidata.txt' file of the form distributed -# on the Unicode 2.0 CD ROM. Or, get it more conveniently here: -# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt -# Version `2.1.8' of this file was last used to update the Character class. - -# Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 -# "The Java Language Specification", ISBN 0-201-63451-1 -# plus online API docs for JDK 1.2 beta from http://www.javasoft.com. - -# Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt -# If this exits with nonzero status, then you must investigate the -# cause of the problem. -# Diagnostics and other information to stderr. -# This creates the new include/java-chartables.h and -# include/java-chardecomp.h files directly. -# With -n, the files are not created, but all processing -# still occurs. - -# Fields in the table. -$CODE = 0; -$NAME = 1; -$CATEGORY = 2; -$DECOMPOSITION = 5; -$DECIMAL = 6; -$DIGIT = 7; -$NUMERIC = 8; -$UPPERCASE = 12; -$LOWERCASE = 13; -$TITLECASE = 14; - -# A special case. -$TAMIL_DIGIT_ONE = 0x0be7; -$TAMIL_DIGIT_NINE = 0x0bef; - -# These are endpoints of legitimate gaps in the tables. -$CJK_IDEOGRAPH_END = 0x9fa5; -$HANGUL_END = 0xd7a3; -$HIGH_SURROGATE_END = 0xdb7f; -$PRIVATE_HIGH_SURROGATE_END = 0xdbff; -$LOW_SURROGATE_END = 0xdfff; -$PRIVATE_END = 0xf8ff; - -%title_to_upper = (); -%title_to_lower = (); -%numerics = (); -%name = (); - -@digit_start = (); -@digit_end = (); - -@space_start = (); -@space_end = (); - -# @letter_start = (); -# @letter_end = (); - -@all_start = (); -@all_end = (); -@all_cats = (); - -@upper_start = (); -@upper_end = (); -@upper_map = (); -%upper_anom = (); - -@lower_start = (); -@lower_end = (); -@lower_map = (); -%lower_anom = (); - -@attributes = (); - -# There are a few characters which actually need two attributes. -# These are special-cased. -$ROMAN_START = 0x2160; -$ROMAN_END = 0x217f; -%second_attributes = (); - -$prevcode = -1; -$status = 0; - -%category_map = -( - 'Mn' => 'NON_SPACING_MARK', - 'Mc' => 'COMBINING_SPACING_MARK', - 'Me' => 'ENCLOSING_MARK', - 'Nd' => 'DECIMAL_DIGIT_NUMBER', - 'Nl' => 'LETTER_NUMBER', - 'No' => 'OTHER_NUMBER', - 'Zs' => 'SPACE_SEPARATOR', - 'Zl' => 'LINE_SEPARATOR', - 'Zp' => 'PARAGRAPH_SEPARATOR', - 'Cc' => 'CONTROL', - 'Cf' => 'FORMAT', - 'Cs' => 'SURROGATE', - 'Co' => 'PRIVATE_USE', - 'Cn' => 'UNASSIGNED', - 'Lu' => 'UPPERCASE_LETTER', - 'Ll' => 'LOWERCASE_LETTER', - 'Lt' => 'TITLECASE_LETTER', - 'Lm' => 'MODIFIER_LETTER', - 'Lo' => 'OTHER_LETTER', - 'Pc' => 'CONNECTOR_PUNCTUATION', - 'Pd' => 'DASH_PUNCTUATION', - 'Ps' => 'START_PUNCTUATION', - 'Pe' => 'END_PUNCTUATION', - 'Pi' => 'START_PUNCTUATION', - 'Pf' => 'END_PUNCTUATION', - 'Po' => 'OTHER_PUNCTUATION', - 'Sm' => 'MATH_SYMBOL', - 'Sc' => 'CURRENCY_SYMBOL', - 'Sk' => 'MODIFIER_SYMBOL', - 'So' => 'OTHER_SYMBOL' - ); - -# These maps characters to their decompositions. -%canonical_decomposition = (); -%full_decomposition = (); - - -# Handle `-n' and open output files. -local ($f1, $f2) = ('include/java-chartables.h', - 'include/java-chardecomp.h'); -if ($ARGV[0] eq '-n') -{ - shift @ARGV; - $f1 = '/dev/null'; - $f2 = '/dev/null'; -} - -open (CHARTABLE, "> $f1"); -open (DECOMP, "> $f2"); - -# Process the Unicode file. -while (<>) -{ - chop; - # Specify a limit for split so that we pick up trailing fields. - # We make the limit larger than we need, to catch the case where - # there are extra fields. - @fields = split (';', $_, 30); - # Convert code to number. - $ncode = hex ($fields[$CODE]); - - if ($#fields != 14) - { - print STDERR ("Entry for \\u", $fields[$CODE], - " has wrong number of fields: ", $#fields, "\n"); - } - - $name{$fields[$CODE]} = $fields[$NAME]; - - # If we've found a gap in the table, fill it in. - if ($ncode != $prevcode + 1) - { - &process_gap (*fields, $prevcode, $ncode); - } - - &process_char (*fields, $ncode); - - $prevcode = $ncode; -} - -if ($prevcode != 0xffff) -{ - # Setting of `fields' parameter doesn't matter here. - &process_gap (*fields, $prevcode, 0x10000); -} - -print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n"; -print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n"; -print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n"; -print CHARTABLE "// These tables are automatically generated by the chartables.pl\n"; -print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n"; -print CHARTABLE "// and run it again.\n\n"; -print CHARTABLE "// This file should only be included by natCharacter.cc\n\n"; - - -$bytes = 0; - -# Titlecase mapping tables. -if ($#title_to_lower != $#title_to_upper) -{ - # If this fails we need to reimplement toTitleCase. - print STDERR "titlecase mappings have different sizes\n"; - $status = 1; -} -# Also ensure that the tables are entirely parallel. -foreach $key (sort keys %title_to_lower) -{ - if (! defined $title_to_upper{$key}) - { - print STDERR "titlecase mappings have different entries\n"; - $status = 1; - } -} -&print_single_map ("title_to_lower_table", %title_to_lower); -&print_single_map ("title_to_upper_table", %title_to_upper); - -print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n"; - -printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE; - -# All numeric values. -&print_numerics; - -# Digits only. -&print_block ("digit_table", *digit_start, *digit_end); - -# Space characters. -&print_block ("space_table", *space_start, *space_end); - -# Letters. We used to generate a separate letter table. But this -# doesn't really seem worthwhile. Simply using `all_table' saves us -# about 800 bytes, and only adds 3 table probes to isLetter. -# &print_block ("letter_table", *letter_start, *letter_end); - -# Case tables. -&print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom); -&print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom); - -# Everything else. -&print_all_block (*all_start, *all_end, *all_cats); - -print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n"; - -printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START; -printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END; - -&print_fast_tables (*all_start, *all_end, *all_cats, - *attributes, *second_attributes); - -print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n"; - -print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n"; - -printf STDERR "Approximately %d bytes of data generated (compact case)\n", - $bytes; - - -# Now generate decomposition tables. -printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n"; -printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n"; -printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n"; -print DECOMP "// These tables are automatically generated by the chartables.pl\n"; -print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n"; -print DECOMP "// and run it again.\n\n"; -print DECOMP "// This file should only be included by natCollator.cc\n\n"; - -print DECOMP "struct decomp_entry\n{\n"; -print DECOMP " jchar key;\n"; -print DECOMP " const char *value;\n"; -print DECOMP "};\n\n"; - -&write_decompositions; - -printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n"; - - -close (CHARTABLE); -close (DECOMP); - -exit $status; - - -# Process a gap in the space. -sub process_gap -{ - local (*fields, $prevcode, $ncode) = @_; - local (@gap_fields, $i); - - if ($ncode == $CJK_IDEOGRAPH_END - || $ncode == $HANGUL_END - || $ncode == $HIGH_SURROGATE_END - || $ncode == $PRIVATE_HIGH_SURROGATE_END - || $ncode == $LOW_SURROGATE_END - || $ncode == $PRIVATE_END) - { - # The characters in the gap we just found are known to - # have the same properties as the character at the end of - # the gap. - @gap_fields = @fields; - } - else - { - # This prints too much to be enabled. - # print STDERR "Gap found at \\u", $fields[$CODE], "\n"; - @gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '', - '', '', '', ''); - } - - for ($i = $prevcode + 1; $i < $ncode; ++$i) - { - $gap_fields[$CODE] = sprintf ("%04x", $i); - $gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE]; - &process_char (*gap_fields, $i); - } -} - -# Process a single character. -sub process_char -{ - local (*fields, $ncode) = @_; - - if ($fields[$DECOMPOSITION] ne '') - { - &add_decomposition ($ncode, $fields[$DECOMPOSITION]); - } - - # If this is a titlecase character, mark it. - if ($fields[$CATEGORY] eq 'Lt') - { - $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE]; - $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE]; - } - else - { - # For upper and lower case mappings, we try to build compact - # tables that map range onto range. We specifically want to - # avoid titlecase characters. Java specifies a range check to - # make sure the character is not between 0x2000 and 0x2fff. - # We avoid that here because we need to generate table entries - # -- toLower and toUpper still work in that range. - if ($fields[$UPPERCASE] eq '' - && ($fields[$LOWERCASE] ne '' - || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/)) - { - if ($fields[$LOWERCASE] ne '') - { - &update_case_block (*upper_start, *upper_end, *upper_map, - $fields[$CODE], $fields[$LOWERCASE]); - &set_attribute ($ncode, hex ($fields[$LOWERCASE])); - } - else - { - $upper_anom{$fields[$CODE]} = 1; - } - } - elsif ($fields[$LOWERCASE] ne '') - { - print STDERR ("Java missed upper case char \\u", - $fields[$CODE], "\n"); - } - elsif ($fields[$CATEGORY] eq 'Lu') - { - # This case is for letters which are marked as upper case - # but for which there is no lower case equivalent. For - # instance, LATIN LETTER YR. - } - - if ($fields[$LOWERCASE] eq '' - && ($fields[$UPPERCASE] ne '' - || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/)) - { - if ($fields[$UPPERCASE] ne '') - { - &update_case_block (*lower_start, *lower_end, *lower_map, - $fields[$CODE], $fields[$UPPERCASE]); - &set_attribute ($ncode, hex ($fields[$UPPERCASE])); - } - else - { - $lower_anom{$fields[$CODE]} = 1; - } - } - elsif ($fields[$UPPERCASE] ne '') - { - print STDERR ("Java missed lower case char \\u", - $fields[$CODE], "\n"); - } - elsif ($fields[$CATEGORY] eq 'Ll') - { - # This case is for letters which are marked as lower case - # but for which there is no upper case equivalent. For - # instance, FEMININE ORDINAL INDICATOR. - } - } - - - # If we have a non-decimal numeric value, add it to the list. - if ($fields[$CATEGORY] eq 'Nd' - && ($ncode < 0x2000 || $ncode > 0x2fff) - && $fields[$NAME] =~ /DIGIT/) - { - # This is a digit character that is handled elsewhere. - } - elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '') - { - # Do a simple check. - if ($fields[$DECIMAL] ne '') - { - # This catches bugs in an earlier implementation of - # chartables.pl. Now it is here for historical interest - # only. - # print STDERR ("Character \u", $fields[$CODE], - # " would have been missed as digit\n"); - } - - local ($val) = $fields[$DIGIT]; - $val = $fields[$NUMERIC] if $val eq ''; - local ($ok) = 1; - - # If we have a value which is not a positive integer, then we - # set the value to -2 to make life easier for - # Character.getNumericValue. - if ($val !~ m/^[0-9]+$/) - { - if ($fields[$CATEGORY] ne 'Nl' - && $fields[$CATEGORY] ne 'No') - { - # This shows a few errors in the Unicode table. These - # characters have a missing Numeric field, and the `N' - # for the mirrored field shows up there instead. I - # reported these characters to errata@unicode.org on - # Thu Sep 10 1998. They said it will be fixed in the - # 2.1.6 release of the tables. - print STDERR ("Character \u", $fields[$CODE], - " has value but is not numeric; val = '", - $val, "'\n"); - # We skip these. - $ok = 0; - } - $val = "-2"; - } - - if ($ok) - { - $numerics{$fields[$CODE]} = $val; - &set_attribute ($ncode, $val); - } - } - - # We build a table that lists ranges of ordinary decimal values. - # At each step we make sure that the digits are in the correct - # order, with no holes, as this is assumed by Character. If this - # fails, reimplementation is required. This implementation - # dovetails nicely with the Java Spec, which has strange rules for - # what constitutes a decimal value. In particular the Unicode - # name must contain the word `DIGIT'. The spec doesn't directly - # say that digits must have type `Nd' (or that their value must an - # integer), but that can be inferred from the list of digits in - # the book(s). Currently the only Unicode characters whose name - # includes `DIGIT' which would not fit are the Tibetan "half" - # digits. - if ($fields[$CATEGORY] eq 'Nd') - { - if (($ncode < 0x2000 || $ncode > 0x2fff) - && $fields[$NAME] =~ /DIGIT/) - { - &update_digit_block (*digit_start, *digit_end, $fields[$CODE], - $fields[$DECIMAL]); - &set_attribute ($ncode, $fields[$DECIMAL]); - } - else - { - # If this fails then Character.getType will fail. We - # assume that things in `digit_table' are the only - # category `Nd' characters. - print STDERR ("Character \u", $fields[$CODE], - " is class Nd but not in digit table\n"); - $status = 1; - } - } - - # Keep track of space characters. - if ($fields[$CATEGORY] =~ /Z[slp]/) - { - &update_block (*space_start, *space_end, $fields[$CODE]); - } - - # Keep track of letters. - # if ($fields[$CATEGORY] =~ /L[ultmo]/) - # { - # &update_letter_block (*letter_start, *letter_end, $fields[$CODE], - # $fields[$CATEGORY]); - # } - - # Keep track of all characters. You might think we wouldn't have - # to do this for uppercase letters, or other characters we already - # "classify". The problem is that this classification is - # different. E.g., \u216f is uppercase by Java rules, but is a - # LETTER_NUMBER here. - &update_all_block (*all_start, *all_end, *all_cats, - $fields[$CODE], $fields[$CATEGORY]); -} - - -# Called to add a new decomposition. -sub add_decomposition -{ - local ($ncode, $value) = @_; - local ($is_full) = 0; - local ($first) = 1; - local (@decomp) = (); - - foreach (split (' ', $value)) - { - if ($first && /^\<.*\>$/) - { - $is_full = 1; - } - else - { - push (@decomp, hex ($_)); - } - $first = 0; - } - - # We pack the value into a string because this means we can stick - # with Perl 4 features. - local ($s) = pack "I*", @decomp; - if ($is_full) - { - $full_decomposition{$ncode} = $s; - } - else - { - $canonical_decomposition{$ncode} = $s; - } -} - -# Write a single decomposition table. -sub write_single_decomposition -{ - local ($name, $is_canon, %table) = @_; - - printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n"; - - local ($key, @expansion, $char); - local ($first_line) = 1; - - for ($key = 0; $key <= 65535; ++$key) - { - next if ! defined $table{$key}; - - printf DECOMP ",\n" - unless $first_line; - $first_line = 0; - - printf DECOMP " { 0x%04x, \"", $key; - - # We represent the expansion as a series of bytes, terminated - # with a double nul. This is ugly, but relatively - # space-efficient. Most expansions are short, but there are a - # few that are very long (e.g. \uFDFA). This means that if we - # chose a fixed-space representation we would waste a lot of - # space. - @expansion = unpack "I*", $table{$key}; - foreach $char (@expansion) - { - printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256); - } - - printf DECOMP "\" }"; - } - - printf DECOMP "\n};\n\n"; -} - -sub write_decompositions -{ - &write_single_decomposition ('canonical', 1, %canonical_decomposition); - &write_single_decomposition ('full', 0, %full_decomposition); -} - -# We represent a block of characters with a pair of lists. This -# function updates the pair to account for the new character. Returns -# 1 if we added to the old block, 0 otherwise. -sub update_block -{ - local (*start, *end, $char) = @_; - - local ($nchar) = hex ($char); - local ($count) = $#end; - if ($count >= 0 && $end[$count] == $nchar - 1) - { - ++$end[$count]; - return 1; - } - else - { - ++$count; - $start[$count] = $nchar; - $end[$count] = $nchar; - } - return 0; -} - -# Return true if we will be appending this character to the end of the -# existing block. -sub block_append_p -{ - local (*end, $char) = @_; - return $#end >= 0 && $end[$#end] == $char - 1; -} - -# This updates the digit block. This table is much like an ordinary -# block, but it has an extra constraint. -sub update_digit_block -{ - local (*start, *end, $char, $value) = @_; - - &update_block ($start, $end, $char); - local ($nchar) = hex ($char); - - # We want to make sure that the new digit's value is correct for - # its place in the block. However, we special-case Tamil digits, - # since Tamil does not have a digit `0'. - local ($count) = $#start; - if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE) - && $nchar - $start[$count] != $value) - { - # If this fails then Character.digit_value will be wrong. - print STDERR "Character \\u", $char, " violates digit constraint\n"; - $status = 1; - } -} - -# Update letter table. We could be smart about avoiding upper or -# lower case letters, but it is much simpler to just track them all. -sub update_letter_block -{ - local (*start, *end, $char, $category) = @_; - - &update_block (*start, *end, $char); -} - -# Update `all' table. This table holds all the characters we don't -# already categorize for other reasons. FIXME: if a given type has -# very few characters, we should just inline the code. E.g., there is -# only one paragraph separator. -sub update_all_block -{ - local (*start, *end, *cats, $char, $category) = @_; - - local ($nchar) = hex ($char); - local ($count) = $#end; - if ($count >= 0 - && $end[$count] == $nchar - 1 - && $cats[$count] eq $category) - { - ++$end[$count]; - } - else - { - ++$count; - $start[$count] = $nchar; - $end[$count] = $nchar; - $cats[$count] = $category; - } -} - -# Update a case table. We handle case tables specially because we -# want to map (e.g.) a block of uppercase characters directly onto the -# corresponding block of lowercase characters. Therefore we generate -# a new entry when the block would no longer map directly. -sub update_case_block -{ - local (*start, *end, *map, $char, $mapchar) = @_; - - local ($nchar) = hex ($char); - local ($nmap) = hex ($mapchar); - - local ($count) = $#end; - if ($count >= 0 - && $end[$count] == $nchar - 1 - && $nchar - $start[$count] == $nmap - $map[$count]) - { - ++$end[$count]; - } - else - { - ++$count; - $start[$count] = $nchar; - $end[$count] = $nchar; - $map[$count] = $nmap; - } -} - -# Set the attribute value for the character. Each character can have -# only one attribute. -sub set_attribute -{ - local ($ncode, $attr) = @_; - - if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr) - { - if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END) - { - $second_attributes{$ncode} = $attr; - } - else - { - printf STDERR "character \\u%04x already has attribute\n", $ncode; - } - } - # Attributes can be interpreted as unsigned in some situations, - # so we check against 65535. This could cause errors -- we need - # to check the interpretation here. - elsif ($attr < -32768 || $attr > 65535) - { - printf STDERR "attribute out of range for character \\u%04x\n", $ncode; - } - else - { - $attributes{$ncode} = $attr; - } -} - - -# Print a block table. -sub print_block -{ - local ($title, *start, *end) = @_; - - print CHARTABLE "static const jchar ", $title, "[][2] =\n"; - print CHARTABLE " {\n"; - - local ($i) = 0; - while ($i <= $#start) - { - print CHARTABLE " { "; - &print_char ($start[$i]); - print CHARTABLE ", "; - &print_char ($end[$i]); - print CHARTABLE " }"; - print CHARTABLE "," if ($i != $#start); - print CHARTABLE "\n"; - ++$i; - $bytes += 4; # Two bytes per char. - } - - print CHARTABLE " };\n\n"; -} - -# Print the numerics table. -sub print_numerics -{ - local ($i, $key, $count, @keys); - - $i = 0; - @keys = sort keys %numerics; - $count = @keys; - - print CHARTABLE "static const jchar numeric_table[] =\n"; - print CHARTABLE " { "; - foreach $key (@keys) - { - &print_char (hex ($key)); - ++$i; - print CHARTABLE ", " if $i < $count; - # Print 5 per line. - print CHARTABLE "\n " if ($i % 5 == 0); - $bytes += 2; # One character. - } - print CHARTABLE " };\n\n"; - - print CHARTABLE "static const jshort numeric_value[] =\n"; - print CHARTABLE " { "; - $i = 0; - foreach $key (@keys) - { - print CHARTABLE $numerics{$key}; - if ($numerics{$key} > 32767 || $numerics{$key} < -32768) - { - # This means our generated type info is incorrect. We - # could just detect and work around this here, but I'm - # lazy. - print STDERR "numeric value won't fit in a short\n"; - $status = 1; - } - ++$i; - print CHARTABLE ", " if $i < $count; - # Print 10 per line. - print CHARTABLE "\n " if ($i % 10 == 0); - $bytes += 2; # One short. - } - print CHARTABLE " };\n\n"; -} - -# Print a table that maps one single letter onto another. It assumes -# the map is index by char code. -sub print_single_map -{ - local ($title, %map) = @_; - - local (@keys) = sort keys %map; - $num = @keys; - print CHARTABLE "static const jchar ", $title, "[][2] =\n"; - print CHARTABLE " {\n"; - $i = 0; - for $key (@keys) - { - print CHARTABLE " { "; - &print_char (hex ($key)); - print CHARTABLE ", "; - &print_char (hex ($map{$key})); - print CHARTABLE " }"; - ++$i; - if ($i < $num) - { - print CHARTABLE ","; - } - else - { - print CHARTABLE " "; - } - print CHARTABLE " // ", $name{$key}, "\n"; - $bytes += 4; # Two bytes per char. - } - print CHARTABLE " };\n\n"; -} - -# Print the `all' block. -sub print_all_block -{ - local (*start, *end, *cats) = @_; - - &print_block ("all_table", *start, *end); - - local ($i) = 0; - local ($sum) = 0; - while ($i <= $#start) - { - $sum += $end[$i] - $start[$i] + 1; - ++$i; - } - # We do this computation just to make sure it isn't cheaper to - # simply list all the characters individually. - printf STDERR ("all_table encodes %d characters in %d entries\n", - $sum, $#start + 1); - - print CHARTABLE "static const jbyte category_table[] =\n"; - print CHARTABLE " { "; - - $i = 0; - while ($i <= $#cats) - { - if ($i > 0 && $cats[$i] eq $cats[$i - 1]) - { - # This isn't an error. We can have a duplicate because - # two ranges are not adjacent while the intervening - # characters are left out of the table for other reasons. - # We could exploit this to make the table a little smaller. - # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i]; - } - print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; - print CHARTABLE ", " if ($i < $#cats); - ++$i; - print CHARTABLE "\n "; - ++$bytes; - } - print CHARTABLE " };\n\n"; -} - -# Print case table. -sub print_case_table -{ - local ($title, *start, *end, *map, *anomalous) = @_; - - &print_block ($title . '_case_table', *start, *end); - - print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n"; - print CHARTABLE " { "; - - local ($i) = 0; - while ($i <= $#map) - { - &print_char ($map[$i]); - print CHARTABLE ", " if $i < $#map; - ++$i; - print CHARTABLE "\n " if $i % 5 == 0; - $bytes += 2; - } - print CHARTABLE " };\n"; - - - local ($key, @keys); - @keys = sort keys %anomalous; - - if ($title eq 'upper') - { - if ($#keys >= 0) - { - # If these are found we need to change Character.isUpperCase. - print STDERR "Found anomalous upper case characters\n"; - $status = 1; - } - } - else - { - print CHARTABLE "\n"; - print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n"; - print CHARTABLE " { "; - $i = 0; - foreach $key (@keys) - { - &print_char (hex ($key)); - print CHARTABLE ", " if $i < $#keys; - ++$i; - print CHARTABLE "\n " if $i % 5 == 0; - $bytes += 2; - } - print CHARTABLE " };\n"; - } - - print CHARTABLE "\n"; -} - -# Print the type table and attributes table for the fast version. -sub print_fast_tables -{ - local (*start, *end, *cats, *atts, *second_atts) = @_; - - print CHARTABLE "static const jbyte type_table[] =\n{ "; - - local ($i, $j); - for ($i = 0; $i <= $#cats; ++$i) - { - for ($j = $start[$i]; $j <= $end[$i]; ++$j) - { - print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; - print CHARTABLE "," if ($i < $#cats || $j < $end[$i]); - print CHARTABLE "\n "; - } - } - print CHARTABLE "\n };\n\n"; - - print CHARTABLE "static const jshort attribute_table[] =\n{ "; - for ($i = 0; $i <= 0xffff; ++$i) - { - $atts{$i} = 0 if ! defined $atts{$i}; - print CHARTABLE $atts{$i}; - print CHARTABLE ", " if $i < 0xffff; - print CHARTABLE "\n " if $i % 5 == 1; - } - print CHARTABLE "\n };\n\n"; - - print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ "; - for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i) - { - print CHARTABLE $second_atts{$i}; - print CHARTABLE ", " if $i < $ROMAN_END; - print CHARTABLE "\n " if $i % 5 == 1; - } - print CHARTABLE "\n };\n\n"; -} - -# Print a character constant. -sub print_char -{ - local ($ncode) = @_; - printf CHARTABLE "0x%04x", $ncode; -} |