diff options
Diffstat (limited to 'src/common/unicode/generate-unicode_norm_table.pl')
-rw-r--r-- | src/common/unicode/generate-unicode_norm_table.pl | 234 |
1 files changed, 234 insertions, 0 deletions
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl new file mode 100644 index 0000000..7ce15e1 --- /dev/null +++ b/src/common/unicode/generate-unicode_norm_table.pl @@ -0,0 +1,234 @@ +#!/usr/bin/perl +# +# Generate a composition table, using Unicode data files as input +# +# Input: UnicodeData.txt and CompositionExclusions.txt +# Output: unicode_norm_table.h +# +# Copyright (c) 2000-2020, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $output_file = "unicode_norm_table.h"; + +my $FH; + +# Read list of codes that should be excluded from re-composition. +my @composition_exclusion_codes = (); +open($FH, '<', "CompositionExclusions.txt") + or die "Could not open CompositionExclusions.txt: $!."; +while (my $line = <$FH>) +{ + if ($line =~ /^([[:xdigit:]]+)/) + { + push @composition_exclusion_codes, $1; + } +} +close $FH; + +# Read entries from UnicodeData.txt into a list, and a hash table. We need +# three fields from each row: the codepoint, canonical combining class, +# and character decomposition mapping +my @characters = (); +my %character_hash = (); +open($FH, '<', "UnicodeData.txt") + or die "Could not open UnicodeData.txt: $!."; +while (my $line = <$FH>) +{ + + # Split the line wanted and get the fields needed: + # - Unicode code value + # - Canonical Combining Class + # - Character Decomposition Mapping + my @elts = split(';', $line); + my $code = $elts[0]; + my $class = $elts[3]; + my $decomp = $elts[5]; + + # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes + # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than + # 4 bytes. (This is just pro forma, as there aren't any such entries in + # the data file, currently.) + next if hex($code) > 0x10FFFF; + + # Skip characters with no decompositions and a class of 0, to reduce the + # table size. + next if $class eq '0' && $decomp eq ''; + + my %char_entry = (code => $code, class => $class, decomp => $decomp); + push(@characters, \%char_entry); + $character_hash{$code} = \%char_entry; +} +close $FH; + +my $num_characters = scalar @characters; + +# Start writing out the output file +open my $OUTPUT, '>', $output_file + or die "Could not open output file $output_file: $!\n"; + +print $OUTPUT <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_norm_table.h + * Composition table used for Unicode normalization + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_norm_table.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H + * here. + */ +typedef struct +{ + uint32 codepoint; /* Unicode codepoint */ + uint8 comb_class; /* combining class of character */ + uint8 dec_size_flags; /* size and flags of decomposition code list */ + uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the + * decomposition itself if DECOMP_INLINE */ +} pg_unicode_decomposition; + +#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */ +#define DECOMP_INLINE 0x40 /* decomposition is stored inline in + * dec_index */ +#define DECOMP_COMPAT 0x20 /* compatibility mapping */ + +#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) +#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) +#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) +#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) + +/* Table of Unicode codepoints and their decompositions */ +static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = +{ +HEADER + +my $decomp_index = 0; +my $decomp_string = ""; + +my $last_code = $characters[-1]->{code}; +foreach my $char (@characters) +{ + my $code = $char->{code}; + my $class = $char->{class}; + my $decomp = $char->{decomp}; + + # The character decomposition mapping field in UnicodeData.txt is a list + # of unicode codepoints, separated by space. But it can be prefixed with + # so-called compatibility formatting tag, like "<compat>", or "<font>". + # The entries with compatibility formatting tags should not be used for + # re-composing characters during normalization, so flag them in the table. + # (The tag doesn't matter, only whether there is a tag or not) + my $compat = 0; + if ($decomp =~ /\<.*\>/) + { + $compat = 1; + $decomp =~ s/\<[^][]*\>//g; + } + my @decomp_elts = split(" ", $decomp); + + # Decomposition size + # Print size of decomposition + my $decomp_size = scalar(@decomp_elts); + die if $decomp_size > 0x1F; # to not overrun bitmask + + my $first_decomp = shift @decomp_elts; + + my $flags = ""; + my $comment = ""; + + if ($compat) + { + $flags .= " | DECOMP_COMPAT"; + } + + if ($decomp_size == 2) + { + # Should this be used for recomposition? + if ( $character_hash{$first_decomp} + && $character_hash{$first_decomp}->{class} != 0) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "non-starter decomposition"; + } + else + { + foreach my $lcode (@composition_exclusion_codes) + { + if ($lcode eq $char->{code}) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "in exclusion list"; + last; + } + } + } + } + + if ($decomp_size == 0) + { + print $OUTPUT "\t{0x$code, $class, 0$flags, 0}"; + } + elsif ($decomp_size == 1 && length($first_decomp) <= 4) + { + + # The decomposition consists of a single codepoint, and it fits + # in a uint16, so we can store it "inline" in the main table. + $flags .= " | DECOMP_INLINE"; + print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; + } + else + { + print $OUTPUT + "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; + + # Now save the decompositions into a dedicated area that will + # be written afterwards. First build the entry dedicated to + # a sub-table with the code and decomposition. + $decomp_string .= ",\n" if ($decomp_string ne ""); + + $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp"; + foreach (@decomp_elts) + { + $decomp_string .= ", 0x$_"; + } + + $decomp_index = $decomp_index + $decomp_size; + } + + # Print a comma after all items except the last one. + print $OUTPUT "," unless ($code eq $last_code); + if ($comment ne "") + { + + # If the line is wide already, indent the comment with one tab, + # otherwise with two. This is to make the output match the way + # pgindent would mangle it. (This is quite hacky. To do this + # properly, we should actually track how long the line is so far, + # but this works for now.) + print $OUTPUT "\t" if ($decomp_index < 10); + + print $OUTPUT "\t/* $comment */" if ($comment ne ""); + } + print $OUTPUT "\n"; +} +print $OUTPUT "\n};\n\n"; + +# Print the array of decomposed codes. +print $OUTPUT <<HEADER; +/* codepoints array */ +static const uint32 UnicodeDecomp_codepoints[$decomp_index] = +{ +$decomp_string +}; +HEADER + +close $OUTPUT; |