diff options
author | Jeff Davis | 2025-01-23 17:06:50 +0000 |
---|---|---|
committer | Jeff Davis | 2025-01-23 17:06:50 +0000 |
commit | 4e7f62bc386a479593e4e8ecfb94370f5a88e522 (patch) | |
tree | 9f890349bfb0f2ba7ee7d64b0e3e8df198f3833d /src/common | |
parent | 7921927bbb9d4a80ced9283b27c26eedb638f555 (diff) |
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which
are parsed from CaseFolding.txt.
Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Diffstat (limited to 'src/common')
-rw-r--r-- | src/common/unicode/Makefile | 6 | ||||
-rw-r--r-- | src/common/unicode/case_test.c | 32 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_case_table.pl | 110 | ||||
-rw-r--r-- | src/common/unicode/meson.build | 4 | ||||
-rw-r--r-- | src/common/unicode_case.c | 32 |
5 files changed, 167 insertions, 17 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index 87d7355794f..f41c850c645 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian # These files are part of the Unicode Character Database. Download # them on demand. The dependency on Makefile.global is for # UNICODE_VERSION. -CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global +CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) unicode_version.h: generate-unicode_version.pl $(PERL) $< --version $(UNICODE_VERSION) -unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt +unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt $(PERL) $< unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt @@ -91,4 +91,4 @@ clean: rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o distclean: clean - rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h + rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h 
unicode_norm_table.h diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index c4ba7e781be..f0b38b3bdd7 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code) pg_wchar lower = unicode_lowercase_simple(code); pg_wchar title = unicode_titlecase_simple(code); pg_wchar upper = unicode_uppercase_simple(code); + pg_wchar fold = unicode_casefold_simple(code); pg_wchar iculower = u_tolower(code); pg_wchar icutitle = u_totitle(code); pg_wchar icuupper = u_toupper(code); + pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT); - if (lower != iculower || title != icutitle || upper != icuupper) + if (lower != iculower || title != icutitle || upper != icuupper || + fold != icufold) { printf("case_test: FAILURE for codepoint 0x%06x\n", code); - printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n", - lower, title, upper); - printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n", - iculower, icutitle, icuupper); + printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n", + lower, title, upper, fold); + printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n", + iculower, icutitle, icuupper, icufold); printf("\n"); exit(1); } @@ -103,9 +106,11 @@ icu_test_full(char *str) char lower[BUFSZ]; char title[BUFSZ]; char upper[BUFSZ]; + char fold[BUFSZ]; char icu_lower[BUFSZ]; char icu_title[BUFSZ]; char icu_upper[BUFSZ]; + char icu_fold[BUFSZ]; UErrorCode status; struct WordBoundaryState wbstate = { .str = str, @@ -118,12 +123,15 @@ icu_test_full(char *str) unicode_strlower(lower, BUFSZ, str, -1, true); unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate); unicode_strupper(upper, BUFSZ, str, -1, true); + unicode_strfold(fold, BUFSZ, str, -1, true); status = U_ZERO_ERROR; ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status); status = U_ZERO_ERROR; ucasemap_utf8ToTitle(casemap, 
icu_title, BUFSZ, str, -1, &status); status = U_ZERO_ERROR; ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status); + status = U_ZERO_ERROR; + ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status); if (strcmp(lower, icu_lower) != 0) { @@ -143,6 +151,12 @@ icu_test_full(char *str) icu_upper); exit(1); } + if (strcmp(fold, icu_fold) != 0) + { + printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold, + icu_fold); + exit(1); + } } /* @@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src, return unicode_strupper(dst, dstsize, src, srclen, true); } +static size_t +tfunc_fold(char *dst, size_t dstsize, const char *src, + ssize_t srclen) +{ + return unicode_strfold(dst, dstsize, src, srclen, true); +} static void test_convert_case() @@ -318,10 +338,12 @@ test_convert_case() test_convert(tfunc_upper, "ß", "SS"); test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307"); test_convert(tfunc_upper, "ıiIİ", "IIIİ"); + test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307"); /* test final sigma */ test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς"); test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'"); test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς"); + test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ"); #ifdef USE_ICU icu_test_full(""); diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl index 957ec14168c..953ebef2fe6 100644 --- a/src/common/unicode/generate-unicode_case_table.pl +++ b/src/common/unicode/generate-unicode_case_table.pl @@ -49,7 +49,8 @@ while (my $line = <$FH>) $simple{$code} = { Simple_Lowercase => ($simple_lowercase || $code), Simple_Titlecase => ($simple_titlecase || $code), - Simple_Uppercase => ($simple_uppercase || $code) + Simple_Uppercase => ($simple_uppercase || $code), + Simple_Foldcase => $code, }; } } @@ -87,6 +88,7 @@ while (my $line = <$FH>) my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1])); my @title = map { hex $_ } (grep 
/^[0-9A-F]+$/, (split /\s+/, $elts[2])); my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3])); + my @fold = (); my @conditions = map { # supporting negated conditions may require storing a # mask of relevant conditions for a given rule to differentiate @@ -101,6 +103,7 @@ while (my $line = <$FH>) push @lower, $code if (scalar @lower == 0); push @title, $code if (scalar @title == 0); push @upper, $code if (scalar @upper == 0); + push @fold, $code; # none should map to more than 3 codepoints die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'" @@ -114,13 +117,15 @@ while (my $line = <$FH>) while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 } while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 } while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 } + while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 } # Characters with special mappings may not have simple mappings; # ensure that an entry exists. $simple{$code} ||= { Simple_Lowercase => $code, Simple_Titlecase => $code, - Simple_Uppercase => $code + Simple_Uppercase => $code, + Simple_Foldcase => $code }; # Multiple special case rules for a single codepoint could be @@ -135,11 +140,96 @@ while (my $line = <$FH>) Lowercase => \@lower, Titlecase => \@title, Uppercase => \@upper, + Foldcase => \@fold, Conditions => $cond_str }; } close $FH; +open($FH, '<', "$output_path/CaseFolding.txt") + or die "Could not open $output_path/CaseFolding.txt: $!."; +while (my $line = <$FH>) +{ + # remove comments + $line =~ s/^(.*?)#.*$/$1/s; + + # ignore empty lines + next unless $line =~ /;/; + + my @elts = split(';', $line); + my $code = hex($elts[0]); + my $status = $elts[1] =~ s/^\s+|\s+$//rg; + + # Codepoint may map to multiple characters when folding. Split + # each mapping on whitespace and extract the hexadecimal into an + # array of codepoints. 
+ my @fold = map { hex $_ } (grep /[0-9A-F]+/, (split /\s+/, $elts[2])); + + die "codepoint $code out of range" if $code > 0x10FFFF; + + # status 'T' unsupported; skip + next if $status eq 'T'; + + # encountered unrecognized status type + die "unsupported status type '$status'" + if $status ne 'S' && $status ne 'C' && $status ne 'F'; + + # initialize simple case mappings if they don't exist + $simple{$code} ||= { + Simple_Lowercase => $code, + Simple_Titlecase => $code, + Simple_Uppercase => $code, + Simple_Foldcase => $code + }; + + if ($status eq 'S' || $status eq 'C') + { + die + "Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'" + if scalar @fold != 1; + my $simple_foldcase = $fold[0]; + + die "Simple_Foldcase $code out of range" + if $simple_foldcase > 0x10FFFF; + + $simple{$code}{Simple_Foldcase} = $simple_foldcase; + } + + if ($status eq 'F' || ($status eq 'C' && defined $special{$code})) + { + while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 } + + #initialize special case mappings if they don't exist + if (!defined $special{$code}) + { + my @lower = ($simple{$code}{Simple_Lowercase}); + my @title = ($simple{$code}{Simple_Titlecase}); + my @upper = ($simple{$code}{Simple_Uppercase}); + while (scalar @lower < $MAX_CASE_EXPANSION) + { + push @lower, 0x000000; + } + while (scalar @title < $MAX_CASE_EXPANSION) + { + push @title, 0x000000; + } + while (scalar @upper < $MAX_CASE_EXPANSION) + { + push @upper, 0x000000; + } + $special{$code} = { + Lowercase => \@lower, + Titlecase => \@title, + Uppercase => \@upper, + Conditions => '0' + }; + } + + $special{$code}{Foldcase} = \@fold; + } +} +close $FH; + # assign sequential array indexes to the special mappings my $special_idx = 0; foreach my $code (sort { $a <=> $b } (keys %special)) @@ -202,6 +292,7 @@ typedef enum CaseLower = 0, CaseTitle = 1, CaseUpper = 2, + CaseFold = 3, NCaseKind } CaseKind; @@ -232,14 +323,17 @@ foreach my $code (sort { $a <=> $b } (keys %special)) 
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION; die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION; die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION; + die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION; my $lower = join ", ", (map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} }); my $title = join ", ", (map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} }); my $upper = join ", ", (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} }); + my $fold = join ", ", + (map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} }); printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions}; - printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper; + printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold; } print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n"; @@ -260,11 +354,13 @@ for (my $code = 0; $code < 0x80; $code++) my $lc = ($simple{$code}{Simple_Lowercase} || $code); my $tc = ($simple{$code}{Simple_Titlecase} || $code); my $uc = ($simple{$code}{Simple_Uppercase} || $code); + my $fc = ($simple{$code}{Simple_Foldcase} || $code); + die "unexpected special case for code $code" if defined $special{$code}; printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n", - $code, $lc, $tc, $uc; + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n", + $code, $lc, $tc, $uc, $fc; } printf $OT "\n"; @@ -280,8 +376,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple)) $special_case = sprintf "&special_case[%d]", $special{$code}{Index}; } printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n", + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n", $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase}, - $map->{Simple_Uppercase}, $special_case; + 
$map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case; } print $OT "};\n"; diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build index 362cbae0285..b98940de279 100644 --- a/src/common/unicode/meson.build +++ b/src/common/unicode/meson.build @@ -11,7 +11,7 @@ endif # These files are part of the Unicode Character Database. Download them on # demand. -foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt'] +foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt'] url = unicode_baseurl.format(UNICODE_VERSION, f) target = custom_target(f, output: f, @@ -26,7 +26,7 @@ update_unicode_targets = [] update_unicode_targets += \ custom_target('unicode_case_table.h', - input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']], + input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']], output: ['unicode_case_table.h'], command: [ perl, files('generate-unicode_case_table.pl'), diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c index 48521d83239..7afff1b172b 100644 --- a/src/common/unicode_case.c +++ b/src/common/unicode_case.c @@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code) return map ? map->simplemap[CaseUpper] : code; } +pg_wchar +unicode_casefold_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseFold] : code; +} + /* * unicode_strlower() * @@ -143,6 +151,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, } /* + * unicode_strfold() + * + * Case fold src, and return the result length (not including terminating + * NUL). 
+ * + * String src must be encoded in UTF-8. If srclen < 0, src must be + * NUL-terminated. + * + * Result string is stored in dst, truncating if larger than dstsize. If + * dstsize is greater than the result length, dst will be NUL-terminated; + * otherwise not. + * + * If dstsize is zero, dst may be NULL. This is useful for calculating the + * required buffer size before allocating. + */ +size_t +unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, + bool full) +{ + return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL, + NULL); +} + +/* * Implement Unicode Default Case Conversion algorithm. * * If str_casekind is CaseLower or CaseUpper, map each character in the string |