summary | refs | log | tree | commit | diff
path: root/src/common/unicode
diff options
context:
space:
mode:
author: Jeff Davis, 2025-01-23 17:06:50 +0000
committer: Jeff Davis, 2025-01-23 17:06:50 +0000
commit: 4e7f62bc386a479593e4e8ecfb94370f5a88e522 (patch)
tree: 9f890349bfb0f2ba7ee7d64b0e3e8df198f3833d /src/common/unicode
parent: 7921927bbb9d4a80ced9283b27c26eedb638f555 (diff)
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt.
Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Diffstat (limited to 'src/common/unicode')
-rw-r--r-- src/common/unicode/Makefile | 6
-rw-r--r-- src/common/unicode/case_test.c | 32
-rw-r--r-- src/common/unicode/generate-unicode_case_table.pl | 110
-rw-r--r-- src/common/unicode/meson.build | 4
4 files changed, 135 insertions, 17 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index 87d7355794f..f41c850c645 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
# These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
unicode_version.h: generate-unicode_version.pl
$(PERL) $< --version $(UNICODE_VERSION)
-unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
+unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
$(PERL) $<
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
@@ -91,4 +91,4 @@ clean:
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
distclean: clean
- rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
+ rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index c4ba7e781be..f0b38b3bdd7 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
pg_wchar lower = unicode_lowercase_simple(code);
pg_wchar title = unicode_titlecase_simple(code);
pg_wchar upper = unicode_uppercase_simple(code);
+ pg_wchar fold = unicode_casefold_simple(code);
pg_wchar iculower = u_tolower(code);
pg_wchar icutitle = u_totitle(code);
pg_wchar icuupper = u_toupper(code);
+ pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
- if (lower != iculower || title != icutitle || upper != icuupper)
+ if (lower != iculower || title != icutitle || upper != icuupper ||
+ fold != icufold)
{
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
- printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
- lower, title, upper);
- printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
- iculower, icutitle, icuupper);
+ printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
+ lower, title, upper, fold);
+ printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
+ iculower, icutitle, icuupper, icufold);
printf("\n");
exit(1);
}
@@ -103,9 +106,11 @@ icu_test_full(char *str)
char lower[BUFSZ];
char title[BUFSZ];
char upper[BUFSZ];
+ char fold[BUFSZ];
char icu_lower[BUFSZ];
char icu_title[BUFSZ];
char icu_upper[BUFSZ];
+ char icu_fold[BUFSZ];
UErrorCode status;
struct WordBoundaryState wbstate = {
.str = str,
@@ -118,12 +123,15 @@ icu_test_full(char *str)
unicode_strlower(lower, BUFSZ, str, -1, true);
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
unicode_strupper(upper, BUFSZ, str, -1, true);
+ unicode_strfold(fold, BUFSZ, str, -1, true);
status = U_ZERO_ERROR;
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
status = U_ZERO_ERROR;
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
status = U_ZERO_ERROR;
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+ status = U_ZERO_ERROR;
+ ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
if (strcmp(lower, icu_lower) != 0)
{
@@ -143,6 +151,12 @@ icu_test_full(char *str)
icu_upper);
exit(1);
}
+ if (strcmp(fold, icu_fold) != 0)
+ {
+ printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
+ icu_fold);
+ exit(1);
+ }
}
/*
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
return unicode_strupper(dst, dstsize, src, srclen, true);
}
+static size_t
+tfunc_fold(char *dst, size_t dstsize, const char *src,
+ ssize_t srclen)
+{
+ return unicode_strfold(dst, dstsize, src, srclen, true);
+}
static void
test_convert_case()
@@ -318,10 +338,12 @@ test_convert_case()
test_convert(tfunc_upper, "ß", "SS");
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+ test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
/* test final sigma */
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+ test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
#ifdef USE_ICU
icu_test_full("");
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 957ec14168c..953ebef2fe6 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -49,7 +49,8 @@ while (my $line = <$FH>)
$simple{$code} = {
Simple_Lowercase => ($simple_lowercase || $code),
Simple_Titlecase => ($simple_titlecase || $code),
- Simple_Uppercase => ($simple_uppercase || $code)
+ Simple_Uppercase => ($simple_uppercase || $code),
+ Simple_Foldcase => $code,
};
}
}
@@ -87,6 +88,7 @@ while (my $line = <$FH>)
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
+ my @fold = ();
my @conditions = map {
# supporting negated conditions may require storing a
# mask of relevant conditions for a given rule to differentiate
@@ -101,6 +103,7 @@ while (my $line = <$FH>)
push @lower, $code if (scalar @lower == 0);
push @title, $code if (scalar @title == 0);
push @upper, $code if (scalar @upper == 0);
+ push @fold, $code;
# none should map to more than 3 codepoints
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
@@ -114,13 +117,15 @@ while (my $line = <$FH>)
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
+ while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
# Characters with special mappings may not have simple mappings;
# ensure that an entry exists.
$simple{$code} ||= {
Simple_Lowercase => $code,
Simple_Titlecase => $code,
- Simple_Uppercase => $code
+ Simple_Uppercase => $code,
+ Simple_Foldcase => $code
};
# Multiple special case rules for a single codepoint could be
@@ -135,11 +140,96 @@ while (my $line = <$FH>)
Lowercase => \@lower,
Titlecase => \@title,
Uppercase => \@upper,
+ Foldcase => \@fold,
Conditions => $cond_str
};
}
close $FH;
+open($FH, '<', "$output_path/CaseFolding.txt")
+ or die "Could not open $output_path/CaseFolding.txt: $!.";
+while (my $line = <$FH>)
+{
+ # remove comments
+ $line =~ s/^(.*?)#.*$/$1/s;
+
+ # ignore empty lines
+ next unless $line =~ /;/;
+
+ my @elts = split(';', $line);
+ my $code = hex($elts[0]);
+ my $status = $elts[1] =~ s/^\s+|\s+$//rg;
+
+ # Codepoint may map to multiple characters when folding. Split
+ # each mapping on whitespace and extract the hexadecimal into an
+ # array of codepoints.
+ my @fold = map { hex $_ } (grep /[0-9A-F]+/, (split /\s+/, $elts[2]));
+
+ die "codepoint $code out of range" if $code > 0x10FFFF;
+
+ # status 'T' unsupported; skip
+ next if $status eq 'T';
+
+ # encountered unrecognized status type
+ die "unsupported status type '$status'"
+ if $status ne 'S' && $status ne 'C' && $status ne 'F';
+
+ # initialize simple case mappings if they don't exist
+ $simple{$code} ||= {
+ Simple_Lowercase => $code,
+ Simple_Titlecase => $code,
+ Simple_Uppercase => $code,
+ Simple_Foldcase => $code
+ };
+
+ if ($status eq 'S' || $status eq 'C')
+ {
+ die
+ "Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'"
+ if scalar @fold != 1;
+ my $simple_foldcase = $fold[0];
+
+ die "Simple_Foldcase $code out of range"
+ if $simple_foldcase > 0x10FFFF;
+
+ $simple{$code}{Simple_Foldcase} = $simple_foldcase;
+ }
+
+ if ($status eq 'F' || ($status eq 'C' && defined $special{$code}))
+ {
+ while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
+
+ #initialize special case mappings if they don't exist
+ if (!defined $special{$code})
+ {
+ my @lower = ($simple{$code}{Simple_Lowercase});
+ my @title = ($simple{$code}{Simple_Titlecase});
+ my @upper = ($simple{$code}{Simple_Uppercase});
+ while (scalar @lower < $MAX_CASE_EXPANSION)
+ {
+ push @lower, 0x000000;
+ }
+ while (scalar @title < $MAX_CASE_EXPANSION)
+ {
+ push @title, 0x000000;
+ }
+ while (scalar @upper < $MAX_CASE_EXPANSION)
+ {
+ push @upper, 0x000000;
+ }
+ $special{$code} = {
+ Lowercase => \@lower,
+ Titlecase => \@title,
+ Uppercase => \@upper,
+ Conditions => '0'
+ };
+ }
+
+ $special{$code}{Foldcase} = \@fold;
+ }
+}
+close $FH;
+
# assign sequential array indexes to the special mappings
my $special_idx = 0;
foreach my $code (sort { $a <=> $b } (keys %special))
@@ -202,6 +292,7 @@ typedef enum
CaseLower = 0,
CaseTitle = 1,
CaseUpper = 2,
+ CaseFold = 3,
NCaseKind
} CaseKind;
@@ -232,14 +323,17 @@ foreach my $code (sort { $a <=> $b } (keys %special))
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
+ die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION;
my $lower = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
my $title = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
my $upper = join ", ",
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
+ my $fold = join ", ",
+ (map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
- printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
+ printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
}
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
@@ -260,11 +354,13 @@ for (my $code = 0; $code < 0x80; $code++)
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
+ my $fc = ($simple{$code}{Simple_Foldcase} || $code);
+
die "unexpected special case for code $code"
if defined $special{$code};
printf $OT
- "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
- $code, $lc, $tc, $uc;
+ "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
+ $code, $lc, $tc, $uc, $fc;
}
printf $OT "\n";
@@ -280,8 +376,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
}
printf $OT
- "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
+ "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
- $map->{Simple_Uppercase}, $special_case;
+ $map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
}
print $OT "};\n";
diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build
index 362cbae0285..b98940de279 100644
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@@ -11,7 +11,7 @@ endif
# These files are part of the Unicode Character Database. Download them on
# demand.
-foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
+foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
url = unicode_baseurl.format(UNICODE_VERSION, f)
target = custom_target(f,
output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []
update_unicode_targets += \
custom_target('unicode_case_table.h',
- input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
+ input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
output: ['unicode_case_table.h'],
command: [
perl, files('generate-unicode_case_table.pl'),