summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorTom Lane2018-12-14 17:52:49 +0000
committerTom Lane2018-12-14 17:52:49 +0000
commit5e09280057a4c3f5db297348ea3e044c9c5f4ef8 (patch)
treea153ceede13d3b807d48d420896b6763d44c9086 /src/include
parent8fb569e978af3995f0dd6b0033758ec571aab0c1 (diff)
Make pg_statistic and related code account more honestly for collations.
When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/[email protected]
Diffstat (limited to 'src/include')
-rw-r--r--src/include/catalog/catversion.h2
-rw-r--r--src/include/catalog/pg_statistic.h43
-rw-r--r--src/include/commands/vacuum.h11
-rw-r--r--src/include/statistics/extended_stats_internal.h2
-rw-r--r--src/include/utils/lsyscache.h1
-rw-r--r--src/include/utils/typcache.h1
6 files changed, 40 insertions, 20 deletions
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index e16ec9dd778..838e927547f 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201812091
+#define CATALOG_VERSION_NO 201812141
#endif
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 49223aab4fc..2155f51a5b1 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -74,12 +74,13 @@ CATALOG(pg_statistic,2619,StatisticRelationId)
* statistical data can be placed. Each slot includes:
* kind integer code identifying kind of data (see below)
* op OID of associated operator, if needed
+ * coll OID of relevant collation, or 0 if none
* numbers float4 array (for statistical values)
* values anyarray (for representations of data values)
- * The ID and operator fields are never NULL; they are zeroes in an
- * unused slot. The numbers and values fields are NULL in an unused
- * slot, and might also be NULL in a used slot if the slot kind has
- * no need for one or the other.
+ * The ID, operator, and collation fields are never NULL; they are zeroes
+ * in an unused slot. The numbers and values fields are NULL in an
+ * unused slot, and might also be NULL in a used slot if the slot kind
+ * has no need for one or the other.
* ----------------
*/
@@ -95,6 +96,12 @@ CATALOG(pg_statistic,2619,StatisticRelationId)
Oid staop4;
Oid staop5;
+ Oid stacoll1;
+ Oid stacoll2;
+ Oid stacoll3;
+ Oid stacoll4;
+ Oid stacoll5;
+
#ifdef CATALOG_VARLEN /* variable-length fields start here */
float4 stanumbers1[1];
float4 stanumbers2[1];
@@ -159,7 +166,8 @@ typedef FormData_pg_statistic *Form_pg_statistic;
/*
* In a "most common values" slot, staop is the OID of the "=" operator
- * used to decide whether values are the same or not. stavalues contains
+ * used to decide whether values are the same or not, and stacoll is the
+ * collation used (same as column's collation). stavalues contains
* the K most common non-null values appearing in the column, and stanumbers
* contains their frequencies (fractions of total row count). The values
* shall be ordered in decreasing frequency. Note that since the arrays are
@@ -171,9 +179,11 @@ typedef FormData_pg_statistic *Form_pg_statistic;
/*
* A "histogram" slot describes the distribution of scalar data. staop is
- * the OID of the "<" operator that describes the sort ordering. (In theory,
- * more than one histogram could appear, if a datatype has more than one
- * useful sort operator.) stavalues contains M (>=2) non-null values that
+ * the OID of the "<" operator that describes the sort ordering, and stacoll
+ * is the relevant collation. (In theory more than one histogram could appear,
+ * if a datatype has more than one useful sort operator or we care about more
+ * than one collation. Currently the collation will always be that of the
+ * underlying column.) stavalues contains M (>=2) non-null values that
* divide the non-null column data values into M-1 bins of approximately equal
* population. The first stavalues item is the MIN and the last is the MAX.
* stanumbers is not used and should be NULL. IMPORTANT POINT: if an MCV
@@ -190,11 +200,12 @@ typedef FormData_pg_statistic *Form_pg_statistic;
/*
* A "correlation" slot describes the correlation between the physical order
* of table tuples and the ordering of data values of this column, as seen
- * by the "<" operator identified by staop. (As with the histogram, more
- * than one entry could theoretically appear.) stavalues is not used and
- * should be NULL. stanumbers contains a single entry, the correlation
- * coefficient between the sequence of data values and the sequence of
- * their actual tuple positions. The coefficient ranges from +1 to -1.
+ * by the "<" operator identified by staop with the collation identified by
+ * stacoll. (As with the histogram, more than one entry could theoretically
+ * appear.) stavalues is not used and should be NULL. stanumbers contains
+ * a single entry, the correlation coefficient between the sequence of data
+ * values and the sequence of their actual tuple positions. The coefficient
+ * ranges from +1 to -1.
*/
#define STATISTIC_KIND_CORRELATION 3
@@ -203,7 +214,8 @@ typedef FormData_pg_statistic *Form_pg_statistic;
* except that it stores the most common non-null *elements* of the column
* values. This is useful when the column datatype is an array or some other
* type with identifiable elements (for instance, tsvector). staop contains
- * the equality operator appropriate to the element type. stavalues contains
+ * the equality operator appropriate to the element type, and stacoll
+ * contains the collation to use with it. stavalues contains
* the most common element values, and stanumbers their frequencies. Unlike
* MCV slots, frequencies are measured as the fraction of non-null rows the
* element value appears in, not the frequency of all rows. Also unlike
@@ -226,7 +238,8 @@ typedef FormData_pg_statistic *Form_pg_statistic;
* A "distinct elements count histogram" slot describes the distribution of
* the number of distinct element values present in each row of an array-type
* column. Only non-null rows are considered, and only non-null elements.
- * staop contains the equality operator appropriate to the element type.
+ * staop contains the equality operator appropriate to the element type,
+ * and stacoll contains the collation to use with it.
* stavalues is not used and should be NULL. The last member of stanumbers is
* the average count of distinct element values over all non-null rows. The
* preceding M (>=2) members form a histogram that divides the population of
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 2f4303e40d8..dfff23ac55b 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -52,9 +52,11 @@
* careful to allocate any pointed-to data in anl_context, which will NOT
* be CurrentMemoryContext when compute_stats is called.
*
- * Note: for the moment, all comparisons done for statistical purposes
- * should use the database's default collation (DEFAULT_COLLATION_OID).
- * This might change in some future release.
+ * Note: all comparisons done for statistical purposes should use the
+ * underlying column's collation (attcollation), except in situations
+ * where a noncollatable container type contains a collatable type;
+ * in that case use the type's default collation. Be sure to record
+ * the appropriate collation in stacoll.
*----------
*/
typedef struct VacAttrStats *VacAttrStatsP;
@@ -78,11 +80,13 @@ typedef struct VacAttrStats
* because some index opclasses store a different type than the underlying
* column/expression. Instead use attrtypid, attrtypmod, and attrtype for
* information about the datatype being fed to the typanalyze function.
+ * Likewise, use attrcollid not attr->attcollation.
*/
Form_pg_attribute attr; /* copy of pg_attribute row for column */
Oid attrtypid; /* type of data being analyzed */
int32 attrtypmod; /* typmod of data being analyzed */
Form_pg_type attrtype; /* copy of pg_type row for attrtypid */
+ Oid attrcollid; /* collation of data being analyzed */
MemoryContext anl_context; /* where to save long-lived data */
/*
@@ -103,6 +107,7 @@ typedef struct VacAttrStats
float4 stadistinct; /* # distinct values */
int16 stakind[STATISTIC_NUM_SLOTS];
Oid staop[STATISTIC_NUM_SLOTS];
+ Oid stacoll[STATISTIC_NUM_SLOTS];
int numnumbers[STATISTIC_NUM_SLOTS];
float4 *stanumbers[STATISTIC_NUM_SLOTS];
int numvalues[STATISTIC_NUM_SLOTS];
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index b3ca0c1229f..fff6bc67991 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -59,7 +59,7 @@ extern MVDependencies *statext_dependencies_deserialize(bytea *data);
extern MultiSortSupport multi_sort_init(int ndims);
extern void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
- Oid oper);
+ Oid oper, Oid collation);
extern int multi_sort_compare(const void *a, const void *b, void *arg);
extern int multi_sort_compare_dim(int dim, const SortItem *a,
const SortItem *b, MultiSortSupport mss);
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index ff1705ad2b8..64089930019 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -44,6 +44,7 @@ typedef struct AttStatsSlot
{
/* Always filled: */
Oid staop; /* Actual staop for the found slot */
+ Oid stacoll; /* Actual collation for the found slot */
/* Filled if ATTSTATSSLOT_VALUES is specified: */
Oid valuetype; /* Actual datatype of the values */
Datum *values; /* slot's "values" array, or NULL if none */
diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h
index 217d064da52..2b299608cfc 100644
--- a/src/include/utils/typcache.h
+++ b/src/include/utils/typcache.h
@@ -41,6 +41,7 @@ typedef struct TypeCacheEntry
char typtype;
Oid typrelid;
Oid typelem;
+ Oid typcollation;
/*
* Information obtained from opfamily entries