Fix misestimation of n_distinct for a nearly-unique column with many nulls.

author Tom Lane <[email protected]>
Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)

committer Tom Lane <[email protected]>
Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index 9a9b3c4f5a36373fb097822b9d5375919950c483..47efe9f2cc0be7ba1626326c03316f9cb7015d81 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -5307,9 +5307,9 @@
        <entry>The number of distinct nonnull data values in the column.
        A value greater than zero is the actual number of distinct values.
        A value less than zero is the negative of a multiplier for the number
-      of rows in the table; for example, a column in which values appear about
-      twice on the average could be represented by
-      <structfield>stadistinct</> = -0.5.
+      of rows in the table; for example, a column in which about 80% of the
+      values are nonnull and each nonnull value appears about twice on
+      average could be represented by <structfield>stadistinct</> = -0.4.
        A zero value means the number of distinct values is unknown.
        </entry>
       </row>
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c

index a4200b5549b259f3e4c503b63bdbf4b4d3af6e55..1afa12e5f5d025bcc974a810eba0321033c30b15 100644 (file)
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2096,8 +2096,11 @@ compute_minimal_stats(VacAttrStatsP stats,
  
         if (nmultiple == 0)
         {
-           /* If we found no repeated values, assume it's a unique column */
-           stats->stadistinct = -1.0;
+           /*
+            * If we found no repeated non-null values, assume it's a unique
+            * column; but be sure to discount for any nulls we found.
+            */
+           stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
         }
         else if (track_cnt < track_max && toowide_cnt == 0 &&
                  nmultiple == track_cnt)
@@ -2444,8 +2447,11 @@ compute_scalar_stats(VacAttrStatsP stats,
  
         if (nmultiple == 0)
         {
-           /* If we found no repeated values, assume it's a unique column */
-           stats->stadistinct = -1.0;
+           /*
+            * If we found no repeated non-null values, assume it's a unique
+            * column; but be sure to discount for any nulls we found.
+            */
+           stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
         }
         else if (toowide_cnt == 0 && nmultiple == ndistinct)
         {
@@ -2749,7 +2755,7 @@ compute_scalar_stats(VacAttrStatsP stats,
         else
             stats->stawidth = stats->attrtype->typlen;
         /* Assume all too-wide values are distinct, so it's a unique column */
-       stats->stadistinct = -1.0;
+       stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
     }
     else if (null_cnt > 0)
     {
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c

index e2e1afdc6bf9d0ccc6749238f6d1fe60514d4097..2bc13b234c45579b504f696c612e0fdd62b83142 100644 (file)
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -295,7 +295,7 @@ compute_tsvector_stats(VacAttrStats *stats,
         stats->stawidth = total_width / (double) nonnull_cnt;
  
         /* Assume it's a unique column (see notes above) */
-       stats->stadistinct = -1.0;
+       stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
  
         /*
          * Construct an array of the interesting hashtable items, that is,
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 59dcc30106ab8f62a55247404acb6477c43cdd72..9b377161ba88d8d78e3651dcdd80a3cbe660908d 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4623,6 +4623,7 @@ double
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
     double      stadistinct;
+   double      stanullfrac = 0.0;
     double      ntuples;
  
     *isdefault = false;
@@ -4630,7 +4631,8 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
     /*
      * Determine the stadistinct value to use.  There are cases where we can
      * get an estimate even without a pg_statistic entry, or can get a better
-    * value than is in pg_statistic.
+    * value than is in pg_statistic.  Grab stanullfrac too if we can find it
+    * (otherwise, assume no nulls, for lack of any better idea).
      */
     if (HeapTupleIsValid(vardata->statsTuple))
     {
@@ -4639,6 +4641,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  
         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
         stadistinct = stats->stadistinct;
+       stanullfrac = stats->stanullfrac;
     }
     else if (vardata->vartype == BOOLOID)
     {
@@ -4662,7 +4665,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
             {
                 case ObjectIdAttributeNumber:
                 case SelfItemPointerAttributeNumber:
-                   stadistinct = -1.0; /* unique */
+                   stadistinct = -1.0; /* unique (and all non null) */
                     break;
                 case TableOidAttributeNumber:
                     stadistinct = 1.0;  /* only 1 value */
@@ -4684,10 +4687,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
      * If there is a unique index or DISTINCT clause for the variable, assume
      * it is unique no matter what pg_statistic says; the statistics could be
      * out of date, or we might have found a partial unique index that proves
-    * the var is unique for this query.
+    * the var is unique for this query.  However, we'd better still believe
+    * the null-fraction statistic.
      */
     if (vardata->isunique)
-       stadistinct = -1.0;
+       stadistinct = -1.0 * (1.0 - stanullfrac);
  
     /*
      * If we had an absolute estimate, use that.
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h

index 0d4f69a8e98831e4b6e4d48f8be6126330f0809b..fc5d120e1e5b152d30ddb72b5f7fe335b1ab27b6 100644 (file)
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -57,13 +57,14 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
      *      > 0     actual number of distinct values
      *      < 0     negative of multiplier for number of rows
      * The special negative case allows us to cope with columns that are
-    * unique (stadistinct = -1) or nearly so (for example, a column in
-    * which values appear about twice on the average could be represented
-    * by stadistinct = -0.5).  Because the number-of-rows statistic in
-    * pg_class may be updated more frequently than pg_statistic is, it's
-    * important to be able to describe such situations as a multiple of
-    * the number of rows, rather than a fixed number of distinct values.
-    * But in other cases a fixed number is correct (eg, a boolean column).
+    * unique (stadistinct = -1) or nearly so (for example, a column in which
+    * non-null values appear about twice on the average could be represented
+    * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
+    * column is nulls).  Because the number-of-rows statistic in pg_class may
+    * be updated more frequently than pg_statistic is, it's important to be
+    * able to describe such situations as a multiple of the number of rows,
+    * rather than a fixed number of distinct values.  But in other cases a
+    * fixed number is correct (eg, a boolean column).
      * ----------------
      */
     float4      stadistinct;
author	Tom Lane <[email protected]>
	Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
committer	Tom Lane <[email protected]>
	Sun, 7 Aug 2016 22:52:02 +0000 (18:52 -0400)
doc/src/sgml/catalogs.sgml		patch | blob | blame | history
src/backend/commands/analyze.c		patch | blob | blame | history
src/backend/tsearch/ts_typanalyze.c		patch | blob | blame | history
src/backend/utils/adt/selfuncs.c		patch | blob | blame | history
src/include/catalog/pg_statistic.h		patch | blob | blame | history