Fix handling of NULL distances in KNN-GiST
authorAlexander Korotkov <[email protected]>
Sun, 8 Sep 2019 18:13:40 +0000 (21:13 +0300)
committerAlexander Korotkov <[email protected]>
Sun, 8 Sep 2019 19:30:12 +0000 (22:30 +0300)
In order to implement NULLS LAST semantics, GiST previously assumed the distance
to a NULL value to be Inf.  However, our distance functions can return Inf and
NaN for non-null values.  In such cases, the NULLS LAST semantics appear to be
broken.  This commit fixes that by introducing a separate array of null flags
for distances.

Backpatch to all supported versions.

Discussion: https://postgr.es/m/CAPpHfdsNvNdA0DBS%2BwMpFrgwT6C3-q50sFVGLSiuWnV3FqOJuQ%40mail.gmail.com
Author: Alexander Korotkov
Backpatch-through: 9.4

src/backend/access/gist/gistget.c
src/backend/access/gist/gistscan.c
src/include/access/gist_private.h

index 7a8692b5087d2ee6cba1bb47eb02ab13057dd0b7..8cf22048321dea2f26de0a4270a280d4ca230e98 100644 (file)
@@ -34,8 +34,9 @@
  * request it.  recheck is not interesting when examining a non-leaf entry,
  * since we must visit the lower index page if there's any doubt.
  *
- * If we are doing an ordered scan, so->distances[] is filled with distance
- * data from the distance() functions before returning success.
+ * If we are doing an ordered scan, so->distanceValues[] and
+ * so->distanceNulls[] are filled with distance data from the distance()
+ * functions before returning success.
  *
  * We must decompress the key in the IndexTuple before passing it to the
  * sk_funcs (which actually are the opclass Consistent or Distance methods).
@@ -55,7 +56,8 @@ gistindex_keytest(IndexScanDesc scan,
    GISTSTATE  *giststate = so->giststate;
    ScanKey     key = scan->keyData;
    int         keySize = scan->numberOfKeys;
-   double     *distance_p;
+   double     *distance_value_p;
+   bool       *distance_null_p;
    Relation    r = scan->indexRelation;
 
    *recheck_p = false;
@@ -72,7 +74,10 @@ gistindex_keytest(IndexScanDesc scan,
        if (GistPageIsLeaf(page))       /* shouldn't happen */
            elog(ERROR, "invalid GiST tuple found on leaf page");
        for (i = 0; i < scan->numberOfOrderBys; i++)
-           so->distances[i] = -get_float8_infinity();
+       {
+           so->distanceValues[i] = -get_float8_infinity();
+           so->distanceNulls[i] = false;
+       }
        return true;
    }
 
@@ -155,7 +160,8 @@ gistindex_keytest(IndexScanDesc scan,
 
    /* OK, it passes --- now let's compute the distances */
    key = scan->orderByData;
-   distance_p = so->distances;
+   distance_value_p = so->distanceValues;
+   distance_null_p = so->distanceNulls;
    keySize = scan->numberOfOrderBys;
    while (keySize > 0)
    {
@@ -169,8 +175,9 @@ gistindex_keytest(IndexScanDesc scan,
 
        if ((key->sk_flags & SK_ISNULL) || isNull)
        {
-           /* Assume distance computes as null and sorts to the end */
-           *distance_p = get_float8_infinity();
+           /* Assume distance computes as null */
+           *distance_value_p = 0.0;
+           *distance_null_p = true;
        }
        else
        {
@@ -199,14 +206,15 @@ gistindex_keytest(IndexScanDesc scan,
                                     key->sk_collation,
                                     PointerGetDatum(&de),
                                     key->sk_argument,
-                                    Int32GetDatum(key->sk_strategy),
+                                    Int16GetDatum(key->sk_strategy),
                                     ObjectIdGetDatum(key->sk_subtype));
-
-           *distance_p = DatumGetFloat8(dist);
+           *distance_value_p = DatumGetFloat8(dist);
+           *distance_null_p = false;
        }
 
        key++;
-       distance_p++;
+       distance_value_p++;
+       distance_null_p++;
        keySize--;
    }
 
@@ -219,7 +227,8 @@ gistindex_keytest(IndexScanDesc scan,
  *
  * scan: index scan we are executing
  * pageItem: search queue item identifying an index page to scan
- * myDistances: distances array associated with pageItem, or NULL at the root
+ * myDistanceValues: distances array associated with pageItem, or NULL at the root
+ * myDistanceNulls: null flags for myDistanceValues array, or NULL at the root
  * tbm: if not NULL, gistgetbitmap's output bitmap
  * ntids: if not NULL, gistgetbitmap's output tuple counter
  *
@@ -234,7 +243,8 @@ gistindex_keytest(IndexScanDesc scan,
  * sibling will be processed next.
  */
 static void
-gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
+gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
+            double *myDistanceValues, bool *myDistanceNulls,
             TIDBitmap *tbm, int64 *ntids)
 {
    GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
@@ -270,7 +280,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
        GISTSearchItem *item;
 
        /* This can't happen when starting at the root */
-       Assert(myDistances != NULL);
+       Assert(myDistanceValues != NULL && myDistanceNulls != NULL);
 
        oldcxt = MemoryContextSwitchTo(so->queueCxt);
 
@@ -283,8 +293,10 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
        /* Insert it into the queue using same distances as for this page */
        tmpItem->head = item;
        tmpItem->lastHeap = NULL;
-       memcpy(tmpItem->distances, myDistances,
-              sizeof(double) * scan->numberOfOrderBys);
+       memcpy(GISTSearchTreeItemDistanceValues(tmpItem, scan->numberOfOrderBys),
+              myDistanceValues, sizeof(double) * scan->numberOfOrderBys);
+       memcpy(GISTSearchTreeItemDistanceNulls(tmpItem, scan->numberOfOrderBys),
+              myDistanceNulls, sizeof(bool) * scan->numberOfOrderBys);
 
        (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
 
@@ -344,6 +356,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
             * search.
             */
            GISTSearchItem *item;
+           int         nOrderBys = scan->numberOfOrderBys;
 
            oldcxt = MemoryContextSwitchTo(so->queueCxt);
 
@@ -374,8 +387,10 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
            /* Insert it into the queue using new distance data */
            tmpItem->head = item;
            tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
-           memcpy(tmpItem->distances, so->distances,
-                  sizeof(double) * scan->numberOfOrderBys);
+           memcpy(GISTSearchTreeItemDistanceValues(tmpItem, nOrderBys),
+                  so->distanceValues, sizeof(double) * nOrderBys);
+           memcpy(GISTSearchTreeItemDistanceNulls(tmpItem, nOrderBys),
+                  so->distanceNulls, sizeof(bool) * nOrderBys);
 
            (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);
 
@@ -458,7 +473,10 @@ getNextNearest(IndexScanDesc scan)
            /* visit an index page, extract its items into queue */
            CHECK_FOR_INTERRUPTS();
 
-           gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
+           gistScanPage(scan, item,
+                        GISTSearchTreeItemDistanceValues(so->curTreeItem, scan->numberOfOrderBys),
+                        GISTSearchTreeItemDistanceNulls(so->curTreeItem, scan->numberOfOrderBys),
+                        NULL, NULL);
        }
 
        pfree(item);
@@ -496,7 +514,7 @@ gistgettuple(PG_FUNCTION_ARGS)
 
        fakeItem.blkno = GIST_ROOT_BLKNO;
        memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
-       gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
+       gistScanPage(scan, &fakeItem, NULL, NULL, NULL, NULL);
    }
 
    if (scan->numberOfOrderBys > 0)
@@ -534,7 +552,10 @@ gistgettuple(PG_FUNCTION_ARGS)
                 * this page, we fall out of the inner "do" and loop around to
                 * return them.
                 */
-               gistScanPage(scan, item, so->curTreeItem->distances, NULL, NULL);
+               gistScanPage(scan, item,
+                            GISTSearchTreeItemDistanceValues(so->curTreeItem, scan->numberOfOrderBys),
+                            GISTSearchTreeItemDistanceNulls(so->curTreeItem, scan->numberOfOrderBys),
+                            NULL, NULL);
 
                pfree(item);
            } while (so->nPageData == 0);
@@ -565,7 +586,7 @@ gistgetbitmap(PG_FUNCTION_ARGS)
 
    fakeItem.blkno = GIST_ROOT_BLKNO;
    memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
-   gistScanPage(scan, &fakeItem, NULL, tbm, &ntids);
+   gistScanPage(scan, &fakeItem, NULL, NULL, tbm, &ntids);
 
    /*
     * While scanning a leaf page, ItemPointers of matching heap tuples will
@@ -580,7 +601,10 @@ gistgetbitmap(PG_FUNCTION_ARGS)
 
        CHECK_FOR_INTERRUPTS();
 
-       gistScanPage(scan, item, so->curTreeItem->distances, tbm, &ntids);
+       gistScanPage(scan, item,
+                    GISTSearchTreeItemDistanceValues(so->curTreeItem, scan->numberOfOrderBys),
+                    GISTSearchTreeItemDistanceNulls(so->curTreeItem, scan->numberOfOrderBys),
+                    tbm, &ntids);
 
        pfree(item);
    }
index 154d23400c57133a5de0df8504f42282f0a50b2b..15a5339f91ff9e4f7ac15c3d735bf3ce11cc7e4e 100644 (file)
@@ -33,14 +33,30 @@ GISTSearchTreeItemComparator(const RBNode *a, const RBNode *b, void *arg)
    const GISTSearchTreeItem *sb = (const GISTSearchTreeItem *) b;
    IndexScanDesc scan = (IndexScanDesc) arg;
    int         i;
+   double     *da = GISTSearchTreeItemDistanceValues(sa, scan->numberOfOrderBys),
+              *db = GISTSearchTreeItemDistanceValues(sb, scan->numberOfOrderBys);
+   bool       *na = GISTSearchTreeItemDistanceNulls(sa, scan->numberOfOrderBys),
+              *nb = GISTSearchTreeItemDistanceNulls(sb, scan->numberOfOrderBys);
 
    /* Order according to distance comparison */
    for (i = 0; i < scan->numberOfOrderBys; i++)
    {
-       int         cmp = float8_cmp_internal(sa->distances[i], sb->distances[i]);
+       if (na[i])
+       {
+           if (!nb[i])
+               return 1;
+       }
+       else if (nb[i])
+       {
+           return -1;
+       }
+       else
+       {
+           int         cmp = float8_cmp_internal(da[i], db[i]);
 
-       if (cmp != 0)
-           return cmp;
+           if (cmp != 0)
+               return cmp;
+       }
    }
 
    return 0;
@@ -86,7 +102,7 @@ GISTSearchTreeItemAllocator(void *arg)
 {
    IndexScanDesc scan = (IndexScanDesc) arg;
 
-   return palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys);
+   return palloc(SizeOfGISTSearchTreeItem(scan->numberOfOrderBys));
 }
 
 static void
@@ -130,8 +146,9 @@ gistbeginscan(PG_FUNCTION_ARGS)
    so->queueCxt = giststate->scanCxt;  /* see gistrescan */
 
    /* workspaces with size dependent on numberOfOrderBys: */
-   so->tmpTreeItem = palloc(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys);
-   so->distances = palloc(sizeof(double) * scan->numberOfOrderBys);
+   so->tmpTreeItem = palloc(SizeOfGISTSearchTreeItem(scan->numberOfOrderBys));
+   so->distanceValues = palloc(sizeof(double) * scan->numberOfOrderBys);
+   so->distanceNulls = palloc(sizeof(bool) * scan->numberOfOrderBys);
    so->qual_ok = true;         /* in case there are zero keys */
 
    scan->opaque = so;
@@ -191,7 +208,7 @@ gistrescan(PG_FUNCTION_ARGS)
 
    /* create new, empty RBTree for search queue */
    oldCxt = MemoryContextSwitchTo(so->queueCxt);
-   so->queue = rb_create(GSTIHDRSZ + sizeof(double) * scan->numberOfOrderBys,
+   so->queue = rb_create(SizeOfGISTSearchTreeItem(scan->numberOfOrderBys),
                          GISTSearchTreeItemComparator,
                          GISTSearchTreeItemCombiner,
                          GISTSearchTreeItemAllocator,
index 089c67942138614ce4ac42dd9513ac0f7704d15d..878e296816f100bf7ba19b1b0d99998051c82bd6 100644 (file)
@@ -144,10 +144,28 @@ typedef struct GISTSearchTreeItem
    RBNode      rbnode;         /* this is an RBTree item */
    GISTSearchItem *head;       /* first chain member */
    GISTSearchItem *lastHeap;   /* last heap-tuple member, if any */
-   double      distances[1];   /* array with numberOfOrderBys entries */
+
+   /*
+    * This data structure is followed by arrays of distance values and
+    * distance null flags.  Size of both arrays is
+    * IndexScanDesc->numberOfOrderBys. See macros below for accessing those
+    * arrays.
+    */
 } GISTSearchTreeItem;
 
-#define GSTIHDRSZ offsetof(GISTSearchTreeItem, distances)
+#define SizeOfGISTSearchTreeItem(n_distances) (DOUBLEALIGN(sizeof(GISTSearchTreeItem)) + \
+   (sizeof(double) + sizeof(bool)) * (n_distances))
+
+/*
+ * We don't actually need n_distances to compute the pointer to distance values.
+ * Nevertheless, take n_distances as an argument to have the same argument list
+ * for GISTSearchTreeItemDistanceValues() and GISTSearchTreeItemDistanceNulls().
+ */
+#define GISTSearchTreeItemDistanceValues(item, n_distances) \
+   ((double *) ((Pointer) (item) + DOUBLEALIGN(sizeof(GISTSearchTreeItem))))
+
+#define GISTSearchTreeItemDistanceNulls(item, n_distances) \
+   ((bool *) ((Pointer) (item) + DOUBLEALIGN(sizeof(GISTSearchTreeItem)) + sizeof(double) * (n_distances)))
 
 /*
  * GISTScanOpaqueData: private state for a scan of a GiST index
@@ -164,7 +182,8 @@ typedef struct GISTScanOpaqueData
 
    /* pre-allocated workspace arrays */
    GISTSearchTreeItem *tmpTreeItem;    /* workspace to pass to rb_insert */
-   double     *distances;      /* output area for gistindex_keytest */
+   double     *distanceValues; /* output area for gistindex_keytest */
+   bool       *distanceNulls;
 
    /* In a non-ordered search, returnable heap items are stored here: */
    GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];