summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNeil Conway2005-01-18 23:25:55 +0000
committerNeil Conway2005-01-18 23:25:55 +0000
commitb4297c177c466d2cc0268df6b16261ab4f3cb777 (patch)
treebc2f25fcf8ed589ba0fb82d37fc5c76bd9dd14ac
parent1f5299bc3fde2e738e7080af72d26a08da53cabd (diff)
This patch makes some improvements to the rtree index implementation:
(1) Keep a pin on the scan's current buffer and mark buffer. This avoids the need to do a ReadBuffer() for each tuple produced by the scan. Since ReadBuffer() is expensive, this is a significant win. (2) Convert a ReleaseBuffer(); ReadBuffer() pair into ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock acquire/release... (3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle both the "initial result" and "subsequent result" cases. (4) Add support for index tuple killing (5) Remove rtscancache(): it is dead code, for the same reason that gistscancache() is dead code (an index scan ought not be invoked with NoMovementScanDirection). The end result is about a 10% improvement in rtree index scan perf, according to contrib/rtree_gist/bench.
-rw-r--r--src/backend/access/rtree/rtget.c241
-rw-r--r--src/backend/access/rtree/rtree.c10
-rw-r--r--src/backend/access/rtree/rtscan.c43
-rw-r--r--src/include/access/rtree.h17
4 files changed, 157 insertions, 154 deletions
diff --git a/src/backend/access/rtree/rtget.c b/src/backend/access/rtree/rtget.c
index 868b178ef11..31963e81a06 100644
--- a/src/backend/access/rtree/rtget.c
+++ b/src/backend/access/rtree/rtget.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.33 2004/12/31 21:59:26 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.34 2005/01/18 23:25:43 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@@ -19,10 +19,8 @@
#include "access/relscan.h"
#include "access/rtree.h"
-static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
+static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
ScanDirection dir);
-static bool rtscancache(IndexScanDesc s, ScanDirection dir);
-static bool rtfirst(IndexScanDesc s, ScanDirection dir);
static bool rtnext(IndexScanDesc s, ScanDirection dir);
@@ -31,138 +29,106 @@ rtgettuple(PG_FUNCTION_ARGS)
{
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
- bool res;
-
- /* if we have it cached in the scan desc, just return the value */
- if (rtscancache(s, dir))
- PG_RETURN_BOOL(true);
-
- /* not cached, so we'll have to do some work */
- if (ItemPointerIsValid(&(s->currentItemData)))
- res = rtnext(s, dir);
- else
- res = rtfirst(s, dir);
- PG_RETURN_BOOL(res);
-}
-
-static bool
-rtfirst(IndexScanDesc s, ScanDirection dir)
-{
- Buffer b;
- Page p;
- OffsetNumber n;
- OffsetNumber maxoff;
- RTreePageOpaque po;
+ Page page;
+ OffsetNumber offnum;
RTreeScanOpaque so;
- RTSTACK *stk;
- BlockNumber blk;
- IndexTuple it;
- b = ReadBuffer(s->indexRelation, P_ROOT);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
- for (;;)
+ /*
+ * If we've already produced a tuple and the executor has informed
+ * us that it should be marked "killed", do so know.
+ */
+ if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
{
- maxoff = PageGetMaxOffsetNumber(p);
- if (ScanDirectionIsBackward(dir))
- n = findnext(s, p, maxoff, dir);
- else
- n = findnext(s, p, FirstOffsetNumber, dir);
-
- while (n < FirstOffsetNumber || n > maxoff)
- {
- ReleaseBuffer(b);
- if (so->s_stack == NULL)
- return false;
-
- stk = so->s_stack;
- b = ReadBuffer(s->indexRelation, stk->rts_blk);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
- maxoff = PageGetMaxOffsetNumber(p);
+ offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ page = BufferGetPage(so->curbuf);
+ PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
+ SetBufferCommitInfoNeedsSave(so->curbuf);
+ }
- if (ScanDirectionIsBackward(dir))
- n = OffsetNumberPrev(stk->rts_child);
- else
- n = OffsetNumberNext(stk->rts_child);
- so->s_stack = stk->rts_parent;
- pfree(stk);
+ /*
+ * Get the next tuple that matches the search key; if asked to
+ * skip killed tuples, find the first non-killed tuple that
+ * matches. Return as soon as we've run out of matches or we've
+ * found an acceptable match.
+ */
+ for (;;)
+ {
+ bool res = rtnext(s, dir);
- n = findnext(s, p, n, dir);
- }
- if (po->flags & F_LEAF)
+ if (res == true && s->ignore_killed_tuples)
{
- ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
- s->xs_ctup.t_self = it->t_tid;
-
- ReleaseBuffer(b);
- return true;
+ offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ page = BufferGetPage(so->curbuf);
+ if (ItemIdDeleted(PageGetItemId(page, offnum)))
+ continue;
}
- else
- {
- stk = (RTSTACK *) palloc(sizeof(RTSTACK));
- stk->rts_child = n;
- stk->rts_blk = BufferGetBlockNumber(b);
- stk->rts_parent = so->s_stack;
- so->s_stack = stk;
-
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- blk = ItemPointerGetBlockNumber(&(it->t_tid));
- ReleaseBuffer(b);
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
- po = (RTreePageOpaque) PageGetSpecialPointer(p);
- }
+ PG_RETURN_BOOL(res);
}
}
static bool
rtnext(IndexScanDesc s, ScanDirection dir)
{
- Buffer b;
Page p;
OffsetNumber n;
- OffsetNumber maxoff;
RTreePageOpaque po;
RTreeScanOpaque so;
- RTSTACK *stk;
- BlockNumber blk;
- IndexTuple it;
- blk = ItemPointerGetBlockNumber(&(s->currentItemData));
- n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ so = (RTreeScanOpaque) s->opaque;
- if (ScanDirectionIsForward(dir))
- n = OffsetNumberNext(n);
- else
- n = OffsetNumberPrev(n);
+ if (!ItemPointerIsValid(&(s->currentItemData)))
+ {
+ /* first call: start at the root */
+ Assert(BufferIsValid(so->curbuf) == false);
+ so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
+ }
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
- so = (RTreeScanOpaque) s->opaque;
+
+ if (!ItemPointerIsValid(&(s->currentItemData)))
+ {
+ /* first call: start at first/last offset */
+ if (ScanDirectionIsForward(dir))
+ n = FirstOffsetNumber;
+ else
+ n = PageGetMaxOffsetNumber(p);
+ }
+ else
+ {
+ /* go on to the next offset */
+ n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+ if (ScanDirectionIsForward(dir))
+ n = OffsetNumberNext(n);
+ else
+ n = OffsetNumberPrev(n);
+ }
for (;;)
{
- maxoff = PageGetMaxOffsetNumber(p);
- n = findnext(s, p, n, dir);
+ IndexTuple it;
+ RTSTACK *stk;
+
+ n = findnext(s, n, dir);
- while (n < FirstOffsetNumber || n > maxoff)
+ /* no match on this page, so read in the next stack entry */
+ if (n == InvalidOffsetNumber)
{
- ReleaseBuffer(b);
+ /* if out of stack entries, we're done */
if (so->s_stack == NULL)
+ {
+ ReleaseBuffer(so->curbuf);
+ so->curbuf = InvalidBuffer;
return false;
+ }
stk = so->s_stack;
- b = ReadBuffer(s->indexRelation, stk->rts_blk);
- p = BufferGetPage(b);
- maxoff = PageGetMaxOffsetNumber(p);
+ so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
+ stk->rts_blk);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@@ -172,33 +138,41 @@ rtnext(IndexScanDesc s, ScanDirection dir)
so->s_stack = stk->rts_parent;
pfree(stk);
- n = findnext(s, p, n, dir);
+ continue;
}
+
if (po->flags & F_LEAF)
{
- ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
-
+ ItemPointerSet(&(s->currentItemData),
+ BufferGetBlockNumber(so->curbuf),
+ n);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
-
s->xs_ctup.t_self = it->t_tid;
-
- ReleaseBuffer(b);
return true;
}
else
{
+ BlockNumber blk;
+
stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n;
- stk->rts_blk = BufferGetBlockNumber(b);
+ stk->rts_blk = BufferGetBlockNumber(so->curbuf);
stk->rts_parent = so->s_stack;
so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid));
- ReleaseBuffer(b);
- b = ReadBuffer(s->indexRelation, blk);
- p = BufferGetPage(b);
+ /*
+ * Note that we release the pin on the page as we descend
+ * down the tree, even though there's a good chance we'll
+ * eventually need to re-read the buffer later in this
+ * scan. This may or may not be optimal, but it doesn't
+ * seem likely to make a huge performance difference
+ * either way.
+ */
+ so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
+ p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@@ -209,17 +183,26 @@ rtnext(IndexScanDesc s, ScanDirection dir)
}
}
+/*
+ * Return the offset of the next matching index entry. We begin the
+ * search at offset "n" and search for matches in the direction
+ * "dir". If no more matching entries are found on the page,
+ * InvalidOffsetNumber is returned.
+ */
static OffsetNumber
-findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
+findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
{
OffsetNumber maxoff;
IndexTuple it;
RTreePageOpaque po;
RTreeScanOpaque so;
+ Page p;
+
+ so = (RTreeScanOpaque) s->opaque;
+ p = BufferGetPage(so->curbuf);
maxoff = PageGetMaxOffsetNumber(p);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
- so = (RTreeScanOpaque) s->opaque;
/*
* If we modified the index during the scan, we may have a pointer to
@@ -256,28 +239,8 @@ findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
n = OffsetNumberNext(n);
}
- return n;
-}
-
-static bool
-rtscancache(IndexScanDesc s, ScanDirection dir)
-{
- Buffer b;
- Page p;
- OffsetNumber n;
- IndexTuple it;
-
- if (!(ScanDirectionIsNoMovement(dir)
- && ItemPointerIsValid(&(s->currentItemData))))
- return false;
-
- b = ReadBuffer(s->indexRelation,
- ItemPointerGetBlockNumber(&(s->currentItemData)));
- p = BufferGetPage(b);
- n = ItemPointerGetOffsetNumber(&(s->currentItemData));
- it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
- s->xs_ctup.t_self = it->t_tid;
- ReleaseBuffer(b);
-
- return true;
+ if (n >= FirstOffsetNumber && n <= maxoff)
+ return n; /* found a match on this page */
+ else
+ return InvalidOffsetNumber; /* no match, go to next page */
}
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c
index c2dd2bb8c77..b1762bfe1fb 100644
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.85 2004/12/31 21:59:26 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.86 2005/01/18 23:25:47 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@@ -280,12 +280,8 @@ rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
do
{
- /* let go of current buffer before getting next */
- if (buffer != InvalidBuffer)
- ReleaseBuffer(buffer);
-
- /* get next buffer */
- buffer = ReadBuffer(r, blk);
+ /* release the current buffer, read in the next one */
+ buffer = ReleaseAndReadBuffer(buffer, r, blk);
page = (Page) BufferGetPage(buffer);
opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c
index 7801c5d5768..e608cb431b9 100644
--- a/src/backend/access/rtree/rtscan.c
+++ b/src/backend/access/rtree/rtscan.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.56 2004/12/31 21:59:26 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.57 2005/01/18 23:25:48 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@@ -89,12 +89,24 @@ rtrescan(PG_FUNCTION_ARGS)
freestack(p->s_markstk);
p->s_stack = p->s_markstk = NULL;
p->s_flags = 0x0;
+ /* drop pins on buffers -- no locks held */
+ if (BufferIsValid(p->curbuf))
+ {
+ ReleaseBuffer(p->curbuf);
+ p->curbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->markbuf))
+ {
+ ReleaseBuffer(p->markbuf);
+ p->markbuf = InvalidBuffer;
+ }
}
else
{
/* initialize opaque data */
p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
p->s_stack = p->s_markstk = NULL;
+ p->curbuf = p->markbuf = InvalidBuffer;
p->s_internalNKey = s->numberOfKeys;
p->s_flags = 0x0;
s->opaque = p;
@@ -175,6 +187,18 @@ rtmarkpos(PG_FUNCTION_ARGS)
freestack(p->s_markstk);
p->s_markstk = o;
+ /* Update markbuf: make sure to bump ref count on curbuf */
+ if (BufferIsValid(p->markbuf))
+ {
+ ReleaseBuffer(p->markbuf);
+ p->markbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->curbuf))
+ {
+ IncrBufferRefCount(p->curbuf);
+ p->markbuf = p->curbuf;
+ }
+
PG_RETURN_VOID();
}
@@ -211,6 +235,18 @@ rtrestrpos(PG_FUNCTION_ARGS)
freestack(p->s_stack);
p->s_stack = o;
+ /* Update curbuf; be sure to bump ref count on markbuf */
+ if (BufferIsValid(p->curbuf))
+ {
+ ReleaseBuffer(p->curbuf);
+ p->curbuf = InvalidBuffer;
+ }
+ if (BufferIsValid(p->markbuf))
+ {
+ IncrBufferRefCount(p->markbuf);
+ p->curbuf = p->markbuf;
+ }
+
PG_RETURN_VOID();
}
@@ -226,11 +262,14 @@ rtendscan(PG_FUNCTION_ARGS)
{
freestack(p->s_stack);
freestack(p->s_markstk);
+ if (BufferIsValid(p->curbuf))
+ ReleaseBuffer(p->curbuf);
+ if (BufferIsValid(p->markbuf))
+ ReleaseBuffer(p->markbuf);
pfree(s->opaque);
}
rtdropscan(s);
- /* XXX don't unset read lock -- two-phase locking */
PG_RETURN_VOID();
}
diff --git a/src/include/access/rtree.h b/src/include/access/rtree.h
index a5c76855f22..d06ccdcff09 100644
--- a/src/include/access/rtree.h
+++ b/src/include/access/rtree.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.36 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.37 2005/01/18 23:25:55 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@@ -59,11 +59,14 @@ typedef struct RTSTACK
/*
* When we're doing a scan, we need to keep track of the parent stack
* for the marked and current items. Also, rtrees have the following
- * property: if you're looking for the box (1,1,2,2), on the internal
- * nodes you have to search for all boxes that *contain* (1,1,2,2), and
- * not the ones that match it. We have a private scan key for internal
- * nodes in the opaque structure for rtrees for this reason. See
- * access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
+ * property: if you're looking for the box (1,1,2,2), on the internal
+ * nodes you have to search for all boxes that *contain* (1,1,2,2),
+ * and not the ones that match it. We have a private scan key for
+ * internal nodes in the opaque structure for rtrees for this reason.
+ * See access/index-rtree/rtscan.c and rtstrat.c for how it gets
+ * initialized. We also keep pins on the scan's current buffer and
+ * marked buffer, if any: this avoids the need to invoke ReadBuffer()
+ * for each tuple produced by the index scan.
*/
typedef struct RTreeScanOpaqueData
@@ -73,6 +76,8 @@ typedef struct RTreeScanOpaqueData
uint16 s_flags;
int s_internalNKey;
ScanKey s_internalKey;
+ Buffer curbuf;
+ Buffer markbuf;
} RTreeScanOpaqueData;
typedef RTreeScanOpaqueData *RTreeScanOpaque;