Implement lookbehind constraints in our regular-expression engine.

A lookbehind constraint is like a lookahead constraint in that it consumes no text; but it checks for existence (or nonexistence) of a match *ending* at the current point in the string, rather than one *starting* at the current point. This is a long-requested feature since it exists in many other regex libraries, but Henry Spencer had never got around to implementing it in the code we use. Just making it work is actually pretty trivial; but naive copying of the logic for lookahead constraints leads to code that often spends O(N^2) time to scan an N-character string, because we have to run the match engine from string start to the current probe point each time the constraint is checked. In typical use-cases a lookbehind constraint will be written at the start of the regex and hence will need to be checked at every character --- so O(N^2) work overall. To fix that, I introduced a third copy of the core DFA matching loop, paralleling the existing longest() and shortest() loops. This version, matchuntil(), can suspend and resume matching given a couple of pointers' worth of storage space. So we need only run it across the string once, stopping at each interesting probe point and then resuming to advance to the next one. I also put in an optimization that simplifies one-character lookahead and lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND constraints, which already existed in the engine. This avoids the overhead of the LACON machinery entirely for these rather common cases. The net result is that lookbehind constraints run a factor of three or so slower than Perl's for multi-character constraints, but faster than Perl's for one-character constraints ... and they work fine for variable-length constraints, which Perl gives up on entirely. So that's not bad from a competitive perspective, and there's room for further optimization if anyone cares. (In reality, raw scan rate across a large input string is probably not that big a deal for Postgres usage anyway; so I'm happy if it's linear.)
author: Tom Lane 2015-10-30 23:14:19 +0000
committer: Tom Lane 2015-10-30 23:14:19 +0000
commit: 12c9a04008870c283931d6b3b648ee21bbc2cfda (patch)
tree: 2afd1e048b3681e5a93b7d8b3c37968e71b2532d /src/backend/regex/rege_dfa.c
parent: c5057b2b34813ca114bc808cb56b7a7fcde64393 (diff)
1 files changed, 151 insertions, 11 deletions
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index a37e4b0ef96..7d90242acef 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -287,6 +287,130 @@ shortest(struct vars * v,
 }
 
 /*
+ * matchuntil - incremental matching engine
+ *
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*).  We determine whether a
+ * match exists starting at v->start and ending at probe.  Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ *
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+static int
+matchuntil(struct vars * v,
+		   struct dfa * d,
+		   chr *probe,			/* we want to know if a match ends here */
+		   struct sset ** lastcss,		/* state storage across calls */
+		   chr **lastcp)		/* state storage across calls */
+{
+	chr		   *cp = *lastcp;
+	color		co;
+	struct sset *css = *lastcss;
+	struct sset *ss;
+	struct colormap *cm = d->cm;
+
+	/* initialize and startup, or restart, if necessary */
+	if (cp == NULL || cp > probe)
+	{
+		cp = v->start;
+		css = initialize(v, d, cp);
+		if (css == NULL)
+			return 0;
+
+		FDEBUG((">>> startup >>>\n"));
+		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+
+		css = miss(v, d, css, co, cp, v->start);
+		if (css == NULL)
+			return 0;
+		css->lastseen = cp;
+	}
+	else if (css == NULL)
+	{
+		/* we previously found that no match is possible beyond *lastcp */
+		return 0;
+	}
+	ss = css;
+
+	/*
+	 * This is the main text-scanning loop.  It seems worth having two copies
+	 * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+	 * builds, when you're not actively tracing.
+	 */
+#ifdef REG_DEBUG
+	if (v->eflags & REG_FTRACE)
+	{
+		while (cp < probe)
+		{
+			FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+			ss = css->outs[co];
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, v->start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	}
+	else
+#endif
+	{
+		while (cp < probe)
+		{
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, v->start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	}
+
+	*lastcss = ss;
+	*lastcp = cp;
+
+	if (ss == NULL)
+		return 0;				/* impossible match, or internal error */
+
+	/* We need to process one more chr, or the EOS symbol, to check match */
+	if (cp < v->stop)
+	{
+		FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+		co = GETCOLOR(cm, *cp);
+		FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+		ss = css->outs[co];
+		if (ss == NULL)
+			ss = miss(v, d, css, co, cp + 1, v->start);
+	}
+	else
+	{
+		assert(cp == v->stop);
+		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+		ss = miss(v, d, css, co, cp, v->start);
+	}
+
+	if (ss == NULL || !(ss->flags & POSTSTATE))
+		return 0;
+
+	return 1;
+}
+
+/*
  * lastcold - determine last point at which no progress had been made
  */
 static chr *					/* endpoint, or NULL */
@@ -613,19 +737,19 @@ miss(struct vars * v,
 }
 
 /*
- * lacon - lookahead-constraint checker for miss()
+ * lacon - lookaround-constraint checker for miss()
  */
 static int						/* predicate:  constraint satisfied? */
 lacon(struct vars * v,
 	  struct cnfa * pcnfa,		/* parent cnfa */
 	  chr *cp,
-	  pcolor co)				/* "color" of the lookahead constraint */
+	  pcolor co)				/* "color" of the lookaround constraint */
 {
 	int			n;
 	struct subre *sub;
 	struct dfa *d;
-	struct smalldfa sd;
 	chr		   *end;
+	int			satisfied;
 
 	/* Since this is recursive, it could be driven to stack overflow */
 	if (STACK_TOO_DEEP(v->re))
@@ -635,19 +759,35 @@ lacon(struct vars * v,
 	}
 
 	n = co - pcnfa->ncolors;
-	assert(n < v->g->nlacons && v->g->lacons != NULL);
+	assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
 	FDEBUG(("=== testing lacon %d\n", n));
 	sub = &v->g->lacons[n];
-	d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+	d = getladfa(v, n);
 	if (d == NULL)
-	{
-		ERR(REG_ESPACE);
 		return 0;
+	if (LATYPE_IS_AHEAD(sub->subno))
+	{
+		/* used to use longest() here, but shortest() could be much cheaper */
+		end = shortest(v, d, cp, cp, v->stop,
+					   (chr **) NULL, (int *) NULL);
+		satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
+	}
+	else
+	{
+		/*
+		 * To avoid doing O(N^2) work when repeatedly testing a lookbehind
+		 * constraint in an N-character string, we use matchuntil() which can
+		 * cache the DFA state across calls.  We only need to restart if the
+		 * probe point decreases, which is not common.  The NFA we're using is
+		 * a search NFA, so it doesn't mind scanning over stuff before the
+		 * nominal match.
+		 */
+		satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
+		if (!LATYPE_IS_POS(sub->subno))
+			satisfied = !satisfied;
 	}
-	end = longest(v, d, cp, v->stop, (int *) NULL);
-	freedfa(d);
-	FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
-	return (sub->subno) ? (end != NULL) : (end == NULL);
+	FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
+	return satisfied;
 }
 
 /*
author	Tom Lane	2015-10-30 23:14:19 +0000
committer	Tom Lane	2015-10-30 23:14:19 +0000
commit	12c9a04008870c283931d6b3b648ee21bbc2cfda (patch)
tree	2afd1e048b3681e5a93b7d8b3c37968e71b2532d /src/backend/regex/rege_dfa.c
parent	c5057b2b34813ca114bc808cb56b7a7fcde64393 (diff)