summaryrefslogtreecommitdiff
path: root/src/backend/regex/rege_dfa.c
diff options
context:
space:
mode:
author: Tom Lane, 2021-02-20 23:31:19 +0000
committer: Tom Lane, 2021-02-20 23:31:19 +0000
commit: 824bf71902db4a2067b8d64583c9d88bb264c44b (patch)
tree: 1d7abed532407dff795e63e9692ea742465c07de /src/backend/regex/rege_dfa.c
parent: 08c0d6ad65f7c161add82ae906efb90dbd7f653d (diff)
Recognize "match-all" NFAs within the regex engine.
This builds on the previous "rainbow" patch to detect NFAs that will match any string, though possibly with constraints on the string length. This definition is chosen to match constructs such as ".*", ".+", and ".{1,100}". Recognizing such an NFA after the optimization pass is fairly cheap, since we basically just have to verify that all arcs are RAINBOW arcs and count the number of steps to the end state. (Well, there's a bit of complication with pseudo-color arcs for string boundary conditions, but not much.) Once we have these markings, the regex executor functions longest(), shortest(), and matchuntil() don't have to expend per-character work to determine whether a given substring satisfies such an NFA; they just need to check its length against the bounds. Since some matching problems require O(N) invocations of these functions, we've reduced the runtime for an N-character string from O(N^2) to O(N). Of course, this is no help for non-matchall sub-patterns, but those usually have constraints that allow us to avoid needing O(N) substring checks in the first place. It's precisely the unconstrained "match-all" cases that cause the most headaches. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/[email protected]
Diffstat (limited to 'src/backend/regex/rege_dfa.c')
-rw-r--r-- src/backend/regex/rege_dfa.c | 57
1 files changed, 57 insertions, 0 deletions
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index 32be2592c56..89d162ed6af 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -58,6 +58,29 @@ longest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = stop - start;
+ size_t maxmatchall = d->cnfa->maxmatchall;
+
+ if (nchr < d->cnfa->minmatchall)
+ return NULL;
+ if (maxmatchall == DUPINF)
+ {
+ if (stop == v->stop && hitstopp != NULL)
+ *hitstopp = 1;
+ }
+ else
+ {
+ if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
+ *hitstopp = 1;
+ if (nchr > maxmatchall)
+ return start + maxmatchall;
+ }
+ return stop;
+ }
+
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@@ -187,6 +210,24 @@ shortest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = min - start;
+
+ if (d->cnfa->maxmatchall != DUPINF &&
+ nchr > d->cnfa->maxmatchall)
+ return NULL;
+ if ((max - start) < d->cnfa->minmatchall)
+ return NULL;
+ if (nchr < d->cnfa->minmatchall)
+ min = start + d->cnfa->minmatchall;
+ if (coldp != NULL)
+ *coldp = start;
+ /* there is no case where we should set *hitstopp */
+ return min;
+ }
+
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@@ -312,6 +353,22 @@ matchuntil(struct vars *v,
struct sset *ss;
struct colormap *cm = d->cm;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = probe - v->start;
+
+ /*
+ * It might seem that we should check maxmatchall too, but the .* at
+ * the front of the pattern absorbs any extra characters (and it was
+ * tacked on *after* computing minmatchall/maxmatchall). Thus, we
+ * should match if there are at least minmatchall characters.
+ */
+ if (nchr < d->cnfa->minmatchall)
+ return 0;
+ return 1;
+ }
+
/* initialize and startup, or restart, if necessary */
if (cp == NULL || cp > probe)
{