2012-07-10 14:54:37 -04:00
|
|
|
/*-------------------------------------------------------------------------
 *
 * regprefix.c
 *	  Extract a common prefix, if any, from a compiled regex.
 *
 *
 * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1998, 1999 Henry Spencer
 *
 * IDENTIFICATION
 *	  src/backend/regex/regprefix.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
|
|
|
|
#include "regex/regguts.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* forward declarations
|
|
|
|
*/
|
2019-05-22 13:04:48 -04:00
|
|
|
static int findprefix(struct cnfa *cnfa, struct colormap *cm,
|
|
|
|
chr *string, size_t *slength);
|
2012-07-10 14:54:37 -04:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pg_regprefix - get common prefix for regular expression
|
|
|
|
*
|
|
|
|
* Returns one of:
|
|
|
|
* REG_NOMATCH: there is no common prefix of strings matching the regex
|
|
|
|
* REG_PREFIX: there is a common prefix of strings matching the regex
|
|
|
|
* REG_EXACT: all strings satisfying the regex must match the same string
|
|
|
|
* or a REG_XXX error code
|
|
|
|
*
|
|
|
|
* In the non-failure cases, *string is set to a malloc'd string containing
|
|
|
|
* the common prefix or exact value, of length *slength (measured in chrs
|
|
|
|
* not bytes!).
|
|
|
|
*
|
Implement lookbehind constraints in our regular-expression engine.
A lookbehind constraint is like a lookahead constraint in that it consumes
no text; but it checks for existence (or nonexistence) of a match *ending*
at the current point in the string, rather than one *starting* at the
current point. This is a long-requested feature since it exists in many
other regex libraries, but Henry Spencer had never got around to
implementing it in the code we use.
Just making it work is actually pretty trivial; but naive copying of the
logic for lookahead constraints leads to code that often spends O(N^2) time
to scan an N-character string, because we have to run the match engine
from string start to the current probe point each time the constraint is
checked. In typical use-cases a lookbehind constraint will be written at
the start of the regex and hence will need to be checked at every character
--- so O(N^2) work overall. To fix that, I introduced a third copy of the
core DFA matching loop, paralleling the existing longest() and shortest()
loops. This version, matchuntil(), can suspend and resume matching given
a couple of pointers' worth of storage space. So we need only run it
across the string once, stopping at each interesting probe point and then
resuming to advance to the next one.
I also put in an optimization that simplifies one-character lookahead and
lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND
constraints, which already existed in the engine. This avoids the overhead
of the LACON machinery entirely for these rather common cases.
The net result is that lookbehind constraints run a factor of three or so
slower than Perl's for multi-character constraints, but faster than Perl's
for one-character constraints ... and they work fine for variable-length
constraints, which Perl gives up on entirely. So that's not bad from a
competitive perspective, and there's room for further optimization if
anyone cares. (In reality, raw scan rate across a large input string is
probably not that big a deal for Postgres usage anyway; so I'm happy if
it's linear.)
2015-10-30 19:14:19 -04:00
|
|
|
* This function does not analyze all complex cases (such as lookaround
|
2012-07-10 14:54:37 -04:00
|
|
|
* constraints) exactly. Therefore it is possible that some strings matching
|
2014-05-06 12:12:18 -04:00
|
|
|
* the reported prefix or exact-match string do not satisfy the regex. But
|
2012-07-10 14:54:37 -04:00
|
|
|
* it should never be the case that a string satisfying the regex does not
|
|
|
|
* match the reported prefix or exact-match string.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_regprefix(regex_t *re,
|
|
|
|
chr **string,
|
|
|
|
size_t *slength)
|
|
|
|
{
|
|
|
|
struct guts *g;
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
int st;
|
|
|
|
|
|
|
|
/* sanity checks */
|
|
|
|
if (string == NULL || slength == NULL)
|
|
|
|
return REG_INVARG;
|
|
|
|
*string = NULL; /* initialize for failure cases */
|
|
|
|
*slength = 0;
|
|
|
|
if (re == NULL || re->re_magic != REMAGIC)
|
|
|
|
return REG_INVARG;
|
|
|
|
if (re->re_csize != sizeof(chr))
|
|
|
|
return REG_MIXED;
|
|
|
|
|
|
|
|
/* Initialize locale-dependent support */
|
|
|
|
pg_set_regex_collation(re->re_collation);
|
|
|
|
|
|
|
|
/* setup */
|
|
|
|
g = (struct guts *) re->re_guts;
|
|
|
|
if (g->info & REG_UIMPOSSIBLE)
|
|
|
|
return REG_NOMATCH;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This implementation considers only the search NFA for the topmost regex
|
|
|
|
* tree node. Therefore, constraints such as backrefs are not fully
|
|
|
|
* applied, which is allowed per the function's API spec.
|
|
|
|
*/
|
|
|
|
assert(g->tree != NULL);
|
|
|
|
cnfa = &g->tree->cnfa;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since a correct NFA should never contain any exit-free loops, it should
|
2013-05-29 16:58:43 -04:00
|
|
|
* not be possible for our traversal to return to a previously visited NFA
|
|
|
|
* state. Hence we need at most nstates chrs in the output string.
|
2012-07-10 14:54:37 -04:00
|
|
|
*/
|
|
|
|
*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
|
|
|
|
if (*string == NULL)
|
|
|
|
return REG_ESPACE;
|
|
|
|
|
|
|
|
/* do it */
|
|
|
|
st = findprefix(cnfa, &g->cmap, *string, slength);
|
|
|
|
|
|
|
|
assert(*slength <= cnfa->nstates);
|
|
|
|
|
|
|
|
/* clean up */
|
|
|
|
if (st != REG_PREFIX && st != REG_EXACT)
|
|
|
|
{
|
|
|
|
FREE(*string);
|
|
|
|
*string = NULL;
|
|
|
|
*slength = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return st;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* findprefix - extract common prefix from cNFA
|
|
|
|
*
|
|
|
|
* Results are returned into the preallocated chr array string[], with
|
|
|
|
* *slength (which must be preset to zero) incremented for each chr.
|
|
|
|
*/
|
|
|
|
static int /* regprefix return code */
|
2017-06-21 14:39:04 -04:00
|
|
|
findprefix(struct cnfa *cnfa,
|
|
|
|
struct colormap *cm,
|
2012-07-10 14:54:37 -04:00
|
|
|
chr *string,
|
|
|
|
size_t *slength)
|
|
|
|
{
|
|
|
|
int st;
|
|
|
|
int nextst;
|
|
|
|
color thiscolor;
|
|
|
|
chr c;
|
|
|
|
struct carc *ca;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
|
2013-05-29 16:58:43 -04:00
|
|
|
* anchored left. If we have both BOS and BOL, they must go to the same
|
|
|
|
* next state.
|
2012-07-10 14:54:37 -04:00
|
|
|
*/
|
|
|
|
st = cnfa->pre;
|
|
|
|
nextst = -1;
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
|
|
{
|
|
|
|
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
|
|
|
|
{
|
|
|
|
if (nextst == -1)
|
|
|
|
nextst = ca->to;
|
|
|
|
else if (nextst != ca->to)
|
|
|
|
return REG_NOMATCH;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return REG_NOMATCH;
|
|
|
|
}
|
|
|
|
if (nextst == -1)
|
|
|
|
return REG_NOMATCH;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan through successive states, stopping as soon as we find one with
|
|
|
|
* more than one acceptable transition character (either multiple colors
|
|
|
|
* on out-arcs, or a color with more than one member chr).
|
|
|
|
*
|
|
|
|
* We could find a state with multiple out-arcs that are all labeled with
|
|
|
|
* the same singleton color; this comes from patterns like "^ab(cde|cxy)".
|
|
|
|
* In that case we add the chr "c" to the output string but then exit the
|
2014-05-06 12:12:18 -04:00
|
|
|
* loop with nextst == -1. This leaves a little bit on the table: if the
|
2012-07-10 14:54:37 -04:00
|
|
|
* pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
|
|
|
|
* to the prefix. But chasing multiple parallel state chains doesn't seem
|
|
|
|
* worth the trouble.
|
|
|
|
*/
|
|
|
|
do
|
|
|
|
{
|
|
|
|
st = nextst;
|
|
|
|
nextst = -1;
|
|
|
|
thiscolor = COLORLESS;
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
|
|
{
|
2015-10-19 13:54:53 -07:00
|
|
|
/* We can ignore BOS/BOL arcs */
|
2012-07-10 14:54:37 -04:00
|
|
|
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
|
|
|
|
continue;
|
2021-02-20 18:11:56 -05:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
|
|
|
|
* and LACONs
|
|
|
|
*/
|
2015-10-19 13:54:53 -07:00
|
|
|
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
|
2021-02-20 18:11:56 -05:00
|
|
|
ca->co == RAINBOW || ca->co >= cnfa->ncolors)
|
2012-07-10 14:54:37 -04:00
|
|
|
{
|
|
|
|
thiscolor = COLORLESS;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (thiscolor == COLORLESS)
|
|
|
|
{
|
|
|
|
/* First plain outarc */
|
|
|
|
thiscolor = ca->co;
|
|
|
|
nextst = ca->to;
|
|
|
|
}
|
|
|
|
else if (thiscolor == ca->co)
|
|
|
|
{
|
|
|
|
/* Another plain outarc for same color */
|
|
|
|
nextst = -1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* More than one plain outarc color terminates the search */
|
|
|
|
thiscolor = COLORLESS;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Done if we didn't find exactly one color on plain outarcs */
|
|
|
|
if (thiscolor == COLORLESS)
|
|
|
|
break;
|
|
|
|
/* The color must be a singleton */
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 17:06:29 -04:00
|
|
|
if (cm->cd[thiscolor].nschrs != 1)
|
|
|
|
break;
|
|
|
|
/* Must not have any high-color-map entries */
|
|
|
|
if (cm->cd[thiscolor].nuchrs != 0)
|
2012-07-10 14:54:37 -04:00
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Identify the color's sole member chr and add it to the prefix
|
2014-05-06 12:12:18 -04:00
|
|
|
* string. In general the colormap data structure doesn't provide a
|
2012-07-10 14:54:37 -04:00
|
|
|
* way to find color member chrs, except by trying GETCOLOR() on each
|
|
|
|
* possible chr value, which won't do at all. However, for the cases
|
|
|
|
* we care about it should be sufficient to test the "firstchr" value,
|
|
|
|
* that is the first chr ever added to the color. There are cases
|
|
|
|
* where this might no longer be a member of the color (so we do need
|
|
|
|
* to test), but none of them are likely to arise for a character that
|
2014-05-06 12:12:18 -04:00
|
|
|
* is a member of a common prefix. If we do hit such a corner case,
|
2012-07-10 14:54:37 -04:00
|
|
|
* we just fall out without adding anything to the prefix string.
|
|
|
|
*/
|
|
|
|
c = cm->cd[thiscolor].firstchr;
|
|
|
|
if (GETCOLOR(cm, c) != thiscolor)
|
|
|
|
break;
|
|
|
|
|
|
|
|
string[(*slength)++] = c;
|
|
|
|
|
|
|
|
/* Advance to next state, but only if we have a unique next state */
|
|
|
|
} while (nextst != -1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we ended at a state that only has EOS/EOL outarcs leading to the
|
|
|
|
* "post" state, then we have an exact-match string. Note this is true
|
|
|
|
* even if the string is of zero length.
|
|
|
|
*/
|
|
|
|
nextst = -1;
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
|
|
{
|
|
|
|
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
|
|
|
|
{
|
|
|
|
if (nextst == -1)
|
|
|
|
nextst = ca->to;
|
|
|
|
else if (nextst != ca->to)
|
|
|
|
{
|
|
|
|
nextst = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
nextst = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (nextst == cnfa->post)
|
|
|
|
return REG_EXACT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, if we were unable to identify any prefix characters, say
|
|
|
|
* NOMATCH --- the pattern is anchored left, but doesn't specify any
|
|
|
|
* particular first character.
|
|
|
|
*/
|
|
|
|
if (*slength > 0)
|
|
|
|
return REG_PREFIX;
|
|
|
|
|
|
|
|
return REG_NOMATCH;
|
|
|
|
}
|