postgres/src/backend/regex/regprefix.c

/*-------------------------------------------------------------------------
 *
 * regprefix.c
 *	  Extract a common prefix, if any, from a compiled regex.
 *
 *
 * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1998, 1999 Henry Spencer
 *
 * IDENTIFICATION
 *	  src/backend/regex/regprefix.c
 *
 *-------------------------------------------------------------------------
 */

#include "regex/regguts.h"


/*
 * forward declarations
 */
static int	findprefix(struct cnfa *cnfa, struct colormap *cm,
					   chr *string, size_t *slength);


/*
 * pg_regprefix - get common prefix for regular expression
 *
 * Returns one of:
 *	REG_NOMATCH: there is no common prefix of strings matching the regex
 *	REG_PREFIX: there is a common prefix of strings matching the regex
 *	REG_EXACT: all strings satisfying the regex must match the same string
 *	or a REG_XXX error code
 *
 * In the non-failure cases, *string is set to a malloc'd string containing
 * the common prefix or exact value, of length *slength (measured in chrs
 * not bytes!).
 *
 * This function does not analyze all complex cases (such as lookaround
 * constraints) exactly.  Therefore it is possible that some strings matching
 * the reported prefix or exact-match string do not satisfy the regex.  But
 * it should never be the case that a string satisfying the regex does not
 * match the reported prefix or exact-match string.
 */
int
pg_regprefix(regex_t *re,
			 chr **string,
			 size_t *slength)
{
	struct guts *g;
	struct cnfa *cnfa;
	int			st;

	/* sanity checks */
	if (string == NULL || slength == NULL)
		return REG_INVARG;
	*string = NULL;				/* initialize for failure cases */
	*slength = 0;
	if (re == NULL || re->re_magic != REMAGIC)
		return REG_INVARG;
	if (re->re_csize != sizeof(chr))
		return REG_MIXED;

	/* Initialize locale-dependent support */
	pg_set_regex_collation(re->re_collation);

	/* setup */
	g = (struct guts *) re->re_guts;
	if (g->info & REG_UIMPOSSIBLE)
		return REG_NOMATCH;

	/*
	 * This implementation considers only the search NFA for the topmost regex
	 * tree node.  Therefore, constraints such as backrefs are not fully
	 * applied, which is allowed per the function's API spec.
	 */
	assert(g->tree != NULL);
	cnfa = &g->tree->cnfa;

	/*
	 * Since a correct NFA should never contain any exit-free loops, it should
	 * not be possible for our traversal to return to a previously visited NFA
	 * state.  Hence we need at most nstates chrs in the output string.
	 */
	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
	if (*string == NULL)
		return REG_ESPACE;

	/* do it */
	st = findprefix(cnfa, &g->cmap, *string, slength);

	assert(*slength <= cnfa->nstates);

	/* clean up */
	if (st != REG_PREFIX && st != REG_EXACT)
	{
		FREE(*string);
		*string = NULL;
		*slength = 0;
	}

	return st;
}

/*
 * findprefix - extract common prefix from cNFA
 *
 * Results are returned into the preallocated chr array string[], with
 * *slength (which must be preset to zero) incremented for each chr.
 */
static int						/* regprefix return code */
findprefix(struct cnfa *cnfa,
		   struct colormap *cm,
		   chr *string,
		   size_t *slength)
{
	int			st;
	int			nextst;
	color		thiscolor;
	chr			c;
	struct carc *ca;

	/*
	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
	 * anchored left.  If we have both BOS and BOL, they must go to the same
	 * next state.
	 */
	st = cnfa->pre;
	nextst = -1;
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
	{
		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
		{
			if (nextst == -1)
				nextst = ca->to;
			else if (nextst != ca->to)
				return REG_NOMATCH;
		}
		else
			return REG_NOMATCH;
	}
	if (nextst == -1)
		return REG_NOMATCH;

	/*
	 * Scan through successive states, stopping as soon as we find one with
	 * more than one acceptable transition character (either multiple colors
	 * on out-arcs, or a color with more than one member chr).
	 *
	 * We could find a state with multiple out-arcs that are all labeled with
	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
	 * In that case we add the chr "c" to the output string but then exit the
	 * loop with nextst == -1.  This leaves a little bit on the table: if the
	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
	 * worth the trouble.
	 */
	do
	{
		st = nextst;
		nextst = -1;
		thiscolor = COLORLESS;
		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
		{
			/* We can ignore BOS/BOL arcs */
			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
				continue;

			/*
			 * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
			 * and LACONs
			 */
			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
				ca->co == RAINBOW || ca->co >= cnfa->ncolors)
			{
				thiscolor = COLORLESS;
				break;
			}
			if (thiscolor == COLORLESS)
			{
				/* First plain outarc */
				thiscolor = ca->co;
				nextst = ca->to;
			}
			else if (thiscolor == ca->co)
			{
				/* Another plain outarc for same color */
				nextst = -1;
			}
			else
			{
				/* More than one plain outarc color terminates the search */
				thiscolor = COLORLESS;
				break;
			}
		}
		/* Done if we didn't find exactly one color on plain outarcs */
		if (thiscolor == COLORLESS)
			break;
		/* The color must be a singleton */
		if (cm->cd[thiscolor].nschrs != 1)
			break;
		/* Must not have any high-color-map entries */
		if (cm->cd[thiscolor].nuchrs != 0)
			break;

		/*
		 * Identify the color's sole member chr and add it to the prefix
		 * string.  In general the colormap data structure doesn't provide a
		 * way to find color member chrs, except by trying GETCOLOR() on each
		 * possible chr value, which won't do at all.  However, for the cases
		 * we care about it should be sufficient to test the "firstchr" value,
		 * that is the first chr ever added to the color.  There are cases
		 * where this might no longer be a member of the color (so we do need
		 * to test), but none of them are likely to arise for a character that
		 * is a member of a common prefix.  If we do hit such a corner case,
		 * we just fall out without adding anything to the prefix string.
		 */
		c = cm->cd[thiscolor].firstchr;
		if (GETCOLOR(cm, c) != thiscolor)
			break;

		string[(*slength)++] = c;

		/* Advance to next state, but only if we have a unique next state */
	} while (nextst != -1);

	/*
	 * If we ended at a state that only has EOS/EOL outarcs leading to the
	 * "post" state, then we have an exact-match string.  Note this is true
	 * even if the string is of zero length.
	 */
	nextst = -1;
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
	{
		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
		{
			if (nextst == -1)
				nextst = ca->to;
			else if (nextst != ca->to)
			{
				nextst = -1;
				break;
			}
		}
		else
		{
			nextst = -1;
			break;
		}
	}
	if (nextst == cnfa->post)
		return REG_EXACT;

	/*
	 * Otherwise, if we were unable to identify any prefix characters, say
	 * NOMATCH --- the pattern is anchored left, but doesn't specify any
	 * particular first character.
	 */
	if (*slength > 0)
		return REG_PREFIX;

	return REG_NOMATCH;
}