Rewrite LIKE's %-followed-by-_ optimization so it really works (this time

for sure ;-)).  It now also optimizes more cases, such as %_%_.  Improve
comments too.  Per bug #5478.

In passing, also rename the TCHAR macro to GETCHAR, because pgindent is
messing with the formatting of the former (apparently it now thinks TCHAR
is a typedef name).

Back-patch to 8.3, where the bug was introduced.
This commit is contained in:
Tom Lane 2010-05-28 17:35:36 +00:00
parent caea7b7aca
commit 63c0780dba
3 changed files with 84 additions and 78 deletions

View File

@ -1,25 +1,25 @@
/*-------------------------------------------------------------------------
*
* like_match.c
* like expression handling internal code.
* LIKE pattern matching internal code.
*
* This file is included by like.c four times, to provide natching code for
* single-byte encodings, UTF8, and for other multi-byte encodings,
* and case insensitive matches for single byte encodings.
* UTF8 is a special case because we can use a much more efficient version
* of NextChar than can be used for other multi-byte encodings.
* This file is included by like.c four times, to provide matching code for
* (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
* and (4) case insensitive matches in single byte encodings.
* (UTF8 is a special case because we can use a much more efficient version
* of NextChar than can be used for general multi-byte encodings.)
*
* Before the inclusion, we need to define the following macros:
*
* NextChar
* MatchText - to name of function wanted
* do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
* MATCH_LOWER - define iff using to_lower on text chars
* MATCH_LOWER - define for case (4), using to_lower on single-byte chars
*
* Copyright (c) 1996-2008, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.3 2009/05/24 18:10:47 tgl Exp $
* $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.4 2010/05/28 17:35:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -70,9 +70,9 @@
*/
#ifdef MATCH_LOWER
#define TCHAR(t) ((char) tolower((unsigned char) (t)))
#define GETCHAR(t) ((char) tolower((unsigned char) (t)))
#else
#define TCHAR(t) (t)
#define GETCHAR(t) (t)
#endif
static int
@ -94,87 +94,82 @@ MatchText(char *t, int tlen, char *p, int plen)
{
if (*p == '\\')
{
/* Next byte must match literally, whatever it is */
/* Next pattern byte must match literally, whatever it is */
NextByte(p, plen);
if ((plen <= 0) || TCHAR(*p) != TCHAR(*t))
if (plen <= 0 || GETCHAR(*p) != GETCHAR(*t))
return LIKE_FALSE;
}
else if (*p == '%')
{
/*
* % processing is essentially a search for a match for what
* follows the %, plus a recursive match of the remainder. We
* succeed if and only if both conditions are met.
*/
char firstpat;
/* %% is the same as % according to the SQL standard */
/* Advance past all %'s */
while (plen > 0 && *p == '%')
NextByte(p, plen);
/* Trailing percent matches everything. */
/*
* % processing is essentially a search for a text position at
* which the remainder of the text matches the remainder of the
* pattern, using a recursive call to check each potential match.
*
* If there are wildcards immediately following the %, we can skip
* over them first, using the idea that any sequence of N _'s and
* one or more %'s is equivalent to N _'s and one % (ie, it will
* match any sequence of at least N text characters). In this
* way we will always run the recursive search loop using a
* pattern fragment that begins with a literal character-to-match,
* thereby not recursing more than we have to.
*/
NextByte(p, plen);
while (plen > 0)
{
if (*p == '%')
NextByte(p, plen);
else if (*p == '_')
{
/* If not enough text left to match the pattern, ABORT */
if (tlen <= 0)
return LIKE_ABORT;
NextChar(t, tlen);
NextByte(p, plen);
}
else
break; /* Reached a non-wildcard pattern char */
}
/*
* If we're at end of pattern, match: we have a trailing % which
* matches any remaining text string.
*/
if (plen <= 0)
return LIKE_TRUE;
/*
* Otherwise, scan for a text position at which we can match the
* rest of the pattern.
* rest of the pattern. The first remaining pattern char is known
* to be a regular or escaped literal character, so we can compare
* the first pattern byte to each text byte to avoid recursing
* more than we have to. This fact also guarantees that we don't
* have to consider a match to the zero-length substring at the
* end of the text.
*/
if (*p == '_')
if (*p == '\\')
{
/* %_ is the same as _% - avoid matching _ repeatedly */
if (plen < 2)
return LIKE_FALSE; /* XXX should throw error */
firstpat = GETCHAR(p[1]);
}
else
firstpat = GETCHAR(*p);
do
{
NextChar(t, tlen);
NextByte(p, plen);
} while (tlen > 0 && plen > 0 && *p == '_');
/*
* If we are at the end of the pattern, succeed: % followed
* by n _'s matches any string of at least n characters, and
* we have now found there are at least n characters.
*/
if (plen <= 0)
return LIKE_TRUE;
/* Look for a place that matches the rest of the pattern */
while (tlen > 0)
while (tlen > 0)
{
if (GETCHAR(*t) == firstpat)
{
int matched = MatchText(t, tlen, p, plen);
if (matched != LIKE_FALSE)
return matched; /* TRUE or ABORT */
NextChar(t, tlen);
}
}
else
{
char firstpat = TCHAR(*p);
if (*p == '\\')
{
if (plen < 2)
return LIKE_FALSE;
firstpat = TCHAR(p[1]);
return matched; /* TRUE or ABORT */
}
while (tlen > 0)
{
/*
* Optimization to prevent most recursion: don't recurse
* unless first pattern byte matches first text byte.
*/
if (TCHAR(*t) == firstpat)
{
int matched = MatchText(t, tlen, p, plen);
if (matched != LIKE_FALSE)
return matched; /* TRUE or ABORT */
}
NextChar(t, tlen);
}
NextChar(t, tlen);
}
/*
@ -190,7 +185,7 @@ MatchText(char *t, int tlen, char *p, int plen)
NextByte(p, plen);
continue;
}
else if (TCHAR(*p) != TCHAR(*t))
else if (GETCHAR(*p) != GETCHAR(*t))
{
/* non-wildcard pattern char fails to match text char */
return LIKE_FALSE;
@ -215,10 +210,12 @@ MatchText(char *t, int tlen, char *p, int plen)
if (tlen > 0)
return LIKE_FALSE; /* end of pattern, but not of text */
/* End of text string. Do we have matching pattern remaining? */
while (plen > 0 && *p == '%') /* allow multiple %'s at end of pattern */
/*
* End of text, but perhaps not of pattern. Match iff the remaining
* pattern can match a zero-length string, ie, it's zero or more %'s.
*/
while (plen > 0 && *p == '%')
NextByte(p, plen);
if (plen <= 0)
return LIKE_TRUE;
@ -342,8 +339,9 @@ do_like_escape(text *pat, text *esc)
#undef do_like_escape
#endif
#undef TCHAR
#undef GETCHAR
#ifdef MATCH_LOWER
#undef MATCH_LOWER
#endif

View File

@ -827,7 +827,7 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
(1 row)
--
-- test %/_ combination cases, cf bug #4821
-- test %/_ combination cases, cf bugs #4821 and #5478
--
SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
t | t | f
@ -853,6 +853,12 @@ SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
t | t | f
(1 row)
SELECT 'jack' LIKE '%____%' AS t;
t
---
t
(1 row)
--
-- test implicit type conversion
--

View File

@ -253,7 +253,7 @@ SELECT 'Hawkeye' ILIKE 'h%' AS "true";
SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
--
-- test %/_ combination cases, cf bug #4821
-- test %/_ combination cases, cf bugs #4821 and #5478
--
SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
@ -262,6 +262,8 @@ SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
SELECT 'jack' LIKE '%____%' AS t;
--
-- test implicit type conversion