The new indent version includes numerous fixes thanks to Piotr Stefaniak. The main changes visible in this commit are: * Nicer formatting of function-pointer declarations. * No longer unexpectedly removes spaces in expressions using casts, sizeof, or offsetof. * No longer wants to add a space in "struct structname *varname", as well as some similar cases for const- or volatile-qualified pointers. * Declarations using PG_USED_FOR_ASSERTS_ONLY are formatted more nicely. * Fixes bug where comments following declarations were sometimes placed with no space separating them from the code. * Fixes some odd decisions for comments following case labels. * Fixes some cases where comments following code were indented to less than the expected column 33. On the less good side, it now tends to put more whitespace around typedef names that are not listed in typedefs.list. This might encourage us to put more effort into typedef name collection; it's not really a bug in indent itself. There are more changes coming after this round, having to do with comment indentation and alignment of lines appearing within parentheses. I wanted to limit the size of the diffs to something that could be reviewed without one's eyes completely glazing over, so it seemed better to split up the changes as much as practical. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
374 lines
9.0 KiB
C
374 lines
9.0 KiB
C
/*-------------------------------------------------------------------------
 *
 * tsvector_parser.c
 *	  Parser for tsvector
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsvector_parser.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"

#include <limits.h>

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
|
|
|
|
|
|
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'word' is a growable scratch buffer owned by the parser; 'len' is its
 * current allocation and is doubled on demand by RESIZEPRSBUF.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! & | ( ) < as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
|
|
|
|
|
|
/*
|
|
* Initializes parser for the input string. If oprisdelim is set, the
|
|
* following characters are treated as delimiters in addition to whitespace:
|
|
* ! | & ( )
|
|
*/
|
|
TSVectorParseState
|
|
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
|
|
{
|
|
TSVectorParseState state;
|
|
|
|
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
|
|
state->prsbuf = input;
|
|
state->bufstart = input;
|
|
state->len = 32;
|
|
state->word = (char *) palloc(state->len);
|
|
state->eml = pg_database_encoding_max_length();
|
|
state->oprisdelim = oprisdelim;
|
|
state->is_tsquery = is_tsquery;
|
|
|
|
return state;
|
|
}
|
|
|
|
/*
|
|
* Reinitializes parser to parse 'input', instead of previous input.
|
|
*/
|
|
void
|
|
reset_tsvector_parser(TSVectorParseState state, char *input)
|
|
{
|
|
state->prsbuf = input;
|
|
}
|
|
|
|
/*
|
|
* Shuts down a tsvector parser.
|
|
*/
|
|
void
|
|
close_tsvector_parser(TSVectorParseState state)
|
|
{
|
|
pfree(state->word);
|
|
pfree(state);
|
|
}
|
|
|
|
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Relies on the caller's locals 'state' and 'curpos'.  The buffer is doubled
 * whenever fewer than eml + 1 bytes remain after curpos, which leaves room
 * for one maximum-length character plus a terminating NUL.  curpos is
 * re-pointed into the (possibly moved) buffer at the same offset.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if (clen + state->eml >= state->len) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)
|
|
|
|
/*
 * True if the character at x is a single-byte tsquery operator character:
 * ! & | ( ) or '<' (the phrase operator begins with '<').
 *
 * Note: x is evaluated more than once, so it must be free of side effects.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && ( *(x) == '!' || \
							*(x) == '&' || \
							*(x) == '|' || \
							*(x) == '(' || \
							*(x) == ')' || \
							*(x) == '<' \
						  ) )
|
|
|
|
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Relies on gettoken_tsvector's parameters and locals (state, curpos, pos,
 * npos, strval, lenval, pos_ptr, poslen, endptr).  Every output pointer is
 * optional: a NULL pointer is simply skipped.  If the caller did not ask for
 * the position array (pos_ptr == NULL), any array collected so far is freed
 * here so it does not leak.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
|
|
|
|
|
|
/*
 * State codes used in gettoken_tsvector.  The numeric values are the same
 * ones the parser has always used; they show up in the "unrecognized state"
 * internal error message.
 */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* just saw a backslash; next char is literal */
	WAITENDCMPLX = 4,			/* inside a quoted word */
	WAITPOSINFO = 5,			/* after a quoted word; ':' starts positions */
	INPOSINFO = 6,				/* expecting first digit of a position */
	WAITPOSDELIM = 7,			/* in a position: digits, weight, ',' or end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
|
|
|
|
#define PRSSYNTAXERROR prssyntaxerror(state)
|
|
|
|
static void
|
|
prssyntaxerror(TSVectorParseState state)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
state->is_tsquery ?
|
|
errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
|
|
errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
|
|
}
|
|
|
|
|
|
/*
|
|
* Get next token from string being parsed. Returns true if successful,
|
|
* false if end of input string is reached. On success, these output
|
|
* parameters are filled in:
|
|
*
|
|
* *strval pointer to token
|
|
* *lenval length of *strval
|
|
* *pos_ptr pointer to a palloc'd array of positions and weights
|
|
* associated with the token. If the caller is not interested
|
|
* in the information, NULL can be supplied. Otherwise
|
|
* the caller is responsible for pfreeing the array.
|
|
* *poslen number of elements in *pos_ptr
|
|
* *endptr scan resumption point
|
|
*
|
|
* Pass NULL for unwanted output parameters.
|
|
*/
|
|
bool
|
|
gettoken_tsvector(TSVectorParseState state,
|
|
char **strval, int *lenval,
|
|
WordEntryPos **pos_ptr, int *poslen,
|
|
char **endptr)
|
|
{
|
|
int oldstate = 0;
|
|
char *curpos = state->word;
|
|
int statecode = WAITWORD;
|
|
|
|
/*
|
|
* pos is for collecting the comma delimited list of positions followed by
|
|
* the actual token.
|
|
*/
|
|
WordEntryPos *pos = NULL;
|
|
int npos = 0; /* elements of pos used */
|
|
int posalen = 0; /* allocated size of pos */
|
|
|
|
while (1)
|
|
{
|
|
if (statecode == WAITWORD)
|
|
{
|
|
if (*(state->prsbuf) == '\0')
|
|
return false;
|
|
else if (t_iseq(state->prsbuf, '\''))
|
|
statecode = WAITENDCMPLX;
|
|
else if (t_iseq(state->prsbuf, '\\'))
|
|
{
|
|
statecode = WAITNEXTCHAR;
|
|
oldstate = WAITENDWORD;
|
|
}
|
|
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
|
|
PRSSYNTAXERROR;
|
|
else if (!t_isspace(state->prsbuf))
|
|
{
|
|
COPYCHAR(curpos, state->prsbuf);
|
|
curpos += pg_mblen(state->prsbuf);
|
|
statecode = WAITENDWORD;
|
|
}
|
|
}
|
|
else if (statecode == WAITNEXTCHAR)
|
|
{
|
|
if (*(state->prsbuf) == '\0')
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("there is no escaped character: \"%s\"",
|
|
state->bufstart)));
|
|
else
|
|
{
|
|
RESIZEPRSBUF;
|
|
COPYCHAR(curpos, state->prsbuf);
|
|
curpos += pg_mblen(state->prsbuf);
|
|
Assert(oldstate != 0);
|
|
statecode = oldstate;
|
|
}
|
|
}
|
|
else if (statecode == WAITENDWORD)
|
|
{
|
|
if (t_iseq(state->prsbuf, '\\'))
|
|
{
|
|
statecode = WAITNEXTCHAR;
|
|
oldstate = WAITENDWORD;
|
|
}
|
|
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
|
|
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
|
|
{
|
|
RESIZEPRSBUF;
|
|
if (curpos == state->word)
|
|
PRSSYNTAXERROR;
|
|
*(curpos) = '\0';
|
|
RETURN_TOKEN;
|
|
}
|
|
else if (t_iseq(state->prsbuf, ':'))
|
|
{
|
|
if (curpos == state->word)
|
|
PRSSYNTAXERROR;
|
|
*(curpos) = '\0';
|
|
if (state->oprisdelim)
|
|
RETURN_TOKEN;
|
|
else
|
|
statecode = INPOSINFO;
|
|
}
|
|
else
|
|
{
|
|
RESIZEPRSBUF;
|
|
COPYCHAR(curpos, state->prsbuf);
|
|
curpos += pg_mblen(state->prsbuf);
|
|
}
|
|
}
|
|
else if (statecode == WAITENDCMPLX)
|
|
{
|
|
if (t_iseq(state->prsbuf, '\''))
|
|
{
|
|
statecode = WAITCHARCMPLX;
|
|
}
|
|
else if (t_iseq(state->prsbuf, '\\'))
|
|
{
|
|
statecode = WAITNEXTCHAR;
|
|
oldstate = WAITENDCMPLX;
|
|
}
|
|
else if (*(state->prsbuf) == '\0')
|
|
PRSSYNTAXERROR;
|
|
else
|
|
{
|
|
RESIZEPRSBUF;
|
|
COPYCHAR(curpos, state->prsbuf);
|
|
curpos += pg_mblen(state->prsbuf);
|
|
}
|
|
}
|
|
else if (statecode == WAITCHARCMPLX)
|
|
{
|
|
if (t_iseq(state->prsbuf, '\''))
|
|
{
|
|
RESIZEPRSBUF;
|
|
COPYCHAR(curpos, state->prsbuf);
|
|
curpos += pg_mblen(state->prsbuf);
|
|
statecode = WAITENDCMPLX;
|
|
}
|
|
else
|
|
{
|
|
RESIZEPRSBUF;
|
|
*(curpos) = '\0';
|
|
if (curpos == state->word)
|
|
PRSSYNTAXERROR;
|
|
if (state->oprisdelim)
|
|
{
|
|
/* state->prsbuf+=pg_mblen(state->prsbuf); */
|
|
RETURN_TOKEN;
|
|
}
|
|
else
|
|
statecode = WAITPOSINFO;
|
|
continue; /* recheck current character */
|
|
}
|
|
}
|
|
else if (statecode == WAITPOSINFO)
|
|
{
|
|
if (t_iseq(state->prsbuf, ':'))
|
|
statecode = INPOSINFO;
|
|
else
|
|
RETURN_TOKEN;
|
|
}
|
|
else if (statecode == INPOSINFO)
|
|
{
|
|
if (t_isdigit(state->prsbuf))
|
|
{
|
|
if (posalen == 0)
|
|
{
|
|
posalen = 4;
|
|
pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
|
|
npos = 0;
|
|
}
|
|
else if (npos + 1 >= posalen)
|
|
{
|
|
posalen *= 2;
|
|
pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
|
|
}
|
|
npos++;
|
|
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
|
|
/* we cannot get here in tsquery, so no need for 2 errmsgs */
|
|
if (WEP_GETPOS(pos[npos - 1]) == 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("wrong position info in tsvector: \"%s\"",
|
|
state->bufstart)));
|
|
WEP_SETWEIGHT(pos[npos - 1], 0);
|
|
statecode = WAITPOSDELIM;
|
|
}
|
|
else
|
|
PRSSYNTAXERROR;
|
|
}
|
|
else if (statecode == WAITPOSDELIM)
|
|
{
|
|
if (t_iseq(state->prsbuf, ','))
|
|
statecode = INPOSINFO;
|
|
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
|
|
{
|
|
if (WEP_GETWEIGHT(pos[npos - 1]))
|
|
PRSSYNTAXERROR;
|
|
WEP_SETWEIGHT(pos[npos - 1], 3);
|
|
}
|
|
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
|
|
{
|
|
if (WEP_GETWEIGHT(pos[npos - 1]))
|
|
PRSSYNTAXERROR;
|
|
WEP_SETWEIGHT(pos[npos - 1], 2);
|
|
}
|
|
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
|
|
{
|
|
if (WEP_GETWEIGHT(pos[npos - 1]))
|
|
PRSSYNTAXERROR;
|
|
WEP_SETWEIGHT(pos[npos - 1], 1);
|
|
}
|
|
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
|
|
{
|
|
if (WEP_GETWEIGHT(pos[npos - 1]))
|
|
PRSSYNTAXERROR;
|
|
WEP_SETWEIGHT(pos[npos - 1], 0);
|
|
}
|
|
else if (t_isspace(state->prsbuf) ||
|
|
*(state->prsbuf) == '\0')
|
|
RETURN_TOKEN;
|
|
else if (!t_isdigit(state->prsbuf))
|
|
PRSSYNTAXERROR;
|
|
}
|
|
else /* internal error */
|
|
elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
|
|
statecode);
|
|
|
|
/* get next char */
|
|
state->prsbuf += pg_mblen(state->prsbuf);
|
|
}
|
|
}
|