postgres/src/backend/utils/adt/tsvector_parser.c
Tom Lane e3860ffa4d Initial pgindent run with pg_bsd_indent version 2.0.
The new indent version includes numerous fixes thanks to Piotr Stefaniak.
The main changes visible in this commit are:

* Nicer formatting of function-pointer declarations.
* No longer unexpectedly removes spaces in expressions using casts,
  sizeof, or offsetof.
* No longer wants to add a space in "struct structname *varname", as
  well as some similar cases for const- or volatile-qualified pointers.
* Declarations using PG_USED_FOR_ASSERTS_ONLY are formatted more nicely.
* Fixes bug where comments following declarations were sometimes placed
  with no space separating them from the code.
* Fixes some odd decisions for comments following case labels.
* Fixes some cases where comments following code were indented to less
  than the expected column 33.

On the less good side, it now tends to put more whitespace around typedef
names that are not listed in typedefs.list.  This might encourage us to
put more effort into typedef name collection; it's not really a bug in
indent itself.

There are more changes coming after this round, having to do with comment
indentation and alignment of lines appearing within parentheses.  I wanted
to limit the size of the diffs to something that could be reviewed without
one's eyes completely glazing over, so it seemed better to split up the
changes as much as practical.

Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 14:39:04 -04:00

374 lines
9.0 KiB
C

/*-------------------------------------------------------------------------
*
* tsvector_parser.c
* Parser for tsvector
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsvector_parser.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
/*
* Private state of tsvector parser. Note that tsquery also uses this code to
* parse its input, hence the boolean flags. The two flags are both true or
* both false in current usage, but we keep them separate for clarity.
* is_tsquery affects *only* the content of error messages.
*/
struct TSVectorParseStateData
{
char *prsbuf; /* next input character */
char *bufstart; /* whole string (used only for errors) */
char *word; /* buffer to hold the current word */
int len; /* size in bytes allocated for 'word' */
int eml; /* max bytes per character */
bool oprisdelim; /* treat ! | * ( ) as delimiters? */
bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
};
/*
* Initializes parser for the input string. If oprisdelim is set, the
* following characters are treated as delimiters in addition to whitespace:
* ! | & ( )
*/
TSVectorParseState
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
{
TSVectorParseState state;
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
state->prsbuf = input;
state->bufstart = input;
state->len = 32;
state->word = (char *) palloc(state->len);
state->eml = pg_database_encoding_max_length();
state->oprisdelim = oprisdelim;
state->is_tsquery = is_tsquery;
return state;
}
/*
* Reinitializes parser to parse 'input', instead of previous input.
*/
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
state->prsbuf = input;
}
/*
* Shuts down a tsvector parser.
*/
void
close_tsvector_parser(TSVectorParseState state)
{
pfree(state->word);
pfree(state);
}
/* increase the size of 'word' if needed to hold one more character */
#define RESIZEPRSBUF \
do { \
int clen = curpos - state->word; \
if ( clen + state->eml >= state->len ) \
{ \
state->len *= 2; \
state->word = (char *) repalloc(state->word, state->len); \
curpos = state->word + clen; \
} \
} while (0)
/* phrase operator begins with '<' */
#define ISOPERATOR(x) \
( pg_mblen(x) == 1 && ( *(x) == '!' || \
*(x) == '&' || \
*(x) == '|' || \
*(x) == '(' || \
*(x) == ')' || \
*(x) == '<' \
) )
/* Fills gettoken_tsvector's output parameters, and returns true */
#define RETURN_TOKEN \
do { \
if (pos_ptr != NULL) \
{ \
*pos_ptr = pos; \
*poslen = npos; \
} \
else if (pos != NULL) \
pfree(pos); \
\
if (strval != NULL) \
*strval = state->word; \
if (lenval != NULL) \
*lenval = curpos - state->word; \
if (endptr != NULL) \
*endptr = state->prsbuf; \
return true; \
} while(0)
/* State codes used in gettoken_tsvector */
#define WAITWORD 1
#define WAITENDWORD 2
#define WAITNEXTCHAR 3
#define WAITENDCMPLX 4
#define WAITPOSINFO 5
#define INPOSINFO 6
#define WAITPOSDELIM 7
#define WAITCHARCMPLX 8
#define PRSSYNTAXERROR prssyntaxerror(state)
static void
prssyntaxerror(TSVectorParseState state)
{
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
state->is_tsquery ?
errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
}
/*
* Get next token from string being parsed. Returns true if successful,
* false if end of input string is reached. On success, these output
* parameters are filled in:
*
* *strval pointer to token
* *lenval length of *strval
* *pos_ptr pointer to a palloc'd array of positions and weights
* associated with the token. If the caller is not interested
* in the information, NULL can be supplied. Otherwise
* the caller is responsible for pfreeing the array.
* *poslen number of elements in *pos_ptr
* *endptr scan resumption point
*
* Pass NULL for unwanted output parameters.
*/
bool
gettoken_tsvector(TSVectorParseState state,
char **strval, int *lenval,
WordEntryPos **pos_ptr, int *poslen,
char **endptr)
{
int oldstate = 0;
char *curpos = state->word;
int statecode = WAITWORD;
/*
* pos is for collecting the comma delimited list of positions followed by
* the actual token.
*/
WordEntryPos *pos = NULL;
int npos = 0; /* elements of pos used */
int posalen = 0; /* allocated size of pos */
while (1)
{
if (statecode == WAITWORD)
{
if (*(state->prsbuf) == '\0')
return false;
else if (t_iseq(state->prsbuf, '\''))
statecode = WAITENDCMPLX;
else if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
PRSSYNTAXERROR;
else if (!t_isspace(state->prsbuf))
{
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
statecode = WAITENDWORD;
}
}
else if (statecode == WAITNEXTCHAR)
{
if (*(state->prsbuf) == '\0')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("there is no escaped character: \"%s\"",
state->bufstart)));
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
Assert(oldstate != 0);
statecode = oldstate;
}
}
else if (statecode == WAITENDWORD)
{
if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
{
RESIZEPRSBUF;
if (curpos == state->word)
PRSSYNTAXERROR;
*(curpos) = '\0';
RETURN_TOKEN;
}
else if (t_iseq(state->prsbuf, ':'))
{
if (curpos == state->word)
PRSSYNTAXERROR;
*(curpos) = '\0';
if (state->oprisdelim)
RETURN_TOKEN;
else
statecode = INPOSINFO;
}
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
}
}
else if (statecode == WAITENDCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
statecode = WAITCHARCMPLX;
}
else if (t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDCMPLX;
}
else if (*(state->prsbuf) == '\0')
PRSSYNTAXERROR;
else
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
}
}
else if (statecode == WAITCHARCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
curpos += pg_mblen(state->prsbuf);
statecode = WAITENDCMPLX;
}
else
{
RESIZEPRSBUF;
*(curpos) = '\0';
if (curpos == state->word)
PRSSYNTAXERROR;
if (state->oprisdelim)
{
/* state->prsbuf+=pg_mblen(state->prsbuf); */
RETURN_TOKEN;
}
else
statecode = WAITPOSINFO;
continue; /* recheck current character */
}
}
else if (statecode == WAITPOSINFO)
{
if (t_iseq(state->prsbuf, ':'))
statecode = INPOSINFO;
else
RETURN_TOKEN;
}
else if (statecode == INPOSINFO)
{
if (t_isdigit(state->prsbuf))
{
if (posalen == 0)
{
posalen = 4;
pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
npos = 0;
}
else if (npos + 1 >= posalen)
{
posalen *= 2;
pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
}
npos++;
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
/* we cannot get here in tsquery, so no need for 2 errmsgs */
if (WEP_GETPOS(pos[npos - 1]) == 0)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("wrong position info in tsvector: \"%s\"",
state->bufstart)));
WEP_SETWEIGHT(pos[npos - 1], 0);
statecode = WAITPOSDELIM;
}
else
PRSSYNTAXERROR;
}
else if (statecode == WAITPOSDELIM)
{
if (t_iseq(state->prsbuf, ','))
statecode = INPOSINFO;
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 3);
}
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 2);
}
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 1);
}
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 0);
}
else if (t_isspace(state->prsbuf) ||
*(state->prsbuf) == '\0')
RETURN_TOKEN;
else if (!t_isdigit(state->prsbuf))
PRSSYNTAXERROR;
}
else /* internal error */
elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
statecode);
/* get next char */
state->prsbuf += pg_mblen(state->prsbuf);
}
}