Properly access a buffer's LSN using existing access macros instead of abusing
[PostgreSQL.git] / src / backend / utils / adt / tsvector_parser.c
blob7d466670ae65a7f282f668574796fd7cca2d1bad
1 /*-------------------------------------------------------------------------
3 * tsvector_parser.c
4 * Parser for tsvector
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
9 * IDENTIFICATION
10 * $PostgreSQL$
12 *-------------------------------------------------------------------------
15 #include "postgres.h"
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/memutils.h"
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! | & ( ) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
43 * Initializes parser for the input string. If oprisdelim is set, the
44 * following characters are treated as delimiters in addition to whitespace:
45 * ! | & ( )
47 TSVectorParseState
48 init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
50 TSVectorParseState state;
52 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
53 state->prsbuf = input;
54 state->bufstart = input;
55 state->len = 32;
56 state->word = (char *) palloc(state->len);
57 state->eml = pg_database_encoding_max_length();
58 state->oprisdelim = oprisdelim;
59 state->is_tsquery = is_tsquery;
61 return state;
65 * Reinitializes parser to parse 'input', instead of previous input.
67 void
68 reset_tsvector_parser(TSVectorParseState state, char *input)
70 state->prsbuf = input;
74 * Shuts down a tsvector parser.
76 void
77 close_tsvector_parser(TSVectorParseState state)
79 pfree(state->word);
80 pfree(state);
/*
 * Make sure 'word' can take one more character: when the bytes already
 * written plus the maximum character width reach the allocated size, the
 * buffer is doubled and curpos is re-based onto the (possibly moved) buffer.
 *
 * Expands in place; 'state' and 'curpos' must be in scope at the use site.
 */
#define RESIZEPRSBUF \
do { \
	int			nfilled = curpos - state->word; \
	\
	if (nfilled + state->eml >= state->len) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + nfilled; \
	} \
} while (0)
/*
 * True when x points at a one-byte character that is one of  ! & | ( ).
 * x is evaluated more than once, so it must be free of side effects.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && \
	  ( *(x) == '!' || \
		*(x) == '&' || \
		*(x) == '|' || \
		*(x) == '(' || \
		*(x) == ')' ) )
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expands in place inside gettoken_tsvector and relies on its parameters
 * (state, strval, lenval, pos_ptr, poslen, endptr) and its locals (pos,
 * npos, curpos).
 *
 * Every output pointer is optional.  If the caller does not want the
 * position array (pos_ptr == NULL), any array collected for this token is
 * freed here; otherwise ownership of 'pos' passes to the caller.  *poslen
 * is written only alongside *pos_ptr, and only if poslen itself is
 * non-NULL.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* looking for the first character of a token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: take next char literally */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* quoted word done: is there a ':' ? */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position: ',', weight, or end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
128 #define PRSSYNTAXERROR prssyntaxerror(state)
130 static void
131 prssyntaxerror(TSVectorParseState state)
133 ereport(ERROR,
134 (errcode(ERRCODE_SYNTAX_ERROR),
135 state->is_tsquery ?
136 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
137 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
142 * Get next token from string being parsed. Returns true if successful,
143 * false if end of input string is reached. On success, these output
144 * parameters are filled in:
146 * *strval pointer to token
147 * *lenval length of *strval
148 * *pos_ptr pointer to a palloc'd array of positions and weights
149 * associated with the token. If the caller is not interested
150 * in the information, NULL can be supplied. Otherwise
151 * the caller is responsible for pfreeing the array.
152 * *poslen number of elements in *pos_ptr
153 * *endptr scan resumption point
155 * Pass NULL for unwanted output parameters.
157 bool
158 gettoken_tsvector(TSVectorParseState state,
159 char **strval, int *lenval,
160 WordEntryPos **pos_ptr, int *poslen,
161 char **endptr)
163 int oldstate = 0;
164 char *curpos = state->word;
165 int statecode = WAITWORD;
168 * pos is for collecting the comma delimited list of positions followed by
169 * the actual token.
171 WordEntryPos *pos = NULL;
172 int npos = 0; /* elements of pos used */
173 int posalen = 0; /* allocated size of pos */
175 while (1)
177 if (statecode == WAITWORD)
179 if (*(state->prsbuf) == '\0')
180 return false;
181 else if (t_iseq(state->prsbuf, '\''))
182 statecode = WAITENDCMPLX;
183 else if (t_iseq(state->prsbuf, '\\'))
185 statecode = WAITNEXTCHAR;
186 oldstate = WAITENDWORD;
188 else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
189 PRSSYNTAXERROR;
190 else if (!t_isspace(state->prsbuf))
192 COPYCHAR(curpos, state->prsbuf);
193 curpos += pg_mblen(state->prsbuf);
194 statecode = WAITENDWORD;
197 else if (statecode == WAITNEXTCHAR)
199 if (*(state->prsbuf) == '\0')
200 ereport(ERROR,
201 (errcode(ERRCODE_SYNTAX_ERROR),
202 errmsg("there is no escaped character: \"%s\"",
203 state->bufstart)));
204 else
206 RESIZEPRSBUF;
207 COPYCHAR(curpos, state->prsbuf);
208 curpos += pg_mblen(state->prsbuf);
209 Assert(oldstate != 0);
210 statecode = oldstate;
213 else if (statecode == WAITENDWORD)
215 if (t_iseq(state->prsbuf, '\\'))
217 statecode = WAITNEXTCHAR;
218 oldstate = WAITENDWORD;
220 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
221 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
223 RESIZEPRSBUF;
224 if (curpos == state->word)
225 PRSSYNTAXERROR;
226 *(curpos) = '\0';
227 RETURN_TOKEN;
229 else if (t_iseq(state->prsbuf, ':'))
231 if (curpos == state->word)
232 PRSSYNTAXERROR;
233 *(curpos) = '\0';
234 if (state->oprisdelim)
235 RETURN_TOKEN;
236 else
237 statecode = INPOSINFO;
239 else
241 RESIZEPRSBUF;
242 COPYCHAR(curpos, state->prsbuf);
243 curpos += pg_mblen(state->prsbuf);
246 else if (statecode == WAITENDCMPLX)
248 if (t_iseq(state->prsbuf, '\''))
250 statecode = WAITCHARCMPLX;
252 else if (t_iseq(state->prsbuf, '\\'))
254 statecode = WAITNEXTCHAR;
255 oldstate = WAITENDCMPLX;
257 else if (*(state->prsbuf) == '\0')
258 PRSSYNTAXERROR;
259 else
261 RESIZEPRSBUF;
262 COPYCHAR(curpos, state->prsbuf);
263 curpos += pg_mblen(state->prsbuf);
266 else if (statecode == WAITCHARCMPLX)
268 if (t_iseq(state->prsbuf, '\''))
270 RESIZEPRSBUF;
271 COPYCHAR(curpos, state->prsbuf);
272 curpos += pg_mblen(state->prsbuf);
273 statecode = WAITENDCMPLX;
275 else
277 RESIZEPRSBUF;
278 *(curpos) = '\0';
279 if (curpos == state->word)
280 PRSSYNTAXERROR;
281 if (state->oprisdelim)
283 /* state->prsbuf+=pg_mblen(state->prsbuf); */
284 RETURN_TOKEN;
286 else
287 statecode = WAITPOSINFO;
288 continue; /* recheck current character */
291 else if (statecode == WAITPOSINFO)
293 if (t_iseq(state->prsbuf, ':'))
294 statecode = INPOSINFO;
295 else
296 RETURN_TOKEN;
298 else if (statecode == INPOSINFO)
300 if (t_isdigit(state->prsbuf))
302 if (posalen == 0)
304 posalen = 4;
305 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
306 npos = 0;
308 else if (npos + 1 >= posalen)
310 posalen *= 2;
311 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
313 npos++;
314 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
315 /* we cannot get here in tsquery, so no need for 2 errmsgs */
316 if (WEP_GETPOS(pos[npos - 1]) == 0)
317 ereport(ERROR,
318 (errcode(ERRCODE_SYNTAX_ERROR),
319 errmsg("wrong position info in tsvector: \"%s\"",
320 state->bufstart)));
321 WEP_SETWEIGHT(pos[npos - 1], 0);
322 statecode = WAITPOSDELIM;
324 else
325 PRSSYNTAXERROR;
327 else if (statecode == WAITPOSDELIM)
329 if (t_iseq(state->prsbuf, ','))
330 statecode = INPOSINFO;
331 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
333 if (WEP_GETWEIGHT(pos[npos - 1]))
334 PRSSYNTAXERROR;
335 WEP_SETWEIGHT(pos[npos - 1], 3);
337 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
339 if (WEP_GETWEIGHT(pos[npos - 1]))
340 PRSSYNTAXERROR;
341 WEP_SETWEIGHT(pos[npos - 1], 2);
343 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
345 if (WEP_GETWEIGHT(pos[npos - 1]))
346 PRSSYNTAXERROR;
347 WEP_SETWEIGHT(pos[npos - 1], 1);
349 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
351 if (WEP_GETWEIGHT(pos[npos - 1]))
352 PRSSYNTAXERROR;
353 WEP_SETWEIGHT(pos[npos - 1], 0);
355 else if (t_isspace(state->prsbuf) ||
356 *(state->prsbuf) == '\0')
357 RETURN_TOKEN;
358 else if (!t_isdigit(state->prsbuf))
359 PRSSYNTAXERROR;
361 else /* internal error */
362 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
363 statecode);
365 /* get next char */
366 state->prsbuf += pg_mblen(state->prsbuf);
369 return false;