1 /*-------------------------------------------------------------------------
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/memutils.h"
25 * Private state of tsvector parser. Note that tsquery also uses this code to
26 * parse its input, hence the boolean flags. The two flags are both true or
27 * both false in current usage, but we keep them separate for clarity.
28 * is_tsquery affects *only* the content of error messages.
/*
 * Parser state carried across calls on one input string.  The visible code
 * fills these fields in init_tsvector_parser().
 */
30 struct TSVectorParseStateData
32 char *prsbuf
; /* next input character; advanced one multibyte character per parser step */
33 char *bufstart
; /* whole string as passed to init (used only for errors) */
34 char *word
; /* palloc'd buffer to hold the current word; may be moved by repalloc */
35 int len
; /* size in bytes allocated for 'word' */
36 int eml
; /* max bytes per character, from pg_database_encoding_max_length() */
37 bool oprisdelim
; /* treat the ISOPERATOR characters ! & | ( ) as delimiters? */
38 bool is_tsquery
; /* say "tsquery" not "tsvector" in errors? */
43 * Initializes parser for the input string. If oprisdelim is set, the
44 * following characters are treated as delimiters in addition to whitespace:
/*
 * Allocate and fill a new parser state for 'input'.
 *
 * 'input' is referenced, not copied: both prsbuf (the scan position) and
 * bufstart (kept for error messages) point straight at the caller's string.
 * 'oprisdelim' and 'is_tsquery' are stored unchanged in the state.
 *
 * NOTE(review): the statement that sets state->len before the word buffer is
 * allocated, and the return of 'state', are not visible in this excerpt.
 */
48 init_tsvector_parser(char *input
, bool oprisdelim
, bool is_tsquery
)
50 TSVectorParseState state
;
52 state
= (TSVectorParseState
) palloc(sizeof(struct TSVectorParseStateData
));
/* scan position and error-message copy both start at the input */
53 state
->prsbuf
= input
;
54 state
->bufstart
= input
;
/* word buffer is state->len bytes long */
56 state
->word
= (char *) palloc(state
->len
);
/* cached so the buffer-resize check knows the widest possible character */
57 state
->eml
= pg_database_encoding_max_length();
58 state
->oprisdelim
= oprisdelim
;
59 state
->is_tsquery
= is_tsquery
;
65 * Reinitializes parser to parse 'input', instead of previous input.
/*
 * Point an existing parser state at a new input string by resetting the scan
 * position (prsbuf).
 *
 * NOTE(review): only prsbuf is assigned in the visible code; bufstart, which
 * the syntax-error messages quote, is not updated here -- confirm whether
 * that is intended.
 */
68 reset_tsvector_parser(TSVectorParseState state
, char *input
)
70 state
->prsbuf
= input
;
74 * Shuts down a tsvector parser.
/*
 * Takes the parser state to shut down.
 * NOTE(review): the function body is not visible in this excerpt.
 */
77 close_tsvector_parser(TSVectorParseState state
)
83 /*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects local variables 'state' and 'curpos' at the expansion site.  The
 * buffer counts as full once the bytes already used (curpos - state->word)
 * plus the widest possible character (state->eml) reach state->len.  After
 * repalloc the buffer may have moved, so curpos is re-derived from the saved
 * offset.
 *
 * NOTE(review): the statement that enlarges state->len before the repalloc
 * is not visible in this excerpt.
 */
84 #define RESIZEPRSBUF \
86 int clen = curpos - state->word; \
87 if ( clen + state->eml >= state->len ) \
90 state->word = (char *) repalloc(state->word, state->len); \
91 curpos = state->word + clen; \
/* True when x points at a one-byte character that is one of: ! & | ( ) */
95 #define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
97 /*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Each output pointer is written only when it is non-NULL.  *strval points
 * into the parser's own word buffer (state->word), not at a copy; *lenval is
 * the number of bytes collected so far (curpos - state->word); *endptr is
 * the current scan position (state->prsbuf).
 *
 * NOTE(review): the bodies of the pos_ptr / pos branches are not visible in
 * this excerpt.
 */
98 #define RETURN_TOKEN \
100 if (pos_ptr != NULL) \
105 else if (pos != NULL) \
108 if (strval != NULL) \
109 *strval = state->word; \
110 if (lenval != NULL) \
111 *lenval = curpos - state->word; \
112 if (endptr != NULL) \
113 *endptr = state->prsbuf; \
118 /* State codes used in gettoken_tsvector */
/* (WAITWORD and INPOSINFO are used there too; their definitions are not visible in this excerpt.) */
120 #define WAITENDWORD 2 /* inside an unquoted word */
121 #define WAITNEXTCHAR 3 /* just saw a backslash; the next character is copied as-is */
122 #define WAITENDCMPLX 4 /* inside a single-quoted word */
123 #define WAITPOSINFO 5 /* quoted word finished; a ':' here starts position info */
125 #define WAITPOSDELIM 7 /* after a position number: ',', a weight letter, or the end */
126 #define WAITCHARCMPLX 8 /* saw a quote inside a quoted word; a second quote is a literal quote */
/*
 * Report a syntax error with ERRCODE_SYNTAX_ERROR.  The message quotes the
 * whole input string (state->bufstart) and exists in two wordings, one
 * saying "tsquery" and one saying "tsvector".
 *
 * NOTE(review): the condition that selects between the two messages is not
 * visible in this excerpt; presumably it is state->is_tsquery -- confirm.
 *
 * PRSSYNTAXERROR is shorthand for calling this on a local named 'state'.
 */
128 #define PRSSYNTAXERROR prssyntaxerror(state)
131 prssyntaxerror(TSVectorParseState state
)
134 (errcode(ERRCODE_SYNTAX_ERROR
),
136 errmsg("syntax error in tsquery: \"%s\"", state
->bufstart
) :
137 errmsg("syntax error in tsvector: \"%s\"", state
->bufstart
)));
142 * Get next token from string being parsed. Returns true if successful,
143 * false if end of input string is reached. On success, these output
144 * parameters are filled in:
146 * *strval pointer to token
147 * *lenval length of *strval
148 * *pos_ptr pointer to a palloc'd array of positions and weights
149 * associated with the token. If the caller is not interested
150 * in the information, NULL can be supplied. Otherwise
151 * the caller is responsible for pfreeing the array.
152 * *poslen number of elements in *pos_ptr
153 * *endptr scan resumption point
155 * Pass NULL for unwanted output parameters.
/*
 * Character-at-a-time state machine that extracts the next token from
 * state->prsbuf into state->word, optionally followed by a ':'-introduced,
 * comma-separated list of positions with weight letters.
 *
 * NOTE(review): several action lines (returns, error calls, braces) are not
 * visible in this excerpt; the comments below describe only the checks and
 * state transitions that are.
 */
158 gettoken_tsvector(TSVectorParseState state
,
159 char **strval
, int *lenval
,
160 WordEntryPos
**pos_ptr
, int *poslen
,
/* every call assembles its token from the start of the word buffer */
164 char *curpos
= state
->word
;
165 int statecode
= WAITWORD
;
168 * pos is for collecting the comma delimited list of positions followed by
171 WordEntryPos
*pos
= NULL
;
172 int npos
= 0; /* elements of pos used */
173 int posalen
= 0; /* allocated size of pos */
/*
 * WAITWORD: no token character seen yet.  Checks, in order: end of input,
 * an opening single quote, a backslash escape, an operator character (only
 * when oprisdelim is set), and finally any non-space character, which
 * becomes the first character of an unquoted word.
 */
177 if (statecode
== WAITWORD
)
179 if (*(state
->prsbuf
) == '\0')
181 else if (t_iseq(state
->prsbuf
, '\''))
182 statecode
= WAITENDCMPLX
;
183 else if (t_iseq(state
->prsbuf
, '\\'))
/* oldstate records where to resume after the escaped character is copied */
185 statecode
= WAITNEXTCHAR
;
186 oldstate
= WAITENDWORD
;
188 else if (state
->oprisdelim
&& ISOPERATOR(state
->prsbuf
))
190 else if (!t_isspace(state
->prsbuf
))
/* copy the whole multibyte character, then step curpos past it */
192 COPYCHAR(curpos
, state
->prsbuf
);
193 curpos
+= pg_mblen(state
->prsbuf
);
194 statecode
= WAITENDWORD
;
/*
 * WAITNEXTCHAR: the previous character was a backslash.  End of input here
 * is an error; any other character is copied verbatim and the machine
 * returns to the state saved in oldstate.
 */
197 else if (statecode
== WAITNEXTCHAR
)
199 if (*(state
->prsbuf
) == '\0')
201 (errcode(ERRCODE_SYNTAX_ERROR
),
202 errmsg("there is no escaped character: \"%s\"",
207 COPYCHAR(curpos
, state
->prsbuf
);
208 curpos
+= pg_mblen(state
->prsbuf
);
209 Assert(oldstate
!= 0);
210 statecode
= oldstate
;
/*
 * WAITENDWORD: inside an unquoted word.  A backslash escapes the next
 * character; whitespace, end of input, or (with oprisdelim) an operator
 * character ends the word; ':' ends the word and leads to position info;
 * anything else is appended to the word.
 */
213 else if (statecode
== WAITENDWORD
)
215 if (t_iseq(state
->prsbuf
, '\\'))
217 statecode
= WAITNEXTCHAR
;
218 oldstate
= WAITENDWORD
;
220 else if (t_isspace(state
->prsbuf
) || *(state
->prsbuf
) == '\0' ||
221 (state
->oprisdelim
&& ISOPERATOR(state
->prsbuf
)))
/* curpos == state->word means nothing was collected (empty word) */
224 if (curpos
== state
->word
)
229 else if (t_iseq(state
->prsbuf
, ':'))
231 if (curpos
== state
->word
)
234 if (state
->oprisdelim
)
237 statecode
= INPOSINFO
;
242 COPYCHAR(curpos
, state
->prsbuf
);
243 curpos
+= pg_mblen(state
->prsbuf
);
/*
 * WAITENDCMPLX: inside a single-quoted word.  A quote may close the word
 * (decided in WAITCHARCMPLX); a backslash escapes the next character and
 * resumes here; end of input is checked for; anything else is appended.
 */
246 else if (statecode
== WAITENDCMPLX
)
248 if (t_iseq(state
->prsbuf
, '\''))
250 statecode
= WAITCHARCMPLX
;
252 else if (t_iseq(state
->prsbuf
, '\\'))
254 statecode
= WAITNEXTCHAR
;
255 oldstate
= WAITENDCMPLX
;
257 else if (*(state
->prsbuf
) == '\0')
262 COPYCHAR(curpos
, state
->prsbuf
);
263 curpos
+= pg_mblen(state
->prsbuf
);
/*
 * WAITCHARCMPLX: a quote was seen inside a quoted word.  A second quote is
 * a doubled quote: one quote character is appended and quoting continues.
 * Otherwise the quoted word is complete and the current character is
 * re-examined in WAITPOSINFO without advancing the input.
 */
266 else if (statecode
== WAITCHARCMPLX
)
268 if (t_iseq(state
->prsbuf
, '\''))
271 COPYCHAR(curpos
, state
->prsbuf
);
272 curpos
+= pg_mblen(state
->prsbuf
);
273 statecode
= WAITENDCMPLX
;
279 if (curpos
== state
->word
)
281 if (state
->oprisdelim
)
283 /* state->prsbuf+=pg_mblen(state->prsbuf); */
287 statecode
= WAITPOSINFO
;
288 continue; /* recheck current character */
/* WAITPOSINFO: after a quoted word, a ':' starts the position list */
291 else if (statecode
== WAITPOSINFO
)
293 if (t_iseq(state
->prsbuf
, ':'))
294 statecode
= INPOSINFO
;
/*
 * INPOSINFO: expecting the first digit of a position number.  The pos array
 * is allocated or enlarged as needed, then atoi() reads the whole decimal
 * number starting at the current character and LIMITPOS is applied before
 * it is stored in pos[npos - 1] with weight 0.
 * NOTE(review): the statements that set posalen and increment npos are not
 * visible in this excerpt.
 */
298 else if (statecode
== INPOSINFO
)
300 if (t_isdigit(state
->prsbuf
))
305 pos
= (WordEntryPos
*) palloc(sizeof(WordEntryPos
) * posalen
);
308 else if (npos
+ 1 >= posalen
)
311 pos
= (WordEntryPos
*) repalloc(pos
, sizeof(WordEntryPos
) * posalen
);
314 WEP_SETPOS(pos
[npos
- 1], LIMITPOS(atoi(state
->prsbuf
)));
315 /* we cannot get here in tsquery, so no need for 2 errmsgs */
316 if (WEP_GETPOS(pos
[npos
- 1]) == 0)
318 (errcode(ERRCODE_SYNTAX_ERROR
),
319 errmsg("wrong position info in tsvector: \"%s\"",
321 WEP_SETWEIGHT(pos
[npos
- 1], 0);
322 statecode
= WAITPOSDELIM
;
/*
 * WAITPOSDELIM: after the first digit of a position.  ',' starts the next
 * position.  A weight letter sets the weight of the latest position:
 * a/A/'*' -> 3, b/B -> 2, c/C -> 1, d/D -> 0; before each assignment the
 * current weight is tested with WEP_GETWEIGHT (the action taken when it is
 * already nonzero is not visible here).  Whitespace or end of input is
 * checked for as well.  As far as the visible branches show, a digit matches
 * none of them and is simply skipped: atoi() already consumed the whole
 * number at its first digit.
 */
327 else if (statecode
== WAITPOSDELIM
)
329 if (t_iseq(state
->prsbuf
, ','))
330 statecode
= INPOSINFO
;
331 else if (t_iseq(state
->prsbuf
, 'a') || t_iseq(state
->prsbuf
, 'A') || t_iseq(state
->prsbuf
, '*'))
333 if (WEP_GETWEIGHT(pos
[npos
- 1]))
335 WEP_SETWEIGHT(pos
[npos
- 1], 3);
337 else if (t_iseq(state
->prsbuf
, 'b') || t_iseq(state
->prsbuf
, 'B'))
339 if (WEP_GETWEIGHT(pos
[npos
- 1]))
341 WEP_SETWEIGHT(pos
[npos
- 1], 2);
343 else if (t_iseq(state
->prsbuf
, 'c') || t_iseq(state
->prsbuf
, 'C'))
345 if (WEP_GETWEIGHT(pos
[npos
- 1]))
347 WEP_SETWEIGHT(pos
[npos
- 1], 1);
349 else if (t_iseq(state
->prsbuf
, 'd') || t_iseq(state
->prsbuf
, 'D'))
351 if (WEP_GETWEIGHT(pos
[npos
- 1]))
353 WEP_SETWEIGHT(pos
[npos
- 1], 0);
355 else if (t_isspace(state
->prsbuf
) ||
356 *(state
->prsbuf
) == '\0')
358 else if (!t_isdigit(state
->prsbuf
))
361 else /* internal error */
362 elog(ERROR
, "unrecognized state in gettoken_tsvector: %d",
/* advance the scan position by one (possibly multibyte) character */
366 state
->prsbuf
+= pg_mblen(state
->prsbuf
);