1 /*-------------------------------------------------------------------------
4 * I/O functions for tsquery
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
17 #include "libpq/pqformat.h"
18 #include "miscadmin.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23 #include "utils/memutils.h"
24 #include "utils/pg_crc.h"
27 struct TSQueryParserStateData
29 /* State for gettoken_query */
30 char *buffer
; /* entire string we are scanning */
31 char *buf
; /* current scan point */
33 int count
; /* nesting count, incremented by (,
36 /* polish (prefix) notation in list, filled in by push* functions */
40 * Strings from operands are collected in op. curop is a pointer to the
41 * end of used space of op.
45 int lenop
; /* allocated size of op */
46 int sumlen
; /* used size of op */
48 /* state for value's parser */
49 TSVectorParseState valstate
;
54 #define WAITOPERATOR 2
55 #define WAITFIRSTOPERAND 3
56 #define WAITSINGLEOPERAND 4
59 * subroutine to parse the modifiers (weight and prefix flag currently)
60 * part, like ':1AB' of a query.
63 get_modifiers(char *buf
, int16
*weight
, bool *prefix
)
68 if (!t_iseq(buf
, ':'))
72 while (*buf
&& pg_mblen(buf
) == 1)
105 * token types for parsing
118 * get token from query string
120 * *operator is filled in with OP_* when return values is PT_OPR
121 * *strval, *lenval and *weight are filled in when return value is PT_VAL
124 gettoken_query(TSQueryParserState state
,
126 int *lenval
, char **strval
, int16
*weight
, bool *prefix
)
133 switch (state
->state
)
135 case WAITFIRSTOPERAND
:
137 if (t_iseq(state
->buf
, '!'))
139 (state
->buf
)++; /* can safely ++, t_iseq guarantee
140 * that pg_mblen()==1 */
142 state
->state
= WAITOPERAND
;
145 else if (t_iseq(state
->buf
, '('))
149 state
->state
= WAITOPERAND
;
152 else if (t_iseq(state
->buf
, ':'))
155 (errcode(ERRCODE_SYNTAX_ERROR
),
156 errmsg("syntax error in tsquery: \"%s\"",
159 else if (!t_isspace(state
->buf
))
162 * We rely on the tsvector parser to parse the value for
165 reset_tsvector_parser(state
->valstate
, state
->buf
);
166 if (gettoken_tsvector(state
->valstate
, strval
, lenval
, NULL
, NULL
, &state
->buf
))
168 state
->buf
= get_modifiers(state
->buf
, weight
, prefix
);
169 state
->state
= WAITOPERATOR
;
172 else if (state
->state
== WAITFIRSTOPERAND
)
176 (errcode(ERRCODE_SYNTAX_ERROR
),
177 errmsg("no operand in tsquery: \"%s\"",
182 if (t_iseq(state
->buf
, '&'))
184 state
->state
= WAITOPERAND
;
189 if (t_iseq(state
->buf
, '|'))
191 state
->state
= WAITOPERAND
;
196 else if (t_iseq(state
->buf
, ')'))
200 return (state
->count
< 0) ? PT_ERR
: PT_CLOSE
;
202 else if (*(state
->buf
) == '\0')
203 return (state
->count
) ? PT_ERR
: PT_END
;
204 else if (!t_isspace(state
->buf
))
207 case WAITSINGLEOPERAND
:
208 if (*(state
->buf
) == '\0')
210 *strval
= state
->buf
;
211 *lenval
= strlen(state
->buf
);
212 state
->buf
+= strlen(state
->buf
);
219 state
->buf
+= pg_mblen(state
->buf
);
225 * Push an operator to state->polstr
228 pushOperator(TSQueryParserState state
, int8 oper
)
232 Assert(oper
== OP_NOT
|| oper
== OP_AND
|| oper
== OP_OR
);
234 tmp
= (QueryOperator
*) palloc0(sizeof(QueryOperator
));
237 /* left is filled in later with findoprnd */
239 state
->polstr
= lcons(tmp
, state
->polstr
);
243 pushValue_internal(TSQueryParserState state
, pg_crc32 valcrc
, int distance
, int lenval
, int weight
, bool prefix
)
247 if (distance
>= MAXSTRPOS
)
249 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
250 errmsg("value is too big in tsquery: \"%s\"",
252 if (lenval
>= MAXSTRLEN
)
254 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
255 errmsg("operand is too long in tsquery: \"%s\"",
258 tmp
= (QueryOperand
*) palloc0(sizeof(QueryOperand
));
260 tmp
->weight
= weight
;
261 tmp
->prefix
= prefix
;
262 tmp
->valcrc
= (int32
) valcrc
;
263 tmp
->length
= lenval
;
264 tmp
->distance
= distance
;
266 state
->polstr
= lcons(tmp
, state
->polstr
);
270 * Push an operand to state->polstr.
272 * strval must point to a string equal to state->curop. lenval is the length
276 pushValue(TSQueryParserState state
, char *strval
, int lenval
, int2 weight
, bool prefix
)
280 if (lenval
>= MAXSTRLEN
)
282 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
283 errmsg("word is too long in tsquery: \"%s\"",
287 COMP_CRC32(valcrc
, strval
, lenval
);
289 pushValue_internal(state
, valcrc
, state
->curop
- state
->op
, lenval
, weight
, prefix
);
291 /* append the value string to state.op, enlarging buffer if needed first */
292 while (state
->curop
- state
->op
+ lenval
+ 1 >= state
->lenop
)
294 int used
= state
->curop
- state
->op
;
297 state
->op
= (char *) repalloc((void *) state
->op
, state
->lenop
);
298 state
->curop
= state
->op
+ used
;
300 memcpy((void *) state
->curop
, (void *) strval
, lenval
);
301 state
->curop
+= lenval
;
302 *(state
->curop
) = '\0';
304 state
->sumlen
+= lenval
+ 1 /* \0 */ ;
309 * Push a stopword placeholder to state->polstr
312 pushStop(TSQueryParserState state
)
316 tmp
= (QueryOperand
*) palloc0(sizeof(QueryOperand
));
317 tmp
->type
= QI_VALSTOP
;
319 state
->polstr
= lcons(tmp
, state
->polstr
);
323 #define STACKDEPTH 32
326 * Make polish (prefix) notation of query.
328 * See parse_tsquery for explanation of pushval.
331 makepol(TSQueryParserState state
,
332 PushFunction pushval
,
339 int8 opstack
[STACKDEPTH
];
344 /* since this function recurses, it could be driven to stack overflow */
347 while ((type
= gettoken_query(state
, &operator, &lenval
, &strval
, &weight
, &prefix
)) != PT_END
)
352 pushval(opaque
, state
, strval
, lenval
, weight
, prefix
);
353 while (lenstack
&& (opstack
[lenstack
- 1] == OP_AND
||
354 opstack
[lenstack
- 1] == OP_NOT
))
357 pushOperator(state
, opstack
[lenstack
]);
361 if (lenstack
&& operator == OP_OR
)
362 pushOperator(state
, OP_OR
);
365 if (lenstack
== STACKDEPTH
) /* internal error */
366 elog(ERROR
, "tsquery stack too small");
367 opstack
[lenstack
] = operator;
372 makepol(state
, pushval
, opaque
);
374 if (lenstack
&& (opstack
[lenstack
- 1] == OP_AND
||
375 opstack
[lenstack
- 1] == OP_NOT
))
378 pushOperator(state
, opstack
[lenstack
]);
385 pushOperator(state
, opstack
[lenstack
]);
391 (errcode(ERRCODE_SYNTAX_ERROR
),
392 errmsg("syntax error in tsquery: \"%s\"",
399 pushOperator(state
, opstack
[lenstack
]);
404 findoprnd_recurse(QueryItem
*ptr
, uint32
*pos
, int nnodes
)
406 /* since this function recurses, it could be driven to stack overflow. */
410 elog(ERROR
, "malformed tsquery: operand not found");
412 if (ptr
[*pos
].type
== QI_VAL
||
413 ptr
[*pos
].type
== QI_VALSTOP
) /* need to handle VALSTOP here, they
414 * haven't been cleaned away yet. */
420 Assert(ptr
[*pos
].type
== QI_OPR
);
422 if (ptr
[*pos
].operator.oper
== OP_NOT
)
424 ptr
[*pos
].operator.left
= 1;
426 findoprnd_recurse(ptr
, pos
, nnodes
);
430 QueryOperator
*curitem
= &ptr
[*pos
].operator;
433 Assert(curitem
->oper
== OP_AND
|| curitem
->oper
== OP_OR
);
436 findoprnd_recurse(ptr
, pos
, nnodes
);
437 curitem
->left
= *pos
- tmp
;
438 findoprnd_recurse(ptr
, pos
, nnodes
);
445 * Fills in the left-fields previously left unfilled. The input
446 * QueryItems must be in polish (prefix) notation.
449 findoprnd(QueryItem
*ptr
, int size
)
454 findoprnd_recurse(ptr
, &pos
, size
);
457 elog(ERROR
, "malformed tsquery: extra nodes");
462 * Each value (operand) in the query is be passed to pushval. pushval can
463 * transform the simple value to an arbitrarily complex expression using
464 * pushValue and pushOperator. It must push a single value with pushValue,
465 * a complete expression with all operands, or a a stopword placeholder
466 * with pushStop, otherwise the prefix notation representation will be broken,
467 * having an operator with no operand.
469 * opaque is passed on to pushval as is, pushval can use it to store its
472 * The returned query might contain QI_STOPVAL nodes. The caller is responsible
473 * for cleaning them up (with clean_fakeval)
476 parse_tsquery(char *buf
,
477 PushFunction pushval
,
481 struct TSQueryParserStateData state
;
491 state
.state
= (isplain
) ? WAITSINGLEOPERAND
: WAITFIRSTOPERAND
;
495 /* init value parser's state */
496 state
.valstate
= init_tsvector_parser(state
.buffer
, true, true);
498 /* init list of operand */
501 state
.curop
= state
.op
= (char *) palloc(state
.lenop
);
502 *(state
.curop
) = '\0';
504 /* parse query & make polish notation (postfix, but in reverse order) */
505 makepol(&state
, pushval
, opaque
);
507 close_tsvector_parser(state
.valstate
);
509 if (list_length(state
.polstr
) == 0)
512 (errmsg("text-search query doesn't contain lexemes: \"%s\"",
514 query
= (TSQuery
) palloc(HDRSIZETQ
);
515 SET_VARSIZE(query
, HDRSIZETQ
);
520 /* Pack the QueryItems in the final TSQuery struct to return to caller */
521 commonlen
= COMPUTESIZE(list_length(state
.polstr
), state
.sumlen
);
522 query
= (TSQuery
) palloc0(commonlen
);
523 SET_VARSIZE(query
, commonlen
);
524 query
->size
= list_length(state
.polstr
);
525 ptr
= GETQUERY(query
);
527 /* Copy QueryItems to TSQuery */
529 foreach(cell
, state
.polstr
)
531 QueryItem
*item
= (QueryItem
*) lfirst(cell
);
536 memcpy(&ptr
[i
], item
, sizeof(QueryOperand
));
539 ptr
[i
].type
= QI_VALSTOP
;
542 memcpy(&ptr
[i
], item
, sizeof(QueryOperator
));
545 elog(ERROR
, "unrecognized QueryItem type: %d", item
->type
);
550 /* Copy all the operand strings to TSQuery */
551 memcpy((void *) GETOPERAND(query
), (void *) state
.op
, state
.sumlen
);
554 /* Set left operand pointers for every operator. */
555 findoprnd(ptr
, query
->size
);
561 pushval_asis(Datum opaque
, TSQueryParserState state
, char *strval
, int lenval
,
562 int16 weight
, bool prefix
)
564 pushValue(state
, strval
, lenval
, weight
, prefix
);
568 * in without morphology
571 tsqueryin(PG_FUNCTION_ARGS
)
573 char *in
= PG_GETARG_CSTRING(0);
575 pg_verifymbstr(in
, strlen(in
), false);
577 PG_RETURN_TSQUERY(parse_tsquery(in
, pushval_asis
, PointerGetDatum(NULL
), false));
592 /* Makes sure inf->buf is large enough for adding 'addsize' bytes */
593 #define RESIZEBUF(inf, addsize) \
594 while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
596 int len = (inf)->cur - (inf)->buf; \
597 (inf)->buflen *= 2; \
598 (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
599 (inf)->cur = (inf)->buf + len; \
603 * recursive walk on tree and print it in
604 * infix (human-readable) view
607 infix(INFIX
*in
, bool first
)
609 /* since this function recurses, it could be driven to stack overflow. */
612 if (in
->curpol
->type
== QI_VAL
)
614 QueryOperand
*curpol
= &in
->curpol
->operand
;
615 char *op
= in
->op
+ curpol
->distance
;
618 RESIZEBUF(in
, curpol
->length
* (pg_database_encoding_max_length() + 1) + 2 + 6);
623 if (t_iseq(op
, '\''))
628 else if (t_iseq(op
, '\\'))
633 COPYCHAR(in
->cur
, op
);
641 if (curpol
->weight
|| curpol
->prefix
)
645 if ( curpol
->prefix
)
650 if (curpol
->weight
& (1 << 3))
655 if (curpol
->weight
& (1 << 2))
660 if (curpol
->weight
& (1 << 1))
665 if (curpol
->weight
& 1)
674 else if (in
->curpol
->operator.oper
== OP_NOT
)
684 if (in
->curpol
->type
== QI_OPR
)
688 sprintf(in
->cur
, "( ");
689 in
->cur
= strchr(in
->cur
, '\0');
696 sprintf(in
->cur
, " )");
697 in
->cur
= strchr(in
->cur
, '\0');
702 int8 op
= in
->curpol
->operator.oper
;
706 if (op
== OP_OR
&& !first
)
709 sprintf(in
->cur
, "( ");
710 in
->cur
= strchr(in
->cur
, '\0');
713 nrm
.curpol
= in
->curpol
;
716 nrm
.cur
= nrm
.buf
= (char *) palloc(sizeof(char) * nrm
.buflen
);
718 /* get right operand */
721 /* get & print left operand */
722 in
->curpol
= nrm
.curpol
;
725 /* print operator & right operand */
726 RESIZEBUF(in
, 3 + (nrm
.cur
- nrm
.buf
));
730 sprintf(in
->cur
, " | %s", nrm
.buf
);
733 sprintf(in
->cur
, " & %s", nrm
.buf
);
736 /* OP_NOT is handled in above if-branch */
737 elog(ERROR
, "unrecognized operator type: %d", op
);
739 in
->cur
= strchr(in
->cur
, '\0');
742 if (op
== OP_OR
&& !first
)
745 sprintf(in
->cur
, " )");
746 in
->cur
= strchr(in
->cur
, '\0');
753 tsqueryout(PG_FUNCTION_ARGS
)
755 TSQuery query
= PG_GETARG_TSQUERY(0);
758 if (query
->size
== 0)
763 PG_RETURN_POINTER(b
);
765 nrm
.curpol
= GETQUERY(query
);
767 nrm
.cur
= nrm
.buf
= (char *) palloc(sizeof(char) * nrm
.buflen
);
769 nrm
.op
= GETOPERAND(query
);
772 PG_FREE_IF_COPY(query
, 0);
773 PG_RETURN_CSTRING(nrm
.buf
);
777 * Binary Input / Output functions. The binary format is as follows:
779 * uint32 number of operators/operands in the query
781 * Followed by the operators and operands, in prefix notation. For each
786 * operand text in client encoding, null-terminated
791 * uint8 operator, one of OP_AND, OP_OR, OP_NOT.
794 tsquerysend(PG_FUNCTION_ARGS
)
796 TSQuery query
= PG_GETARG_TSQUERY(0);
799 QueryItem
*item
= GETQUERY(query
);
801 pq_begintypsend(&buf
);
803 pq_sendint(&buf
, query
->size
, sizeof(uint32
));
804 for (i
= 0; i
< query
->size
; i
++)
806 pq_sendint(&buf
, item
->type
, sizeof(item
->type
));
811 pq_sendint(&buf
, item
->operand
.weight
, sizeof(uint8
));
812 pq_sendint(&buf
, item
->operand
.prefix
, sizeof(uint8
));
813 pq_sendstring(&buf
, GETOPERAND(query
) + item
->operand
.distance
);
816 pq_sendint(&buf
, item
->operator.oper
, sizeof(item
->operator.oper
));
819 elog(ERROR
, "unrecognized tsquery node type: %d", item
->type
);
824 PG_FREE_IF_COPY(query
, 0);
826 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
830 tsqueryrecv(PG_FUNCTION_ARGS
)
832 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
840 const char **operands
;
842 size
= pq_getmsgint(buf
, sizeof(uint32
));
843 if (size
> (MaxAllocSize
/ sizeof(QueryItem
)))
844 elog(ERROR
, "invalid size of tsquery");
846 /* Allocate space to temporarily hold operand strings */
847 operands
= palloc(size
* sizeof(char *));
849 /* Allocate space for all the QueryItems. */
850 len
= HDRSIZETQ
+ sizeof(QueryItem
) * size
;
851 query
= (TSQuery
) palloc0(len
);
853 item
= GETQUERY(query
);
856 for (i
= 0; i
< size
; i
++)
858 item
->type
= (int8
) pq_getmsgint(buf
, sizeof(int8
));
860 if (item
->type
== QI_VAL
)
862 size_t val_len
; /* length after recoding to server encoding */
868 weight
= (uint8
) pq_getmsgint(buf
, sizeof(uint8
));
869 prefix
= (uint8
) pq_getmsgint(buf
, sizeof(uint8
));
870 val
= pq_getmsgstring(buf
);
871 val_len
= strlen(val
);
876 elog(ERROR
, "invalid tsquery: invalid weight bitmap");
878 if (val_len
> MAXSTRLEN
)
879 elog(ERROR
, "invalid tsquery: operand too long");
881 if (datalen
> MAXSTRPOS
)
882 elog(ERROR
, "invalid tsquery: total operand length exceeded");
887 COMP_CRC32(valcrc
, val
, val_len
);
890 item
->operand
.weight
= weight
;
891 item
->operand
.prefix
= (prefix
) ? true : false;
892 item
->operand
.valcrc
= (int32
) valcrc
;
893 item
->operand
.length
= val_len
;
894 item
->operand
.distance
= datalen
;
897 * Operand strings are copied to the final struct after this loop;
898 * here we just collect them to an array
902 datalen
+= val_len
+ 1; /* + 1 for the '\0' terminator */
904 else if (item
->type
== QI_OPR
)
908 oper
= (int8
) pq_getmsgint(buf
, sizeof(int8
));
909 if (oper
!= OP_NOT
&& oper
!= OP_OR
&& oper
!= OP_AND
)
910 elog(ERROR
, "invalid tsquery: unrecognized operator type %d",
913 elog(ERROR
, "invalid pointer to right operand");
915 item
->operator.oper
= oper
;
918 elog(ERROR
, "unrecognized tsquery node type: %d", item
->type
);
923 /* Enlarge buffer to make room for the operand values. */
924 query
= (TSQuery
) repalloc(query
, len
+ datalen
);
925 item
= GETQUERY(query
);
926 ptr
= GETOPERAND(query
);
929 * Fill in the left-pointers. Checks that the tree is well-formed as a
932 findoprnd(item
, size
);
934 /* Copy operands to output struct */
935 for (i
= 0; i
< size
; i
++)
937 if (item
->type
== QI_VAL
)
939 memcpy(ptr
, operands
[i
], item
->operand
.length
+ 1);
940 ptr
+= item
->operand
.length
+ 1;
947 Assert(ptr
- GETOPERAND(query
) == datalen
);
949 SET_VARSIZE(query
, len
+ datalen
);
951 PG_RETURN_TSVECTOR(query
);
955 * debug function, used only for view query
956 * which will be executed in non-leaf pages in index
959 tsquerytree(PG_FUNCTION_ARGS
)
961 TSQuery query
= PG_GETARG_TSQUERY(0);
967 if (query
->size
== 0)
969 res
= (text
*) palloc(VARHDRSZ
);
970 SET_VARSIZE(res
, VARHDRSZ
);
971 PG_RETURN_POINTER(res
);
974 q
= clean_NOT(GETQUERY(query
), &len
);
978 res
= cstring_to_text("T");
984 nrm
.cur
= nrm
.buf
= (char *) palloc(sizeof(char) * nrm
.buflen
);
986 nrm
.op
= GETOPERAND(query
);
988 res
= cstring_to_text_with_len(nrm
.buf
, nrm
.cur
- nrm
.buf
);
992 PG_FREE_IF_COPY(query
, 0);
994 PG_RETURN_TEXT_P(res
);