6 #include "utils/array.h"
7 #include "catalog/pg_type.h"
8 #include "tsearch/ts_locale.h"
12 float4 trgm_limit
= 0.3f
;
14 PG_FUNCTION_INFO_V1(set_limit
);
15 Datum
set_limit(PG_FUNCTION_ARGS
);
17 set_limit(PG_FUNCTION_ARGS
)
19 float4 nlimit
= PG_GETARG_FLOAT4(0);
21 if (nlimit
< 0 || nlimit
> 1.0)
22 elog(ERROR
, "wrong limit, should be between 0 and 1");
24 PG_RETURN_FLOAT4(trgm_limit
);
27 PG_FUNCTION_INFO_V1(show_limit
);
28 Datum
show_limit(PG_FUNCTION_ARGS
);
30 show_limit(PG_FUNCTION_ARGS
)
32 PG_RETURN_FLOAT4(trgm_limit
);
36 comp_trgm(const void *a
, const void *b
)
42 unique_array(trgm
*a
, int len
)
49 if (CMPTRGM(tmp
, curend
))
58 return curend
+ 1 - a
;
62 #define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
64 #define iswordchr(c) (!t_isspace(c))
68 * Finds the first word in the string and returns a pointer to it;
69 * *endword is set to point to the character just after that word
72 find_word(char *str
, int lenstr
, char **endword
, int *charlen
)
74 char *beginword
= str
;
76 while (beginword
- str
< lenstr
&& !iswordchr(beginword
))
77 beginword
+= pg_mblen(beginword
);
79 if (beginword
- str
>= lenstr
)
84 while (*endword
- str
< lenstr
&& iswordchr(*endword
))
86 *endword
+= pg_mblen(*endword
);
93 #ifdef USE_WIDE_UPPER_LOWER
95 cnt_trigram(trgm
*tptr
, char *str
, int bytelen
)
106 COMP_CRC32(crc
, str
, bytelen
);
110 * use only 3 upper bytes from crc, hope, it's good enough hashing
118 * Adds trigrams from words (already padded).
121 make_trigrams(trgm
*tptr
, char *str
, int bytelen
, int charlen
)
128 #ifdef USE_WIDE_UPPER_LOWER
129 if (pg_database_encoding_max_length() > 1)
131 int lenfirst
= pg_mblen(str
),
132 lenmiddle
= pg_mblen(str
+ lenfirst
),
133 lenlast
= pg_mblen(str
+ lenfirst
+ lenmiddle
);
135 while ((ptr
- str
) + lenfirst
+ lenmiddle
+ lenlast
<= bytelen
)
137 cnt_trigram(tptr
, ptr
, lenfirst
+ lenmiddle
+ lenlast
);
142 lenfirst
= lenmiddle
;
144 lenlast
= pg_mblen(ptr
+ lenfirst
+ lenmiddle
);
150 Assert(bytelen
== charlen
);
152 while (ptr
- str
< bytelen
- 2 /* number of trigrams = strlen - 2 */ )
164 generate_trgm(char *str
, int slen
)
175 trg
= (TRGM
*) palloc(TRGMHDRSIZE
+ sizeof(trgm
) * (slen
/ 2 + 1) *3);
177 SET_VARSIZE(trg
, TRGMHDRSIZE
);
179 if (slen
+ LPADDING
+ RPADDING
< 3 || slen
== 0)
184 buf
= palloc(sizeof(char) * (slen
+ 4));
194 while ((bword
= find_word(eword
, slen
- (eword
- str
), &eword
, &charlen
)) != NULL
)
197 bword
= lowerstr_with_len(bword
, eword
- bword
);
198 bytelen
= strlen(bword
);
200 bytelen
= eword
- bword
;
203 memcpy(buf
+ LPADDING
, bword
, bytelen
);
208 buf
[LPADDING
+ bytelen
] = ' ';
209 buf
[LPADDING
+ bytelen
+ 1] = ' ';
214 tptr
= make_trigrams(tptr
, buf
, bytelen
+ LPADDING
+ RPADDING
,
215 charlen
+ LPADDING
+ RPADDING
);
220 if ((len
= tptr
- GETARR(trg
)) == 0)
225 qsort((void *) GETARR(trg
), len
, sizeof(trgm
), comp_trgm
);
226 len
= unique_array(GETARR(trg
), len
);
229 SET_VARSIZE(trg
, CALCGTSIZE(ARRKEY
, len
));
239 val
|= *(((unsigned char *) ptr
));
241 val
|= *(((unsigned char *) ptr
) + 1);
243 val
|= *(((unsigned char *) ptr
) + 2);
248 PG_FUNCTION_INFO_V1(show_trgm
);
249 Datum
show_trgm(PG_FUNCTION_ARGS
);
251 show_trgm(PG_FUNCTION_ARGS
)
253 text
*in
= PG_GETARG_TEXT_P(0);
260 trg
= generate_trgm(VARDATA(in
), VARSIZE(in
) - VARHDRSZ
);
261 d
= (Datum
*) palloc(sizeof(Datum
) * (1 + ARRNELEM(trg
)));
263 for (i
= 0, ptr
= GETARR(trg
); i
< ARRNELEM(trg
); i
++, ptr
++)
265 text
*item
= (text
*) palloc(VARHDRSZ
+ Max(12, pg_database_encoding_max_length() * 3));
267 if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr
))
269 snprintf(VARDATA(item
), 12, "0x%06x", trgm2int(ptr
));
270 SET_VARSIZE(item
, VARHDRSZ
+ strlen(VARDATA(item
)));
274 SET_VARSIZE(item
, VARHDRSZ
+ 3);
275 CPTRGM(VARDATA(item
), ptr
);
277 d
[i
] = PointerGetDatum(item
);
289 for (i
= 0; i
< ARRNELEM(trg
); i
++)
290 pfree(DatumGetPointer(d
[i
]));
294 PG_FREE_IF_COPY(in
, 0);
296 PG_RETURN_POINTER(a
);
300 cnt_sml(TRGM
*trg1
, TRGM
*trg2
)
311 len1
= ARRNELEM(trg1
);
312 len2
= ARRNELEM(trg2
);
314 while (ptr1
- GETARR(trg1
) < len1
&& ptr2
- GETARR(trg2
) < len2
)
316 int res
= CMPTRGM(ptr1
, ptr2
);
331 return ((((float4
) count
) / ((float4
) (len1
+ len2
- count
))));
333 return (((float) count
) / ((float) ((len1
> len2
) ? len1
: len2
)));
338 PG_FUNCTION_INFO_V1(similarity
);
339 Datum
similarity(PG_FUNCTION_ARGS
);
341 similarity(PG_FUNCTION_ARGS
)
343 text
*in1
= PG_GETARG_TEXT_P(0);
344 text
*in2
= PG_GETARG_TEXT_P(1);
349 trg1
= generate_trgm(VARDATA(in1
), VARSIZE(in1
) - VARHDRSZ
);
350 trg2
= generate_trgm(VARDATA(in2
), VARSIZE(in2
) - VARHDRSZ
);
352 res
= cnt_sml(trg1
, trg2
);
356 PG_FREE_IF_COPY(in1
, 0);
357 PG_FREE_IF_COPY(in2
, 1);
359 PG_RETURN_FLOAT4(res
);
362 PG_FUNCTION_INFO_V1(similarity_op
);
363 Datum
similarity_op(PG_FUNCTION_ARGS
);
365 similarity_op(PG_FUNCTION_ARGS
)
367 float4 res
= DatumGetFloat4(DirectFunctionCall2(
373 PG_RETURN_BOOL(res
>= trgm_limit
);