OSDN Git Service

Update BIGM_LAST_UPDATE to '2020.02.28'.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2017-2020, pg_bigm Development Group
4  * Portions Copyright (c) 2013-2016, NTT DATA Corporation
5  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
6  *
7  * Changelog:
8  *       2013/01/09
9  *       Support full text search using bigrams.
10  *       Author: NTT DATA Corporation
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include <ctype.h>
17
18 #include "bigm.h"
19
20 #include "catalog/pg_type.h"
21 #include "tsearch/ts_locale.h"
22 #include "utils/array.h"
23 #include "utils/memutils.h"
24
25 PG_MODULE_MAGIC;
26
27 /* Last update date of pg_bigm */
28 #define BIGM_LAST_UPDATE        "2020.02.28"
29
30 /* GUC variable */
31 bool            bigm_enable_recheck = false;
32 int                     bigm_gin_key_limit = 0;
33 double          bigm_similarity_limit = 0.3;
34 char       *bigm_last_update = NULL;
35
36 PG_FUNCTION_INFO_V1(show_bigm);
37 PG_FUNCTION_INFO_V1(bigmtextcmp);
38 PG_FUNCTION_INFO_V1(likequery);
39 PG_FUNCTION_INFO_V1(bigm_similarity);
40 PG_FUNCTION_INFO_V1(bigm_similarity_op);
41
42 /*
43  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
44  * macro since 9.4, and hence the declaration of the function prototypes
45  * here is necessary only for 9.3 or before.
46  */
47 #if PG_VERSION_NUM < 90400
48 Datum           show_bigm(PG_FUNCTION_ARGS);
49 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
50 Datum           likequery(PG_FUNCTION_ARGS);
51 Datum           bigm_similarity(PG_FUNCTION_ARGS);
52 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
53 #endif
54
55 void            _PG_init(void);
56 void            _PG_fini(void);
57
58 void
59 _PG_init(void)
60 {
61         /* Define custom GUC variables */
62         DefineCustomBoolVariable("pg_bigm.enable_recheck",
63                                                          "Recheck that heap tuples fetched from index "
64                                                          "match the query.",
65                                                          NULL,
66                                                          &bigm_enable_recheck,
67                                                          true,
68                                                          PGC_USERSET,
69                                                          0,
70                                                          NULL,
71                                                          NULL,
72                                                          NULL);
73
74         DefineCustomIntVariable("pg_bigm.gin_key_limit",
75                                                         "Sets the maximum number of bi-gram keys allowed to "
76                                                         "use for GIN index search.",
77                                                         "Zero means no limit.",
78                                                         &bigm_gin_key_limit,
79                                                         0,
80                                                         0, INT_MAX,
81                                                         PGC_USERSET,
82                                                         0,
83                                                         NULL,
84                                                         NULL,
85                                                         NULL);
86
87         DefineCustomRealVariable("pg_bigm.similarity_limit",
88                                                          "Sets the similarity threshold used by the "
89                                                          "=% operator.",
90                                                          NULL,
91                                                          &bigm_similarity_limit,
92                                                          0.3,
93                                                          0.0, 1.0,
94                                                          PGC_USERSET,
95                                                          0,
96                                                          NULL,
97                                                          NULL,
98                                                          NULL);
99
100         /* Can't be set in postgresql.conf */
101         DefineCustomStringVariable("pg_bigm.last_update",
102                                                            "Shows the last update date of pg_bigm.",
103                                                            NULL,
104                                                            &bigm_last_update,
105                                                            BIGM_LAST_UPDATE,
106                                                            PGC_INTERNAL,
107                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
108                                                            NULL,
109                                                            NULL,
110                                                            NULL);
111
112         EmitWarningsOnPlaceholders("pg_bigm");
113 }
114
115 void
116 _PG_fini(void)
117 {
118 }
119
120 static int
121 comp_bigm(const void *a, const void *b, void *arg)
122 {
123         int                     res;
124         bool       *haveDups = (bool *) arg;
125
126         res = CMPBIGM(a, b);
127
128         if (res == 0)
129                 *haveDups = true;
130
131         return res;
132 }
133
134 static int
135 unique_array(bigm *a, int len)
136 {
137         bigm       *curend,
138                            *tmp;
139
140         curend = tmp = a;
141         while (tmp - a < len)
142                 if (CMPBIGM(tmp, curend))
143                 {
144                         curend++;
145                         if (curend != tmp)
146                                 memcpy(curend, tmp, BIGMSIZE);
147                         tmp++;
148                 }
149                 else
150                         tmp++;
151
152         return curend + 1 - a;
153 }
154
155 #define iswordchr(c)    (!t_isspace(c))
156
157 /*
158  * Finds first word in string, returns pointer to the word,
159  * endword points to the character after word
160  */
161 static char *
162 find_word(char *str, int lenstr, char **endword, int *charlen)
163 {
164         char       *beginword = str;
165
166         while (beginword - str < lenstr && !iswordchr(beginword))
167                 beginword += pg_mblen(beginword);
168
169         if (beginword - str >= lenstr)
170                 return NULL;
171
172         *endword = beginword;
173         *charlen = 0;
174         while (*endword - str < lenstr && iswordchr(*endword))
175         {
176                 *endword += pg_mblen(*endword);
177                 (*charlen)++;
178         }
179
180         return beginword;
181 }
182
183 /*
184  * The function is named compact_bigram to maintain consistency with pg_trgm,
185  * though it does not reduce multibyte characters to hash values like in
186  * compact_trigram.
187  */
188 static void
189 compact_bigram(bigm *bptr, char *str, int bytelen)
190 {
191         CPBIGM(bptr, str, bytelen);
192 }
193
194 /*
195  * Adds bigrams from words (already padded).
196  */
197 static bigm *
198 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
199 {
200         char       *ptr = str;
201
202         if (charlen < 2)
203         {
204                 compact_bigram(bptr, ptr, pg_mblen(str));
205                 bptr->pmatch = true;
206                 bptr++;
207                 return bptr;
208         }
209
210         if (bytelen > charlen)
211         {
212                 /* Find multibyte character boundaries and call compact_bigram */
213                 int                     lenfirst = pg_mblen(str),
214                                         lenlast = pg_mblen(str + lenfirst);
215
216                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
217                 {
218                         compact_bigram(bptr, ptr, lenfirst + lenlast);
219
220                         ptr += lenfirst;
221                         bptr++;
222
223                         lenfirst = lenlast;
224                         lenlast = pg_mblen(ptr + lenfirst);
225                 }
226         }
227         else
228         {
229                 /* Fast path when there are no multibyte characters */
230                 Assert(bytelen == charlen);
231
232                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
233                 {
234                         CPBIGM(bptr, ptr, 2);
235                         ptr++;
236                         bptr++;
237                 }
238         }
239
240         return bptr;
241 }
242
243 BIGM *
244 generate_bigm(char *str, int slen)
245 {
246         BIGM       *bgm;
247         char       *buf;
248         bigm       *bptr;
249         int                     len,
250                                 charlen,
251                                 bytelen;
252         char       *bword,
253                            *eword;
254
255         /*
256          * Guard against possible overflow in the palloc requests below.
257          * We need to prevent integer overflow in the multiplications here.
258          */
259         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
260                 (Size) slen > MaxAllocSize - 4)
261                 ereport(ERROR,
262                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
263                                  errmsg("out of memory")));
264
265         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
266         SET_VARSIZE(bgm, VARHDRSZ);
267
268         if (slen + LPADDING + RPADDING < 2 || slen == 0)
269                 return bgm;
270
271         bptr = GETARR(bgm);
272
273         buf = palloc(sizeof(char) * (slen + 4));
274
275         if (LPADDING > 0)
276         {
277                 *buf = ' ';
278                 if (LPADDING > 1)
279                         *(buf + 1) = ' ';
280         }
281
282         eword = str;
283         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
284         {
285                 bytelen = eword - bword;
286                 memcpy(buf + LPADDING, bword, bytelen);
287
288                 buf[LPADDING + bytelen] = ' ';
289                 buf[LPADDING + bytelen + 1] = ' ';
290
291                 /*
292                  * count bigrams
293                  */
294                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
295                                                         charlen + LPADDING + RPADDING);
296         }
297
298         pfree(buf);
299
300         if ((len = bptr - GETARR(bgm)) == 0)
301                 return bgm;
302
303         /*
304          * Make bigrams unique.
305          */
306         if (len > 1)
307         {
308                 bool            haveDups = false;
309
310                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
311                 if (haveDups)
312                         len = unique_array(GETARR(bgm), len);
313         }
314
315         SET_VARSIZE(bgm, CALCGTSIZE(len));
316
317         return bgm;
318 }
319
320 /*
321  * Extract the next non-wildcard part of a search string, i.e. a word bounded
322  * by '_' or '%' meta-characters, non-word characters or string end.
323  *
324  * str: source string, of length lenstr bytes (need not be null-terminated)
325  * buf: where to return the substring (must be long enough)
326  * *bytelen: receives byte length of the found substring
327  * *charlen: receives character length of the found substring
328  *
329  * Returns pointer to end+1 of the found substring in the source string.
330  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
331  *
332  * If the found word is bounded by non-word characters or string boundaries
333  * then this function will include corresponding padding spaces into buf.
334  */
335 static const char *
336 get_wildcard_part(const char *str, int lenstr,
337                                   char *buf, int *bytelen, int *charlen)
338 {
339         const char *beginword = str;
340         const char *endword;
341         char       *s = buf;
342         bool            in_leading_wildcard_meta = false;
343         bool            in_trailing_wildcard_meta = false;
344         bool            in_escape = false;
345         int                     clen;
346
347         /*
348          * Find the first word character, remembering whether preceding character
349          * was wildcard meta-character.  Note that the in_escape state persists
350          * from this loop to the next one, since we may exit at a word character
351          * that is in_escape.
352          */
353         while (beginword - str < lenstr)
354         {
355                 if (in_escape)
356                 {
357                         if (iswordchr(beginword))
358                                 break;
359                         in_escape = false;
360                         in_leading_wildcard_meta = false;
361                 }
362                 else
363                 {
364                         if (ISESCAPECHAR(beginword))
365                                 in_escape = true;
366                         else if (ISWILDCARDCHAR(beginword))
367                                 in_leading_wildcard_meta = true;
368                         else if (iswordchr(beginword))
369                                 break;
370                         else
371                                 in_leading_wildcard_meta = false;
372                 }
373                 beginword += pg_mblen(beginword);
374         }
375
376         /*
377          * Handle string end.
378          */
379         if (beginword - str >= lenstr)
380                 return NULL;
381
382         /*
383          * Add left padding spaces if preceding character wasn't wildcard
384          * meta-character.
385          */
386         *charlen = 0;
387         if (!in_leading_wildcard_meta)
388         {
389                 if (LPADDING > 0)
390                 {
391                         *s++ = ' ';
392                         (*charlen)++;
393                         if (LPADDING > 1)
394                         {
395                                 *s++ = ' ';
396                                 (*charlen)++;
397                         }
398                 }
399         }
400
401         /*
402          * Copy data into buf until wildcard meta-character, non-word character or
403          * string boundary.  Strip escapes during copy.
404          */
405         endword = beginword;
406         while (endword - str < lenstr)
407         {
408                 clen = pg_mblen(endword);
409                 if (in_escape)
410                 {
411                         if (iswordchr(endword))
412                         {
413                                 memcpy(s, endword, clen);
414                                 (*charlen)++;
415                                 s += clen;
416                         }
417                         else
418                         {
419                                 /*
420                                  * Back up endword to the escape character when stopping at an
421                                  * escaped char, so that subsequent get_wildcard_part will
422                                  * restart from the escape character.  We assume here that
423                                  * escape chars are single-byte.
424                                  */
425                                 endword--;
426                                 break;
427                         }
428                         in_escape = false;
429                 }
430                 else
431                 {
432                         if (ISESCAPECHAR(endword))
433                                 in_escape = true;
434                         else if (ISWILDCARDCHAR(endword))
435                         {
436                                 in_trailing_wildcard_meta = true;
437                                 break;
438                         }
439                         else if (iswordchr(endword))
440                         {
441                                 memcpy(s, endword, clen);
442                                 (*charlen)++;
443                                 s += clen;
444                         }
445                         else
446                                 break;
447                 }
448                 endword += clen;
449         }
450
451         /*
452          * Add right padding spaces if next character isn't wildcard
453          * meta-character.
454          */
455         if (!in_trailing_wildcard_meta)
456         {
457                 if (RPADDING > 0)
458                 {
459                         *s++ = ' ';
460                         (*charlen)++;
461                         if (RPADDING > 1)
462                         {
463                                 *s++ = ' ';
464                                 (*charlen)++;
465                         }
466                 }
467         }
468
469         *bytelen = s - buf;
470         return endword;
471 }
472
473 /*
474  * Generates bigrams for wildcard search string.
475  *
476  * Returns array of bigrams that must occur in any string that matches the
477  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
478  * " a", "bcd" would be extracted.
479  *
480  * Set 'removeDups' to true if duplicate bigrams are removed.
481  */
482 BIGM *
483 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
484 {
485         BIGM       *bgm;
486         char       *buf;
487         bigm       *bptr;
488         int                     len,
489                                 charlen,
490                                 bytelen;
491         const char *eword;
492
493         *removeDups = false;
494
495         /*
496          * Guard against possible overflow in the palloc requests below.
497          * We need to prevent integer overflow in the multiplications here.
498          */
499         if ((Size) slen > (MaxAllocSize - VARHDRSZ) / sizeof(bigm) - 1 ||
500                 (Size) slen > MaxAllocSize - 4)
501                 ereport(ERROR,
502                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
503                                  errmsg("out of memory")));
504
505         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen + 1));
506         SET_VARSIZE(bgm, VARHDRSZ);
507
508         if (slen + LPADDING + RPADDING < 2 || slen == 0)
509                 return bgm;
510
511         bptr = GETARR(bgm);
512
513         buf = palloc(sizeof(char) * (slen + 4));
514
515         /*
516          * Extract bigrams from each substring extracted by get_wildcard_part.
517          */
518         eword = str;
519         while ((eword = get_wildcard_part(eword, slen - (eword - str),
520                                                                           buf, &bytelen, &charlen)) != NULL)
521         {
522                 /*
523                  * count bigrams
524                  */
525                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
526         }
527
528         pfree(buf);
529
530         if ((len = bptr - GETARR(bgm)) == 0)
531                 return bgm;
532
533         /*
534          * Make bigrams unique.
535          */
536         if (len > 1)
537         {
538                 bool            haveDups = false;
539
540                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
541                 if (haveDups)
542                 {
543                         *removeDups = true;
544                         len = unique_array(GETARR(bgm), len);
545                 }
546         }
547
548         SET_VARSIZE(bgm, CALCGTSIZE(len));
549
550         return bgm;
551 }
552
553 Datum
554 show_bigm(PG_FUNCTION_ARGS)
555 {
556         text       *in = PG_GETARG_TEXT_P(0);
557         BIGM       *bgm;
558         Datum      *d;
559         ArrayType  *a;
560         bigm       *ptr;
561         int                     i;
562
563         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
564         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
565
566         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
567         {
568                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
569
570                 d[i] = PointerGetDatum(item);
571         }
572
573         a = construct_array(
574                                                 d,
575                                                 ARRNELEM(bgm),
576                                                 TEXTOID,
577                                                 -1,
578                                                 false,
579                                                 'i'
580                 );
581
582         for (i = 0; i < ARRNELEM(bgm); i++)
583                 pfree(DatumGetPointer(d[i]));
584
585         pfree(d);
586         pfree(bgm);
587         PG_FREE_IF_COPY(in, 0);
588
589         PG_RETURN_POINTER(a);
590 }
591
592 static float4
593 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
594 {
595         bigm       *ptr1,
596                            *ptr2;
597         int                     count = 0;
598         int                     len1,
599                                 len2;
600
601         ptr1 = GETARR(bgm1);
602         ptr2 = GETARR(bgm2);
603
604         len1 = ARRNELEM(bgm1);
605         len2 = ARRNELEM(bgm2);
606
607         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
608         if (len1 <= 0 || len2 <= 0)
609                 return (float4) 0.0;
610
611         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
612         {
613                 int                     res = CMPBIGM(ptr1, ptr2);
614
615                 if (res < 0)
616                         ptr1++;
617                 else if (res > 0)
618                         ptr2++;
619                 else
620                 {
621                         ptr1++;
622                         ptr2++;
623                         count++;
624                 }
625         }
626
627 #ifdef DIVUNION
628         return ((float4) count) / ((float4) (len1 + len2 - count));
629 #else
630         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
631 #endif
632 }
633
634 Datum
635 bigm_similarity(PG_FUNCTION_ARGS)
636 {
637         text       *in1 = PG_GETARG_TEXT_P(0);
638         text       *in2 = PG_GETARG_TEXT_P(1);
639         BIGM       *bgm1,
640                            *bgm2;
641         float4          res;
642
643         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
644         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
645
646         res = cnt_sml_bigm(bgm1, bgm2);
647
648         pfree(bgm1);
649         pfree(bgm2);
650         PG_FREE_IF_COPY(in1, 0);
651         PG_FREE_IF_COPY(in2, 1);
652
653         PG_RETURN_FLOAT4(res);
654 }
655
656 Datum
657 bigm_similarity_op(PG_FUNCTION_ARGS)
658 {
659         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
660                                                                                                                  PG_GETARG_DATUM(0),
661                                                                                                                  PG_GETARG_DATUM(1)));
662
663         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
664 }
665
666 Datum
667 likequery(PG_FUNCTION_ARGS)
668 {
669         text       *query = PG_GETARG_TEXT_PP(0);
670         const char *str;
671         int                     len;
672         const char *sp;
673         text       *result;
674         char       *rp;
675         int                     mblen;
676
677         str = VARDATA_ANY(query);
678         len = VARSIZE_ANY_EXHDR(query);
679
680         if (len == 0)
681                 PG_RETURN_NULL();
682
683         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
684         rp = VARDATA(result);
685         *rp++ = '%';
686
687         for (sp = str; (sp - str) < len;)
688         {
689                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
690                 {
691                         *rp++ = '\\';
692                         *rp++ = *sp++;
693                 }
694                 else if (IS_HIGHBIT_SET(*sp))
695                 {
696                         mblen = pg_mblen(sp);
697                         memcpy(rp, sp, mblen);
698                         rp += mblen;
699                         sp += mblen;
700                 }
701                 else
702                         *rp++ = *sp++;
703         }
704
705         *rp++ = '%';
706         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
707
708         PG_RETURN_TEXT_P(result);
709 }
710
711 Datum
712 bigmtextcmp(PG_FUNCTION_ARGS)
713 {
714         text       *arg1 = PG_GETARG_TEXT_PP(0);
715         text       *arg2 = PG_GETARG_TEXT_PP(1);
716         char       *a1p = VARDATA_ANY(arg1);
717         char       *a2p = VARDATA_ANY(arg2);
718         int                     len1 = VARSIZE_ANY_EXHDR(arg1);
719         int                     len2 = VARSIZE_ANY_EXHDR(arg2);
720
721         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
722 }