OSDN Git Service

Avoid the redundant declaration of the function prototypes.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  * Portions Copyright (c) 2013-2015, NTT DATA Corporation
5  *
6  * Changelog:
7  *   2013/01/09
8  *   Support full text search using bigrams.
9  *   Author: NTT DATA Corporation
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14
15 #include <ctype.h>
16
17 #include "bigm.h"
18
19 #include "catalog/pg_type.h"
20 #include "tsearch/ts_locale.h"
21 #include "utils/array.h"
22
23
24 PG_MODULE_MAGIC;
25
26 /* Last update date of pg_bigm */
27 #define BIGM_LAST_UPDATE        "2013.11.22"
28
29 /* GUC variable */
30 bool    bigm_enable_recheck = false;
31 int             bigm_gin_key_limit = 0;
32 double  bigm_similarity_limit = 0.3;
33 char    *bigm_last_update = NULL;
34
35 PG_FUNCTION_INFO_V1(show_bigm);
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 PG_FUNCTION_INFO_V1(likequery);
38 PG_FUNCTION_INFO_V1(bigm_similarity);
39 PG_FUNCTION_INFO_V1(bigm_similarity_op);
40
41 /*
42  * The function prototypes are created as a part of PG_FUNCTION_INFO_V1
43  * macro since 9.4, and hence the declaration of the function prototypes
44  * here is necessary only for 9.3 or before.
45  */
46 #if PG_VERSION_NUM < 90400
47 Datum           show_bigm(PG_FUNCTION_ARGS);
48 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
49 Datum           likequery(PG_FUNCTION_ARGS);
50 Datum           bigm_similarity(PG_FUNCTION_ARGS);
51 Datum           bigm_similarity_op(PG_FUNCTION_ARGS);
52 #endif
53
54 void            _PG_init(void);
55 void            _PG_fini(void);
56
57 void
58 _PG_init(void)
59 {
60         /* Define custom GUC variables */
61         DefineCustomBoolVariable("pg_bigm.enable_recheck",
62                                                          "Recheck that heap tuples fetched from index "
63                                                          "match the query.",
64                                                          NULL,
65                                                          &bigm_enable_recheck,
66                                                          true,
67                                                          PGC_USERSET,
68                                                          0,
69                                                          NULL,
70                                                          NULL,
71                                                          NULL);
72
73         DefineCustomIntVariable("pg_bigm.gin_key_limit",
74                                                         "Sets the maximum number of bi-gram keys allowed to "
75                                                         "use for GIN index search.",
76                                                         "Zero means no limit.",
77                                                         &bigm_gin_key_limit,
78                                                         0,
79                                                         0, INT_MAX,
80                                                         PGC_USERSET,
81                                                         0,
82                                                         NULL,
83                                                         NULL,
84                                                         NULL);
85
86         DefineCustomRealVariable("pg_bigm.similarity_limit",
87                                                          "Sets the similarity threshold used by the "
88                                                          "=% operator.",
89                                                          NULL,
90                                                          &bigm_similarity_limit,
91                                                          0.3,
92                                                          0.0, 1.0,
93                                                          PGC_USERSET,
94                                                          0,
95                                                          NULL,
96                                                          NULL,
97                                                          NULL);
98
99         /* Can't be set in postgresql.conf */
100         DefineCustomStringVariable("pg_bigm.last_update",
101                                                            "Shows the last update date of pg_bigm.",
102                                                            NULL,
103                                                            &bigm_last_update,
104                                                            BIGM_LAST_UPDATE,
105                                                            PGC_INTERNAL,
106                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
107                                                            NULL,
108                                                            NULL,
109                                                            NULL);
110
111         EmitWarningsOnPlaceholders("pg_bigm");
112 }
113
114 void
115 _PG_fini(void)
116 {
117 }
118
119 static int
120 comp_bigm(const void *a, const void *b, void *arg)
121 {
122         int             res;
123         bool    *haveDups = (bool *) arg;
124
125         res = CMPBIGM(a, b);
126
127         if (res == 0)
128                 *haveDups = true;
129
130         return res;
131 }
132
133 static int
134 unique_array(bigm *a, int len)
135 {
136         bigm       *curend,
137                            *tmp;
138
139         curend = tmp = a;
140         while (tmp - a < len)
141                 if (CMPBIGM(tmp, curend))
142                 {
143                         curend++;
144                         if (curend != tmp)
145                                 memcpy(curend, tmp, BIGMSIZE);
146                         tmp++;
147                 }
148                 else
149                         tmp++;
150
151         return curend + 1 - a;
152 }
153
154 #define iswordchr(c)    (!t_isspace(c))
155
156 /*
157  * Finds first word in string, returns pointer to the word,
158  * endword points to the character after word
159  */
160 static char *
161 find_word(char *str, int lenstr, char **endword, int *charlen)
162 {
163         char       *beginword = str;
164
165         while (beginword - str < lenstr && !iswordchr(beginword))
166                 beginword += pg_mblen(beginword);
167
168         if (beginword - str >= lenstr)
169                 return NULL;
170
171         *endword = beginword;
172         *charlen = 0;
173         while (*endword - str < lenstr && iswordchr(*endword))
174         {
175                 *endword += pg_mblen(*endword);
176                 (*charlen)++;
177         }
178
179         return beginword;
180 }
181
182 /* 
183  * The function is named compact_bigram to maintain consistency with pg_trgm,
184  * though it does not reduce multibyte characters to hash values like in
185  * compact_trigram.
186  */
187 static void
188 compact_bigram(bigm *bptr, char *str, int bytelen)
189 {
190         CPBIGM(bptr, str, bytelen);
191 }
192
193 /*
194  * Adds bigrams from words (already padded).
195  */
196 static bigm *
197 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
198 {
199         char       *ptr = str;
200
201         if (charlen < 2)
202         {
203                 compact_bigram(bptr, ptr, pg_mblen(str));
204                 bptr->pmatch = true;
205                 bptr++;
206                 return bptr;
207         }
208
209         if (bytelen > charlen)
210         {
211                 /* Find multibyte character boundaries and call compact_bigram */
212                 int                     lenfirst = pg_mblen(str),
213                                         lenlast = pg_mblen(str + lenfirst);
214
215                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
216                 {
217                         compact_bigram(bptr, ptr, lenfirst + lenlast);
218
219                         ptr += lenfirst;
220                         bptr++;
221
222                         lenfirst = lenlast;
223                         lenlast = pg_mblen(ptr + lenfirst);
224                 }
225         }
226         else
227         {
228                 /* Fast path when there are no multibyte characters */
229                 Assert(bytelen == charlen);
230
231                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
232                 {
233                         CPBIGM(bptr, ptr, 2);
234                         ptr++;
235                         bptr++;
236                 }
237         }
238
239         return bptr;
240 }
241
242 BIGM *
243 generate_bigm(char *str, int slen)
244 {
245         BIGM       *bgm;
246         char       *buf;
247         bigm       *bptr;
248         int                     len,
249                                 charlen,
250                                 bytelen;
251         char       *bword,
252                            *eword;
253
254         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
255         SET_VARSIZE(bgm, VARHDRSZ);
256
257         if (slen + LPADDING + RPADDING < 2 || slen == 0)
258                 return bgm;
259
260         bptr = GETARR(bgm);
261
262         buf = palloc(sizeof(char) * (slen + 4));
263
264         if (LPADDING > 0)
265         {
266                 *buf = ' ';
267                 if (LPADDING > 1)
268                         *(buf + 1) = ' ';
269         }
270
271         eword = str;
272         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
273         {
274                 bytelen = eword - bword;
275                 memcpy(buf + LPADDING, bword, bytelen);
276
277                 buf[LPADDING + bytelen] = ' ';
278                 buf[LPADDING + bytelen + 1] = ' ';
279
280                 /*
281                  * count bigrams
282                  */
283                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
284                                                          charlen + LPADDING + RPADDING);
285         }
286
287         pfree(buf);
288
289         if ((len = bptr - GETARR(bgm)) == 0)
290                 return bgm;
291
292         /*
293          * Make bigrams unique.
294          */
295         if (len > 1)
296         {
297                 bool    haveDups = false;
298
299                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
300                 if (haveDups)
301                         len = unique_array(GETARR(bgm), len);
302         }
303
304         SET_VARSIZE(bgm, CALCGTSIZE(len));
305
306         return bgm;
307 }
308
309 /*
310  * Extract the next non-wildcard part of a search string, ie, a word bounded
311  * by '_' or '%' meta-characters, non-word characters or string end.
312  *
313  * str: source string, of length lenstr bytes (need not be null-terminated)
314  * buf: where to return the substring (must be long enough)
315  * *bytelen: receives byte length of the found substring
316  * *charlen: receives character length of the found substring
317  *
318  * Returns pointer to end+1 of the found substring in the source string.
319  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
320  *
321  * If the found word is bounded by non-word characters or string boundaries
322  * then this function will include corresponding padding spaces into buf.
323  */
324 static const char *
325 get_wildcard_part(const char *str, int lenstr,
326                                   char *buf, int *bytelen, int *charlen)
327 {
328         const char *beginword = str;
329         const char *endword;
330         char       *s = buf;
331         bool        in_leading_wildcard_meta = false;
332         bool        in_trailing_wildcard_meta = false;
333         bool            in_escape = false;
334         int                     clen;
335
336         /*
337          * Find the first word character, remembering whether preceding character
338          * was wildcard meta-character.  Note that the in_escape state persists
339          * from this loop to the next one, since we may exit at a word character
340          * that is in_escape.
341          */
342         while (beginword - str < lenstr)
343         {
344                 if (in_escape)
345                 {
346                         if (iswordchr(beginword))
347                                 break;
348                         in_escape = false;
349                         in_leading_wildcard_meta = false;
350                 }
351                 else
352                 {
353                         if (ISESCAPECHAR(beginword))
354                                 in_escape = true;
355                         else if (ISWILDCARDCHAR(beginword))
356                                 in_leading_wildcard_meta = true;
357                         else if (iswordchr(beginword))
358                                 break;
359                         else
360                                 in_leading_wildcard_meta = false;
361                 }
362                 beginword += pg_mblen(beginword);
363         }
364
365         /*
366          * Handle string end.
367          */
368         if (beginword - str >= lenstr)
369                 return NULL;
370
371         /*
372          * Add left padding spaces if preceding character wasn't wildcard
373          * meta-character.
374          */
375         *charlen = 0;
376         if (!in_leading_wildcard_meta)
377         {
378                 if (LPADDING > 0)
379                 {
380                         *s++ = ' ';
381                         (*charlen)++;
382                         if (LPADDING > 1)
383                         {
384                                 *s++ = ' ';
385                                 (*charlen)++;
386                         }
387                 }
388         }
389
390         /*
391          * Copy data into buf until wildcard meta-character, non-word character or
392          * string boundary.  Strip escapes during copy.
393          */
394         endword = beginword;
395         while (endword - str < lenstr)
396         {
397                 clen = pg_mblen(endword);
398                 if (in_escape)
399                 {
400                         if (iswordchr(endword))
401                         {
402                                 memcpy(s, endword, clen);
403                                 (*charlen)++;
404                                 s += clen;
405                         }
406                         else
407                         {
408                                 /*
409                                  * Back up endword to the escape character when stopping at
410                                  * an escaped char, so that subsequent get_wildcard_part will
411                                  * restart from the escape character.  We assume here that
412                                  * escape chars are single-byte.
413                                  */
414                                 endword--;
415                                 break;
416                         }
417                         in_escape = false;
418                 }
419                 else
420                 {
421                         if (ISESCAPECHAR(endword))
422                                 in_escape = true;
423                         else if (ISWILDCARDCHAR(endword))
424                         {
425                                 in_trailing_wildcard_meta = true;
426                                 break;
427                         }
428                         else if (iswordchr(endword))
429                         {
430                                 memcpy(s, endword, clen);
431                                 (*charlen)++;
432                                 s += clen;
433                         }
434                         else
435                                 break;
436                 }
437                 endword += clen;
438         }
439
440         /*
441          * Add right padding spaces if next character isn't wildcard
442          * meta-character.
443          */
444         if (!in_trailing_wildcard_meta)
445         {
446                 if (RPADDING > 0)
447                 {
448                         *s++ = ' ';
449                         (*charlen)++;
450                         if (RPADDING > 1)
451                         {
452                                 *s++ = ' ';
453                                 (*charlen)++;
454                         }
455                 }
456         }
457
458         *bytelen = s - buf;
459         return endword;
460 }
461
462 /*
463  * Generates bigrams for wildcard search string.
464  *
465  * Returns array of bigrams that must occur in any string that matches the
466  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
467  * " a", "bcd" would be extracted.
468  *
469  * Set 'removeDups' to true if duplicate bigrams are removed.
470  */
471 BIGM *
472 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
473 {
474         BIGM       *bgm;
475         char       *buf;
476         bigm       *bptr;
477         int                     len,
478                                 charlen,
479                                 bytelen;
480         const char *eword;
481
482         *removeDups = false;
483
484         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (Size) (slen / 2 + 1) * 3);
485         SET_VARSIZE(bgm, VARHDRSZ);
486
487         if (slen + LPADDING + RPADDING < 2 || slen == 0)
488                 return bgm;
489
490         bptr = GETARR(bgm);
491
492         buf = palloc(sizeof(char) * (slen + 4));
493
494         /*
495          * Extract bigrams from each substring extracted by get_wildcard_part.
496          */
497         eword = str;
498         while ((eword = get_wildcard_part(eword, slen - (eword - str),
499                                                                           buf, &bytelen, &charlen)) != NULL)
500         {
501                 /*
502                  * count bigrams
503                  */
504                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
505         }
506
507         pfree(buf);
508
509         if ((len = bptr - GETARR(bgm)) == 0)
510                 return bgm;
511
512         /*
513          * Make bigrams unique.
514          */
515         if (len > 1)
516         {
517                 bool    haveDups = false;
518
519                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
520                 if (haveDups)
521                 {
522                         *removeDups = true;
523                         len = unique_array(GETARR(bgm), len);
524                 }
525         }
526
527         SET_VARSIZE(bgm, CALCGTSIZE(len));
528
529         return bgm;
530 }
531
532 Datum
533 show_bigm(PG_FUNCTION_ARGS)
534 {
535         text       *in = PG_GETARG_TEXT_P(0);
536         BIGM       *bgm;
537         Datum      *d;
538         ArrayType  *a;
539         bigm       *ptr;
540         int                     i;
541
542         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
543         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
544
545         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
546         {
547                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
548                 d[i] = PointerGetDatum(item);
549         }
550
551         a = construct_array(
552                                                 d,
553                                                 ARRNELEM(bgm),
554                                                 TEXTOID,
555                                                 -1,
556                                                 false,
557                                                 'i'
558                 );
559
560         for (i = 0; i < ARRNELEM(bgm); i++)
561                 pfree(DatumGetPointer(d[i]));
562
563         pfree(d);
564         pfree(bgm);
565         PG_FREE_IF_COPY(in, 0);
566
567         PG_RETURN_POINTER(a);
568 }
569
570 static float4
571 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
572 {
573         bigm       *ptr1,
574                            *ptr2;
575         int                     count = 0;
576         int                     len1,
577                                 len2;
578
579         ptr1 = GETARR(bgm1);
580         ptr2 = GETARR(bgm2);
581
582         len1 = ARRNELEM(bgm1);
583         len2 = ARRNELEM(bgm2);
584
585         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
586         if (len1 <= 0 || len2 <= 0)
587                 return (float4) 0.0;
588
589         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
590         {
591                 int             res = CMPBIGM(ptr1, ptr2);
592
593                 if (res < 0)
594                         ptr1++;
595                 else if (res > 0)
596                         ptr2++;
597                 else
598                 {
599                         ptr1++;
600                         ptr2++;
601                         count++;
602                 }
603         }
604
605 #ifdef DIVUNION
606         return ((float4) count) / ((float4) (len1 + len2 - count));
607 #else
608         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
609 #endif
610 }
611
612 Datum
613 bigm_similarity(PG_FUNCTION_ARGS)
614 {
615         text       *in1 = PG_GETARG_TEXT_P(0);
616         text       *in2 = PG_GETARG_TEXT_P(1);
617         BIGM       *bgm1,
618                            *bgm2;
619         float4          res;
620
621         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
622         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
623
624         res = cnt_sml_bigm(bgm1, bgm2);
625
626         pfree(bgm1);
627         pfree(bgm2);
628         PG_FREE_IF_COPY(in1, 0);
629         PG_FREE_IF_COPY(in2, 1);
630
631         PG_RETURN_FLOAT4(res);
632 }
633
634 Datum
635 bigm_similarity_op(PG_FUNCTION_ARGS)
636 {
637         float4          res = DatumGetFloat4(DirectFunctionCall2(bigm_similarity,
638                                                                                                                  PG_GETARG_DATUM(0),
639                                                                                                                  PG_GETARG_DATUM(1)));
640
641         PG_RETURN_BOOL(res >= (float4) bigm_similarity_limit);
642 }
643
644 Datum
645 likequery(PG_FUNCTION_ARGS)
646 {
647         text       *query = PG_GETARG_TEXT_PP(0);
648         const char *str;
649         int                     len;
650         const char *sp;
651         text       *result;
652         char       *rp;
653         int                     mblen;
654
655         str = VARDATA_ANY(query);
656         len = VARSIZE_ANY_EXHDR(query);
657
658         if (len == 0)
659                 PG_RETURN_NULL();
660
661         result = (text *) palloc((Size) len * 2 + 2 + VARHDRSZ);
662         rp = VARDATA(result);
663         *rp++ = '%';
664
665         for (sp = str; (sp - str) < len;)
666         {
667                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
668                 {
669                         *rp++ = '\\';
670                         *rp++ = *sp++;
671                 }
672                 else if (IS_HIGHBIT_SET(*sp))
673                 {
674                         mblen = pg_mblen(sp);
675                         memcpy(rp, sp, mblen);
676                         rp += mblen;
677                         sp += mblen;
678                 }
679                 else
680                         *rp++ = *sp++;
681         }
682
683         *rp++ = '%';
684         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
685
686         PG_RETURN_TEXT_P(result);
687 }
688
689 inline int
690 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
691 {
692         int                     i;
693         int                     len = Min(len1, len2);
694
695         for (i = 0; i < len; i++, arg1++, arg2++)
696         {
697                 if (*arg1 == *arg2)
698                         continue;
699                 if (*arg1 < *arg2)
700                         return -1;
701                 else
702                         return 1;
703         }
704
705         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
706 }
707
708 Datum
709 bigmtextcmp(PG_FUNCTION_ARGS)
710 {
711         text    *arg1 = PG_GETARG_TEXT_PP(0);
712         text    *arg2 = PG_GETARG_TEXT_PP(1);
713         char    *a1p = VARDATA_ANY(arg1);
714         char    *a2p = VARDATA_ANY(arg2);
715         int             len1 = VARSIZE_ANY_EXHDR(arg1);
716         int             len2 = VARSIZE_ANY_EXHDR(arg2);
717
718         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
719 }