OSDN Git Service

Add bigm_similarity function.
[pgbigm/pg_bigm.git] / bigm_op.c
1 /*-------------------------------------------------------------------------
2  *
3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
4  *
5  * Changelog:
6  *   2013/01/09
7  *   Support full text search using bigrams.
8  *   Author: NTT DATA Corporation
9  *
10  *-------------------------------------------------------------------------
11  */
12 #include "postgres.h"
13
14 #include <ctype.h>
15
16 #include "bigm.h"
17
18 #include "catalog/pg_type.h"
19 #include "tsearch/ts_locale.h"
20 #include "utils/array.h"
21
22
23 PG_MODULE_MAGIC;
24
25 /* Last update date of pg_bigm */
26 #define BIGM_LAST_UPDATE        "2013.04.05"
27
28 /* GUC variable */
29 bool    bigm_enable_recheck = false;
30 int             bigm_gin_key_limit = 0;
31 char    *bigm_last_update = NULL;
32
33 PG_FUNCTION_INFO_V1(show_bigm);
34 Datum           show_bigm(PG_FUNCTION_ARGS);
35
36 PG_FUNCTION_INFO_V1(bigmtextcmp);
37 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
38
39 PG_FUNCTION_INFO_V1(likequery);
40 Datum           likequery(PG_FUNCTION_ARGS);
41
42 PG_FUNCTION_INFO_V1(bigm_similarity);
43 Datum           bigm_similarity(PG_FUNCTION_ARGS);
44
45 void            _PG_init(void);
46 void            _PG_fini(void);
47
48 void
49 _PG_init(void)
50 {
51         /* Define custom GUC variables */
52         DefineCustomBoolVariable("pg_bigm.enable_recheck",
53                                                          "Recheck that heap tuples fetched from index "
54                                                          "match the query.",
55                                                          NULL,
56                                                          &bigm_enable_recheck,
57                                                          true,
58                                                          PGC_USERSET,
59                                                          0,
60                                                          NULL,
61                                                          NULL,
62                                                          NULL);
63
64         DefineCustomIntVariable("pg_bigm.gin_key_limit",
65                                                         "Sets the maximum number of bi-gram keys allowed to "
66                                                         "use for GIN index search.",
67                                                         "Zero means no limit.",
68                                                         &bigm_gin_key_limit,
69                                                         0,
70                                                         0, INT_MAX,
71                                                         PGC_USERSET,
72                                                         0,
73                                                         NULL,
74                                                         NULL,
75                                                         NULL);
76
77         /* Can't be set in postgresql.conf */
78         DefineCustomStringVariable("pg_bigm.last_update",
79                                                            "Shows the last update date of pg_bigm.",
80                                                            NULL,
81                                                            &bigm_last_update,
82                                                            BIGM_LAST_UPDATE,
83                                                            PGC_INTERNAL,
84                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
85                                                            NULL,
86                                                            NULL,
87                                                            NULL);
88
89         EmitWarningsOnPlaceholders("pg_bigm");
90 }
91
92 void
93 _PG_fini(void)
94 {
95 }
96
97 static int
98 comp_bigm(const void *a, const void *b, void *arg)
99 {
100         int             res;
101         bool    *haveDups = (bool *) arg;
102
103         res = CMPBIGM(a, b);
104
105         if (res == 0)
106                 *haveDups = true;
107
108         return res;
109 }
110
111 static int
112 unique_array(bigm *a, int len)
113 {
114         bigm       *curend,
115                            *tmp;
116
117         curend = tmp = a;
118         while (tmp - a < len)
119                 if (CMPBIGM(tmp, curend))
120                 {
121                         curend++;
122                         if (curend != tmp)
123                                 memcpy(curend, tmp, BIGMSIZE);
124                         tmp++;
125                 }
126                 else
127                         tmp++;
128
129         return curend + 1 - a;
130 }
131
132 #define iswordchr(c)    (!t_isspace(c))
133
134 /*
135  * Finds first word in string, returns pointer to the word,
136  * endword points to the character after word
137  */
138 static char *
139 find_word(char *str, int lenstr, char **endword, int *charlen)
140 {
141         char       *beginword = str;
142
143         while (beginword - str < lenstr && !iswordchr(beginword))
144                 beginword += pg_mblen(beginword);
145
146         if (beginword - str >= lenstr)
147                 return NULL;
148
149         *endword = beginword;
150         *charlen = 0;
151         while (*endword - str < lenstr && iswordchr(*endword))
152         {
153                 *endword += pg_mblen(*endword);
154                 (*charlen)++;
155         }
156
157         return beginword;
158 }
159
160 /* 
161  * The function is named compact_bigram to maintain consistency with pg_trgm,
162  * though it does not reduce multibyte characters to hash values like in
163  * compact_trigram.
164  */
165 static void
166 compact_bigram(bigm *bptr, char *str, int bytelen)
167 {
168         CPBIGM(bptr, str, bytelen);
169 }
170
171 /*
172  * Adds bigrams from words (already padded).
173  */
174 static bigm *
175 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
176 {
177         char       *ptr = str;
178
179         if (charlen < 2)
180         {
181                 compact_bigram(bptr, ptr, pg_mblen(str));
182                 bptr->pmatch = true;
183                 bptr++;
184                 return bptr;
185         }
186
187         if (bytelen > charlen)
188         {
189                 /* Find multibyte character boundaries and call compact_bigram */
190                 int                     lenfirst = pg_mblen(str),
191                                         lenlast = pg_mblen(str + lenfirst);
192
193                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
194                 {
195                         compact_bigram(bptr, ptr, lenfirst + lenlast);
196
197                         ptr += lenfirst;
198                         bptr++;
199
200                         lenfirst = lenlast;
201                         lenlast = pg_mblen(ptr + lenfirst);
202                 }
203         }
204         else
205         {
206                 /* Fast path when there are no multibyte characters */
207                 Assert(bytelen == charlen);
208
209                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
210                 {
211                         CPBIGM(bptr, ptr, 2);
212                         ptr++;
213                         bptr++;
214                 }
215         }
216
217         return bptr;
218 }
219
220 BIGM *
221 generate_bigm(char *str, int slen)
222 {
223         BIGM       *bgm;
224         char       *buf;
225         bigm       *bptr;
226         int                     len,
227                                 charlen,
228                                 bytelen;
229         char       *bword,
230                            *eword;
231
232         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
233         SET_VARSIZE(bgm, VARHDRSZ);
234
235         if (slen + LPADDING + RPADDING < 2 || slen == 0)
236                 return bgm;
237
238         bptr = GETARR(bgm);
239
240         buf = palloc(sizeof(char) * (slen + 4));
241
242         if (LPADDING > 0)
243         {
244                 *buf = ' ';
245                 if (LPADDING > 1)
246                         *(buf + 1) = ' ';
247         }
248
249         eword = str;
250         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
251         {
252                 bytelen = eword - bword;
253                 memcpy(buf + LPADDING, bword, bytelen);
254
255                 buf[LPADDING + bytelen] = ' ';
256                 buf[LPADDING + bytelen + 1] = ' ';
257
258                 /*
259                  * count bigrams
260                  */
261                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
262                                                          charlen + LPADDING + RPADDING);
263         }
264
265         pfree(buf);
266
267         if ((len = bptr - GETARR(bgm)) == 0)
268                 return bgm;
269
270         if (len > 0)
271         {
272                 bool    haveDups = false;
273
274                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
275                 if (haveDups)
276                         len = unique_array(GETARR(bgm), len);
277         }
278
279         SET_VARSIZE(bgm, CALCGTSIZE(len));
280
281         return bgm;
282 }
283
284 /*
285  * Extract the next non-wildcard part of a search string, ie, a word bounded
286  * by '_' or '%' meta-characters, non-word characters or string end.
287  *
288  * str: source string, of length lenstr bytes (need not be null-terminated)
289  * buf: where to return the substring (must be long enough)
290  * *bytelen: receives byte length of the found substring
291  * *charlen: receives character length of the found substring
292  *
293  * Returns pointer to end+1 of the found substring in the source string.
294  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
295  *
296  * If the found word is bounded by non-word characters or string boundaries
297  * then this function will include corresponding padding spaces into buf.
298  */
299 static const char *
300 get_wildcard_part(const char *str, int lenstr,
301                                   char *buf, int *bytelen, int *charlen)
302 {
303         const char *beginword = str;
304         const char *endword;
305         char       *s = buf;
306         bool        in_leading_wildcard_meta = false;
307         bool        in_trailing_wildcard_meta = false;
308         bool            in_escape = false;
309         int                     clen;
310
311         /*
312          * Find the first word character, remembering whether preceding character
313          * was wildcard meta-character.  Note that the in_escape state persists
314          * from this loop to the next one, since we may exit at a word character
315          * that is in_escape.
316          */
317         while (beginword - str < lenstr)
318         {
319                 if (in_escape)
320                 {
321                         if (iswordchr(beginword))
322                                 break;
323                         in_escape = false;
324                         in_leading_wildcard_meta = false;
325                 }
326                 else
327                 {
328                         if (ISESCAPECHAR(beginword))
329                                 in_escape = true;
330                         else if (ISWILDCARDCHAR(beginword))
331                                 in_leading_wildcard_meta = true;
332                         else if (iswordchr(beginword))
333                                 break;
334                         else
335                                 in_leading_wildcard_meta = false;
336                 }
337                 beginword += pg_mblen(beginword);
338         }
339
340         /*
341          * Handle string end.
342          */
343         if (beginword - str >= lenstr)
344                 return NULL;
345
346         /*
347          * Add left padding spaces if preceding character wasn't wildcard
348          * meta-character.
349          */
350         *charlen = 0;
351         if (!in_leading_wildcard_meta)
352         {
353                 if (LPADDING > 0)
354                 {
355                         *s++ = ' ';
356                         (*charlen)++;
357                         if (LPADDING > 1)
358                         {
359                                 *s++ = ' ';
360                                 (*charlen)++;
361                         }
362                 }
363         }
364
365         /*
366          * Copy data into buf until wildcard meta-character, non-word character or
367          * string boundary.  Strip escapes during copy.
368          */
369         endword = beginword;
370         while (endword - str < lenstr)
371         {
372                 clen = pg_mblen(endword);
373                 if (in_escape)
374                 {
375                         if (iswordchr(endword))
376                         {
377                                 memcpy(s, endword, clen);
378                                 (*charlen)++;
379                                 s += clen;
380                         }
381                         else
382                         {
383                                 /*
384                                  * Back up endword to the escape character when stopping at
385                                  * an escaped char, so that subsequent get_wildcard_part will
386                                  * restart from the escape character.  We assume here that
387                                  * escape chars are single-byte.
388                                  */
389                                 endword--;
390                                 break;
391                         }
392                         in_escape = false;
393                 }
394                 else
395                 {
396                         if (ISESCAPECHAR(endword))
397                                 in_escape = true;
398                         else if (ISWILDCARDCHAR(endword))
399                         {
400                                 in_trailing_wildcard_meta = true;
401                                 break;
402                         }
403                         else if (iswordchr(endword))
404                         {
405                                 memcpy(s, endword, clen);
406                                 (*charlen)++;
407                                 s += clen;
408                         }
409                         else
410                                 break;
411                 }
412                 endword += clen;
413         }
414
415         /*
416          * Add right padding spaces if next character isn't wildcard
417          * meta-character.
418          */
419         if (!in_trailing_wildcard_meta)
420         {
421                 if (RPADDING > 0)
422                 {
423                         *s++ = ' ';
424                         (*charlen)++;
425                         if (RPADDING > 1)
426                         {
427                                 *s++ = ' ';
428                                 (*charlen)++;
429                         }
430                 }
431         }
432
433         *bytelen = s - buf;
434         return endword;
435 }
436
437 /*
438  * Generates bigrams for wildcard search string.
439  *
440  * Returns array of bigrams that must occur in any string that matches the
441  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
442  * " a", "bcd" would be extracted.
443  *
444  * Set 'removeDups' to true if duplicate bigrams are removed.
445  */
446 BIGM *
447 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
448 {
449         BIGM       *bgm;
450         char       *buf;
451         bigm       *bptr;
452         int                     len,
453                                 charlen,
454                                 bytelen;
455         const char *eword;
456
457         *removeDups = false;
458
459         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
460         SET_VARSIZE(bgm, VARHDRSZ);
461
462         if (slen + LPADDING + RPADDING < 2 || slen == 0)
463                 return bgm;
464
465         bptr = GETARR(bgm);
466
467         buf = palloc(sizeof(char) * (slen + 4));
468
469         /*
470          * Extract bigrams from each substring extracted by get_wildcard_part.
471          */
472         eword = str;
473         while ((eword = get_wildcard_part(eword, slen - (eword - str),
474                                                                           buf, &bytelen, &charlen)) != NULL)
475         {
476                 /*
477                  * count bigrams
478                  */
479                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
480         }
481
482         pfree(buf);
483
484         if ((len = bptr - GETARR(bgm)) == 0)
485                 return bgm;
486
487         /*
488          * Make bigrams unique.
489          */
490         if (len > 0)
491         {
492                 bool    haveDups = false;
493
494                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
495                 if (haveDups)
496                 {
497                         *removeDups = true;
498                         len = unique_array(GETARR(bgm), len);
499                 }
500         }
501
502         SET_VARSIZE(bgm, CALCGTSIZE(len));
503
504         return bgm;
505 }
506
507 Datum
508 show_bigm(PG_FUNCTION_ARGS)
509 {
510         text       *in = PG_GETARG_TEXT_P(0);
511         BIGM       *bgm;
512         Datum      *d;
513         ArrayType  *a;
514         bigm       *ptr;
515         int                     i;
516
517         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
518         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
519
520         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
521         {
522                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
523                 d[i] = PointerGetDatum(item);
524         }
525
526         a = construct_array(
527                                                 d,
528                                                 ARRNELEM(bgm),
529                                                 TEXTOID,
530                                                 -1,
531                                                 false,
532                                                 'i'
533                 );
534
535         for (i = 0; i < ARRNELEM(bgm); i++)
536                 pfree(DatumGetPointer(d[i]));
537
538         pfree(d);
539         pfree(bgm);
540         PG_FREE_IF_COPY(in, 0);
541
542         PG_RETURN_POINTER(a);
543 }
544
545 static float4
546 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
547 {
548         bigm       *ptr1,
549                            *ptr2;
550         int                     count = 0;
551         int                     len1,
552                                 len2;
553
554         ptr1 = GETARR(bgm1);
555         ptr2 = GETARR(bgm2);
556
557         len1 = ARRNELEM(bgm1);
558         len2 = ARRNELEM(bgm2);
559
560         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
561         if (len1 <= 0 || len2 <= 0)
562                 return (float4) 0.0;
563
564         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
565         {
566                 int             res = CMPBIGM(ptr1, ptr2);
567
568                 if (res < 0)
569                         ptr1++;
570                 else if (res > 0)
571                         ptr2++;
572                 else
573                 {
574                         ptr1++;
575                         ptr2++;
576                         count++;
577                 }
578         }
579
580 #ifdef DIVUNION
581         return ((float4) count) / ((float4) (len1 + len2 - count));
582 #else
583         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
584 #endif
585 }
586
587 Datum
588 bigm_similarity(PG_FUNCTION_ARGS)
589 {
590         text       *in1 = PG_GETARG_TEXT_P(0);
591         text       *in2 = PG_GETARG_TEXT_P(1);
592         BIGM       *bgm1,
593                            *bgm2;
594         float4          res;
595
596         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
597         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
598
599         res = cnt_sml_bigm(bgm1, bgm2);
600
601         pfree(bgm1);
602         pfree(bgm2);
603         PG_FREE_IF_COPY(in1, 0);
604         PG_FREE_IF_COPY(in2, 1);
605
606         PG_RETURN_FLOAT4(res);
607 }
608
609 Datum
610 likequery(PG_FUNCTION_ARGS)
611 {
612         text       *query = PG_GETARG_TEXT_PP(0);
613         const char *str;
614         int                     len;
615         const char *sp;
616         text       *result;
617         char       *rp;
618         int                     mblen;
619
620         str = VARDATA_ANY(query);
621         len = VARSIZE_ANY_EXHDR(query);
622
623         if (len == 0)
624                 PG_RETURN_NULL();
625
626         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
627         rp = VARDATA(result);
628         *rp++ = '%';
629
630         for (sp = str; (sp - str) < len;)
631         {
632                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
633                 {
634                         *rp++ = '\\';
635                         *rp++ = *sp++;
636                 }
637                 else if (IS_HIGHBIT_SET(*sp))
638                 {
639                         mblen = pg_mblen(sp);
640                         memcpy(rp, sp, mblen);
641                         rp += mblen;
642                         sp += mblen;
643                 }
644                 else
645                         *rp++ = *sp++;
646         }
647
648         *rp++ = '%';
649         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
650
651         PG_RETURN_TEXT_P(result);
652 }
653
654 inline int
655 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
656 {
657         int                     i;
658         int                     len = Min(len1, len2);
659
660         for (i = 0; i < len; i++, arg1++, arg2++)
661         {
662                 if (*arg1 == *arg2)
663                         continue;
664                 if (*arg1 < *arg2)
665                         return -1;
666                 else
667                         return 1;
668         }
669
670         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
671 }
672
673 Datum
674 bigmtextcmp(PG_FUNCTION_ARGS)
675 {
676         text    *arg1 = PG_GETARG_TEXT_PP(0);
677         text    *arg2 = PG_GETARG_TEXT_PP(1);
678         char    *a1p = VARDATA_ANY(arg1);
679         char    *a2p = VARDATA_ANY(arg2);
680         int             len1 = VARSIZE_ANY_EXHDR(arg1);
681         int             len2 = VARSIZE_ANY_EXHDR(arg2);
682
683         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
684 }