bigm_op.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * Portions Copyright (c) 2004-2012, PostgreSQL Global Development Group
   4  *
   5  * Changelog:
   6  *   2013/01/09
   7  *   Support full text search using bigrams.
   8  *   Author: NTT DATA Corporation
   9  *
  10  *-------------------------------------------------------------------------
  11  */
  12 #include "postgres.h"
  13
  14 #include <ctype.h>
  15
  16 #include "bigm.h"
  17
  18 #include "catalog/pg_type.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "utils/array.h"
  21
  22
  23 PG_MODULE_MAGIC;
  24
  25 /* Last update date of pg_bigm */
  26 #define BIGM_LAST_UPDATE        "2013.04.05"
  27
  28 /* GUC variable */
  29 bool    bigm_enable_recheck = false;
  30 int             bigm_gin_key_limit = 0;
  31 char    *bigm_last_update = NULL;
  32
  33 PG_FUNCTION_INFO_V1(show_bigm);
  34 Datum           show_bigm(PG_FUNCTION_ARGS);
  35
  36 PG_FUNCTION_INFO_V1(bigmtextcmp);
  37 Datum           bigmtextcmp(PG_FUNCTION_ARGS);
  38
  39 PG_FUNCTION_INFO_V1(likequery);
  40 Datum           likequery(PG_FUNCTION_ARGS);
  41
  42 PG_FUNCTION_INFO_V1(bigm_similarity);
  43 Datum           bigm_similarity(PG_FUNCTION_ARGS);
  44
  45 void            _PG_init(void);
  46 void            _PG_fini(void);
  47
  48 void
  49 _PG_init(void)
  50 {
  51         /* Define custom GUC variables */
  52         DefineCustomBoolVariable("pg_bigm.enable_recheck",
  53                                                          "Recheck that heap tuples fetched from index "
  54                                                          "match the query.",
  55                                                          NULL,
  56                                                          &bigm_enable_recheck,
  57                                                          true,
  58                                                          PGC_USERSET,
  59                                                          0,
  60                                                          NULL,
  61                                                          NULL,
  62                                                          NULL);
  63
  64         DefineCustomIntVariable("pg_bigm.gin_key_limit",
  65                                                         "Sets the maximum number of bi-gram keys allowed to "
  66                                                         "use for GIN index search.",
  67                                                         "Zero means no limit.",
  68                                                         &bigm_gin_key_limit,
  69                                                         0,
  70                                                         0, INT_MAX,
  71                                                         PGC_USERSET,
  72                                                         0,
  73                                                         NULL,
  74                                                         NULL,
  75                                                         NULL);
  76
  77         /* Can't be set in postgresql.conf */
  78         DefineCustomStringVariable("pg_bigm.last_update",
  79                                                            "Shows the last update date of pg_bigm.",
  80                                                            NULL,
  81                                                            &bigm_last_update,
  82                                                            BIGM_LAST_UPDATE,
  83                                                            PGC_INTERNAL,
  84                                                            GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE,
  85                                                            NULL,
  86                                                            NULL,
  87                                                            NULL);
  88
  89         EmitWarningsOnPlaceholders("pg_bigm");
  90 }
  91
  92 void
  93 _PG_fini(void)
  94 {
  95 }
  96
  97 static int
  98 comp_bigm(const void *a, const void *b, void *arg)
  99 {
 100         int             res;
 101         bool    *haveDups = (bool *) arg;
 102
 103         res = CMPBIGM(a, b);
 104
 105         if (res == 0)
 106                 *haveDups = true;
 107
 108         return res;
 109 }
 110
 111 static int
 112 unique_array(bigm *a, int len)
 113 {
 114         bigm       *curend,
 115                            *tmp;
 116
 117         curend = tmp = a;
 118         while (tmp - a < len)
 119                 if (CMPBIGM(tmp, curend))
 120                 {
 121                         curend++;
 122                         if (curend != tmp)
 123                                 memcpy(curend, tmp, BIGMSIZE);
 124                         tmp++;
 125                 }
 126                 else
 127                         tmp++;
 128
 129         return curend + 1 - a;
 130 }
 131
 132 #define iswordchr(c)    (!t_isspace(c))
 133
 134 /*
 135  * Finds first word in string, returns pointer to the word,
 136  * endword points to the character after word
 137  */
 138 static char *
 139 find_word(char *str, int lenstr, char **endword, int *charlen)
 140 {
 141         char       *beginword = str;
 142
 143         while (beginword - str < lenstr && !iswordchr(beginword))
 144                 beginword += pg_mblen(beginword);
 145
 146         if (beginword - str >= lenstr)
 147                 return NULL;
 148
 149         *endword = beginword;
 150         *charlen = 0;
 151         while (*endword - str < lenstr && iswordchr(*endword))
 152         {
 153                 *endword += pg_mblen(*endword);
 154                 (*charlen)++;
 155         }
 156
 157         return beginword;
 158 }
 159
 160 /*
 161  * The function is named compact_bigram to maintain consistency with pg_trgm,
 162  * though it does not reduce multibyte characters to hash values like in
 163  * compact_trigram.
 164  */
 165 static void
 166 compact_bigram(bigm *bptr, char *str, int bytelen)
 167 {
 168         CPBIGM(bptr, str, bytelen);
 169 }
 170
 171 /*
 172  * Adds bigrams from words (already padded).
 173  */
 174 static bigm *
 175 make_bigrams(bigm *bptr, char *str, int bytelen, int charlen)
 176 {
 177         char       *ptr = str;
 178
 179         if (charlen < 2)
 180         {
 181                 compact_bigram(bptr, ptr, pg_mblen(str));
 182                 bptr->pmatch = true;
 183                 bptr++;
 184                 return bptr;
 185         }
 186
 187         if (bytelen > charlen)
 188         {
 189                 /* Find multibyte character boundaries and call compact_bigram */
 190                 int                     lenfirst = pg_mblen(str),
 191                                         lenlast = pg_mblen(str + lenfirst);
 192
 193                 while ((ptr - str) + lenfirst + lenlast <= bytelen)
 194                 {
 195                         compact_bigram(bptr, ptr, lenfirst + lenlast);
 196
 197                         ptr += lenfirst;
 198                         bptr++;
 199
 200                         lenfirst = lenlast;
 201                         lenlast = pg_mblen(ptr + lenfirst);
 202                 }
 203         }
 204         else
 205         {
 206                 /* Fast path when there are no multibyte characters */
 207                 Assert(bytelen == charlen);
 208
 209                 while (ptr - str < bytelen - 1 /* number of bigrams = strlen - 1 */ )
 210                 {
 211                         CPBIGM(bptr, ptr, 2);
 212                         ptr++;
 213                         bptr++;
 214                 }
 215         }
 216
 217         return bptr;
 218 }
 219
 220 BIGM *
 221 generate_bigm(char *str, int slen)
 222 {
 223         BIGM       *bgm;
 224         char       *buf;
 225         bigm       *bptr;
 226         int                     len,
 227                                 charlen,
 228                                 bytelen;
 229         char       *bword,
 230                            *eword;
 231
 232         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 233         SET_VARSIZE(bgm, VARHDRSZ);
 234
 235         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 236                 return bgm;
 237
 238         bptr = GETARR(bgm);
 239
 240         buf = palloc(sizeof(char) * (slen + 4));
 241
 242         if (LPADDING > 0)
 243         {
 244                 *buf = ' ';
 245                 if (LPADDING > 1)
 246                         *(buf + 1) = ' ';
 247         }
 248
 249         eword = str;
 250         while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
 251         {
 252                 bytelen = eword - bword;
 253                 memcpy(buf + LPADDING, bword, bytelen);
 254
 255                 buf[LPADDING + bytelen] = ' ';
 256                 buf[LPADDING + bytelen + 1] = ' ';
 257
 258                 /*
 259                  * count bigrams
 260                  */
 261                 bptr = make_bigrams(bptr, buf, bytelen + LPADDING + RPADDING,
 262                                                          charlen + LPADDING + RPADDING);
 263         }
 264
 265         pfree(buf);
 266
 267         if ((len = bptr - GETARR(bgm)) == 0)
 268                 return bgm;
 269
 270         if (len > 0)
 271         {
 272                 bool    haveDups = false;
 273
 274                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 275                 if (haveDups)
 276                         len = unique_array(GETARR(bgm), len);
 277         }
 278
 279         SET_VARSIZE(bgm, CALCGTSIZE(len));
 280
 281         return bgm;
 282 }
 283
 284 /*
 285  * Extract the next non-wildcard part of a search string, ie, a word bounded
 286  * by '_' or '%' meta-characters, non-word characters or string end.
 287  *
 288  * str: source string, of length lenstr bytes (need not be null-terminated)
 289  * buf: where to return the substring (must be long enough)
 290  * *bytelen: receives byte length of the found substring
 291  * *charlen: receives character length of the found substring
 292  *
 293  * Returns pointer to end+1 of the found substring in the source string.
 294  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 295  *
 296  * If the found word is bounded by non-word characters or string boundaries
 297  * then this function will include corresponding padding spaces into buf.
 298  */
 299 static const char *
 300 get_wildcard_part(const char *str, int lenstr,
 301                                   char *buf, int *bytelen, int *charlen)
 302 {
 303         const char *beginword = str;
 304         const char *endword;
 305         char       *s = buf;
 306         bool        in_leading_wildcard_meta = false;
 307         bool        in_trailing_wildcard_meta = false;
 308         bool            in_escape = false;
 309         int                     clen;
 310
 311         /*
 312          * Find the first word character, remembering whether preceding character
 313          * was wildcard meta-character.  Note that the in_escape state persists
 314          * from this loop to the next one, since we may exit at a word character
 315          * that is in_escape.
 316          */
 317         while (beginword - str < lenstr)
 318         {
 319                 if (in_escape)
 320                 {
 321                         if (iswordchr(beginword))
 322                                 break;
 323                         in_escape = false;
 324                         in_leading_wildcard_meta = false;
 325                 }
 326                 else
 327                 {
 328                         if (ISESCAPECHAR(beginword))
 329                                 in_escape = true;
 330                         else if (ISWILDCARDCHAR(beginword))
 331                                 in_leading_wildcard_meta = true;
 332                         else if (iswordchr(beginword))
 333                                 break;
 334                         else
 335                                 in_leading_wildcard_meta = false;
 336                 }
 337                 beginword += pg_mblen(beginword);
 338         }
 339
 340         /*
 341          * Handle string end.
 342          */
 343         if (beginword - str >= lenstr)
 344                 return NULL;
 345
 346         /*
 347          * Add left padding spaces if preceding character wasn't wildcard
 348          * meta-character.
 349          */
 350         *charlen = 0;
 351         if (!in_leading_wildcard_meta)
 352         {
 353                 if (LPADDING > 0)
 354                 {
 355                         *s++ = ' ';
 356                         (*charlen)++;
 357                         if (LPADDING > 1)
 358                         {
 359                                 *s++ = ' ';
 360                                 (*charlen)++;
 361                         }
 362                 }
 363         }
 364
 365         /*
 366          * Copy data into buf until wildcard meta-character, non-word character or
 367          * string boundary.  Strip escapes during copy.
 368          */
 369         endword = beginword;
 370         while (endword - str < lenstr)
 371         {
 372                 clen = pg_mblen(endword);
 373                 if (in_escape)
 374                 {
 375                         if (iswordchr(endword))
 376                         {
 377                                 memcpy(s, endword, clen);
 378                                 (*charlen)++;
 379                                 s += clen;
 380                         }
 381                         else
 382                         {
 383                                 /*
 384                                  * Back up endword to the escape character when stopping at
 385                                  * an escaped char, so that subsequent get_wildcard_part will
 386                                  * restart from the escape character.  We assume here that
 387                                  * escape chars are single-byte.
 388                                  */
 389                                 endword--;
 390                                 break;
 391                         }
 392                         in_escape = false;
 393                 }
 394                 else
 395                 {
 396                         if (ISESCAPECHAR(endword))
 397                                 in_escape = true;
 398                         else if (ISWILDCARDCHAR(endword))
 399                         {
 400                                 in_trailing_wildcard_meta = true;
 401                                 break;
 402                         }
 403                         else if (iswordchr(endword))
 404                         {
 405                                 memcpy(s, endword, clen);
 406                                 (*charlen)++;
 407                                 s += clen;
 408                         }
 409                         else
 410                                 break;
 411                 }
 412                 endword += clen;
 413         }
 414
 415         /*
 416          * Add right padding spaces if next character isn't wildcard
 417          * meta-character.
 418          */
 419         if (!in_trailing_wildcard_meta)
 420         {
 421                 if (RPADDING > 0)
 422                 {
 423                         *s++ = ' ';
 424                         (*charlen)++;
 425                         if (RPADDING > 1)
 426                         {
 427                                 *s++ = ' ';
 428                                 (*charlen)++;
 429                         }
 430                 }
 431         }
 432
 433         *bytelen = s - buf;
 434         return endword;
 435 }
 436
 437 /*
 438  * Generates bigrams for wildcard search string.
 439  *
 440  * Returns array of bigrams that must occur in any string that matches the
 441  * wildcard string.  For example, given pattern "a%bcd%" the bigrams
 442  * " a", "bcd" would be extracted.
 443  *
 444  * Set 'removeDups' to true if duplicate bigrams are removed.
 445  */
 446 BIGM *
 447 generate_wildcard_bigm(const char *str, int slen, bool *removeDups)
 448 {
 449         BIGM       *bgm;
 450         char       *buf;
 451         bigm       *bptr;
 452         int                     len,
 453                                 charlen,
 454                                 bytelen;
 455         const char *eword;
 456
 457         *removeDups = false;
 458
 459         bgm = (BIGM *) palloc(VARHDRSZ + sizeof(bigm) * (slen / 2 + 1) *3);
 460         SET_VARSIZE(bgm, VARHDRSZ);
 461
 462         if (slen + LPADDING + RPADDING < 2 || slen == 0)
 463                 return bgm;
 464
 465         bptr = GETARR(bgm);
 466
 467         buf = palloc(sizeof(char) * (slen + 4));
 468
 469         /*
 470          * Extract bigrams from each substring extracted by get_wildcard_part.
 471          */
 472         eword = str;
 473         while ((eword = get_wildcard_part(eword, slen - (eword - str),
 474                                                                           buf, &bytelen, &charlen)) != NULL)
 475         {
 476                 /*
 477                  * count bigrams
 478                  */
 479                 bptr = make_bigrams(bptr, buf, bytelen, charlen);
 480         }
 481
 482         pfree(buf);
 483
 484         if ((len = bptr - GETARR(bgm)) == 0)
 485                 return bgm;
 486
 487         /*
 488          * Make bigrams unique.
 489          */
 490         if (len > 0)
 491         {
 492                 bool    haveDups = false;
 493
 494                 qsort_arg((void *) GETARR(bgm), len, sizeof(bigm), comp_bigm, (void *) &haveDups);
 495                 if (haveDups)
 496                 {
 497                         *removeDups = true;
 498                         len = unique_array(GETARR(bgm), len);
 499                 }
 500         }
 501
 502         SET_VARSIZE(bgm, CALCGTSIZE(len));
 503
 504         return bgm;
 505 }
 506
 507 Datum
 508 show_bigm(PG_FUNCTION_ARGS)
 509 {
 510         text       *in = PG_GETARG_TEXT_P(0);
 511         BIGM       *bgm;
 512         Datum      *d;
 513         ArrayType  *a;
 514         bigm       *ptr;
 515         int                     i;
 516
 517         bgm = generate_bigm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
 518         d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(bgm)));
 519
 520         for (i = 0, ptr = GETARR(bgm); i < ARRNELEM(bgm); i++, ptr++)
 521         {
 522                 text       *item = cstring_to_text_with_len(ptr->str, ptr->bytelen);
 523                 d[i] = PointerGetDatum(item);
 524         }
 525
 526         a = construct_array(
 527                                                 d,
 528                                                 ARRNELEM(bgm),
 529                                                 TEXTOID,
 530                                                 -1,
 531                                                 false,
 532                                                 'i'
 533                 );
 534
 535         for (i = 0; i < ARRNELEM(bgm); i++)
 536                 pfree(DatumGetPointer(d[i]));
 537
 538         pfree(d);
 539         pfree(bgm);
 540         PG_FREE_IF_COPY(in, 0);
 541
 542         PG_RETURN_POINTER(a);
 543 }
 544
 545 static float4
 546 cnt_sml_bigm(BIGM *bgm1, BIGM *bgm2)
 547 {
 548         bigm       *ptr1,
 549                            *ptr2;
 550         int                     count = 0;
 551         int                     len1,
 552                                 len2;
 553
 554         ptr1 = GETARR(bgm1);
 555         ptr2 = GETARR(bgm2);
 556
 557         len1 = ARRNELEM(bgm1);
 558         len2 = ARRNELEM(bgm2);
 559
 560         /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 561         if (len1 <= 0 || len2 <= 0)
 562                 return (float4) 0.0;
 563
 564         while (ptr1 - GETARR(bgm1) < len1 && ptr2 - GETARR(bgm2) < len2)
 565         {
 566                 int             res = CMPBIGM(ptr1, ptr2);
 567
 568                 if (res < 0)
 569                         ptr1++;
 570                 else if (res > 0)
 571                         ptr2++;
 572                 else
 573                 {
 574                         ptr1++;
 575                         ptr2++;
 576                         count++;
 577                 }
 578         }
 579
 580 #ifdef DIVUNION
 581         return ((float4) count) / ((float4) (len1 + len2 - count));
 582 #else
 583         return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
 584 #endif
 585 }
 586
 587 Datum
 588 bigm_similarity(PG_FUNCTION_ARGS)
 589 {
 590         text       *in1 = PG_GETARG_TEXT_P(0);
 591         text       *in2 = PG_GETARG_TEXT_P(1);
 592         BIGM       *bgm1,
 593                            *bgm2;
 594         float4          res;
 595
 596         bgm1 = generate_bigm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
 597         bgm2 = generate_bigm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
 598
 599         res = cnt_sml_bigm(bgm1, bgm2);
 600
 601         pfree(bgm1);
 602         pfree(bgm2);
 603         PG_FREE_IF_COPY(in1, 0);
 604         PG_FREE_IF_COPY(in2, 1);
 605
 606         PG_RETURN_FLOAT4(res);
 607 }
 608
 609 Datum
 610 likequery(PG_FUNCTION_ARGS)
 611 {
 612         text       *query = PG_GETARG_TEXT_PP(0);
 613         const char *str;
 614         int                     len;
 615         const char *sp;
 616         text       *result;
 617         char       *rp;
 618         int                     mblen;
 619
 620         str = VARDATA_ANY(query);
 621         len = VARSIZE_ANY_EXHDR(query);
 622
 623         if (len == 0)
 624                 PG_RETURN_NULL();
 625
 626         result = (text *) palloc(len * 2 + 2 + VARHDRSZ);
 627         rp = VARDATA(result);
 628         *rp++ = '%';
 629
 630         for (sp = str; (sp - str) < len;)
 631         {
 632                 if (ISWILDCARDCHAR(sp) || ISESCAPECHAR(sp))
 633                 {
 634                         *rp++ = '\\';
 635                         *rp++ = *sp++;
 636                 }
 637                 else if (IS_HIGHBIT_SET(*sp))
 638                 {
 639                         mblen = pg_mblen(sp);
 640                         memcpy(rp, sp, mblen);
 641                         rp += mblen;
 642                         sp += mblen;
 643                 }
 644                 else
 645                         *rp++ = *sp++;
 646         }
 647
 648         *rp++ = '%';
 649         SET_VARSIZE(result, rp - VARDATA(result) + VARHDRSZ);
 650
 651         PG_RETURN_TEXT_P(result);
 652 }
 653
 654 inline int
 655 bigmstrcmp(char *arg1, int len1, char *arg2, int len2)
 656 {
 657         int                     i;
 658         int                     len = Min(len1, len2);
 659
 660         for (i = 0; i < len; i++, arg1++, arg2++)
 661         {
 662                 if (*arg1 == *arg2)
 663                         continue;
 664                 if (*arg1 < *arg2)
 665                         return -1;
 666                 else
 667                         return 1;
 668         }
 669
 670         return (len1 == len2) ? 0 : ((len1 < len2) ? -1 : 1);
 671 }
 672
 673 Datum
 674 bigmtextcmp(PG_FUNCTION_ARGS)
 675 {
 676         text    *arg1 = PG_GETARG_TEXT_PP(0);
 677         text    *arg2 = PG_GETARG_TEXT_PP(1);
 678         char    *a1p = VARDATA_ANY(arg1);
 679         char    *a2p = VARDATA_ANY(arg2);
 680         int             len1 = VARSIZE_ANY_EXHDR(arg1);
 681         int             len2 = VARSIZE_ANY_EXHDR(arg2);
 682
 683         PG_RETURN_INT32(bigmstrcmp(a1p, len1, a2p, len2));
 684 }