1 /* Copyright(C) 2004 Brazil
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Lesser General Public
5 License as published by the Free Software Foundation; either
6 version 2.1 of the License, or (at your option) any later version.
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/* Prototypes of the "08"-suffixed counterparts of the index functions.
   Code in this file hands over to them in some cases: sen_inv_seg_expire()
   calls sen_inv_seg_expire08() when inv->v08p is set, and sen_inv_create() /
   sen_inv_open() call the 08 variants when (lexicon->flags & 0x70000). */
29 void sen_inv_seg_expire08(sen_inv *inv);
30 sen_inv * sen_inv_create08(const char *path, sen_sym *lexicon,
31 uint32_t initial_n_segments);
32 sen_inv * sen_inv_open08(const char *path, sen_sym *lexicon);
33 sen_rc sen_inv_update08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h,
35 sen_rc sen_inv_delete08(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h);
36 sen_inv_cursor *sen_inv_cursor_open08(sen_inv *inv, uint32_t key);
37 sen_rc sen_inv_cursor_next08(sen_inv_cursor *c);
38 sen_rc sen_inv_cursor_next_pos08(sen_inv_cursor *c);
39 sen_rc sen_inv_cursor_close08(sen_inv_cursor *c);
40 uint32_t sen_inv_estimate_size08(sen_inv *inv, uint32_t key);
41 int sen_inv_entry_info08(sen_inv *inv, unsigned key, unsigned *a, unsigned *pocket,
42 unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
43 unsigned *nterms, unsigned *nterms_void, unsigned *tid,
44 unsigned *size_in_chunk, unsigned *pos_in_chunk,
45 unsigned *size_in_buffer, unsigned *pos_in_buffer);
52 struct sen_inv_header *header;
/* Header stored in the segment file (obtained through sen_io_header()). */
55 struct sen_inv_header {
/* value given to (or defaulted by) sen_inv_create(); also sizes the chunk table */
57 uint32_t initial_n_segments;
/* running total maintained by buffer_flush(), which adds/subtracts chunk_size >> 10 */
58 uint32_t total_chunk_size;
/* logical array segment -> physical segment, SEG_NOT_ASSIGNED when unused */
63 uint16_t ainfo[SEN_INV_MAX_SEGMENT];
/* logical buffer segment -> physical segment, SEG_NOT_ASSIGNED when unused */
64 uint16_t binfo[SEN_INV_MAX_SEGMENT];
/* per-chunk in-use flags; sen_inv_create() reserves max_chunk bytes past the struct */
65 uint8_t chunks[1]; /* dummy */
/* Format identifier; sen_inv_create() stores 16 bytes of it in header->idstr
   and sen_inv_open() compares the same 16 bytes. */
68 #define SEN_INV_IDSTR "SENNA:INV:01.00"
/* Segment and chunk sizes in bytes (0x40000 == 1 << W_OF_SEGMENT). */
69 #define SEN_INV_SEGMENT_SIZE 0x40000
70 #define SEN_INV_CHUNK_SIZE 0x40000
71 #define N_CHUNKS_PER_FILE (SEN_IO_FILE_SIZE / SEN_INV_CHUNK_SIZE)
/* Shift turning a buffer position into its logical segment number. */
72 #define W_OF_SEGMENT 18
/* Shift turning a term id into its array segment (entries are uint32_t, hence -2). */
73 #define W_OF_ARRAY (W_OF_SEGMENT - 2)
/* Masks extracting the in-segment index (array) or byte offset (buffer). */
74 #define ARRAY_MASK_IN_A_SEGMENT ((SEN_INV_SEGMENT_SIZE >> 2) - 1)
75 #define BUFFER_MASK_IN_A_SEGMENT (SEN_INV_SEGMENT_SIZE - 1)
/* Sentinels for "no chunk" (buffer header) and "no physical segment" (ainfo/binfo). */
76 #define CHUNK_NOT_ASSIGNED 0xffffffff
77 #define SEG_NOT_ASSIGNED 0xffff
/* NOTE(review): the three macros below are not referenced in the visible code. */
79 #define SEGMENT_ARRAY 0x8000
80 #define SEGMENT_BUFFER 0x4000
81 #define SEGMENT_MASK (SEN_INV_MAX_SEGMENT - 1)
/* Bit-field extractors for the packed one-record array entry
   ((rid << 12) + (value << 1) + 1) written by the update/flush code.
   BIT11_01 yields bits 1..11, BIT31_12 yields bits 12..31.
   The argument is parenthesized so that expressions such as `a | b`
   are shifted as a whole. */
#define BIT11_01(x) (((x) >> 1) & 0x7ff)
#define BIT31_12(x) ((x) >> 12)
/* Default used by sen_inv_create() when initial_n_segments is passed as 0. */
86 #define SEN_INV_INITIAL_N_SEGMENTS 512
/* Number of chunk slots per initial segment (max_chunk = initial_n_segments * MAX_CHUNK_RATIO). */
87 #define MAX_CHUNK_RATIO 64
/* Address of the byte immediately following the object p points to. */
89 #define NEXT_ADDR(p) (((byte *)(p)) + sizeof *(p))
93 inline static uint16_t
94 segment_get(sen_inv *inv)
98 char used[SEN_INV_MAX_SEGMENT];
99 memset(used, 0, SEN_INV_MAX_SEGMENT);
100 for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
101 if ((seg = inv->header->ainfo[i]) != SEG_NOT_ASSIGNED) { used[seg] = 1; }
102 if ((seg = inv->header->binfo[i]) != SEG_NOT_ASSIGNED) { used[seg] = 1; }
104 for (seg = 0; used[seg] && seg < SEN_INV_MAX_SEGMENT; seg++) ;
/* Pick a free physical segment with segment_get() and zero-fill it.
   Returns sen_memory_exhausted when the segment cannot be mapped; the final
   return below also yields sen_memory_exhausted. */
109 segment_get_clear(sen_inv *inv, uint16_t *pseg)
111 uint16_t seg = segment_get(inv);
112 if (seg < SEN_INV_MAX_SEGMENT) {
/* map the segment only long enough to clear it */
114 SEN_IO_SEG_REF(inv->seg, seg, p);
115 if (!p) { return sen_memory_exhausted; }
116 memset(p, 0, SEN_INV_SEGMENT_SIZE);
117 SEN_IO_SEG_UNREF(inv->seg, seg);
121 return sen_memory_exhausted;
/* Bind a logical buffer segment to a fresh physical segment.
   If *segno is a valid logical number it must still be unassigned, otherwise
   sen_invalid_argument is returned. Otherwise the first unassigned logical
   slot is searched for. Returns sen_memory_exhausted when no logical slot or
   no physical segment is available. */
126 buffer_segment_new(sen_inv *inv, uint16_t *segno)
129 if (*segno < SEN_INV_MAX_SEGMENT) {
130 if (inv->header->binfo[*segno] != SEG_NOT_ASSIGNED) {
131 return sen_invalid_argument;
/* no usable hint: take the first free logical slot */
135 for (lseg = 0; lseg < SEN_INV_MAX_SEGMENT; lseg++) {
136 if (inv->header->binfo[lseg] == SEG_NOT_ASSIGNED) { break; }
138 if (lseg == SEN_INV_MAX_SEGMENT) { return sen_memory_exhausted; }
141 pseg = segment_get(inv);
142 if (pseg < SEN_INV_MAX_SEGMENT) {
143 inv->header->binfo[lseg] = pseg;
/* bmax tracks one past the highest logical buffer segment in use */
144 if (lseg >= inv->header->bmax) { inv->header->bmax = lseg + 1; }
147 return sen_memory_exhausted;
/* Unmap unreferenced segments until the number of mapped segments drops to
   the threshold. A negative threshold means initial_n_segments * 2.
   Buffer segments are tried first (from bmax downwards), then array segments
   (from amax downwards). */
152 sen_inv_seg_expire(sen_inv *inv, int32_t threshold)
156 if (inv->v08p) { sen_inv_seg_expire08(inv); return; }
157 th = (threshold < 0) ? (inv->header->initial_n_segments * 2) : (uint32_t) threshold;
/* nothing to do while the map count is within the limit */
158 if ((nmaps = inv->seg->nmaps) <= th) { return; }
159 for (seg = inv->header->bmax; seg && (inv->seg->nmaps > th); seg--) {
160 uint16_t pseg = inv->header->binfo[seg - 1];
161 if (pseg != SEG_NOT_ASSIGNED) {
162 sen_io_mapinfo *info = &inv->seg->maps[pseg];
163 uint32_t *pnref = &inv->seg->nrefs[pseg];
/* only segments that are mapped and have no outstanding references */
164 if (info->map && !*pnref) { sen_io_seg_expire(inv->seg, pseg, 0); }
167 for (seg = inv->header->amax; seg && (inv->seg->nmaps > th); seg--) {
168 uint16_t pseg = inv->header->ainfo[seg - 1];
169 if (pseg != SEG_NOT_ASSIGNED) {
170 sen_io_mapinfo *info = &inv->seg->maps[pseg];
171 uint32_t *pnref = &inv->seg->nrefs[pseg];
172 if (info->map && !*pnref) { sen_io_seg_expire(inv->seg, pseg, 0); }
175 SEN_LOG(sen_log_notice, "expired(%d) (%u -> %u)", threshold, nmaps, inv->seg->nmaps);
/* Reserve enough consecutive chunk slots to hold `size` bytes.
   On success the first slot number is stored through `res` (when non-NULL)
   and the slots are flagged in header->chunks. Logs and returns
   sen_memory_exhausted when no suitable run exists. */
181 chunk_new(sen_inv *inv, uint32_t *res, uint32_t size)
/* n = number of chunks needed, rounded up below */
184 uint32_t n = size / SEN_INV_CHUNK_SIZE;
185 int max_chunk = inv->header->initial_n_segments * MAX_CHUNK_RATIO;
186 uint32_t base_seg = sen_io_base_seg(inv->chunk);
187 if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
/* j appears to hold the index just before the current candidate run */
188 for (i = 0, j = -1; i < max_chunk; i++) {
189 if (inv->header->chunks[i]) {
193 if (res) { *res = j + 1; }
195 inv->header->chunks[++j] = 1;
/* slot i and slot i+1 lie in different files: restart the run after i
   (presumably so that a run never spans a file boundary) */
199 if ((i + base_seg)/ N_CHUNKS_PER_FILE !=
200 (i + base_seg + 1) / N_CHUNKS_PER_FILE) { j = i; }
203 SEN_LOG(sen_log_crit, "index full. set bigger value to initial_n_segments. current value = %d",
204 inv->header->initial_n_segments);
205 return sen_memory_exhausted;
/* Release the chunk slots covering `size` bytes starting at slot `start`
   by clearing their flags in header->chunks. */
209 chunk_free(sen_inv *inv, int start, uint32_t size)
/* number of slots = size / SEN_INV_CHUNK_SIZE, rounded up */
211 uint32_t i, n = size / SEN_INV_CHUNK_SIZE;
212 if (n * SEN_INV_CHUNK_SIZE < size) { n++; }
213 for (i = 0; i < n; i++) {
214 inv->header->chunks[start + i] = 0;
/* Per-term bookkeeping (accessed as buffer_term members in this file):
   location of the term's data inside the chunk and inside the buffer. */
223 uint32_t size_in_chunk;
224 uint32_t pos_in_chunk;
225 uint16_t size_in_buffer;
/* position of the first record of the term's chain; 0 ends a chain */
226 uint16_t pos_in_buffer;
/* Buffer header members (accessed as b->header.*): free bytes left in the
   segment, and the count tracked alongside nterms for reusable term slots. */
237 uint32_t buffer_free;
239 uint16_t nterms_void;
/* A buffer occupies exactly one segment: header followed by as many term
   slots as fit in the remaining space. */
242 struct sen_inv_buffer {
243 buffer_header header;
244 buffer_term terms[(SEN_INV_SEGMENT_SIZE - sizeof(buffer_header))/sizeof(buffer_term)];
247 typedef struct sen_inv_buffer buffer;
/* Map the buffer segment addressed by `pos` (logical segment in the high
   bits, byte offset in the low W_OF_SEGMENT bits).
   Stores the segment start in *b and the term slot at the offset in *bt when
   those pointers are non-NULL. Returns SEG_NOT_ASSIGNED if mapping fails. */
249 inline static uint16_t
250 buffer_open(sen_inv *inv, uint32_t pos, buffer_term **bt, buffer **b)
253 uint16_t lseg = (uint16_t) (pos >> W_OF_SEGMENT);
254 uint16_t pseg = inv->header->binfo[lseg];
255 if (pseg != SEG_NOT_ASSIGNED) {
/* takes a reference on the segment; buffer_close() drops it */
256 SEN_IO_SEG_REF(inv->seg, pseg, p);
257 if (!p) { return SEG_NOT_ASSIGNED; }
258 if (b) { *b = (buffer *)p; }
259 if (bt) { *bt = (buffer_term *)(p + (pos & BUFFER_MASK_IN_A_SEGMENT)); }
/* Drop the segment reference for physical segment `pseg`.
   Logs and returns sen_invalid_argument when pseg is out of range. */
265 buffer_close(sen_inv *inv, uint16_t pseg)
267 if (pseg >= SEN_INV_MAX_SEGMENT) {
268 SEN_LOG(sen_log_notice, "invalid pseg buffer_close(%d)", pseg);
269 return sen_invalid_argument;
271 SEN_IO_SEG_UNREF(inv->seg, pseg);
/* Open logical buffer segment `seg` only if it can take another term with
   `size` bytes of record data. Otherwise the buffer is closed again and
   SEG_NOT_ASSIGNED is returned. */
275 inline static uint16_t
276 buffer_open_if_capable(sen_inv *inv, int32_t seg, int size, buffer **b)
279 uint32_t pos = ((uint32_t) seg) * SEN_INV_SEGMENT_SIZE;
280 if ((pseg = buffer_open(inv, pos, NULL, b)) != SEG_NOT_ASSIGNED) {
/* live terms = all term slots minus the void ones */
281 uint16_t nterms = (*b)->header.nterms - (*b)->header.nterms_void;
/* accepted when (fewer than 4096 live terms, or the buffer's chunk is small
   relative to total_chunk_size) and there is room for the data plus one
   term slot */
282 if (!((nterms < 4096 ||
283 (inv->header->total_chunk_size >> ((nterms >> 8) - 6))
284 > (*b)->header.chunk_size) &&
285 ((*b)->header.buffer_free >= size + sizeof(buffer_term)))) {
286 buffer_close(inv, pseg);
287 return SEG_NOT_ASSIGNED;
/* Allocate a term slot plus `size` bytes of record space in some buffer.
   Existing buffers are probed in three passes (segments congruent to the
   hint, the remaining segments below initial_n_segments, then segments from
   initial_n_segments upwards); a new buffer segment is created if none is
   capable. Outputs: *pos (position of the term slot), *bt (the term slot),
   *br (record area), and the buffer itself. Returns SEG_NOT_ASSIGNED on
   failure. */
293 inline static uint16_t
294 buffer_new(sen_inv *inv, int size, uint32_t *pos,
295 buffer_term **bt, buffer_rec **br, buffer **bp, int hint)
298 uint16_t nseg0 = inv->header->initial_n_segments;
299 uint16_t pseg, seg, offset, seg0 = hint % nseg0;
300 uint16_t segmax = (uint16_t) (inv->header->total_chunk_size >> 7) + nseg0;
/* a request that cannot fit even in an empty segment is refused */
301 if (size + sizeof(buffer_header) + sizeof(buffer_term) > SEN_INV_SEGMENT_SIZE) {
302 return SEG_NOT_ASSIGNED;
/* pass 1: seg0, seg0 + nseg0, ... up to segmax */
304 for (seg = seg0; seg < segmax; seg += nseg0) {
305 if (inv->header->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
306 if ((pseg = buffer_open_if_capable(inv, seg, size, &b)) != SEG_NOT_ASSIGNED) {
/* pass 2: the other segments below nseg0, wrapping around */
311 for (seg = (seg0 + 1) % nseg0; seg != seg0; seg = (seg + 1) % nseg0) {
312 if (inv->header->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
313 if ((pseg = buffer_open_if_capable(inv, seg, size, &b)) != SEG_NOT_ASSIGNED) {
/* pass 3: linear scan from nseg0 upwards */
318 for (seg = nseg0; seg < SEN_INV_MAX_SEGMENT; seg++) {
319 if (inv->header->binfo[seg] == SEG_NOT_ASSIGNED) { break; }
320 if ((pseg = buffer_open_if_capable(inv, seg, size, &b)) != SEG_NOT_ASSIGNED) {
/* create and initialize a brand-new buffer segment */
326 SEN_LOG(sen_log_debug, "inv=%p new seg=%d", inv, seg);
327 if (buffer_segment_new(inv, &seg) ||
328 (pseg = buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &b)) == SEG_NOT_ASSIGNED) {
329 return SEG_NOT_ASSIGNED;
331 memset(b, 0, SEN_INV_SEGMENT_SIZE);
332 b->header.buffer_free = SEN_INV_SEGMENT_SIZE - sizeof(buffer_header);
333 b->header.chunk = CHUNK_NOT_ASSIGNED;
334 b->header.chunk_size = 0;
/* prefer recycling a void term slot (tid == 0) over appending a new one */
336 if (b->header.nterms_void) {
337 for (offset = 0; offset < b->header.nterms; offset++) {
338 if (!b->terms[offset].tid) { break; }
/* counter claimed a void slot but none exists: repair the counter */
340 if (offset == b->header.nterms) {
341 SEN_LOG(sen_log_notice, "inconsistent buffer(%d)", seg);
342 b->header.nterms_void = 0;
344 b->header.buffer_free -= size + sizeof(buffer_term);
/* a recycled slot costs only the record bytes */
346 b->header.nterms_void--;
347 b->header.buffer_free -= size;
350 offset = b->header.nterms++;
351 b->header.buffer_free -= size + sizeof(buffer_term);
353 *pos = seg * SEN_INV_SEGMENT_SIZE
354 + sizeof(buffer_header) + sizeof(buffer_term) * offset;
355 *bt = &b->terms[offset];
/* records start right after the (already reduced) free area that follows the term table */
356 *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
/* A record is flagged as deleted by storing 1 in its jump field. */
366 #define BUFFER_REC_DEL(r) ((r)->jump = 1)
367 #define BUFFER_REC_DELETED(r) ((r)->jump == 1)
/* Record positions are counted in units of sizeof(buffer_rec) from the start of the buffer. */
369 #define BUFFER_REC_AT(b,pos) ((buffer_rec *)(b) + (pos))
370 #define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b)))
/* Debug helper: logs the buffer header, the term slot, and every record on
   the term's chain (followed through the step links) at debug level. */
373 buffer_term_dump(buffer *b, buffer_term *bt)
378 SEN_LOG(sen_log_debug,
379 "b=(%x %u %u %u)", b->header.chunk, b->header.chunk_size, b->header.buffer_free, b->header.nterms);
380 SEN_LOG(sen_log_debug,
381 "bt=(%u %u %u %u %u)", bt->tid, bt->size_in_chunk, bt->pos_in_chunk, bt->size_in_buffer, bt->pos_in_buffer);
/* a step of 0 terminates the chain */
382 for (pos = bt->pos_in_buffer; pos; pos = r->step) {
383 r = BUFFER_REC_AT(b, pos);
387 SEN_LOG(sen_log_debug, "%d=(%d:%d),(%d:%d)", pos, r->jump, r->step, rid, sid);
/* Term dumped by check_jump() when it reports a cycle. */
391 static buffer_term *tmp_bt;
/* Validate a jump link from record `r` to record position `j`.
   A zero j is always accepted. The ids (rid, sid) stored after both records
   are decoded; sen_abnormal_error is returned for a cycle or when the target
   id is not strictly greater than the id of r. */
394 check_jump(buffer *b, buffer_rec *r, int j)
396 uint16_t i = BUFFER_REC_POS(b, r);
400 if (!j) { return sen_success; }
402 SEN_B_DEC(id.rid, p);
403 SEN_B_DEC(id.sid, p);
405 SEN_LOG(sen_log_debug, "deleting! %d(%d:%d)", i, id.rid, id.sid);
408 r2 = BUFFER_REC_AT(b, j);
410 SEN_B_DEC(id2.rid, p);
411 SEN_B_DEC(id2.sid, p);
413 SEN_LOG(sen_log_emerg, "cycle! %d(%d:%d)<->%d(%d:%d)", i, id.rid, id.sid, j, id2.rid, id2.sid);
414 buffer_term_dump(b, tmp_bt);
415 return sen_abnormal_error;
/* a jump must lead to a later (rid, sid) */
417 if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) {
418 SEN_LOG(sen_log_crit, "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)", i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step, id2.rid, id2.sid);
419 return sen_abnormal_error;
/* Consistency check of one buffer: for every used term slot, walk its record
   chain, count jump links rejected by check_jump() in *nerrors, and log
   records whose (rid, sid) is not strictly greater than the previous one. */
425 buffer_check(buffer *b, uint32_t *nerrors)
431 int n = b->header.nterms;
433 for (bt = b->terms; n; n--, bt++) {
434 uint32_t rid = 0, sid = 0;
/* tid 0 marks an unused (void) term slot */
435 if (!bt->tid) { continue; }
437 for (nextb = bt->pos_in_buffer; nextb; nextb = br->step) {
/* remember the previous id for the ordering test */
438 uint32_t lrid = rid, lsid = sid;
439 br = BUFFER_REC_AT(b, nextb);
440 if (check_jump(b, br, br->jump)) { (*nerrors)++; }
444 if (lrid > rid || (lrid == rid && lsid >= sid)) {
445 SEN_LOG(sen_log_crit, "brokeng!! tid=%d (%d:%d) -> (%d:%d)", bt->tid, lrid, lsid, rid, sid);
/* Run buffer_check() over every assigned buffer segment below segmax and log
   a summary line. Segments that cannot be opened are skipped. */
455 sen_inv_check(sen_inv *inv)
458 uint32_t pos, total_nterms = 0, nerrors = 0;
459 uint16_t nseg0 = inv->header->initial_n_segments;
460 uint16_t pseg, seg, nsegs = 0;
/* same upper bound as used by buffer_new() */
461 uint16_t segmax = (uint16_t) (inv->header->total_chunk_size >> 7) + nseg0;
462 for (seg = 0; seg < segmax; seg++) {
463 if (inv->header->binfo[seg] == SEG_NOT_ASSIGNED) { continue; }
464 pos = ((uint32_t) seg) * SEN_INV_SEGMENT_SIZE;
465 if ((pseg = buffer_open(inv, pos, NULL, &b)) == SEG_NOT_ASSIGNED) { continue; }
467 total_nterms += buffer_check(b, &nerrors);
468 buffer_close(inv, pseg);
470 sen_log("sen_inv_check done nsegs=%d total_nterms=%d", nsegs, total_nterms);
/* Walk the chain starting at `from` (following step links) while working on
   a jump target `to`; gives up after at most 100 iterations.
   Returns sen_internal_error when check_jump() rejects a link and
   sen_abnormal_error when the chain ends (step == 0) unexpectedly. */
475 set_jump_r(buffer *b, buffer_rec *from, int to)
477 int i, j, max_jump = 100;
/* positions 0 and 1 are not valid targets: 0 means "none", 1 marks a deleted record */
479 for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
480 r2 = BUFFER_REC_AT(b, j);
/* stop when the target is the record itself, is deleted, or is already linked */
481 if (r == r2) { break; }
482 if (BUFFER_REC_DELETED(r2)) { break; }
483 if (j == (i = r->jump)) { break; }
484 if (j == r->step) { break; }
485 if (check_jump(b, r, j)) { return sen_internal_error; }
488 if (!r->step) { return sen_abnormal_error; }
/* Bit-counting reduction on n: sums adjacent 1-, 2-, 4-, 8- and 16-bit
   groups, leaving the number of set bits of the initial 32-bit value in n.
   NOTE(review): the step that loads n from x is not visible in this listing. */
493 #define GET_NUM_BITS(x,n) {\
495 n = (n & 0x55555555) + ((n >> 1) & 0x55555555);\
496 n = (n & 0x33333333) + ((n >> 2) & 0x33333333);\
497 n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F);\
498 n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF);\
499 n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF);\
/* Insert the encoded record `bs` (for update spec `u`) into the record chain
   of term `bt` inside buffer `b`.
   rnew is the pre-allocated record; `size` includes sizeof(buffer_rec).
   The chain is kept ordered by (rid, sid); records that the new one replaces
   are flagged with BUFFER_REC_DEL. If the chain is found to be out of order
   it is abandoned and sen_invalid_format is recorded in rc. */
503 buffer_put(buffer *b, buffer_term *bt, buffer_rec *rnew, uint8_t *bs,
504 sen_inv_updspec *u, int size)
507 sen_rc rc = sen_success;
508 docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
509 buffer_rec *r_curr, *r_start = NULL;
/* lastp always points at the link that leads to the record under inspection */
510 uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
511 int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1;
/* payload is stored directly behind the record header */
515 memcpy(NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
516 // sen_log("tid=%d u->rid=%d u->sid=%d", bt->tid, u->rid, u->sid);
518 // sen_log("*lastp=%d", *lastp);
/* post-increment: the test sees the count before this insertion */
523 if (bt->size_in_buffer++ > 1) {
524 buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
526 if (!(bt->size_in_buffer & 1)) {
528 buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
529 GET_NUM_BITS(bt->size_in_buffer, n);
/* follow up to n jump links, stopping at a deleted target */
530 while (n-- && (r->jump > 1)) {
531 r2 = BUFFER_REC_AT(b, r->jump);
532 if (BUFFER_REC_DELETED(r2)) { break; }
535 if (r != rnew) { set_jump_r(b, r, last); }
540 r_curr = BUFFER_REC_AT(b, *lastp);
541 p = NEXT_ADDR(r_curr);
542 SEN_B_DEC(id_curr.rid, p);
543 SEN_B_DEC(id_curr.sid, p);
/* ids must never decrease along the chain; a decrease indicates corruption */
544 if (id_curr.rid < id_post.rid ||
545 (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
546 SEN_LOG(sen_log_emerg, "loop found!!! (%d:%d)->(%d:%d)",
547 id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
548 buffer_term_dump(b, bt);
549 /* abandon corrupt list */
550 bt->pos_in_buffer = 0;
551 bt->size_in_buffer = 0;
552 lastp = &bt->pos_in_buffer;
553 rc = sen_invalid_format;
556 id_post.rid = id_curr.rid;
557 id_post.sid = id_curr.sid;
/* insertion point reached: current record is at or after the new id */
558 if (u->rid < id_curr.rid || (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
559 uint16_t step = *lastp, jump = r_curr->jump;
560 if (u->rid == id_curr.rid) {
/* flag every consecutive record with the same rid as deleted */
562 while (id_curr.rid == u->rid) {
563 BUFFER_REC_DEL(r_curr);
564 if (!(step = r_curr->step)) { break; }
565 r_curr = BUFFER_REC_AT(b, step);
566 p = NEXT_ADDR(r_curr);
567 SEN_B_DEC(id_curr.rid, p);
568 SEN_B_DEC(id_curr.sid, p);
570 } else if (u->sid == id_curr.sid) {
571 BUFFER_REC_DEL(r_curr);
/* inherit the jump link only if it is valid for the new record */
576 rnew->jump = check_jump(b, rnew, jump) ? 0 : jump;
583 id_start.rid = id_curr.rid;
584 id_start.sid = id_curr.sid;
/* distance to the new id: rid difference, or sid difference within the same rid */
585 if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; }
588 vdelta = delta0 >> 1;
590 if (!(delta = id_curr.rid - id_start.rid)) { delta = id_curr.sid - id_start.sid; }
591 if (vdelta < delta) {
592 vdelta += (delta0 >> ++vhops);
596 set_jump_r(b, r_start, *lastp);
603 lastp = &r_curr->step;
606 uint16_t posj = r_curr->jump;
608 buffer_rec *rj = BUFFER_REC_AT(b, posj);
609 if (!BUFFER_REC_DELETED(rj)) {
612 SEN_B_DEC(idj.rid, p);
613 SEN_B_DEC(idj.sid, p);
/* the jump target still precedes the new id */
614 if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) {
/* Return a pointer to the array entry of term `id`, or NULL when the id is
   out of range, its array segment is not assigned, or mapping fails.
   A successful call leaves the segment referenced; array_unref() releases it. */
630 inline static uint32_t *
631 array_at(sen_inv *inv, uint32_t id)
635 if (id > SEN_SYM_MAX_ID) { return NULL; }
636 seg = id >> W_OF_ARRAY;
637 if ((pseg = inv->header->ainfo[seg]) == SEG_NOT_ASSIGNED) { return NULL; }
638 SEN_IO_SEG_REF(inv->seg, pseg, p);
639 if (!p) { return NULL; }
640 return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
/* Like array_at(), but assigns (and zero-fills) a new array segment when the
   one covering `id` does not exist yet. Returns NULL when the id is out of
   range, no segment can be obtained, or mapping fails. */
643 inline static uint32_t *
644 array_get(sen_inv *inv, uint32_t id)
648 if (id > SEN_SYM_MAX_ID) { return NULL; }
649 seg = id >> W_OF_ARRAY;
650 if ((pseg = inv->header->ainfo[seg]) == SEG_NOT_ASSIGNED) {
651 if (segment_get_clear(inv, &pseg)) { return NULL; }
652 inv->header->ainfo[seg] = pseg;
/* amax tracks one past the highest logical array segment in use */
653 if (seg >= inv->header->amax) { inv->header->amax = seg + 1; }
655 SEN_IO_SEG_REF(inv->seg, pseg, p)
656 if (!p) { return NULL; }
657 return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * sizeof(uint32_t));
/* Release the reference on the array segment that covers term `id`. */
661 array_unref(sen_inv *inv, uint32_t id)
663 SEN_IO_SEG_UNREF(inv->seg, inv->header->ainfo[id >> W_OF_ARRAY]);
/* Allocate an update spec for record `rid`, section `sid`.
   Returns NULL when the allocation fails. */
669 sen_inv_updspec_open(uint32_t rid, uint32_t sid)
672 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
673 if (!(u = SEN_MALLOC(sizeof(sen_inv_updspec)))) { return NULL; }
/* Upper limit on the number of positions recorded in one update spec. */
685 #define SEN_INV_MAX_TF 0x1ffff
/* Add one occurrence (position `pos`, weight `weight`) to `u`.
   Once u->tf has reached SEN_INV_MAX_TF the call returns sen_success without
   allocating. Returns sen_memory_exhausted when the node cannot be allocated. */
688 sen_inv_updspec_add(sen_inv_updspec *u, int pos, int32_t weight)
690 struct _sen_inv_pos *p;
691 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
693 if (u->tf >= SEN_INV_MAX_TF) { return sen_success; }
694 if (!(p = SEN_MALLOC(sizeof(struct _sen_inv_pos)))) {
695 return sen_memory_exhausted;
/* Three-way comparison of two update specs: by rid, sid, score, tf, then by
   the position lists element by element; the spec with positions left over
   compares greater. */
711 sen_inv_updspec_cmp(sen_inv_updspec *a, sen_inv_updspec *b)
713 struct _sen_inv_pos *pa, *pb;
714 if (a->rid != b->rid) { return a->rid - b->rid; }
715 if (a->sid != b->sid) { return a->sid - b->sid; }
716 if (a->score != b->score) { return a->score - b->score; }
717 if (a->tf != b->tf) { return a->tf - b->tf; }
718 for (pa = a->pos, pb = b->pos; pa && pb; pa = pa->next, pb = pb->next) {
719 if (pa->pos != pb->pos) { return pa->pos - pb->pos; }
/* one list is a prefix of the other */
721 if (pa) { return 1; }
722 if (pb) { return -1; }
/* Dispose of an update spec; starts from the head of its position list
   (u->pos). NOTE(review): the release loop is not visible in this listing. */
727 sen_inv_updspec_close(sen_inv_updspec *u)
729 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
730 struct _sen_inv_pos *p = u->pos, *q;
/* Serialize update spec `u` into a newly allocated byte buffer.
   Layout written here: rid, sid, a tf field (tf * 2 or tf * 2 + 1), then the
   positions as deltas from the previous position, zero-padded to a 4-byte
   boundary. *size receives the encoded length plus sizeof(buffer_rec). */
740 inline static uint8_t *
741 encode_rec(sen_inv_updspec *u, unsigned int *size, int deletep)
743 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
745 struct _sen_inv_pos *pp;
746 uint32_t lpos, tf, score;
/* allocation is sized at 5 bytes per encoded value, with 4 values beyond the positions */
754 if (!(br = SEN_MALLOC((tf + 4) * 5))) {
758 SEN_B_ENC(u->rid, p);
759 SEN_B_ENC(u->sid, p);
/* the low bit of the tf field distinguishes the two variants */
761 SEN_B_ENC(tf * 2, p);
763 SEN_B_ENC(tf * 2 + 1, p);
766 for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) {
767 SEN_B_ENC(pp->pos - lpos, p);
/* pad so that the end pointer is 4-byte aligned */
769 while (((intptr_t)p & 0x03)) { *p++ = 0; }
770 *size = (unsigned int) ((p - br) + sizeof(buffer_rec));
/* Decide whether term `tid` may be deleted with respect to the set `h`.
   Returns 1 when h is NULL. When sen_set_at() finds no entry for tid, returns
   0 if the context reports an error and 1 otherwise. An entry whose tf or sid
   is zero yields 1. */
775 sym_deletable(uint32_t tid, sen_set *h)
777 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
779 if (!h) { return 1; }
780 if (!sen_set_at(h, &tid, (void **) &u)) {
781 return (ERRP(ctx, SEN_ERROR)) ? 0 : 1;
783 if (!(*u)->tf || !(*u)->sid) { return 1; }
/* Callback passed to sen_sym_del_with_sis() by sym_delete(); `arg` is a
   sis_deletable_arg carrying the index and the set.
   Looks up the array entry of tid and finally defers to sym_deletable(). */
793 sis_deletable(sen_id tid, void *arg)
796 sen_set *h = ((sis_deletable_arg *)arg)->h;
797 sen_inv *inv = ((sis_deletable_arg *)arg)->inv;
798 if ((a = array_at(inv, tid))) {
/* both visible paths release the array segment reference */
800 array_unref(inv, tid);
803 array_unref(inv, tid);
805 return sym_deletable(tid, h);
/* Remove term `tid` from the lexicon.
   With a shared lexicon the pocket counter is decremented via
   sen_sym_pocket_decr(); with SEN_SYM_WITH_SIS the deletion goes through
   sen_sym_del_with_sis() using sis_deletable() as the filter; the last
   visible path deletes by key when sym_deletable() allows it. */
809 sym_delete(sen_inv *inv, uint32_t tid, sen_set *h)
811 sis_deletable_arg arg = {inv, h};
812 if ((inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON)) {
813 sen_sym_pocket_decr(inv->lexicon, tid);
815 if (inv->lexicon->flags & SEN_SYM_WITH_SIS) {
816 sen_sym_del_with_sis(inv->lexicon, tid, sis_deletable, &arg);
/* NOTE(review): this call takes two arguments while the one above takes four;
   it may belong to a disabled code path that is not visible here */
819 while ((tid = sen_sym_del_with_sis(inv->lexicon, tid))) {
820 if ((a = array_at(inv, tid))) {
822 array_unref(inv, tid);
825 array_unref(inv, tid);
827 if (!sym_deletable(tid, h)) { break; }
831 if (sym_deletable(tid, h)) {
832 sen_sym_del(inv->lexicon, _sen_sym_key(inv->lexicon, tid));
/* Variable-length encoder for a (dgap, tf) pair, writing 1 or more bytes
   through p. The branch taken depends on the magnitude of the two values
   (_d and _t); the first byte of the multi-byte forms starts at 0xc0, 0x90 or
   0xa0. SEN_C_DEC is the matching decoder. */
845 #define SEN_C_ENC(dgap,tf,p)\
847 uint8_t *p2 = (uint8_t *)p;\
850 if (_d < 0x46 && _t <= 2) {\
851 *p2++ = (_d << 1) + (_t - 1);\
852 } else if (_d < 0x46 && _t <= 6) {\
853 *p2++ = 0xc0 + (_d >> 6);\
854 *p2++ = (_d << 2) + (_t - 3);\
855 } else if (_d < 0x1000 && _t <= 4) {\
856 *p2++ = 0xc0 + (_d >> 6);\
857 *p2++ = (_d << 2) + (_t - 1);\
858 } else if (_d < 0x4000 && _t <= 0x40) {\
859 *p2++ = 0x90 + (_d >> 10);\
861 *p2++ = (_d << 6) + (_t - 1);\
862 } else if (_d < 0x80000 && _t <= 4) {\
863 *p2++ = 0xa0 + (_d >> 14);\
865 *p2++ = (_d << 2) + (_t - 1);\
/* Decoder for the byte sequences produced by SEN_C_ENC: reads the first byte
   into _v and reconstructs dgap and tf according to its range (the 0x90, 0xa0
   and 0xc0 bases mirror the encoder). In the 0xc0 form a dgap below 0x46 adds
   2 to tf, matching the encoder's (_t - 3) variant. */
874 #define SEN_C_DEC(dgap,tf,p)\
876 uint8_t *p2 = (uint8_t *)p;\
877 uint32_t _v = *p2++;\
881 SEN_B_DEC(dgap, p2);\
890 dgap = ((_v - 0x90) << 10) + ((*p2++) << 2);\
893 tf = (_v & 0x3f) + 1;\
897 dgap = ((_v - 0xa0) << 14) + ((*p2++) << 6);\
906 dgap = (_v - 0xc0) << 6;\
910 if (dgap < 0x46) { tf += 2; }\
/* Flush logical buffer segment `seg`.
   A fresh physical segment (ds/db) and a fresh chunk (dcn/dc) are obtained;
   for every term the data already in the old chunk (sc) and the records
   chained in the old buffer (sb) are re-encoded into the new chunk in
   (rid, sid) order. Finally binfo[seg] is switched to the new segment and the
   old chunk is freed. Terms that end up empty or with a single small posting
   are handled through the array entry instead of the chunk.
   Returns sen_invalid_format when seg has no buffer, sen_memory_exhausted when
   a segment, chunk, temporary memory or mapping cannot be obtained.
   NOTE(review): the "brokenc!!" log inside PUTNEXTC prints bid although the
   check compares lid with cid. */
921 buffer_flush(sen_inv *inv, sen_ctx *ctx, uint32_t seg, sen_set *h)
923 buffer *sb, *db = NULL;
924 sen_rc rc = sen_success;
926 uint8_t *tc, *tp, *dc, *sc = NULL;
927 uint16_t ss, ds, pseg;
928 uint32_t scn, dcn, max_dest_chunk_size;
929 ss = inv->header->binfo[seg];
930 if (ss == SEG_NOT_ASSIGNED) { return sen_invalid_format; }
931 pseg = buffer_open(inv, seg * SEN_INV_SEGMENT_SIZE, NULL, &sb);
932 if (pseg == SEG_NOT_ASSIGNED) { return sen_memory_exhausted; }
/* destination segment: any currently unused physical segment */
933 if ((ds = segment_get(inv)) == SEN_INV_MAX_SEGMENT) {
934 buffer_close(inv, pseg);
935 return sen_memory_exhausted;
937 SEN_IO_SEG_REF(inv->seg, ds, db);
939 buffer_close(inv, pseg);
940 return sen_memory_exhausted;
942 memset(db, 0, SEN_INV_SEGMENT_SIZE);
/* upper bound for the merged data: old chunk plus one full segment */
943 max_dest_chunk_size = sb->header.chunk_size + SEN_INV_SEGMENT_SIZE;
/* one allocation split into two work areas, tc and tp */
944 if (!(tc = SEN_MALLOC(max_dest_chunk_size * 2))) {
945 buffer_close(inv, pseg);
946 SEN_IO_SEG_UNREF(inv->seg, ds);
947 return sen_memory_exhausted;
949 tp = tc + max_dest_chunk_size;
950 if (chunk_new(inv, &dcn, max_dest_chunk_size)) {
951 buffer_close(inv, pseg);
952 SEN_IO_SEG_UNREF(inv->seg, ds);
954 return sen_memory_exhausted;
956 // sen_log("db=%p ds=%d sb=%p seg=%d", db, ds, sb, seg);
/* map the existing chunk (if any) as the merge source */
957 if ((scn = sb->header.chunk) != CHUNK_NOT_ASSIGNED) {
958 sc = sen_io_win_map(inv->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, SEN_IO_COPY);
960 SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!", scn, sb->header.chunk_size);
961 buffer_close(inv, pseg);
962 SEN_IO_SEG_UNREF(inv->seg, ds);
964 chunk_free(inv, dcn, max_dest_chunk_size);
965 return sen_memory_exhausted;
/* map the new chunk as the merge destination */
968 dc = sen_io_win_map(inv->chunk, ctx, &dw, dcn, 0, max_dest_chunk_size, SEN_IO_UPDATE);
970 SEN_LOG(sen_log_alert, "io_win_map(%d, %d) failed!!", dcn, max_dest_chunk_size);
971 buffer_close(inv, pseg);
972 SEN_IO_SEG_UNREF(inv->seg, ds);
974 chunk_free(inv, dcn, max_dest_chunk_size);
975 if (scn != CHUNK_NOT_ASSIGNED) { sen_io_win_unmap(&sw); }
976 return sen_memory_exhausted;
979 uint8_t *sbp = NULL, *scp = NULL, *sce = NULL, *dcp = dc;
983 int n = sb->header.nterms;
985 uint8_t *tpp, *tcp, *spp = NULL;
/* start from a copy of the term table; each entry is then rewritten in place */
986 memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
987 // sen_log(" scn=%d, dcn=%d, nterms=%d", sb->header.chunk, dcn, n);
988 for (bt = db->terms; n; n--, bt++) {
989 uint32_t ndf = 0, dgap_ = 0, sgap_ = 0;
/* cid: current chunk entry, bid: current buffer entry, lid: last entry emitted */
990 docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = {0, 0};
998 scp = sc + bt->pos_in_chunk;
999 sce = scp + bt->size_in_chunk;
1000 if (bt->size_in_chunk) {
1002 sce = spp = scp + o;
/* the buffer chain is consumed entirely; the term restarts with an empty chain */
1005 nextb = bt->pos_in_buffer;
1006 bt->pos_in_chunk = (uint32_t)(dcp - dc);
1007 bt->size_in_buffer = 0;
1008 bt->pos_in_buffer = 0;
1010 #define GETNEXTC_() {\
1013 if (*scp == 0x8c) { cid.flags |= 1; scp++; } else { cid.flags &= ~1; }\
1014 if (*scp == 0x8d) { cid.flags ^= 2; scp++; }\
1015 if (*scp == 0x8e) { cid.flags ^= 4; scp++; }\
1016 SEN_C_DEC(dgap, cid.tf, scp);\
1018 if (dgap) { cid.sid = 0; }\
1019 if (cid.flags & 4) { SEN_B_DEC(dgap, scp); } else { dgap = 0; }\
1020 if (cid.flags & 3) { SEN_B_DEC(cid.score, scp); } else { cid.score = 0; }\
1021 cid.sid += dgap + 1;\
1026 #define GETNEXTC() {\
1027 if (scp < sce && cid.rid) { while (cid.tf--) { SEN_B_SKIP(spp); } }\
1030 #define PUTNEXT_(id,pp) {\
1031 uint32_t dgap = id.rid - lid.rid;\
1032 uint32_t sgap = (dgap ? id.sid : id.sid - lid.sid);\
1036 if (!(lid.flags & 2)) {\
1038 lid.flags |= 2; *tcp++ = 0x8d;\
1040 lid.flags |= 1; *tcp++ = 0x8c;\
1044 if (!id.score && (lid.flags & 2)) { lid.flags &= ~2; *tcp++ = 0x8d; }\
1048 if (!(lid.flags & 4)) { lid.flags |= 4; *tcp++ = 0x8e; }\
1050 if (sgap == 1 && (lid.flags & 4)) { lid.flags &= ~4; *tcp++ = 0x8e; }\
1052 SEN_C_ENC(dgap_, lid.tf, tcp);\
1053 if (lid.flags & 4) { SEN_B_ENC(sgap_, tcp); }\
1054 if (lid.flags & 3) { SEN_B_ENC(lid.score, tcp); }\
1059 lid.score = id.score;\
1060 while (id.tf--) { SEN_B_COPY(tpp, pp); }\
1064 #define PUTNEXTC() {\
1067 if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) {\
1068 SEN_LOG(sen_log_crit, "brokenc!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid);\
1069 rc = sen_invalid_format;\
1073 PUTNEXT_(cid, spp);\
1075 SEN_LOG(sen_log_crit, "invalid chunk(%d,%d)", bt->tid, cid.rid);\
1076 rc = sen_invalid_format;\
1082 #define GETNEXTB() {\
1084 uint32_t lrid = bid.rid, lsid = bid.sid;\
1085 br = BUFFER_REC_AT(sb, nextb);\
1086 sbp = NEXT_ADDR(br);\
1087 SEN_B_DEC(bid.rid, sbp);\
1088 SEN_B_DEC(bid.sid, sbp);\
1089 if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) {\
1090 SEN_LOG(sen_log_crit, "brokeng!! (%d:%d) -> (%d:%d)", lrid, lsid, bid.rid, bid.sid);\
1091 rc = sen_invalid_format;\
1099 #define PUTNEXTB() {\
1100 if (bid.rid && bid.sid) {\
1101 SEN_B_DEC(bid.tf, sbp);\
1103 if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) {\
1104 SEN_LOG(sen_log_crit, "brokenb!! (%d:%d) -> (%d:%d)", lid.rid, lid.sid, bid.rid, bid.sid);\
1105 rc = sen_invalid_format;\
1108 if (bid.tf & 1) { SEN_B_DEC(bid.score, sbp); } else { bid.score = 0; }\
1111 PUTNEXT_(bid, sbp);\
/* two-way merge of chunk entries and buffer entries ordered by (rid, sid) */
1122 if (cid.rid < bid.rid) {
1125 if (bid.rid < cid.rid) {
1129 if (cid.sid < bid.sid) {
/* same (rid, sid) on both sides: the chunk entry is skipped */
1132 if (bid.sid == cid.sid) { GETNEXTC(); }
/* write out the pending last entry */
1153 if (lid.score && !(lid.flags & 2)) { lid.flags |= 2; *tcp++ = 0x8d; }
1155 if (sgap_ && !(lid.flags & 4)) { lid.flags |= 4; *tcp++ = 0x8e; }
1156 SEN_C_ENC(dgap_, lid.tf, tcp);
1157 if (lid.flags & 4) { SEN_B_ENC(sgap_, tcp); }
1158 if (lid.flags & 2) { SEN_B_ENC(lid.score, tcp); }
1163 bt->pos_in_chunk = 0;\
1164 bt->size_in_chunk = 0;\
1168 uint32_t o = tcp - tc;\
1170 memcpy(dcp, tc, o);\
1173 memcpy(dcp, tp, o);\
1175 bt->size_in_chunk = (uint32_t)((dcp - dc) - bt->pos_in_chunk);\
1181 if ((a = array_at(inv, bt->tid))) {
1182 if (!(inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON)) {
1183 sen_sym_pocket_set(inv->lexicon, bt->tid, 0);
1186 sym_delete(inv, bt->tid, h);
1187 array_unref(inv, bt->tid);
/* a single small posting can be packed into the array entry itself */
1192 } else if (ndf == 1 && lid.rid < 0x100000 && lid.sid < 0x800 && lid.tf == 1 && lid.score == 0) {
1195 SEN_B_DEC(pos_, spp);
1196 if (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON) {
1197 if (lid.sid == 1 && pos_ < 0x800 && (a = array_at(inv, bt->tid))) {
/* packed form: rid in bits 12.., position in bits 1..11, low bit set as marker */
1198 *a = (lid.rid << 12) + (pos_ << 1) + 1;
1199 array_unref(inv, bt->tid);
1205 if (pos_ < 0x4000 && (a = array_at(inv, bt->tid))) {
/* non-shared lexicon: position goes to the pocket, sid into the packed entry */
1206 sen_sym_pocket_set(inv->lexicon, bt->tid, pos_);
1207 *a = (lid.rid << 12) + (lid.sid << 1) + 1;
1208 array_unref(inv, bt->tid);
/* fill in the header of the new buffer */
1219 db->header.chunk_size = (uint32_t)(dcp - dc);
1220 db->header.nterms_void = nterms_void;
1221 inv->header->total_chunk_size += db->header.chunk_size >> 10;
1223 db->header.chunk = db->header.chunk_size ? dcn : CHUNK_NOT_ASSIGNED;
1224 db->header.buffer_free = SEN_INV_SEGMENT_SIZE
1225 - sizeof(buffer_header) - sb->header.nterms * sizeof(buffer_term);
1226 db->header.nterms = sb->header.nterms;
/* give back the chunk slots reserved beyond what was actually written */
1230 mc = (max_dest_chunk_size + SEN_INV_CHUNK_SIZE - 1) / SEN_INV_CHUNK_SIZE;
1231 ec = (db->header.chunk_size + SEN_INV_CHUNK_SIZE - 1) / SEN_INV_CHUNK_SIZE;
1233 inv->header->chunks[dcn + ec++] = 0;
1236 buffer_close(inv, pseg);
1238 SEN_IO_SEG_UNREF(inv->seg, ds);
/* switch the logical segment over to the rebuilt buffer */
1239 inv->header->binfo[seg] = ds;
1240 if (scn != CHUNK_NOT_ASSIGNED) {
1241 sen_io_win_unmap(&sw);
1242 chunk_free(inv, scn, sb->header.chunk_size);
1243 inv->header->total_chunk_size -= sb->header.chunk_size >> 10;
1245 sen_io_win_unmap(&dw);
/* Create a new inverted index at `path`.
   Two files are created: the segment file at `path` and the chunk file at
   `path` + ".c". initial_n_segments of 0 selects SEN_INV_INITIAL_N_SEGMENTS
   and values above SEN_INV_MAX_SEGMENT are clamped. Lexicons with any of the
   0x70000 flag bits are delegated to sen_inv_create08().
   Returns NULL when the path is too long or the segment file cannot be
   created. */
1252 sen_inv_create(const char *path, sen_sym *lexicon, uint32_t initial_n_segments)
1255 sen_io *seg, *chunk;
1257 char path2[PATH_MAX];
1258 struct sen_inv_header *header;
1259 if ((lexicon->flags & 0x70000)) {
1260 return sen_inv_create08(path, lexicon, initial_n_segments);
/* length check guards the strcpy/strcat into path2 below */
1262 if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1263 strcpy(path2, path);
1264 strcat(path2, ".c");
1265 if (!initial_n_segments) { initial_n_segments = SEN_INV_INITIAL_N_SEGMENTS; }
1266 if (initial_n_segments > SEN_INV_MAX_SEGMENT) {
1267 initial_n_segments = SEN_INV_MAX_SEGMENT;
1269 max_chunk = initial_n_segments * MAX_CHUNK_RATIO;
/* header size includes one flag byte per chunk (header->chunks) */
1270 seg = sen_io_create(path, sizeof(struct sen_inv_header) + max_chunk,
1271 SEN_INV_SEGMENT_SIZE, SEN_INV_MAX_SEGMENT,
1272 sen_io_auto, SEN_INV_MAX_SEGMENT);
1273 if (!seg) { return NULL; }
1274 chunk = sen_io_create(path2, 0, SEN_INV_CHUNK_SIZE,
1275 max_chunk, sen_io_auto, max_chunk);
1280 header = sen_io_header(seg);
1281 memcpy(header->idstr, SEN_INV_IDSTR, 16);
/* every logical segment starts out unassigned */
1282 for (i = 0; i < SEN_INV_MAX_SEGMENT; i++) {
1283 header->ainfo[i] = SEG_NOT_ASSIGNED;
1284 header->binfo[i] = SEG_NOT_ASSIGNED;
1286 header->initial_n_segments = initial_n_segments;
1287 if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1289 sen_io_close(chunk);
1295 inv->header = header;
1296 inv->lexicon = lexicon;
1297 inv->header->total_chunk_size = 0;
/* Remove the index files: `path` via sen_sym_remove() and `path` + ".c" via
   sen_io_remove(). Returns sen_invalid_argument for a NULL or over-long path. */
1302 sen_inv_remove(const char *path)
1305 char buffer[PATH_MAX];
/* leaves room for the ".c" suffix and the terminator */
1306 if (!path || strlen(path) > PATH_MAX - 4) { return sen_invalid_argument; }
1307 if ((rc = sen_sym_remove(path))) { goto exit; }
1308 snprintf(buffer, PATH_MAX, "%s.c", path);
1309 rc = sen_io_remove(buffer);
/* Open an existing inverted index (`path` and `path` + ".c").
   Lexicons with any of the 0x70000 flag bits, and files whose header id
   string differs from SEN_INV_IDSTR, are handed to sen_inv_open08().
   Returns NULL when the path is too long or the segment file cannot be
   opened. */
1315 sen_inv_open(const char *path, sen_sym *lexicon)
1317 sen_io *seg, *chunk;
1319 char path2[PATH_MAX];
1320 struct sen_inv_header *header;
1321 if ((lexicon->flags & 0x70000)) {
1322 return sen_inv_open08(path, lexicon);
/* length check guards the strcpy/strcat into path2 below */
1324 if (strlen(path) + 6 >= PATH_MAX) { return NULL; }
1325 strcpy(path2, path);
1326 strcat(path2, ".c");
1327 seg = sen_io_open(path, sen_io_auto, SEN_INV_MAX_SEGMENT);
1328 if (!seg) { return NULL; }
1329 chunk = sen_io_open(path2, sen_io_auto, SEN_INV_MAX_SEGMENT);
1334 header = sen_io_header(seg);
/* id string mismatch: retry with the 08 implementation */
1335 if (memcmp(header->idstr, SEN_INV_IDSTR, 16)) {
1336 SEN_LOG(sen_log_notice, "inv_idstr (%s)", header->idstr);
1338 sen_io_close(chunk);
1339 return sen_inv_open08(path, lexicon);
1341 if (!(inv = SEN_GMALLOC(sizeof(sen_inv)))) {
1343 sen_io_close(chunk);
1349 inv->header = header;
1350 inv->lexicon = lexicon;
/* Close the segment file and then the chunk file of `inv`.
   Returns sen_invalid_argument for a NULL index; a failure of either
   sen_io_close() call is returned immediately (the chunk file is not closed
   when closing the segment file fails). */
1355 sen_inv_close(sen_inv *inv)
1358 if (!inv) { return sen_invalid_argument; }
1359 if ((rc = sen_io_close(inv->seg))) { return rc; }
1360 if ((rc = sen_io_close(inv->chunk))) { return rc; }
/* Report the sizes of the segment file and the chunk file through
   *seg_size and *chunk_size, using sen_io_size() for each. */
1366 sen_inv_info(sen_inv *inv, uint64_t *seg_size, uint64_t *chunk_size)
1371 if ((rc = sen_io_size(inv->seg, seg_size))) {
1377 if ((rc = sen_io_size(inv->chunk, chunk_size))) {
/* Record the posting described by u for term `key`.
 * What the statements below establish:
 *  - u with tf == 0 or sid == 0 is forwarded to sen_inv_delete();
 *  - inv->header->smax is raised to the largest sid seen so far;
 *  - the record is serialized by encode_rec() and stored with buffer_put();
 *  - a lone posting that fits the bit fields is packed directly into *a
 *    as (rid << 12) + (field << 1) + 1, i.e. with the low bit set.
 * rc starts as sen_success; the tail of the function calls array_unref()
 * and frees the encoded record. */
1386 sen_inv_update(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h, int hint)
1388 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1389 sen_rc rc = sen_success;
1393 buffer_rec *br = NULL;
1395 uint32_t pos = 0, size, *a;
/* Alternate implementation: the whole call is handed to sen_inv_update08(). */
1397 return sen_inv_update08(inv, key, u, h, hint);
1399 // sen_log("key=%d tf=%d pos0=%d rid=%d", key, u->tf, u->pos->pos, u->rid);
/* An update with no term frequency or no section id is treated as a delete. */
1400 if (!u->tf || !u->sid) { return sen_inv_delete(inv, key, u, h); }
1401 if (u->sid > inv->header->smax) { inv->header->smax = u->sid; }
/* a points at the array entry for key; it is released by array_unref() below. */
1402 if (!(a = array_get(inv, key))) { return sen_memory_exhausted; }
/* bs is the encoded record and size its encoded length. */
1403 if (!(bs = encode_rec(u, &size, 0))) { rc = sen_memory_exhausted; goto exit; }
/* Entry already refers to a buffer at pos: open it to append the record. */
1408 if ((pseg = buffer_open(inv, pos, &bt, &b)) == SEG_NOT_ASSIGNED) {
1409 rc = sen_memory_exhausted;
/* Not enough free space for the record: close the buffer, flush its segment
 * and reopen it. bfb keeps the pre-flush free size for the debug log. */
1412 if (b->header.buffer_free < size) {
1413 int bfb = b->header.buffer_free;
1414 SEN_LOG(sen_log_debug, "flushing *a=%d seg=%d(%p) free=%d",
1415 *a, *a >> W_OF_SEGMENT, b, b->header.buffer_free);
1416 buffer_close(inv, pseg);
1417 if ((rc = buffer_flush(inv, ctx, pos >> W_OF_SEGMENT, h))) { goto exit; }
/* NOTE(review): presumably logged when *a no longer equals pos after the
 * flush; the guarding condition should be confirmed. */
1419 SEN_LOG(sen_log_debug, "sen_inv_update: *a changed %d->%d", *a, pos);
1422 if ((pseg = buffer_open(inv, pos, &bt, &b)) == SEG_NOT_ASSIGNED) {
1423 SEN_LOG(sen_log_crit, "buffer not found *a=%d", *a);
1424 rc = sen_memory_exhausted;
1427 SEN_LOG(sen_log_debug, "flushed *a=%d seg=%d(%p) free=%d->%d nterms=%d v=%d",
1428 *a, *a >> W_OF_SEGMENT, b, bfb, b->header.buffer_free,
1429 b->header.nterms, b->header.nterms_void);
/* Still no room after flushing: give up with sen_memory_exhausted. */
1430 if (b->header.buffer_free < size) {
1431 buffer_close(inv, pseg);
/* NOTE(review): b->header is read by the log call below after the buffer
 * has been closed on the line above. */
1432 SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_update",
1433 *a, b->header.buffer_free, size);
1434 /* todo: must be split */
1435 rc = sen_memory_exhausted;
/* Reserve `size` from the free area; the record is placed at offset
 * buffer_free measured from the end of the terms array. */
1439 b->header.buffer_free -= size;
1440 br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms])
1441 + b->header.buffer_free);
/* v is the packed value currently held in *a; it is unpacked into u2/pos2 so
 * the posting it describes can be re-encoded into a real buffer. */
1444 uint32_t size2 = 0, v = *a;
1445 struct _sen_inv_pos pos2;
/* Shared lexicon: the packed word carries the position and the rid. */
1446 if (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON) {
1447 pos2.pos = BIT11_01(v);
1450 u2.rid = BIT31_12(v);
/* Otherwise the position comes from the lexicon pocket and the packed word
 * carries the rid and the sid. */
1455 pos2.pos = sen_sym_pocket_get(inv->lexicon, key);
1458 u2.rid = BIT31_12(v);
1459 u2.sid = BIT11_01(v);
/* The old posting is carried over only when it names a different (rid, sid)
 * than the incoming one. */
1463 if (u2.rid != u->rid || u2.sid != u->sid) {
1464 uint8_t *bs2 = encode_rec(&u2, &size2, 0);
1466 SEN_LOG(sen_log_alert, "encode_rec on sen_inv_update failed !");
1467 rc = sen_memory_exhausted;
/* Allocate room for both records (old posting + new posting). */
1470 pseg = buffer_new(inv, size + size2, &pos, &bt, &br, &b, hint);
1471 if (pseg == SEG_NOT_ASSIGNED) {
1476 bt->size_in_chunk = 0;
1477 bt->pos_in_chunk = 0;
1478 bt->size_in_buffer = 0;
1479 bt->pos_in_buffer = 0;
1480 if ((rc = buffer_put(b, bt, br, bs2, &u2, size2))) {
1482 buffer_close(inv, pseg);
/* Step past the old record so the new one is written right after it. */
1485 br = (buffer_rec *)(((byte *)br) + size2);
/* First posting for this key under a shared lexicon: bump the pocket counter. */
1492 if (!*a && (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON)) {
1493 sen_sym_pocket_incr(inv->lexicon, key);
/* Packed form, shared lexicon: rid < 2^20, sid == 1, tf == 1, score == 0 and
 * pos < 2^11 are stored as rid:pos with the low bit set. */
1496 if (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON) {
1497 if (u->rid < 0x100000 && u->sid == 1 &&
1498 u->tf == 1 && u->score == 0 && u->pos->pos < 0x800) {
1499 *a = (u->rid << 12) + (u->pos->pos << 1) + 1;
/* Packed form, private lexicon: the position goes to the lexicon pocket and
 * *a stores rid:sid with the low bit set. */
1503 if (u->rid < 0x100000 && u->sid < 0x800 &&
1504 u->tf == 1 && u->score == 0 && u->pos->pos < 0x4000) {
1505 sen_sym_pocket_set(inv->lexicon, key, u->pos->pos);
1506 *a = (u->rid << 12) + (u->sid << 1) + 1;
/* Posting cannot be packed: allocate buffer space for the single record. */
1510 pseg = buffer_new(inv, size, &pos, &bt, &br, &b, hint);
/* NOTE(review): no assignment to rc is visible on this failure path, so the
 * function may report sen_success when buffer_new() fails - confirm. */
1511 if (pseg == SEG_NOT_ASSIGNED) { goto exit; }
1513 bt->size_in_chunk = 0;
1514 bt->pos_in_chunk = 0;
1515 bt->size_in_buffer = 0;
1516 bt->pos_in_buffer = 0;
/* Store the encoded record and release the buffer segment. */
1518 rc = buffer_put(b, bt, br, bs, u, size);
1519 buffer_close(inv, pseg);
/* The entry was empty or held a packed posting; with a private lexicon the
 * pocket is reset to 0. */
1520 if (!*a || (*a & 1)) {
1522 if (!(inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON)) {
1523 sen_sym_pocket_set(inv->lexicon, key, 0);
/* Common tail: drop the array reference and free the encoded record. */
1527 array_unref(inv, key);
1528 if (bs) { SEN_FREE(bs); }
/* tf differing from atf means some postings were discarded; warn with the term text. */
1529 if (u->tf != u->atf) {
1530 SEN_LOG(sen_log_warning, "too many postings(%d) on '%s'. discarded %d.", u->atf, _sen_sym_key(inv->lexicon, key), u->atf - u->tf);
/* Remove the posting described by u for term `key`.
 * A missing array entry yields sen_invalid_argument; an empty entry (*a == 0)
 * goes straight to the exit path. A packed entry is compared against u and
 * removed via sym_delete(); otherwise a record produced by
 * encode_rec(u, &size, 1) is appended to the term's buffer with buffer_put(). */
1536 sen_inv_delete(sen_inv *inv, uint32_t key, sen_inv_updspec *u, sen_set *h)
1538 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1539 sen_rc rc = sen_success;
/* Alternate implementation: the whole call is handed to sen_inv_delete08(). */
1547 return sen_inv_delete08(inv, key, u, h);
1549 if (!(a = array_at(inv, key))) { return sen_invalid_argument; }
/* Nothing is stored for this key. */
1551 if (!*a) { goto exit; }
/* Packed entry: unpack rid and sid from *a. A zero u->sid matches any sid. */
1553 uint32_t rid = BIT31_12(*a);
1554 uint32_t sid = BIT11_01(*a);
1555 if (u->rid == rid && (!u->sid || u->sid == sid)) {
1557 sym_delete(inv, key, h);
/* Buffered entry: encode the record (third argument 1, where sen_inv_update
 * passes 0). */
1561 if (!(bs = encode_rec(u, &size, 1))) {
1562 rc = sen_memory_exhausted;
1565 if ((pseg = buffer_open(inv, *a, &bt, &b)) == SEG_NOT_ASSIGNED) {
1566 rc = sen_memory_exhausted;
1569 // sen_log("b->header.buffer_free=%d size=%d", b->header.buffer_free, size);
/* Not enough free space: close, flush the segment and reopen the buffer. */
1570 if (b->header.buffer_free < size) {
1572 SEN_LOG(sen_log_debug, "flushing! b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
1573 buffer_close(inv, pseg);
1574 if ((rc = buffer_flush(inv, ctx, *a >> W_OF_SEGMENT, h))) { goto exit; }
/* NOTE(review): presumably logged when *a differs from a saved copy (_a)
 * after the flush; the guarding condition should be confirmed. */
1576 SEN_LOG(sen_log_debug, "sen_inv_delete: *a changed %d->%d)", *a, _a);
1579 if ((pseg = buffer_open(inv, *a, &bt, &b)) == SEG_NOT_ASSIGNED) {
1580 rc = sen_memory_exhausted;
1583 SEN_LOG(sen_log_debug, "flushed! b=%p free=%d, seg(%d)", b, b->header.buffer_free, *a >> W_OF_SEGMENT);
/* Still no room after flushing: log, fail with sen_memory_exhausted and
 * close the buffer. */
1584 if (b->header.buffer_free < size) {
1585 /* todo: must be split ? */
1586 SEN_LOG(sen_log_crit, "buffer(%d) is full (%d < %d) in sen_inv_delete",
1587 *a, b->header.buffer_free, size);
1588 rc = sen_memory_exhausted;
1589 buffer_close(inv, pseg);
/* Reserve `size` from the free area and write the record at offset
 * buffer_free measured from the end of the terms array. */
1594 b->header.buffer_free -= size;
1595 br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free);
1596 rc = buffer_put(b, bt, br, bs, u, size);
1597 buffer_close(inv, pseg);
/* Common tail: drop the array reference and free the encoded record. */
1601 array_unref(inv, key);
1602 if (bs) { SEN_FREE(bs); }
/* Return the initial_n_segments value stored in the index header.
 * inv is dereferenced without a NULL check. */
1607 sen_inv_initial_n_segments(sen_inv *inv)
1609 return inv->header->initial_n_segments;
/* Bit flags kept in sen_inv_cursor.stat. CHUNK_USED and BUFFER_USED mark which
 * side (chunk / buffer) the cursor advances on its next step; SOLE_DOC_USED
 * and SOLE_POS_USED are set once a cursor has been stepped by
 * sen_inv_cursor_next() / sen_inv_cursor_next_pos() on their final branches. */
1612 #define CHUNK_USED 1
1613 #define BUFFER_USED 2
1614 #define SOLE_DOC_USED 4
1615 #define SOLE_POS_USED 8
/* Open a cursor over the postings of term `key`.
 * Returns NULL when array_at() finds no entry. When the entry is zero or the
 * cursor cannot be allocated, control jumps to the exit path with c still
 * NULL. The cursor is zero-filled after allocation; with_pos is stored
 * truncated to uint16_t. */
1618 sen_inv_cursor_open(sen_inv *inv, uint32_t key, int with_pos)
1620 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1621 sen_inv_cursor *c = NULL;
/* Alternate implementation: the call is handed to sen_inv_cursor_open08(),
 * which does not take with_pos. */
1624 return sen_inv_cursor_open08(inv, key);
1626 if (!(a = array_at(inv, key))) { return NULL; }
1627 if (!(pos = *a)) { goto exit; }
1628 if (!(c = SEN_MALLOC(sizeof(sen_inv_cursor)))) { goto exit; }
1629 memset(c, 0, sizeof(sen_inv_cursor));
1632 c->with_pos = (uint16_t) with_pos;
/* Packed entry: the single posting is unpacked from the entry word into c->pb. */
1635 c->pb.rid = BIT31_12(pos);
/* Shared lexicon keeps the position in the entry word ... */
1638 if (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON) {
1640 c->pb.pos = BIT11_01(pos);
/* ... otherwise the word holds the sid and the position comes from the pocket. */
1642 c->pb.sid = BIT11_01(pos);
1643 c->pb.pos = sen_sym_pocket_get(inv->lexicon, key);
/* Buffered entry: rid/sid start at 0 so the ordering check made while
 * iterating has a baseline. */
1648 c->pb.rid = 0; c->pb.sid = 0; /* for check */
1649 if ((c->buffer_pseg = buffer_open(inv, pos, &bt, &c->buf)) == SEG_NOT_ASSIGNED) {
/* Map the term's chunk region read-only when it has chunk data and the
 * buffer has a chunk assigned. */
1654 if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != CHUNK_NOT_ASSIGNED) {
1655 c->cp = sen_io_win_map(inv->chunk, ctx, &c->iw,
1656 chunk, bt->pos_in_chunk, bt->size_in_chunk, sen_io_rdonly);
1658 buffer_close(inv, c->buffer_pseg);
1663 c->cpe = c->cp + bt->size_in_chunk;
/* A leading integer o is decoded from the chunk; cpe and cpp are both set to
 * cp + o. NOTE(review): presumably o is the length of the posting area and
 * cpp the start of the position data - confirm. */
1664 if (bt->size_in_chunk) {
1666 SEN_B_DEC(o, c->cp);
1667 c->cpe = c->cpp = c->cp + o;
/* Start the buffer side at the term's first record and mark both sides as
 * needing an advance on the first sen_inv_cursor_next(). */
1673 c->nextb = bt->pos_in_buffer;
1674 c->stat = CHUNK_USED|BUFFER_USED;
1677 array_unref(inv, key);
/* First stage of a two-stage cursor open.
 * Performs the same entry lookup and unpacking as sen_inv_cursor_open(), but
 * instead of mapping the chunk it only fills in the io window descriptor
 * (c->iw); sen_inv_cursor_openv2() later maps the windows and sets cp/cpe. */
1683 sen_inv_cursor_openv1(sen_inv *inv, uint32_t key)
1685 sen_inv_cursor *c = NULL;
1686 uint32_t pos, *a = array_at(inv, key);
1687 if (!a) { return NULL; }
1688 if (!(pos = *a)) { goto exit; }
1689 if (!(c = SEN_MALLOC(sizeof(sen_inv_cursor)))) { goto exit; }
1690 memset(c, 0, sizeof(sen_inv_cursor));
/* Packed entry: unpack the single posting into c->pb. */
1694 c->pb.rid = BIT31_12(pos);
1697 if (inv->lexicon->flags & SEN_INDEX_SHARED_LEXICON) {
1699 c->pb.pos = BIT11_01(pos);
1701 c->pb.sid = BIT11_01(pos);
1702 c->pb.pos = sen_sym_pocket_get(inv->lexicon, key);
/* Buffered entry: open the buffer and describe the chunk window to map later. */
1706 c->pb.rid = 0; c->pb.sid = 0;
1707 if ((c->buffer_pseg = buffer_open(inv, pos, &bt, &c->buf)) == SEG_NOT_ASSIGNED) {
1712 c->iw.io = inv->chunk;
1713 c->iw.mode = sen_io_rdonly;
1714 c->iw.segment = c->buf->header.chunk;
1715 c->iw.offset = bt->pos_in_chunk;
1716 c->iw.size = bt->size_in_chunk;
1717 c->nextb = bt->pos_in_buffer;
1718 c->stat = CHUNK_USED|BUFFER_USED;
1721 array_unref(inv, key);
/* Second stage of a two-stage cursor open: map the chunk windows of a batch
 * of cursors with one sen_io_win_mapv() call.
 * Returns sen_memory_exhausted when the temporary window array cannot be
 * allocated; rc otherwise carries the result of sen_io_win_mapv(). */
1726 sen_inv_cursor_openv2(sen_inv_cursor **cursors, int ncursors)
1728 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1729 sen_rc rc = sen_success;
/* NOTE(review): ncursors is a signed int multiplied into the allocation size
 * without a range check. */
1732 sen_io_win **iws = SEN_MALLOC(sizeof(sen_io_win *) * ncursors);
1733 if (!iws) { return sen_memory_exhausted; }
/* Pass 1: pick cursors that have a non-zero stat, a non-empty window and an
 * assigned chunk segment (presumably collecting them into iws, counted by j). */
1734 for (i = 0; i < ncursors; i++) {
1736 if (c->stat && c->iw.size && c->iw.segment != CHUNK_NOT_ASSIGNED) {
/* Map all collected windows at once; skipped when nothing was collected. */
1740 if (j) { rc = sen_io_win_mapv(iws, ctx, j); }
/* Pass 2: derive the chunk read pointers from the mapped window. */
1741 for (i = 0; i < ncursors; i++) {
1744 c->cp = c->iw.addr + c->iw.diff;
1745 c->cpe = c->cp + c->iw.size;
1753 #endif /* USE_AIO */
/* Advance the cursor to its next posting and point c->post at it.
 * Postings come from two sources, the mapped chunk (c->pc) and the in-memory
 * buffer (c->pb). c->stat records which source(s) were consumed by the
 * previous step and therefore must be advanced now; the two current postings
 * are then compared by (rid, sid) and the smaller one is selected.
 * sen_abnormal_error is returned on the error/termination paths below. */
1756 sen_inv_cursor_next(sen_inv_cursor *c)
/* Alternate implementation: the call is handed to sen_inv_cursor_next08(). */
1759 return sen_inv_cursor_next08(c);
/* ---- advance the chunk side ---- */
1763 if (c->stat & CHUNK_USED) {
1764 if (c->cp < c->cpe) {
/* Skip positions of the previous posting that the caller never read. */
1766 if (c->with_pos) { while (c->pc.rest--) { SEN_B_SKIP(c->cpp); } }
/* Marker bytes: 0x8c sets flag bit 1 for this posting only (cleared when the
 * marker is absent); 0x8d and 0x8e toggle flag bits 2 and 4, which persist. */
1767 if (*c->cp == 0x8c) { c->flags |= 1; c->cp++; } else { c->flags &= ~1; }
1768 if (*c->cp == 0x8d) { c->flags ^= 2; c->cp++; }
1769 if (*c->cp == 0x8e) { c->flags ^= 4; c->cp++; }
/* Decode a gap value and the term frequency from the chunk. */
1770 SEN_C_DEC(dgap, c->pc.tf, c->cp);
/* A non-zero gap restarts the sid sequence at 0. */
1772 if (dgap) { c->pc.sid = 0; }
/* Flag bit 4: a sid gap is stored explicitly; otherwise it is 0. */
1773 if (c->flags & 4) { SEN_B_DEC(dgap, c->cp); } else { dgap = 0; }
/* Flag bit 1 or 2: a score is stored; otherwise it is 0. */
1774 if (c->flags & 3) { SEN_B_DEC(c->pc.score, c->cp); } else { c->pc.score = 0; }
1775 c->pc.sid += dgap + 1;
1776 c->pc.rest = c->pc.tf;
/* ---- advance the buffer side ---- */
1782 if (c->stat & BUFFER_USED) {
1784 uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */
1785 buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb);
1786 c->bp = NEXT_ADDR(br);
1787 SEN_B_DEC(c->pb.rid, c->bp);
1788 SEN_B_DEC(c->pb.sid, c->bp);
/* Records must be strictly increasing in (rid, sid); anything else is
 * reported as corruption. */
1789 if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) {
1790 SEN_LOG(sen_log_crit, "brokend!! (%d:%d) -> (%d:%d)", lrid, lsid, c->pb.rid, c->pb.sid);
1791 return sen_abnormal_error;
/* Follow the record's link to the next record. */
1793 c->nextb = br->step;
/* The low bit of the stored tf says whether a score follows; the real tf is
 * the stored value shifted right by one. */
1794 SEN_B_DEC(c->pb.tf, c->bp);
1795 if (c->pb.tf & 1) { SEN_B_DEC(c->pb.score, c->bp); } else { c->pb.score = 0; }
1796 c->pb.rest = c->pb.tf >>= 1;
/* ---- merge: chunk rid smaller -> take the chunk posting ---- */
/* A candidate is returned only when its tf and sid are non-zero. */
1804 if (c->pc.rid < c->pb.rid) {
1805 c->stat = CHUNK_USED;
1806 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
/* buffer rid smaller -> take the buffer posting */
1808 if (c->pb.rid < c->pc.rid) {
1809 c->stat = BUFFER_USED;
1810 if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
/* same rid: order by sid */
1813 if (c->pc.sid < c->pb.sid) {
1814 c->stat = CHUNK_USED;
1815 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
/* Buffer sid is not larger: the buffer posting is the candidate. On an exact
 * (rid, sid) tie the chunk side is marked consumed too, so both advance. */
1817 c->stat = BUFFER_USED;
1818 if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; }
1819 if (c->pb.tf) { c->post = &c->pb; break; }
1822 c->stat = CHUNK_USED;
/* Only one source has a current posting. */
1827 c->stat = BUFFER_USED;
1828 if (c->pb.tf && c->pb.sid) { c->post = &c->pb; break; }
1832 c->stat = CHUNK_USED;
1833 if (c->pc.tf && c->pc.sid) { c->post = &c->pc; break; }
1836 return sen_abnormal_error;
/* Final branch: SOLE_DOC_USED is set on the first step; a step made when it
 * is already set returns sen_abnormal_error. */
1841 if (c->stat & SOLE_DOC_USED) {
1843 return sen_abnormal_error;
1846 c->stat |= SOLE_DOC_USED;
/* Advance to the next position within the posting selected by
 * sen_inv_cursor_next().
 * The position gap is decoded from the chunk position stream (c->cpp) when
 * the current posting is c->pc, or from the buffer record (c->bp) when it is
 * c->pb. rc starts as sen_success and becomes sen_abnormal_error on the
 * failure branches. */
1853 sen_inv_cursor_next_pos(sen_inv_cursor *c)
1856 sen_rc rc = sen_success;
/* Alternate implementation: the call is handed to sen_inv_cursor_next_pos08(). */
1858 return sen_inv_cursor_next_pos08(c);
/* Current posting comes from the chunk. */
1862 if (c->post == &c->pc) {
1865 SEN_B_DEC(gap, c->cpp);
1868 rc = sen_abnormal_error;
/* Current posting comes from the buffer. */
1870 } else if (c->post == &c->pb) {
1873 SEN_B_DEC(gap, c->bp);
1876 rc = sen_abnormal_error;
/* c->post points at neither posting. */
1879 rc = sen_abnormal_error;
/* Final branch: SOLE_POS_USED is set on the first call; a call made when it
 * is already set fails. */
1882 if (c->stat & SOLE_POS_USED) {
1883 rc = sen_abnormal_error;
1885 c->stat |= SOLE_POS_USED;
/* Release the resources held by a cursor: the chunk window is unmapped when
 * c->cp is set, and the buffer segment is closed when c->buf is set. */
1893 sen_inv_cursor_close(sen_inv_cursor *c)
/* NOTE(review): c is dereferenced here, before the NULL check further down;
 * a NULL cursor would fault before sen_invalid_argument can be returned. */
1895 sen_ctx *ctx = c->iw.ctx;
/* Alternate implementation: the call is handed to sen_inv_cursor_close08(). */
1897 return sen_inv_cursor_close08(c);
1899 if (!c) { return sen_invalid_argument; }
1900 if (c->cp) { sen_io_win_unmap(&c->iw); }
1901 if (c->buf) { buffer_close(c->inv, c->buffer_pseg); }
/* Estimate the number of postings stored for term `key`.
 * Returns 0 when array_at() finds no entry. For a buffered entry the estimate
 * is (size_in_chunk / 2) + size_in_buffer + 2, computed from the term's
 * buffer_term record. */
1907 sen_inv_estimate_size(sen_inv *inv, uint32_t key)
1909 uint32_t res, pos, *a;
/* Alternate implementation: the call is handed to sen_inv_estimate_size08(). */
1911 return sen_inv_estimate_size08(inv, key);
1913 a = array_at(inv, key);
1914 if (!a) { return 0; }
1922 if ((pseg = buffer_open(inv, pos, &bt, &buf)) == SEG_NOT_ASSIGNED) {
1925 res = (bt->size_in_chunk >> 1) + bt->size_in_buffer + 2;
1926 buffer_close(inv, pseg);
/* The array reference taken by array_at() is dropped before returning. */
1932 array_unref(inv, key);
/* Diagnostic dump of how term `key` is stored.
 * Return codes visible below:
 *   0 - array_at() found no entry for key
 *   1 - the entry word (*a) is zero
 *   2 - the entry word has its low bit set (packed single posting)
 *   3 - the entry's buffer could not be opened
 * For a buffered entry the buffer header fields (chunk, chunk_size,
 * buffer_free, nterms) and the buffer_term fields (size/pos in chunk and in
 * buffer) are copied to the matching out-parameters. */
1937 sen_inv_entry_info(sen_inv *inv, unsigned key, unsigned *a, unsigned *pocket,
1938 unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
1939 unsigned *nterms, unsigned *nterms_void, unsigned *tid,
1940 unsigned *size_in_chunk, unsigned *pos_in_chunk,
1941 unsigned *size_in_buffer, unsigned *pos_in_buffer)
/* Alternate implementation: all arguments are forwarded to sen_inv_entry_info08(). */
1949 return sen_inv_entry_info08(inv, key, a, pocket,
1950 chunk, chunk_size, buffer_free,
1951 nterms, nterms_void, tid,
1952 size_in_chunk, pos_in_chunk,
1953 size_in_buffer, pos_in_buffer);
1955 ap = array_at(inv, key);
/* *pocket is written even when the entry lookup above failed. */
1956 *pocket = sen_sym_pocket_get(inv->lexicon, key);
1957 if (!ap) { return 0; }
/* The array reference is released before *a is examined. */
1959 array_unref(inv, key);
1960 if (!*a) { return 1; }
1961 if (*a & 1) { return 2; }
1962 if ((pseg = buffer_open(inv, *a, &bt, &b)) == SEG_NOT_ASSIGNED) { return 3; }
1963 *chunk = b->header.chunk;
1964 *chunk_size = b->header.chunk_size;
1965 *buffer_free = b->header.buffer_free;
1966 *nterms = b->header.nterms;
1968 *size_in_chunk = bt->size_in_chunk;
1969 *pos_in_chunk = bt->pos_in_chunk;
1970 *size_in_buffer = bt->size_in_buffer;
1971 *pos_in_buffer = bt->pos_in_buffer;
1972 buffer_close(inv, pseg);
/* Return the path reported by sen_io_path() for the index's segment io
 * (inv->seg). inv is dereferenced without a NULL check. */
1977 sen_inv_path(sen_inv *inv)
1979 return sen_io_path(inv->seg);
/* Return the largest section id recorded in the index header
 * (inv->header->smax, which sen_inv_update() raises as postings arrive).
 * One branch returns the constant SEN_SYM_MAX_ID instead.
 * NOTE(review): presumably that branch is the alternate-format case that the
 * other entry points hand to their *08 variants - confirm. */
1983 sen_inv_max_section(sen_inv *inv)
1986 return SEN_SYM_MAX_ID;
1988 return inv->header->smax;