Add a test for the fixes on this branch.
[sqlite.git] / ext / fts5 / fts5_hash.c
blob5e0959aa8ef3ba324c173f4f189d1aaeacb15c53
1 /*
2 ** 2014 August 11
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
17 #include "fts5Int.h"
19 typedef struct Fts5HashEntry Fts5HashEntry;
22 ** This file contains the implementation of an in-memory hash table used
23 ** to accumuluate "term -> doclist" content before it is flused to a level-0
24 ** segment.
28 struct Fts5Hash {
29 int eDetail; /* Copy of Fts5Config.eDetail */
30 int *pnByte; /* Pointer to bytes counter */
31 int nEntry; /* Number of entries currently in hash */
32 int nSlot; /* Size of aSlot[] array */
33 Fts5HashEntry *pScan; /* Current ordered scan item */
34 Fts5HashEntry **aSlot; /* Array of hash slots */
38 ** Each entry in the hash table is represented by an object of the
39 ** following type. Each object, its key, and its current data are stored
40 ** in a single memory allocation. The key immediately follows the object
41 ** in memory. The position list data immediately follows the key data
42 ** in memory.
44 ** The key is Fts5HashEntry.nKey bytes in size. It consists of a single
45 ** byte identifying the index (either the main term index or a prefix-index),
46 ** followed by the term data. For example: "0token". There is no
47 ** nul-terminator - in this case nKey=6.
49 ** The data that follows the key is in a similar, but not identical format
50 ** to the doclist data stored in the database. It is:
52 ** * Rowid, as a varint
53 ** * Position list, without 0x00 terminator.
54 ** * Size of previous position list and rowid, as a 4 byte
55 ** big-endian integer.
57 ** iRowidOff:
58 ** Offset of last rowid written to data area. Relative to first byte of
59 ** structure.
61 ** nData:
62 ** Bytes of data written since iRowidOff.
64 struct Fts5HashEntry {
65 Fts5HashEntry *pHashNext; /* Next hash entry with same hash-key */
66 Fts5HashEntry *pScanNext; /* Next entry in sorted order */
68 int nAlloc; /* Total size of allocation */
69 int iSzPoslist; /* Offset of space for 4-byte poslist size */
70 int nData; /* Total bytes of data (incl. structure) */
71 int nKey; /* Length of key in bytes */
72 u8 bDel; /* Set delete-flag @ iSzPoslist */
73 u8 bContent; /* Set content-flag (detail=none mode) */
74 i16 iCol; /* Column of last value written */
75 int iPos; /* Position of last value written */
76 i64 iRowid; /* Rowid of last value written */
80 ** Eqivalent to:
82 ** char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; }
84 #define fts5EntryKey(p) ( ((char *)(&(p)[1])) )
88 ** Allocate a new hash table.
90 int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
91 int rc = SQLITE_OK;
92 Fts5Hash *pNew;
94 *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
95 if( pNew==0 ){
96 rc = SQLITE_NOMEM;
97 }else{
98 sqlite3_int64 nByte;
99 memset(pNew, 0, sizeof(Fts5Hash));
100 pNew->pnByte = pnByte;
101 pNew->eDetail = pConfig->eDetail;
103 pNew->nSlot = 1024;
104 nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
105 pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc64(nByte);
106 if( pNew->aSlot==0 ){
107 sqlite3_free(pNew);
108 *ppNew = 0;
109 rc = SQLITE_NOMEM;
110 }else{
111 memset(pNew->aSlot, 0, (size_t)nByte);
114 return rc;
118 ** Free a hash table object.
120 void sqlite3Fts5HashFree(Fts5Hash *pHash){
121 if( pHash ){
122 sqlite3Fts5HashClear(pHash);
123 sqlite3_free(pHash->aSlot);
124 sqlite3_free(pHash);
129 ** Empty (but do not delete) a hash table.
131 void sqlite3Fts5HashClear(Fts5Hash *pHash){
132 int i;
133 for(i=0; i<pHash->nSlot; i++){
134 Fts5HashEntry *pNext;
135 Fts5HashEntry *pSlot;
136 for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){
137 pNext = pSlot->pHashNext;
138 sqlite3_free(pSlot);
141 memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*));
142 pHash->nEntry = 0;
145 static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){
146 int i;
147 unsigned int h = 13;
148 for(i=n-1; i>=0; i--){
149 h = (h << 3) ^ h ^ p[i];
151 return (h % nSlot);
154 static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){
155 int i;
156 unsigned int h = 13;
157 for(i=n-1; i>=0; i--){
158 h = (h << 3) ^ h ^ p[i];
160 h = (h << 3) ^ h ^ b;
161 return (h % nSlot);
165 ** Resize the hash table by doubling the number of slots.
167 static int fts5HashResize(Fts5Hash *pHash){
168 int nNew = pHash->nSlot*2;
169 int i;
170 Fts5HashEntry **apNew;
171 Fts5HashEntry **apOld = pHash->aSlot;
173 apNew = (Fts5HashEntry**)sqlite3_malloc64(nNew*sizeof(Fts5HashEntry*));
174 if( !apNew ) return SQLITE_NOMEM;
175 memset(apNew, 0, nNew*sizeof(Fts5HashEntry*));
177 for(i=0; i<pHash->nSlot; i++){
178 while( apOld[i] ){
179 unsigned int iHash;
180 Fts5HashEntry *p = apOld[i];
181 apOld[i] = p->pHashNext;
182 iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p), p->nKey);
183 p->pHashNext = apNew[iHash];
184 apNew[iHash] = p;
188 sqlite3_free(apOld);
189 pHash->nSlot = nNew;
190 pHash->aSlot = apNew;
191 return SQLITE_OK;
194 static int fts5HashAddPoslistSize(
195 Fts5Hash *pHash,
196 Fts5HashEntry *p,
197 Fts5HashEntry *p2
199 int nRet = 0;
200 if( p->iSzPoslist ){
201 u8 *pPtr = p2 ? (u8*)p2 : (u8*)p;
202 int nData = p->nData;
203 if( pHash->eDetail==FTS5_DETAIL_NONE ){
204 assert( nData==p->iSzPoslist );
205 if( p->bDel ){
206 pPtr[nData++] = 0x00;
207 if( p->bContent ){
208 pPtr[nData++] = 0x00;
211 }else{
212 int nSz = (nData - p->iSzPoslist - 1); /* Size in bytes */
213 int nPos = nSz*2 + p->bDel; /* Value of nPos field */
215 assert( p->bDel==0 || p->bDel==1 );
216 if( nPos<=127 ){
217 pPtr[p->iSzPoslist] = (u8)nPos;
218 }else{
219 int nByte = sqlite3Fts5GetVarintLen((u32)nPos);
220 memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz);
221 sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos);
222 nData += (nByte-1);
226 nRet = nData - p->nData;
227 if( p2==0 ){
228 p->iSzPoslist = 0;
229 p->bDel = 0;
230 p->bContent = 0;
231 p->nData = nData;
234 return nRet;
238 ** Add an entry to the in-memory hash table. The key is the concatenation
239 ** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos).
241 ** (bByte || pToken) -> (iRowid,iCol,iPos)
243 ** Or, if iCol is negative, then the value is a delete marker.
245 int sqlite3Fts5HashWrite(
246 Fts5Hash *pHash,
247 i64 iRowid, /* Rowid for this entry */
248 int iCol, /* Column token appears in (-ve -> delete) */
249 int iPos, /* Position of token within column */
250 char bByte, /* First byte of token */
251 const char *pToken, int nToken /* Token to add or remove to or from index */
253 unsigned int iHash;
254 Fts5HashEntry *p;
255 u8 *pPtr;
256 int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */
257 int bNew; /* If non-delete entry should be written */
259 bNew = (pHash->eDetail==FTS5_DETAIL_FULL);
261 /* Attempt to locate an existing hash entry */
262 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
263 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
264 char *zKey = fts5EntryKey(p);
265 if( zKey[0]==bByte
266 && p->nKey==nToken+1
267 && memcmp(&zKey[1], pToken, nToken)==0
269 break;
273 /* If an existing hash entry cannot be found, create a new one. */
274 if( p==0 ){
275 /* Figure out how much space to allocate */
276 char *zKey;
277 sqlite3_int64 nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
278 if( nByte<128 ) nByte = 128;
280 /* Grow the Fts5Hash.aSlot[] array if necessary. */
281 if( (pHash->nEntry*2)>=pHash->nSlot ){
282 int rc = fts5HashResize(pHash);
283 if( rc!=SQLITE_OK ) return rc;
284 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
287 /* Allocate new Fts5HashEntry and add it to the hash table. */
288 p = (Fts5HashEntry*)sqlite3_malloc64(nByte);
289 if( !p ) return SQLITE_NOMEM;
290 memset(p, 0, sizeof(Fts5HashEntry));
291 p->nAlloc = (int)nByte;
292 zKey = fts5EntryKey(p);
293 zKey[0] = bByte;
294 memcpy(&zKey[1], pToken, nToken);
295 assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
296 p->nKey = nToken+1;
297 zKey[nToken+1] = '\0';
298 p->nData = nToken+1 + sizeof(Fts5HashEntry);
299 p->pHashNext = pHash->aSlot[iHash];
300 pHash->aSlot[iHash] = p;
301 pHash->nEntry++;
303 /* Add the first rowid field to the hash-entry */
304 p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
305 p->iRowid = iRowid;
307 p->iSzPoslist = p->nData;
308 if( pHash->eDetail!=FTS5_DETAIL_NONE ){
309 p->nData += 1;
310 p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
313 }else{
315 /* Appending to an existing hash-entry. Check that there is enough
316 ** space to append the largest possible new entry. Worst case scenario
317 ** is:
319 ** + 9 bytes for a new rowid,
320 ** + 4 byte reserved for the "poslist size" varint.
321 ** + 1 byte for a "new column" byte,
322 ** + 3 bytes for a new column number (16-bit max) as a varint,
323 ** + 5 bytes for the new position offset (32-bit max).
325 if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){
326 sqlite3_int64 nNew = p->nAlloc * 2;
327 Fts5HashEntry *pNew;
328 Fts5HashEntry **pp;
329 pNew = (Fts5HashEntry*)sqlite3_realloc64(p, nNew);
330 if( pNew==0 ) return SQLITE_NOMEM;
331 pNew->nAlloc = (int)nNew;
332 for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext);
333 *pp = pNew;
334 p = pNew;
336 nIncr -= p->nData;
338 assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) );
340 pPtr = (u8*)p;
342 /* If this is a new rowid, append the 4-byte size field for the previous
343 ** entry, and the new rowid for this entry. */
344 if( iRowid!=p->iRowid ){
345 u64 iDiff = (u64)iRowid - (u64)p->iRowid;
346 fts5HashAddPoslistSize(pHash, p, 0);
347 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iDiff);
348 p->iRowid = iRowid;
349 bNew = 1;
350 p->iSzPoslist = p->nData;
351 if( pHash->eDetail!=FTS5_DETAIL_NONE ){
352 p->nData += 1;
353 p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
354 p->iPos = 0;
358 if( iCol>=0 ){
359 if( pHash->eDetail==FTS5_DETAIL_NONE ){
360 p->bContent = 1;
361 }else{
362 /* Append a new column value, if necessary */
363 assert_nc( iCol>=p->iCol );
364 if( iCol!=p->iCol ){
365 if( pHash->eDetail==FTS5_DETAIL_FULL ){
366 pPtr[p->nData++] = 0x01;
367 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
368 p->iCol = (i16)iCol;
369 p->iPos = 0;
370 }else{
371 bNew = 1;
372 p->iCol = (i16)(iPos = iCol);
376 /* Append the new position offset, if necessary */
377 if( bNew ){
378 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
379 p->iPos = iPos;
382 }else{
383 /* This is a delete. Set the delete flag. */
384 p->bDel = 1;
387 nIncr += p->nData;
388 *pHash->pnByte += nIncr;
389 return SQLITE_OK;
394 ** Arguments pLeft and pRight point to linked-lists of hash-entry objects,
395 ** each sorted in key order. This function merges the two lists into a
396 ** single list and returns a pointer to its first element.
398 static Fts5HashEntry *fts5HashEntryMerge(
399 Fts5HashEntry *pLeft,
400 Fts5HashEntry *pRight
402 Fts5HashEntry *p1 = pLeft;
403 Fts5HashEntry *p2 = pRight;
404 Fts5HashEntry *pRet = 0;
405 Fts5HashEntry **ppOut = &pRet;
407 while( p1 || p2 ){
408 if( p1==0 ){
409 *ppOut = p2;
410 p2 = 0;
411 }else if( p2==0 ){
412 *ppOut = p1;
413 p1 = 0;
414 }else{
415 char *zKey1 = fts5EntryKey(p1);
416 char *zKey2 = fts5EntryKey(p2);
417 int nMin = MIN(p1->nKey, p2->nKey);
419 int cmp = memcmp(zKey1, zKey2, nMin);
420 if( cmp==0 ){
421 cmp = p1->nKey - p2->nKey;
423 assert( cmp!=0 );
425 if( cmp>0 ){
426 /* p2 is smaller */
427 *ppOut = p2;
428 ppOut = &p2->pScanNext;
429 p2 = p2->pScanNext;
430 }else{
431 /* p1 is smaller */
432 *ppOut = p1;
433 ppOut = &p1->pScanNext;
434 p1 = p1->pScanNext;
436 *ppOut = 0;
440 return pRet;
444 ** Link all tokens from hash table iHash into a list in sorted order. The
445 ** tokens are not removed from the hash table.
447 static int fts5HashEntrySort(
448 Fts5Hash *pHash,
449 const char *pTerm, int nTerm, /* Query prefix, if any */
450 Fts5HashEntry **ppSorted
452 const int nMergeSlot = 32;
453 Fts5HashEntry **ap;
454 Fts5HashEntry *pList;
455 int iSlot;
456 int i;
458 *ppSorted = 0;
459 ap = sqlite3_malloc64(sizeof(Fts5HashEntry*) * nMergeSlot);
460 if( !ap ) return SQLITE_NOMEM;
461 memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot);
463 for(iSlot=0; iSlot<pHash->nSlot; iSlot++){
464 Fts5HashEntry *pIter;
465 for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
466 if( pTerm==0
467 || (pIter->nKey>=nTerm && 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm))
469 Fts5HashEntry *pEntry = pIter;
470 pEntry->pScanNext = 0;
471 for(i=0; ap[i]; i++){
472 pEntry = fts5HashEntryMerge(pEntry, ap[i]);
473 ap[i] = 0;
475 ap[i] = pEntry;
480 pList = 0;
481 for(i=0; i<nMergeSlot; i++){
482 pList = fts5HashEntryMerge(pList, ap[i]);
485 sqlite3_free(ap);
486 *ppSorted = pList;
487 return SQLITE_OK;
491 ** Query the hash table for a doclist associated with term pTerm/nTerm.
493 int sqlite3Fts5HashQuery(
494 Fts5Hash *pHash, /* Hash table to query */
495 int nPre,
496 const char *pTerm, int nTerm, /* Query term */
497 void **ppOut, /* OUT: Pointer to new object */
498 int *pnDoclist /* OUT: Size of doclist in bytes */
500 unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
501 char *zKey = 0;
502 Fts5HashEntry *p;
504 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
505 zKey = fts5EntryKey(p);
506 if( nTerm==p->nKey && memcmp(zKey, pTerm, nTerm)==0 ) break;
509 if( p ){
510 int nHashPre = sizeof(Fts5HashEntry) + nTerm;
511 int nList = p->nData - nHashPre;
512 u8 *pRet = (u8*)(*ppOut = sqlite3_malloc64(nPre + nList + 10));
513 if( pRet ){
514 Fts5HashEntry *pFaux = (Fts5HashEntry*)&pRet[nPre-nHashPre];
515 memcpy(&pRet[nPre], &((u8*)p)[nHashPre], nList);
516 nList += fts5HashAddPoslistSize(pHash, p, pFaux);
517 *pnDoclist = nList;
518 }else{
519 *pnDoclist = 0;
520 return SQLITE_NOMEM;
522 }else{
523 *ppOut = 0;
524 *pnDoclist = 0;
527 return SQLITE_OK;
530 int sqlite3Fts5HashScanInit(
531 Fts5Hash *p, /* Hash table to query */
532 const char *pTerm, int nTerm /* Query prefix */
534 return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
537 #ifdef SQLITE_DEBUG
538 static int fts5HashCount(Fts5Hash *pHash){
539 int nEntry = 0;
540 int ii;
541 for(ii=0; ii<pHash->nSlot; ii++){
542 Fts5HashEntry *p = 0;
543 for(p=pHash->aSlot[ii]; p; p=p->pHashNext){
544 nEntry++;
547 return nEntry;
549 #endif
552 ** Return true if the hash table is empty, false otherwise.
554 int sqlite3Fts5HashIsEmpty(Fts5Hash *pHash){
555 assert( pHash->nEntry==fts5HashCount(pHash) );
556 return pHash->nEntry==0;
559 void sqlite3Fts5HashScanNext(Fts5Hash *p){
560 assert( !sqlite3Fts5HashScanEof(p) );
561 p->pScan = p->pScan->pScanNext;
564 int sqlite3Fts5HashScanEof(Fts5Hash *p){
565 return (p->pScan==0);
568 void sqlite3Fts5HashScanEntry(
569 Fts5Hash *pHash,
570 const char **pzTerm, /* OUT: term (nul-terminated) */
571 int *pnTerm, /* OUT: Size of term in bytes */
572 const u8 **ppDoclist, /* OUT: pointer to doclist */
573 int *pnDoclist /* OUT: size of doclist in bytes */
575 Fts5HashEntry *p;
576 if( (p = pHash->pScan) ){
577 char *zKey = fts5EntryKey(p);
578 int nTerm = p->nKey;
579 fts5HashAddPoslistSize(pHash, p, 0);
580 *pzTerm = zKey;
581 *pnTerm = nTerm;
582 *ppDoclist = (const u8*)&zKey[nTerm];
583 *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm);
584 }else{
585 *pzTerm = 0;
586 *pnTerm = 0;
587 *ppDoclist = 0;
588 *pnDoclist = 0;