lib/tdb/common/freelist.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 /* 'right' merges can involve O(n^2) cost when combined with a
  31    traverse, so they are disabled until we find a way to do them in
  32    O(1) time
  33 */
  34 #define USE_RIGHT_MERGES 0
  35
  36 /* read a freelist record and check for simple errors */
  37 int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct tdb_record *rec)
  38 {
  39         if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
  40                 return -1;
  41
  42         if (rec->magic == TDB_MAGIC) {
  43                 /* this happens when a app is showdown while deleting a record - we should
  44                    not completely fail when this happens */
  45                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%u - fixing\n",
  46                          rec->magic, off));
  47                 rec->magic = TDB_FREE_MAGIC;
  48                 if (tdb_rec_write(tdb, off, rec) == -1)
  49                         return -1;
  50         }
  51
  52         if (rec->magic != TDB_FREE_MAGIC) {
  53                 /* Ensure ecode is set for log fn. */
  54                 tdb->ecode = TDB_ERR_CORRUPT;
  55                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%u\n",
  56                            rec->magic, off));
  57                 return -1;
  58         }
  59         if (tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0) != 0)
  60                 return -1;
  61         return 0;
  62 }
  63
  64
  65 #if USE_RIGHT_MERGES
  66 /* Remove an element from the freelist.  Must have alloc lock. */
  67 static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
  68 {
  69         tdb_off_t last_ptr, i;
  70
  71         /* read in the freelist top */
  72         last_ptr = FREELIST_TOP;
  73         while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
  74                 if (i == off) {
  75                         /* We've found it! */
  76                         return tdb_ofs_write(tdb, last_ptr, &next);
  77                 }
  78                 /* Follow chain (next offset is at start of record) */
  79                 last_ptr = i;
  80         }
  81         tdb->ecode = TDB_ERR_CORRUPT;
  82         TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%u\n", off));
  83         return -1;
  84 }
  85 #endif
  86
  87
  88 /* update a record tailer (must hold allocation lock) */
  89 static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
  90                          const struct tdb_record *rec)
  91 {
  92         tdb_off_t totalsize;
  93
  94         /* Offset of tailer from record header */
  95         totalsize = sizeof(*rec) + rec->rec_len;
  96         return tdb_ofs_write(tdb, offset + totalsize - sizeof(tdb_off_t),
  97                          &totalsize);
  98 }
  99
 100 /**
 101  * Read the record directly on the left.
 102  * Fail if there is no record on the left.
 103  */
 104 static int read_record_on_left(struct tdb_context *tdb, tdb_off_t rec_ptr,
 105                                tdb_off_t *left_p,
 106                                struct tdb_record *left_r)
 107 {
 108         tdb_off_t left_ptr;
 109         tdb_off_t left_size;
 110         struct tdb_record left_rec;
 111         int ret;
 112
 113         left_ptr = rec_ptr - sizeof(tdb_off_t);
 114
 115         if (left_ptr <= TDB_DATA_START(tdb->hash_size)) {
 116                 /* no record on the left */
 117                 return -1;
 118         }
 119
 120         /* Read in tailer and jump back to header */
 121         ret = tdb_ofs_read(tdb, left_ptr, &left_size);
 122         if (ret == -1) {
 123                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 124                         "tdb_free: left offset read failed at %u\n", left_ptr));
 125                 return -1;
 126         }
 127
 128         /* it could be uninitialised data */
 129         if (left_size == 0 || left_size == TDB_PAD_U32) {
 130                 return -1;
 131         }
 132
 133         if (left_size > rec_ptr) {
 134                 return -1;
 135         }
 136
 137         left_ptr = rec_ptr - left_size;
 138
 139         if (left_ptr < TDB_DATA_START(tdb->hash_size)) {
 140                 return -1;
 141         }
 142
 143         /* Now read in the left record */
 144         ret = tdb->methods->tdb_read(tdb, left_ptr, &left_rec,
 145                                      sizeof(left_rec), DOCONV());
 146         if (ret == -1) {
 147                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 148                          "tdb_free: left read failed at %u (%u)\n",
 149                          left_ptr, left_size));
 150                 return -1;
 151         }
 152
 153         *left_p = left_ptr;
 154         *left_r = left_rec;
 155
 156         return 0;
 157 }
 158
 159 /**
 160  * Merge new freelist record with the direct left neighbour.
 161  * This assumes that left_rec represents the record
 162  * directly to the left of right_rec and that this is
 163  * a freelist record.
 164  */
 165 static int merge_with_left_record(struct tdb_context *tdb,
 166                                   tdb_off_t left_ptr,
 167                                   struct tdb_record *left_rec,
 168                                   struct tdb_record *right_rec)
 169 {
 170         int ret;
 171
 172         left_rec->rec_len += sizeof(*right_rec) + right_rec->rec_len;
 173
 174         ret = tdb_rec_write(tdb, left_ptr, left_rec);
 175         if (ret == -1) {
 176                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 177                          "merge_with_left_record: update_left failed at %u\n",
 178                          left_ptr));
 179                 return -1;
 180         }
 181
 182         ret = update_tailer(tdb, left_ptr, left_rec);
 183         if (ret == -1) {
 184                 TDB_LOG((tdb, TDB_DEBUG_FATAL,
 185                          "merge_with_left_record: update_tailer failed at %u\n",
 186                          left_ptr));
 187                 return -1;
 188         }
 189
 190         return 0;
 191 }
 192
 193 /* Add an element into the freelist. Merge adjacent records if
 194    necessary. */
 195 int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
 196 {
 197         tdb_off_t left;
 198         struct tdb_record l;
 199
 200         /* Allocation and tailer lock */
 201         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 202                 return -1;
 203
 204         /* set an initial tailer, so if we fail we don't leave a bogus record */
 205         if (update_tailer(tdb, offset, rec) != 0) {
 206                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
 207                 goto fail;
 208         }
 209
 210 #if USE_RIGHT_MERGES
 211         /* Look right first (I'm an Australian, dammit) */
 212         if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) {
 213                 tdb_off_t right = offset + sizeof(*rec) + rec->rec_len;
 214                 struct tdb_record r;
 215
 216                 if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 217                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
 218                         goto left;
 219                 }
 220
 221                 /* If it's free, expand to include it. */
 222                 if (r.magic == TDB_FREE_MAGIC) {
 223                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 224                                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
 225                                 goto left;
 226                         }
 227                         rec->rec_len += sizeof(r) + r.rec_len;
 228                         if (update_tailer(tdb, offset, rec) == -1) {
 229                                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
 230                                 goto fail;
 231                         }
 232                 }
 233         }
 234 left:
 235 #endif
 236
 237         if (read_record_on_left(tdb, offset, &left, &l) != 0) {
 238                 goto update;
 239         }
 240
 241         if (l.magic != TDB_FREE_MAGIC) {
 242                 goto update;
 243         }
 244
 245         /* It's free - expand to include it. */
 246
 247         /* we now merge the new record into the left record, rather than the other
 248            way around. This makes the operation O(1) instead of O(n). This change
 249            prevents traverse from being O(n^2) after a lot of deletes */
 250
 251         if (merge_with_left_record(tdb, left, &l, rec) != 0) {
 252                 goto fail;
 253         }
 254
 255         tdb_unlock(tdb, -1, F_WRLCK);
 256         return 0;
 257
 258 update:
 259
 260         /* Now, prepend to free list */
 261         rec->magic = TDB_FREE_MAGIC;
 262
 263         if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 264             tdb_rec_write(tdb, offset, rec) == -1 ||
 265             tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 266                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%u\n", offset));
 267                 goto fail;
 268         }
 269
 270         /* And we're done. */
 271         tdb_unlock(tdb, -1, F_WRLCK);
 272         return 0;
 273
 274  fail:
 275         tdb_unlock(tdb, -1, F_WRLCK);
 276         return -1;
 277 }
 278
 279
 280
 281 /*
 282    the core of tdb_allocate - called when we have decided which
 283    free list entry to use
 284
 285    Note that we try to allocate by grabbing data from the end of an existing record,
 286    not the beginning. This is so the left merge in a free is more likely to be
 287    able to free up the record without fragmentation
 288  */
 289 static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb,
 290                                   tdb_len_t length, tdb_off_t rec_ptr,
 291                                   struct tdb_record *rec, tdb_off_t last_ptr)
 292 {
 293 #define MIN_REC_SIZE (sizeof(struct tdb_record) + sizeof(tdb_off_t) + 8)
 294
 295         if (rec->rec_len < length + MIN_REC_SIZE) {
 296                 /* we have to grab the whole record */
 297
 298                 /* unlink it from the previous record */
 299                 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
 300                         return 0;
 301                 }
 302
 303                 /* mark it not free */
 304                 rec->magic = TDB_MAGIC;
 305                 if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
 306                         return 0;
 307                 }
 308                 return rec_ptr;
 309         }
 310
 311         /* we're going to just shorten the existing record */
 312         rec->rec_len -= (length + sizeof(*rec));
 313         if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
 314                 return 0;
 315         }
 316         if (update_tailer(tdb, rec_ptr, rec) == -1) {
 317                 return 0;
 318         }
 319
 320         /* and setup the new record */
 321         rec_ptr += sizeof(*rec) + rec->rec_len;
 322
 323         memset(rec, '\0', sizeof(*rec));
 324         rec->rec_len = length;
 325         rec->magic = TDB_MAGIC;
 326
 327         if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
 328                 return 0;
 329         }
 330
 331         if (update_tailer(tdb, rec_ptr, rec) == -1) {
 332                 return 0;
 333         }
 334
 335         return rec_ptr;
 336 }
 337
 338 /* allocate some space from the free list. The offset returned points
 339    to a unconnected tdb_record within the database with room for at
 340    least length bytes of total data
 341
 342    0 is returned if the space could not be allocated
 343  */
 344 static tdb_off_t tdb_allocate_from_freelist(
 345         struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec)
 346 {
 347         tdb_off_t rec_ptr, last_ptr, newrec_ptr;
 348         struct {
 349                 tdb_off_t rec_ptr, last_ptr;
 350                 tdb_len_t rec_len;
 351         } bestfit;
 352         float multiplier = 1.0;
 353
 354         /* over-allocate to reduce fragmentation */
 355         length *= 1.25;
 356
 357         /* Extra bytes required for tailer */
 358         length += sizeof(tdb_off_t);
 359         length = TDB_ALIGN(length, TDB_ALIGNMENT);
 360
 361  again:
 362         last_ptr = FREELIST_TOP;
 363
 364         /* read in the freelist top */
 365         if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 366                 return 0;
 367
 368         bestfit.rec_ptr = 0;
 369         bestfit.last_ptr = 0;
 370         bestfit.rec_len = 0;
 371
 372         /*
 373            this is a best fit allocation strategy. Originally we used
 374            a first fit strategy, but it suffered from massive fragmentation
 375            issues when faced with a slowly increasing record size.
 376          */
 377         while (rec_ptr) {
 378                 if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) {
 379                         return 0;
 380                 }
 381
 382                 if (rec->rec_len >= length) {
 383                         if (bestfit.rec_ptr == 0 ||
 384                             rec->rec_len < bestfit.rec_len) {
 385                                 bestfit.rec_len = rec->rec_len;
 386                                 bestfit.rec_ptr = rec_ptr;
 387                                 bestfit.last_ptr = last_ptr;
 388                         }
 389                 }
 390
 391                 /* move to the next record */
 392                 last_ptr = rec_ptr;
 393                 rec_ptr = rec->next;
 394
 395                 /* if we've found a record that is big enough, then
 396                    stop searching if its also not too big. The
 397                    definition of 'too big' changes as we scan
 398                    through */
 399                 if (bestfit.rec_len > 0 &&
 400                     bestfit.rec_len < length * multiplier) {
 401                         break;
 402                 }
 403
 404                 /* this multiplier means we only extremely rarely
 405                    search more than 50 or so records. At 50 records we
 406                    accept records up to 11 times larger than what we
 407                    want */
 408                 multiplier *= 1.05;
 409         }
 410
 411         if (bestfit.rec_ptr != 0) {
 412                 if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
 413                         return 0;
 414                 }
 415
 416                 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr,
 417                                               rec, bestfit.last_ptr);
 418                 return newrec_ptr;
 419         }
 420
 421         /* we didn't find enough space. See if we can expand the
 422            database and if we can then try again */
 423         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 424                 goto again;
 425
 426         return 0;
 427 }
 428
 429 static bool tdb_alloc_dead(
 430         struct tdb_context *tdb, int hash, tdb_len_t length,
 431         tdb_off_t *rec_ptr, struct tdb_record *rec)
 432 {
 433         tdb_off_t last_ptr;
 434
 435         *rec_ptr = tdb_find_dead(tdb, hash, rec, length, &last_ptr);
 436         if (*rec_ptr == 0) {
 437                 return false;
 438         }
 439         /*
 440          * Unlink the record from the hash chain, it's about to be moved into
 441          * another one.
 442          */
 443         return (tdb_ofs_write(tdb, last_ptr, &rec->next) == 0);
 444 }
 445
 446 /*
 447  * Chain "hash" is assumed to be locked
 448  */
 449
 450 tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length,
 451                        struct tdb_record *rec)
 452 {
 453         tdb_off_t ret;
 454         int i;
 455
 456         if (tdb->max_dead_records == 0) {
 457                 /*
 458                  * No dead records to expect anywhere. Do the blocking
 459                  * freelist lock without trying to steal from others
 460                  */
 461                 goto blocking_freelist_allocate;
 462         }
 463
 464         /*
 465          * The following loop tries to get the freelist lock nonblocking. If
 466          * it gets the lock, allocate from there. If the freelist is busy,
 467          * instead of waiting we try to steal dead records from other hash
 468          * chains.
 469          *
 470          * Be aware that we do nonblocking locks on the other hash chains as
 471          * well and fail gracefully. This way we avoid deadlocks (we block two
 472          * hash chains, something which is pretty bad normally)
 473          */
 474
 475         for (i=0; i<tdb->hash_size; i++) {
 476
 477                 int list;
 478
 479                 list = BUCKET(hash+i);
 480
 481                 if (tdb_lock_nonblock(tdb, list, F_WRLCK) == 0) {
 482                         bool got_dead;
 483
 484                         got_dead = tdb_alloc_dead(tdb, list, length, &ret, rec);
 485                         tdb_unlock(tdb, list, F_WRLCK);
 486
 487                         if (got_dead) {
 488                                 return ret;
 489                         }
 490                 }
 491
 492                 if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == 0) {
 493                         /*
 494                          * Under the freelist lock take the chance to give
 495                          * back our dead records.
 496                          */
 497                         tdb_purge_dead(tdb, hash);
 498
 499                         ret = tdb_allocate_from_freelist(tdb, length, rec);
 500                         tdb_unlock(tdb, -1, F_WRLCK);
 501                         return ret;
 502                 }
 503         }
 504
 505 blocking_freelist_allocate:
 506
 507         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 508                 return 0;
 509         }
 510         ret = tdb_allocate_from_freelist(tdb, length, rec);
 511         tdb_unlock(tdb, -1, F_WRLCK);
 512         return ret;
 513 }
 514
 515 /*
 516    return the size of the freelist - used to decide if we should repack
 517 */
 518 _PUBLIC_ int tdb_freelist_size(struct tdb_context *tdb)
 519 {
 520         tdb_off_t ptr;
 521         int count=0;
 522
 523         if (tdb_lock(tdb, -1, F_RDLCK) == -1) {
 524                 return -1;
 525         }
 526
 527         ptr = FREELIST_TOP;
 528         while (tdb_ofs_read(tdb, ptr, &ptr) == 0 && ptr != 0) {
 529                 count++;
 530         }
 531
 532         tdb_unlock(tdb, -1, F_RDLCK);
 533         return count;
 534 }