r21546: remove duplicate lines
[Samba.git] / source3 / tdb / common / transaction.c

/*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Andrew Tridgell              2005

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "tdb_private.h"

/*
  transaction design:

  - only allow a single transaction at a time per database. This makes
    using the transaction API simpler, as otherwise the caller would
    have to cope with temporary failures in transactions that conflict
    with other current transactions

  - keep the transaction recovery information in the same file as the
    database, using a special 'transaction recovery' record pointed at
    by the header. This removes the need for extra journal files as
    used by some other databases

  - dynamically allocate the transaction recovery record, re-using it
    for subsequent transactions. If a larger record is needed then
    tdb_free() the old record to place it on the normal tdb freelist
    before allocating the new record

  - during transactions, keep a linked list of all writes that have
    been performed by intercepting all tdb_write() calls. The hooked
    transaction versions of tdb_read() and tdb_write() check this
    linked list and try to use the elements of the list in preference
    to the real database.

  - don't allow any locks to be held when a transaction starts,
    otherwise we can end up with deadlock (plus lack of lock nesting
    in posix locks would mean the lock is lost)

  - if the caller gains a lock during the transaction but doesn't
    release it then fail the commit

  - allow for nested calls to tdb_transaction_start(), re-using the
    existing transaction record. If the inner transaction is cancelled
    then a subsequent commit will fail

  - keep a mirrored copy of the tdb hash chain heads to allow for the
    fast hash heads scan on traverse, updating the mirrored copy in
    the transaction version of tdb_write

  - allow callers to mix transaction and non-transaction use of tdb,
    although once a transaction is started then an exclusive lock is
    gained until the transaction is committed or cancelled

  - the commit strategy involves first saving away all modified data
    into a linearised buffer in the transaction recovery area, then
    marking the transaction recovery area with a magic value to
    indicate a valid recovery record. In total 4 fsync/msync calls are
    needed per commit to prevent race conditions. It might be possible
    to reduce this to 3 or even 2 with some more work.

  - check for a valid recovery record on open of the tdb, while the
    global lock is held. Automatically recover from the transaction
    recovery area if needed, then continue with the open as
    usual. This allows for smooth crash recovery with no administrator
    intervention.

  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
    still available, but no transaction recovery area is used and no
    fsync/msync calls are made.

  A usage sketch of the public transaction API follows these notes.
*/
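
/*
  Illustrative usage sketch (not part of the original file): how an
  application typically wraps an update in a tdb transaction using the
  public API implemented below. The guard macro TDB_TRANSACTION_EXAMPLE
  is hypothetical and only keeps this sketch out of real builds.
*/
#ifdef TDB_TRANSACTION_EXAMPLE
static int example_transactional_store(struct tdb_context *tdb,
				       TDB_DATA key, TDB_DATA value)
{
	/* take the single per-database transaction lock */
	if (tdb_transaction_start(tdb) == -1) {
		return -1;
	}

	/* reads and writes now go through the hooked transaction io methods */
	if (tdb_store(tdb, key, value, TDB_REPLACE) == -1) {
		/* throw away the in-memory write list and drop the locks */
		tdb_transaction_cancel(tdb);
		return -1;
	}

	/* linearise the writes into the recovery area, then apply them */
	return tdb_transaction_commit(tdb);
}
#endif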

int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
		       int rw_type, int lck_type, int probe, size_t len);

struct tdb_transaction_el {
	struct tdb_transaction_el *next, *prev;
	tdb_off_t offset;
	tdb_len_t length;
	unsigned char *data;
};

/*
  hold the context of any current transaction
*/
struct tdb_transaction {
	/* we keep a mirrored copy of the tdb hash heads here so
	   tdb_next_hash_chain() can operate efficiently */
	u32 *hash_heads;

	/* the original io methods - used to do IOs to the real db */
	const struct tdb_methods *io_methods;

	/* the list of transaction elements. We use a doubly linked
	   list with a last pointer to allow us to keep the list
	   ordered, with first element at the front of the list. It
	   needs to be doubly linked as the read/write traversals need
	   to be backwards, while the commit needs to be forwards */
	struct tdb_transaction_el *elements, *elements_last;

	/* non-zero when an internal transaction error has
	   occurred. All write operations will then fail until the
	   transaction is ended */
	int transaction_error;

	/* when inside a transaction we need to keep track of any
	   nested tdb_transaction_start() calls, as these are allowed,
	   but don't create a new transaction */
	int nesting;

	/* old file size before transaction */
	tdb_len_t old_map_size;
};

/*
  read while in a transaction. We need to check first if the data is in our list
  of transaction elements, then if not do a real read
*/
static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
			    tdb_len_t len, int cv)
{
	struct tdb_transaction_el *el;

	/* we need to walk the list backwards to get the most recent data */
	for (el=tdb->transaction->elements_last;el;el=el->prev) {
		tdb_len_t partial;

		if (off+len <= el->offset) {
			continue;
		}
		if (off >= el->offset + el->length) {
			continue;
		}

		/* an overlapping read - needs to be split into up to
		   2 reads and a memcpy */
		if (off < el->offset) {
			partial = el->offset - off;
			if (transaction_read(tdb, off, buf, partial, cv) != 0) {
				goto fail;
			}
			len -= partial;
			off += partial;
			buf = (void *)(partial + (char *)buf);
		}
		if (off + len <= el->offset + el->length) {
			partial = len;
		} else {
			partial = el->offset + el->length - off;
		}
		memcpy(buf, el->data + (off - el->offset), partial);
		if (cv) {
			tdb_convert(buf, len);
		}
		len -= partial;
		off += partial;
		buf = (void *)(partial + (char *)buf);

		if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) {
			goto fail;
		}

		return 0;
	}

	/* it's not in the transaction elements - do a real read */
	return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv);

fail:
	TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
	tdb->ecode = TDB_ERR_IO;
	tdb->transaction->transaction_error = 1;
	return -1;
}

/*
  write while in a transaction
*/
static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
			     const void *buf, tdb_len_t len)
{
	struct tdb_transaction_el *el, *best_el=NULL;

	if (len == 0) {
		return 0;
	}

	/* if the write is to a hash head, then update the transaction
	   hash heads */
	if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
	    off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
		u32 chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
		memcpy(&tdb->transaction->hash_heads[chain], buf, len);
	}

	/* first see if we can replace an existing entry */
	for (el=tdb->transaction->elements_last;el;el=el->prev) {
		tdb_len_t partial;

		if (best_el == NULL && off == el->offset+el->length) {
			best_el = el;
		}

		if (off+len <= el->offset) {
			continue;
		}
		if (off >= el->offset + el->length) {
			continue;
		}

		/* an overlapping write - needs to be split into up to
		   2 writes and a memcpy */
		if (off < el->offset) {
			partial = el->offset - off;
			if (transaction_write(tdb, off, buf, partial) != 0) {
				goto fail;
			}
			len -= partial;
			off += partial;
			buf = (const void *)(partial + (const char *)buf);
		}
		if (off + len <= el->offset + el->length) {
			partial = len;
		} else {
			partial = el->offset + el->length - off;
		}
		memcpy(el->data + (off - el->offset), buf, partial);
		len -= partial;
		off += partial;
		buf = (const void *)(partial + (const char *)buf);

		if (len != 0 && transaction_write(tdb, off, buf, len) != 0) {
			goto fail;
		}

		return 0;
	}

	/* see if we can append the new entry to an existing entry */
	if (best_el && best_el->offset + best_el->length == off &&
	    (off+len < tdb->transaction->old_map_size ||
	     off > tdb->transaction->old_map_size)) {
		unsigned char *data = best_el->data;
		el = best_el;
		el->data = (unsigned char *)realloc(el->data,
						    el->length + len);
		if (el->data == NULL) {
			tdb->ecode = TDB_ERR_OOM;
			tdb->transaction->transaction_error = 1;
			el->data = data;
			return -1;
		}
		if (buf) {
			memcpy(el->data + el->length, buf, len);
		} else {
			memset(el->data + el->length, TDB_PAD_BYTE, len);
		}
		el->length += len;
		return 0;
	}

	/* add a new entry at the end of the list */
	el = (struct tdb_transaction_el *)malloc(sizeof(*el));
	if (el == NULL) {
		tdb->ecode = TDB_ERR_OOM;
		tdb->transaction->transaction_error = 1;
		return -1;
	}
	el->next = NULL;
	el->prev = tdb->transaction->elements_last;
	el->offset = off;
	el->length = len;
	el->data = (unsigned char *)malloc(len);
	if (el->data == NULL) {
		free(el);
		tdb->ecode = TDB_ERR_OOM;
		tdb->transaction->transaction_error = 1;
		return -1;
	}
	if (buf) {
		memcpy(el->data, buf, len);
	} else {
		memset(el->data, TDB_PAD_BYTE, len);
	}
	if (el->prev) {
		el->prev->next = el;
	} else {
		tdb->transaction->elements = el;
	}
	tdb->transaction->elements_last = el;
	return 0;

fail:
	TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", off, len));
	tdb->ecode = TDB_ERR_IO;
	tdb->transaction->transaction_error = 1;
	return -1;
}

/*
  accelerated hash chain head search, using the cached hash heads
*/
static void transaction_next_hash_chain(struct tdb_context *tdb, u32 *chain)
{
	u32 h = *chain;
	for (;h < tdb->header.hash_size;h++) {
		/* the +1 takes account of the freelist */
		if (0 != tdb->transaction->hash_heads[h+1]) {
			break;
		}
	}
	(*chain) = h;
}

/*
  out of bounds check during a transaction
*/
static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
{
	if (len <= tdb->map_size) {
		return 0;
	}
	return TDB_ERRCODE(TDB_ERR_IO, -1);
}

/*
  transaction version of tdb_expand().
*/
static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
				   tdb_off_t addition)
{
	/* add a write to the transaction elements, so subsequent
	   reads see the zero data */
	if (transaction_write(tdb, size, NULL, addition) != 0) {
		return -1;
	}

	return 0;
}

/*
  brlock during a transaction - ignore them
*/
int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
		       int rw_type, int lck_type, int probe, size_t len)
{
	return 0;
}

static const struct tdb_methods transaction_methods = {
	transaction_read,
	transaction_write,
	transaction_next_hash_chain,
	transaction_oob,
	transaction_expand_file,
	transaction_brlock
};

/*
  start a tdb transaction. No token is returned, as only a single
  transaction is allowed to be pending per tdb_context
*/
int tdb_transaction_start(struct tdb_context *tdb)
{
	/* some sanity checks */
	if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
		tdb->ecode = TDB_ERR_EINVAL;
		return -1;
	}

	/* cope with nested tdb_transaction_start() calls */
	if (tdb->transaction != NULL) {
		tdb->transaction->nesting++;
		TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
			 tdb->transaction->nesting));
		return 0;
	}

	if (tdb->num_locks != 0 || tdb->global_lock.count) {
		/* the caller must not have any locks when starting a
		   transaction as otherwise we'll be screwed by lack
		   of nested locks in posix */
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}

	if (tdb->travlocks.next != NULL) {
		/* you cannot use transactions inside a traverse (although you can use
		   traverse inside a transaction) as otherwise you can end up with
		   deadlock */
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}

	tdb->transaction = (struct tdb_transaction *)
		calloc(sizeof(struct tdb_transaction), 1);
	if (tdb->transaction == NULL) {
		tdb->ecode = TDB_ERR_OOM;
		return -1;
	}

	/* get the transaction write lock. This is a blocking lock. As
	   discussed with Volker, there are a number of ways we could
	   make this async, which we will probably do in the future */
	if (tdb_brlock(tdb, TRANSACTION_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get transaction lock\n"));
		tdb->ecode = TDB_ERR_LOCK;
		SAFE_FREE(tdb->transaction);
		return -1;
	}

	/* get a read lock from the freelist to the end of file. This
	   is upgraded to a write lock during the commit */
	if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
		tdb->ecode = TDB_ERR_LOCK;
		goto fail;
	}

	/* setup a copy of the hash table heads so the hash scan in
	   traverse can be fast */
	tdb->transaction->hash_heads = (u32 *)
		calloc(tdb->header.hash_size+1, sizeof(u32));
	if (tdb->transaction->hash_heads == NULL) {
		tdb->ecode = TDB_ERR_OOM;
		goto fail;
	}
	if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
				   TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
		tdb->ecode = TDB_ERR_IO;
		goto fail;
	}

	/* make sure we know about any file expansions already done by
	   anyone else */
	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
	tdb->transaction->old_map_size = tdb->map_size;

	/* finally hook the io methods, replacing them with
	   transaction specific methods */
	tdb->transaction->io_methods = tdb->methods;
	tdb->methods = &transaction_methods;

	/* by calling this transaction write here, we ensure that we don't grow the
	   transaction linked list due to hash table updates */
	if (transaction_write(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
			      TDB_HASHTABLE_SIZE(tdb)) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to prime hash table\n"));
		tdb->ecode = TDB_ERR_IO;
		goto fail;
	}

	return 0;

fail:
	tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
	tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
	SAFE_FREE(tdb->transaction->hash_heads);
	SAFE_FREE(tdb->transaction);
	return -1;
}

/*
  cancel the current transaction
*/
int tdb_transaction_cancel(struct tdb_context *tdb)
{
	if (tdb->transaction == NULL) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
		return -1;
	}

	if (tdb->transaction->nesting != 0) {
		tdb->transaction->transaction_error = 1;
		tdb->transaction->nesting--;
		return 0;
	}

	tdb->map_size = tdb->transaction->old_map_size;

	/* free all the transaction elements */
	while (tdb->transaction->elements) {
		struct tdb_transaction_el *el = tdb->transaction->elements;
		tdb->transaction->elements = el->next;
		free(el->data);
		free(el);
	}

	/* remove any global lock created during the transaction */
	if (tdb->global_lock.count != 0) {
		tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
		tdb->global_lock.count = 0;
	}

	/* remove any locks created during the transaction */
	if (tdb->num_locks != 0) {
		int i;
		for (i=0;i<tdb->num_lockrecs;i++) {
			tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
				   F_UNLCK,F_SETLKW, 0, 1);
		}
		tdb->num_locks = 0;
	}

	/* restore the normal io methods */
	tdb->methods = tdb->transaction->io_methods;

	tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
	tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
	SAFE_FREE(tdb->transaction->hash_heads);
	SAFE_FREE(tdb->transaction);

	return 0;
}

/*
  sync to disk
*/
static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
{
	if (fsync(tdb->fd) != 0) {
		tdb->ecode = TDB_ERR_IO;
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
		return -1;
	}
#ifdef MS_SYNC
	if (tdb->map_ptr) {
		tdb_off_t moffset = offset & ~(tdb->page_size-1);
		if (msync(moffset + (char *)tdb->map_ptr,
			  length + (offset - moffset), MS_SYNC) != 0) {
			tdb->ecode = TDB_ERR_IO;
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
				 strerror(errno)));
			return -1;
		}
	}
#endif
	return 0;
}

/*
  work out how much space the linearised recovery data will consume
*/
static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
{
	struct tdb_transaction_el *el;
	tdb_len_t recovery_size = 0;

	recovery_size = sizeof(u32);
	for (el=tdb->transaction->elements;el;el=el->next) {
		if (el->offset >= tdb->transaction->old_map_size) {
			continue;
		}
		recovery_size += 2*sizeof(tdb_off_t) + el->length;
	}

	return recovery_size;
}

/*
  allocate the recovery area, or use an existing recovery area if it is
  large enough
*/
static int tdb_recovery_allocate(struct tdb_context *tdb,
				 tdb_len_t *recovery_size,
				 tdb_off_t *recovery_offset,
				 tdb_len_t *recovery_max_size)
{
	struct list_struct rec;
	const struct tdb_methods *methods = tdb->transaction->io_methods;
	tdb_off_t recovery_head;

	if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
		return -1;
	}

	rec.rec_len = 0;

	if (recovery_head != 0 &&
	    methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
		return -1;
	}

	*recovery_size = tdb_recovery_size(tdb);

	if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
		/* it fits in the existing area */
		*recovery_max_size = rec.rec_len;
		*recovery_offset = recovery_head;
		return 0;
	}

	/* we need to free up the old recovery area, then allocate a
	   new one at the end of the file. Note that we cannot use
	   tdb_allocate() to allocate the new one as that might return
	   us an area that is being currently used (as of the start of
	   the transaction) */
	if (recovery_head != 0) {
		if (tdb_free(tdb, recovery_head, &rec) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
			return -1;
		}
	}

	/* the tdb_free() call might have increased the recovery size */
	*recovery_size = tdb_recovery_size(tdb);

	/* round up to a multiple of page size */
	*recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
	*recovery_offset = tdb->map_size;
	recovery_head = *recovery_offset;

	if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
				     (tdb->map_size - tdb->transaction->old_map_size) +
				     sizeof(rec) + *recovery_max_size) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
		return -1;
	}

	/* remap the file (if using mmap) */
	methods->tdb_oob(tdb, tdb->map_size + 1, 1);

	/* we have to reset the old map size so that we don't try to expand the file
	   again in the transaction commit, which would destroy the recovery area */
	tdb->transaction->old_map_size = tdb->map_size;

	/* write the recovery header offset and sync - we can sync without a race here
	   as the magic ptr in the recovery record has not been set */
	CONVERT(recovery_head);
	if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
			       &recovery_head, sizeof(tdb_off_t)) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
		return -1;
	}

	return 0;
}

/*
  setup the recovery data that will be used on a crash during commit
*/
static int transaction_setup_recovery(struct tdb_context *tdb,
				      tdb_off_t *magic_offset)
{
	struct tdb_transaction_el *el;
	tdb_len_t recovery_size;
	unsigned char *data, *p;
	const struct tdb_methods *methods = tdb->transaction->io_methods;
	struct list_struct *rec;
	tdb_off_t recovery_offset, recovery_max_size;
	tdb_off_t old_map_size = tdb->transaction->old_map_size;
	u32 magic, tailer;

	/*
	  check that the recovery area has enough space
	*/
	if (tdb_recovery_allocate(tdb, &recovery_size,
				  &recovery_offset, &recovery_max_size) == -1) {
		return -1;
	}

	data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
	if (data == NULL) {
		tdb->ecode = TDB_ERR_OOM;
		return -1;
	}

	rec = (struct list_struct *)data;
	memset(rec, 0, sizeof(*rec));

	rec->magic    = 0;
	rec->data_len = recovery_size;
	rec->rec_len  = recovery_max_size;
	rec->key_len  = old_map_size;
	CONVERT(rec);

	/* build the recovery data into a single blob to allow us to do a single
	   large write, which should be more efficient */
	p = data + sizeof(*rec);
	for (el=tdb->transaction->elements;el;el=el->next) {
		if (el->offset >= old_map_size) {
			continue;
		}
		if (el->offset + el->length > tdb->transaction->old_map_size) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
			free(data);
			tdb->ecode = TDB_ERR_CORRUPT;
			return -1;
		}
		memcpy(p, &el->offset, 4);
		memcpy(p+4, &el->length, 4);
		if (DOCONV()) {
			tdb_convert(p, 8);
		}
		/* the recovery area contains the old data, not the
		   new data, so we have to call the original tdb_read
		   method to get it */
		if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) {
			free(data);
			tdb->ecode = TDB_ERR_IO;
			return -1;
		}
		p += 8 + el->length;
	}

	/* and the tailer */
	tailer = sizeof(*rec) + recovery_max_size;
	memcpy(p, &tailer, 4);
	CONVERT(p);

	/* write the recovery data to the recovery area */
	if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
		free(data);
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	/* as we don't have ordered writes, we have to sync the recovery
	   data before we update the magic to indicate that the recovery
	   data is present */
	if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
		free(data);
		return -1;
	}

	free(data);

	magic = TDB_RECOVERY_MAGIC;
	CONVERT(magic);

	*magic_offset = recovery_offset + offsetof(struct list_struct, magic);

	if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	/* ensure the recovery magic marker is on disk */
	if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
		return -1;
	}

	return 0;
}

/*
  commit the current transaction
*/
int tdb_transaction_commit(struct tdb_context *tdb)
{
	const struct tdb_methods *methods;
	tdb_off_t magic_offset = 0;
	u32 zero = 0;

	if (tdb->transaction == NULL) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
		return -1;
	}

	if (tdb->transaction->transaction_error) {
		tdb->ecode = TDB_ERR_IO;
		tdb_transaction_cancel(tdb);
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
		return -1;
	}

	if (tdb->transaction->nesting != 0) {
		tdb->transaction->nesting--;
		return 0;
	}

	/* check for a null transaction */
	if (tdb->transaction->elements == NULL) {
		tdb_transaction_cancel(tdb);
		return 0;
	}

	methods = tdb->transaction->io_methods;

	/* if there are any locks pending then the caller has not
	   nested their locks properly, so fail the transaction */
	if (tdb->num_locks || tdb->global_lock.count) {
		tdb->ecode = TDB_ERR_LOCK;
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: locks pending on commit\n"));
		tdb_transaction_cancel(tdb);
		return -1;
	}

	/* upgrade the main transaction lock region to a write lock */
	if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to upgrade hash locks\n"));
		tdb->ecode = TDB_ERR_LOCK;
		tdb_transaction_cancel(tdb);
		return -1;
	}

	/* get the global lock - this prevents new users attaching to the database
	   during the commit */
	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to get global lock\n"));
		tdb->ecode = TDB_ERR_LOCK;
		tdb_transaction_cancel(tdb);
		return -1;
	}

	if (!(tdb->flags & TDB_NOSYNC)) {
		/* write the recovery data to the end of the file */
		if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to setup recovery data\n"));
			tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
			tdb_transaction_cancel(tdb);
			return -1;
		}
	}

	/* expand the file to the new size if needed */
	if (tdb->map_size != tdb->transaction->old_map_size) {
		if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
					     tdb->map_size -
					     tdb->transaction->old_map_size) == -1) {
			tdb->ecode = TDB_ERR_IO;
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: expansion failed\n"));
			tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
			tdb_transaction_cancel(tdb);
			return -1;
		}
		tdb->map_size = tdb->transaction->old_map_size;
		methods->tdb_oob(tdb, tdb->map_size + 1, 1);
	}

	/* perform all the writes */
	while (tdb->transaction->elements) {
		struct tdb_transaction_el *el = tdb->transaction->elements;

		if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));

			/* we've overwritten part of the data and
			   possibly expanded the file, so we need to
			   run the crash recovery code */
			tdb->methods = methods;
			tdb_transaction_recover(tdb);

			tdb_transaction_cancel(tdb);
			tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);

			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
			return -1;
		}
		tdb->transaction->elements = el->next;
		free(el->data);
		free(el);
	}

	if (!(tdb->flags & TDB_NOSYNC)) {
		/* ensure the new data is on disk */
		if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
			return -1;
		}

		/* remove the recovery marker */
		if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to remove recovery magic\n"));
			return -1;
		}

		/* ensure the recovery marker has been removed on disk */
		if (transaction_sync(tdb, magic_offset, 4) == -1) {
			return -1;
		}
	}

	tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);

	/*
	  TODO: maybe write to some dummy hdr field, or write to magic
	  offset without mmap, before the last sync, instead of the
	  utime() call
	*/

	/* on some systems (like Linux 2.6.x) changes via mmap/msync
	   don't change the mtime of the file, this means the file may
	   not be backed up (as tdb rounding to block sizes means that
	   file size changes are quite rare too). The following forces
	   mtime changes when a transaction completes */
#ifdef HAVE_UTIME
	utime(tdb->name, NULL);
#endif

	/* use a transaction cancel to free memory and remove the
	   transaction locks */
	tdb_transaction_cancel(tdb);
	return 0;
}
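
/*
  Illustrative sketch (not part of the original file): locks gained during
  a transaction must be released again before tdb_transaction_commit(),
  otherwise the "locks pending on commit" check above fails the commit.
  The guard macro TDB_TRANSACTION_EXAMPLE is hypothetical.
*/
#ifdef TDB_TRANSACTION_EXAMPLE
static int example_lock_nesting(struct tdb_context *tdb,
				TDB_DATA key, TDB_DATA value)
{
	if (tdb_transaction_start(tdb) == -1) {
		return -1;
	}
	if (tdb_chainlock(tdb, key) == -1) {
		tdb_transaction_cancel(tdb);
		return -1;
	}
	if (tdb_store(tdb, key, value, TDB_REPLACE) == -1) {
		tdb_chainunlock(tdb, key);
		tdb_transaction_cancel(tdb);
		return -1;
	}
	/* skipping this unlock would leave tdb->num_locks non-zero and make
	   the commit below return -1 with TDB_ERR_LOCK */
	tdb_chainunlock(tdb, key);
	return tdb_transaction_commit(tdb);
}
#endif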

/*
  recover from an aborted transaction. Must be called with exclusive
  database write access already established (including the global
  lock to prevent new processes attaching)
*/
int tdb_transaction_recover(struct tdb_context *tdb)
{
	tdb_off_t recovery_head, recovery_eof;
	unsigned char *data, *p;
	u32 zero = 0;
	struct list_struct rec;

	/* find the recovery area */
	if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	if (recovery_head == 0) {
		/* we have never allocated a recovery record */
		return 0;
	}

	/* read the recovery record */
	if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
				   sizeof(rec), DOCONV()) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	if (rec.magic != TDB_RECOVERY_MAGIC) {
		/* there is no valid recovery data */
		return 0;
	}

	if (tdb->read_only) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
		tdb->ecode = TDB_ERR_CORRUPT;
		return -1;
	}

	recovery_eof = rec.key_len;

	data = (unsigned char *)malloc(rec.data_len);
	if (data == NULL) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
		tdb->ecode = TDB_ERR_OOM;
		return -1;
	}

	/* read the full recovery data */
	if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
				   rec.data_len, 0) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	/* recover the file data */
	p = data;
	while (p+8 < data + rec.data_len) {
		u32 ofs, len;
		if (DOCONV()) {
			tdb_convert(p, 8);
		}
		memcpy(&ofs, p, 4);
		memcpy(&len, p+4, 4);

		if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
			free(data);
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
			tdb->ecode = TDB_ERR_IO;
			return -1;
		}
		p += 8 + len;
	}

	free(data);

	if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	/* if the recovery area is after the recovered eof then remove it */
	if (recovery_eof <= recovery_head) {
		if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
			tdb->ecode = TDB_ERR_IO;
			return -1;
		}
	}

	/* remove the recovery magic */
	if (tdb_ofs_write(tdb, recovery_head + offsetof(struct list_struct, magic),
			  &zero) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	/* reduce the file size to the old size */
	tdb_munmap(tdb);
	if (ftruncate(tdb->fd, recovery_eof) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}
	tdb->map_size = recovery_eof;
	tdb_mmap(tdb);

	if (transaction_sync(tdb, 0, recovery_eof) == -1) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
		tdb->ecode = TDB_ERR_IO;
		return -1;
	}

	TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
		 recovery_eof));

	/* all done */
	return 0;
}