trans.c

   1
   2 /*
   3  * libjio - A library for Journaled I/O
   4  * Alberto Bertogli (albertogli@telpin.com.ar)
   5  *
   6  * Core transaction API and recovery functions
   7  */
   8
   9 #include <sys/types.h>
  10 #include <sys/stat.h>
  11 #include <fcntl.h>
  12 #include <unistd.h>
  13 #include <stdlib.h>
  14 #include <limits.h>
  15 #include <string.h>
  16 #include <libgen.h>
  17 #include <stdio.h>
  18 #include <dirent.h>
  19 #include <errno.h>
  20 #include <sys/mman.h>
  21
  22 #include "libjio.h"
  23 #include "common.h"
  24
  25
  26 /*
  27  * helper functions
  28  */
  29
  30 /* gets a new transaction id */
  31 static unsigned int get_tid(struct jfs *fs)
  32 {
  33         unsigned int curid, rv;
  34
  35         /* lock the whole file */
  36         plockf(fs->jfd, F_LOCKW, 0, 0);
  37
  38         /* read the current max. curid */
  39         curid = *(fs->jmap);
  40
  41         /* increment it and handle overflows */
  42         rv = curid + 1;
  43         if (rv == 0)
  44                 goto exit;
  45
  46         /* write to the file descriptor */
  47         *(fs->jmap) = rv;
  48
  49 exit:
  50         plockf(fs->jfd, F_UNLOCK, 0, 0);
  51         return rv;
  52 }
  53
  54 /* frees a transaction id */
  55 static void free_tid(struct jfs *fs, unsigned int tid)
  56 {
  57         unsigned int curid, i;
  58         char name[PATH_MAX];
  59
  60         /* lock the whole file */
  61         plockf(fs->jfd, F_LOCKW, 0, 0);
  62
  63         /* read the current max. curid */
  64         curid = *(fs->jmap);
  65
  66         if (tid < curid) {
  67                 /* we're not freeing the max. curid, so we just return */
  68                 goto exit;
  69         } else {
  70                 /* look up the new max. */
  71                 for (i = curid - 1; i > 0; i--) {
  72                         /* this can fail if we're low on mem, but we don't
  73                          * care checking here because the problem will come
  74                          * out later and we can fail more properly */
  75                         get_jtfile(fs->name, i, name);
  76                         if (access(name, R_OK | W_OK) == 0) {
  77                                 curid = i;
  78                                 break;
  79                         }
  80                 }
  81
  82                 /* and save it */
  83                 *(fs->jmap) = i;
  84         }
  85
  86 exit:
  87         plockf(fs->jfd, F_UNLOCK, 0, 0);
  88         return;
  89 }
  90
  91
  92 /*
  93  * transaction functions
  94  */
  95
  96 /* initialize a transaction structure */
  97 void jtrans_init(struct jfs *fs, struct jtrans *ts)
  98 {
  99         ts->fs = fs;
 100         ts->name = NULL;
 101         ts->id = 0;
 102         ts->flags = fs->flags;
 103         ts->op = NULL;
 104         ts->numops = 0;
 105         pthread_mutex_init( &(ts->lock), NULL);
 106 }
 107
 108
 109 /* free the contents of a transaction structure */
 110 void jtrans_free(struct jtrans *ts)
 111 {
 112         struct joper *tmpop;
 113
 114         ts->fs = NULL;
 115
 116         if (ts->name)
 117                 free(ts->name);
 118
 119         while (ts->op != NULL) {
 120                 tmpop = ts->op->next;
 121
 122                 if (ts->op->buf)
 123                         free(ts->op->buf);
 124                 if (ts->op->pdata)
 125                         free(ts->op->pdata);
 126                 free(ts->op);
 127
 128                 ts->op = tmpop;
 129         }
 130         pthread_mutex_destroy(&(ts->lock));
 131 }
 132
 133
 134 int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset)
 135 {
 136         struct joper *jop, *tmpop;
 137
 138         /* find the last operation in the transaction and create a new one at
 139          * the end */
 140         pthread_mutex_lock(&(ts->lock));
 141         if (ts->op == NULL) {
 142                 ts->op = malloc(sizeof(struct joper));
 143                 if (ts->op == NULL)
 144                         return 0;
 145                 jop = ts->op;
 146                 jop->prev = NULL;
 147         } else {
 148                 for (tmpop = ts->op; tmpop->next != NULL; tmpop = tmpop->next)
 149                         ;
 150                 tmpop->next = malloc(sizeof(struct joper));
 151                 if (tmpop->next == NULL)
 152                         return 0;
 153                 tmpop->next->prev = tmpop;
 154                 jop = tmpop->next;
 155         }
 156         pthread_mutex_unlock(&(ts->lock));
 157
 158         jop->buf = malloc(count);
 159         if (jop->buf == NULL) {
 160                 free(jop);
 161                 return 0;
 162         }
 163
 164         /* we copy the buffer because then the caller can reuse it */
 165         memcpy(jop->buf, buf, count);
 166         jop->len = count;
 167         jop->offset = offset;
 168         jop->next = NULL;
 169         jop->plen = 0;
 170         jop->pdata = NULL;
 171         jop->locked = 0;
 172
 173         ts->numops++;
 174
 175         return 1;
 176 }
 177
 178 /* commit a transaction */
 179 int jtrans_commit(struct jtrans *ts)
 180 {
 181         int id, rv, fd = -1;
 182         uint32_t csum;
 183         char *name;
 184         unsigned char *buf_init, *bufp;
 185         struct joper *op;
 186         struct jlinger *linger;
 187         off_t curpos = 0;
 188         size_t written = 0;
 189
 190         pthread_mutex_lock(&(ts->lock));
 191
 192         name = (char *) malloc(PATH_MAX);
 193         if (name == NULL)
 194                 goto exit;
 195
 196         id = get_tid(ts->fs);
 197         if (id == 0)
 198                 goto exit;
 199
 200         /* open the transaction file */
 201         if (!get_jtfile(ts->fs->name, id, name))
 202                 goto exit;
 203         fd = open(name, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, 0600);
 204         if (fd < 0)
 205                 goto exit;
 206
 207         /* and lock it */
 208         plockf(fd, F_LOCKW, 0, 0);
 209
 210         ts->id = id;
 211         ts->name = name;
 212
 213         /* save the header */
 214         buf_init = malloc(J_DISKHEADSIZE);
 215         if (buf_init == NULL)
 216                 goto unlink_exit;
 217
 218         bufp = buf_init;
 219
 220         memcpy(bufp, (void *) &(ts->id), 4);
 221         bufp += 4;
 222
 223         memcpy(bufp, (void *) &(ts->flags), 4);
 224         bufp += 4;
 225
 226         memcpy(bufp, (void *) &(ts->numops), 4);
 227         bufp += 4;
 228
 229         rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0);
 230         if (rv != J_DISKHEADSIZE) {
 231                 free(buf_init);
 232                 goto unlink_exit;
 233         }
 234
 235         free(buf_init);
 236
 237         curpos = J_DISKHEADSIZE;
 238
 239         /* first of all lock all the regions we're going to work with;
 240          * otherwise there could be another transaction trying to write the
 241          * same spots and we could end up with interleaved writes, that could
 242          * break atomicity warantees if we need to rollback */
 243         if (!(ts->flags & J_NOLOCK)) {
 244                 for (op = ts->op; op != NULL; op = op->next) {
 245                         rv = plockf(ts->fs->fd, F_LOCKW, op->offset, op->len);
 246                         if (rv == -1)
 247                                 /* note it can fail with EDEADLK */
 248                                 goto unlink_exit;
 249                         op->locked = 1;
 250                 }
 251         }
 252
 253         /* save each transacion in the file */
 254         for (op = ts->op; op != NULL; op = op->next) {
 255                 /* read the current content only if the transaction is not
 256                  * marked as NOROLLBACK, and if the data is not there yet,
 257                  * which is the normal case, but for rollbacking we fill it
 258                  * ourselves */
 259                 if (!(ts->flags & J_NOROLLBACK) && (op->pdata == NULL)) {
 260                         op->pdata = malloc(op->len);
 261                         if (op->pdata == NULL)
 262                                 goto unlink_exit;
 263
 264                         op->plen = op->len;
 265
 266                         rv = spread(ts->fs->fd, op->pdata, op->len,
 267                                         op->offset);
 268                         if (rv < 0)
 269                                 goto unlink_exit;
 270                         if (rv < op->len) {
 271                                 /* we are extending the file! */
 272                                 /* ftruncate(ts->fs->fd, op->offset + op->len); */
 273                                 op->plen = rv;
 274                         }
 275                 }
 276
 277                 /* save the operation's header */
 278                 buf_init = malloc(J_DISKOPHEADSIZE);
 279                 if (buf_init == NULL)
 280                         goto unlink_exit;
 281
 282                 bufp = buf_init;
 283
 284                 memcpy(bufp, (void *) &(op->len), 4);
 285                 bufp += 4;
 286
 287                 memcpy(bufp, (void *) &(op->plen), 4);
 288                 bufp += 4;
 289
 290                 memcpy(bufp, (void *) &(op->offset), 8);
 291                 bufp += 8;
 292
 293                 rv = spwrite(fd, buf_init, J_DISKOPHEADSIZE, curpos);
 294                 if (rv != J_DISKOPHEADSIZE) {
 295                         free(buf_init);
 296                         goto unlink_exit;
 297                 }
 298
 299                 free(buf_init);
 300
 301                 curpos += J_DISKOPHEADSIZE;
 302
 303                 /* and save it to the disk */
 304                 rv = spwrite(fd, op->buf, op->len, curpos);
 305                 if (rv != op->len)
 306                         goto unlink_exit;
 307
 308                 curpos += op->len;
 309         }
 310
 311         /* compute and save the checksum */
 312         if (!checksum(fd, curpos, &csum))
 313                 goto unlink_exit;
 314
 315         rv = spwrite(fd, &csum, sizeof(uint32_t), curpos);
 316         if (rv != sizeof(uint32_t))
 317                 goto unlink_exit;
 318         curpos += sizeof(uint32_t);
 319
 320         /* this is a simple but efficient optimization: instead of doing
 321          * everything O_SYNC, we sync at this point only, this way we avoid
 322          * doing a lot of very small writes; in case of a crash the
 323          * transaction file is only useful if it's complete (ie. after this
 324          * point) so we only flush here (both data and metadata) */
 325         if (fsync(fd) != 0)
 326                 goto unlink_exit;
 327         if (fsync(ts->fs->jdirfd) != 0) {
 328                 /* it seems to be legal that fsync() on directories is not
 329                  * implemented, so if this fails with EINVAL or EBADF, just
 330                  * call a global sync(); which is awful (and might still
 331                  * return before metadata is done) but it seems to be the
 332                  * saner choice; otherwise we just fail */
 333                 if (errno == EINVAL || errno == EBADF) {
 334                         sync();
 335                 } else {
 336                         goto unlink_exit;
 337                 }
 338         }
 339
 340         /* now that we have a safe transaction file, let's apply it */
 341         written = 0;
 342         for (op = ts->op; op != NULL; op = op->next) {
 343                 rv = spwrite(ts->fs->fd, op->buf, op->len, op->offset);
 344
 345                 plockf(ts->fs->fd, F_UNLOCK, op->offset, op->len);
 346                 op->locked = 0;
 347
 348                 if (rv != op->len)
 349                         goto rollback_exit;
 350
 351                 written += rv;
 352         }
 353
 354         if (ts->flags & J_LINGER) {
 355                 linger = malloc(sizeof(struct jlinger));
 356                 if (linger == NULL)
 357                         goto rollback_exit;
 358
 359                 linger->id = id;
 360                 linger->name = strdup(name);
 361                 linger->next = ts->fs->ltrans;
 362
 363                 ts->fs->ltrans = linger;
 364         } else {
 365                 /* the transaction has been applied, so we cleanup and remove
 366                  * it from the disk */
 367                 unlink(name);
 368                 free_tid(ts->fs, ts->id);
 369         }
 370
 371         /* mark the transaction as commited, _after_ it was removed */
 372         ts->flags = ts->flags | J_COMMITED;
 373
 374
 375 rollback_exit:
 376         /* If the transaction failed we try to recover by rollbacking it
 377          * NOTE: on extreme conditions (ENOSPC/disk failure) this can fail
 378          * too! There's nothing much we can do in that case, the caller should
 379          * take care of it by itself.
 380          * The transaction file might be OK at this point, so the data could
 381          * be recovered by a posterior jfsck(); however, that's not what the
 382          * user expects (after all, if we return failure, new data should
 383          * never appear), so we remove the transaction file.
 384          * Transactions that were successfuly recovered by rollbacking them
 385          * will have J_ROLLBACKED in their flags, so the caller can verify if
 386          * the failure was recovered or not. */
 387         if (!(ts->flags & J_COMMITED) && !(ts->flags & J_ROLLBACKING)) {
 388                 rv = ts->flags;
 389                 ts->flags = ts->flags | J_NOLOCK | J_ROLLBACKING;
 390                 if (jtrans_rollback(ts) >= 0) {
 391                         ts->flags = rv | J_ROLLBACKED;
 392                 } else {
 393                         ts->flags = rv;
 394                 }
 395         }
 396
 397 unlink_exit:
 398         if (!(ts->flags & J_COMMITED)) {
 399                 unlink(name);
 400                 free_tid(ts->fs, ts->id);
 401         }
 402
 403         close(fd);
 404         for (op = ts->op; op != NULL; op = op->next) {
 405                 if (op->locked)
 406                         plockf(ts->fs->fd, F_UNLOCK, op->offset, op->len);
 407         }
 408
 409 exit:
 410         pthread_mutex_unlock(&(ts->lock));
 411
 412         /* return the length only if it was properly commited */
 413         if (ts->flags & J_COMMITED)
 414                 return written;
 415         else
 416                 return -1;
 417
 418 }
 419
 420 /* rollback a transaction */
 421 int jtrans_rollback(struct jtrans *ts)
 422 {
 423         int rv;
 424         struct jtrans newts;
 425         struct joper *op, *curop, *lop;
 426
 427         jtrans_init(ts->fs, &newts);
 428         newts.flags = ts->flags;
 429
 430         if (ts->op == NULL || ts->flags & J_NOROLLBACK) {
 431                 rv = -1;
 432                 goto exit;
 433         }
 434
 435         /* find the last operation */
 436         for (op = ts->op; op->next != NULL; op = op->next)
 437                 ;
 438
 439         /* and traverse the list backwards */
 440         for ( ; op != NULL; op = op->prev) {
 441                 /* if we extended the data in the previous transaction, we
 442                  * should truncate it back */
 443                 /* DANGEROUS: this is one of the main reasons why rollbacking
 444                  * is dangerous and should only be done with extreme caution:
 445                  * if for some reason, after the previous transacton, we have
 446                  * extended the file further, this will cut it back to what it
 447                  * was; read the docs for more detail */
 448                 if (op->plen < op->len)
 449                         ftruncate(ts->fs->fd, op->offset + op->plen);
 450
 451                 /* manually add the operation to the new transaction */
 452                 curop = malloc(sizeof(struct joper));
 453                 if (curop == NULL) {
 454                         rv = -1;
 455                         goto exit;
 456                 }
 457
 458                 curop->offset = op->offset;
 459                 curop->len = op->plen;
 460                 curop->buf = op->pdata;
 461                 curop->plen = op->plen;
 462                 curop->pdata = op->pdata;
 463                 curop->locked = 0;
 464
 465                 /* add the new transaction to the list */
 466                 if (newts.op == NULL) {
 467                         newts.op = curop;
 468                         curop->prev = NULL;
 469                         curop->next = NULL;
 470                 } else {
 471                         for (lop = newts.op; lop->next != NULL; lop = lop->next)
 472                                 ;
 473                         lop->next = curop;
 474                         curop->prev = lop;
 475                         curop->next = NULL;
 476                 }
 477         }
 478
 479         rv = jtrans_commit(&newts);
 480
 481 exit:
 482         /* free the transaction */
 483         for (curop = newts.op; curop != NULL; curop = curop->next) {
 484                 curop->buf = NULL;
 485                 curop->pdata = NULL;
 486         }
 487         jtrans_free(&newts);
 488
 489         return rv;
 490 }
 491
 492 /*
 493  * basic operations
 494  */
 495
 496 /* open a file */
 497 int jopen(struct jfs *fs, const char *name, int flags, int mode, int jflags)
 498 {
 499         int fd, jfd, rv;
 500         unsigned int t;
 501         char jdir[PATH_MAX], jlockfile[PATH_MAX];
 502         struct stat sinfo;
 503
 504         /* we always need read and write access, because when we commit a
 505          * transaction we read the current contents before applying, and write
 506          * access is needed for locking with fcntl */
 507         flags = flags & ~O_WRONLY;
 508         flags = flags & ~O_RDONLY;
 509         flags = flags | O_RDWR;
 510
 511         fd = open(name, flags, mode);
 512         if (fd < 0)
 513                 return -1;
 514
 515         fs->fd = fd;
 516         fs->name = strdup(name);
 517         fs->flags = jflags;
 518         fs->ltrans = NULL;
 519
 520         /* Note on fs->lock usage: this lock is used only inside the wrappers,
 521          * and exclusively to protect the file pointer. This means that it
 522          * must only be held while performing operations that depend or alter
 523          * the file pointer (jread, jreadv, jwrite, jwritev), but the others
 524          * (jpread, jpwrite) are left unprotected because they can be
 525          * performed in paralell as long as they don't affect the same portion
 526          * of the file (this is protected by lockf). The lock doesn't slow
 527          * things down tho: any threaded app MUST implement this kind of
 528          * locking anyways if it wants to prevent data corruption, we only
 529          * make it easier for them by taking care of it here. If performance
 530          * is essential, the jpread/jpwrite functions should be used, just as
 531          * real life. */
 532         pthread_mutex_init( &(fs->lock), NULL);
 533
 534         if (!get_jdir(name, jdir))
 535                 return -1;
 536         rv = mkdir(jdir, 0750);
 537         rv = lstat(jdir, &sinfo);
 538         if (rv < 0 || !S_ISDIR(sinfo.st_mode))
 539                 return -1;
 540
 541         /* open the directory, we will use it to flush transaction files'
 542          * metadata in jtrans_commit() */
 543         fs->jdirfd = open(jdir, O_RDONLY);
 544         if (fs->jdirfd < 0)
 545                 return -1;
 546
 547         snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock");
 548         jfd = open(jlockfile, O_RDWR | O_CREAT, 0600);
 549         if (jfd < 0)
 550                 return -1;
 551
 552         /* initialize the lock file by writing the first tid to it, but only
 553          * if its empty, otherwise there is a race if two processes call
 554          * jopen() simultaneously and both initialize the file */
 555         plockf(jfd, F_LOCKW, 0, 0);
 556         lstat(jlockfile, &sinfo);
 557         if (sinfo.st_size != sizeof(unsigned int)) {
 558                 t = 0;
 559                 rv = spwrite(jfd, &t, sizeof(t), 0);
 560                 if (rv != sizeof(t)) {
 561                         plockf(jfd, F_UNLOCK, 0, 0);
 562                         return -1;
 563                 }
 564         }
 565         plockf(jfd, F_UNLOCK, 0, 0);
 566
 567         fs->jfd = jfd;
 568
 569         fs->jmap = (unsigned int *) mmap(NULL, sizeof(unsigned int),
 570                         PROT_READ | PROT_WRITE, MAP_SHARED, jfd, 0);
 571         if (fs->jmap == MAP_FAILED)
 572                 return -1;
 573
 574         return fd;
 575 }
 576
 577 /* sync a file (makes sense only if using lingering transactions) */
 578 int jsync(struct jfs *fs)
 579 {
 580         int rv;
 581         struct jlinger *linger, *ltmp;
 582
 583         pthread_mutex_lock(&(fs->lock));
 584
 585         rv = fsync(fs->fd);
 586         if (rv != 0)
 587                 goto exit;
 588
 589         linger = fs->ltrans;
 590         while (linger != NULL) {
 591                 free_tid(fs, linger->id);
 592                 unlink(linger->name);
 593                 free(linger->name);
 594
 595                 ltmp = linger->next;
 596                 free(linger);
 597
 598                 linger = ltmp;
 599         }
 600
 601 exit:
 602         pthread_mutex_unlock(&(fs->lock));
 603         return rv;
 604 }
 605
 606 /* close a file */
 607 int jclose(struct jfs *fs)
 608 {
 609         if (jsync(fs))
 610                 return -1;
 611         if (close(fs->fd))
 612                 return -1;
 613         if (close(fs->jfd))
 614                 return -1;
 615         if (close(fs->jdirfd))
 616                 return -1;
 617         if (fs->name)
 618                 /* allocated by strdup() in jopen() */
 619                 free(fs->name);
 620         munmap(fs->jmap, sizeof(unsigned int));
 621         pthread_mutex_destroy(&(fs->lock));
 622
 623         return 0;
 624 }
 625