drivers/md/dm-exception-store.c

   1 /*
   2  * dm-snapshot.c
   3  *
   4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   5  *
   6  * This file is released under the GPL.
   7  */
   8
   9 #include "dm.h"
  10 #include "dm-snap.h"
  11 #include "dm-io.h"
  12 #include "kcopyd.h"
  13
  14 #include <linux/mm.h>
  15 #include <linux/pagemap.h>
  16 #include <linux/vmalloc.h>
  17 #include <linux/slab.h>
  18
  19 #define DM_MSG_PREFIX "snapshots"
  20
  21 /*-----------------------------------------------------------------
  22  * Persistent snapshots, by persistent we mean that the snapshot
  23  * will survive a reboot.
  24  *---------------------------------------------------------------*/
  25
  26 /*
  27  * We need to store a record of which parts of the origin have
  28  * been copied to the snapshot device.  The snapshot code
  29  * requires that we copy exception chunks to chunk aligned areas
  30  * of the COW store.  It makes sense therefore, to store the
  31  * metadata in chunk size blocks.
  32  *
  33  * There is no backward or forward compatibility implemented,
  34  * snapshots with different disk versions than the kernel will
  35  * not be usable.  It is expected that "lvcreate" will blank out
  36  * the start of a fresh COW device before calling the snapshot
  37  * constructor.
  38  *
  39  * The first chunk of the COW device just contains the header.
  40  * After this there is a chunk filled with exception metadata,
  41  * followed by as many exception chunks as can fit in the
  42  * metadata areas.
  43  *
  44  * All on disk structures are in little-endian format.  The end
  45  * of the exceptions info is indicated by an exception with a
  46  * new_chunk of 0, which is invalid since it would point to the
  47  * header chunk.
  48  */
  49
  50 /*
  51  * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
  52  */
  53 #define SNAP_MAGIC 0x70416e53
  54
  55 /*
  56  * The on-disk version of the metadata.
  57  */
  58 #define SNAPSHOT_DISK_VERSION 1
  59
  /*
   * Header kept in chunk 0 of the COW device.  Every field is a
   * 32-bit little-endian value on disk and is converted with
   * le32_to_cpu()/cpu_to_le32() by read_header()/write_header().
   */
  struct disk_header {
          /* SNAP_MAGIC once initialised; 0 is treated as a fresh device. */
          uint32_t magic;

          /* Non-zero while the snapshot is usable; an invalid one cannot be recovered. */
          uint32_t valid;

          /* Plain incrementing format version, no backward compatibility. */
          uint32_t version;

          /* Chunk size, expressed in sectors. */
          uint32_t chunk_size;
  };
  78
  /*
   * One exception record as laid out in a metadata area: the chunk
   * number in the origin and the chunk number it was copied to in the
   * COW device.  Both are 64-bit little-endian on disk.  A new_chunk
   * of 0 terminates the list of records in an area.
   */
  struct disk_exception {
          uint64_t old_chunk;
          uint64_t new_chunk;
  };
  83
  /*
   * A completion notification queued by persistent_commit() until the
   * metadata area holding the exception has been written out.
   */
  struct commit_callback {
          /* Invoked with 'context' and 1 on success, 0 on failure. */
          void (*callback)(void *, int success);
          /* Opaque pointer handed back to 'callback'. */
          void *context;
  };
  88
  89 /*
  90  * The top level structure for a persistent exception store.
  91  */
  92 struct pstore {
  93         struct dm_snapshot *snap;       /* up pointer to my snapshot */
  94         int version;
  95         int valid;
  96         uint32_t exceptions_per_area;
  97
  98         /*
  99          * Now that we have an asynchronous kcopyd there is no
 100          * need for large chunk sizes, so it wont hurt to have a
 101          * whole chunks worth of metadata in memory at once.
 102          */
 103         void *area;
 104
 105         /*
 106          * Used to keep track of which metadata area the data in
 107          * 'chunk' refers to.
 108          */
 109         uint32_t current_area;
 110
 111         /*
 112          * The next free chunk for an exception.
 113          */
 114         uint32_t next_free;
 115
 116         /*
 117          * The index of next free exception in the current
 118          * metadata area.
 119          */
 120         uint32_t current_committed;
 121
 122         atomic_t pending_count;
 123         uint32_t callback_count;
 124         struct commit_callback *callbacks;
 125 };
 126
 127 static inline unsigned int sectors_to_pages(unsigned int sectors)
 128 {
 129         return sectors / (PAGE_SIZE >> 9);
 130 }
 131
 132 static int alloc_area(struct pstore *ps)
 133 {
 134         int r = -ENOMEM;
 135         size_t len;
 136
 137         len = ps->snap->chunk_size << SECTOR_SHIFT;
 138
 139         /*
 140          * Allocate the chunk_size block of memory that will hold
 141          * a single metadata area.
 142          */
 143         ps->area = vmalloc(len);
 144         if (!ps->area)
 145                 return r;
 146
 147         return 0;
 148 }
 149
 150 static void free_area(struct pstore *ps)
 151 {
 152         vfree(ps->area);
 153 }
 154
 155 /*
 156  * Read or write a chunk aligned and sized block of data from a device.
 157  */
 158 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
 159 {
 160         struct io_region where;
 161         unsigned long bits;
 162
 163         where.bdev = ps->snap->cow->bdev;
 164         where.sector = ps->snap->chunk_size * chunk;
 165         where.count = ps->snap->chunk_size;
 166
 167         return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
 168 }
 169
 170 /*
 171  * Read or write a metadata area.  Remembering to skip the first
 172  * chunk which holds the header.
 173  */
 174 static int area_io(struct pstore *ps, uint32_t area, int rw)
 175 {
 176         int r;
 177         uint32_t chunk;
 178
 179         /* convert a metadata area index to a chunk index */
 180         chunk = 1 + ((ps->exceptions_per_area + 1) * area);
 181
 182         r = chunk_io(ps, chunk, rw);
 183         if (r)
 184                 return r;
 185
 186         ps->current_area = area;
 187         return 0;
 188 }
 189
 190 static int zero_area(struct pstore *ps, uint32_t area)
 191 {
 192         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 193         return area_io(ps, area, WRITE);
 194 }
 195
 196 static int read_header(struct pstore *ps, int *new_snapshot)
 197 {
 198         int r;
 199         struct disk_header *dh;
 200         chunk_t chunk_size;
 201
 202         r = chunk_io(ps, 0, READ);
 203         if (r)
 204                 return r;
 205
 206         dh = (struct disk_header *) ps->area;
 207
 208         if (le32_to_cpu(dh->magic) == 0) {
 209                 *new_snapshot = 1;
 210
 211         } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
 212                 *new_snapshot = 0;
 213                 ps->valid = le32_to_cpu(dh->valid);
 214                 ps->version = le32_to_cpu(dh->version);
 215                 chunk_size = le32_to_cpu(dh->chunk_size);
 216                 if (ps->snap->chunk_size != chunk_size) {
 217                         DMWARN("chunk size %llu in device metadata overrides "
 218                                "table chunk size of %llu.",
 219                                (unsigned long long)chunk_size,
 220                                (unsigned long long)ps->snap->chunk_size);
 221
 222                         /* We had a bogus chunk_size. Fix stuff up. */
 223                         dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 224                         free_area(ps);
 225
 226                         ps->snap->chunk_size = chunk_size;
 227                         ps->snap->chunk_mask = chunk_size - 1;
 228                         ps->snap->chunk_shift = ffs(chunk_size) - 1;
 229
 230                         r = alloc_area(ps);
 231                         if (r)
 232                                 return r;
 233
 234                         r = dm_io_get(sectors_to_pages(chunk_size));
 235                         if (r)
 236                                 return r;
 237                 }
 238         } else {
 239                 DMWARN("Invalid/corrupt snapshot");
 240                 r = -ENXIO;
 241         }
 242
 243         return r;
 244 }
 245
 246 static int write_header(struct pstore *ps)
 247 {
 248         struct disk_header *dh;
 249
 250         memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
 251
 252         dh = (struct disk_header *) ps->area;
 253         dh->magic = cpu_to_le32(SNAP_MAGIC);
 254         dh->valid = cpu_to_le32(ps->valid);
 255         dh->version = cpu_to_le32(ps->version);
 256         dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
 257
 258         return chunk_io(ps, 0, WRITE);
 259 }
 260
 261 /*
 262  * Access functions for the disk exceptions, these do the endian conversions.
 263  */
 264 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 265 {
 266         if (index >= ps->exceptions_per_area)
 267                 return NULL;
 268
 269         return ((struct disk_exception *) ps->area) + index;
 270 }
 271
 272 static int read_exception(struct pstore *ps,
 273                           uint32_t index, struct disk_exception *result)
 274 {
 275         struct disk_exception *e;
 276
 277         e = get_exception(ps, index);
 278         if (!e)
 279                 return -EINVAL;
 280
 281         /* copy it */
 282         result->old_chunk = le64_to_cpu(e->old_chunk);
 283         result->new_chunk = le64_to_cpu(e->new_chunk);
 284
 285         return 0;
 286 }
 287
 288 static int write_exception(struct pstore *ps,
 289                            uint32_t index, struct disk_exception *de)
 290 {
 291         struct disk_exception *e;
 292
 293         e = get_exception(ps, index);
 294         if (!e)
 295                 return -EINVAL;
 296
 297         /* copy it */
 298         e->old_chunk = cpu_to_le64(de->old_chunk);
 299         e->new_chunk = cpu_to_le64(de->new_chunk);
 300
 301         return 0;
 302 }
 303
 304 /*
 305  * Registers the exceptions that are present in the current area.
 306  * 'full' is filled in to indicate if the area has been
 307  * filled.
 308  */
 309 static int insert_exceptions(struct pstore *ps, int *full)
 310 {
 311         int r;
 312         unsigned int i;
 313         struct disk_exception de;
 314
 315         /* presume the area is full */
 316         *full = 1;
 317
 318         for (i = 0; i < ps->exceptions_per_area; i++) {
 319                 r = read_exception(ps, i, &de);
 320
 321                 if (r)
 322                         return r;
 323
 324                 /*
 325                  * If the new_chunk is pointing at the start of
 326                  * the COW device, where the first metadata area
 327                  * is we know that we've hit the end of the
 328                  * exceptions.  Therefore the area is not full.
 329                  */
 330                 if (de.new_chunk == 0LL) {
 331                         ps->current_committed = i;
 332                         *full = 0;
 333                         break;
 334                 }
 335
 336                 /*
 337                  * Keep track of the start of the free chunks.
 338                  */
 339                 if (ps->next_free <= de.new_chunk)
 340                         ps->next_free = de.new_chunk + 1;
 341
 342                 /*
 343                  * Otherwise we add the exception to the snapshot.
 344                  */
 345                 r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
 346                 if (r)
 347                         return r;
 348         }
 349
 350         return 0;
 351 }
 352
 353 static int read_exceptions(struct pstore *ps)
 354 {
 355         uint32_t area;
 356         int r, full = 1;
 357
 358         /*
 359          * Keeping reading chunks and inserting exceptions until
 360          * we find a partially full area.
 361          */
 362         for (area = 0; full; area++) {
 363                 r = area_io(ps, area, READ);
 364                 if (r)
 365                         return r;
 366
 367                 r = insert_exceptions(ps, &full);
 368                 if (r)
 369                         return r;
 370         }
 371
 372         return 0;
 373 }
 374
 375 static inline struct pstore *get_info(struct exception_store *store)
 376 {
 377         return (struct pstore *) store->context;
 378 }
 379
 380 static void persistent_fraction_full(struct exception_store *store,
 381                                      sector_t *numerator, sector_t *denominator)
 382 {
 383         *numerator = get_info(store)->next_free * store->snap->chunk_size;
 384         *denominator = get_dev_size(store->snap->cow->bdev);
 385 }
 386
 387 static void persistent_destroy(struct exception_store *store)
 388 {
 389         struct pstore *ps = get_info(store);
 390
 391         dm_io_put(sectors_to_pages(ps->snap->chunk_size));
 392         vfree(ps->callbacks);
 393         free_area(ps);
 394         kfree(ps);
 395 }
 396
 397 static int persistent_read_metadata(struct exception_store *store)
 398 {
 399         int r, new_snapshot;
 400         struct pstore *ps = get_info(store);
 401
 402         /*
 403          * Read the snapshot header.
 404          */
 405         r = read_header(ps, &new_snapshot);
 406         if (r)
 407                 return r;
 408
 409         /*
 410          * Now we know correct chunk_size, complete the initialisation.
 411          */
 412         ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
 413                                   sizeof(struct disk_exception);
 414         ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 415                         sizeof(*ps->callbacks));
 416         if (!ps->callbacks)
 417                 return -ENOMEM;
 418
 419         /*
 420          * Do we need to setup a new snapshot ?
 421          */
 422         if (new_snapshot) {
 423                 r = write_header(ps);
 424                 if (r) {
 425                         DMWARN("write_header failed");
 426                         return r;
 427                 }
 428
 429                 r = zero_area(ps, 0);
 430                 if (r) {
 431                         DMWARN("zero_area(0) failed");
 432                         return r;
 433                 }
 434
 435         } else {
 436                 /*
 437                  * Sanity checks.
 438                  */
 439                 if (!ps->valid) {
 440                         DMWARN("snapshot is marked invalid");
 441                         return -EINVAL;
 442                 }
 443
 444                 if (ps->version != SNAPSHOT_DISK_VERSION) {
 445                         DMWARN("unable to handle snapshot disk version %d",
 446                                ps->version);
 447                         return -EINVAL;
 448                 }
 449
 450                 /*
 451                  * Read the metadata.
 452                  */
 453                 r = read_exceptions(ps);
 454                 if (r)
 455                         return r;
 456         }
 457
 458         return 0;
 459 }
 460
 461 static int persistent_prepare(struct exception_store *store,
 462                               struct exception *e)
 463 {
 464         struct pstore *ps = get_info(store);
 465         uint32_t stride;
 466         sector_t size = get_dev_size(store->snap->cow->bdev);
 467
 468         /* Is there enough room ? */
 469         if (size < ((ps->next_free + 1) * store->snap->chunk_size))
 470                 return -ENOSPC;
 471
 472         e->new_chunk = ps->next_free;
 473
 474         /*
 475          * Move onto the next free pending, making sure to take
 476          * into account the location of the metadata chunks.
 477          */
 478         stride = (ps->exceptions_per_area + 1);
 479         if ((++ps->next_free % stride) == 1)
 480                 ps->next_free++;
 481
 482         atomic_inc(&ps->pending_count);
 483         return 0;
 484 }
 485
 486 static void persistent_commit(struct exception_store *store,
 487                               struct exception *e,
 488                               void (*callback) (void *, int success),
 489                               void *callback_context)
 490 {
 491         int r;
 492         unsigned int i;
 493         struct pstore *ps = get_info(store);
 494         struct disk_exception de;
 495         struct commit_callback *cb;
 496
 497         de.old_chunk = e->old_chunk;
 498         de.new_chunk = e->new_chunk;
 499         write_exception(ps, ps->current_committed++, &de);
 500
 501         /*
 502          * Add the callback to the back of the array.  This code
 503          * is the only place where the callback array is
 504          * manipulated, and we know that it will never be called
 505          * multiple times concurrently.
 506          */
 507         cb = ps->callbacks + ps->callback_count++;
 508         cb->callback = callback;
 509         cb->context = callback_context;
 510
 511         /*
 512          * If there are no more exceptions in flight, or we have
 513          * filled this metadata area we commit the exceptions to
 514          * disk.
 515          */
 516         if (atomic_dec_and_test(&ps->pending_count) ||
 517             (ps->current_committed == ps->exceptions_per_area)) {
 518                 r = area_io(ps, ps->current_area, WRITE);
 519                 if (r)
 520                         ps->valid = 0;
 521
 522                 for (i = 0; i < ps->callback_count; i++) {
 523                         cb = ps->callbacks + i;
 524                         cb->callback(cb->context, r == 0 ? 1 : 0);
 525                 }
 526
 527                 ps->callback_count = 0;
 528         }
 529
 530         /*
 531          * Have we completely filled the current area ?
 532          */
 533         if (ps->current_committed == ps->exceptions_per_area) {
 534                 ps->current_committed = 0;
 535                 r = zero_area(ps, ps->current_area + 1);
 536                 if (r)
 537                         ps->valid = 0;
 538         }
 539 }
 540
 541 static void persistent_drop(struct exception_store *store)
 542 {
 543         struct pstore *ps = get_info(store);
 544
 545         ps->valid = 0;
 546         if (write_header(ps))
 547                 DMWARN("write header failed");
 548 }
 549
 550 int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
 551 {
 552         int r;
 553         struct pstore *ps;
 554
 555         r = dm_io_get(sectors_to_pages(chunk_size));
 556         if (r)
 557                 return r;
 558
 559         /* allocate the pstore */
 560         ps = kmalloc(sizeof(*ps), GFP_KERNEL);
 561         if (!ps) {
 562                 r = -ENOMEM;
 563                 goto bad;
 564         }
 565
 566         ps->snap = store->snap;
 567         ps->valid = 1;
 568         ps->version = SNAPSHOT_DISK_VERSION;
 569         ps->next_free = 2;      /* skipping the header and first area */
 570         ps->current_committed = 0;
 571
 572         r = alloc_area(ps);
 573         if (r)
 574                 goto bad;
 575
 576         ps->callback_count = 0;
 577         atomic_set(&ps->pending_count, 0);
 578         ps->callbacks = NULL;
 579
 580         store->destroy = persistent_destroy;
 581         store->read_metadata = persistent_read_metadata;
 582         store->prepare_exception = persistent_prepare;
 583         store->commit_exception = persistent_commit;
 584         store->drop_snapshot = persistent_drop;
 585         store->fraction_full = persistent_fraction_full;
 586         store->context = ps;
 587
 588         return 0;
 589
 590       bad:
 591         dm_io_put(sectors_to_pages(chunk_size));
 592         if (ps && ps->area)
 593                 free_area(ps);
 594         kfree(ps);
 595         return r;
 596 }
 597
 598 /*-----------------------------------------------------------------
 599  * Implementation of the store for non-persistent snapshots.
 600  *---------------------------------------------------------------*/
 601 struct transient_c {
 602         sector_t next_free;
 603 };
 604
 605 static void transient_destroy(struct exception_store *store)
 606 {
 607         kfree(store->context);
 608 }
 609
 /*
  * A transient store keeps nothing on disk, so there is no metadata
  * to load.  Always returns 0; @store is not used.
  */
 static int transient_read_metadata(struct exception_store *store)
 {
         const int nothing_to_read = 0;

         return nothing_to_read;
 }
 614
 615 static int transient_prepare(struct exception_store *store, struct exception *e)
 616 {
 617         struct transient_c *tc = (struct transient_c *) store->context;
 618         sector_t size = get_dev_size(store->snap->cow->bdev);
 619
 620         if (size < (tc->next_free + store->snap->chunk_size))
 621                 return -1;
 622
 623         e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
 624         tc->next_free += store->snap->chunk_size;
 625
 626         return 0;
 627 }
 628
 /*
  * Nothing has to be written for a transient store, so the commit
  * completes immediately: @callback is invoked with
  * @callback_context and a success value of 1.  @store and @e are
  * not used.
  */
 static void transient_commit(struct exception_store *store,
                       struct exception *e,
                       void (*callback) (void *, int success),
                       void *callback_context)
 {
         const int success = 1;

         callback(callback_context, success);
 }
 637
 638 static void transient_fraction_full(struct exception_store *store,
 639                                     sector_t *numerator, sector_t *denominator)
 640 {
 641         *numerator = ((struct transient_c *) store->context)->next_free;
 642         *denominator = get_dev_size(store->snap->cow->bdev);
 643 }
 644
 645 int dm_create_transient(struct exception_store *store,
 646                         struct dm_snapshot *s, int blocksize)
 647 {
 648         struct transient_c *tc;
 649
 650         memset(store, 0, sizeof(*store));
 651         store->destroy = transient_destroy;
 652         store->read_metadata = transient_read_metadata;
 653         store->prepare_exception = transient_prepare;
 654         store->commit_exception = transient_commit;
 655         store->fraction_full = transient_fraction_full;
 656         store->snap = s;
 657
 658         tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
 659         if (!tc)
 660                 return -ENOMEM;
 661
 662         tc->next_free = 0;
 663         store->context = tc;
 664
 665         return 0;
 666 }