/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
 *
 * See also hammer_undo.c
 */

#include "hammer.h"

RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
             hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
/*
 * HAMMER version 4+ REDO support.
 *
 * REDO records are used to improve fsync() performance.  Instead of having
 * to go through a complete double-flush cycle involving at least two disk
 * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
 * the related REDO records, which is a single synchronization requiring
 * no track seeking.  If a recovery becomes necessary the recovery code
 * will generate logical data writes based on the REDO records encountered.
 * That is, the recovery code will UNDO any partial meta-data/data writes
 * at the raw disk block level and then REDO the data writes at the logical
 * level.
 */
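/*
 * Illustrative sketch (not part of the original file): the write path is
 * assumed to lay down one REDO record per logical write so that a later
 * fsync() only needs to flush the FIFO, roughly:
 *
 *	error = hammer_generate_redo(trans, ip, file_offset,
 *				     HAMMER_REDO_WRITE, data, data_len);
 *
 * The HAMMER_REDO_WRITE flavor and exact arguments shown here are
 * assumptions; the real call sites live in hammer_vnops.c.
 */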
int
hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
                     hammer_off_t file_off, uint32_t flags,
                     void *base, int len)
{
        hammer_mount_t hmp;
        hammer_volume_t root_volume;
        hammer_blockmap_t undomap;
        hammer_buffer_t buffer = NULL;
        hammer_fifo_redo_t redo;
        hammer_fifo_tail_t tail;
        hammer_off_t next_offset;
        int error;
        int bytes;
        int n;

        /*
         * Setup
         */
        hmp = trans->hmp;

        root_volume = trans->rootvol;
        undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

        /*
         * No undo recursion when modifying the root volume
         */
        hammer_modify_volume_noundo(NULL, root_volume);
        hammer_lock_ex(&hmp->undo_lock);

        /* undo had better not roll over (loose test) */
        if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
                hpanic("insufficient UNDO/REDO FIFO space for redo!");

        /*
         * Loop until the undo for the entire range has been laid down.
         * Loop at least once (len might be 0 as a degenerate case).
         */
        for (;;) {
                /*
                 * Fetch the layout offset in the UNDO FIFO, wrap it as
                 * necessary.
                 */
                if (undomap->next_offset == undomap->alloc_offset)
                        undomap->next_offset = HAMMER_ENCODE_UNDO(0);
                next_offset = undomap->next_offset;

                /*
                 * This is a tail-chasing FIFO, when we hit the start of a new
                 * buffer we don't have to read it in.
                 */
                if ((next_offset & HAMMER_BUFMASK) == 0) {
                        redo = hammer_bnew(hmp, next_offset, &error, &buffer);
                        hammer_format_undo(redo, hmp->undo_seqno ^ 0x40000000);
                } else {
                        redo = hammer_bread(hmp, next_offset, &error, &buffer);
                }
                if (error)
                        break;
                hammer_modify_buffer_noundo(NULL, buffer);
                /*
                 * Calculate how big a media structure fits up to the next
                 * alignment point and how large a data payload we can
                 * accommodate.
                 *
                 * If n calculates to 0 or negative there is no room for
                 * anything but a PAD.
                 */
                bytes = HAMMER_UNDO_ALIGN -
                        ((int)next_offset & HAMMER_UNDO_MASK);
                n = bytes -
                    (int)sizeof(struct hammer_fifo_redo) -
                    (int)sizeof(struct hammer_fifo_tail);

                /*
                 * If available space is insufficient for any payload
                 * we have to lay down a PAD.
                 *
                 * The minimum PAD is 8 bytes and the head and tail will
                 * overlap each other in that case.  PADs do not have
                 * sequence numbers or CRCs.
                 *
                 * A PAD may not start on a boundary.  That is, every
                 * 512-byte block in the UNDO/REDO FIFO must begin with
                 * a record containing a sequence number.
                 */
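                /*
                 * Illustrative worked example (not part of the original
                 * file), assuming HAMMER_UNDO_ALIGN is the 512-byte unit
                 * mentioned above: if next_offset sits 496 bytes into an
                 * alignment unit, bytes is 512 - 496 = 16, which cannot
                 * hold a hammer_fifo_redo header plus a hammer_fifo_tail,
                 * so n goes non-positive and a 16-byte PAD is laid down
                 * instead of a REDO record.
                 */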
                if (n <= 0) {
                        KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
                        KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
                        tail = (void *)((char *)redo + bytes - sizeof(*tail));
                        if ((void *)redo != (void *)tail) {
                                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                                tail->tail_type = HAMMER_HEAD_TYPE_PAD;
                                tail->tail_size = bytes;
                        }
                        redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                        redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
                        redo->head.hdr_size = bytes;
                        /* NO CRC OR SEQ NO */
                        undomap->next_offset += bytes;
                        hammer_modify_buffer_done(buffer);
                        hammer_stats_redo += bytes;
                        continue;
                }
                /*
                 * When generating an inode-related REDO record we track
                 * the point in the UNDO/REDO FIFO containing the inode's
                 * earliest REDO record.  See hammer_generate_redo_sync().
                 *
                 * redo_fifo_next is cleared when an inode is staged to
                 * the backend and then used to determine how to reassign
                 * redo_fifo_start after the inode flush completes.
                 */
                if (ip) {
                        redo->redo_objid = ip->obj_id;
                        redo->redo_localization = ip->obj_localization;
                        if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
                                ip->redo_fifo_start = next_offset;
                                if (RB_INSERT(hammer_redo_rb_tree,
                                              &hmp->rb_redo_root, ip)) {
                                        hpanic("cannot insert inode %p on "
                                               "redo FIFO", ip);
                                }
                                ip->flags |= HAMMER_INODE_RDIRTY;
                        }
                        if (ip->redo_fifo_next == 0)
                                ip->redo_fifo_next = next_offset;
                } else {
                        redo->redo_objid = 0;
                        redo->redo_localization = 0;
                }
                /*
                 * Calculate the actual payload and recalculate the size
                 * of the media structure as necessary.  If no data buffer
                 * is supplied there is no payload.
                 */
                if (base == NULL) {
                        n = 0;
                } else if (n > len) {
                        n = len;
                }
                bytes = ((n + HAMMER_HEAD_ALIGN_MASK) &
                         ~HAMMER_HEAD_ALIGN_MASK) +
                        (int)sizeof(struct hammer_fifo_redo) +
                        (int)sizeof(struct hammer_fifo_tail);
                if (hammer_debug_general & 0x0080) {
                        hdkprintf("redo %016jx %d %d\n",
                                  (intmax_t)next_offset, bytes, n);
                }

                redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
                redo->head.hdr_size = bytes;
                redo->head.hdr_seq = hmp->undo_seqno++;
                redo->head.hdr_crc = 0;
                redo->redo_mtime = trans->time;
                redo->redo_offset = file_off;
                redo->redo_flags = flags;
                /*
                 * Incremental payload.  If no payload we throw the entire
                 * len into redo_data_bytes and will not loop.
                 */
                if (base) {
                        redo->redo_data_bytes = n;
                        bcopy(base, redo + 1, n);
                        len -= n;
                        base = (char *)base + n;
                        file_off += n;
                } else {
                        redo->redo_data_bytes = len;
                        file_off += len;
                        len = 0;
                }

                tail = (void *)((char *)redo + bytes - sizeof(*tail));
                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                tail->tail_type = HAMMER_HEAD_TYPE_REDO;
                tail->tail_size = bytes;

                KKASSERT(bytes >= sizeof(redo->head));
                hammer_crc_set_fifo_head(&redo->head, bytes);
                undomap->next_offset += bytes;
                hammer_stats_redo += bytes;
                /*
                 * Before we finish off the buffer we have to deal with any
                 * junk between the end of the media structure we just laid
                 * down and the UNDO alignment boundary.  We do this by laying
                 * down a dummy PAD.  Even though we will probably overwrite
                 * it almost immediately we have to do this so recovery runs
                 * can iterate the UNDO space without having to depend on
                 * the indices in the volume header.
                 *
                 * This dummy PAD will be overwritten on the next undo so
                 * we do not adjust undomap->next_offset.
                 */
                bytes = HAMMER_UNDO_ALIGN -
                        ((int)undomap->next_offset & HAMMER_UNDO_MASK);
                if (bytes != HAMMER_UNDO_ALIGN) {
                        KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
                        redo = (void *)(tail + 1);
                        tail = (void *)((char *)redo + bytes - sizeof(*tail));
                        if ((void *)redo != (void *)tail) {
                                tail->tail_signature = HAMMER_TAIL_SIGNATURE;
                                tail->tail_type = HAMMER_HEAD_TYPE_PAD;
                                tail->tail_size = bytes;
                        }
                        redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
                        redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
                        redo->head.hdr_size = bytes;
                        /* NO CRC OR SEQ NO */
                }
                hammer_modify_buffer_done(buffer);
                if (len == 0)
                        break;
        }
        hammer_modify_volume_done(root_volume);
        hammer_unlock(&hmp->undo_lock);

        if (buffer)
                hammer_rel_buffer(buffer, 0);
        /*
         * Make sure the nominal undo span contains at least one REDO_SYNC,
         * otherwise the REDO recovery will not be triggered.
         */
        if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
            flags != HAMMER_REDO_SYNC) {
                hammer_generate_redo_sync(trans);
        }

        return(error);
}
/*
 * Generate a REDO SYNC record.  At least one such record must be generated
 * in the nominal recovery span for the recovery code to be able to run
 * REDOs outside of the span.
 *
 * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
 * for all inodes with active REDOs.  This changes dynamically as inodes
 * get flushed.
 *
 * During recovery stage2 any new flush cycles must specify the original
 * redo sync offset.  That way a crash will re-run the REDOs, at least
 * up to the point where the UNDO FIFO does not overwrite the area.
 */
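/*
 * Clarifying note (not part of the original file): the SYNC record is
 * written through hammer_generate_redo() with a NULL inode and no payload,
 * so its redo_offset field carries the earliest UNDO/REDO FIFO offset
 * rather than a file offset.  Recovery is assumed to use that offset as
 * the starting point for its REDO pass.
 */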
void
hammer_generate_redo_sync(hammer_transaction_t trans)
{
        hammer_mount_t hmp = trans->hmp;
        hammer_inode_t ip;
        hammer_off_t redo_fifo_start;

        if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
                ip = NULL;
                redo_fifo_start = hmp->recover_stage2_offset;
        } else {
                ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
                if (ip)
                        redo_fifo_start = ip->redo_fifo_start;
                else
                        redo_fifo_start = 0;
        }
        if (redo_fifo_start) {
                if (hammer_debug_io & 0x0004) {
                        hdkprintf("SYNC IP %p %016jx\n",
                                  ip, (intmax_t)redo_fifo_start);
                }
                hammer_generate_redo(trans, NULL, redo_fifo_start,
                                     HAMMER_REDO_SYNC, NULL, 0);
                trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
        }
}
/*
 * This is called when an inode is queued to the backend.
 */
void
hammer_redo_fifo_start_flush(hammer_inode_t ip)
{
        ip->redo_fifo_next = 0;
}
/*
 * This is called when an inode backend flush is finished.  We have to make
 * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
 * can get destroyed through operations such as truncations and leave
 * us with a stale redo_fifo_next.
 */
void
hammer_redo_fifo_end_flush(hammer_inode_t ip)
{
        hammer_mount_t hmp = ip->hmp;

        if (ip->flags & HAMMER_INODE_RDIRTY) {
                RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
                ip->flags &= ~HAMMER_INODE_RDIRTY;
        }
        if ((ip->flags & HAMMER_INODE_BUFS) == 0)
                ip->redo_fifo_next = 0;
        if (ip->redo_fifo_next) {
                ip->redo_fifo_start = ip->redo_fifo_next;
                if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
                        hpanic("cannot reinsert inode %p on redo FIFO", ip);
                }
                ip->flags |= HAMMER_INODE_RDIRTY;
        }
}