fs/xfs/xfs_buf_item.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_types.h"
  21 #include "xfs_bit.h"
  22 #include "xfs_log.h"
  23 #include "xfs_trans.h"
  24 #include "xfs_sb.h"
  25 #include "xfs_ag.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_buf_item.h"
  28 #include "xfs_trans_priv.h"
  29 #include "xfs_error.h"
  30 #include "xfs_trace.h"
  31
  32
/*
 * Allocation zone for buf log items.  xfs_buf_item_init() allocates each new
 * buf log item from this zone with kmem_zone_zalloc().
 */
kmem_zone_t     *xfs_buf_item_zone;
  34
  35 static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
  36 {
  37         return container_of(lip, struct xfs_buf_log_item, bli_item);
  38 }
  39
/*
 * Forward declaration: xfs_buf_item_unpin() calls this helper, whose
 * definition is not in this part of the file.
 */
STATIC void     xfs_buf_do_callbacks(struct xfs_buf *bp);
  41
  42 /*
  43  * This returns the number of log iovecs needed to log the
  44  * given buf log item.
  45  *
  46  * It calculates this as 1 iovec for the buf log format structure
  47  * and 1 for each stretch of non-contiguous chunks to be logged.
  48  * Contiguous chunks are logged in a single iovec.
  49  *
  50  * If the XFS_BLI_STALE flag has been set, then log nothing.
  51  */
  52 STATIC uint
  53 xfs_buf_item_size_segment(
  54         struct xfs_buf_log_item *bip,
  55         struct xfs_buf_log_format *blfp)
  56 {
  57         struct xfs_buf          *bp = bip->bli_buf;
  58         uint                    nvecs;
  59         int                     next_bit;
  60         int                     last_bit;
  61
  62         last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
  63         if (last_bit == -1)
  64                 return 0;
  65
  66         /*
  67          * initial count for a dirty buffer is 2 vectors - the format structure
  68          * and the first dirty region.
  69          */
  70         nvecs = 2;
  71
  72         while (last_bit != -1) {
  73                 /*
  74                  * This takes the bit number to start looking from and
  75                  * returns the next set bit from there.  It returns -1
  76                  * if there are no more bits set or the start bit is
  77                  * beyond the end of the bitmap.
  78                  */
  79                 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
  80                                         last_bit + 1);
  81                 /*
  82                  * If we run out of bits, leave the loop,
  83                  * else if we find a new set of bits bump the number of vecs,
  84                  * else keep scanning the current set of bits.
  85                  */
  86                 if (next_bit == -1) {
  87                         break;
  88                 } else if (next_bit != last_bit + 1) {
  89                         last_bit = next_bit;
  90                         nvecs++;
  91                 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
  92                            (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
  93                             XFS_BLF_CHUNK)) {
  94                         last_bit = next_bit;
  95                         nvecs++;
  96                 } else {
  97                         last_bit++;
  98                 }
  99         }
 100
 101         return nvecs;
 102 }
 103
 104 /*
 105  * This returns the number of log iovecs needed to log the given buf log item.
 106  *
 107  * It calculates this as 1 iovec for the buf log format structure and 1 for each
 108  * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
 109  * in a single iovec.
 110  *
 111  * Discontiguous buffers need a format structure per region that that is being
 112  * logged. This makes the changes in the buffer appear to log recovery as though
 113  * they came from separate buffers, just like would occur if multiple buffers
 114  * were used instead of a single discontiguous buffer. This enables
 115  * discontiguous buffers to be in-memory constructs, completely transparent to
 116  * what ends up on disk.
 117  *
 118  * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 119  * format structures.
 120  */
 121 STATIC uint
 122 xfs_buf_item_size(
 123         struct xfs_log_item     *lip)
 124 {
 125         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 126         uint                    nvecs;
 127         int                     i;
 128
 129         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 130         if (bip->bli_flags & XFS_BLI_STALE) {
 131                 /*
 132                  * The buffer is stale, so all we need to log
 133                  * is the buf log format structure with the
 134                  * cancel flag in it.
 135                  */
 136                 trace_xfs_buf_item_size_stale(bip);
 137                 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
 138                 return bip->bli_format_count;
 139         }
 140
 141         ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 142
 143         if (bip->bli_flags & XFS_BLI_ORDERED) {
 144                 /*
 145                  * The buffer has been logged just to order it.
 146                  * It is not being included in the transaction
 147                  * commit, so no vectors are used at all.
 148                  */
 149                 trace_xfs_buf_item_size_ordered(bip);
 150                 return XFS_LOG_VEC_ORDERED;
 151         }
 152
 153         /*
 154          * the vector count is based on the number of buffer vectors we have
 155          * dirty bits in. This will only be greater than one when we have a
 156          * compound buffer with more than one segment dirty. Hence for compound
 157          * buffers we need to track which segment the dirty bits correspond to,
 158          * and when we move from one segment to the next increment the vector
 159          * count for the extra buf log format structure that will need to be
 160          * written.
 161          */
 162         nvecs = 0;
 163         for (i = 0; i < bip->bli_format_count; i++) {
 164                 nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]);
 165         }
 166
 167         trace_xfs_buf_item_size(bip);
 168         return nvecs;
 169 }
 170
 171 static struct xfs_log_iovec *
 172 xfs_buf_item_format_segment(
 173         struct xfs_buf_log_item *bip,
 174         struct xfs_log_iovec    *vecp,
 175         uint                    offset,
 176         struct xfs_buf_log_format *blfp)
 177 {
 178         struct xfs_buf  *bp = bip->bli_buf;
 179         uint            base_size;
 180         uint            nvecs;
 181         int             first_bit;
 182         int             last_bit;
 183         int             next_bit;
 184         uint            nbits;
 185         uint            buffer_offset;
 186
 187         /* copy the flags across from the base format item */
 188         blfp->blf_flags = bip->__bli_format.blf_flags;
 189
 190         /*
 191          * Base size is the actual size of the ondisk structure - it reflects
 192          * the actual size of the dirty bitmap rather than the size of the in
 193          * memory structure.
 194          */
 195         base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
 196                         (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
 197
 198         nvecs = 0;
 199         first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 200         if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
 201                 /*
 202                  * If the map is not be dirty in the transaction, mark
 203                  * the size as zero and do not advance the vector pointer.
 204                  */
 205                 goto out;
 206         }
 207
 208         vecp->i_addr = blfp;
 209         vecp->i_len = base_size;
 210         vecp->i_type = XLOG_REG_TYPE_BFORMAT;
 211         vecp++;
 212         nvecs = 1;
 213
 214         if (bip->bli_flags & XFS_BLI_STALE) {
 215                 /*
 216                  * The buffer is stale, so all we need to log
 217                  * is the buf log format structure with the
 218                  * cancel flag in it.
 219                  */
 220                 trace_xfs_buf_item_format_stale(bip);
 221                 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
 222                 goto out;
 223         }
 224
 225
 226         /*
 227          * Fill in an iovec for each set of contiguous chunks.
 228          */
 229
 230         last_bit = first_bit;
 231         nbits = 1;
 232         for (;;) {
 233                 /*
 234                  * This takes the bit number to start looking from and
 235                  * returns the next set bit from there.  It returns -1
 236                  * if there are no more bits set or the start bit is
 237                  * beyond the end of the bitmap.
 238                  */
 239                 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 240                                         (uint)last_bit + 1);
 241                 /*
 242                  * If we run out of bits fill in the last iovec and get
 243                  * out of the loop.
 244                  * Else if we start a new set of bits then fill in the
 245                  * iovec for the series we were looking at and start
 246                  * counting the bits in the new one.
 247                  * Else we're still in the same set of bits so just
 248                  * keep counting and scanning.
 249                  */
 250                 if (next_bit == -1) {
 251                         buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
 252                         vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 253                         vecp->i_len = nbits * XFS_BLF_CHUNK;
 254                         vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 255                         nvecs++;
 256                         break;
 257                 } else if (next_bit != last_bit + 1) {
 258                         buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
 259                         vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 260                         vecp->i_len = nbits * XFS_BLF_CHUNK;
 261                         vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 262                         nvecs++;
 263                         vecp++;
 264                         first_bit = next_bit;
 265                         last_bit = next_bit;
 266                         nbits = 1;
 267                 } else if (xfs_buf_offset(bp, offset +
 268                                               (next_bit << XFS_BLF_SHIFT)) !=
 269                            (xfs_buf_offset(bp, offset +
 270                                                (last_bit << XFS_BLF_SHIFT)) +
 271                             XFS_BLF_CHUNK)) {
 272                         buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
 273                         vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 274                         vecp->i_len = nbits * XFS_BLF_CHUNK;
 275                         vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 276                         nvecs++;
 277                         vecp++;
 278                         first_bit = next_bit;
 279                         last_bit = next_bit;
 280                         nbits = 1;
 281                 } else {
 282                         last_bit++;
 283                         nbits++;
 284                 }
 285         }
 286 out:
 287         blfp->blf_size = nvecs;
 288         return vecp;
 289 }
 290
 291 /*
 292  * This is called to fill in the vector of log iovecs for the
 293  * given log buf item.  It fills the first entry with a buf log
 294  * format structure, and the rest point to contiguous chunks
 295  * within the buffer.
 296  */
 297 STATIC void
 298 xfs_buf_item_format(
 299         struct xfs_log_item     *lip,
 300         struct xfs_log_iovec    *vecp)
 301 {
 302         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 303         struct xfs_buf          *bp = bip->bli_buf;
 304         uint                    offset = 0;
 305         int                     i;
 306
 307         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 308         ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 309                (bip->bli_flags & XFS_BLI_STALE));
 310
 311         /*
 312          * If it is an inode buffer, transfer the in-memory state to the
 313          * format flags and clear the in-memory state.
 314          *
 315          * For buffer based inode allocation, we do not transfer
 316          * this state if the inode buffer allocation has not yet been committed
 317          * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
 318          * correct replay of the inode allocation.
 319          *
 320          * For icreate item based inode allocation, the buffers aren't written
 321          * to the journal during allocation, and hence we should always tag the
 322          * buffer as an inode buffer so that the correct unlinked list replay
 323          * occurs during recovery.
 324          */
 325         if (bip->bli_flags & XFS_BLI_INODE_BUF) {
 326                 if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
 327                     !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
 328                       xfs_log_item_in_current_chkpt(lip)))
 329                         bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
 330                 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
 331         }
 332
 333         if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
 334                                                         XFS_BLI_ORDERED) {
 335                 /*
 336                  * The buffer has been logged just to order it.  It is not being
 337                  * included in the transaction commit, so don't format it.
 338                  */
 339                 trace_xfs_buf_item_format_ordered(bip);
 340                 return;
 341         }
 342
 343         for (i = 0; i < bip->bli_format_count; i++) {
 344                 vecp = xfs_buf_item_format_segment(bip, vecp, offset,
 345                                                 &bip->bli_formats[i]);
 346                 offset += bp->b_maps[i].bm_len;
 347         }
 348
 349         /*
 350          * Check to make sure everything is consistent.
 351          */
 352         trace_xfs_buf_item_format(bip);
 353 }
 354
 355 /*
 356  * This is called to pin the buffer associated with the buf log item in memory
 357  * so it cannot be written out.
 358  *
 359  * We also always take a reference to the buffer log item here so that the bli
 360  * is held while the item is pinned in memory. This means that we can
 361  * unconditionally drop the reference count a transaction holds when the
 362  * transaction is completed.
 363  */
 364 STATIC void
 365 xfs_buf_item_pin(
 366         struct xfs_log_item     *lip)
 367 {
 368         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 369
 370         ASSERT(atomic_read(&bip->bli_refcount) > 0);
 371         ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 372                (bip->bli_flags & XFS_BLI_ORDERED) ||
 373                (bip->bli_flags & XFS_BLI_STALE));
 374
 375         trace_xfs_buf_item_pin(bip);
 376
 377         atomic_inc(&bip->bli_refcount);
 378         atomic_inc(&bip->bli_buf->b_pin_count);
 379 }
 380
/*
 * This is called to unpin the buffer associated with the buf log
 * item which was previously pinned with a call to xfs_buf_item_pin().
 *
 * Also drop the reference to the buf item for the current transaction.
 * If the XFS_BLI_STALE flag is set and we are the last reference,
 * then free up the buf log item and unlock the buffer.
 *
 * If the remove flag is set we are called from uncommit in the
 * forced-shutdown path.  If that is true and the reference count on
 * the log item is going to drop to zero we need to free the item's
 * descriptor in the transaction.
 *
 * @lip:    the log item embedded in the buf log item being unpinned
 * @remove: non-zero when called from the uncommit/shutdown path described
 *          above
 */
STATIC void
xfs_buf_item_unpin(
        struct xfs_log_item     *lip,
        int                     remove)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        xfs_buf_t       *bp = bip->bli_buf;
        struct xfs_ail  *ailp = lip->li_ailp;
        /* Sample the stale state before the reference is dropped below. */
        int             stale = bip->bli_flags & XFS_BLI_STALE;
        int             freed;

        ASSERT(bp->b_fspriv == bip);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);

        trace_xfs_buf_item_unpin(bip);

        /* freed is true when this call dropped the last bli reference. */
        freed = atomic_dec_and_test(&bip->bli_refcount);

        /* Wake anyone waiting on b_waiters once the pin count hits zero. */
        if (atomic_dec_and_test(&bp->b_pin_count))
                wake_up_all(&bp->b_waiters);

        if (freed && stale) {
                ASSERT(bip->bli_flags & XFS_BLI_STALE);
                ASSERT(xfs_buf_islocked(bp));
                ASSERT(XFS_BUF_ISSTALE(bp));
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);

                trace_xfs_buf_item_unpin_stale(bip);

                if (remove) {
                        /*
                         * If we are in a transaction context, we have to
                         * remove the log item from the transaction as we are
                         * about to release our reference to the buffer.  If we
                         * don't, the unlock that occurs later in
                         * xfs_trans_uncommit() will try to reference the
                         * buffer which we no longer have a hold on.
                         */
                        if (lip->li_desc)
                                xfs_trans_del_item(lip);

                        /*
                         * Since the transaction no longer refers to the buffer,
                         * the buffer should no longer refer to the transaction.
                         */
                        bp->b_transp = NULL;
                }

                /*
                 * Stale inode buffers run the callbacks attached to the buffer
                 * and then detach them.  For all other buffers the item is
                 * removed from the AIL and released.
                 *
                 * If we get called here because of an IO error, we may
                 * or may not have the item on the AIL. xfs_trans_ail_delete()
                 * will take care of that situation.
                 * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
                        xfs_buf_do_callbacks(bp);
                        bp->b_fspriv = NULL;
                        bp->b_iodone = NULL;
                } else {
                        spin_lock(&ailp->xa_lock);
                        xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
                        xfs_buf_item_relse(bp);
                        ASSERT(bp->b_fspriv == NULL);
                }
                xfs_buf_relse(bp);
        } else if (freed && remove) {
                /*
                 * There are currently two references to the buffer - the active
                 * LRU reference and the buf log item. What we are about to do
                 * here - simulate a failed IO completion - requires 3
                 * references.
                 *
                 * The LRU reference is removed by the xfs_buf_stale() call. The
                 * buf item reference is removed by the xfs_buf_iodone()
                 * callback that is run by xfs_buf_do_callbacks() during ioend
                 * processing (via the bp->b_iodone callback), and then finally
                 * the ioend processing will drop the IO reference if the buffer
                 * is marked XBF_ASYNC.
                 *
                 * Hence we need to take an additional reference here so that IO
                 * completion processing doesn't free the buffer prematurely.
                 */
                xfs_buf_lock(bp);
                xfs_buf_hold(bp);
                bp->b_flags |= XBF_ASYNC;
                xfs_buf_ioerror(bp, EIO);
                XFS_BUF_UNDONE(bp);
                xfs_buf_stale(bp);
                xfs_buf_ioend(bp, 0);
        }
}
 485
 486 STATIC uint
 487 xfs_buf_item_push(
 488         struct xfs_log_item     *lip,
 489         struct list_head        *buffer_list)
 490 {
 491         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 492         struct xfs_buf          *bp = bip->bli_buf;
 493         uint                    rval = XFS_ITEM_SUCCESS;
 494
 495         if (xfs_buf_ispinned(bp))
 496                 return XFS_ITEM_PINNED;
 497         if (!xfs_buf_trylock(bp)) {
 498                 /*
 499                  * If we have just raced with a buffer being pinned and it has
 500                  * been marked stale, we could end up stalling until someone else
 501                  * issues a log force to unpin the stale buffer. Check for the
 502                  * race condition here so xfsaild recognizes the buffer is pinned
 503                  * and queues a log force to move it along.
 504                  */
 505                 if (xfs_buf_ispinned(bp))
 506                         return XFS_ITEM_PINNED;
 507                 return XFS_ITEM_LOCKED;
 508         }
 509
 510         ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 511
 512         trace_xfs_buf_item_push(bip);
 513
 514         if (!xfs_buf_delwri_queue(bp, buffer_list))
 515                 rval = XFS_ITEM_FLUSHING;
 516         xfs_buf_unlock(bp);
 517         return rval;
 518 }
 519
/*
 * Release the buffer associated with the buf log item.  If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count.  It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now.  This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_unlock(
        struct xfs_log_item     *lip)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        struct xfs_buf          *bp = bip->bli_buf;
        bool                    clean;
        bool                    aborted;
        int                     flags;

        /* Clear the buffer's association with this transaction. */
        bp->b_transp = NULL;

        /*
         * If this is a transaction abort, don't return early.  Instead, allow
         * the brelse to happen.  Normally it would be done for stale
         * (cancelled) buffers at unpin time, but we'll never go through the
         * pin/unpin cycle if we abort inside commit.
         */
        aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
        /*
         * Before possibly freeing the buf item, copy the per-transaction state
         * so we can reference it safely later after clearing it from the
         * buffer log item.
         */
        flags = bip->bli_flags;
        bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

        /*
         * If the buf item is marked stale and we are not aborting, only drop
         * the transaction's reference here.  We'll unlock the buffer and free
         * the buf item when the buffer is unpinned for the last time.
         */
        if (flags & XFS_BLI_STALE) {
                trace_xfs_buf_item_unlock_stale(bip);
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
                if (!aborted) {
                        atomic_dec(&bip->bli_refcount);
                        return;
                }
        }

        trace_xfs_buf_item_unlock(bip);

        /*
         * If the buf item isn't tracking any data, free it, otherwise drop the
         * reference we hold to it. If we are aborting the transaction, this may
         * be the only reference to the buf item, so we free it anyway
         * regardless of whether it is dirty or not. A dirty abort implies a
         * shutdown, anyway.
         *
         * Ordered buffers are dirty but may have no recorded changes, so ensure
         * we only release clean items here.
         */
        clean = (flags & XFS_BLI_DIRTY) ? false : true;
        if (clean) {
                int i;
                /* Any set bit in any segment's dirty bitmap means not clean. */
                for (i = 0; i < bip->bli_format_count; i++) {
                        if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
                                     bip->bli_formats[i].blf_map_size)) {
                                clean = false;
                                break;
                        }
                }
        }
        if (clean)
                xfs_buf_item_relse(bp);
        else if (aborted) {
                if (atomic_dec_and_test(&bip->bli_refcount)) {
                        ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
                        xfs_buf_item_relse(bp);
                }
        } else
                atomic_dec(&bip->bli_refcount);

        /* Uses the saved copy of the flags: bip may have been freed above. */
        if (!(flags & XFS_BLI_HOLD))
                xfs_buf_relse(bp);
}
 617
 618 /*
 619  * This is called to find out where the oldest active copy of the
 620  * buf log item in the on disk log resides now that the last log
 621  * write of it completed at the given lsn.
 622  * We always re-log all the dirty data in a buffer, so usually the
 623  * latest copy in the on disk log is the only one that matters.  For
 624  * those cases we simply return the given lsn.
 625  *
 626  * The one exception to this is for buffers full of newly allocated
 627  * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 628  * flag set, indicating that only the di_next_unlinked fields from the
 629  * inodes in the buffers will be replayed during recovery.  If the
 630  * original newly allocated inode images have not yet been flushed
 631  * when the buffer is so relogged, then we need to make sure that we
 632  * keep the old images in the 'active' portion of the log.  We do this
 633  * by returning the original lsn of that transaction here rather than
 634  * the current one.
 635  */
 636 STATIC xfs_lsn_t
 637 xfs_buf_item_committed(
 638         struct xfs_log_item     *lip,
 639         xfs_lsn_t               lsn)
 640 {
 641         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
 642
 643         trace_xfs_buf_item_committed(bip);
 644
 645         if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
 646                 return lip->li_lsn;
 647         return lsn;
 648 }
 649
/*
 * The iop_committing handler for buf log items.  It intentionally does
 * nothing; both arguments are unused.
 */
STATIC void
xfs_buf_item_committing(
        struct xfs_log_item     *lip,
        xfs_lsn_t               commit_lsn)
{
}
 656
/*
 * This is the ops vector shared by all buf log items.  It is handed to
 * xfs_log_item_init() when a buf log item is set up in xfs_buf_item_init().
 */
static const struct xfs_item_ops xfs_buf_item_ops = {
        .iop_size       = xfs_buf_item_size,
        .iop_format     = xfs_buf_item_format,
        .iop_pin        = xfs_buf_item_pin,
        .iop_unpin      = xfs_buf_item_unpin,
        .iop_unlock     = xfs_buf_item_unlock,
        .iop_committed  = xfs_buf_item_committed,
        .iop_push       = xfs_buf_item_push,
        .iop_committing = xfs_buf_item_committing
};
 670
 671 STATIC int
 672 xfs_buf_item_get_format(
 673         struct xfs_buf_log_item *bip,
 674         int                     count)
 675 {
 676         ASSERT(bip->bli_formats == NULL);
 677         bip->bli_format_count = count;
 678
 679         if (count == 1) {
 680                 bip->bli_formats = &bip->__bli_format;
 681                 return 0;
 682         }
 683
 684         bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
 685                                 KM_SLEEP);
 686         if (!bip->bli_formats)
 687                 return ENOMEM;
 688         return 0;
 689 }
 690
 691 STATIC void
 692 xfs_buf_item_free_format(
 693         struct xfs_buf_log_item *bip)
 694 {
 695         if (bip->bli_formats != &bip->__bli_format) {
 696                 kmem_free(bip->bli_formats);
 697                 bip->bli_formats = NULL;
 698         }
 699 }
 700
 701 /*
 702  * Allocate a new buf log item to go with the given buffer.
 703  * Set the buffer's b_fsprivate field to point to the new
 704  * buf log item.  If there are other item's attached to the
 705  * buffer (see xfs_buf_attach_iodone() below), then put the
 706  * buf log item at the front.
 707  */
 708 void
 709 xfs_buf_item_init(
 710         xfs_buf_t       *bp,
 711         xfs_mount_t     *mp)
 712 {
 713         xfs_log_item_t          *lip = bp->b_fspriv;
 714         xfs_buf_log_item_t      *bip;
 715         int                     chunks;
 716         int                     map_size;
 717         int                     error;
 718         int                     i;
 719
 720         /*
 721          * Check to see if there is already a buf log item for
 722          * this buffer.  If there is, it is guaranteed to be
 723          * the first.  If we do already have one, there is
 724          * nothing to do here so return.
 725          */
 726         ASSERT(bp->b_target->bt_mount == mp);
 727         if (lip != NULL && lip->li_type == XFS_LI_BUF)
 728                 return;
 729
 730         bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
 731         xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 732         bip->bli_buf = bp;
 733         xfs_buf_hold(bp);
 734
 735         /*
 736          * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
 737          * can be divided into. Make sure not to truncate any pieces.
 738          * map_size is the size of the bitmap needed to describe the
 739          * chunks of the buffer.
 740          *
 741          * Discontiguous buffer support follows the layout of the underlying
 742          * buffer. This makes the implementation as simple as possible.
 743          */
 744         error = xfs_buf_item_get_format(bip, bp->b_map_count);
 745         ASSERT(error == 0);
 746
 747         for (i = 0; i < bip->bli_format_count; i++) {
 748                 chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
 749                                       XFS_BLF_CHUNK);
 750                 map_size = DIV_ROUND_UP(chunks, NBWORD);
 751
 752                 bip->bli_formats[i].blf_type = XFS_LI_BUF;
 753                 bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
 754                 bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
 755                 bip->bli_formats[i].blf_map_size = map_size;
 756         }
 757
 758 #ifdef XFS_TRANS_DEBUG
 759         /*
 760          * Allocate the arrays for tracking what needs to be logged
 761          * and what our callers request to be logged.  bli_orig
 762          * holds a copy of the original, clean buffer for comparison
 763          * against, and bli_logged keeps a 1 bit flag per byte in
 764          * the buffer to indicate which bytes the callers have asked
 765          * to have logged.
 766          */
 767         bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
 768         memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
 769         bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
 770 #endif
 771
 772         /*
 773          * Put the buf item into the list of items attached to the
 774          * buffer at the front.
 775          */
 776         if (bp->b_fspriv)
 777                 bip->bli_item.li_bio_list = bp->b_fspriv;
 778         bp->b_fspriv = bip;
 779 }
 780
 781
 782 /*
 783  * Mark bytes first through last inclusive as dirty in the buf
 784  * item's bitmap.
 785  */
 786 void
 787 xfs_buf_item_log_segment(
 788         struct xfs_buf_log_item *bip,
 789         uint                    first,
 790         uint                    last,
 791         uint                    *map)
 792 {
 793         uint            first_bit;
 794         uint            last_bit;
 795         uint            bits_to_set;
 796         uint            bits_set;
 797         uint            word_num;
 798         uint            *wordp;
 799         uint            bit;
 800         uint            end_bit;
 801         uint            mask;
 802
 803         /*
 804          * Convert byte offsets to bit numbers.
 805          */
 806         first_bit = first >> XFS_BLF_SHIFT;
 807         last_bit = last >> XFS_BLF_SHIFT;
 808
 809         /*
 810          * Calculate the total number of bits to be set.
 811          */
 812         bits_to_set = last_bit - first_bit + 1;
 813
 814         /*
 815          * Get a pointer to the first word in the bitmap
 816          * to set a bit in.
 817          */
 818         word_num = first_bit >> BIT_TO_WORD_SHIFT;
 819         wordp = &map[word_num];
 820
 821         /*
 822          * Calculate the starting bit in the first word.
 823          */
 824         bit = first_bit & (uint)(NBWORD - 1);
 825
 826         /*
 827          * First set any bits in the first word of our range.
 828          * If it starts at bit 0 of the word, it will be
 829          * set below rather than here.  That is what the variable
 830          * bit tells us. The variable bits_set tracks the number
 831          * of bits that have been set so far.  End_bit is the number
 832          * of the last bit to be set in this word plus one.
 833          */
 834         if (bit) {
 835                 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
 836                 mask = ((1 << (end_bit - bit)) - 1) << bit;
 837                 *wordp |= mask;
 838                 wordp++;
 839                 bits_set = end_bit - bit;
 840         } else {
 841                 bits_set = 0;
 842         }
 843
 844         /*
 845          * Now set bits a whole word at a time that are between
 846          * first_bit and last_bit.
 847          */
 848         while ((bits_to_set - bits_set) >= NBWORD) {
 849                 *wordp |= 0xffffffff;
 850                 bits_set += NBWORD;
 851                 wordp++;
 852         }
 853
 854         /*
 855          * Finally, set any bits left to be set in one last partial word.
 856          */
 857         end_bit = bits_to_set - bits_set;
 858         if (end_bit) {
 859                 mask = (1 << end_bit) - 1;
 860                 *wordp |= mask;
 861         }
 862 }
 863
 864 /*
 865  * Mark bytes first through last inclusive as dirty in the buf
 866  * item's bitmap.
 867  */
 868 void
 869 xfs_buf_item_log(
 870         xfs_buf_log_item_t      *bip,
 871         uint                    first,
 872         uint                    last)
 873 {
 874         int                     i;
 875         uint                    start;
 876         uint                    end;
 877         struct xfs_buf          *bp = bip->bli_buf;
 878
 879         /*
 880          * walk each buffer segment and mark them dirty appropriately.
 881          */
 882         start = 0;
 883         for (i = 0; i < bip->bli_format_count; i++) {
 884                 if (start > last)
 885                         break;
 886                 end = start + BBTOB(bp->b_maps[i].bm_len);
 887                 if (first > end) {
 888                         start += BBTOB(bp->b_maps[i].bm_len);
 889                         continue;
 890                 }
 891                 if (first < start)
 892                         first = start;
 893                 if (end > last)
 894                         end = last;
 895
 896                 xfs_buf_item_log_segment(bip, first, end,
 897                                          &bip->bli_formats[i].blf_data_map[0]);
 898
 899                 start += bp->b_maps[i].bm_len;
 900         }
 901 }
 902
 903
 904 /*
 905  * Return 1 if the buffer has been logged or ordered in a transaction (at any
 906  * point, not just the current transaction) and 0 if not.
 907  */
 908 uint
 909 xfs_buf_item_dirty(
 910         xfs_buf_log_item_t      *bip)
 911 {
 912         return (bip->bli_flags & XFS_BLI_DIRTY);
 913 }
 914
 915 STATIC void
 916 xfs_buf_item_free(
 917         xfs_buf_log_item_t      *bip)
 918 {
 919 #ifdef XFS_TRANS_DEBUG
 920         kmem_free(bip->bli_orig);
 921         kmem_free(bip->bli_logged);
 922 #endif /* XFS_TRANS_DEBUG */
 923
 924         xfs_buf_item_free_format(bip);
 925         kmem_zone_free(xfs_buf_item_zone, bip);
 926 }
 927
 928 /*
 929  * This is called when the buf log item is no longer needed.  It should
 930  * free the buf log item associated with the given buffer and clear
 931  * the buffer's pointer to the buf log item.  If there are no more
 932  * items in the list, clear the b_iodone field of the buffer (see
 933  * xfs_buf_attach_iodone() below).
 934  */
 935 void
 936 xfs_buf_item_relse(
 937         xfs_buf_t       *bp)
 938 {
 939         xfs_buf_log_item_t      *bip = bp->b_fspriv;
 940
 941         trace_xfs_buf_item_relse(bp, _RET_IP_);
 942         ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
 943
 944         bp->b_fspriv = bip->bli_item.li_bio_list;
 945         if (bp->b_fspriv == NULL)
 946                 bp->b_iodone = NULL;
 947
 948         xfs_buf_rele(bp);
 949         xfs_buf_item_free(bip);
 950 }
 951
 952
 953 /*
 954  * Add the given log item with its callback to the list of callbacks
 955  * to be called when the buffer's I/O completes.  If it is not set
 956  * already, set the buffer's b_iodone() routine to be
 957  * xfs_buf_iodone_callbacks() and link the log item into the list of
 958  * items rooted at b_fsprivate.  Items are always added as the second
 959  * entry in the list if there is a first, because the buf item code
 960  * assumes that the buf log item is first.
 961  */
 962 void
 963 xfs_buf_attach_iodone(
 964         xfs_buf_t       *bp,
 965         void            (*cb)(xfs_buf_t *, xfs_log_item_t *),
 966         xfs_log_item_t  *lip)
 967 {
 968         xfs_log_item_t  *head_lip;
 969
 970         ASSERT(xfs_buf_islocked(bp));
 971
 972         lip->li_cb = cb;
 973         head_lip = bp->b_fspriv;
 974         if (head_lip) {
 975                 lip->li_bio_list = head_lip->li_bio_list;
 976                 head_lip->li_bio_list = lip;
 977         } else {
 978                 bp->b_fspriv = lip;
 979         }
 980
 981         ASSERT(bp->b_iodone == NULL ||
 982                bp->b_iodone == xfs_buf_iodone_callbacks);
 983         bp->b_iodone = xfs_buf_iodone_callbacks;
 984 }
 985
 986 /*
 987  * We can have many callbacks on a buffer. Running the callbacks individually
 988  * can cause a lot of contention on the AIL lock, so we allow for a single
 989  * callback to be able to scan the remaining lip->li_bio_list for other items
 990  * of the same type and callback to be processed in the first call.
 991  *
 992  * As a result, the loop walking the callback list below will also modify the
 993  * list. it removes the first item from the list and then runs the callback.
 994  * The loop then restarts from the new head of the list. This allows the
 995  * callback to scan and modify the list attached to the buffer and we don't
 996  * have to care about maintaining a next item pointer.
 997  */
 998 STATIC void
 999 xfs_buf_do_callbacks(
1000         struct xfs_buf          *bp)
1001 {
1002         struct xfs_log_item     *lip;
1003
1004         while ((lip = bp->b_fspriv) != NULL) {
1005                 bp->b_fspriv = lip->li_bio_list;
1006                 ASSERT(lip->li_cb != NULL);
1007                 /*
1008                  * Clear the next pointer so we don't have any
1009                  * confusion if the item is added to another buf.
1010                  * Don't touch the log item after calling its
1011                  * callback, because it could have freed itself.
1012                  */
1013                 lip->li_bio_list = NULL;
1014                 lip->li_cb(bp, lip);
1015         }
1016 }
1017
1018 /*
1019  * This is the iodone() function for buffers which have had callbacks
1020  * attached to them by xfs_buf_attach_iodone().  It should remove each
1021  * log item from the buffer's list and call the callback of each in turn.
1022  * When done, the buffer's fsprivate field is set to NULL and the buffer
1023  * is unlocked with a call to iodone().
1024  */
1025 void
1026 xfs_buf_iodone_callbacks(
1027         struct xfs_buf          *bp)
1028 {
1029         struct xfs_log_item     *lip = bp->b_fspriv;
1030         struct xfs_mount        *mp = lip->li_mountp;
1031         static ulong            lasttime;
1032         static xfs_buftarg_t    *lasttarg;
1033
1034         if (likely(!xfs_buf_geterror(bp)))
1035                 goto do_callbacks;
1036
1037         /*
1038          * If we've already decided to shutdown the filesystem because of
1039          * I/O errors, there's no point in giving this a retry.
1040          */
1041         if (XFS_FORCED_SHUTDOWN(mp)) {
1042                 xfs_buf_stale(bp);
1043                 XFS_BUF_DONE(bp);
1044                 trace_xfs_buf_item_iodone(bp, _RET_IP_);
1045                 goto do_callbacks;
1046         }
1047
1048         if (bp->b_target != lasttarg ||
1049             time_after(jiffies, (lasttime + 5*HZ))) {
1050                 lasttime = jiffies;
1051                 xfs_buf_ioerror_alert(bp, __func__);
1052         }
1053         lasttarg = bp->b_target;
1054
1055         /*
1056          * If the write was asynchronous then no one will be looking for the
1057          * error.  Clear the error state and write the buffer out again.
1058          *
1059          * XXX: This helps against transient write errors, but we need to find
1060          * a way to shut the filesystem down if the writes keep failing.
1061          *
1062          * In practice we'll shut the filesystem down soon as non-transient
1063          * erorrs tend to affect the whole device and a failing log write
1064          * will make us give up.  But we really ought to do better here.
1065          */
1066         if (XFS_BUF_ISASYNC(bp)) {
1067                 ASSERT(bp->b_iodone != NULL);
1068
1069                 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1070
1071                 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
1072
1073                 if (!XFS_BUF_ISSTALE(bp)) {
1074                         bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
1075                         xfs_buf_iorequest(bp);
1076                 } else {
1077                         xfs_buf_relse(bp);
1078                 }
1079
1080                 return;
1081         }
1082
1083         /*
1084          * If the write of the buffer was synchronous, we want to make
1085          * sure to return the error to the caller of xfs_bwrite().
1086          */
1087         xfs_buf_stale(bp);
1088         XFS_BUF_DONE(bp);
1089
1090         trace_xfs_buf_error_relse(bp, _RET_IP_);
1091
1092 do_callbacks:
1093         xfs_buf_do_callbacks(bp);
1094         bp->b_fspriv = NULL;
1095         bp->b_iodone = NULL;
1096         xfs_buf_ioend(bp, 0);
1097 }
1098
1099 /*
1100  * This is the iodone() function for buffers which have been
1101  * logged.  It is called when they are eventually flushed out.
1102  * It should remove the buf item from the AIL, and free the buf item.
1103  * It is called by xfs_buf_iodone_callbacks() above which will take
1104  * care of cleaning up the buffer itself.
1105  */
1106 void
1107 xfs_buf_iodone(
1108         struct xfs_buf          *bp,
1109         struct xfs_log_item     *lip)
1110 {
1111         struct xfs_ail          *ailp = lip->li_ailp;
1112
1113         ASSERT(BUF_ITEM(lip)->bli_buf == bp);
1114
1115         xfs_buf_rele(bp);
1116
1117         /*
1118          * If we are forcibly shutting down, this may well be
1119          * off the AIL already. That's because we simulate the
1120          * log-committed callbacks to unpin these buffers. Or we may never
1121          * have put this item on AIL because of the transaction was
1122          * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1123          *
1124          * Either way, AIL is useless if we're forcing a shutdown.
1125          */
1126         spin_lock(&ailp->xa_lock);
1127         xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
1128         xfs_buf_item_free(BUF_ITEM(lip));
1129 }