fs/xfs/xfs_mount.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_types.h"
  21 #include "xfs_bit.h"
  22 #include "xfs_log.h"
  23 #include "xfs_inum.h"
  24 #include "xfs_trans.h"
  25 #include "xfs_sb.h"
  26 #include "xfs_ag.h"
  27 #include "xfs_dir2.h"
  28 #include "xfs_mount.h"
  29 #include "xfs_bmap_btree.h"
  30 #include "xfs_alloc_btree.h"
  31 #include "xfs_ialloc_btree.h"
  32 #include "xfs_dinode.h"
  33 #include "xfs_inode.h"
  34 #include "xfs_btree.h"
  35 #include "xfs_ialloc.h"
  36 #include "xfs_alloc.h"
  37 #include "xfs_rtalloc.h"
  38 #include "xfs_bmap.h"
  39 #include "xfs_error.h"
  40 #include "xfs_rw.h"
  41 #include "xfs_quota.h"
  42 #include "xfs_fsops.h"
  43 #include "xfs_utils.h"
  44 #include "xfs_trace.h"
  45
  46
  47 #ifdef HAVE_PERCPU_SB
  48 STATIC void     xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
  49                                                 int);
  50 STATIC void     xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
  51                                                 int);
  52 STATIC void     xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
  53 #else
  54
  55 #define xfs_icsb_balance_counter(mp, a, b)              do { } while (0)
  56 #define xfs_icsb_balance_counter_locked(mp, a, b)       do { } while (0)
  57 #endif
  58
  59 static const struct {
  60         short offset;
  61         short type;     /* 0 = integer
  62                          * 1 = binary / string (no translation)
  63                          */
  64 } xfs_sb_info[] = {
  65     { offsetof(xfs_sb_t, sb_magicnum),   0 },
  66     { offsetof(xfs_sb_t, sb_blocksize),  0 },
  67     { offsetof(xfs_sb_t, sb_dblocks),    0 },
  68     { offsetof(xfs_sb_t, sb_rblocks),    0 },
  69     { offsetof(xfs_sb_t, sb_rextents),   0 },
  70     { offsetof(xfs_sb_t, sb_uuid),       1 },
  71     { offsetof(xfs_sb_t, sb_logstart),   0 },
  72     { offsetof(xfs_sb_t, sb_rootino),    0 },
  73     { offsetof(xfs_sb_t, sb_rbmino),     0 },
  74     { offsetof(xfs_sb_t, sb_rsumino),    0 },
  75     { offsetof(xfs_sb_t, sb_rextsize),   0 },
  76     { offsetof(xfs_sb_t, sb_agblocks),   0 },
  77     { offsetof(xfs_sb_t, sb_agcount),    0 },
  78     { offsetof(xfs_sb_t, sb_rbmblocks),  0 },
  79     { offsetof(xfs_sb_t, sb_logblocks),  0 },
  80     { offsetof(xfs_sb_t, sb_versionnum), 0 },
  81     { offsetof(xfs_sb_t, sb_sectsize),   0 },
  82     { offsetof(xfs_sb_t, sb_inodesize),  0 },
  83     { offsetof(xfs_sb_t, sb_inopblock),  0 },
  84     { offsetof(xfs_sb_t, sb_fname[0]),   1 },
  85     { offsetof(xfs_sb_t, sb_blocklog),   0 },
  86     { offsetof(xfs_sb_t, sb_sectlog),    0 },
  87     { offsetof(xfs_sb_t, sb_inodelog),   0 },
  88     { offsetof(xfs_sb_t, sb_inopblog),   0 },
  89     { offsetof(xfs_sb_t, sb_agblklog),   0 },
  90     { offsetof(xfs_sb_t, sb_rextslog),   0 },
  91     { offsetof(xfs_sb_t, sb_inprogress), 0 },
  92     { offsetof(xfs_sb_t, sb_imax_pct),   0 },
  93     { offsetof(xfs_sb_t, sb_icount),     0 },
  94     { offsetof(xfs_sb_t, sb_ifree),      0 },
  95     { offsetof(xfs_sb_t, sb_fdblocks),   0 },
  96     { offsetof(xfs_sb_t, sb_frextents),  0 },
  97     { offsetof(xfs_sb_t, sb_uquotino),   0 },
  98     { offsetof(xfs_sb_t, sb_gquotino),   0 },
  99     { offsetof(xfs_sb_t, sb_qflags),     0 },
 100     { offsetof(xfs_sb_t, sb_flags),      0 },
 101     { offsetof(xfs_sb_t, sb_shared_vn),  0 },
 102     { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
 103     { offsetof(xfs_sb_t, sb_unit),       0 },
 104     { offsetof(xfs_sb_t, sb_width),      0 },
 105     { offsetof(xfs_sb_t, sb_dirblklog),  0 },
 106     { offsetof(xfs_sb_t, sb_logsectlog), 0 },
 107     { offsetof(xfs_sb_t, sb_logsectsize),0 },
 108     { offsetof(xfs_sb_t, sb_logsunit),   0 },
 109     { offsetof(xfs_sb_t, sb_features2),  0 },
 110     { offsetof(xfs_sb_t, sb_bad_features2), 0 },
 111     { sizeof(xfs_sb_t),                  0 }
 112 };
 113
 114 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 115 static int xfs_uuid_table_size;
 116 static uuid_t *xfs_uuid_table;
 117
 118 /*
 119  * See if the UUID is unique among mounted XFS filesystems.
 120  * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
 121  */
 122 STATIC int
 123 xfs_uuid_mount(
 124         struct xfs_mount        *mp)
 125 {
 126         uuid_t                  *uuid = &mp->m_sb.sb_uuid;
 127         int                     hole, i;
 128
 129         if (mp->m_flags & XFS_MOUNT_NOUUID)
 130                 return 0;
 131
 132         if (uuid_is_nil(uuid)) {
 133                 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
 134                 return XFS_ERROR(EINVAL);
 135         }
 136
 137         mutex_lock(&xfs_uuid_table_mutex);
 138         for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
 139                 if (uuid_is_nil(&xfs_uuid_table[i])) {
 140                         hole = i;
 141                         continue;
 142                 }
 143                 if (uuid_equal(uuid, &xfs_uuid_table[i]))
 144                         goto out_duplicate;
 145         }
 146
 147         if (hole < 0) {
 148                 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
 149                         (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
 150                         xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
 151                         KM_SLEEP);
 152                 hole = xfs_uuid_table_size++;
 153         }
 154         xfs_uuid_table[hole] = *uuid;
 155         mutex_unlock(&xfs_uuid_table_mutex);
 156
 157         return 0;
 158
 159  out_duplicate:
 160         mutex_unlock(&xfs_uuid_table_mutex);
 161         xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
 162         return XFS_ERROR(EINVAL);
 163 }
 164
 165 STATIC void
 166 xfs_uuid_unmount(
 167         struct xfs_mount        *mp)
 168 {
 169         uuid_t                  *uuid = &mp->m_sb.sb_uuid;
 170         int                     i;
 171
 172         if (mp->m_flags & XFS_MOUNT_NOUUID)
 173                 return;
 174
 175         mutex_lock(&xfs_uuid_table_mutex);
 176         for (i = 0; i < xfs_uuid_table_size; i++) {
 177                 if (uuid_is_nil(&xfs_uuid_table[i]))
 178                         continue;
 179                 if (!uuid_equal(uuid, &xfs_uuid_table[i]))
 180                         continue;
 181                 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
 182                 break;
 183         }
 184         ASSERT(i < xfs_uuid_table_size);
 185         mutex_unlock(&xfs_uuid_table_mutex);
 186 }
 187
 188
 189 /*
 190  * Reference counting access wrappers to the perag structures.
 191  * Because we never free per-ag structures, the only thing we
 192  * have to protect against changes is the tree structure itself.
 193  */
 194 struct xfs_perag *
 195 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 196 {
 197         struct xfs_perag        *pag;
 198         int                     ref = 0;
 199
 200         rcu_read_lock();
 201         pag = radix_tree_lookup(&mp->m_perag_tree, agno);
 202         if (pag) {
 203                 ASSERT(atomic_read(&pag->pag_ref) >= 0);
 204                 ref = atomic_inc_return(&pag->pag_ref);
 205         }
 206         rcu_read_unlock();
 207         trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
 208         return pag;
 209 }
 210
 211 /*
 212  * search from @first to find the next perag with the given tag set.
 213  */
 214 struct xfs_perag *
 215 xfs_perag_get_tag(
 216         struct xfs_mount        *mp,
 217         xfs_agnumber_t          first,
 218         int                     tag)
 219 {
 220         struct xfs_perag        *pag;
 221         int                     found;
 222         int                     ref;
 223
 224         rcu_read_lock();
 225         found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
 226                                         (void **)&pag, first, 1, tag);
 227         if (found <= 0) {
 228                 rcu_read_unlock();
 229                 return NULL;
 230         }
 231         ref = atomic_inc_return(&pag->pag_ref);
 232         rcu_read_unlock();
 233         trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
 234         return pag;
 235 }
 236
 237 void
 238 xfs_perag_put(struct xfs_perag *pag)
 239 {
 240         int     ref;
 241
 242         ASSERT(atomic_read(&pag->pag_ref) > 0);
 243         ref = atomic_dec_return(&pag->pag_ref);
 244         trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
 245 }
 246
 247 STATIC void
 248 __xfs_free_perag(
 249         struct rcu_head *head)
 250 {
 251         struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
 252
 253         ASSERT(atomic_read(&pag->pag_ref) == 0);
 254         kmem_free(pag);
 255 }
 256
 257 /*
 258  * Free up the per-ag resources associated with the mount structure.
 259  */
 260 STATIC void
 261 xfs_free_perag(
 262         xfs_mount_t     *mp)
 263 {
 264         xfs_agnumber_t  agno;
 265         struct xfs_perag *pag;
 266
 267         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 268                 spin_lock(&mp->m_perag_lock);
 269                 pag = radix_tree_delete(&mp->m_perag_tree, agno);
 270                 spin_unlock(&mp->m_perag_lock);
 271                 ASSERT(pag);
 272                 ASSERT(atomic_read(&pag->pag_ref) == 0);
 273                 call_rcu(&pag->rcu_head, __xfs_free_perag);
 274         }
 275 }
 276
 277 /*
 278  * Check size of device based on the (data/realtime) block count.
 279  * Note: this check is used by the growfs code as well as mount.
 280  */
 281 int
 282 xfs_sb_validate_fsb_count(
 283         xfs_sb_t        *sbp,
 284         __uint64_t      nblocks)
 285 {
 286         ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
 287         ASSERT(sbp->sb_blocklog >= BBSHIFT);
 288
 289 #if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
 290         if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
 291                 return EFBIG;
 292 #else                  /* Limited by UINT_MAX of sectors */
 293         if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
 294                 return EFBIG;
 295 #endif
 296         return 0;
 297 }
 298
 299 /*
 300  * Check the validity of the SB found.
 301  */
 302 STATIC int
 303 xfs_mount_validate_sb(
 304         xfs_mount_t     *mp,
 305         xfs_sb_t        *sbp,
 306         int             flags)
 307 {
 308         int             loud = !(flags & XFS_MFSI_QUIET);
 309
 310         /*
 311          * If the log device and data device have the
 312          * same device number, the log is internal.
 313          * Consequently, the sb_logstart should be non-zero.  If
 314          * we have a zero sb_logstart in this case, we may be trying to mount
 315          * a volume filesystem in a non-volume manner.
 316          */
 317         if (sbp->sb_magicnum != XFS_SB_MAGIC) {
 318                 if (loud)
 319                         xfs_warn(mp, "bad magic number");
 320                 return XFS_ERROR(EWRONGFS);
 321         }
 322
 323         if (!xfs_sb_good_version(sbp)) {
 324                 if (loud)
 325                         xfs_warn(mp, "bad version");
 326                 return XFS_ERROR(EWRONGFS);
 327         }
 328
 329         if (unlikely(
 330             sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
 331                 if (loud)
 332                         xfs_warn(mp,
 333                 "filesystem is marked as having an external log; "
 334                 "specify logdev on the mount command line.");
 335                 return XFS_ERROR(EINVAL);
 336         }
 337
 338         if (unlikely(
 339             sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
 340                 if (loud)
 341                         xfs_warn(mp,
 342                 "filesystem is marked as having an internal log; "
 343                 "do not specify logdev on the mount command line.");
 344                 return XFS_ERROR(EINVAL);
 345         }
 346
 347         /*
 348          * More sanity checking. These were stolen directly from
 349          * xfs_repair.
 350          */
 351         if (unlikely(
 352             sbp->sb_agcount <= 0                                        ||
 353             sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
 354             sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
 355             sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
 356             sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
 357             sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
 358             sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
 359             sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
 360             sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
 361             sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
 362             sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
 363             sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
 364             sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
 365             sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
 366             sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
 367             sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
 368             (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
 369             (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
 370             (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
 371             (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
 372                 if (loud)
 373                         xfs_warn(mp, "SB sanity check 1 failed");
 374                 return XFS_ERROR(EFSCORRUPTED);
 375         }
 376
 377         /*
 378          * Sanity check AG count, size fields against data size field
 379          */
 380         if (unlikely(
 381             sbp->sb_dblocks == 0 ||
 382             sbp->sb_dblocks >
 383              (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
 384             sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
 385                               sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
 386                 if (loud)
 387                         xfs_warn(mp, "SB sanity check 2 failed");
 388                 return XFS_ERROR(EFSCORRUPTED);
 389         }
 390
 391         /*
 392          * Until this is fixed only page-sized or smaller data blocks work.
 393          */
 394         if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
 395                 if (loud) {
 396                         xfs_warn(mp,
 397                 "File system with blocksize %d bytes. "
 398                 "Only pagesize (%ld) or less will currently work.",
 399                                 sbp->sb_blocksize, PAGE_SIZE);
 400                 }
 401                 return XFS_ERROR(ENOSYS);
 402         }
 403
 404         /*
 405          * Currently only very few inode sizes are supported.
 406          */
 407         switch (sbp->sb_inodesize) {
 408         case 256:
 409         case 512:
 410         case 1024:
 411         case 2048:
 412                 break;
 413         default:
 414                 if (loud)
 415                         xfs_warn(mp, "inode size of %d bytes not supported",
 416                                 sbp->sb_inodesize);
 417                 return XFS_ERROR(ENOSYS);
 418         }
 419
 420         if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 421             xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 422                 if (loud)
 423                         xfs_warn(mp,
 424                 "file system too large to be mounted on this system.");
 425                 return XFS_ERROR(EFBIG);
 426         }
 427
 428         if (unlikely(sbp->sb_inprogress)) {
 429                 if (loud)
 430                         xfs_warn(mp, "file system busy");
 431                 return XFS_ERROR(EFSCORRUPTED);
 432         }
 433
 434         /*
 435          * Version 1 directory format has never worked on Linux.
 436          */
 437         if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
 438                 if (loud)
 439                         xfs_warn(mp,
 440                                 "file system using version 1 directory format");
 441                 return XFS_ERROR(ENOSYS);
 442         }
 443
 444         return 0;
 445 }
 446
 447 int
 448 xfs_initialize_perag(
 449         xfs_mount_t     *mp,
 450         xfs_agnumber_t  agcount,
 451         xfs_agnumber_t  *maxagi)
 452 {
 453         xfs_agnumber_t  index, max_metadata;
 454         xfs_agnumber_t  first_initialised = 0;
 455         xfs_perag_t     *pag;
 456         xfs_agino_t     agino;
 457         xfs_ino_t       ino;
 458         xfs_sb_t        *sbp = &mp->m_sb;
 459         int             error = -ENOMEM;
 460
 461         /*
 462          * Walk the current per-ag tree so we don't try to initialise AGs
 463          * that already exist (growfs case). Allocate and insert all the
 464          * AGs we don't find ready for initialisation.
 465          */
 466         for (index = 0; index < agcount; index++) {
 467                 pag = xfs_perag_get(mp, index);
 468                 if (pag) {
 469                         xfs_perag_put(pag);
 470                         continue;
 471                 }
 472                 if (!first_initialised)
 473                         first_initialised = index;
 474
 475                 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
 476                 if (!pag)
 477                         goto out_unwind;
 478                 pag->pag_agno = index;
 479                 pag->pag_mount = mp;
 480                 spin_lock_init(&pag->pag_ici_lock);
 481                 mutex_init(&pag->pag_ici_reclaim_lock);
 482                 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 483                 spin_lock_init(&pag->pag_buf_lock);
 484                 pag->pag_buf_tree = RB_ROOT;
 485
 486                 if (radix_tree_preload(GFP_NOFS))
 487                         goto out_unwind;
 488
 489                 spin_lock(&mp->m_perag_lock);
 490                 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
 491                         BUG();
 492                         spin_unlock(&mp->m_perag_lock);
 493                         radix_tree_preload_end();
 494                         error = -EEXIST;
 495                         goto out_unwind;
 496                 }
 497                 spin_unlock(&mp->m_perag_lock);
 498                 radix_tree_preload_end();
 499         }
 500
 501         /*
 502          * If we mount with the inode64 option, or no inode overflows
 503          * the legacy 32-bit address space clear the inode32 option.
 504          */
 505         agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 506         ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 507
 508         if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
 509                 mp->m_flags |= XFS_MOUNT_32BITINODES;
 510         else
 511                 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
 512
 513         if (mp->m_flags & XFS_MOUNT_32BITINODES) {
 514                 /*
 515                  * Calculate how much should be reserved for inodes to meet
 516                  * the max inode percentage.
 517                  */
 518                 if (mp->m_maxicount) {
 519                         __uint64_t      icount;
 520
 521                         icount = sbp->sb_dblocks * sbp->sb_imax_pct;
 522                         do_div(icount, 100);
 523                         icount += sbp->sb_agblocks - 1;
 524                         do_div(icount, sbp->sb_agblocks);
 525                         max_metadata = icount;
 526                 } else {
 527                         max_metadata = agcount;
 528                 }
 529
 530                 for (index = 0; index < agcount; index++) {
 531                         ino = XFS_AGINO_TO_INO(mp, index, agino);
 532                         if (ino > XFS_MAXINUMBER_32) {
 533                                 index++;
 534                                 break;
 535                         }
 536
 537                         pag = xfs_perag_get(mp, index);
 538                         pag->pagi_inodeok = 1;
 539                         if (index < max_metadata)
 540                                 pag->pagf_metadata = 1;
 541                         xfs_perag_put(pag);
 542                 }
 543         } else {
 544                 for (index = 0; index < agcount; index++) {
 545                         pag = xfs_perag_get(mp, index);
 546                         pag->pagi_inodeok = 1;
 547                         xfs_perag_put(pag);
 548                 }
 549         }
 550
 551         if (maxagi)
 552                 *maxagi = index;
 553         return 0;
 554
 555 out_unwind:
 556         kmem_free(pag);
 557         for (; index > first_initialised; index--) {
 558                 pag = radix_tree_delete(&mp->m_perag_tree, index);
 559                 kmem_free(pag);
 560         }
 561         return error;
 562 }
 563
 564 void
 565 xfs_sb_from_disk(
 566         xfs_sb_t        *to,
 567         xfs_dsb_t       *from)
 568 {
 569         to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
 570         to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
 571         to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
 572         to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
 573         to->sb_rextents = be64_to_cpu(from->sb_rextents);
 574         memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
 575         to->sb_logstart = be64_to_cpu(from->sb_logstart);
 576         to->sb_rootino = be64_to_cpu(from->sb_rootino);
 577         to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
 578         to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
 579         to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
 580         to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
 581         to->sb_agcount = be32_to_cpu(from->sb_agcount);
 582         to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
 583         to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
 584         to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
 585         to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
 586         to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
 587         to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
 588         memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
 589         to->sb_blocklog = from->sb_blocklog;
 590         to->sb_sectlog = from->sb_sectlog;
 591         to->sb_inodelog = from->sb_inodelog;
 592         to->sb_inopblog = from->sb_inopblog;
 593         to->sb_agblklog = from->sb_agblklog;
 594         to->sb_rextslog = from->sb_rextslog;
 595         to->sb_inprogress = from->sb_inprogress;
 596         to->sb_imax_pct = from->sb_imax_pct;
 597         to->sb_icount = be64_to_cpu(from->sb_icount);
 598         to->sb_ifree = be64_to_cpu(from->sb_ifree);
 599         to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
 600         to->sb_frextents = be64_to_cpu(from->sb_frextents);
 601         to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
 602         to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
 603         to->sb_qflags = be16_to_cpu(from->sb_qflags);
 604         to->sb_flags = from->sb_flags;
 605         to->sb_shared_vn = from->sb_shared_vn;
 606         to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
 607         to->sb_unit = be32_to_cpu(from->sb_unit);
 608         to->sb_width = be32_to_cpu(from->sb_width);
 609         to->sb_dirblklog = from->sb_dirblklog;
 610         to->sb_logsectlog = from->sb_logsectlog;
 611         to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
 612         to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
 613         to->sb_features2 = be32_to_cpu(from->sb_features2);
 614         to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
 615 }
 616
 617 /*
 618  * Copy in core superblock to ondisk one.
 619  *
 620  * The fields argument is mask of superblock fields to copy.
 621  */
 622 void
 623 xfs_sb_to_disk(
 624         xfs_dsb_t       *to,
 625         xfs_sb_t        *from,
 626         __int64_t       fields)
 627 {
 628         xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
 629         xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
 630         xfs_sb_field_t  f;
 631         int             first;
 632         int             size;
 633
 634         ASSERT(fields);
 635         if (!fields)
 636                 return;
 637
 638         while (fields) {
 639                 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 640                 first = xfs_sb_info[f].offset;
 641                 size = xfs_sb_info[f + 1].offset - first;
 642
 643                 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
 644
 645                 if (size == 1 || xfs_sb_info[f].type == 1) {
 646                         memcpy(to_ptr + first, from_ptr + first, size);
 647                 } else {
 648                         switch (size) {
 649                         case 2:
 650                                 *(__be16 *)(to_ptr + first) =
 651                                         cpu_to_be16(*(__u16 *)(from_ptr + first));
 652                                 break;
 653                         case 4:
 654                                 *(__be32 *)(to_ptr + first) =
 655                                         cpu_to_be32(*(__u32 *)(from_ptr + first));
 656                                 break;
 657                         case 8:
 658                                 *(__be64 *)(to_ptr + first) =
 659                                         cpu_to_be64(*(__u64 *)(from_ptr + first));
 660                                 break;
 661                         default:
 662                                 ASSERT(0);
 663                         }
 664                 }
 665
 666                 fields &= ~(1LL << f);
 667         }
 668 }
 669
 670 /*
 671  * xfs_readsb
 672  *
 673  * Does the initial read of the superblock.
 674  */
 675 int
 676 xfs_readsb(xfs_mount_t *mp, int flags)
 677 {
 678         unsigned int    sector_size;
 679         xfs_buf_t       *bp;
 680         int             error;
 681         int             loud = !(flags & XFS_MFSI_QUIET);
 682
 683         ASSERT(mp->m_sb_bp == NULL);
 684         ASSERT(mp->m_ddev_targp != NULL);
 685
 686         /*
 687          * Allocate a (locked) buffer to hold the superblock.
 688          * This will be kept around at all times to optimize
 689          * access to the superblock.
 690          */
 691         sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
 692
 693 reread:
 694         bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
 695                                         XFS_SB_DADDR, sector_size, 0);
 696         if (!bp) {
 697                 if (loud)
 698                         xfs_warn(mp, "SB buffer read failed");
 699                 return EIO;
 700         }
 701
 702         /*
 703          * Initialize the mount structure from the superblock.
 704          * But first do some basic consistency checking.
 705          */
 706         xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
 707         error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
 708         if (error) {
 709                 if (loud)
 710                         xfs_warn(mp, "SB validate failed");
 711                 goto release_buf;
 712         }
 713
 714         /*
 715          * We must be able to do sector-sized and sector-aligned IO.
 716          */
 717         if (sector_size > mp->m_sb.sb_sectsize) {
 718                 if (loud)
 719                         xfs_warn(mp, "device supports %u byte sectors (not %u)",
 720                                 sector_size, mp->m_sb.sb_sectsize);
 721                 error = ENOSYS;
 722                 goto release_buf;
 723         }
 724
 725         /*
 726          * If device sector size is smaller than the superblock size,
 727          * re-read the superblock so the buffer is correctly sized.
 728          */
 729         if (sector_size < mp->m_sb.sb_sectsize) {
 730                 xfs_buf_relse(bp);
 731                 sector_size = mp->m_sb.sb_sectsize;
 732                 goto reread;
 733         }
 734
 735         /* Initialize per-cpu counters */
 736         xfs_icsb_reinit_counters(mp);
 737
 738         mp->m_sb_bp = bp;
 739         xfs_buf_unlock(bp);
 740         return 0;
 741
 742 release_buf:
 743         xfs_buf_relse(bp);
 744         return error;
 745 }
 746
 747
 748 /*
 749  * xfs_mount_common
 750  *
 751  * Mount initialization code establishing various mount
 752  * fields from the superblock associated with the given
 753  * mount structure
 754  */
 755 STATIC void
 756 xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 757 {
 758         mp->m_agfrotor = mp->m_agirotor = 0;
 759         spin_lock_init(&mp->m_agirotor_lock);
 760         mp->m_maxagi = mp->m_sb.sb_agcount;
 761         mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
 762         mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
 763         mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 764         mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
 765         mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
 766         mp->m_blockmask = sbp->sb_blocksize - 1;
 767         mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
 768         mp->m_blockwmask = mp->m_blockwsize - 1;
 769
 770         mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 771         mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
 772         mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
 773         mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
 774
 775         mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
 776         mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
 777         mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
 778         mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
 779
 780         mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
 781         mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
 782         mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
 783         mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 784
 785         mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
 786         mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
 787                                         sbp->sb_inopblock);
 788         mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
 789 }
 790
 791 /*
 792  * xfs_initialize_perag_data
 793  *
 794  * Read in each per-ag structure so we can count up the number of
 795  * allocated inodes, free inodes and used filesystem blocks as this
 796  * information is no longer persistent in the superblock. Once we have
 797  * this information, write it into the in-core superblock structure.
 798  */
 799 STATIC int
 800 xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 801 {
 802         xfs_agnumber_t  index;
 803         xfs_perag_t     *pag;
 804         xfs_sb_t        *sbp = &mp->m_sb;
 805         uint64_t        ifree = 0;
 806         uint64_t        ialloc = 0;
 807         uint64_t        bfree = 0;
 808         uint64_t        bfreelst = 0;
 809         uint64_t        btree = 0;
 810         int             error;
 811
 812         for (index = 0; index < agcount; index++) {
 813                 /*
 814                  * read the agf, then the agi. This gets us
 815                  * all the information we need and populates the
 816                  * per-ag structures for us.
 817                  */
 818                 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
 819                 if (error)
 820                         return error;
 821
 822                 error = xfs_ialloc_pagi_init(mp, NULL, index);
 823                 if (error)
 824                         return error;
 825                 pag = xfs_perag_get(mp, index);
 826                 ifree += pag->pagi_freecount;
 827                 ialloc += pag->pagi_count;
 828                 bfree += pag->pagf_freeblks;
 829                 bfreelst += pag->pagf_flcount;
 830                 btree += pag->pagf_btreeblks;
 831                 xfs_perag_put(pag);
 832         }
 833         /*
 834          * Overwrite incore superblock counters with just-read data
 835          */
 836         spin_lock(&mp->m_sb_lock);
 837         sbp->sb_ifree = ifree;
 838         sbp->sb_icount = ialloc;
 839         sbp->sb_fdblocks = bfree + bfreelst + btree;
 840         spin_unlock(&mp->m_sb_lock);
 841
 842         /* Fixup the per-cpu counters as well. */
 843         xfs_icsb_reinit_counters(mp);
 844
 845         return 0;
 846 }
 847
 848 /*
 849  * Update alignment values based on mount options and sb values
 850  */
 851 STATIC int
 852 xfs_update_alignment(xfs_mount_t *mp)
 853 {
 854         xfs_sb_t        *sbp = &(mp->m_sb);
 855
 856         if (mp->m_dalign) {
 857                 /*
 858                  * If stripe unit and stripe width are not multiples
 859                  * of the fs blocksize turn off alignment.
 860                  */
 861                 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
 862                     (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
 863                         if (mp->m_flags & XFS_MOUNT_RETERR) {
 864                                 xfs_warn(mp, "alignment check 1 failed");
 865                                 return XFS_ERROR(EINVAL);
 866                         }
 867                         mp->m_dalign = mp->m_swidth = 0;
 868                 } else {
 869                         /*
 870                          * Convert the stripe unit and width to FSBs.
 871                          */
 872                         mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
 873                         if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
 874                                 if (mp->m_flags & XFS_MOUNT_RETERR) {
 875                                         return XFS_ERROR(EINVAL);
 876                                 }
 877                                 xfs_warn(mp,
 878                 "stripe alignment turned off: sunit(%d)/swidth(%d) "
 879                 "incompatible with agsize(%d)",
 880                                         mp->m_dalign, mp->m_swidth,
 881                                         sbp->sb_agblocks);
 882
 883                                 mp->m_dalign = 0;
 884                                 mp->m_swidth = 0;
 885                         } else if (mp->m_dalign) {
 886                                 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
 887                         } else {
 888                                 if (mp->m_flags & XFS_MOUNT_RETERR) {
 889                                         xfs_warn(mp,
 890                 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
 891                                                 mp->m_dalign,
 892                                                 mp->m_blockmask +1);
 893                                         return XFS_ERROR(EINVAL);
 894                                 }
 895                                 mp->m_swidth = 0;
 896                         }
 897                 }
 898
 899                 /*
 900                  * Update superblock with new values
 901                  * and log changes
 902                  */
 903                 if (xfs_sb_version_hasdalign(sbp)) {
 904                         if (sbp->sb_unit != mp->m_dalign) {
 905                                 sbp->sb_unit = mp->m_dalign;
 906                                 mp->m_update_flags |= XFS_SB_UNIT;
 907                         }
 908                         if (sbp->sb_width != mp->m_swidth) {
 909                                 sbp->sb_width = mp->m_swidth;
 910                                 mp->m_update_flags |= XFS_SB_WIDTH;
 911                         }
 912                 }
 913         } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
 914                     xfs_sb_version_hasdalign(&mp->m_sb)) {
 915                         mp->m_dalign = sbp->sb_unit;
 916                         mp->m_swidth = sbp->sb_width;
 917         }
 918
 919         return 0;
 920 }
 921
 922 /*
 923  * Set the maximum inode count for this filesystem
 924  */
 925 STATIC void
 926 xfs_set_maxicount(xfs_mount_t *mp)
 927 {
 928         xfs_sb_t        *sbp = &(mp->m_sb);
 929         __uint64_t      icount;
 930
 931         if (sbp->sb_imax_pct) {
 932                 /*
 933                  * Make sure the maximum inode count is a multiple
 934                  * of the units we allocate inodes in.
 935                  */
 936                 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
 937                 do_div(icount, 100);
 938                 do_div(icount, mp->m_ialloc_blks);
 939                 mp->m_maxicount = (icount * mp->m_ialloc_blks)  <<
 940                                    sbp->sb_inopblog;
 941         } else {
 942                 mp->m_maxicount = 0;
 943         }
 944 }
 945
 946 /*
 947  * Set the default minimum read and write sizes unless
 948  * already specified in a mount option.
 949  * We use smaller I/O sizes when the file system
 950  * is being used for NFS service (wsync mount option).
 951  */
 952 STATIC void
 953 xfs_set_rw_sizes(xfs_mount_t *mp)
 954 {
 955         xfs_sb_t        *sbp = &(mp->m_sb);
 956         int             readio_log, writeio_log;
 957
 958         if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
 959                 if (mp->m_flags & XFS_MOUNT_WSYNC) {
 960                         readio_log = XFS_WSYNC_READIO_LOG;
 961                         writeio_log = XFS_WSYNC_WRITEIO_LOG;
 962                 } else {
 963                         readio_log = XFS_READIO_LOG_LARGE;
 964                         writeio_log = XFS_WRITEIO_LOG_LARGE;
 965                 }
 966         } else {
 967                 readio_log = mp->m_readio_log;
 968                 writeio_log = mp->m_writeio_log;
 969         }
 970
 971         if (sbp->sb_blocklog > readio_log) {
 972                 mp->m_readio_log = sbp->sb_blocklog;
 973         } else {
 974                 mp->m_readio_log = readio_log;
 975         }
 976         mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
 977         if (sbp->sb_blocklog > writeio_log) {
 978                 mp->m_writeio_log = sbp->sb_blocklog;
 979         } else {
 980                 mp->m_writeio_log = writeio_log;
 981         }
 982         mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
 983 }
 984
 985 /*
 986  * precalculate the low space thresholds for dynamic speculative preallocation.
 987  */
 988 void
 989 xfs_set_low_space_thresholds(
 990         struct xfs_mount        *mp)
 991 {
 992         int i;
 993
 994         for (i = 0; i < XFS_LOWSP_MAX; i++) {
 995                 __uint64_t space = mp->m_sb.sb_dblocks;
 996
 997                 do_div(space, 100);
 998                 mp->m_low_space[i] = space * (i + 1);
 999         }
1000 }
1001
1002
1003 /*
1004  * Set whether we're using inode alignment.
1005  */
1006 STATIC void
1007 xfs_set_inoalignment(xfs_mount_t *mp)
1008 {
1009         if (xfs_sb_version_hasalign(&mp->m_sb) &&
1010             mp->m_sb.sb_inoalignmt >=
1011             XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
1012                 mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
1013         else
1014                 mp->m_inoalign_mask = 0;
1015         /*
1016          * If we are using stripe alignment, check whether
1017          * the stripe unit is a multiple of the inode alignment
1018          */
1019         if (mp->m_dalign && mp->m_inoalign_mask &&
1020             !(mp->m_dalign & mp->m_inoalign_mask))
1021                 mp->m_sinoalign = mp->m_dalign;
1022         else
1023                 mp->m_sinoalign = 0;
1024 }
1025
1026 /*
1027  * Check that the data (and log if separate) are an ok size.
1028  */
1029 STATIC int
1030 xfs_check_sizes(xfs_mount_t *mp)
1031 {
1032         xfs_buf_t       *bp;
1033         xfs_daddr_t     d;
1034
1035         d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1036         if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1037                 xfs_warn(mp, "filesystem size mismatch detected");
1038                 return XFS_ERROR(EFBIG);
1039         }
1040         bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1041                                         d - XFS_FSS_TO_BB(mp, 1),
1042                                         BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1043         if (!bp) {
1044                 xfs_warn(mp, "last sector read failed");
1045                 return EIO;
1046         }
1047         xfs_buf_relse(bp);
1048
1049         if (mp->m_logdev_targp != mp->m_ddev_targp) {
1050                 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1051                 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1052                         xfs_warn(mp, "log size mismatch detected");
1053                         return XFS_ERROR(EFBIG);
1054                 }
1055                 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1056                                         d - XFS_FSB_TO_BB(mp, 1),
1057                                         XFS_FSB_TO_B(mp, 1), 0);
1058                 if (!bp) {
1059                         xfs_warn(mp, "log device read failed");
1060                         return EIO;
1061                 }
1062                 xfs_buf_relse(bp);
1063         }
1064         return 0;
1065 }
1066
1067 /*
1068  * Clear the quotaflags in memory and in the superblock.
1069  */
1070 int
1071 xfs_mount_reset_sbqflags(
1072         struct xfs_mount        *mp)
1073 {
1074         int                     error;
1075         struct xfs_trans        *tp;
1076
1077         mp->m_qflags = 0;
1078
1079         /*
1080          * It is OK to look at sb_qflags here in mount path,
1081          * without m_sb_lock.
1082          */
1083         if (mp->m_sb.sb_qflags == 0)
1084                 return 0;
1085         spin_lock(&mp->m_sb_lock);
1086         mp->m_sb.sb_qflags = 0;
1087         spin_unlock(&mp->m_sb_lock);
1088
1089         /*
1090          * If the fs is readonly, let the incore superblock run
1091          * with quotas off but don't flush the update out to disk
1092          */
1093         if (mp->m_flags & XFS_MOUNT_RDONLY)
1094                 return 0;
1095
1096 #ifdef QUOTADEBUG
1097         xfs_notice(mp, "Writing superblock quota changes");
1098 #endif
1099
1100         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1101         error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1102                                       XFS_DEFAULT_LOG_COUNT);
1103         if (error) {
1104                 xfs_trans_cancel(tp, 0);
1105                 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1106                 return error;
1107         }
1108
1109         xfs_mod_sb(tp, XFS_SB_QFLAGS);
1110         return xfs_trans_commit(tp, 0);
1111 }
1112
1113 __uint64_t
1114 xfs_default_resblks(xfs_mount_t *mp)
1115 {
1116         __uint64_t resblks;
1117
1118         /*
1119          * We default to 5% or 8192 fsbs of space reserved, whichever is
1120          * smaller.  This is intended to cover concurrent allocation
1121          * transactions when we initially hit enospc. These each require a 4
1122          * block reservation. Hence by default we cover roughly 2000 concurrent
1123          * allocation reservations.
1124          */
1125         resblks = mp->m_sb.sb_dblocks;
1126         do_div(resblks, 20);
1127         resblks = min_t(__uint64_t, resblks, 8192);
1128         return resblks;
1129 }
1130
1131 /*
1132  * This function does the following on an initial mount of a file system:
1133  *      - reads the superblock from disk and init the mount struct
1134  *      - if we're a 32-bit kernel, do a size check on the superblock
1135  *              so we don't mount terabyte filesystems
1136  *      - init mount struct realtime fields
1137  *      - allocate inode hash table for fs
1138  *      - init directory manager
1139  *      - perform recovery and init the log manager
1140  */
1141 int
1142 xfs_mountfs(
1143         xfs_mount_t     *mp)
1144 {
1145         xfs_sb_t        *sbp = &(mp->m_sb);
1146         xfs_inode_t     *rip;
1147         __uint64_t      resblks;
1148         uint            quotamount = 0;
1149         uint            quotaflags = 0;
1150         int             error = 0;
1151
1152         xfs_mount_common(mp, sbp);
1153
1154         /*
1155          * Check for a mismatched features2 values.  Older kernels
1156          * read & wrote into the wrong sb offset for sb_features2
1157          * on some platforms due to xfs_sb_t not being 64bit size aligned
1158          * when sb_features2 was added, which made older superblock
1159          * reading/writing routines swap it as a 64-bit value.
1160          *
1161          * For backwards compatibility, we make both slots equal.
1162          *
1163          * If we detect a mismatched field, we OR the set bits into the
1164          * existing features2 field in case it has already been modified; we
1165          * don't want to lose any features.  We then update the bad location
1166          * with the ORed value so that older kernels will see any features2
1167          * flags, and mark the two fields as needing updates once the
1168          * transaction subsystem is online.
1169          */
1170         if (xfs_sb_has_mismatched_features2(sbp)) {
1171                 xfs_warn(mp, "correcting sb_features alignment problem");
1172                 sbp->sb_features2 |= sbp->sb_bad_features2;
1173                 sbp->sb_bad_features2 = sbp->sb_features2;
1174                 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
1175
1176                 /*
1177                  * Re-check for ATTR2 in case it was found in bad_features2
1178                  * slot.
1179                  */
1180                 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1181                    !(mp->m_flags & XFS_MOUNT_NOATTR2))
1182                         mp->m_flags |= XFS_MOUNT_ATTR2;
1183         }
1184
1185         if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1186            (mp->m_flags & XFS_MOUNT_NOATTR2)) {
1187                 xfs_sb_version_removeattr2(&mp->m_sb);
1188                 mp->m_update_flags |= XFS_SB_FEATURES2;
1189
1190                 /* update sb_versionnum for the clearing of the morebits */
1191                 if (!sbp->sb_features2)
1192                         mp->m_update_flags |= XFS_SB_VERSIONNUM;
1193         }
1194
1195         /*
1196          * Check if sb_agblocks is aligned at stripe boundary
1197          * If sb_agblocks is NOT aligned turn off m_dalign since
1198          * allocator alignment is within an ag, therefore ag has
1199          * to be aligned at stripe boundary.
1200          */
1201         error = xfs_update_alignment(mp);
1202         if (error)
1203                 goto out;
1204
1205         xfs_alloc_compute_maxlevels(mp);
1206         xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
1207         xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
1208         xfs_ialloc_compute_maxlevels(mp);
1209
1210         xfs_set_maxicount(mp);
1211
1212         mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
1213
1214         error = xfs_uuid_mount(mp);
1215         if (error)
1216                 goto out;
1217
1218         /*
1219          * Set the minimum read and write sizes
1220          */
1221         xfs_set_rw_sizes(mp);
1222
1223         /* set the low space thresholds for dynamic preallocation */
1224         xfs_set_low_space_thresholds(mp);
1225
1226         /*
1227          * Set the inode cluster size.
1228          * This may still be overridden by the file system
1229          * block size if it is larger than the chosen cluster size.
1230          */
1231         mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
1232
1233         /*
1234          * Set inode alignment fields
1235          */
1236         xfs_set_inoalignment(mp);
1237
1238         /*
1239          * Check that the data (and log if separate) are an ok size.
1240          */
1241         error = xfs_check_sizes(mp);
1242         if (error)
1243                 goto out_remove_uuid;
1244
1245         /*
1246          * Initialize realtime fields in the mount structure
1247          */
1248         error = xfs_rtmount_init(mp);
1249         if (error) {
1250                 xfs_warn(mp, "RT mount failed");
1251                 goto out_remove_uuid;
1252         }
1253
1254         /*
1255          *  Copies the low order bits of the timestamp and the randomly
1256          *  set "sequence" number out of a UUID.
1257          */
1258         uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
1259
1260         mp->m_dmevmask = 0;     /* not persistent; set after each mount */
1261
1262         xfs_dir_mount(mp);
1263
1264         /*
1265          * Initialize the attribute manager's entries.
1266          */
1267         mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
1268
1269         /*
1270          * Initialize the precomputed transaction reservations values.
1271          */
1272         xfs_trans_init(mp);
1273
1274         /*
1275          * Allocate and initialize the per-ag data.
1276          */
1277         spin_lock_init(&mp->m_perag_lock);
1278         INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1279         error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1280         if (error) {
1281                 xfs_warn(mp, "Failed per-ag init: %d", error);
1282                 goto out_remove_uuid;
1283         }
1284
1285         if (!sbp->sb_logblocks) {
1286                 xfs_warn(mp, "no log defined");
1287                 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1288                 error = XFS_ERROR(EFSCORRUPTED);
1289                 goto out_free_perag;
1290         }
1291
1292         /*
1293          * log's mount-time initialization. Perform 1st part recovery if needed
1294          */
1295         error = xfs_log_mount(mp, mp->m_logdev_targp,
1296                               XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1297                               XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1298         if (error) {
1299                 xfs_warn(mp, "log mount failed");
1300                 goto out_free_perag;
1301         }
1302
1303         /*
1304          * Now the log is mounted, we know if it was an unclean shutdown or
1305          * not. If it was, with the first phase of recovery has completed, we
1306          * have consistent AG blocks on disk. We have not recovered EFIs yet,
1307          * but they are recovered transactionally in the second recovery phase
1308          * later.
1309          *
1310          * Hence we can safely re-initialise incore superblock counters from
1311          * the per-ag data. These may not be correct if the filesystem was not
1312          * cleanly unmounted, so we need to wait for recovery to finish before
1313          * doing this.
1314          *
1315          * If the filesystem was cleanly unmounted, then we can trust the
1316          * values in the superblock to be correct and we don't need to do
1317          * anything here.
1318          *
1319          * If we are currently making the filesystem, the initialisation will
1320          * fail as the perag data is in an undefined state.
1321          */
1322         if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
1323             !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
1324              !mp->m_sb.sb_inprogress) {
1325                 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1326                 if (error)
1327                         goto out_free_perag;
1328         }
1329
1330         /*
1331          * Get and sanity-check the root inode.
1332          * Save the pointer to it in the mount structure.
1333          */
1334         error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1335         if (error) {
1336                 xfs_warn(mp, "failed to read root inode");
1337                 goto out_log_dealloc;
1338         }
1339
1340         ASSERT(rip != NULL);
1341
1342         if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1343                 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1344                         (unsigned long long)rip->i_ino);
1345                 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1346                 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
1347                                  mp);
1348                 error = XFS_ERROR(EFSCORRUPTED);
1349                 goto out_rele_rip;
1350         }
1351         mp->m_rootip = rip;     /* save it */
1352
1353         xfs_iunlock(rip, XFS_ILOCK_EXCL);
1354
1355         /*
1356          * Initialize realtime inode pointers in the mount structure
1357          */
1358         error = xfs_rtmount_inodes(mp);
1359         if (error) {
1360                 /*
1361                  * Free up the root inode.
1362                  */
1363                 xfs_warn(mp, "failed to read RT inodes");
1364                 goto out_rele_rip;
1365         }
1366
1367         /*
1368          * If this is a read-only mount defer the superblock updates until
1369          * the next remount into writeable mode.  Otherwise we would never
1370          * perform the update e.g. for the root filesystem.
1371          */
1372         if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1373                 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1374                 if (error) {
1375                         xfs_warn(mp, "failed to write sb changes");
1376                         goto out_rtunmount;
1377                 }
1378         }
1379
1380         /*
1381          * Initialise the XFS quota management subsystem for this mount
1382          */
1383         if (XFS_IS_QUOTA_RUNNING(mp)) {
1384                 error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
1385                 if (error)
1386                         goto out_rtunmount;
1387         } else {
1388                 ASSERT(!XFS_IS_QUOTA_ON(mp));
1389
1390                 /*
1391                  * If a file system had quotas running earlier, but decided to
1392                  * mount without -o uquota/pquota/gquota options, revoke the
1393                  * quotachecked license.
1394                  */
1395                 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1396                         xfs_notice(mp, "resetting quota flags");
1397                         error = xfs_mount_reset_sbqflags(mp);
1398                         if (error)
1399                                 return error;
1400                 }
1401         }
1402
1403         /*
1404          * Finish recovering the file system.  This part needed to be
1405          * delayed until after the root and real-time bitmap inodes
1406          * were consistently read in.
1407          */
1408         error = xfs_log_mount_finish(mp);
1409         if (error) {
1410                 xfs_warn(mp, "log mount finish failed");
1411                 goto out_rtunmount;
1412         }
1413
1414         /*
1415          * Complete the quota initialisation, post-log-replay component.
1416          */
1417         if (quotamount) {
1418                 ASSERT(mp->m_qflags == 0);
1419                 mp->m_qflags = quotaflags;
1420
1421                 xfs_qm_mount_quotas(mp);
1422         }
1423
1424         /*
1425          * Now we are mounted, reserve a small amount of unused space for
1426          * privileged transactions. This is needed so that transaction
1427          * space required for critical operations can dip into this pool
1428          * when at ENOSPC. This is needed for operations like create with
1429          * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1430          * are not allowed to use this reserved space.
1431          *
1432          * This may drive us straight to ENOSPC on mount, but that implies
1433          * we were already there on the last unmount. Warn if this occurs.
1434          */
1435         if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1436                 resblks = xfs_default_resblks(mp);
1437                 error = xfs_reserve_blocks(mp, &resblks, NULL);
1438                 if (error)
1439                         xfs_warn(mp,
1440         "Unable to allocate reserve blocks. Continuing without reserve pool.");
1441         }
1442
1443         return 0;
1444
1445  out_rtunmount:
1446         xfs_rtunmount_inodes(mp);
1447  out_rele_rip:
1448         IRELE(rip);
1449  out_log_dealloc:
1450         xfs_log_unmount(mp);
1451  out_free_perag:
1452         xfs_free_perag(mp);
1453  out_remove_uuid:
1454         xfs_uuid_unmount(mp);
1455  out:
1456         return error;
1457 }
1458
1459 /*
1460  * This flushes out the inodes,dquots and the superblock, unmounts the
1461  * log and makes sure that incore structures are freed.
1462  */
1463 void
1464 xfs_unmountfs(
1465         struct xfs_mount        *mp)
1466 {
1467         __uint64_t              resblks;
1468         int                     error;
1469
1470         xfs_qm_unmount_quotas(mp);
1471         xfs_rtunmount_inodes(mp);
1472         IRELE(mp->m_rootip);
1473
1474         /*
1475          * We can potentially deadlock here if we have an inode cluster
1476          * that has been freed has its buffer still pinned in memory because
1477          * the transaction is still sitting in a iclog. The stale inodes
1478          * on that buffer will have their flush locks held until the
1479          * transaction hits the disk and the callbacks run. the inode
1480          * flush takes the flush lock unconditionally and with nothing to
1481          * push out the iclog we will never get that unlocked. hence we
1482          * need to force the log first.
1483          */
1484         xfs_log_force(mp, XFS_LOG_SYNC);
1485
1486         /*
1487          * Do a delwri reclaim pass first so that as many dirty inodes are
1488          * queued up for IO as possible. Then flush the buffers before making
1489          * a synchronous path to catch all the remaining inodes are reclaimed.
1490          * This makes the reclaim process as quick as possible by avoiding
1491          * synchronous writeout and blocking on inodes already in the delwri
1492          * state as much as possible.
1493          */
1494         xfs_reclaim_inodes(mp, 0);
1495         XFS_bflush(mp->m_ddev_targp);
1496         xfs_reclaim_inodes(mp, SYNC_WAIT);
1497
1498         xfs_qm_unmount(mp);
1499
1500         /*
1501          * Flush out the log synchronously so that we know for sure
1502          * that nothing is pinned.  This is important because bflush()
1503          * will skip pinned buffers.
1504          */
1505         xfs_log_force(mp, XFS_LOG_SYNC);
1506
1507         /*
1508          * Unreserve any blocks we have so that when we unmount we don't account
1509          * the reserved free space as used. This is really only necessary for
1510          * lazy superblock counting because it trusts the incore superblock
1511          * counters to be absolutely correct on clean unmount.
1512          *
1513          * We don't bother correcting this elsewhere for lazy superblock
1514          * counting because on mount of an unclean filesystem we reconstruct the
1515          * correct counter value and this is irrelevant.
1516          *
1517          * For non-lazy counter filesystems, this doesn't matter at all because
1518          * we only every apply deltas to the superblock and hence the incore
1519          * value does not matter....
1520          */
1521         resblks = 0;
1522         error = xfs_reserve_blocks(mp, &resblks, NULL);
1523         if (error)
1524                 xfs_warn(mp, "Unable to free reserved block pool. "
1525                                 "Freespace may not be correct on next mount.");
1526
1527         error = xfs_log_sbcount(mp, 1);
1528         if (error)
1529                 xfs_warn(mp, "Unable to update superblock counters. "
1530                                 "Freespace may not be correct on next mount.");
1531         xfs_unmountfs_writesb(mp);
1532
1533         /*
1534          * Make sure all buffers have been flushed and completed before
1535          * unmounting the log.
1536          */
1537         error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
1538         if (error)
1539                 xfs_warn(mp, "%d busy buffers during unmount.", error);
1540         xfs_wait_buftarg(mp->m_ddev_targp);
1541
1542         xfs_log_unmount_write(mp);
1543         xfs_log_unmount(mp);
1544         xfs_uuid_unmount(mp);
1545
1546 #if defined(DEBUG)
1547         xfs_errortag_clearall(mp, 0);
1548 #endif
1549         xfs_free_perag(mp);
1550 }
1551
1552 int
1553 xfs_fs_writable(xfs_mount_t *mp)
1554 {
1555         return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) ||
1556                 (mp->m_flags & XFS_MOUNT_RDONLY));
1557 }
1558
1559 /*
1560  * xfs_log_sbcount
1561  *
1562  * Called either periodically to keep the on disk superblock values
1563  * roughly up to date or from unmount to make sure the values are
1564  * correct on a clean unmount.
1565  *
1566  * Note this code can be called during the process of freezing, so
1567  * we may need to use the transaction allocator which does not not
1568  * block when the transaction subsystem is in its frozen state.
1569  */
1570 int
1571 xfs_log_sbcount(
1572         xfs_mount_t     *mp,
1573         uint            sync)
1574 {
1575         xfs_trans_t     *tp;
1576         int             error;
1577
1578         if (!xfs_fs_writable(mp))
1579                 return 0;
1580
1581         xfs_icsb_sync_counters(mp, 0);
1582
1583         /*
1584          * we don't need to do this if we are updating the superblock
1585          * counters on every modification.
1586          */
1587         if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1588                 return 0;
1589
1590         tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
1591         error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1592                                         XFS_DEFAULT_LOG_COUNT);
1593         if (error) {
1594                 xfs_trans_cancel(tp, 0);
1595                 return error;
1596         }
1597
1598         xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1599         if (sync)
1600                 xfs_trans_set_sync(tp);
1601         error = xfs_trans_commit(tp, 0);
1602         return error;
1603 }
1604
1605 int
1606 xfs_unmountfs_writesb(xfs_mount_t *mp)
1607 {
1608         xfs_buf_t       *sbp;
1609         int             error = 0;
1610
1611         /*
1612          * skip superblock write if fs is read-only, or
1613          * if we are doing a forced umount.
1614          */
1615         if (!((mp->m_flags & XFS_MOUNT_RDONLY) ||
1616                 XFS_FORCED_SHUTDOWN(mp))) {
1617
1618                 sbp = xfs_getsb(mp, 0);
1619
1620                 XFS_BUF_UNDONE(sbp);
1621                 XFS_BUF_UNREAD(sbp);
1622                 XFS_BUF_UNDELAYWRITE(sbp);
1623                 XFS_BUF_WRITE(sbp);
1624                 XFS_BUF_UNASYNC(sbp);
1625                 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1626                 xfsbdstrat(mp, sbp);
1627                 error = xfs_buf_iowait(sbp);
1628                 if (error)
1629                         xfs_ioerror_alert("xfs_unmountfs_writesb",
1630                                           mp, sbp, XFS_BUF_ADDR(sbp));
1631                 xfs_buf_relse(sbp);
1632         }
1633         return error;
1634 }
1635
1636 /*
1637  * xfs_mod_sb() can be used to copy arbitrary changes to the
1638  * in-core superblock into the superblock buffer to be logged.
1639  * It does not provide the higher level of locking that is
1640  * needed to protect the in-core superblock from concurrent
1641  * access.
1642  */
1643 void
1644 xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1645 {
1646         xfs_buf_t       *bp;
1647         int             first;
1648         int             last;
1649         xfs_mount_t     *mp;
1650         xfs_sb_field_t  f;
1651
1652         ASSERT(fields);
1653         if (!fields)
1654                 return;
1655         mp = tp->t_mountp;
1656         bp = xfs_trans_getsb(tp, mp, 0);
1657         first = sizeof(xfs_sb_t);
1658         last = 0;
1659
1660         /* translate/copy */
1661
1662         xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1663
1664         /* find modified range */
1665         f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1666         ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1667         last = xfs_sb_info[f + 1].offset - 1;
1668
1669         f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1670         ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1671         first = xfs_sb_info[f].offset;
1672
1673         xfs_trans_log_buf(tp, bp, first, last);
1674 }
1675
1676
1677 /*
1678  * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
1679  * a delta to a specified field in the in-core superblock.  Simply
1680  * switch on the field indicated and apply the delta to that field.
1681  * Fields are not allowed to dip below zero, so if the delta would
1682  * do this do not apply it and return EINVAL.
1683  *
1684  * The m_sb_lock must be held when this routine is called.
1685  */
1686 STATIC int
1687 xfs_mod_incore_sb_unlocked(
1688         xfs_mount_t     *mp,
1689         xfs_sb_field_t  field,
1690         int64_t         delta,
1691         int             rsvd)
1692 {
1693         int             scounter;       /* short counter for 32 bit fields */
1694         long long       lcounter;       /* long counter for 64 bit fields */
1695         long long       res_used, rem;
1696
1697         /*
1698          * With the in-core superblock spin lock held, switch
1699          * on the indicated field.  Apply the delta to the
1700          * proper field.  If the fields value would dip below
1701          * 0, then do not apply the delta and return EINVAL.
1702          */
1703         switch (field) {
1704         case XFS_SBS_ICOUNT:
1705                 lcounter = (long long)mp->m_sb.sb_icount;
1706                 lcounter += delta;
1707                 if (lcounter < 0) {
1708                         ASSERT(0);
1709                         return XFS_ERROR(EINVAL);
1710                 }
1711                 mp->m_sb.sb_icount = lcounter;
1712                 return 0;
1713         case XFS_SBS_IFREE:
1714                 lcounter = (long long)mp->m_sb.sb_ifree;
1715                 lcounter += delta;
1716                 if (lcounter < 0) {
1717                         ASSERT(0);
1718                         return XFS_ERROR(EINVAL);
1719                 }
1720                 mp->m_sb.sb_ifree = lcounter;
1721                 return 0;
1722         case XFS_SBS_FDBLOCKS:
1723                 lcounter = (long long)
1724                         mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1725                 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1726
1727                 if (delta > 0) {                /* Putting blocks back */
1728                         if (res_used > delta) {
1729                                 mp->m_resblks_avail += delta;
1730                         } else {
1731                                 rem = delta - res_used;
1732                                 mp->m_resblks_avail = mp->m_resblks;
1733                                 lcounter += rem;
1734                         }
1735                 } else {                                /* Taking blocks away */
1736                         lcounter += delta;
1737                         if (lcounter >= 0) {
1738                                 mp->m_sb.sb_fdblocks = lcounter +
1739                                                         XFS_ALLOC_SET_ASIDE(mp);
1740                                 return 0;
1741                         }
1742
1743                         /*
1744                          * We are out of blocks, use any available reserved
1745                          * blocks if were allowed to.
1746                          */
1747                         if (!rsvd)
1748                                 return XFS_ERROR(ENOSPC);
1749
1750                         lcounter = (long long)mp->m_resblks_avail + delta;
1751                         if (lcounter >= 0) {
1752                                 mp->m_resblks_avail = lcounter;
1753                                 return 0;
1754                         }
1755                         printk_once(KERN_WARNING
1756                                 "Filesystem \"%s\": reserve blocks depleted! "
1757                                 "Consider increasing reserve pool size.",
1758                                 mp->m_fsname);
1759                         return XFS_ERROR(ENOSPC);
1760                 }
1761
1762                 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1763                 return 0;
1764         case XFS_SBS_FREXTENTS:
1765                 lcounter = (long long)mp->m_sb.sb_frextents;
1766                 lcounter += delta;
1767                 if (lcounter < 0) {
1768                         return XFS_ERROR(ENOSPC);
1769                 }
1770                 mp->m_sb.sb_frextents = lcounter;
1771                 return 0;
1772         case XFS_SBS_DBLOCKS:
1773                 lcounter = (long long)mp->m_sb.sb_dblocks;
1774                 lcounter += delta;
1775                 if (lcounter < 0) {
1776                         ASSERT(0);
1777                         return XFS_ERROR(EINVAL);
1778                 }
1779                 mp->m_sb.sb_dblocks = lcounter;
1780                 return 0;
1781         case XFS_SBS_AGCOUNT:
1782                 scounter = mp->m_sb.sb_agcount;
1783                 scounter += delta;
1784                 if (scounter < 0) {
1785                         ASSERT(0);
1786                         return XFS_ERROR(EINVAL);
1787                 }
1788                 mp->m_sb.sb_agcount = scounter;
1789                 return 0;
1790         case XFS_SBS_IMAX_PCT:
1791                 scounter = mp->m_sb.sb_imax_pct;
1792                 scounter += delta;
1793                 if (scounter < 0) {
1794                         ASSERT(0);
1795                         return XFS_ERROR(EINVAL);
1796                 }
1797                 mp->m_sb.sb_imax_pct = scounter;
1798                 return 0;
1799         case XFS_SBS_REXTSIZE:
1800                 scounter = mp->m_sb.sb_rextsize;
1801                 scounter += delta;
1802                 if (scounter < 0) {
1803                         ASSERT(0);
1804                         return XFS_ERROR(EINVAL);
1805                 }
1806                 mp->m_sb.sb_rextsize = scounter;
1807                 return 0;
1808         case XFS_SBS_RBMBLOCKS:
1809                 scounter = mp->m_sb.sb_rbmblocks;
1810                 scounter += delta;
1811                 if (scounter < 0) {
1812                         ASSERT(0);
1813                         return XFS_ERROR(EINVAL);
1814                 }
1815                 mp->m_sb.sb_rbmblocks = scounter;
1816                 return 0;
1817         case XFS_SBS_RBLOCKS:
1818                 lcounter = (long long)mp->m_sb.sb_rblocks;
1819                 lcounter += delta;
1820                 if (lcounter < 0) {
1821                         ASSERT(0);
1822                         return XFS_ERROR(EINVAL);
1823                 }
1824                 mp->m_sb.sb_rblocks = lcounter;
1825                 return 0;
1826         case XFS_SBS_REXTENTS:
1827                 lcounter = (long long)mp->m_sb.sb_rextents;
1828                 lcounter += delta;
1829                 if (lcounter < 0) {
1830                         ASSERT(0);
1831                         return XFS_ERROR(EINVAL);
1832                 }
1833                 mp->m_sb.sb_rextents = lcounter;
1834                 return 0;
1835         case XFS_SBS_REXTSLOG:
1836                 scounter = mp->m_sb.sb_rextslog;
1837                 scounter += delta;
1838                 if (scounter < 0) {
1839                         ASSERT(0);
1840                         return XFS_ERROR(EINVAL);
1841                 }
1842                 mp->m_sb.sb_rextslog = scounter;
1843                 return 0;
1844         default:
1845                 ASSERT(0);
1846                 return XFS_ERROR(EINVAL);
1847         }
1848 }
1849
1850 /*
1851  * xfs_mod_incore_sb() is used to change a field in the in-core
1852  * superblock structure by the specified delta.  This modification
1853  * is protected by the m_sb_lock.  Just use the xfs_mod_incore_sb_unlocked()
1854  * routine to do the work.
1855  */
1856 int
1857 xfs_mod_incore_sb(
1858         struct xfs_mount        *mp,
1859         xfs_sb_field_t          field,
1860         int64_t                 delta,
1861         int                     rsvd)
1862 {
1863         int                     status;
1864
1865 #ifdef HAVE_PERCPU_SB
1866         ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1867 #endif
1868         spin_lock(&mp->m_sb_lock);
1869         status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1870         spin_unlock(&mp->m_sb_lock);
1871
1872         return status;
1873 }
1874
1875 /*
1876  * Change more than one field in the in-core superblock structure at a time.
1877  *
1878  * The fields and changes to those fields are specified in the array of
1879  * xfs_mod_sb structures passed in.  Either all of the specified deltas
1880  * will be applied or none of them will.  If any modified field dips below 0,
1881  * then all modifications will be backed out and EINVAL will be returned.
1882  *
1883  * Note that this function may not be used for the superblock values that
1884  * are tracked with the in-memory per-cpu counters - a direct call to
1885  * xfs_icsb_modify_counters is required for these.
1886  */
1887 int
1888 xfs_mod_incore_sb_batch(
1889         struct xfs_mount        *mp,
1890         xfs_mod_sb_t            *msb,
1891         uint                    nmsb,
1892         int                     rsvd)
1893 {
1894         xfs_mod_sb_t            *msbp;
1895         int                     error = 0;
1896
1897         /*
1898          * Loop through the array of mod structures and apply each individually.
1899          * If any fail, then back out all those which have already been applied.
1900          * Do all of this within the scope of the m_sb_lock so that all of the
1901          * changes will be atomic.
1902          */
1903         spin_lock(&mp->m_sb_lock);
1904         for (msbp = msb; msbp < (msb + nmsb); msbp++) {
1905                 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1906                        msbp->msb_field > XFS_SBS_FDBLOCKS);
1907
1908                 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1909                                                    msbp->msb_delta, rsvd);
1910                 if (error)
1911                         goto unwind;
1912         }
1913         spin_unlock(&mp->m_sb_lock);
1914         return 0;
1915
1916 unwind:
1917         while (--msbp >= msb) {
1918                 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1919                                                    -msbp->msb_delta, rsvd);
1920                 ASSERT(error == 0);
1921         }
1922         spin_unlock(&mp->m_sb_lock);
1923         return error;
1924 }
1925
1926 /*
1927  * xfs_getsb() is called to obtain the buffer for the superblock.
1928  * The buffer is returned locked and read in from disk.
1929  * The buffer should be released with a call to xfs_brelse().
1930  *
1931  * If the flags parameter is BUF_TRYLOCK, then we'll only return
1932  * the superblock buffer if it can be locked without sleeping.
1933  * If it can't then we'll return NULL.
1934  */
1935 xfs_buf_t *
1936 xfs_getsb(
1937         xfs_mount_t     *mp,
1938         int             flags)
1939 {
1940         xfs_buf_t       *bp;
1941
1942         ASSERT(mp->m_sb_bp != NULL);
1943         bp = mp->m_sb_bp;
1944         if (flags & XBF_TRYLOCK) {
1945                 if (!XFS_BUF_CPSEMA(bp)) {
1946                         return NULL;
1947                 }
1948         } else {
1949                 XFS_BUF_PSEMA(bp, PRIBIO);
1950         }
1951         XFS_BUF_HOLD(bp);
1952         ASSERT(XFS_BUF_ISDONE(bp));
1953         return bp;
1954 }
1955
1956 /*
1957  * Used to free the superblock along various error paths.
1958  */
1959 void
1960 xfs_freesb(
1961         struct xfs_mount        *mp)
1962 {
1963         struct xfs_buf          *bp = mp->m_sb_bp;
1964
1965         xfs_buf_lock(bp);
1966         mp->m_sb_bp = NULL;
1967         xfs_buf_relse(bp);
1968 }
1969
1970 /*
1971  * Used to log changes to the superblock unit and width fields which could
1972  * be altered by the mount options, as well as any potential sb_features2
1973  * fixup. Only the first superblock is updated.
1974  */
1975 int
1976 xfs_mount_log_sb(
1977         xfs_mount_t     *mp,
1978         __int64_t       fields)
1979 {
1980         xfs_trans_t     *tp;
1981         int             error;
1982
1983         ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1984                          XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1985                          XFS_SB_VERSIONNUM));
1986
1987         tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1988         error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1989                                 XFS_DEFAULT_LOG_COUNT);
1990         if (error) {
1991                 xfs_trans_cancel(tp, 0);
1992                 return error;
1993         }
1994         xfs_mod_sb(tp, fields);
1995         error = xfs_trans_commit(tp, 0);
1996         return error;
1997 }
1998
1999 /*
2000  * If the underlying (data/log/rt) device is readonly, there are some
2001  * operations that cannot proceed.
2002  */
2003 int
2004 xfs_dev_is_read_only(
2005         struct xfs_mount        *mp,
2006         char                    *message)
2007 {
2008         if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2009             xfs_readonly_buftarg(mp->m_logdev_targp) ||
2010             (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2011                 xfs_notice(mp, "%s required on read-only device.", message);
2012                 xfs_notice(mp, "write access unavailable, cannot proceed.");
2013                 return EROFS;
2014         }
2015         return 0;
2016 }
2017
2018 #ifdef HAVE_PERCPU_SB
2019 /*
2020  * Per-cpu incore superblock counters
2021  *
2022  * Simple concept, difficult implementation
2023  *
2024  * Basically, replace the incore superblock counters with a distributed per cpu
2025  * counter for contended fields (e.g.  free block count).
2026  *
2027  * Difficulties arise in that the incore sb is used for ENOSPC checking, and
2028  * hence needs to be accurately read when we are running low on space. Hence
2029  * there is a method to enable and disable the per-cpu counters based on how
2030  * much "stuff" is available in them.
2031  *
2032  * Basically, a counter is enabled if there is enough free resource to justify
2033  * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
2034  * ENOSPC), then we disable the counters to synchronise all callers and
2035  * re-distribute the available resources.
2036  *
2037  * If, once we redistributed the available resources, we still get a failure,
2038  * we disable the per-cpu counter and go through the slow path.
2039  *
2040  * The slow path is the current xfs_mod_incore_sb() function.  This means that
2041  * when we disable a per-cpu counter, we need to drain its resources back to
2042  * the global superblock. We do this after disabling the counter to prevent
2043  * more threads from queueing up on the counter.
2044  *
2045  * Essentially, this means that we still need a lock in the fast path to enable
2046  * synchronisation between the global counters and the per-cpu counters. This
2047  * is not a problem because the lock will be local to a CPU almost all the time
2048  * and have little contention except when we get to ENOSPC conditions.
2049  *
2050  * Basically, this lock becomes a barrier that enables us to lock out the fast
2051  * path while we do things like enabling and disabling counters and
2052  * synchronising the counters.
2053  *
2054  * Locking rules:
2055  *
2056  *      1. m_sb_lock before picking up per-cpu locks
2057  *      2. per-cpu locks always picked up via for_each_online_cpu() order
2058  *      3. accurate counter sync requires m_sb_lock + per cpu locks
2059  *      4. modifying per-cpu counters requires holding per-cpu lock
2060  *      5. modifying global counters requires holding m_sb_lock
2061  *      6. enabling or disabling a counter requires holding the m_sb_lock
2062  *         and _none_ of the per-cpu locks.
2063  *
2064  * Disabled counters are only ever re-enabled by a balance operation
2065  * that results in more free resources per CPU than a given threshold.
2066  * To ensure counters don't remain disabled, they are rebalanced when
2067  * the global resource goes above a higher threshold (i.e. some hysteresis
2068  * is present to prevent thrashing).
2069  */
2070
#ifdef CONFIG_HOTPLUG_CPU
/*
 * hot-plug CPU notifier support.
 *
 * We need a notifier per filesystem as we need to be able to identify
 * the filesystem to balance the counters out. This is achieved by
 * having a notifier block embedded in the xfs_mount_t and using
 * container_of() to get the mount pointer from the notifier block address.
 *
 * "hcpu" carries the number of the CPU the event is about.  NOTIFY_OK is
 * always returned; actions other than the ones handled below are ignored.
 */
STATIC int
xfs_icsb_cpu_notify(
	struct notifier_block *nfb,
	unsigned long action,
	void *hcpu)
{
	xfs_mount_t	*mp = container_of(nfb, xfs_mount_t, m_icsb_notifier);
	xfs_icsb_cnts_t	*cntp = per_cpu_ptr(mp->m_sb_cnts,
					    (unsigned long)hcpu);

	if (action == CPU_UP_PREPARE || action == CPU_UP_PREPARE_FROZEN) {
		/*
		 * Easy case - clear the new CPU's counter area (which also
		 * holds its lock bit); the rebalance done when the CPU comes
		 * online does everything else for us.
		 */
		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
	} else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
		/* redistribute every counter across the new set of CPUs */
		xfs_icsb_lock(mp);
		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
		xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
		xfs_icsb_unlock(mp);
	} else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		/*
		 * Disable all the counters, then fold the dead cpu's
		 * count into the total on the global superblock and
		 * re-enable the counters.
		 */
		xfs_icsb_lock(mp);
		spin_lock(&mp->m_sb_lock);
		xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
		xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
		xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);

		mp->m_sb.sb_icount += cntp->icsb_icount;
		mp->m_sb.sb_ifree += cntp->icsb_ifree;
		mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;

		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));

		xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
		xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
		xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
		spin_unlock(&mp->m_sb_lock);
		xfs_icsb_unlock(mp);
	}

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
2135
2136 int
2137 xfs_icsb_init_counters(
2138         xfs_mount_t     *mp)
2139 {
2140         xfs_icsb_cnts_t *cntp;
2141         int             i;
2142
2143         mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
2144         if (mp->m_sb_cnts == NULL)
2145                 return -ENOMEM;
2146
2147 #ifdef CONFIG_HOTPLUG_CPU
2148         mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
2149         mp->m_icsb_notifier.priority = 0;
2150         register_hotcpu_notifier(&mp->m_icsb_notifier);
2151 #endif /* CONFIG_HOTPLUG_CPU */
2152
2153         for_each_online_cpu(i) {
2154                 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2155                 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
2156         }
2157
2158         mutex_init(&mp->m_icsb_mutex);
2159
2160         /*
2161          * start with all counters disabled so that the
2162          * initial balance kicks us off correctly
2163          */
2164         mp->m_icsb_counters = -1;
2165         return 0;
2166 }
2167
2168 void
2169 xfs_icsb_reinit_counters(
2170         xfs_mount_t     *mp)
2171 {
2172         xfs_icsb_lock(mp);
2173         /*
2174          * start with all counters disabled so that the
2175          * initial balance kicks us off correctly
2176          */
2177         mp->m_icsb_counters = -1;
2178         xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
2179         xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
2180         xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
2181         xfs_icsb_unlock(mp);
2182 }
2183
2184 void
2185 xfs_icsb_destroy_counters(
2186         xfs_mount_t     *mp)
2187 {
2188         if (mp->m_sb_cnts) {
2189                 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
2190                 free_percpu(mp->m_sb_cnts);
2191         }
2192         mutex_destroy(&mp->m_icsb_mutex);
2193 }
2194
2195 STATIC void
2196 xfs_icsb_lock_cntr(
2197         xfs_icsb_cnts_t *icsbp)
2198 {
2199         while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
2200                 ndelay(1000);
2201         }
2202 }
2203
2204 STATIC void
2205 xfs_icsb_unlock_cntr(
2206         xfs_icsb_cnts_t *icsbp)
2207 {
2208         clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
2209 }
2210
2211
2212 STATIC void
2213 xfs_icsb_lock_all_counters(
2214         xfs_mount_t     *mp)
2215 {
2216         xfs_icsb_cnts_t *cntp;
2217         int             i;
2218
2219         for_each_online_cpu(i) {
2220                 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2221                 xfs_icsb_lock_cntr(cntp);
2222         }
2223 }
2224
2225 STATIC void
2226 xfs_icsb_unlock_all_counters(
2227         xfs_mount_t     *mp)
2228 {
2229         xfs_icsb_cnts_t *cntp;
2230         int             i;
2231
2232         for_each_online_cpu(i) {
2233                 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2234                 xfs_icsb_unlock_cntr(cntp);
2235         }
2236 }
2237
2238 STATIC void
2239 xfs_icsb_count(
2240         xfs_mount_t     *mp,
2241         xfs_icsb_cnts_t *cnt,
2242         int             flags)
2243 {
2244         xfs_icsb_cnts_t *cntp;
2245         int             i;
2246
2247         memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
2248
2249         if (!(flags & XFS_ICSB_LAZY_COUNT))
2250                 xfs_icsb_lock_all_counters(mp);
2251
2252         for_each_online_cpu(i) {
2253                 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2254                 cnt->icsb_icount += cntp->icsb_icount;
2255                 cnt->icsb_ifree += cntp->icsb_ifree;
2256                 cnt->icsb_fdblocks += cntp->icsb_fdblocks;
2257         }
2258
2259         if (!(flags & XFS_ICSB_LAZY_COUNT))
2260                 xfs_icsb_unlock_all_counters(mp);
2261 }
2262
2263 STATIC int
2264 xfs_icsb_counter_disabled(
2265         xfs_mount_t     *mp,
2266         xfs_sb_field_t  field)
2267 {
2268         ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2269         return test_bit(field, &mp->m_icsb_counters);
2270 }
2271
2272 STATIC void
2273 xfs_icsb_disable_counter(
2274         xfs_mount_t     *mp,
2275         xfs_sb_field_t  field)
2276 {
2277         xfs_icsb_cnts_t cnt;
2278
2279         ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2280
2281         /*
2282          * If we are already disabled, then there is nothing to do
2283          * here. We check before locking all the counters to avoid
2284          * the expensive lock operation when being called in the
2285          * slow path and the counter is already disabled. This is
2286          * safe because the only time we set or clear this state is under
2287          * the m_icsb_mutex.
2288          */
2289         if (xfs_icsb_counter_disabled(mp, field))
2290                 return;
2291
2292         xfs_icsb_lock_all_counters(mp);
2293         if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
2294                 /* drain back to superblock */
2295
2296                 xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
2297                 switch(field) {
2298                 case XFS_SBS_ICOUNT:
2299                         mp->m_sb.sb_icount = cnt.icsb_icount;
2300                         break;
2301                 case XFS_SBS_IFREE:
2302                         mp->m_sb.sb_ifree = cnt.icsb_ifree;
2303                         break;
2304                 case XFS_SBS_FDBLOCKS:
2305                         mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
2306                         break;
2307                 default:
2308                         BUG();
2309                 }
2310         }
2311
2312         xfs_icsb_unlock_all_counters(mp);
2313 }
2314
2315 STATIC void
2316 xfs_icsb_enable_counter(
2317         xfs_mount_t     *mp,
2318         xfs_sb_field_t  field,
2319         uint64_t        count,
2320         uint64_t        resid)
2321 {
2322         xfs_icsb_cnts_t *cntp;
2323         int             i;
2324
2325         ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
2326
2327         xfs_icsb_lock_all_counters(mp);
2328         for_each_online_cpu(i) {
2329                 cntp = per_cpu_ptr(mp->m_sb_cnts, i);
2330                 switch (field) {
2331                 case XFS_SBS_ICOUNT:
2332                         cntp->icsb_icount = count + resid;
2333                         break;
2334                 case XFS_SBS_IFREE:
2335                         cntp->icsb_ifree = count + resid;
2336                         break;
2337                 case XFS_SBS_FDBLOCKS:
2338                         cntp->icsb_fdblocks = count + resid;
2339                         break;
2340                 default:
2341                         BUG();
2342                         break;
2343                 }
2344                 resid = 0;
2345         }
2346         clear_bit(field, &mp->m_icsb_counters);
2347         xfs_icsb_unlock_all_counters(mp);
2348 }
2349
2350 void
2351 xfs_icsb_sync_counters_locked(
2352         xfs_mount_t     *mp,
2353         int             flags)
2354 {
2355         xfs_icsb_cnts_t cnt;
2356
2357         xfs_icsb_count(mp, &cnt, flags);
2358
2359         if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
2360                 mp->m_sb.sb_icount = cnt.icsb_icount;
2361         if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
2362                 mp->m_sb.sb_ifree = cnt.icsb_ifree;
2363         if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
2364                 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
2365 }
2366
2367 /*
2368  * Accurate update of per-cpu counters to incore superblock
2369  */
2370 void
2371 xfs_icsb_sync_counters(
2372         xfs_mount_t     *mp,
2373         int             flags)
2374 {
2375         spin_lock(&mp->m_sb_lock);
2376         xfs_icsb_sync_counters_locked(mp, flags);
2377         spin_unlock(&mp->m_sb_lock);
2378 }
2379
2380 /*
2381  * Balance and enable/disable counters as necessary.
2382  *
2383  * Thresholds for re-enabling counters are somewhat magic.  inode counts are
2384  * chosen to be the same number as single on disk allocation chunk per CPU, and
2385  * free blocks is something far enough zero that we aren't going thrash when we
2386  * get near ENOSPC. We also need to supply a minimum we require per cpu to
2387  * prevent looping endlessly when xfs_alloc_space asks for more than will
2388  * be distributed to a single CPU but each CPU has enough blocks to be
2389  * reenabled.
2390  *
2391  * Note that we can be called when counters are already disabled.
2392  * xfs_icsb_disable_counter() optimises the counter locking in this case to
2393  * prevent locking every per-cpu counter needlessly.
2394  */
2395
2396 #define XFS_ICSB_INO_CNTR_REENABLE      (uint64_t)64
2397 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
2398                 (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
2399 STATIC void
2400 xfs_icsb_balance_counter_locked(
2401         xfs_mount_t     *mp,
2402         xfs_sb_field_t  field,
2403         int             min_per_cpu)
2404 {
2405         uint64_t        count, resid;
2406         int             weight = num_online_cpus();
2407         uint64_t        min = (uint64_t)min_per_cpu;
2408
2409         /* disable counter and sync counter */
2410         xfs_icsb_disable_counter(mp, field);
2411
2412         /* update counters  - first CPU gets residual*/
2413         switch (field) {
2414         case XFS_SBS_ICOUNT:
2415                 count = mp->m_sb.sb_icount;
2416                 resid = do_div(count, weight);
2417                 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
2418                         return;
2419                 break;
2420         case XFS_SBS_IFREE:
2421                 count = mp->m_sb.sb_ifree;
2422                 resid = do_div(count, weight);
2423                 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
2424                         return;
2425                 break;
2426         case XFS_SBS_FDBLOCKS:
2427                 count = mp->m_sb.sb_fdblocks;
2428                 resid = do_div(count, weight);
2429                 if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
2430                         return;
2431                 break;
2432         default:
2433                 BUG();
2434                 count = resid = 0;      /* quiet, gcc */
2435                 break;
2436         }
2437
2438         xfs_icsb_enable_counter(mp, field, count, resid);
2439 }
2440
/*
 * Rebalance one per-cpu superblock counter, taking mp->m_sb_lock around
 * the operation.
 *
 * This is the unlocked-caller variant of xfs_icsb_balance_counter_locked():
 * it acquires mp->m_sb_lock, forwards its arguments unchanged and releases
 * the lock again.  Despite the plural name, "fields" is passed through as
 * a single xfs_sb_field_t value.
 */
2441 STATIC void
2442 xfs_icsb_balance_counter(
2443         xfs_mount_t     *mp,
2444         xfs_sb_field_t  fields,
2445         int             min_per_cpu)
2446 {
2447         spin_lock(&mp->m_sb_lock);
2448         xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
2449         spin_unlock(&mp->m_sb_lock);
2450 }
2451
/*
 * Apply a signed delta to one of the per-cpu superblock counters.
 *
 * Fast path: with preemption disabled, the current CPU's counter structure
 * is locked, the delta is applied to the local value and 0 is returned.
 *
 * Slow path: if the counter for the field is disabled, the delta is applied
 * to the global superblock via xfs_mod_incore_sb_unlocked() under
 * mp->m_sb_lock, and a rebalance is attempted unless that call returned
 * ENOSPC.
 *
 * Rebalance path: if applying the delta would make the local value
 * negative, the local counter is left untouched, xfs_icsb_balance_counter()
 * is called and the whole operation is retried from the top.
 *
 * Parameters:
 *   mp    - mount whose counters are modified
 *   field - XFS_SBS_ICOUNT, XFS_SBS_IFREE or XFS_SBS_FDBLOCKS; any other
 *           value that reaches the fast-path switch hits BUG()
 *   delta - signed amount to add to the counter
 *   rsvd  - only used on the slow path, where it is passed unchanged to
 *           xfs_mod_incore_sb_unlocked()
 *
 * Returns 0 from the fast path, otherwise the return value of
 * xfs_mod_incore_sb_unlocked().
 */
2452 int
2453 xfs_icsb_modify_counters(
2454         xfs_mount_t     *mp,
2455         xfs_sb_field_t  field,
2456         int64_t         delta,
2457         int             rsvd)
2458 {
2459         xfs_icsb_cnts_t *icsbp;
2460         long long       lcounter;       /* long counter for 64 bit fields */
2461         int             ret = 0;
2462
             /*
              * The slow and rebalance paths take xfs_icsb_lock(), which the
              * slow path comment below describes as a mutex.
              */
2463         might_sleep();
2464 again:
             /* stay on this CPU while its counter structure is in use */
2465         preempt_disable();
2466         icsbp = this_cpu_ptr(mp->m_sb_cnts);
2467
2468         /*
2469          * if the counter is disabled, go to slow path
2470          */
2471         if (unlikely(xfs_icsb_counter_disabled(mp, field)))
2472                 goto slow_path;
             /*
              * Re-test with the per-cpu counter locked; presumably the
              * counter can become disabled between the first test and
              * taking the lock.
              */
2473         xfs_icsb_lock_cntr(icsbp);
2474         if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
2475                 xfs_icsb_unlock_cntr(icsbp);
2476                 goto slow_path;
2477         }
2478
             /*
              * Work on a local copy and only store it back if the result is
              * not negative; otherwise leave the counter alone and rebalance.
              */
2479         switch (field) {
2480         case XFS_SBS_ICOUNT:
2481                 lcounter = icsbp->icsb_icount;
2482                 lcounter += delta;
2483                 if (unlikely(lcounter < 0))
2484                         goto balance_counter;
2485                 icsbp->icsb_icount = lcounter;
2486                 break;
2487
2488         case XFS_SBS_IFREE:
2489                 lcounter = icsbp->icsb_ifree;
2490                 lcounter += delta;
2491                 if (unlikely(lcounter < 0))
2492                         goto balance_counter;
2493                 icsbp->icsb_ifree = lcounter;
2494                 break;
2495
2496         case XFS_SBS_FDBLOCKS:
                     /* the per-cpu path requires m_resblks == m_resblks_avail */
2497                 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
2498
                     /*
                      * Only the amount above XFS_ALLOC_SET_ASIDE(mp) may be
                      * consumed here; the set-aside is added back when the
                      * new value is stored.
                      */
2499                 lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
2500                 lcounter += delta;
2501                 if (unlikely(lcounter < 0))
2502                         goto balance_counter;
2503                 icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
2504                 break;
2505         default:
2506                 BUG();
2507                 break;
2508         }
2509         xfs_icsb_unlock_cntr(icsbp);
2510         preempt_enable();
2511         return 0;
2512
2513 slow_path:
             /* reached with preemption disabled and no per-cpu lock held */
2514         preempt_enable();
2515
2516         /*
2517          * serialise with a mutex so we don't burn lots of cpu on
2518          * the superblock lock. We still need to hold the superblock
2519          * lock, however, when we modify the global structures.
2520          */
2521         xfs_icsb_lock(mp);
2522
2523         /*
2524          * Now running atomically.
2525          *
2526          * If the counter is enabled, someone has beaten us to rebalancing.
2527          * Drop the lock and try again in the fast path....
2528          */
2529         if (!(xfs_icsb_counter_disabled(mp, field))) {
2530                 xfs_icsb_unlock(mp);
2531                 goto again;
2532         }
2533
2534         /*
2535          * The counter is currently disabled. Because we are
2536          * running atomically here, we know a rebalance cannot
2537          * be in progress. Hence we can go straight to operating
2538          * on the global superblock. We do not call xfs_mod_incore_sb()
2539          * here even though we need to get the m_sb_lock. Doing so
2540          * will cause us to re-enter this function and deadlock.
2541          * Hence we get the m_sb_lock ourselves and then call
2542          * xfs_mod_incore_sb_unlocked() as the unlocked path operates
2543          * directly on the global counters.
2544          */
2545         spin_lock(&mp->m_sb_lock);
2546         ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
2547         spin_unlock(&mp->m_sb_lock);
2548
2549         /*
2550          * Now that we've modified the global superblock, we
2551          * may be able to re-enable the distributed counters
2552          * (e.g. lots of space just got freed). After that
2553          * we are done.
2554          */
             /* no rebalance attempt when the global update returned ENOSPC */
2555         if (ret != ENOSPC)
2556                 xfs_icsb_balance_counter(mp, field, 0);
2557         xfs_icsb_unlock(mp);
2558         return ret;
2559
2560 balance_counter:
             /* reached with the per-cpu lock held and preemption disabled */
2561         xfs_icsb_unlock_cntr(icsbp);
2562         preempt_enable();
2563
2564         /*
2565          * We may have multiple threads here if multiple per-cpu
2566          * counters run dry at the same time. This will mean we can
2567          * do more balances than strictly necessary but it is not
2568          * the common slowpath case.
2569          */
2570         xfs_icsb_lock(mp);
2571
2572         /*
2573          * running atomically.
2574          *
2575          * This will leave the counter in the correct state for future
2576          * accesses. After the rebalance, we simply try again and our retry
2577          * will either succeed through the fast path or slow path without
2578          * another balance operation being required.
2579          */
             /*
              * NOTE(review): delta is an int64_t passed as the int
              * min_per_cpu argument, and it is presumably negative whenever
              * this path is reached.  xfs_icsb_balance_counter_locked()
              * casts min_per_cpu to uint64_t, which turns a negative value
              * into a very large minimum - confirm this is intended.
              */
2580         xfs_icsb_balance_counter(mp, field, delta);
2581         xfs_icsb_unlock(mp);
2582         goto again;
2583 }
2584
2585 #endif