fs/ext3/inode.c

   1 /*
   2  *  linux/fs/ext3/inode.c
   3  *
   4  * Copyright (C) 1992, 1993, 1994, 1995
   5  * Remy Card (card@masi.ibp.fr)
   6  * Laboratoire MASI - Institut Blaise Pascal
   7  * Universite Pierre et Marie Curie (Paris VI)
   8  *
   9  *  from
  10  *
  11  *  linux/fs/minix/inode.c
  12  *
  13  *  Copyright (C) 1991, 1992  Linus Torvalds
  14  *
  15  *  Goal-directed block allocation by Stephen Tweedie
  16  *      (sct@redhat.com), 1993, 1998
  17  *  Big-endian to little-endian byte-swapping/bitmaps by
  18  *        David S. Miller (davem@caip.rutgers.edu), 1995
  19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  20  *      (jj@sunsite.ms.mff.cuni.cz)
  21  *
  22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
  23  */
  24
  25 #include <linux/module.h>
  26 #include <linux/fs.h>
  27 #include <linux/time.h>
  28 #include <linux/ext3_jbd.h>
  29 #include <linux/jbd.h>
  30 #include <linux/smp_lock.h>
  31 #include <linux/highuid.h>
  32 #include <linux/pagemap.h>
  33 #include <linux/quotaops.h>
  34 #include <linux/string.h>
  35 #include <linux/buffer_head.h>
  36 #include <linux/writeback.h>
  37 #include <linux/mpage.h>
  38 #include <linux/uio.h>
  39 #include "xattr.h"
  40 #include "acl.h"
  41
  42 /*
  43  * Test whether an inode is a fast symlink.
  44  */
  45 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
  46 {
  47         int ea_blocks = EXT3_I(inode)->i_file_acl ?
  48                 (inode->i_sb->s_blocksize >> 9) : 0;
  49
  50         return (S_ISLNK(inode->i_mode) &&
  51                 inode->i_blocks - ea_blocks == 0);
  52 }
  53
  54 /* The ext3 forget function must perform a revoke if we are freeing data
  55  * which has been journaled.  Metadata (eg. indirect blocks) must be
  56  * revoked in all cases.
  57  *
  58  * "bh" may be NULL: a metadata block may have been freed from memory
  59  * but there may still be a record of it in the journal, and that record
  60  * still needs to be revoked.
  61  */
  62
  63 int ext3_forget(handle_t *handle, int is_metadata,
  64                        struct inode *inode, struct buffer_head *bh,
  65                        int blocknr)
  66 {
  67         int err;
  68
  69         BUFFER_TRACE(bh, "enter");
  70
  71         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
  72                   "data mode %lx\n",
  73                   bh, is_metadata, inode->i_mode,
  74                   test_opt(inode->i_sb, DATA_FLAGS));
  75
  76         /* Never use the revoke function if we are doing full data
  77          * journaling: there is no need to, and a V1 superblock won't
  78          * support it.  Otherwise, only skip the revoke on un-journaled
  79          * data blocks. */
  80
  81         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
  82             (!is_metadata && !ext3_should_journal_data(inode))) {
  83                 if (bh) {
  84                         BUFFER_TRACE(bh, "call journal_forget");
  85                         ext3_journal_forget(handle, bh);
  86                 }
  87                 return 0;
  88         }
  89
  90         /*
  91          * data!=journal && (is_metadata || should_journal_data(inode))
  92          */
  93         BUFFER_TRACE(bh, "call ext3_journal_revoke");
  94         err = ext3_journal_revoke(handle, blocknr, bh);
  95         if (err)
  96                 ext3_abort(inode->i_sb, __FUNCTION__,
  97                            "error %d when attempting revoke", err);
  98         BUFFER_TRACE(bh, "exit");
  99         return err;
 100 }
 101
 102 /*
 103  * Work out how many blocks we need to progress with the next chunk of a
 104  * truncate transaction.
 105  */
 106
 107 static unsigned long blocks_for_truncate(struct inode *inode)
 108 {
 109         unsigned long needed;
 110
 111         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 112
 113         /* Give ourselves just enough room to cope with inodes in which
 114          * i_blocks is corrupt: we've seen disk corruptions in the past
 115          * which resulted in random data in an inode which looked enough
 116          * like a regular file for ext3 to try to delete it.  Things
 117          * will go a bit crazy if that happens, but at least we should
 118          * try not to panic the whole kernel. */
 119         if (needed < 2)
 120                 needed = 2;
 121
 122         /* But we need to bound the transaction so we don't overflow the
 123          * journal. */
 124         if (needed > EXT3_MAX_TRANS_DATA)
 125                 needed = EXT3_MAX_TRANS_DATA;
 126
 127         return EXT3_DATA_TRANS_BLOCKS + needed;
 128 }
 129
 130 /*
 131  * Truncate transactions can be complex and absolutely huge.  So we need to
  132  * be able to restart the transaction at a convenient checkpoint to make
 133  * sure we don't overflow the journal.
 134  *
 135  * start_transaction gets us a new handle for a truncate transaction,
 136  * and extend_transaction tries to extend the existing one a bit.  If
 137  * extend fails, we need to propagate the failure up and restart the
 138  * transaction in the top-level truncate loop. --sct
 139  */
 140
 141 static handle_t *start_transaction(struct inode *inode)
 142 {
 143         handle_t *result;
 144
 145         result = ext3_journal_start(inode, blocks_for_truncate(inode));
 146         if (!IS_ERR(result))
 147                 return result;
 148
 149         ext3_std_error(inode->i_sb, PTR_ERR(result));
 150         return result;
 151 }
 152
 153 /*
 154  * Try to extend this transaction for the purposes of truncation.
 155  *
 156  * Returns 0 if we managed to create more room.  If we can't create more
 157  * room, and the transaction must be restarted we return 1.
 158  */
 159 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 160 {
 161         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
 162                 return 0;
 163         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
 164                 return 0;
 165         return 1;
 166 }
 167
 168 /*
 169  * Restart the transaction associated with *handle.  This does a commit,
 170  * so before we call here everything must be consistently dirtied against
 171  * this transaction.
 172  */
 173 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
 174 {
 175         jbd_debug(2, "restarting handle %p\n", handle);
 176         return ext3_journal_restart(handle, blocks_for_truncate(inode));
 177 }
 178
 179 /*
 180  * Called at each iput()
 181  *
 182  * The inode may be "bad" if ext3_read_inode() saw an error from
 183  * ext3_get_inode(), so we need to check that to avoid freeing random disk
 184  * blocks.
 185  */
 186 void ext3_put_inode(struct inode *inode)
 187 {
 188         if (!is_bad_inode(inode))
 189                 ext3_discard_prealloc(inode);
 190 }
 191
 192 /*
 193  * Called at the last iput() if i_nlink is zero.
 194  */
 195 void ext3_delete_inode (struct inode * inode)
 196 {
 197         handle_t *handle;
 198
 199         if (is_bad_inode(inode))
 200                 goto no_delete;
 201
 202         handle = start_transaction(inode);
 203         if (IS_ERR(handle)) {
 204                 /* If we're going to skip the normal cleanup, we still
 205                  * need to make sure that the in-core orphan linked list
 206                  * is properly cleaned up. */
 207                 ext3_orphan_del(NULL, inode);
 208
 209                 ext3_std_error(inode->i_sb, PTR_ERR(handle));
 210                 goto no_delete;
 211         }
 212
 213         if (IS_SYNC(inode))
 214                 handle->h_sync = 1;
 215         inode->i_size = 0;
 216         if (inode->i_blocks)
 217                 ext3_truncate(inode);
 218         /*
 219          * Kill off the orphan record which ext3_truncate created.
 220          * AKPM: I think this can be inside the above `if'.
 221          * Note that ext3_orphan_del() has to be able to cope with the
 222          * deletion of a non-existent orphan - this is because we don't
 223          * know if ext3_truncate() actually created an orphan record.
 224          * (Well, we could do this if we need to, but heck - it works)
 225          */
 226         ext3_orphan_del(handle, inode);
 227         EXT3_I(inode)->i_dtime  = get_seconds();
 228
 229         /*
 230          * One subtle ordering requirement: if anything has gone wrong
 231          * (transaction abort, IO errors, whatever), then we can still
 232          * do these next steps (the fs will already have been marked as
 233          * having errors), but we can't free the inode if the mark_dirty
 234          * fails.
 235          */
 236         if (ext3_mark_inode_dirty(handle, inode))
 237                 /* If that failed, just do the required in-core inode clear. */
 238                 clear_inode(inode);
 239         else
 240                 ext3_free_inode(handle, inode);
 241         ext3_journal_stop(handle);
 242         return;
 243 no_delete:
 244         clear_inode(inode);     /* We must guarantee clearing of inode... */
 245 }
 246
 247 void ext3_discard_prealloc (struct inode * inode)
 248 {
 249 #ifdef EXT3_PREALLOCATE
 250         struct ext3_inode_info *ei = EXT3_I(inode);
 251         /* Writer: ->i_prealloc* */
 252         if (ei->i_prealloc_count) {
 253                 unsigned short total = ei->i_prealloc_count;
 254                 unsigned long block = ei->i_prealloc_block;
 255                 ei->i_prealloc_count = 0;
 256                 ei->i_prealloc_block = 0;
 257                 /* Writer: end */
 258                 ext3_free_blocks (inode, block, total);
 259         }
 260 #endif
 261 }
 262
/*
 * Allocate one block for @inode, preferably at or near @goal.
 *
 * @handle: journal handle, passed on to ext3_new_block() in the
 *          non-preallocating build
 * @inode:  inode the block is allocated for
 * @goal:   preferred block number
 * @err:    passed through to ext3_new_block()
 *
 * Returns the value obtained for the new block.  Note that the value is
 * held in an unsigned long locally but returned as int.
 *
 * NOTE(review): the EXT3_PREALLOCATE branch calls ext3_new_block()
 * without a handle argument while the #else branch passes one; the
 * preallocation branch looks stale - confirm before enabling it.
 */
static int ext3_alloc_block (handle_t *handle,
                        struct inode * inode, unsigned long goal, int *err)
{
        unsigned long result;

#ifdef EXT3_PREALLOCATE
#ifdef EXT3FS_DEBUG
        /* Hit/attempt counters, only referenced from ext3_debug() calls. */
        static unsigned long alloc_hits = 0, alloc_attempts = 0;
#endif
        struct ext3_inode_info *ei = EXT3_I(inode);
        /* Writer: ->i_prealloc* */
        /*
         * Hit: the preallocation window is non-empty and starts at the
         * goal or at the block right after it.
         */
        if (ei->i_prealloc_count &&
            (goal == ei->i_prealloc_block ||
             goal + 1 == ei->i_prealloc_block))
        {
                /* Take the first block of the window and shrink it. */
                result = ei->i_prealloc_block++;
                ei->i_prealloc_count--;
                /* Writer: end */
                ext3_debug ("preallocation hit (%lu/%lu).\n",
                            ++alloc_hits, ++alloc_attempts);
        } else {
                /* Miss: drop the current window and allocate afresh. */
                ext3_discard_prealloc (inode);
                ext3_debug ("preallocation miss (%lu/%lu).\n",
                            alloc_hits, ++alloc_attempts);
                /* Only regular files get a new preallocation window. */
                if (S_ISREG(inode->i_mode))
                        result = ext3_new_block (inode, goal,
                                 &ei->i_prealloc_count,
                                 &ei->i_prealloc_block, err);
                else
                        result = ext3_new_block (inode, goal, 0, 0, err);
                /*
                 * AKPM: this is somewhat sticky.  I'm not surprised it was
                 * disabled in 2.2's ext3.  Need to integrate b_committed_data
                 * guarding with preallocation, if indeed preallocation is
                 * effective.
                 */
        }
#else
        /* No preallocation: both preallocation out-parameters are 0. */
        result = ext3_new_block (handle, inode, goal, 0, 0, err);
#endif
        return result;
}
 305
 306
/*
 * One link of the chain leading from the inode to a block: the slot a
 * block number was read from, the value found there, and the buffer
 * that hosts the slot.
 */
typedef struct {
        u32     *p;     /* address of the slot holding the block number */
        u32     key;    /* value read from *p (users convert it with le32_to_cpu()) */
        struct buffer_head *bh; /* buffer containing the slot; NULL when the slot is in the inode */
} Indirect;
 312
 313 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
 314 {
 315         p->key = *(p->p = v);
 316         p->bh = bh;
 317 }
 318
 319 static inline int verify_chain(Indirect *from, Indirect *to)
 320 {
 321         while (from <= to && from->key == *from->p)
 322                 from++;
 323         return (from > to);
 324 }
 325
 326 /**
 327  *      ext3_block_to_path - parse the block number into array of offsets
 328  *      @inode: inode in question (we are only interested in its superblock)
 329  *      @i_block: block number to be parsed
 330  *      @offsets: array to store the offsets in
 331  *      @boundary: set this non-zero if the referred-to block is likely to be
 332  *             followed (on disk) by an indirect block.
 333  *
 334  *      To store the locations of file's data ext3 uses a data structure common
 335  *      for UNIX filesystems - tree of pointers anchored in the inode, with
 336  *      data blocks at leaves and indirect blocks in intermediate nodes.
 337  *      This function translates the block number into path in that tree -
 338  *      return value is the path length and @offsets[n] is the offset of
  339  *      pointer to (n+1)th node in the nth one. If @i_block is out of range
 340  *      (negative or too large) warning is printed and zero returned.
 341  *
 342  *      Note: function doesn't find node addresses, so no IO is needed. All
 343  *      we need to know is the capacity of indirect blocks (taken from the
 344  *      inode->i_sb).
 345  */
 346
 347 /*
 348  * Portability note: the last comparison (check that we fit into triple
 349  * indirect block) is spelled differently, because otherwise on an
 350  * architecture with 32-bit longs and 8Kb pages we might get into trouble
 351  * if our filesystem had 8Kb blocks. We might use long long, but that would
 352  * kill us on x86. Oh, well, at least the sign propagation does not matter -
 353  * i_block would have to be negative in the very beginning, so we would not
 354  * get there at all.
 355  */
 356
 357 static int ext3_block_to_path(struct inode *inode,
 358                         long i_block, int offsets[4], int *boundary)
 359 {
 360         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 361         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
 362         const long direct_blocks = EXT3_NDIR_BLOCKS,
 363                 indirect_blocks = ptrs,
 364                 double_blocks = (1 << (ptrs_bits * 2));
 365         int n = 0;
 366         int final = 0;
 367
 368         if (i_block < 0) {
 369                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
 370         } else if (i_block < direct_blocks) {
 371                 offsets[n++] = i_block;
 372                 final = direct_blocks;
 373         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
 374                 offsets[n++] = EXT3_IND_BLOCK;
 375                 offsets[n++] = i_block;
 376                 final = ptrs;
 377         } else if ((i_block -= indirect_blocks) < double_blocks) {
 378                 offsets[n++] = EXT3_DIND_BLOCK;
 379                 offsets[n++] = i_block >> ptrs_bits;
 380                 offsets[n++] = i_block & (ptrs - 1);
 381                 final = ptrs;
 382         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 383                 offsets[n++] = EXT3_TIND_BLOCK;
 384                 offsets[n++] = i_block >> (ptrs_bits * 2);
 385                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 386                 offsets[n++] = i_block & (ptrs - 1);
 387                 final = ptrs;
 388         } else {
 389                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
 390         }
 391         if (boundary)
 392                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
 393         return n;
 394 }
 395
 396 /**
 397  *      ext3_get_branch - read the chain of indirect blocks leading to data
 398  *      @inode: inode in question
 399  *      @depth: depth of the chain (1 - direct pointer, etc.)
 400  *      @offsets: offsets of pointers in inode/indirect blocks
 401  *      @chain: place to store the result
 402  *      @err: here we store the error value
 403  *
 404  *      Function fills the array of triples <key, p, bh> and returns %NULL
 405  *      if everything went OK or the pointer to the last filled triple
 406  *      (incomplete one) otherwise. Upon the return chain[i].key contains
 407  *      the number of (i+1)-th block in the chain (as it is stored in memory,
 408  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 409  *      number (it points into struct inode for i==0 and into the bh->b_data
 410  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 411  *      block for i>0 and NULL for i==0. In other words, it holds the block
 412  *      numbers of the chain, addresses they were taken from (and where we can
 413  *      verify that chain did not change) and buffer_heads hosting these
 414  *      numbers.
 415  *
 416  *      Function stops when it stumbles upon zero pointer (absent block)
 417  *              (pointer to last triple returned, *@err == 0)
 418  *      or when it gets an IO error reading an indirect block
 419  *              (ditto, *@err == -EIO)
 420  *      or when it notices that chain had been changed while it was reading
 421  *              (ditto, *@err == -EAGAIN)
 422  *      or when it reads all @depth-1 indirect blocks successfully and finds
 423  *      the whole chain, all way to the data (returns %NULL, *err == 0).
 424  */
 425 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
 426                                  Indirect chain[4], int *err)
 427 {
 428         struct super_block *sb = inode->i_sb;
 429         Indirect *p = chain;
 430         struct buffer_head *bh;
 431
 432         *err = 0;
 433         /* i_data is not going away, no lock needed */
 434         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
 435         if (!p->key)
 436                 goto no_block;
 437         while (--depth) {
 438                 bh = sb_bread(sb, le32_to_cpu(p->key));
 439                 if (!bh)
 440                         goto failure;
 441                 /* Reader: pointers */
 442                 if (!verify_chain(chain, p))
 443                         goto changed;
 444                 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
 445                 /* Reader: end */
 446                 if (!p->key)
 447                         goto no_block;
 448         }
 449         return NULL;
 450
 451 changed:
 452         brelse(bh);
 453         *err = -EAGAIN;
 454         goto no_block;
 455 failure:
 456         *err = -EIO;
 457 no_block:
 458         return p;
 459 }
 460
 461 /**
 462  *      ext3_find_near - find a place for allocation with sufficient locality
 463  *      @inode: owner
 464  *      @ind: descriptor of indirect block.
 465  *
  466  *      This function returns the preferred place for block allocation.
 467  *      It is used when heuristic for sequential allocation fails.
 468  *      Rules are:
 469  *        + if there is a block to the left of our position - allocate near it.
 470  *        + if pointer will live in indirect block - allocate near that block.
 471  *        + if pointer will live in inode - allocate in the same
 472  *          cylinder group.
 473  *
 474  * In the latter case we colour the starting block by the callers PID to
 475  * prevent it from clashing with concurrent allocations for a different inode
 476  * in the same block group.   The PID is used here so that functionally related
 477  * files will be close-by on-disk.
 478  *
 479  *      Caller must make sure that @ind is valid and will stay that way.
 480  */
 481
 482 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 483 {
 484         struct ext3_inode_info *ei = EXT3_I(inode);
 485         u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
 486         u32 *p;
 487         unsigned long bg_start;
 488         unsigned long colour;
 489
 490         /* Try to find previous block */
 491         for (p = ind->p - 1; p >= start; p--)
 492                 if (*p)
 493                         return le32_to_cpu(*p);
 494
 495         /* No such thing, so let's try location of indirect block */
 496         if (ind->bh)
 497                 return ind->bh->b_blocknr;
 498
 499         /*
 500          * It is going to be refered from inode itself? OK, just put it into
 501          * the same cylinder group then.
 502          */
 503         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
 504                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
 505         colour = (current->pid % 16) *
 506                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 507         return bg_start + colour;
 508 }
 509
 510 /**
  511  *      ext3_find_goal - find a preferred place for allocation.
 512  *      @inode: owner
 513  *      @block:  block we want
 514  *      @chain:  chain of indirect blocks
 515  *      @partial: pointer to the last triple within a chain
 516  *      @goal:  place to store the result.
 517  *
  518  *      Normally this function finds the preferred place for block allocation,
 519  *      stores it in *@goal and returns zero. If the branch had been changed
 520  *      under us we return -EAGAIN.
 521  */
 522
 523 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
 524                           Indirect *partial, unsigned long *goal)
 525 {
 526         struct ext3_inode_info *ei = EXT3_I(inode);
 527         /* Writer: ->i_next_alloc* */
 528         if (block == ei->i_next_alloc_block + 1) {
 529                 ei->i_next_alloc_block++;
 530                 ei->i_next_alloc_goal++;
 531         }
 532         /* Writer: end */
 533         /* Reader: pointers, ->i_next_alloc* */
 534         if (verify_chain(chain, partial)) {
 535                 /*
 536                  * try the heuristic for sequential allocation,
 537                  * failing that at least try to get decent locality.
 538                  */
 539                 if (block == ei->i_next_alloc_block)
 540                         *goal = ei->i_next_alloc_goal;
 541                 if (!*goal)
 542                         *goal = ext3_find_near(inode, partial);
 543                 return 0;
 544         }
 545         /* Reader: end */
 546         return -EAGAIN;
 547 }
 548
 549 /**
 550  *      ext3_alloc_branch - allocate and set up a chain of blocks.
 551  *      @inode: owner
 552  *      @num: depth of the chain (number of blocks to allocate)
 553  *      @offsets: offsets (in the blocks) to store the pointers to next.
 554  *      @branch: place to store the chain in.
 555  *
 556  *      This function allocates @num blocks, zeroes out all but the last one,
 557  *      links them into chain and (if we are synchronous) writes them to disk.
 558  *      In other words, it prepares a branch that can be spliced onto the
 559  *      inode. It stores the information about that chain in the branch[], in
 560  *      the same format as ext3_get_branch() would do. We are calling it after
 561  *      we had read the existing part of chain and partial points to the last
 562  *      triple of that (one with zero ->key). Upon the exit we have the same
  563  *      picture as after the successful ext3_get_block(), except that in one
 564  *      place chain is disconnected - *branch->p is still zero (we did not
 565  *      set the last link), but branch->key contains the number that should
 566  *      be placed into *branch->p to fill that gap.
 567  *
 568  *      If allocation fails we free all blocks we've allocated (and forget
  569  *      their buffer_heads) and return the error value from the failed
 570  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 571  *      as described above and return 0.
 572  */
 573
 574 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 575                              int num,
 576                              unsigned long goal,
 577                              int *offsets,
 578                              Indirect *branch)
 579 {
 580         int blocksize = inode->i_sb->s_blocksize;
 581         int n = 0, keys = 0;
 582         int err = 0;
 583         int i;
 584         int parent = ext3_alloc_block(handle, inode, goal, &err);
 585
 586         branch[0].key = cpu_to_le32(parent);
 587         if (parent) {
 588                 for (n = 1; n < num; n++) {
 589                         struct buffer_head *bh;
 590                         /* Allocate the next block */
 591                         int nr = ext3_alloc_block(handle, inode, parent, &err);
 592                         if (!nr)
 593                                 break;
 594                         branch[n].key = cpu_to_le32(nr);
 595                         keys = n+1;
 596
 597                         /*
 598                          * Get buffer_head for parent block, zero it out
 599                          * and set the pointer to new one, then send
 600                          * parent to disk.
 601                          */
 602                         bh = sb_getblk(inode->i_sb, parent);
 603                         branch[n].bh = bh;
 604                         lock_buffer(bh);
 605                         BUFFER_TRACE(bh, "call get_create_access");
 606                         err = ext3_journal_get_create_access(handle, bh);
 607                         if (err) {
 608                                 unlock_buffer(bh);
 609                                 brelse(bh);
 610                                 break;
 611                         }
 612
 613                         memset(bh->b_data, 0, blocksize);
 614                         branch[n].p = (u32*) bh->b_data + offsets[n];
 615                         *branch[n].p = branch[n].key;
 616                         BUFFER_TRACE(bh, "marking uptodate");
 617                         set_buffer_uptodate(bh);
 618                         unlock_buffer(bh);
 619
 620                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 621                         err = ext3_journal_dirty_metadata(handle, bh);
 622                         if (err)
 623                                 break;
 624
 625                         parent = nr;
 626                 }
 627         }
 628         if (n == num)
 629                 return 0;
 630
 631         /* Allocation failed, free what we already allocated */
 632         for (i = 1; i < keys; i++) {
 633                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
 634                 ext3_journal_forget(handle, branch[i].bh);
 635         }
 636         for (i = 0; i < keys; i++)
 637                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
 638         return err;
 639 }
 640
 641 /**
 642  *      ext3_splice_branch - splice the allocated branch onto inode.
 643  *      @inode: owner
 644  *      @block: (logical) number of block we are adding
 645  *      @chain: chain of indirect blocks (with a missing link - see
 646  *              ext3_alloc_branch)
 647  *      @where: location of missing link
 648  *      @num:   number of blocks we are adding
 649  *
 650  *      This function verifies that chain (up to the missing link) had not
 651  *      changed, fills the missing link and does all housekeeping needed in
 652  *      inode (->i_blocks, etc.). In case of success we end up with the full
 653  *      chain to new block and return 0. Otherwise (== chain had been changed)
 654  *      we free the new blocks (forgetting their buffer_heads, indeed) and
 655  *      return -EAGAIN.
 656  */
 657
static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
                              Indirect chain[4], Indirect *where, int num)
{
        int i;
        int err = 0;
        struct ext3_inode_info *ei = EXT3_I(inode);

        /*
         * If we're splicing into a [td]indirect block (as opposed to the
         * inode) then we need to get write access to the [td]indirect block
         * before the splice.
         */
        if (where->bh) {
                BUFFER_TRACE(where->bh, "get_write_access");
                err = ext3_journal_get_write_access(handle, where->bh);
                if (err)
                        goto err_out;
        }
        /* Verify that place we are splicing to is still there and vacant */

        /* Writer: pointers, ->i_next_alloc* */
        /*
         * The chain up to (but not including) the missing link must be
         * unchanged, and the slot we are about to fill must still be zero.
         */
        if (!verify_chain(chain, where-1) || *where->p)
                /* Writer: end */
                goto changed;

        /* That's it */

        /* Fill the missing link and remember where the next block should go. */
        *where->p = where->key;
        ei->i_next_alloc_block = block;
        ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
        /* Writer: end */

        /* We are done with atomic stuff, now do the rest of housekeeping */

        inode->i_ctime = CURRENT_TIME;
        /* The return value of ext3_mark_inode_dirty() is not checked here. */
        ext3_mark_inode_dirty(handle, inode);

        /* had we spliced it onto indirect block? */
        if (where->bh) {
                /*
                 * akpm: If we spliced it onto an indirect block, we haven't
                 * altered the inode.  Note however that if it is being spliced
                 * onto an indirect block at the very end of the file (the
                 * file is growing) then we *will* alter the inode to reflect
                 * the new i_size.  But that is not done here - it is done in
                 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
                BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
                err = ext3_journal_dirty_metadata(handle, where->bh);
                if (err)
                        goto err_out;
        } else {
                /*
                 * OK, we spliced it into the inode itself on a direct block.
                 * Inode was dirtied above.
                 */
                jbd_debug(5, "splicing direct\n");
        }
        return err;

changed:
        /*
         * AKPM: if where[i].bh isn't part of the current updating
         * transaction then we explode nastily.  Test this code path.
         */
        jbd_debug(1, "the chain changed: try again\n");
        err = -EAGAIN;

err_out:
        /*
         * Forget the buffers of the newly allocated branch.  where[0].bh
         * is skipped: the loop starts at 1.
         */
        for (i = 1; i < num; i++) {
                BUFFER_TRACE(where[i].bh, "call journal_forget");
                ext3_journal_forget(handle, where[i].bh);
        }
        /* For the normal collision cleanup case, we free up the blocks.
         * On genuine filesystem errors we don't even think about doing
         * that. */
        if (err == -EAGAIN)
                for (i = 0; i < num; i++)
                        ext3_free_blocks(handle, inode,
                                         le32_to_cpu(where[i].key), 1);
        return err;
}
 741
/*
 * Map logical block `iblock' of `inode' to an on-disk block, optionally
 * allocating the missing part of the indirect chain.
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * NOTE(review): nothing in this function itself takes the BKL; confirm
 * whether the callers are expected to.
 *
 * Parameters:
 *   handle          - journal handle; asserted non-NULL whenever create != 0
 *   inode, iblock   - file and logical block number to look up
 *   bh_result       - receives the mapping through map_bh(); flagged "new"
 *                     when the block was allocated here and "boundary" when
 *                     ext3_block_to_path() reported one
 *   create          - when zero, a missing block is reported but not allocated
 *   extend_disksize - when non-zero, i_disksize is raised to i_size after a
 *                     successful allocation
 *
 * Returns the final value of `err': -EIO when ext3_block_to_path() yields a
 * depth of 0, otherwise whatever ext3_get_branch(), ext3_alloc_branch() or
 * ext3_splice_branch() last stored there.
 */

static int
ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
                struct buffer_head *bh_result, int create, int extend_disksize)
{
        int err = -EIO;
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
        unsigned long goal;
        int left;
        int boundary = 0;
        int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
        struct ext3_inode_info *ei = EXT3_I(inode);
        loff_t new_size;

        J_ASSERT(handle != NULL || create == 0);

        /* No path could be computed: err still holds its initial -EIO. */
        if (depth == 0)
                goto out;

reread:
        partial = ext3_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                clear_buffer_new(bh_result);
got_it:
                /* Also reached by goto after a successful allocation below. */
                map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
                if (boundary)
                        set_buffer_boundary(bh_result);
                /* Clean up and exit */
                partial = chain+depth-1; /* the whole chain */
                goto cleanup;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO) {
cleanup:
                /*
                 * Release the buffers held by chain[1..partial]; chain[0]
                 * is never brelse'd by this loop.
                 */
                while (partial > chain) {
                        BUFFER_TRACE(partial->bh, "call brelse");
                        brelse(partial->bh);
                        partial--;
                }
                BUFFER_TRACE(bh_result, "returned");
out:
                return err;
        }

        /*
         * Indirect block might be removed by truncate while we were
         * reading it. Handling of that case (forget what we've got and
         * reread) is taken out of the main path.
         */
        if (err == -EAGAIN)
                goto changed;

        if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
                goto changed;

        /* Number of chain entries from `partial' to the end of the path. */
        left = (chain + depth) - partial;

        /*
         * Block out ext3_truncate while we alter the tree
         */
        down_read(&ei->truncate_sem);
        err = ext3_alloc_branch(handle, inode, left, goal,
                                        offsets+(partial-chain), partial);

        /* The ext3_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct */
        if (!err)
                err = ext3_splice_branch(handle, inode, iblock, chain,
                                         partial, left);
        up_read(&ei->truncate_sem);
        if (err == -EAGAIN)
                goto changed;
        if (err)
                goto cleanup;

        if (extend_disksize) {
                /*
                 * This is not racy against ext3_truncate's modification of
                 * i_disksize because VM/VFS ensures that the file cannot be
                 * extended while truncate is in progress.  It is racy between
                 * multiple parallel instances of get_block, but we have BKL.
                 */
                new_size = inode->i_size;
                if (new_size > ei->i_disksize)
                        ei->i_disksize = new_size;
        }
        set_buffer_new(bh_result);
        goto got_it;

changed:
        /* Drop every buffer picked up so far, then walk the path again. */
        while (partial > chain) {
                jbd_debug(1, "buffer chain changed, retrying\n");
                BUFFER_TRACE(partial->bh, "brelsing");
                brelse(partial->bh);
                partial--;
        }
        goto reread;
}
 864
 865 static int ext3_get_block(struct inode *inode, sector_t iblock,
 866                         struct buffer_head *bh_result, int create)
 867 {
 868         handle_t *handle = 0;
 869         int ret;
 870
 871         if (create) {
 872                 handle = ext3_journal_current_handle();
 873                 J_ASSERT(handle != 0);
 874         }
 875         ret = ext3_get_block_handle(handle, inode, iblock,
 876                                 bh_result, create, 1);
 877         return ret;
 878 }
 879
 880 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
 881
 882 static int
 883 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
 884                 unsigned long max_blocks, struct buffer_head *bh_result,
 885                 int create)
 886 {
 887         handle_t *handle = journal_current_handle();
 888         int ret = 0;
 889
 890         if (handle && handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
 891                 /*
 892                  * Getting low on buffer credits...
 893                  */
 894                 if (!ext3_journal_extend(handle, DIO_CREDITS)) {
 895                         /*
 896                          * Couldn't extend the transaction.  Start a new one
 897                          */
 898                         ret = ext3_journal_restart(handle, DIO_CREDITS);
 899                 }
 900         }
 901         if (ret == 0)
 902                 ret = ext3_get_block_handle(handle, inode, iblock,
 903                                         bh_result, create, 0);
 904         if (ret == 0)
 905                 bh_result->b_size = (1 << inode->i_blkbits);
 906         return ret;
 907 }
 908
 909
 910 /*
 911  * `handle' can be NULL if create is zero
 912  */
 913 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 914                                 long block, int create, int * errp)
 915 {
 916         struct buffer_head dummy;
 917         int fatal = 0, err;
 918
 919         J_ASSERT(handle != NULL || create == 0);
 920
 921         dummy.b_state = 0;
 922         dummy.b_blocknr = -1000;
 923         buffer_trace_init(&dummy.b_history);
 924         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
 925         if (!*errp && buffer_mapped(&dummy)) {
 926                 struct buffer_head *bh;
 927                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
 928                 if (buffer_new(&dummy)) {
 929                         J_ASSERT(create != 0);
 930                         J_ASSERT(handle != 0);
 931
 932                         /* Now that we do not always journal data, we
 933                            should keep in mind whether this should
 934                            always journal the new buffer as metadata.
 935                            For now, regular file writes use
 936                            ext3_get_block instead, so it's not a
 937                            problem. */
 938                         lock_buffer(bh);
 939                         BUFFER_TRACE(bh, "call get_create_access");
 940                         fatal = ext3_journal_get_create_access(handle, bh);
 941                         if (!fatal) {
 942                                 memset(bh->b_data, 0,
 943                                        inode->i_sb->s_blocksize);
 944                                 set_buffer_uptodate(bh);
 945                         }
 946                         unlock_buffer(bh);
 947                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 948                         err = ext3_journal_dirty_metadata(handle, bh);
 949                         if (!fatal) fatal = err;
 950                 } else {
 951                         BUFFER_TRACE(bh, "not a new buffer");
 952                 }
 953                 if (fatal) {
 954                         *errp = fatal;
 955                         brelse(bh);
 956                         bh = NULL;
 957                 }
 958                 return bh;
 959         }
 960         return NULL;
 961 }
 962
 963 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
 964                                int block, int create, int *err)
 965 {
 966         struct buffer_head * bh;
 967         int prev_blocks;
 968
 969         prev_blocks = inode->i_blocks;
 970
 971         bh = ext3_getblk (handle, inode, block, create, err);
 972         if (!bh)
 973                 return bh;
 974 #ifdef EXT3_PREALLOCATE
 975         /*
 976          * If the inode has grown, and this is a directory, then use a few
 977          * more of the preallocated blocks to keep directory fragmentation
 978          * down.  The preallocated blocks are guaranteed to be contiguous.
 979          */
 980         if (create &&
 981             S_ISDIR(inode->i_mode) &&
 982             inode->i_blocks > prev_blocks &&
 983             EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
 984                                     EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
 985                 int i;
 986                 struct buffer_head *tmp_bh;
 987
 988                 for (i = 1;
 989                      EXT3_I(inode)->i_prealloc_count &&
 990                      i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
 991                      i++) {
 992                         /*
 993                          * ext3_getblk will zero out the contents of the
 994                          * directory for us
 995                          */
 996                         tmp_bh = ext3_getblk(handle, inode,
 997                                                 block+i, create, err);
 998                         if (!tmp_bh) {
 999                                 brelse (bh);
1000                                 return 0;
1001                         }
1002                         brelse (tmp_bh);
1003                 }
1004         }
1005 #endif
1006         if (buffer_uptodate(bh))
1007                 return bh;
1008         ll_rw_block (READ, 1, &bh);
1009         wait_on_buffer (bh);
1010         if (buffer_uptodate(bh))
1011                 return bh;
1012         brelse (bh);
1013         *err = -EIO;
1014         return NULL;
1015 }
1016
1017 static int walk_page_buffers(   handle_t *handle,
1018                                 struct buffer_head *head,
1019                                 unsigned from,
1020                                 unsigned to,
1021                                 int *partial,
1022                                 int (*fn)(      handle_t *handle,
1023                                                 struct buffer_head *bh))
1024 {
1025         struct buffer_head *bh;
1026         unsigned block_start, block_end;
1027         unsigned blocksize = head->b_size;
1028         int err, ret = 0;
1029         struct buffer_head *next;
1030
1031         for (   bh = head, block_start = 0;
1032                 ret == 0 && (bh != head || !block_start);
1033                 block_start = block_end, bh = next)
1034         {
1035                 next = bh->b_this_page;
1036                 block_end = block_start + blocksize;
1037                 if (block_end <= from || block_start >= to) {
1038                         if (partial && !buffer_uptodate(bh))
1039                                 *partial = 1;
1040                         continue;
1041                 }
1042                 err = (*fn)(handle, bh);
1043                 if (!ret)
1044                         ret = err;
1045         }
1046         return ret;
1047 }
1048
1049 /*
1050  * To preserve ordering, it is essential that the hole instantiation and
1051  * the data write be encapsulated in a single transaction.  We cannot
1052  * close off a transaction and start a new one between the ext3_get_block()
1053  * and the commit_write().  So doing the journal_start at the start of
1054  * prepare_write() is the right place.
1055  *
1056  * Also, this function can nest inside ext3_writepage() ->
1057  * block_write_full_page(). In that case, we *know* that ext3_writepage()
1058  * has generated enough buffer credits to do the whole page.  So we won't
1059  * block on the journal in that case, which is good, because the caller may
1060  * be PF_MEMALLOC.
1061  *
1062  * By accident, ext3 can be reentered when a transaction is open via
1063  * quota file writes.  If we were to commit the transaction while thus
1064  * reentered, there can be a deadlock - we would be holding a quota
1065  * lock, and the commit would never complete if another thread had a
1066  * transaction open and was blocking on the quota lock - a ranking
1067  * violation.
1068  *
1069  * So what we do is to rely on the fact that journal_stop/journal_start
1070  * will _not_ run commit under these circumstances because handle->h_ref
1071  * is elevated.  We'll still have enough credits for the tiny quotafile
1072  * write.
1073  */
1074
1075 static int do_journal_get_write_access(handle_t *handle,
1076                                        struct buffer_head *bh)
1077 {
1078         if (!buffer_mapped(bh) || buffer_freed(bh))
1079                 return 0;
1080         return ext3_journal_get_write_access(handle, bh);
1081 }
1082
1083 static int ext3_prepare_write(struct file *file, struct page *page,
1084                               unsigned from, unsigned to)
1085 {
1086         struct inode *inode = page->mapping->host;
1087         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1088         handle_t *handle;
1089
1090         handle = ext3_journal_start(inode, needed_blocks);
1091         if (IS_ERR(handle)) {
1092                 ret = PTR_ERR(handle);
1093                 goto out;
1094         }
1095         ret = block_prepare_write(page, from, to, ext3_get_block);
1096         if (ret != 0)
1097                 goto prepare_write_failed;
1098
1099         if (ext3_should_journal_data(inode)) {
1100                 ret = walk_page_buffers(handle, page_buffers(page),
1101                                 from, to, NULL, do_journal_get_write_access);
1102         }
1103 prepare_write_failed:
1104         if (ret)
1105                 ext3_journal_stop(handle);
1106 out:
1107         return ret;
1108 }
1109
1110 static int
1111 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1112 {
1113         int err = journal_dirty_data(handle, bh);
1114         if (err)
1115                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1116                                                 bh, handle,err);
1117         return err;
1118 }
1119
1120 /* For commit_write() in data=journal mode */
1121 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1122 {
1123         if (!buffer_mapped(bh) || buffer_freed(bh))
1124                 return 0;
1125         set_buffer_uptodate(bh);
1126         return ext3_journal_dirty_metadata(handle, bh);
1127 }
1128
1129 /*
1130  * We need to pick up the new inode size which generic_commit_write gave us
1131  * `file' can be NULL - eg, when called from page_symlink().
1132  *
1133  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1134  * buffers are managed internally.
1135  */
1136
1137 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1138                              unsigned from, unsigned to)
1139 {
1140         handle_t *handle = ext3_journal_current_handle();
1141         struct inode *inode = page->mapping->host;
1142         int ret = 0, ret2;
1143
1144         ret = walk_page_buffers(handle, page_buffers(page),
1145                 from, to, NULL, ext3_journal_dirty_data);
1146
1147         if (ret == 0) {
1148                 /*
1149                  * generic_commit_write() will run mark_inode_dirty() if i_size
1150                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1151                  * into that.
1152                  */
1153                 loff_t new_i_size;
1154
1155                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1156                 if (new_i_size > EXT3_I(inode)->i_disksize)
1157                         EXT3_I(inode)->i_disksize = new_i_size;
1158                 ret = generic_commit_write(file, page, from, to);
1159         }
1160         ret2 = ext3_journal_stop(handle);
1161         if (!ret)
1162                 ret = ret2;
1163         return ret;
1164 }
1165
1166 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1167                              unsigned from, unsigned to)
1168 {
1169         handle_t *handle = ext3_journal_current_handle();
1170         struct inode *inode = page->mapping->host;
1171         int ret = 0, ret2;
1172         loff_t new_i_size;
1173
1174         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1175         if (new_i_size > EXT3_I(inode)->i_disksize)
1176                 EXT3_I(inode)->i_disksize = new_i_size;
1177         ret = generic_commit_write(file, page, from, to);
1178         ret2 = ext3_journal_stop(handle);
1179         if (!ret)
1180                 ret = ret2;
1181         return ret;
1182 }
1183
1184 static int ext3_journalled_commit_write(struct file *file,
1185                         struct page *page, unsigned from, unsigned to)
1186 {
1187         handle_t *handle = ext3_journal_current_handle();
1188         struct inode *inode = page->mapping->host;
1189         int ret = 0, ret2;
1190         int partial = 0;
1191         loff_t pos;
1192
1193         /*
1194          * Here we duplicate the generic_commit_write() functionality
1195          */
1196         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1197
1198         ret = walk_page_buffers(handle, page_buffers(page), from,
1199                                 to, &partial, commit_write_fn);
1200         if (!partial)
1201                 SetPageUptodate(page);
1202         if (pos > inode->i_size)
1203                 i_size_write(inode, pos);
1204         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1205         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1206                 EXT3_I(inode)->i_disksize = inode->i_size;
1207                 ret2 = ext3_mark_inode_dirty(handle, inode);
1208                 if (!ret)
1209                         ret = ret2;
1210         }
1211         ret2 = ext3_journal_stop(handle);
1212         if (!ret)
1213                 ret = ret2;
1214         return ret;
1215 }
1216
1217 /*
1218  * bmap() is special.  It gets used by applications such as lilo and by
1219  * the swapper to find the on-disk block of a specific piece of data.
1220  *
1221  * Naturally, this is dangerous if the block concerned is still in the
1222  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1223  * filesystem and enables swap, then they may get a nasty shock when the
1224  * data getting swapped to that swapfile suddenly gets overwritten by
1225  * the original zero's written out previously to the journal and
1226  * awaiting writeback in the kernel's buffer cache.
1227  *
1228  * So, if we see any bmap calls here on a modified, data-journaled file,
1229  * take extra steps to flush any blocks which might be in the cache.
1230  */
1231 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1232 {
1233         struct inode *inode = mapping->host;
1234         journal_t *journal;
1235         int err;
1236
1237         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1238                 /*
1239                  * This is a REALLY heavyweight approach, but the use of
1240                  * bmap on dirty files is expected to be extremely rare:
1241                  * only if we run lilo or swapon on a freshly made file
1242                  * do we expect this to happen.
1243                  *
1244                  * (bmap requires CAP_SYS_RAWIO so this does not
1245                  * represent an unprivileged user DOS attack --- we'd be
1246                  * in trouble if mortal users could trigger this path at
1247                  * will.)
1248                  *
1249                  * NB. EXT3_STATE_JDATA is not set on files other than
1250                  * regular files.  If somebody wants to bmap a directory
1251                  * or symlink and gets confused because the buffer
1252                  * hasn't yet been flushed to disk, they deserve
1253                  * everything they get.
1254                  */
1255
1256                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1257                 journal = EXT3_JOURNAL(inode);
1258                 journal_lock_updates(journal);
1259                 err = journal_flush(journal);
1260                 journal_unlock_updates(journal);
1261
1262                 if (err)
1263                         return 0;
1264         }
1265
1266         return generic_block_bmap(mapping,block,ext3_get_block);
1267 }
1268
/*
 * walk_page_buffers() callback: take an extra reference on `bh'.
 * `handle' is not used.  Always returns 0.
 */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
        get_bh(bh);
        return 0;
}
1274
/*
 * walk_page_buffers() callback: drop one reference on `bh' (the
 * counterpart of bget_one()).  `handle' is not used.  Always returns 0.
 */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
        put_bh(bh);
        return 0;
}
1280
1281 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1282 {
1283         if (buffer_mapped(bh))
1284                 return ext3_journal_dirty_data(handle, bh);
1285         return 0;
1286 }
1287
1288 /*
1289  * Note that we always start a transaction even if we're not journalling
1290  * data.  This is to preserve ordering: any hole instantiation within
1291  * __block_write_full_page -> ext3_get_block() should be journalled
1292  * along with the data so we don't crash and then get metadata which
1293  * refers to old data.
1294  *
1295  * In all journalling modes block_write_full_page() will start the I/O.
1296  *
1297  * Problem:
1298  *
1299  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1300  *              ext3_writepage()
1301  *
1302  * Similar for:
1303  *
1304  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1305  *
1306  * Same applies to ext3_get_block().  We will deadlock on various things like
1307  * lock_journal and i_truncate_sem.
1308  *
1309  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1310  * allocations fail.
1311  *
1312  * 16May01: If we're reentered then journal_current_handle() will be
1313  *          non-zero. We simply *return*.
1314  *
1315  * 1 July 2001: @@@ FIXME:
1316  *   In journalled data mode, a data buffer may be metadata against the
1317  *   current transaction.  But the same file is part of a shared mapping
1318  *   and someone does a writepage() on it.
1319  *
1320  *   We will move the buffer onto the async_data list, but *after* it has
1321  *   been dirtied. So there's a small window where we have dirty data on
1322  *   BJ_Metadata.
1323  *
1324  *   Note that this only applies to the last partial page in the file.  The
1325  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1326  *   broken code anyway: it's wrong for msync()).
1327  *
1328  *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
1331  *   We'll probably need that anyway for journalling writepage() output.
1332  *
1333  * We don't honour synchronous mounts for writepage().  That would be
1334  * disastrous.  Any write() or metadata operation will sync the fs for
1335  * us.
1336  *
1337  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1338  * we don't need to open a transaction here.
1339  */
1340 static int ext3_ordered_writepage(struct page *page,
1341                         struct writeback_control *wbc)
1342 {
1343         struct inode *inode = page->mapping->host;
1344         struct buffer_head *page_bufs;
1345         handle_t *handle = NULL;
1346         int ret = 0;
1347         int err;
1348
1349         J_ASSERT(PageLocked(page));
1350
1351         /*
1352          * We give up here if we're reentered, because it might be for a
1353          * different filesystem.
1354          */
1355         if (ext3_journal_current_handle())
1356                 goto out_fail;
1357
1358         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1359
1360         if (IS_ERR(handle)) {
1361                 ret = PTR_ERR(handle);
1362                 goto out_fail;
1363         }
1364
1365         if (!page_has_buffers(page)) {
1366                 if (!PageUptodate(page))
1367                         buffer_error();
1368                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1369                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1370         }
1371         page_bufs = page_buffers(page);
1372         walk_page_buffers(handle, page_bufs, 0,
1373                         PAGE_CACHE_SIZE, NULL, bget_one);
1374
1375         ret = block_write_full_page(page, ext3_get_block, wbc);
1376
1377         /*
1378          * The page can become unlocked at any point now, and
1379          * truncate can then come in and change things.  So we
1380          * can't touch *page from now on.  But *page_bufs is
1381          * safe due to elevated refcount.
1382          */
1383
1384         /*
1385          * And attach them to the current transaction.  But only if
1386          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1387          * and generally junk.
1388          */
1389         if (ret == 0) {
1390                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1391                                         NULL, journal_dirty_data_fn);
1392                 if (!ret)
1393                         ret = err;
1394         }
1395         walk_page_buffers(handle, page_bufs, 0,
1396                         PAGE_CACHE_SIZE, NULL, bput_one);
1397         err = ext3_journal_stop(handle);
1398         if (!ret)
1399                 ret = err;
1400         return ret;
1401
1402 out_fail:
1403         __set_page_dirty_nobuffers(page);
1404         unlock_page(page);
1405         return ret;
1406 }
1407
1408 static int ext3_writeback_writepage(struct page *page,
1409                                 struct writeback_control *wbc)
1410 {
1411         struct inode *inode = page->mapping->host;
1412         handle_t *handle = NULL;
1413         int ret = 0;
1414         int err;
1415
1416         if (ext3_journal_current_handle())
1417                 goto out_fail;
1418
1419         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1420         if (IS_ERR(handle)) {
1421                 ret = PTR_ERR(handle);
1422                 goto out_fail;
1423         }
1424
1425         ret = block_write_full_page(page, ext3_get_block, wbc);
1426         err = ext3_journal_stop(handle);
1427         if (!ret)
1428                 ret = err;
1429         return ret;
1430
1431 out_fail:
1432         __set_page_dirty_nobuffers(page);
1433         unlock_page(page);
1434         return ret;
1435 }
1436
1437 static int ext3_journalled_writepage(struct page *page,
1438                                 struct writeback_control *wbc)
1439 {
1440         struct inode *inode = page->mapping->host;
1441         handle_t *handle = NULL;
1442         int ret = 0;
1443         int err;
1444
1445         if (ext3_journal_current_handle())
1446                 goto no_write;
1447
1448         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1449         if (IS_ERR(handle)) {
1450                 ret = PTR_ERR(handle);
1451                 goto no_write;
1452         }
1453
1454         if (!page_has_buffers(page) || PageChecked(page)) {
1455                 /*
1456                  * It's mmapped pagecache.  Add buffers and journal it.  There
1457                  * doesn't seem much point in redirtying the page here.
1458                  */
1459                 ClearPageChecked(page);
1460                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1461                                         ext3_get_block);
1462                 if (ret != 0)
1463                         goto out_unlock;
1464                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1465                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1466
1467                 err = walk_page_buffers(handle, page_buffers(page), 0,
1468                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1469                 if (ret == 0)
1470                         ret = err;
1471                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1472                 unlock_page(page);
1473         } else {
1474                 /*
1475                  * It may be a page full of checkpoint-mode buffers.  We don't
1476                  * really know unless we go poke around in the buffer_heads.
1477                  * But block_write_full_page will do the right thing.
1478                  */
1479                 ret = block_write_full_page(page, ext3_get_block, wbc);
1480         }
1481         err = ext3_journal_stop(handle);
1482         if (!ret)
1483                 ret = err;
1484 out:
1485         return ret;
1486
1487 no_write:
1488         __set_page_dirty_nobuffers(page);
1489 out_unlock:
1490         unlock_page(page);
1491         goto out;
1492 }
1493
/*
 * readpage: delegate to mpage_readpage() with ext3_get_block as the
 * block-mapping callback.  `file' is not used.
 */
static int ext3_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, ext3_get_block);
}
1498
/*
 * readpages: delegate to mpage_readpages() with ext3_get_block as the
 * block-mapping callback.  `file' is not used.
 */
static int
ext3_readpages(struct file *file, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
1505
1506 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1507 {
1508         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1509
1510         /*
1511          * If it's a full truncate we just forget about the pending dirtying
1512          */
1513         if (offset == 0)
1514                 ClearPageChecked(page);
1515
1516         return journal_invalidatepage(journal, page, offset);
1517 }
1518
/*
 * releasepage: ask the journal of the owning inode to free the page's
 * buffers via journal_try_to_free_buffers().  Warns if the page is still
 * flagged PageChecked.
 */
static int ext3_releasepage(struct page *page, int wait)
{
        journal_t *journal = EXT3_JOURNAL(page->mapping->host);

        WARN_ON(PageChecked(page));
        return journal_try_to_free_buffers(journal, page, wait);
}
1526
1527 /*
1528  * If the O_DIRECT write will extend the file then add this inode to the
1529  * orphan list.  So recovery will truncate it back to the original size
1530  * if the machine crashes during the write.
1531  *
1532  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1533  * crashes then stale disk data _may_ be exposed inside the file.
1534  */
1535 static int ext3_direct_IO(int rw, struct kiocb *iocb,
1536                         const struct iovec *iov, loff_t offset,
1537                         unsigned long nr_segs)
1538 {
1539         struct file *file = iocb->ki_filp;
1540         struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
1541         struct ext3_inode_info *ei = EXT3_I(inode);
1542         handle_t *handle = NULL;
1543         int ret;
1544         int orphan = 0;
1545         size_t count = iov_length(iov, nr_segs);
1546
1547         if (rw == WRITE) {
1548                 loff_t final_size = offset + count;
1549
1550                 handle = ext3_journal_start(inode, DIO_CREDITS);
1551                 if (IS_ERR(handle)) {
1552                         ret = PTR_ERR(handle);
1553                         goto out;
1554                 }
1555                 if (final_size > inode->i_size) {
1556                         ret = ext3_orphan_add(handle, inode);
1557                         if (ret)
1558                                 goto out_stop;
1559                         orphan = 1;
1560                         ei->i_disksize = inode->i_size;
1561                 }
1562         }
1563
1564         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1565                                 offset, nr_segs, ext3_direct_io_get_blocks);
1566
1567 out_stop:
1568         if (handle) {
1569                 int err;
1570
1571                 if (orphan)
1572                         ext3_orphan_del(handle, inode);
1573                 if (orphan && ret > 0) {
1574                         loff_t end = offset + ret;
1575                         if (end > inode->i_size) {
1576                                 ei->i_disksize = end;
1577                                 i_size_write(inode, end);
1578                                 err = ext3_mark_inode_dirty(handle, inode);
1579                                 if (!ret)
1580                                         ret = err;
1581                         }
1582                 }
1583                 err = ext3_journal_stop(handle);
1584                 if (ret == 0)
1585                         ret = err;
1586         }
1587 out:
1588         return ret;
1589 }
1590
/*
 * ->set_page_dirty for the journalled-data address space.
 *
 * Pages can be dirtied completely asynchronously from ext3's journalling
 * activity (filemap_sync_pte(), try_to_unmap_one(), etc.).  Little can be
 * done at that point: ->set_page_dirty is called under VFS locks and the
 * page is not necessarily locked.
 *
 * Dirtying the page while leaving its buffers clean is not an option,
 * because the buffers' dirty state is "definitive".  Marking the buffers
 * dirty or jbddirty is not an option either, because all the journalling
 * code would explode.
 *
 * So the page is only flagged "pending dirty" (PageChecked); the next
 * writepage call propagates that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
        int rc;

        SetPageChecked(page);
        rc = __set_page_dirty_nobuffers(page);

        return rc;
}
1609
1610 static struct address_space_operations ext3_ordered_aops = {
1611         .readpage       = ext3_readpage,
1612         .readpages      = ext3_readpages,
1613         .writepage      = ext3_ordered_writepage,
1614         .sync_page      = block_sync_page,
1615         .prepare_write  = ext3_prepare_write,
1616         .commit_write   = ext3_ordered_commit_write,
1617         .bmap           = ext3_bmap,
1618         .invalidatepage = ext3_invalidatepage,
1619         .releasepage    = ext3_releasepage,
1620         .direct_IO      = ext3_direct_IO,
1621 };
1622
1623 static struct address_space_operations ext3_writeback_aops = {
1624         .readpage       = ext3_readpage,
1625         .readpages      = ext3_readpages,
1626         .writepage      = ext3_writeback_writepage,
1627         .sync_page      = block_sync_page,
1628         .prepare_write  = ext3_prepare_write,
1629         .commit_write   = ext3_writeback_commit_write,
1630         .bmap           = ext3_bmap,
1631         .invalidatepage = ext3_invalidatepage,
1632         .releasepage    = ext3_releasepage,
1633         .direct_IO      = ext3_direct_IO,
1634 };
1635
1636 static struct address_space_operations ext3_journalled_aops = {
1637         .readpage       = ext3_readpage,
1638         .readpages      = ext3_readpages,
1639         .writepage      = ext3_journalled_writepage,
1640         .sync_page      = block_sync_page,
1641         .prepare_write  = ext3_prepare_write,
1642         .commit_write   = ext3_journalled_commit_write,
1643         .set_page_dirty = ext3_journalled_set_page_dirty,
1644         .bmap           = ext3_bmap,
1645         .invalidatepage = ext3_invalidatepage,
1646         .releasepage    = ext3_releasepage,
1647 };
1648
1649 void ext3_set_aops(struct inode *inode)
1650 {
1651         if (ext3_should_order_data(inode))
1652                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1653         else if (ext3_should_writeback_data(inode))
1654                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1655         else
1656                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1657 }
1658
1659 /*
1660  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1661  * up to the end of the block which corresponds to `from'.
1662  * This required during truncate. We need to physically zero the tail end
1663  * of that block so it doesn't yield old data if the file is later grown.
1664  */
1665 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1666                 struct address_space *mapping, loff_t from)
1667 {
1668         unsigned long index = from >> PAGE_CACHE_SHIFT;
1669         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1670         unsigned blocksize, iblock, length, pos;
1671         struct inode *inode = mapping->host;
1672         struct buffer_head *bh;
1673         int err;
1674         void *kaddr;
1675
1676         blocksize = inode->i_sb->s_blocksize;
1677         length = blocksize - (offset & (blocksize - 1));
1678         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1679
1680         if (!page_has_buffers(page))
1681                 create_empty_buffers(page, blocksize, 0);
1682
1683         /* Find the buffer that contains "offset" */
1684         bh = page_buffers(page);
1685         pos = blocksize;
1686         while (offset >= pos) {
1687                 bh = bh->b_this_page;
1688                 iblock++;
1689                 pos += blocksize;
1690         }
1691
1692         err = 0;
1693         if (buffer_freed(bh)) {
1694                 BUFFER_TRACE(bh, "freed: skip");
1695                 goto unlock;
1696         }
1697
1698         if (!buffer_mapped(bh)) {
1699                 BUFFER_TRACE(bh, "unmapped");
1700                 ext3_get_block(inode, iblock, bh, 0);
1701                 /* unmapped? It's a hole - nothing to do */
1702                 if (!buffer_mapped(bh)) {
1703                         BUFFER_TRACE(bh, "still unmapped");
1704                         goto unlock;
1705                 }
1706         }
1707
1708         /* Ok, it's mapped. Make sure it's up-to-date */
1709         if (PageUptodate(page))
1710                 set_buffer_uptodate(bh);
1711
1712         if (!buffer_uptodate(bh)) {
1713                 err = -EIO;
1714                 ll_rw_block(READ, 1, &bh);
1715                 wait_on_buffer(bh);
1716                 /* Uhhuh. Read error. Complain and punt. */
1717                 if (!buffer_uptodate(bh))
1718                         goto unlock;
1719         }
1720
1721         if (ext3_should_journal_data(inode)) {
1722                 BUFFER_TRACE(bh, "get write access");
1723                 err = ext3_journal_get_write_access(handle, bh);
1724                 if (err)
1725                         goto unlock;
1726         }
1727
1728         kaddr = kmap_atomic(page, KM_USER0);
1729         memset(kaddr + offset, 0, length);
1730         flush_dcache_page(page);
1731         kunmap_atomic(kaddr, KM_USER0);
1732
1733         BUFFER_TRACE(bh, "zeroed end of block");
1734
1735         err = 0;
1736         if (ext3_should_journal_data(inode)) {
1737                 err = ext3_journal_dirty_metadata(handle, bh);
1738         } else {
1739                 if (ext3_should_order_data(inode))
1740                         err = ext3_journal_dirty_data(handle, bh);
1741                 mark_buffer_dirty(bh);
1742         }
1743
1744 unlock:
1745         unlock_page(page);
1746         page_cache_release(page);
1747         return err;
1748 }
1749
/*
 * Return 1 if every 32-bit word in the half-open range [p, q) is zero
 * (an empty range counts as all-zero), 0 as soon as a non-zero word is
 * found.
 *
 * Probably it should be a library function... search for first non-zero
 * word or memcmp with zero_page, whatever is better for particular
 * architecture.  Linus?
 */
static inline int all_zeroes(u32 *p, u32 *q)
{
        u32 *cur;

        for (cur = p; cur < q; cur++) {
                if (*cur != 0)
                        return 0;
        }
        return 1;
}
1762
1763 /**
1764  *      ext3_find_shared - find the indirect blocks for partial truncation.
1765  *      @inode:   inode in question
1766  *      @depth:   depth of the affected branch
1767  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1768  *      @chain:   place to store the pointers to partial indirect blocks
1769  *      @top:     place to the (detached) top of branch
1770  *
1771  *      This is a helper function used by ext3_truncate().
1772  *
1773  *      When we do truncate() we may have to clean the ends of several
1774  *      indirect blocks but leave the blocks themselves alive. Block is
1775  *      partially truncated if some data below the new i_size is refered
1776  *      from it (and it is on the path to the first completely truncated
1777  *      data block, indeed).  We have to free the top of that path along
1778  *      with everything to the right of the path. Since no allocation
1779  *      past the truncation point is possible until ext3_truncate()
1780  *      finishes, we may safely do the latter, but top of branch may
1781  *      require special attention - pageout below the truncation point
1782  *      might try to populate it.
1783  *
1784  *      We atomically detach the top of branch from the tree, store the
1785  *      block number of its root in *@top, pointers to buffer_heads of
1786  *      partially truncated blocks - in @chain[].bh and pointers to
1787  *      their last elements that should not be removed - in
1788  *      @chain[].p. Return value is the pointer to last filled element
1789  *      of @chain.
1790  *
1791  *      The work left to caller to do the actual freeing of subtrees:
1792  *              a) free the subtree starting from *@top
1793  *              b) free the subtrees whose roots are stored in
1794  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1795  *              c) free the subtrees growing from the inode past the @chain[0].
1796  *                      (no partially truncated stuff there).  */
1797
1798 static Indirect *ext3_find_shared(struct inode *inode,
1799                                 int depth,
1800                                 int offsets[4],
1801                                 Indirect chain[4],
1802                                 u32 *top)
1803 {
1804         Indirect *partial, *p;
1805         int k, err;
1806
1807         *top = 0;
1808         /* Make k index the deepest non-null offest + 1 */
1809         for (k = depth; k > 1 && !offsets[k-1]; k--)
1810                 ;
1811         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1812         /* Writer: pointers */
1813         if (!partial)
1814                 partial = chain + k-1;
1815         /*
1816          * If the branch acquired continuation since we've looked at it -
1817          * fine, it should all survive and (new) top doesn't belong to us.
1818          */
1819         if (!partial->key && *partial->p)
1820                 /* Writer: end */
1821                 goto no_top;
1822         for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1823                 ;
1824         /*
1825          * OK, we've found the last block that must survive. The rest of our
1826          * branch should be detached before unlocking. However, if that rest
1827          * of branch is all ours and does not grow immediately from the inode
1828          * it's easier to cheat and just decrement partial->p.
1829          */
1830         if (p == chain + k - 1 && p > chain) {
1831                 p->p--;
1832         } else {
1833                 *top = *p->p;
1834                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1835 #if 0
1836                 *p->p = 0;
1837 #endif
1838         }
1839         /* Writer: end */
1840
1841         while(partial > p)
1842         {
1843                 brelse(partial->bh);
1844                 partial--;
1845         }
1846 no_top:
1847         return partial;
1848 }
1849
1850 /*
1851  * Zero a number of block pointers in either an inode or an indirect block.
1852  * If we restart the transaction we must again get write access to the
1853  * indirect block for further modification.
1854  *
1855  * We release `count' blocks on disk, but (last - first) may be greater
1856  * than `count' because there can be holes in there.
1857  */
1858 static void
1859 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1860                 unsigned long block_to_free, unsigned long count,
1861                 u32 *first, u32 *last)
1862 {
1863         u32 *p;
1864         if (try_to_extend_transaction(handle, inode)) {
1865                 if (bh) {
1866                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1867                         ext3_journal_dirty_metadata(handle, bh);
1868                 }
1869                 ext3_mark_inode_dirty(handle, inode);
1870                 ext3_journal_test_restart(handle, inode);
1871                 if (bh) {
1872                         BUFFER_TRACE(bh, "retaking write access");
1873                         ext3_journal_get_write_access(handle, bh);
1874                 }
1875         }
1876
1877         /*
1878          * Any buffers which are on the journal will be in memory. We find
1879          * them on the hash table so journal_revoke() will run journal_forget()
1880          * on them.  We've already detached each block from the file, so
1881          * bforget() in journal_forget() should be safe.
1882          *
1883          * AKPM: turn on bforget in journal_forget()!!!
1884          */
1885         for (p = first; p < last; p++) {
1886                 u32 nr = le32_to_cpu(*p);
1887                 if (nr) {
1888                         struct buffer_head *bh;
1889
1890                         *p = 0;
1891                         bh = sb_find_get_block(inode->i_sb, nr);
1892                         ext3_forget(handle, 0, inode, bh, nr);
1893                 }
1894         }
1895
1896         ext3_free_blocks(handle, inode, block_to_free, count);
1897 }
1898
1899 /**
1900  * ext3_free_data - free a list of data blocks
1901  * @handle:     handle for this transaction
1902  * @inode:      inode we are dealing with
1903  * @this_bh:    indirect buffer_head which contains *@first and *@last
1904  * @first:      array of block numbers
1905  * @last:       points immediately past the end of array
1906  *
1907  * We are freeing all blocks refered from that array (numbers are stored as
1908  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1909  *
1910  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1911  * blocks are contiguous then releasing them at one time will only affect one
1912  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1913  * actually use a lot of journal space.
1914  *
1915  * @this_bh will be %NULL if @first and @last point into the inode's direct
1916  * block pointers.
1917  */
1918 static void ext3_free_data(handle_t *handle, struct inode *inode,
1919                            struct buffer_head *this_bh, u32 *first, u32 *last)
1920 {
1921         unsigned long block_to_free = 0;    /* Starting block # of a run */
1922         unsigned long count = 0;            /* Number of blocks in the run */
1923         u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
1924                                                corresponding to
1925                                                block_to_free */
1926         unsigned long nr;                   /* Current block # */
1927         u32 *p;                             /* Pointer into inode/ind
1928                                                for current block */
1929         int err;
1930
1931         if (this_bh) {                          /* For indirect block */
1932                 BUFFER_TRACE(this_bh, "get_write_access");
1933                 err = ext3_journal_get_write_access(handle, this_bh);
1934                 /* Important: if we can't update the indirect pointers
1935                  * to the blocks, we can't free them. */
1936                 if (err)
1937                         return;
1938         }
1939
1940         for (p = first; p < last; p++) {
1941                 nr = le32_to_cpu(*p);
1942                 if (nr) {
1943                         /* accumulate blocks to free if they're contiguous */
1944                         if (count == 0) {
1945                                 block_to_free = nr;
1946                                 block_to_free_p = p;
1947                                 count = 1;
1948                         } else if (nr == block_to_free + count) {
1949                                 count++;
1950                         } else {
1951                                 ext3_clear_blocks(handle, inode, this_bh,
1952                                                   block_to_free,
1953                                                   count, block_to_free_p, p);
1954                                 block_to_free = nr;
1955                                 block_to_free_p = p;
1956                                 count = 1;
1957                         }
1958                 }
1959         }
1960
1961         if (count > 0)
1962                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1963                                   count, block_to_free_p, p);
1964
1965         if (this_bh) {
1966                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1967                 ext3_journal_dirty_metadata(handle, this_bh);
1968         }
1969 }
1970
1971 /**
1972  *      ext3_free_branches - free an array of branches
1973  *      @handle: JBD handle for this transaction
1974  *      @inode: inode we are dealing with
1975  *      @parent_bh: the buffer_head which contains *@first and *@last
1976  *      @first: array of block numbers
1977  *      @last:  pointer immediately past the end of array
1978  *      @depth: depth of the branches to free
1979  *
1980  *      We are freeing all blocks refered from these branches (numbers are
1981  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1982  *      appropriately.
1983  */
1984 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1985                                struct buffer_head *parent_bh,
1986                                u32 *first, u32 *last, int depth)
1987 {
1988         unsigned long nr;
1989         u32 *p;
1990
1991         if (is_handle_aborted(handle))
1992                 return;
1993
1994         if (depth--) {
1995                 struct buffer_head *bh;
1996                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1997                 p = last;
1998                 while (--p >= first) {
1999                         nr = le32_to_cpu(*p);
2000                         if (!nr)
2001                                 continue;               /* A hole */
2002
2003                         /* Go read the buffer for the next level down */
2004                         bh = sb_bread(inode->i_sb, nr);
2005
2006                         /*
2007                          * A read failure? Report error and clear slot
2008                          * (should be rare).
2009                          */
2010                         if (!bh) {
2011                                 ext3_error(inode->i_sb, "ext3_free_branches",
2012                                            "Read failure, inode=%ld, block=%ld",
2013                                            inode->i_ino, nr);
2014                                 continue;
2015                         }
2016
2017                         /* This zaps the entire block.  Bottom up. */
2018                         BUFFER_TRACE(bh, "free child branches");
2019                         ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
2020                                            (u32*)bh->b_data + addr_per_block,
2021                                            depth);
2022
2023                         /*
2024                          * We've probably journalled the indirect block several
2025                          * times during the truncate.  But it's no longer
2026                          * needed and we now drop it from the transaction via
2027                          * journal_revoke().
2028                          *
2029                          * That's easy if it's exclusively part of this
2030                          * transaction.  But if it's part of the committing
2031                          * transaction then journal_forget() will simply
2032                          * brelse() it.  That means that if the underlying
2033                          * block is reallocated in ext3_get_block(),
2034                          * unmap_underlying_metadata() will find this block
2035                          * and will try to get rid of it.  damn, damn.
2036                          *
2037                          * If this block has already been committed to the
2038                          * journal, a revoke record will be written.  And
2039                          * revoke records must be emitted *before* clearing
2040                          * this block's bit in the bitmaps.
2041                          */
2042                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2043
2044                         /*
2045                          * Everything below this this pointer has been
2046                          * released.  Now let this top-of-subtree go.
2047                          *
2048                          * We want the freeing of this indirect block to be
2049                          * atomic in the journal with the updating of the
2050                          * bitmap block which owns it.  So make some room in
2051                          * the journal.
2052                          *
2053                          * We zero the parent pointer *after* freeing its
2054                          * pointee in the bitmaps, so if extend_transaction()
2055                          * for some reason fails to put the bitmap changes and
2056                          * the release into the same transaction, recovery
2057                          * will merely complain about releasing a free block,
2058                          * rather than leaking blocks.
2059                          */
2060                         if (is_handle_aborted(handle))
2061                                 return;
2062                         if (try_to_extend_transaction(handle, inode)) {
2063                                 ext3_mark_inode_dirty(handle, inode);
2064                                 ext3_journal_test_restart(handle, inode);
2065                         }
2066
2067                         ext3_free_blocks(handle, inode, nr, 1);
2068
2069                         if (parent_bh) {
2070                                 /*
2071                                  * The block which we have just freed is
2072                                  * pointed to by an indirect block: journal it
2073                                  */
2074                                 BUFFER_TRACE(parent_bh, "get_write_access");
2075                                 if (!ext3_journal_get_write_access(handle,
2076                                                                    parent_bh)){
2077                                         *p = 0;
2078                                         BUFFER_TRACE(parent_bh,
2079                                         "call ext3_journal_dirty_metadata");
2080                                         ext3_journal_dirty_metadata(handle,
2081                                                                     parent_bh);
2082                                 }
2083                         }
2084                 }
2085         } else {
2086                 /* We have reached the bottom of the tree. */
2087                 BUFFER_TRACE(parent_bh, "free data blocks");
2088                 ext3_free_data(handle, inode, parent_bh, first, last);
2089         }
2090 }
2091
2092 /*
2093  * ext3_truncate()
2094  *
2095  * We block out ext3_get_block() block instantiations across the entire
2096  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2097  * simultaneously on behalf of the same inode.
2098  *
2099  * As we work through the truncate and commit bits of it to the journal there
2100  * is one core, guiding principle: the file's tree must always be consistent on
2101  * disk.  We must be able to restart the truncate after a crash.
2102  *
2103  * The file's tree may be transiently inconsistent in memory (although it
2104  * probably isn't), but whenever we close off and commit a journal transaction,
2105  * the contents of (the filesystem + the journal) must be consistent and
2106  * restartable.  It's pretty simple, really: bottom up, right to left (although
2107  * left-to-right works OK too).
2108  *
2109  * Note that at recovery time, journal replay occurs *before* the restart of
2110  * truncate against the orphan inode list.
2111  *
2112  * The committed inode has the new, desired i_size (which is the same as
2113  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2114  * that this inode's truncate did not complete and it will again call
2115  * ext3_truncate() to have another go.  So there will be instantiated blocks
2116  * to the right of the truncation point in a crashed ext3 filesystem.  But
2117  * that's fine - as long as they are linked from the inode, the post-crash
2118  * ext3_truncate() run will find them and release them.
2119  */
2120
2121 void ext3_truncate(struct inode * inode)
2122 {
2123         handle_t *handle;
2124         struct ext3_inode_info *ei = EXT3_I(inode);
2125         u32 *i_data = ei->i_data;
2126         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2127         struct address_space *mapping = inode->i_mapping;
2128         int offsets[4];
2129         Indirect chain[4];
2130         Indirect *partial;
2131         int nr = 0;
2132         int n;
2133         long last_block;
2134         unsigned blocksize = inode->i_sb->s_blocksize;
2135         struct page *page;
2136
2137         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2138             S_ISLNK(inode->i_mode)))
2139                 return;
2140         if (ext3_inode_is_fast_symlink(inode))
2141                 return;
2142         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2143                 return;
2144
2145         ext3_discard_prealloc(inode);
2146
2147         /*
2148          * We have to lock the EOF page here, because lock_page() nests
2149          * outside journal_start().
2150          */
2151         if ((inode->i_size & (blocksize - 1)) == 0) {
2152                 /* Block boundary? Nothing to do */
2153                 page = NULL;
2154         } else {
2155                 page = grab_cache_page(mapping,
2156                                 inode->i_size >> PAGE_CACHE_SHIFT);
2157                 if (!page)
2158                         return;
2159         }
2160
2161         handle = start_transaction(inode);
2162         if (IS_ERR(handle)) {
2163                 if (page) {
2164                         clear_highpage(page);
2165                         flush_dcache_page(page);
2166                         unlock_page(page);
2167                         page_cache_release(page);
2168                 }
2169                 return;         /* AKPM: return what? */
2170         }
2171
2172         last_block = (inode->i_size + blocksize-1)
2173                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2174
2175         if (page)
2176                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2177
2178         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2179         if (n == 0)
2180                 goto out_stop;  /* error */
2181
2182         /*
2183          * OK.  This truncate is going to happen.  We add the inode to the
2184          * orphan list, so that if this truncate spans multiple transactions,
2185          * and we crash, we will resume the truncate when the filesystem
2186          * recovers.  It also marks the inode dirty, to catch the new size.
2187          *
2188          * Implication: the file must always be in a sane, consistent
2189          * truncatable state while each transaction commits.
2190          */
2191         if (ext3_orphan_add(handle, inode))
2192                 goto out_stop;
2193
2194         /*
2195          * The orphan list entry will now protect us from any crash which
2196          * occurs before the truncate completes, so it is now safe to propagate
2197          * the new, shorter inode size (held for now in i_size) into the
2198          * on-disk inode. We do this via i_disksize, which is the value which
2199          * ext3 *really* writes onto the disk inode.
2200          */
2201         ei->i_disksize = inode->i_size;
2202
2203         /*
2204          * From here we block out all ext3_get_block() callers who want to
2205          * modify the block allocation tree.
2206          */
2207         down_write(&ei->truncate_sem);
2208
2209         if (n == 1) {           /* direct blocks */
2210                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2211                                i_data + EXT3_NDIR_BLOCKS);
2212                 goto do_indirects;
2213         }
2214
2215         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2216         /* Kill the top of shared branch (not detached) */
2217         if (nr) {
2218                 if (partial == chain) {
2219                         /* Shared branch grows from the inode */
2220                         ext3_free_branches(handle, inode, NULL,
2221                                            &nr, &nr+1, (chain+n-1) - partial);
2222                         *partial->p = 0;
2223                         /*
2224                          * We mark the inode dirty prior to restart,
2225                          * and prior to stop.  No need for it here.
2226                          */
2227                 } else {
2228                         /* Shared branch grows from an indirect block */
2229                         BUFFER_TRACE(partial->bh, "get_write_access");
2230                         ext3_free_branches(handle, inode, partial->bh,
2231                                         partial->p,
2232                                         partial->p+1, (chain+n-1) - partial);
2233                 }
2234         }
2235         /* Clear the ends of indirect blocks on the shared branch */
2236         while (partial > chain) {
2237                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2238                                    (u32*)partial->bh->b_data + addr_per_block,
2239                                    (chain+n-1) - partial);
2240                 BUFFER_TRACE(partial->bh, "call brelse");
2241                 brelse (partial->bh);
2242                 partial--;
2243         }
2244 do_indirects:
2245         /* Kill the remaining (whole) subtrees */
2246         switch (offsets[0]) {
2247                 default:
2248                         nr = i_data[EXT3_IND_BLOCK];
2249                         if (nr) {
2250                                 ext3_free_branches(handle, inode, NULL,
2251                                                    &nr, &nr+1, 1);
2252                                 i_data[EXT3_IND_BLOCK] = 0;
2253                         }
2254                 case EXT3_IND_BLOCK:
2255                         nr = i_data[EXT3_DIND_BLOCK];
2256                         if (nr) {
2257                                 ext3_free_branches(handle, inode, NULL,
2258                                                    &nr, &nr+1, 2);
2259                                 i_data[EXT3_DIND_BLOCK] = 0;
2260                         }
2261                 case EXT3_DIND_BLOCK:
2262                         nr = i_data[EXT3_TIND_BLOCK];
2263                         if (nr) {
2264                                 ext3_free_branches(handle, inode, NULL,
2265                                                    &nr, &nr+1, 3);
2266                                 i_data[EXT3_TIND_BLOCK] = 0;
2267                         }
2268                 case EXT3_TIND_BLOCK:
2269                         ;
2270         }
2271         up_write(&ei->truncate_sem);
2272         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2273         ext3_mark_inode_dirty(handle, inode);
2274
2275         /* In a multi-transaction truncate, we only make the final
2276          * transaction synchronous */
2277         if (IS_SYNC(inode))
2278                 handle->h_sync = 1;
2279 out_stop:
2280         /*
2281          * If this was a simple ftruncate(), and the file will remain alive
2282          * then we need to clear up the orphan record which we created above.
2283          * However, if this was a real unlink then we were called by
2284          * ext3_delete_inode(), and we allow that function to clean up the
2285          * orphan info for us.
2286          */
2287         if (inode->i_nlink)
2288                 ext3_orphan_del(handle, inode);
2289
2290         ext3_journal_stop(handle);
2291 }
2292
2293 static unsigned long ext3_get_inode_block(struct super_block *sb,
2294                 unsigned long ino, struct ext3_iloc *iloc)
2295 {
2296         unsigned long desc, group_desc, block_group;
2297         unsigned long offset, block;
2298         struct buffer_head *bh;
2299         struct ext3_group_desc * gdp;
2300
2301         if ((ino != EXT3_ROOT_INO &&
2302                 ino != EXT3_JOURNAL_INO &&
2303                 ino < EXT3_FIRST_INO(sb)) ||
2304                 ino > le32_to_cpu(
2305                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2306                 ext3_error (sb, "ext3_get_inode_block",
2307                             "bad inode number: %lu", ino);
2308                 return 0;
2309         }
2310         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2311         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2312                 ext3_error (sb, "ext3_get_inode_block",
2313                             "group >= groups count");
2314                 return 0;
2315         }
2316         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2317         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2318         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2319         if (!bh) {
2320                 ext3_error (sb, "ext3_get_inode_block",
2321                             "Descriptor not loaded");
2322                 return 0;
2323         }
2324
2325         gdp = (struct ext3_group_desc *) bh->b_data;
2326         /*
2327          * Figure out the offset within the block group inode table
2328          */
2329         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2330                 EXT3_INODE_SIZE(sb);
2331         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2332                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2333
2334         iloc->block_group = block_group;
2335         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2336         return block;
2337 }
2338
2339 /*
2340  * ext3_get_inode_loc returns with an extra refcount against the
2341  * inode's underlying buffer_head on success.
2342  */
2343
2344 int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
2345 {
2346         unsigned long block;
2347
2348         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2349         if (block) {
2350                 struct buffer_head *bh = sb_bread(inode->i_sb, block);
2351                 if (bh) {
2352                         iloc->bh = bh;
2353                         return 0;
2354                 }
2355                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2356                             "unable to read inode block - "
2357                             "inode=%lu, block=%lu", inode->i_ino, block);
2358         }
2359         return -EIO;
2360 }
2361
2362 void ext3_set_inode_flags(struct inode *inode)
2363 {
2364         unsigned int flags = EXT3_I(inode)->i_flags;
2365
2366         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2367         if (flags & EXT3_SYNC_FL)
2368                 inode->i_flags |= S_SYNC;
2369         if (flags & EXT3_APPEND_FL)
2370                 inode->i_flags |= S_APPEND;
2371         if (flags & EXT3_IMMUTABLE_FL)
2372                 inode->i_flags |= S_IMMUTABLE;
2373         if (flags & EXT3_NOATIME_FL)
2374                 inode->i_flags |= S_NOATIME;
2375         if (flags & EXT3_DIRSYNC_FL)
2376                 inode->i_flags |= S_DIRSYNC;
2377 }
2378
2379
2380 void ext3_read_inode(struct inode * inode)
2381 {
2382         struct ext3_iloc iloc;
2383         struct ext3_inode *raw_inode;
2384         struct ext3_inode_info *ei = EXT3_I(inode);
2385         struct buffer_head *bh;
2386         int block;
2387
2388 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2389         ei->i_acl = EXT3_ACL_NOT_CACHED;
2390         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2391 #endif
2392         if (ext3_get_inode_loc(inode, &iloc))
2393                 goto bad_inode;
2394         bh = iloc.bh;
2395         raw_inode = ext3_raw_inode(&iloc);
2396         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2397         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2398         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2399         if(!(test_opt (inode->i_sb, NO_UID32))) {
2400                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2401                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2402         }
2403         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2404         inode->i_size = le32_to_cpu(raw_inode->i_size);
2405         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2406         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2407         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2408         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2409
2410         ei->i_state = 0;
2411         ei->i_next_alloc_block = 0;
2412         ei->i_next_alloc_goal = 0;
2413         ei->i_dir_start_lookup = 0;
2414         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2415         /* We now have enough fields to check if the inode was active or not.
2416          * This is needed because nfsd might try to access dead inodes
2417          * the test is that same one that e2fsck uses
2418          * NeilBrown 1999oct15
2419          */
2420         if (inode->i_nlink == 0) {
2421                 if (inode->i_mode == 0 ||
2422                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2423                         /* this inode is deleted */
2424                         brelse (bh);
2425                         goto bad_inode;
2426                 }
2427                 /* The only unlinked inodes we let through here have
2428                  * valid i_mode and are being read by the orphan
2429                  * recovery code: that's fine, we're about to complete
2430                  * the process of deleting those. */
2431         }
2432         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2433                                          * (for stat), not the fs block
2434                                          * size */
2435         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2436         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2437 #ifdef EXT3_FRAGMENTS
2438         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2439         ei->i_frag_no = raw_inode->i_frag;
2440         ei->i_frag_size = raw_inode->i_fsize;
2441 #endif
2442         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2443         if (!S_ISREG(inode->i_mode)) {
2444                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2445         } else {
2446                 inode->i_size |=
2447                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2448         }
2449         ei->i_disksize = inode->i_size;
2450         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2451 #ifdef EXT3_PREALLOCATE
2452         ei->i_prealloc_count = 0;
2453 #endif
2454         ei->i_block_group = iloc.block_group;
2455
2456         /*
2457          * NOTE! The in-memory inode i_data array is in little-endian order
2458          * even on big-endian machines: we do NOT byteswap the block numbers!
2459          */
2460         for (block = 0; block < EXT3_N_BLOCKS; block++)
2461                 ei->i_data[block] = raw_inode->i_block[block];
2462         INIT_LIST_HEAD(&ei->i_orphan);
2463
2464         if (S_ISREG(inode->i_mode)) {
2465                 inode->i_op = &ext3_file_inode_operations;
2466                 inode->i_fop = &ext3_file_operations;
2467                 ext3_set_aops(inode);
2468         } else if (S_ISDIR(inode->i_mode)) {
2469                 inode->i_op = &ext3_dir_inode_operations;
2470                 inode->i_fop = &ext3_dir_operations;
2471         } else if (S_ISLNK(inode->i_mode)) {
2472                 if (ext3_inode_is_fast_symlink(inode))
2473                         inode->i_op = &ext3_fast_symlink_inode_operations;
2474                 else {
2475                         inode->i_op = &ext3_symlink_inode_operations;
2476                         ext3_set_aops(inode);
2477                 }
2478         } else {
2479                 inode->i_op = &ext3_special_inode_operations;
2480                 init_special_inode(inode, inode->i_mode,
2481                                    le32_to_cpu(raw_inode->i_block[0]));
2482         }
2483         brelse (iloc.bh);
2484         ext3_set_inode_flags(inode);
2485         return;
2486
2487 bad_inode:
2488         make_bad_inode(inode);
2489         return;
2490 }
2491
2492 /*
2493  * Post the struct inode info into an on-disk inode location in the
2494  * buffer-cache.  This gobbles the caller's reference to the
2495  * buffer_head in the inode location struct.
2496  *
2497  * The caller must have write access to iloc->bh.
2498  */
2499 static int ext3_do_update_inode(handle_t *handle,
2500                                 struct inode *inode,
2501                                 struct ext3_iloc *iloc)
2502 {
2503         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2504         struct ext3_inode_info *ei = EXT3_I(inode);
2505         struct buffer_head *bh = iloc->bh;
2506         int err = 0, rc, block;
2507
2508         /* For fields not not tracking in the in-memory inode,
2509          * initialise them to zero for new inodes. */
2510         if (ei->i_state & EXT3_STATE_NEW)
2511                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2512
2513         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2514         if(!(test_opt(inode->i_sb, NO_UID32))) {
2515                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2516                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2517 /*
2518  * Fix up interoperability with old kernels. Otherwise, old inodes get
2519  * re-used with the upper 16 bits of the uid/gid intact
2520  */
2521                 if(!ei->i_dtime) {
2522                         raw_inode->i_uid_high =
2523                                 cpu_to_le16(high_16_bits(inode->i_uid));
2524                         raw_inode->i_gid_high =
2525                                 cpu_to_le16(high_16_bits(inode->i_gid));
2526                 } else {
2527                         raw_inode->i_uid_high = 0;
2528                         raw_inode->i_gid_high = 0;
2529                 }
2530         } else {
2531                 raw_inode->i_uid_low =
2532                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
2533                 raw_inode->i_gid_low =
2534                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
2535                 raw_inode->i_uid_high = 0;
2536                 raw_inode->i_gid_high = 0;
2537         }
2538         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2539         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2540         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2541         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2542         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2543         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2544         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2545         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2546 #ifdef EXT3_FRAGMENTS
2547         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2548         raw_inode->i_frag = ei->i_frag_no;
2549         raw_inode->i_fsize = ei->i_frag_size;
2550 #endif
2551         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2552         if (!S_ISREG(inode->i_mode)) {
2553                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2554         } else {
2555                 raw_inode->i_size_high =
2556                         cpu_to_le32(ei->i_disksize >> 32);
2557                 if (ei->i_disksize > 0x7fffffffULL) {
2558                         struct super_block *sb = inode->i_sb;
2559                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2560                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2561                             EXT3_SB(sb)->s_es->s_rev_level ==
2562                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2563                                /* If this is the first large file
2564                                 * created, add a flag to the superblock.
2565                                 */
2566                                 err = ext3_journal_get_write_access(handle,
2567                                                 EXT3_SB(sb)->s_sbh);
2568                                 if (err)
2569                                         goto out_brelse;
2570                                 ext3_update_dynamic_rev(sb);
2571                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2572                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2573                                 sb->s_dirt = 1;
2574                                 handle->h_sync = 1;
2575                                 err = ext3_journal_dirty_metadata(handle,
2576                                                 EXT3_SB(sb)->s_sbh);
2577                         }
2578                 }
2579         }
2580         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2581         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
2582                 raw_inode->i_block[0] =
2583                         cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
2584         else for (block = 0; block < EXT3_N_BLOCKS; block++)
2585                 raw_inode->i_block[block] = ei->i_data[block];
2586
2587         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2588         rc = ext3_journal_dirty_metadata(handle, bh);
2589         if (!err)
2590                 err = rc;
2591         ei->i_state &= ~EXT3_STATE_NEW;
2592
2593 out_brelse:
2594         brelse (bh);
2595         ext3_std_error(inode->i_sb, err);
2596         return err;
2597 }
2598
2599 /*
2600  * ext3_write_inode()
2601  *
2602  * We are called from a few places:
2603  *
2604  * - Within generic_file_write() for O_SYNC files.
2605  *   Here, there will be no transaction running. We wait for any running
2606  *   trasnaction to commit.
2607  *
2608  * - Within sys_sync(), kupdate and such.
2609  *   We wait on commit, if tol to.
2610  *
2611  * - Within prune_icache() (PF_MEMALLOC == true)
2612  *   Here we simply return.  We can't afford to block kswapd on the
2613  *   journal commit.
2614  *
2615  * In all cases it is actually safe for us to return without doing anything,
2616  * because the inode has been copied into a raw inode buffer in
2617  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2618  * knfsd.
2619  *
2620  * Note that we are absolutely dependent upon all inode dirtiers doing the
2621  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2622  * which we are interested.
2623  *
2624  * It would be a bug for them to not do this.  The code:
2625  *
2626  *      mark_inode_dirty(inode)
2627  *      stuff();
2628  *      inode->i_size = expr;
2629  *
2630  * is in error because a kswapd-driven write_inode() could occur while
2631  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2632  * will no longer be on the superblock's dirty inode list.
2633  */
2634 void ext3_write_inode(struct inode *inode, int wait)
2635 {
2636         if (current->flags & PF_MEMALLOC)
2637                 return;
2638
2639         if (ext3_journal_current_handle()) {
2640                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2641                 dump_stack();
2642                 return;
2643         }
2644
2645         if (!wait)
2646                 return;
2647
2648         ext3_force_commit(inode->i_sb);
2649 }
2650
2651 /*
2652  * ext3_setattr()
2653  *
2654  * Called from notify_change.
2655  *
2656  * We want to trap VFS attempts to truncate the file as soon as
2657  * possible.  In particular, we want to make sure that when the VFS
2658  * shrinks i_size, we put the inode on the orphan list and modify
2659  * i_disksize immediately, so that during the subsequent flushing of
2660  * dirty pages and freeing of disk blocks, we can guarantee that any
2661  * commit will leave the blocks being flushed in an unused state on
2662  * disk.  (On recovery, the inode will get truncated and the blocks will
2663  * be freed, so we have a strong guarantee that no future commit will
2664  * leave these blocks visible to the user.)
2665  *
2666  * Called with inode->sem down.
2667  */
2668 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2669 {
2670         struct inode *inode = dentry->d_inode;
2671         int error, rc = 0;
2672         const unsigned int ia_valid = attr->ia_valid;
2673
2674         error = inode_change_ok(inode, attr);
2675         if (error)
2676                 return error;
2677
2678         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2679                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2680                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2681                 if (error)
2682                         return error;
2683         }
2684
2685         if (S_ISREG(inode->i_mode) &&
2686             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2687                 handle_t *handle;
2688
2689                 handle = ext3_journal_start(inode, 3);
2690                 if (IS_ERR(handle)) {
2691                         error = PTR_ERR(handle);
2692                         goto err_out;
2693                 }
2694
2695                 error = ext3_orphan_add(handle, inode);
2696                 EXT3_I(inode)->i_disksize = attr->ia_size;
2697                 rc = ext3_mark_inode_dirty(handle, inode);
2698                 if (!error)
2699                         error = rc;
2700                 ext3_journal_stop(handle);
2701         }
2702
2703         rc = inode_setattr(inode, attr);
2704
2705         /* If inode_setattr's call to ext3_truncate failed to get a
2706          * transaction handle at all, we need to clean up the in-core
2707          * orphan list manually. */
2708         if (inode->i_nlink)
2709                 ext3_orphan_del(NULL, inode);
2710
2711         if (!rc && (ia_valid & ATTR_MODE))
2712                 rc = ext3_acl_chmod(inode);
2713
2714 err_out:
2715         ext3_std_error(inode->i_sb, error);
2716         if (!error)
2717                 error = rc;
2718         return error;
2719 }
2720
2721
2722 /*
2723  * akpm: how many blocks doth make a writepage()?
2724  *
2725  * With N blocks per page, it may be:
2726  * N data blocks
2727  * 2 indirect block
2728  * 2 dindirect
2729  * 1 tindirect
2730  * N+5 bitmap blocks (from the above)
2731  * N+5 group descriptor summary blocks
2732  * 1 inode block
2733  * 1 superblock.
2734  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2735  *
2736  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2737  *
2738  * With ordered or writeback data it's the same, less the N data blocks.
2739  *
2740  * If the inode's direct blocks can hold an integral number of pages then a
2741  * page cannot straddle two indirect blocks, and we can only touch one indirect
2742  * and dindirect block, and the "5" above becomes "3".
2743  *
2744  * This still overestimates under most circumstances.  If we were to pass the
2745  * start and end offsets in here as well we could do block_to_path() on each
2746  * block and work out the exact number of indirects which are touched.  Pah.
2747  */
2748
2749 int ext3_writepage_trans_blocks(struct inode *inode)
2750 {
2751         int bpp = ext3_journal_blocks_per_page(inode);
2752         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2753         int ret;
2754
2755         if (ext3_should_journal_data(inode))
2756                 ret = 3 * (bpp + indirects) + 2;
2757         else
2758                 ret = 2 * (bpp + indirects) + 2;
2759
2760 #ifdef CONFIG_QUOTA
2761         ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
2762 #endif
2763
2764         return ret;
2765 }
2766
2767 /*
2768  * The caller must have previously called ext3_reserve_inode_write().
2769  * Give this, we know that the caller already has write access to iloc->bh.
2770  */
2771 int ext3_mark_iloc_dirty(handle_t *handle,
2772                 struct inode *inode, struct ext3_iloc *iloc)
2773 {
2774         int err = 0;
2775
2776         /* the do_update_inode consumes one bh->b_count */
2777         get_bh(iloc->bh);
2778
2779         /* ext3_do_update_inode() does journal_dirty_metadata */
2780         err = ext3_do_update_inode(handle, inode, iloc);
2781         put_bh(iloc->bh);
2782         return err;
2783 }
2784
2785 /*
2786  * On success, We end up with an outstanding reference count against
2787  * iloc->bh.  This _must_ be cleaned up later.
2788  */
2789
2790 int
2791 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2792                          struct ext3_iloc *iloc)
2793 {
2794         int err = 0;
2795         if (handle) {
2796                 err = ext3_get_inode_loc(inode, iloc);
2797                 if (!err) {
2798                         BUFFER_TRACE(iloc->bh, "get_write_access");
2799                         err = ext3_journal_get_write_access(handle, iloc->bh);
2800                         if (err) {
2801                                 brelse(iloc->bh);
2802                                 iloc->bh = NULL;
2803                         }
2804                 }
2805         }
2806         ext3_std_error(inode->i_sb, err);
2807         return err;
2808 }
2809
2810 /*
2811  * akpm: What we do here is to mark the in-core inode as clean
2812  * with respect to inode dirtiness (it may still be data-dirty).
2813  * This means that the in-core inode may be reaped by prune_icache
2814  * without having to perform any I/O.  This is a very good thing,
2815  * because *any* task may call prune_icache - even ones which
2816  * have a transaction open against a different journal.
2817  *
2818  * Is this cheating?  Not really.  Sure, we haven't written the
2819  * inode out, but prune_icache isn't a user-visible syncing function.
2820  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2821  * we start and wait on commits.
2822  *
2823  * Is this efficient/effective?  Well, we're being nice to the system
2824  * by cleaning up our inodes proactively so they can be reaped
2825  * without I/O.  But we are potentially leaving up to five seconds'
2826  * worth of inodes floating about which prune_icache wants us to
2827  * write out.  One way to fix that would be to get prune_icache()
2828  * to do a write_super() to free up some memory.  It has the desired
2829  * effect.
2830  */
2831 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2832 {
2833         struct ext3_iloc iloc;
2834         int err;
2835
2836         err = ext3_reserve_inode_write(handle, inode, &iloc);
2837         if (!err)
2838                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2839         return err;
2840 }
2841
2842 /*
2843  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2844  *
2845  * We're really interested in the case where a file is being extended.
2846  * i_size has been changed by generic_commit_write() and we thus need
2847  * to include the updated inode in the current transaction.
2848  *
2849  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2850  * are allocated to the file.
2851  *
2852  * If the inode is marked synchronous, we don't honour that here - doing
2853  * so would cause a commit on atime updates, which we don't bother doing.
2854  * We handle synchronous inodes at the highest possible level.
2855  */
2856 void ext3_dirty_inode(struct inode *inode)
2857 {
2858         handle_t *current_handle = ext3_journal_current_handle();
2859         handle_t *handle;
2860
2861         handle = ext3_journal_start(inode, 2);
2862         if (IS_ERR(handle))
2863                 goto out;
2864         if (current_handle &&
2865                 current_handle->h_transaction != handle->h_transaction) {
2866                 /* This task has a transaction open against a different fs */
2867                 printk(KERN_EMERG "%s: transactions do not match!\n",
2868                        __FUNCTION__);
2869         } else {
2870                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
2871                                 current_handle);
2872                 ext3_mark_inode_dirty(handle, inode);
2873         }
2874         ext3_journal_stop(handle);
2875 out:
2876         return;
2877 }
2878
#ifdef AKPM
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * With a NULL handle nothing is done and 0 is returned.  The result is
 * passed through ext3_std_error().
 *
 * NOTE(review): write access is requested with the raw
 * journal_get_write_access(), not the ext3_journal_get_write_access()
 * wrapper used by ext3_reserve_inode_write() - confirm this is intended.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (!handle)
		goto out;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		goto out;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext3_journal_dirty_metadata(handle, iloc.bh);
	brelse(iloc.bh);
out:
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
2908
2909 int ext3_change_inode_journal_flag(struct inode *inode, int val)
2910 {
2911         journal_t *journal;
2912         handle_t *handle;
2913         int err;
2914
2915         /*
2916          * We have to be very careful here: changing a data block's
2917          * journaling status dynamically is dangerous.  If we write a
2918          * data block to the journal, change the status and then delete
2919          * that block, we risk forgetting to revoke the old log record
2920          * from the journal and so a subsequent replay can corrupt data.
2921          * So, first we make sure that the journal is empty and that
2922          * nobody is changing anything.
2923          */
2924
2925         journal = EXT3_JOURNAL(inode);
2926         if (is_journal_aborted(journal) || IS_RDONLY(inode))
2927                 return -EROFS;
2928
2929         journal_lock_updates(journal);
2930         journal_flush(journal);
2931
2932         /*
2933          * OK, there are no updates running now, and all cached data is
2934          * synced to disk.  We are now in a completely consistent state
2935          * which doesn't have anything in the journal, and we know that
2936          * no filesystem updates are running, so it is safe to modify
2937          * the inode's in-core data-journaling state flag now.
2938          */
2939
2940         if (val)
2941                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
2942         else
2943                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
2944
2945         journal_unlock_updates(journal);
2946
2947         /* Finally we can mark the inode as dirty. */
2948
2949         handle = ext3_journal_start(inode, 1);
2950         if (IS_ERR(handle))
2951                 return PTR_ERR(handle);
2952
2953         err = ext3_mark_inode_dirty(handle, inode);
2954         handle->h_sync = 1;
2955         ext3_journal_stop(handle);
2956         ext3_std_error(inode->i_sb, err);
2957
2958         return err;
2959 }