fs/reiserfs/inode.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5 #include <linux/config.h>
   6 #include <linux/time.h>
   7 #include <linux/fs.h>
   8 #include <linux/reiserfs_fs.h>
   9 #include <linux/reiserfs_acl.h>
  10 #include <linux/reiserfs_xattr.h>
  11 #include <linux/smp_lock.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <asm/uaccess.h>
  15 #include <asm/unaligned.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/mpage.h>
  18 #include <linux/writeback.h>
  19 #include <linux/quotaops.h>
  20
  21 extern int reiserfs_default_io_size; /* default io size defined in super.c */
  22
     /* forward declarations: convert_tail_for_hole() below calls both of these */
  23 static int reiserfs_commit_write(struct file *f, struct page *page,
  24                                  unsigned from, unsigned to);
  25 static int reiserfs_prepare_write(struct file *f, struct page *page,
  26                                   unsigned from, unsigned to);
  27
  28 void reiserfs_delete_inode (struct inode * inode)
  29 {
  30     /* We need blocks for transaction + (user+group) quota update (possibly delete) */
  31     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
  32     struct reiserfs_transaction_handle th ;
  33
  34     reiserfs_write_lock(inode->i_sb);
  35
  36     /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
  37     if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
  38         down (&inode->i_sem);
  39
  40         reiserfs_delete_xattrs (inode);
  41
  42         if (journal_begin(&th, inode->i_sb, jbegin_count)) {
  43             up (&inode->i_sem);
  44             goto out;
  45         }
  46         reiserfs_update_inode_transaction(inode) ;
  47
  48         if (reiserfs_delete_object (&th, inode)) {
  49             up (&inode->i_sem);
  50             goto out;
  51         }
  52
  53         /* Do quota update inside a transaction for journaled quotas. We must do that
  54          * after delete_object so that quota updates go into the same transaction as
  55          * stat data deletion */
  56         DQUOT_FREE_INODE(inode);
  57
  58         if (journal_end(&th, inode->i_sb, jbegin_count)) {
  59             up (&inode->i_sem);
  60             goto out;
  61         }
  62
  63         up (&inode->i_sem);
  64
  65         /* all items of file are deleted, so we can remove "save" link */
  66         remove_save_link (inode, 0/* not truncate */); /* we can't do anything
  67                                                         * about an error here */
  68     } else {
  69         /* no object items are in the tree */
  70         ;
  71     }
  72 out:
  73     clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
  74     inode->i_blocks = 0;
  75     reiserfs_write_unlock(inode->i_sb);
  76 }
  77
  78 static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
  79                loff_t offset, int type, int length )
  80 {
  81     key->version = version;
  82
  83     key->on_disk_key.k_dir_id = dirid;
  84     key->on_disk_key.k_objectid = objectid;
  85     set_cpu_key_k_offset (key, offset);
  86     set_cpu_key_k_type (key, type);
  87     key->key_length = length;
  88 }
  89
  90
  91 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
  92    offset and type of key */
  93 void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
  94               int type, int length )
  95 {
  96   _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
  97                  le32_to_cpu (INODE_PKEY (inode)->k_objectid),
  98                  offset, type, length);
  99 }
 100
 101
 102 //
 103 // when key is 0, do not set version and short key
 104 //
 105 inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
 106                                int version,
 107                                loff_t offset, int type, int length,
 108                                int entry_count/*or ih_free_space*/)
 109 {
 110     if (key) {
 111         ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
 112         ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
 113     }
 114     put_ih_version( ih, version );
 115     set_le_ih_k_offset (ih, offset);
 116     set_le_ih_k_type (ih, type);
 117     put_ih_item_len( ih, length );
 118     /*    set_ih_free_space (ih, 0);*/
 119     // for directory items it is entry count, for directs and stat
 120     // datas - 0xffff, for indirects - 0
 121     put_ih_entry_count( ih, entry_count );
 122 }
 123
 124 //
 125 // FIXME: we might cache recently accessed indirect item
 126
 127 // Ugh.  Not too eager for that....
 128 //  I cut the code until such time as I see a convincing argument (benchmark).
 129 // I don't want a bloated inode struct..., and I don't like code complexity....
 130
 131 /* cutting the code is fine, since it really isn't in use yet and is easy
 132 ** to add back in.  But, Vladimir has a really good idea here.  Think
 133 ** about what happens for reading a file.  For each page,
 134 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
 135 ** an indirect item.  This indirect item has X number of pointers, where
 136 ** X is a big number if we've done the block allocation right.  But,
 137 ** we only use one or two of these pointers during each call to readpage,
 138 ** needlessly researching again later on.
 139 **
 140 ** The size of the cache could be dynamic based on the size of the file.
 141 **
 142 ** I'd also like to see us cache the location of the stat data item, since
 143 ** we are needlessly researching for that frequently.
 144 **
 145 ** --chris
 146 */
 147
 148 /* If this page has a file tail in it, and
 149 ** it was read in by get_block_create_0, the page data is valid,
 150 ** but tail is still sitting in a direct item, and we can't write to
 151 ** it.  So, look through this page, and check all the mapped buffers
 152 ** to make sure they have valid block numbers.  Any that don't need
 153 ** to be unmapped, so that block_prepare_write will correctly call
 154 ** reiserfs_get_block to convert the tail into an unformatted node
 155 */
 156 static inline void fix_tail_page_for_writing(struct page *page) {
 157     struct buffer_head *head, *next, *bh ;
 158
 159     if (page && page_has_buffers(page)) {
 160         head = page_buffers(page) ;
 161         bh = head ;
 162         do {
 163             next = bh->b_this_page ;
 164             if (buffer_mapped(bh) && bh->b_blocknr == 0) {
 165                 reiserfs_unmap_buffer(bh) ;
 166             }
 167             bh = next ;
 168         } while (bh != head) ;
 169     }
 170 }
 171
 172 /* reiserfs_get_block does not need to allocate a block only if it has been
 173    done already or non-hole position has been found in the indirect item */
 174 static inline int allocation_needed (int retval, b_blocknr_t allocated,
 175                                      struct item_head * ih,
 176                                      __le32 * item, int pos_in_item)
 177 {
 178   if (allocated)
 179          return 0;
 180   if (retval == POSITION_FOUND && is_indirect_le_ih (ih) &&
 181       get_block_num(item, pos_in_item))
 182          return 0;
 183   return 1;
 184 }
 185
 186 static inline int indirect_item_found (int retval, struct item_head * ih)
 187 {
 188   return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
 189 }
 190
 191
 /* Map buffer head 'bh' to disk block 'block' on the inode's super block;
    a thin wrapper around map_bh(). */
 192 static inline void set_block_dev_mapped (struct buffer_head * bh,
 193                                          b_blocknr_t block, struct inode * inode)
 194 {
 195         map_bh(bh, inode->i_sb, block);
 196 }
 197
 198
 199 //
 200 // files which were created in the earlier version can not be longer,
 201 // than 2 gb
 202 //
 203 static int file_capable (struct inode * inode, long block)
 204 {
 205     if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file.
 206         block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
 207         return 1;
 208
 209     return 0;
 210 }
 211
 212 /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
 213                                 struct inode *inode, struct path *path) {
 214   struct super_block *s = th->t_super ;
 215   int len = th->t_blocks_allocated ;
 216   int err;
 217
 218   BUG_ON (!th->t_trans_id);
 219   BUG_ON (!th->t_refcount);
 220
 221   /* we cannot restart while nested */
 222   if (th->t_refcount > 1) {
 223       return 0  ;
 224   }
 225   pathrelse(path) ;
 226   reiserfs_update_sd(th, inode) ;
 227   err = journal_end(th, s, len) ;
 228   if (!err) {
 229       err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
 230       if (!err)
 231         reiserfs_update_inode_transaction(inode) ;
 232   }
 233   return err;
 234 }
 235
 236 // Looks up the 'block'-th logical block of the file.  Used by
 237 // reiserfs_get_block() when GET_BLOCK_CREATE is not set, and by
 238 // reiserfs_bmap().  When the block is covered by an indirect item with a
 239 // non-zero pointer, bh_result is mapped to that disk block and 0 is returned.
 240
 241 // When the block lies in a direct item (file tail), -ENOENT is returned
 242 // unless GET_BLOCK_READ_DIRECT is set in args; with that flag the tail is
     // copied into bh_result's page (unless the buffer or page is already up to
     // date) and bh_result is mapped to block 0 and marked up to date.
     //
     // For a hole or a position that is not found, 0 is returned with bh_result
     // left unmapped, or -ENOENT if GET_BLOCK_NO_HOLE is set and the page is not
     // up to date.  -EIO is returned when the tree search reports IO_ERROR.
 243
 244 static int _get_block_create_0 (struct inode * inode, long block,
 245                                  struct buffer_head * bh_result,
 246                                  int args)
 247 {
 248     INITIALIZE_PATH (path);
 249     struct cpu_key key;
 250     struct buffer_head * bh;
 251     struct item_head * ih, tmp_ih;
 252     int fs_gen ;
 253     int blocknr;
 254     char * p = NULL; /* kmap()ed address of bh_result's page; NULL until the tail copy path maps it */
 255     int chars;
 256     int ret ;
 257     int result ;
 258     int done = 0 ;
 259     unsigned long offset ;
 260
 261     // prepare the key to look for the 'block'-th block of file
 262     make_cpu_key (&key, inode,
 263                   (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
 264
 265 research:
 266     result = search_for_position_by_key (inode->i_sb, &key, &path) ;
 267     if (result != POSITION_FOUND) {
 268         pathrelse (&path);
         // p is only non-NULL here if the page was kmapped on an earlier pass
         // that then jumped back to 'research'
 269         if (p)
 270             kunmap(bh_result->b_page) ;
 271         if (result == IO_ERROR)
 272             return -EIO;
 273         // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 274         // That there is some MMAPED data associated with it that is yet to be written to disk.
 275         if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 276             return -ENOENT ;
 277         }
 278         return 0 ;
 279     }
 280
 281     //
 282     bh = get_last_bh (&path);
 283     ih = get_ih (&path);
 284     if (is_indirect_le_ih (ih)) {
 285         __le32 * ind_item = (__le32 *)B_I_PITEM (bh, ih);
 286
 287         /* FIXME: here we could cache indirect item or part of it in
 288            the inode to avoid search_by_key in case of subsequent
 289            access to file */
 290         blocknr = get_block_num(ind_item, path.pos_in_item) ;
 291         ret = 0 ;
 292         if (blocknr) {
 293             map_bh(bh_result, inode->i_sb, blocknr);
             /* last pointer of this indirect item: flag the buffer as a boundary */
 294             if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
 295                 set_buffer_boundary(bh_result);
 296             }
 297         } else
 298             // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 299             // That there is some MMAPED data associated with it that is yet to  be written to disk.
 300             if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 301             ret = -ENOENT ;
 302             }
 303
 304         pathrelse (&path);
 305         if (p)
 306             kunmap(bh_result->b_page) ;
 307         return ret ;
 308     }
 309
 310     // requested data are in direct item(s)
 311     if (!(args & GET_BLOCK_READ_DIRECT)) {
 312         // we are called by bmap. FIXME: we can not map block of file
 313         // when it is stored in direct item(s)
 314         pathrelse (&path);
 315         if (p)
 316             kunmap(bh_result->b_page) ;
 317         return -ENOENT;
 318     }
 319
 320     /* if we've got a direct item, and the buffer or page was uptodate,
 321     ** we don't want to pull data off disk again.  skip to the
 322     ** end, where we map the buffer and return
 323     */
 324     if (buffer_uptodate(bh_result)) {
 325         goto finished ;
 326     } else
 327         /*
 328         ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
 329         ** pages without any buffers.  If the page is up to date, we don't want
 330         ** read old data off disk.  Set the up to date bit on the buffer instead
 331         ** and jump to the end
 332         */
 333             if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
 334                 set_buffer_uptodate(bh_result);
 335                 goto finished ;
 336     }
 337
 338     // read file tail into part of page
 339     offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
 340     fs_gen = get_generation(inode->i_sb) ;
 341     copy_item_head (&tmp_ih, ih);
 342
 343     /* we only want to kmap if we are reading the tail into the page.
 344     ** this is not the common case, so we don't kmap until we are
 345     ** sure we need to.  But, this means the item might move if
 346     ** kmap schedules
 347     */
 348     if (!p) {
 349         p = (char *)kmap(bh_result->b_page) ;
 350         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 351             goto research;
 352         }
 353     }
 354     p += offset ; /* start of this block's bytes within the mapped page */
 355     memset (p, 0, inode->i_sb->s_blocksize);
 356     do {
 357         if (!is_direct_le_ih (ih)) {
 358             BUG ();
 359         }
 360         /* make sure we don't read more bytes than actually exist in
 361         ** the file.  This can happen in odd cases where i_size isn't
 362         ** correct, and when direct item padding results in a few
 363         ** extra bytes at the end of the direct item
 364         */
 365         if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
 366             break ;
 367         if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
 368             chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
 369             done = 1 ;
 370         } else {
 371             chars = ih_item_len(ih) - path.pos_in_item;
 372         }
 373         memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
 374
 375         if (done)
 376             break ;
 377
 378         p += chars;
 379
 380         if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
 381             // we done, if read direct item is not the last item of
 382             // node FIXME: we could try to check right delimiting key
 383             // to see whether direct item continues in the right
 384             // neighbor or rely on i_size
 385             break;
 386
 387         // update key to look for the next piece
 388         set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
 389         result = search_for_position_by_key (inode->i_sb, &key, &path);
 390         if (result != POSITION_FOUND)
 391             // i/o error most likely
 392             break;
 393         bh = get_last_bh (&path);
 394         ih = get_ih (&path);
 395     } while (1);
 396
 397     flush_dcache_page(bh_result->b_page) ;
 398     kunmap(bh_result->b_page) ;
 399
 400 finished:
 401     pathrelse (&path);
 402
     /* 'result' holds the outcome of the most recent tree search */
 403     if (result == IO_ERROR)
 404         return -EIO;
 405
 406     /* this buffer has valid data, but isn't valid for io.  mapping it to
 407      * block #0 tells the rest of reiserfs it just has a tail in it
 408      */
 409     map_bh(bh_result, inode->i_sb, 0);
 410     set_buffer_uptodate (bh_result);
 411     return 0;
 412 }
 413
 414
 415 // this is called to create file map. So, _get_block_create_0 will not
 416 // read direct item
 417 static int reiserfs_bmap (struct inode * inode, sector_t block,
 418                           struct buffer_head * bh_result, int create)
 419 {
 420     if (!file_capable (inode, block))
 421         return -EFBIG;
 422
 423     reiserfs_write_lock(inode->i_sb);
 424     /* do not read the direct item */
 425     _get_block_create_0 (inode, block, bh_result, 0) ;
 426     reiserfs_write_unlock(inode->i_sb);
 427     return 0;
 428 }
 429
 430 /* special version of get_block that is only used by grab_tail_page right
 431 ** now.  It is sent to block_prepare_write, and when you try to get a
 432 ** block past the end of the file (or a block from a hole) it returns
 433 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 434 ** be able to do i/o on the buffers returned, unless an error value
 435 ** is also returned.
 436 **
 437 ** So, this allows block_prepare_write to be used for reading a single block
 438 ** in a page.  Where it does not produce a valid page for holes, or past the
 439 ** end of the file.  This turns out to be exactly what we need for reading
 440 ** tails for conversion.
 441 **
 442 ** The point of the wrapper is forcing a certain value for create, even
 443 ** though the VFS layer is calling this function with create==1.  If you
 444 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 445 ** don't use this function.
 446 */
 447 static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
 448                         struct buffer_head * bh_result, int create) {
     /* the caller's 'create' value is ignored; GET_BLOCK_NO_HOLE is always passed */
 449     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 450 }
 451
 452 /* This is special helper for reiserfs_get_block in case we are executing
 453    direct_IO request. */
 454 static int reiserfs_get_blocks_direct_io(struct inode *inode,
 455                                          sector_t iblock,
 456                                          unsigned long max_blocks,
 457                                          struct buffer_head *bh_result,
 458                                          int create)
 459 {
 460     int ret ;
 461
 462     bh_result->b_page = NULL;
 463
 464     /* We set the b_size before reiserfs_get_block call since it is
 465        referenced in convert_tail_for_hole() that may be called from
 466        reiserfs_get_block() */
 467     bh_result->b_size = (1 << inode->i_blkbits);
 468
 469     ret = reiserfs_get_block(inode, iblock, bh_result,
 470                              create | GET_BLOCK_NO_DANGLE) ;
 471     if (ret)
 472         goto out;
 473
 474     /* don't allow direct io onto tail pages */
 475     if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 476         /* make sure future calls to the direct io funcs for this offset
 477         ** in the file fail by unmapping the buffer
 478         */
 479         clear_buffer_mapped(bh_result);
 480         ret = -EINVAL ;
 481     }
 482     /* Possible unpacked tail. Flush the data before pages have
 483        disappeared */
 484     if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
 485         int err;
 486         lock_kernel();
 487         err = reiserfs_commit_for_inode(inode);
 488         REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 489         unlock_kernel();
 490         if (err < 0)
 491             ret = err;
 492     }
 493 out:
 494     return ret ;
 495 }
 496
 497
 498 /*
 499 ** helper function for when reiserfs_get_block is called for a hole
 500 ** but the file tail is still in a direct item
 501 ** bh_result is the buffer head for the hole
 502 ** tail_offset is the offset of the start of the tail in the file
 503 **
 504 ** This calls prepare_write, which will start a new transaction
 505 ** you should not be in a transaction, or have any paths held when you
 506 ** call this.
 507 */
 508 static int convert_tail_for_hole(struct inode *inode,
 509                                  struct buffer_head *bh_result,
 510                                  loff_t tail_offset) {
 511     unsigned long index ;
 512     unsigned long tail_end ;
 513     unsigned long tail_start ;
 514     struct page * tail_page ;
 515     struct page * hole_page = bh_result->b_page ;
 516     int retval = 0 ;
 517
 518     if ((tail_offset & (bh_result->b_size - 1)) != 1)
 519         return -EIO ;
 520
 521     /* always try to read until the end of the block */
 522     tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
 523     tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 524
 525     index = tail_offset >> PAGE_CACHE_SHIFT ;
 526     /* hole_page can be zero in case of direct_io, we are sure
 527        that we cannot get here if we write with O_DIRECT into
 528        tail page */
 529     if (!hole_page || index != hole_page->index) {
 530         tail_page = grab_cache_page(inode->i_mapping, index) ;
 531         retval = -ENOMEM;
 532         if (!tail_page) {
 533             goto out ;
 534         }
 535     } else {
 536         tail_page = hole_page ;
 537     }
 538
 539     /* we don't have to make sure the conversion did not happen while
 540     ** we were locking the page because anyone that could convert
 541     ** must first take i_sem.
 542     **
 543     ** We must fix the tail page for writing because it might have buffers
 544     ** that are mapped, but have a block number of 0.  This indicates tail
 545     ** data that has been read directly into the page, and block_prepare_write
 546     ** won't trigger a get_block in this case.
 547     */
 548     fix_tail_page_for_writing(tail_page) ;
 549     retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
 550     if (retval)
 551         goto unlock ;
 552
 553     /* tail conversion might change the data in the page */
 554     flush_dcache_page(tail_page) ;
 555
 556     retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 557
 558 unlock:
 559     if (tail_page != hole_page) {
 560         unlock_page(tail_page) ;
 561         page_cache_release(tail_page) ;
 562     }
 563 out:
 564     return retval ;
 565 }
 566
 567 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
 568                            long block,
 569                            struct inode *inode,
 570                            b_blocknr_t *allocated_block_nr,
 571                            struct path * path,
 572                            int flags) {
 573     BUG_ON (!th->t_trans_id);
 574
 575 #ifdef REISERFS_PREALLOCATE
 576     if (!(flags & GET_BLOCK_NO_ISEM)) {
 577         return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
 578     }
 579 #endif
 580     return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
 581 }
 582
 583 int reiserfs_get_block (struct inode * inode, sector_t block,
 584                         struct buffer_head * bh_result, int create)
 585 {
 586     int repeat, retval = 0;
 587     b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
 588     INITIALIZE_PATH(path);
 589     int pos_in_item;
 590     struct cpu_key key;
 591     struct buffer_head * bh, * unbh = NULL;
 592     struct item_head * ih, tmp_ih;
 593     __le32 * item;
 594     int done;
 595     int fs_gen;
 596     struct reiserfs_transaction_handle *th = NULL;
 597     /* space reserved in transaction batch:
 598         . 3 balancings in direct->indirect conversion
 599         . 1 block involved into reiserfs_update_sd()
 600        XXX in practically impossible worst case direct2indirect()
 601        can incur (much) more than 3 balancings.
 602        quota update for user, group */
 603     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
 604     int version;
 605     int dangle = 1;
 606     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 607
 608                                 /* bad.... */
 609     reiserfs_write_lock(inode->i_sb);
 610     version = get_inode_item_key_version (inode);
 611
 612     if (block < 0) {
 613         reiserfs_write_unlock(inode->i_sb);
 614         return -EIO;
 615     }
 616
 617     if (!file_capable (inode, block)) {
 618         reiserfs_write_unlock(inode->i_sb);
 619         return -EFBIG;
 620     }
 621
 622     /* if !create, we aren't changing the FS, so we don't need to
 623     ** log anything, so we don't need to start a transaction
 624     */
 625     if (!(create & GET_BLOCK_CREATE)) {
 626         int ret ;
 627         /* find number of block-th logical block of the file */
 628         ret = _get_block_create_0 (inode, block, bh_result,
 629                                    create | GET_BLOCK_READ_DIRECT) ;
 630         reiserfs_write_unlock(inode->i_sb);
 631         return ret;
 632     }
 633     /*
 634      * if we're already in a transaction, make sure to close
 635      * any new transactions we start in this func
 636      */
 637     if ((create & GET_BLOCK_NO_DANGLE) ||
 638         reiserfs_transaction_running(inode->i_sb))
 639         dangle = 0;
 640
 641     /* If file is of such a size, that it might have a tail and tails are enabled
 642     ** we should mark it as possibly needing tail packing on close
 643     */
 644     if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
 645          (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 646         REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 647
 648     /* set the key of the first byte in the 'block'-th block of file */
 649     make_cpu_key (&key, inode, new_offset,
 650                   TYPE_ANY, 3/*key length*/);
 651     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 652 start_trans:
 653         th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
 654         if (!th) {
 655             retval = -ENOMEM;
 656             goto failure;
 657         }
 658         reiserfs_update_inode_transaction(inode) ;
 659     }
 660  research:
 661
 662     retval = search_for_position_by_key (inode->i_sb, &key, &path);
 663     if (retval == IO_ERROR) {
 664         retval = -EIO;
 665         goto failure;
 666     }
 667
 668     bh = get_last_bh (&path);
 669     ih = get_ih (&path);
 670     item = get_item (&path);
 671     pos_in_item = path.pos_in_item;
 672
 673     fs_gen = get_generation (inode->i_sb);
 674     copy_item_head (&tmp_ih, ih);
 675
 676     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 677         /* we have to allocate block for the unformatted node */
 678         if (!th) {
 679             pathrelse(&path) ;
 680             goto start_trans;
 681         }
 682
 683         repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 684
 685         if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 686             /* restart the transaction to give the journal a chance to free
 687             ** some blocks.  releases the path, so we have to go back to
 688             ** research if we succeed on the second try
 689             */
 690             SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
 691             retval = restart_transaction(th, inode, &path) ;
 692             if (retval)
 693                 goto failure;
 694             repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 695
 696             if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 697                 goto research ;
 698             }
 699             if (repeat == QUOTA_EXCEEDED)
 700                 retval = -EDQUOT;
 701             else
 702                 retval = -ENOSPC;
 703             goto failure;
 704         }
 705
 706         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 707             goto research;
 708         }
 709     }
 710
 711     if (indirect_item_found (retval, ih)) {
 712         b_blocknr_t unfm_ptr;
 713         /* 'block'-th block is in the file already (there is
 714            corresponding cell in some indirect item). But it may be
 715            zero unformatted node pointer (hole) */
 716         unfm_ptr = get_block_num (item, pos_in_item);
 717         if (unfm_ptr == 0) {
 718             /* use allocated block to plug the hole */
 719             reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
 720             if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 721                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
 722                 goto research;
 723             }
 724             set_buffer_new(bh_result);
 725             if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
 726                 reiserfs_add_ordered_list(inode, bh_result);
 727             put_block_num(item, pos_in_item, allocated_block_nr) ;
 728             unfm_ptr = allocated_block_nr;
 729             journal_mark_dirty (th, inode->i_sb, bh);
 730             reiserfs_update_sd(th, inode) ;
 731         }
 732         set_block_dev_mapped(bh_result, unfm_ptr, inode);
 733         pathrelse (&path);
 734         retval = 0;
 735         if (!dangle && th)
 736             retval = reiserfs_end_persistent_transaction(th);
 737
 738         reiserfs_write_unlock(inode->i_sb);
 739
 740         /* the item was found, so new blocks were not added to the file
 741         ** there is no need to make sure the inode is updated with this
 742         ** transaction
 743         */
 744         return retval;
 745     }
 746
 747     if (!th) {
 748         pathrelse(&path) ;
 749         goto start_trans;
 750     }
 751
 752     /* desired position is not found or is in the direct item. We have
 753        to append file with holes up to 'block'-th block converting
 754        direct items to indirect one if necessary */
 755     done = 0;
 756     do {
 757         if (is_statdata_le_ih (ih)) {
 758             __le32 unp = 0;
 759             struct cpu_key tmp_key;
 760
 761             /* indirect item has to be inserted */
 762             make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
 763                                UNFM_P_SIZE, 0/* free_space */);
 764
 765             if (cpu_key_k_offset (&key) == 1) {
 766                 /* we are going to add 'block'-th block to the file. Use
 767                    allocated block for that */
 768                 unp = cpu_to_le32 (allocated_block_nr);
 769                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 770                 set_buffer_new(bh_result);
 771                 done = 1;
 772             }
 773             tmp_key = key; // ;)
 774             set_cpu_key_k_offset (&tmp_key, 1);
 775             PATH_LAST_POSITION(&path) ++;
 776
 777             retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 778             if (retval) {
 779                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 780                 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 781             }
 782             //mark_tail_converted (inode);
 783         } else if (is_direct_le_ih (ih)) {
 784             /* direct item has to be converted */
 785             loff_t tail_offset;
 786
 787             tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 788             if (tail_offset == cpu_key_k_offset (&key)) {
 789                 /* direct item we just found fits into block we have
 790                    to map. Convert it into unformatted node: use
 791                    bh_result for the conversion */
 792                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 793                 unbh = bh_result;
 794                 done = 1;
 795             } else {
 796                 /* we have to padd file tail stored in direct item(s)
 797                    up to block size and convert it to unformatted
 798                    node. FIXME: this should also get into page cache */
 799
 800                 pathrelse(&path) ;
 801                 /*
 802                  * ugly, but we can only end the transaction if
 803                  * we aren't nested
 804                  */
 805                 BUG_ON (!th->t_refcount);
 806                 if (th->t_refcount == 1) {
 807                     retval = reiserfs_end_persistent_transaction(th);
 808                     th = NULL;
 809                     if (retval)
 810                         goto failure;
 811                 }
 812
 813                 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 814                 if (retval) {
 815                     if ( retval != -ENOSPC )
 816                         reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
 817                     if (allocated_block_nr) {
 818                         /* the bitmap, the super, and the stat data == 3 */
 819                         if (!th)
 820                             th = reiserfs_persistent_transaction(inode->i_sb,3);
 821                         if (th)
 822                             reiserfs_free_block (th,inode,allocated_block_nr,1);
 823                     }
 824                     goto failure ;
 825                 }
 826                 goto research ;
 827             }
 828             retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 829             if (retval) {
 830                 reiserfs_unmap_buffer(unbh);
 831                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 832                 goto failure;
 833             }
 834             /* it is important the set_buffer_uptodate is done after
 835             ** the direct2indirect.  The buffer might contain valid
 836             ** data newer than the data on disk (read by readpage, changed,
 837             ** and then sent here by writepage).  direct2indirect needs
 838             ** to know if unbh was already up to date, so it can decide
 839             ** if the data in unbh needs to be replaced with data from
 840             ** the disk
 841             */
 842             set_buffer_uptodate (unbh);
 843
 844             /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 845                buffer will disappear shortly, so it should not be added to
 846              */
 847             if ( unbh->b_page ) {
 848                 /* we've converted the tail, so we must
 849                 ** flush unbh before the transaction commits
 850                 */
 851                 reiserfs_add_tail_list(inode, unbh) ;
 852
 853                 /* mark it dirty now to prevent commit_write from adding
 854                 ** this buffer to the inode's dirty buffer list
 855                 */
 856                 /*
 857                  * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 858                  * It's still atomic, but it sets the page dirty too,
 859                  * which makes it eligible for writeback at any time by the
 860                  * VM (which was also the case with __mark_buffer_dirty())
 861                  */
 862                 mark_buffer_dirty(unbh) ;
 863             }
 864         } else {
 865             /* append indirect item with holes if needed, when appending
 866                pointer to 'block'-th block use block, which is already
 867                allocated */
 868             struct cpu_key tmp_key;
 869             unp_t unf_single=0; // We use this in case we need to allocate only
 870                                 // one block which is a fastpath
 871             unp_t *un;
 872             __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
 873             __u64 blocks_needed;
 874
 875             RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
 876                     "vs-804: invalid position for append");
 877             /* indirect item has to be appended, set up key of that position */
 878             make_cpu_key (&tmp_key, inode,
 879                           le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
 880                           //pos_in_item * inode->i_sb->s_blocksize,
 881                           TYPE_INDIRECT, 3);// key type is unimportant
 882
 883             blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
 884             RFALSE( blocks_needed < 0, "green-805: invalid offset");
 885
 886             if ( blocks_needed == 1 ) {
 887                 un = &unf_single;
 888             } else {
 889                 un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
 890                             GFP_ATOMIC); // We need to avoid scheduling.
 891                 if ( !un) {
 892                     un = &unf_single;
 893                     blocks_needed = 1;
 894                     max_to_insert = 0;
 895                 } else
 896                     memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
 897             }
 898             if ( blocks_needed <= max_to_insert) {
 899                 /* we are going to add target block to the file. Use allocated
 900                    block for that */
 901                 un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
 902                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 903                 set_buffer_new(bh_result);
 904                 done = 1;
 905             } else {
 906                 /* paste hole to the indirect item */
 907                 /* If kmalloc failed, max_to_insert becomes zero and it means we
 908                    only have space for one block */
 909                 blocks_needed=max_to_insert?max_to_insert:1;
 910             }
 911             retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 912
 913             if (blocks_needed != 1)
 914                 kfree(un);
 915
 916             if (retval) {
 917                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 918                 goto failure;
 919             }
 920             if (!done) {
 921                 /* We need to mark new file size in case this function will be
 922                    interrupted/aborted later on. And we may do this only for
 923                    holes. */
 924                 inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
 925             }
 926         }
 927
 928         if (done == 1)
 929             break;
 930
 931         /* this loop could log more blocks than we had originally asked
 932         ** for.  So, we have to allow the transaction to end if it is
 933         ** too big or too full.  Update the inode so things are
 934         ** consistent if we crash before the function returns
 935         **
 936         ** release the path so that anybody waiting on the path before
 937         ** ending their transaction will be able to continue.
 938         */
 939         if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
 940           retval = restart_transaction(th, inode, &path) ;
 941           if (retval)
 942             goto failure;
 943         }
 944         /* inserting indirect pointers for a hole can take a
 945         ** long time.  reschedule if needed
 946         */
 947         cond_resched();
 948
 949         retval = search_for_position_by_key (inode->i_sb, &key, &path);
 950         if (retval == IO_ERROR) {
 951             retval = -EIO;
 952             goto failure;
 953         }
 954         if (retval == POSITION_FOUND) {
 955             reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
 956                               "%K should not be found", &key);
 957             retval = -EEXIST;
 958             if (allocated_block_nr)
 959                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 960             pathrelse(&path) ;
 961             goto failure;
 962         }
 963         bh = get_last_bh (&path);
 964         ih = get_ih (&path);
 965         item = get_item (&path);
 966         pos_in_item = path.pos_in_item;
 967     } while (1);
 968
 969
 970     retval = 0;
 971
 972  failure:
 973     if (th && (!dangle || (retval && !th->t_trans_id))) {
 974         int err;
 975         if (th->t_trans_id)
 976             reiserfs_update_sd(th, inode);
 977         err = reiserfs_end_persistent_transaction(th);
 978         if (err)
 979             retval = err;
 980     }
 981
 982     reiserfs_write_unlock(inode->i_sb);
 983     reiserfs_check_path(&path) ;
 984     return retval;
 985 }
 986
 987 static int
 988 reiserfs_readpages(struct file *file, struct address_space *mapping,
 989                 struct list_head *pages, unsigned nr_pages)
 990 {
 991     return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
 992 }
 993
 994 /* Compute real number of used bytes by file
 995  * Following three functions can go away when we'll have enough space in stat item
 996  */
 997 static int real_space_diff(struct inode *inode, int sd_size)
 998 {
 999     int bytes;
1000     loff_t blocksize = inode->i_sb->s_blocksize ;
1001
1002     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1003         return sd_size ;
1004
1005     /* End of file is also in full block with indirect reference, so round
1006     ** up to the next block.
1007     **
1008     ** there is just no way to know if the tail is actually packed
1009     ** on the file, so we have to assume it isn't.  When we pack the
1010     ** tail, we add 4 bytes to pretend there really is an unformatted
1011     ** node pointer
1012     */
1013     bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
1014     return bytes ;
1015 }
1016
1017 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1018                                         int sd_size)
1019 {
1020     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1021         return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
1022     }
1023     return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
1024 }
1025
1026 /* Compute number of blocks used by file in ReiserFS counting */
1027 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1028 {
1029     loff_t bytes = inode_get_bytes(inode) ;
1030     loff_t real_space = real_space_diff(inode, sd_size) ;
1031
1032     /* keeps fsck and non-quota versions of reiserfs happy */
1033     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1034         bytes += (loff_t)511 ;
1035     }
1036
1037     /* files from before the quota patch might i_blocks such that
1038     ** bytes < real_space.  Deal with that here to prevent it from
1039     ** going negative.
1040     */
1041     if (bytes < real_space)
1042         return 0 ;
1043     return (bytes - real_space) >> 9;
1044 }
1045
1046 //
1047 // BAD: new directories have stat data of new type and all other items
1048 // of old type. Version stored in the inode says about body items, so
1049 // in update_stat_data we can not rely on inode, but have to check
1050 // item version directly
1051 //
1052
1053 // called by read_locked_inode
1054 static void init_inode (struct inode * inode, struct path * path)
1055 {
1056     struct buffer_head * bh;
1057     struct item_head * ih;
1058     __u32 rdev;
1059     //int version = ITEM_VERSION_1;
1060
1061     bh = PATH_PLAST_BUFFER (path);
1062     ih = PATH_PITEM_HEAD (path);
1063
1064
1065     copy_key (INODE_PKEY (inode), &(ih->ih_key));
1066     inode->i_blksize = reiserfs_default_io_size;
1067
1068     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1069     REISERFS_I(inode)->i_flags = 0;
1070     REISERFS_I(inode)->i_prealloc_block = 0;
1071     REISERFS_I(inode)->i_prealloc_count = 0;
1072     REISERFS_I(inode)->i_trans_id = 0;
1073     REISERFS_I(inode)->i_jl = NULL;
1074     REISERFS_I(inode)->i_acl_access = NULL;
1075     REISERFS_I(inode)->i_acl_default = NULL;
1076     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1077
1078     if (stat_data_v1 (ih)) {
1079         struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
1080         unsigned long blocks;
1081
1082         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1083         set_inode_sd_version (inode, STAT_DATA_V1);
1084         inode->i_mode  = sd_v1_mode(sd);
1085         inode->i_nlink = sd_v1_nlink(sd);
1086         inode->i_uid   = sd_v1_uid(sd);
1087         inode->i_gid   = sd_v1_gid(sd);
1088         inode->i_size  = sd_v1_size(sd);
1089         inode->i_atime.tv_sec = sd_v1_atime(sd);
1090         inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1091         inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1092         inode->i_atime.tv_nsec = 0;
1093         inode->i_ctime.tv_nsec = 0;
1094         inode->i_mtime.tv_nsec = 0;
1095
1096         inode->i_blocks = sd_v1_blocks(sd);
1097         inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1098         blocks = (inode->i_size + 511) >> 9;
1099         blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
1100         if (inode->i_blocks > blocks) {
1101             // there was a bug in <=3.5.23 when i_blocks could take negative
1102             // values. Starting from 3.5.17 this value could even be stored in
1103             // stat data. For such files we set i_blocks based on file
1104             // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1105             // only updated if file's inode will ever change
1106             inode->i_blocks = blocks;
1107         }
1108
1109         rdev = sd_v1_rdev(sd);
1110         REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
1111         /* an early bug in the quota code can give us an odd number for the
1112         ** block count.  This is incorrect, fix it here.
1113         */
1114         if (inode->i_blocks & 1) {
1115             inode->i_blocks++ ;
1116         }
1117         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1118                                                   SD_V1_SIZE));
1119         /* nopack is initially zero for v1 objects. For v2 objects,
1120            nopack is initialised from sd_attrs */
1121         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1122     } else {
1123         // new stat data found, but object may have old items
1124         // (directories and symlinks)
1125         struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
1126
1127         inode->i_mode   = sd_v2_mode(sd);
1128         inode->i_nlink  = sd_v2_nlink(sd);
1129         inode->i_uid    = sd_v2_uid(sd);
1130         inode->i_size   = sd_v2_size(sd);
1131         inode->i_gid    = sd_v2_gid(sd);
1132         inode->i_mtime.tv_sec  = sd_v2_mtime(sd);
1133         inode->i_atime.tv_sec = sd_v2_atime(sd);
1134         inode->i_ctime.tv_sec  = sd_v2_ctime(sd);
1135         inode->i_ctime.tv_nsec = 0;
1136         inode->i_mtime.tv_nsec = 0;
1137         inode->i_atime.tv_nsec = 0;
1138         inode->i_blocks = sd_v2_blocks(sd);
1139         rdev            = sd_v2_rdev(sd);
1140         if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
1141             inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1142         else
1143             inode->i_generation = sd_v2_generation(sd);
1144
1145         if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
1146             set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1147         else
1148             set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1149         REISERFS_I(inode)->i_first_direct_byte = 0;
1150         set_inode_sd_version (inode, STAT_DATA_V2);
1151         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1152                                                   SD_V2_SIZE));
1153         /* read persistent inode attributes from sd and initalise
1154            generic inode flags from them */
1155         REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
1156         sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
1157     }
1158
1159     pathrelse (path);
1160     if (S_ISREG (inode->i_mode)) {
1161         inode->i_op = &reiserfs_file_inode_operations;
1162         inode->i_fop = &reiserfs_file_operations;
1163         inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
1164     } else if (S_ISDIR (inode->i_mode)) {
1165         inode->i_op = &reiserfs_dir_inode_operations;
1166         inode->i_fop = &reiserfs_dir_operations;
1167     } else if (S_ISLNK (inode->i_mode)) {
1168         inode->i_op = &reiserfs_symlink_inode_operations;
1169         inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1170     } else {
1171         inode->i_blocks = 0;
1172         inode->i_op = &reiserfs_special_inode_operations;
1173         init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1174     }
1175 }
1176
1177
1178 // update new stat data with inode fields
1179 static void inode2sd (void * sd, struct inode * inode, loff_t size)
1180 {
1181     struct stat_data * sd_v2 = (struct stat_data *)sd;
1182     __u16 flags;
1183
1184     set_sd_v2_mode(sd_v2, inode->i_mode );
1185     set_sd_v2_nlink(sd_v2, inode->i_nlink );
1186     set_sd_v2_uid(sd_v2, inode->i_uid );
1187     set_sd_v2_size(sd_v2, size );
1188     set_sd_v2_gid(sd_v2, inode->i_gid );
1189     set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
1190     set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
1191     set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
1192     set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1193     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1194         set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1195     else
1196         set_sd_v2_generation(sd_v2, inode->i_generation);
1197     flags = REISERFS_I(inode)->i_attrs;
1198     i_attrs_to_sd_attrs( inode, &flags );
1199     set_sd_v2_attrs( sd_v2, flags );
1200 }
1201
1202
1203 // used to copy inode's fields to old stat data
1204 static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
1205 {
1206     struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1207
1208     set_sd_v1_mode(sd_v1, inode->i_mode );
1209     set_sd_v1_uid(sd_v1, inode->i_uid );
1210     set_sd_v1_gid(sd_v1, inode->i_gid );
1211     set_sd_v1_nlink(sd_v1, inode->i_nlink );
1212     set_sd_v1_size(sd_v1, size );
1213     set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
1214     set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
1215     set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
1216
1217     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1218         set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1219     else
1220         set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1221
1222     // Sigh. i_first_direct_byte is back
1223     set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
1224 }
1225
1226
1227 /* NOTE, you must prepare the buffer head before sending it here,
1228 ** and then log it after the call
1229 */
1230 static void update_stat_data (struct path * path, struct inode * inode,
1231                               loff_t size)
1232 {
1233     struct buffer_head * bh;
1234     struct item_head * ih;
1235
1236     bh = PATH_PLAST_BUFFER (path);
1237     ih = PATH_PITEM_HEAD (path);
1238
1239     if (!is_statdata_le_ih (ih))
1240         reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
1241                         INODE_PKEY (inode), ih);
1242
1243     if (stat_data_v1 (ih)) {
1244         // path points to old stat data
1245         inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
1246     } else {
1247         inode2sd (B_I_PITEM (bh, ih), inode, size);
1248     }
1249
1250     return;
1251 }
1252
1253
1254 void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
1255                               struct inode * inode, loff_t size)
1256 {
1257     struct cpu_key key;
1258     INITIALIZE_PATH(path);
1259     struct buffer_head *bh ;
1260     int fs_gen ;
1261     struct item_head *ih, tmp_ih ;
1262     int retval;
1263
1264     BUG_ON (!th->t_trans_id);
1265
1266     make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
1267
1268     for(;;) {
1269         int pos;
1270         /* look for the object's stat data */
1271         retval = search_item (inode->i_sb, &key, &path);
1272         if (retval == IO_ERROR) {
1273             reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
1274                               "i/o failure occurred trying to update %K stat data",
1275                               &key);
1276             return;
1277         }
1278         if (retval == ITEM_NOT_FOUND) {
1279             pos = PATH_LAST_POSITION (&path);
1280             pathrelse(&path) ;
1281             if (inode->i_nlink == 0) {
1282                 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
1283                 return;
1284             }
1285             reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
1286                               "stat data of object %k (nlink == %d) not found (pos %d)",
1287                               INODE_PKEY (inode), inode->i_nlink, pos);
1288             reiserfs_check_path(&path) ;
1289             return;
1290         }
1291
1292         /* sigh, prepare_for_journal might schedule.  When it schedules the
1293         ** FS might change.  We have to detect that, and loop back to the
1294         ** search if the stat data item has moved
1295         */
1296         bh = get_last_bh(&path) ;
1297         ih = get_ih(&path) ;
1298         copy_item_head (&tmp_ih, ih);
1299         fs_gen = get_generation (inode->i_sb);
1300         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1301         if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
1302             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
1303             continue ;  /* Stat_data item has been moved after scheduling. */
1304         }
1305         break;
1306     }
1307     update_stat_data (&path, inode, size);
1308     journal_mark_dirty(th, th->t_super, bh) ;
1309     pathrelse (&path);
1310     return;
1311 }
1312
1313 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1314 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1315 ** and clear the key in the private portion of the inode, otherwise a
1316 ** corresponding iput might try to delete whatever object the inode last
1317 ** represented.
1318 */
1319 static void reiserfs_make_bad_inode(struct inode *inode) {
1320     memset(INODE_PKEY(inode), 0, KEY_SIZE);
1321     make_bad_inode(inode);
1322 }
1323
1324 //
1325 // initially this function was derived from minix or ext2's analog and
1326 // evolved as the prototype did
1327 //
1328
1329 int reiserfs_init_locked_inode (struct inode * inode, void *p)
1330 {
1331     struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
1332     inode->i_ino = args->objectid;
1333     INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1334     return 0;
1335 }
1336
1337 /* looks for stat data in the tree, and fills up the fields of in-core
1338    inode stat data fields */
1339 void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
1340 {
1341     INITIALIZE_PATH (path_to_sd);
1342     struct cpu_key key;
1343     unsigned long dirino;
1344     int retval;
1345
1346     dirino = args->dirid ;
1347
1348     /* set version 1, version 2 could be used too, because stat data
1349        key is the same in both versions */
1350     key.version = KEY_FORMAT_3_5;
1351     key.on_disk_key.k_dir_id = dirino;
1352     key.on_disk_key.k_objectid = inode->i_ino;
1353     key.on_disk_key.k_offset = 0;
1354     key.on_disk_key.k_type = 0;
1355
1356     /* look for the object's stat data */
1357     retval = search_item (inode->i_sb, &key, &path_to_sd);
1358     if (retval == IO_ERROR) {
1359         reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
1360                           "i/o failure occurred trying to find stat data of %K",
1361                           &key);
1362         reiserfs_make_bad_inode(inode) ;
1363         return;
1364     }
1365     if (retval != ITEM_FOUND) {
1366         /* a stale NFS handle can trigger this without it being an error */
1367         pathrelse (&path_to_sd);
1368         reiserfs_make_bad_inode(inode) ;
1369         inode->i_nlink = 0;
1370         return;
1371     }
1372
1373     init_inode (inode, &path_to_sd);
1374
1375     /* It is possible that knfsd is trying to access inode of a file
1376        that is being removed from the disk by some other thread. As we
1377        update sd on unlink all that is required is to check for nlink
1378        here. This bug was first found by Sizif when debugging
1379        SquidNG/Butterfly, forgotten, and found again after Philippe
1380        Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1381
1382        More logical fix would require changes in fs/inode.c:iput() to
1383        remove inode from hash-table _after_ fs cleaned disk stuff up and
1384        in iget() to return NULL if I_FREEING inode is found in
1385        hash-table. */
1386     /* Currently there is one place where it's ok to meet inode with
1387        nlink==0: processing of open-unlinked and half-truncated files
1388        during mount (fs/reiserfs/super.c:finish_unfinished()). */
1389     if( ( inode -> i_nlink == 0 ) &&
1390         ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
1391             reiserfs_warning (inode->i_sb,
1392                               "vs-13075: reiserfs_read_locked_inode: "
1393                               "dead inode read from disk %K. "
1394                               "This is likely to be race with knfsd. Ignore",
1395                               &key );
1396             reiserfs_make_bad_inode( inode );
1397     }
1398
1399     reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1400
1401 }
1402
1403 /**
1404  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1405  *
1406  * @inode:    inode from hash table to check
1407  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1408  *
1409  * This function is called by iget5_locked() to distinguish reiserfs inodes
1410  * having the same inode numbers. Such inodes can only exist due to some
1411  * error condition. One of them should be bad. Inodes with identical
1412  * inode numbers (objectids) are distinguished by parent directory ids.
1413  *
1414  */
1415 int reiserfs_find_actor( struct inode *inode, void *opaque )
1416 {
1417     struct reiserfs_iget_args *args;
1418
1419     args = opaque;
1420     /* args is already in CPU order */
1421     return (inode->i_ino == args->objectid) &&
1422         (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1423 }
1424
1425 struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
1426 {
1427     struct inode * inode;
1428     struct reiserfs_iget_args args ;
1429
1430     args.objectid = key->on_disk_key.k_objectid ;
1431     args.dirid = key->on_disk_key.k_dir_id ;
1432     inode = iget5_locked (s, key->on_disk_key.k_objectid,
1433                    reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1434     if (!inode)
1435         return ERR_PTR(-ENOMEM) ;
1436
1437     if (inode->i_state & I_NEW) {
1438         reiserfs_read_locked_inode(inode, &args);
1439         unlock_new_inode(inode);
1440     }
1441
1442     if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
1443         /* either due to i/o error or a stale NFS handle */
1444         iput (inode);
1445         inode = NULL;
1446     }
1447     return inode;
1448 }
1449
1450 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1451 {
1452     __u32 *data = vobjp;
1453     struct cpu_key key ;
1454     struct dentry *result;
1455     struct inode *inode;
1456
1457     key.on_disk_key.k_objectid = data[0] ;
1458     key.on_disk_key.k_dir_id = data[1] ;
1459     reiserfs_write_lock(sb);
1460     inode = reiserfs_iget(sb, &key) ;
1461     if (inode && !IS_ERR(inode) && data[2] != 0 &&
1462         data[2] != inode->i_generation) {
1463             iput(inode) ;
1464             inode = NULL ;
1465     }
1466     reiserfs_write_unlock(sb);
1467     if (!inode)
1468             inode = ERR_PTR(-ESTALE);
1469     if (IS_ERR(inode))
1470             return ERR_PTR(PTR_ERR(inode));
1471     result = d_alloc_anon(inode);
1472     if (!result) {
1473             iput(inode);
1474             return ERR_PTR(-ENOMEM);
1475     }
1476     return result;
1477 }
1478
1479 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
1480                                      int len, int fhtype,
1481                                   int (*acceptable)(void *contect, struct dentry *de),
1482                                   void *context) {
1483     __u32 obj[3], parent[3];
1484
1485     /* fhtype happens to reflect the number of u32s encoded.
1486      * due to a bug in earlier code, fhtype might indicate there
1487      * are more u32s then actually fitted.
1488      * so if fhtype seems to be more than len, reduce fhtype.
1489      * Valid types are:
1490      *   2 - objectid + dir_id - legacy support
1491      *   3 - objectid + dir_id + generation
1492      *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1493      *   5 - objectid + dir_id + generation + objectid and dirid of parent
1494      *   6 - as above plus generation of directory
1495      * 6 does not fit in NFSv2 handles
1496      */
1497     if (fhtype > len) {
1498             if (fhtype != 6 || len != 5)
1499                     reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1500                            fhtype, len);
1501             fhtype = 5;
1502     }
1503
1504     obj[0] = data[0];
1505     obj[1] = data[1];
1506     if (fhtype == 3 || fhtype >= 5)
1507             obj[2] = data[2];
1508     else    obj[2] = 0; /* generation number */
1509
1510     if (fhtype >= 4) {
1511             parent[0] = data[fhtype>=5?3:2] ;
1512             parent[1] = data[fhtype>=5?4:3] ;
1513             if (fhtype == 6)
1514                     parent[2] = data[5];
1515             else    parent[2] = 0;
1516     }
1517     return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
1518                                acceptable, context);
1519 }
1520
1521 int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
1522     struct inode *inode = dentry->d_inode ;
1523     int maxlen = *lenp;
1524
1525     if (maxlen < 3)
1526         return 255 ;
1527
1528     data[0] = inode->i_ino ;
1529     data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1530     data[2] = inode->i_generation ;
1531     *lenp = 3 ;
1532     /* no room for directory info? return what we've stored so far */
1533     if (maxlen < 5 || ! need_parent)
1534         return 3 ;
1535
1536     spin_lock(&dentry->d_lock);
1537     inode = dentry->d_parent->d_inode ;
1538     data[3] = inode->i_ino ;
1539     data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1540     *lenp = 5 ;
1541     if (maxlen >= 6) {
1542             data[5] = inode->i_generation ;
1543             *lenp = 6 ;
1544     }
1545     spin_unlock(&dentry->d_lock);
1546     return *lenp ;
1547 }
1548
1549
1550 /* looks for stat data, then copies fields to it, marks the buffer
1551    containing stat data as dirty */
1552 /* reiserfs inodes are never really dirty, since the dirty inode call
1553 ** always logs them.  This call allows the VFS inode marking routines
1554 ** to properly mark inodes for datasync and such, but only actually
1555 ** does something when called for a synchronous update.
1556 */
1557 int reiserfs_write_inode (struct inode * inode, int do_sync) {
1558     struct reiserfs_transaction_handle th ;
1559     int jbegin_count = 1 ;
1560
1561     if (inode->i_sb->s_flags & MS_RDONLY)
1562         return -EROFS;
1563     /* memory pressure can sometimes initiate write_inode calls with sync == 1,
1564     ** these cases are just when the system needs ram, not when the
1565     ** inode needs to reach disk for safety, and they can safely be
1566     ** ignored because the altered inode has already been logged.
1567     */
1568     if (do_sync && !(current->flags & PF_MEMALLOC)) {
1569         reiserfs_write_lock(inode->i_sb);
1570         if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1571             reiserfs_update_sd (&th, inode);
1572             journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1573         }
1574         reiserfs_write_unlock(inode->i_sb);
1575     }
1576     return 0;
1577 }
1578
1579 /* stat data of new object is inserted already, this inserts the item
1580    containing "." and ".." entries */
1581 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1582                                    struct inode *inode,
1583                                    struct item_head * ih, struct path * path,
1584                                    struct inode * dir)
1585 {
1586     struct super_block * sb = th->t_super;
1587     char empty_dir [EMPTY_DIR_SIZE];
1588     char * body = empty_dir;
1589     struct cpu_key key;
1590     int retval;
1591
1592     BUG_ON (!th->t_trans_id);
1593
1594     _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
1595                    le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
1596
1597     /* compose item head for new item. Directories consist of items of
1598        old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1599        is done by reiserfs_new_inode */
1600     if (old_format_only (sb)) {
1601         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1602
1603         make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1604                                 INODE_PKEY (dir)->k_dir_id,
1605                                 INODE_PKEY (dir)->k_objectid );
1606     } else {
1607         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1608
1609         make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1610                                 INODE_PKEY (dir)->k_dir_id,
1611                                 INODE_PKEY (dir)->k_objectid );
1612     }
1613
1614     /* look for place in the tree for new item */
1615     retval = search_item (sb, &key, path);
1616     if (retval == IO_ERROR) {
1617         reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
1618                           "i/o failure occurred creating new directory");
1619         return -EIO;
1620     }
1621     if (retval == ITEM_FOUND) {
1622         pathrelse (path);
1623         reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
1624                           "object with this key exists (%k)", &(ih->ih_key));
1625         return -EEXIST;
1626     }
1627
1628     /* insert item, that is empty directory item */
1629     return reiserfs_insert_item (th, path, &key, ih, inode, body);
1630 }
1631
1632
1633 /* stat data of object has been inserted, this inserts the item
1634    containing the body of symlink */
1635 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1636                                  struct inode *inode,   /* Inode of symlink */
1637                                  struct item_head * ih,
1638                                  struct path * path, const char * symname, int item_len)
1639 {
1640     struct super_block * sb = th->t_super;
1641     struct cpu_key key;
1642     int retval;
1643
1644     BUG_ON (!th->t_trans_id);
1645
1646     _make_cpu_key (&key, KEY_FORMAT_3_5,
1647                    le32_to_cpu (ih->ih_key.k_dir_id),
1648                    le32_to_cpu (ih->ih_key.k_objectid),
1649                    1, TYPE_DIRECT, 3/*key length*/);
1650
1651     make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
1652
1653     /* look for place in the tree for new item */
1654     retval = search_item (sb, &key, path);
1655     if (retval == IO_ERROR) {
1656         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: "
1657                           "i/o failure occurred creating new symlink");
1658         return -EIO;
1659     }
1660     if (retval == ITEM_FOUND) {
1661         pathrelse (path);
1662         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1663                           "object with this key exists (%k)", &(ih->ih_key));
1664         return -EEXIST;
1665     }
1666
1667     /* insert item, that is body of symlink */
1668     return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1669 }
1670
1671
1672 /* inserts the stat data into the tree, and then calls
1673    reiserfs_new_directory (to insert ".", ".." item if new object is
1674    directory) or reiserfs_new_symlink (to insert symlink body if new
1675    object is symlink) or nothing (if new object is regular file)
1676
1677    NOTE! uid and gid must already be set in the inode.  If we return
1678    non-zero due to an error, we have to drop the quota previously allocated
1679    for the fresh inode.  This can only be done outside a transaction, so
1680    if we return non-zero, we also end the transaction.  */
1681 int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
1682                         struct inode * dir, int mode,
1683                         const char * symname,
1684                         /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1685                            strlen (symname) for symlinks)*/
1686                          loff_t i_size, struct dentry *dentry,
1687                          struct inode *inode)
1688 {
1689     struct super_block * sb;
1690     INITIALIZE_PATH (path_to_key);
1691     struct cpu_key key;
1692     struct item_head ih;
1693     struct stat_data sd;
1694     int retval;
1695     int err;
1696
1697     BUG_ON (!th->t_trans_id);
1698
1699     if (DQUOT_ALLOC_INODE(inode)) {
1700         err = -EDQUOT;
1701         goto out_end_trans;
1702     }
1703     if (!dir || !dir->i_nlink) {
1704         err = -EPERM;
1705         goto out_bad_inode;
1706     }
1707
1708     sb = dir->i_sb;
1709
1710     /* item head of new item */
1711     ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1712     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
1713     if (!ih.ih_key.k_objectid) {
1714         err = -ENOMEM;
1715         goto out_bad_inode ;
1716     }
1717     if (old_format_only (sb))
1718         /* not a perfect generation count, as object ids can be reused, but
1719         ** this is as good as reiserfs can do right now.
1720         ** note that the private part of inode isn't filled in yet, we have
1721         ** to use the directory.
1722         */
1723         inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
1724     else
1725 #if defined( USE_INODE_GENERATION_COUNTER )
1726         inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1727 #else
1728         inode->i_generation = ++event;
1729 #endif
1730
1731     /* fill stat data */
1732     inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
1733
1734     /* uid and gid must already be set by the caller for quota init */
1735
1736     /* symlink cannot be immutable or append only, right? */
1737     if( S_ISLNK( inode -> i_mode ) )
1738             inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
1739
1740     inode->i_mtime = inode->i_atime = inode->i_ctime =
1741             CURRENT_TIME_SEC;
1742     inode->i_size = i_size;
1743     inode->i_blocks = 0;
1744     inode->i_bytes = 0;
1745     REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1746       U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1747
1748     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1749     REISERFS_I(inode)->i_flags = 0;
1750     REISERFS_I(inode)->i_prealloc_block = 0;
1751     REISERFS_I(inode)->i_prealloc_count = 0;
1752     REISERFS_I(inode)->i_trans_id = 0;
1753     REISERFS_I(inode)->i_jl = NULL;
1754     REISERFS_I(inode)->i_attrs =
1755         REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1756     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
1757     REISERFS_I(inode)->i_acl_access = NULL;
1758     REISERFS_I(inode)->i_acl_default = NULL;
1759     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1760
1761     if (old_format_only (sb))
1762         make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1763     else
1764         make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1765
1766     /* key to search for correct place for new stat data */
1767     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
1768                    le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
1769
1770     /* find proper place for inserting of stat data */
1771     retval = search_item (sb, &key, &path_to_key);
1772     if (retval == IO_ERROR) {
1773         err = -EIO;
1774         goto out_bad_inode;
1775     }
1776     if (retval == ITEM_FOUND) {
1777         pathrelse (&path_to_key);
1778         err = -EEXIST;
1779         goto out_bad_inode;
1780     }
1781     if (old_format_only (sb)) {
1782         if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1783             pathrelse (&path_to_key);
1784             /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1785             err = -EINVAL;
1786             goto out_bad_inode;
1787         }
1788         inode2sd_v1 (&sd, inode, inode->i_size);
1789     } else {
1790         inode2sd (&sd, inode, inode->i_size);
1791     }
1792     // these do not go to on-disk stat data
1793     inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1794     inode->i_blksize = reiserfs_default_io_size;
1795
1796     // store in in-core inode the key of stat data and version all
1797     // object items will have (directory items will have old offset
1798     // format, other new objects will consist of new items)
1799     memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
1800     if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
1801         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1802     else
1803         set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1804     if (old_format_only (sb))
1805         set_inode_sd_version (inode, STAT_DATA_V1);
1806     else
1807         set_inode_sd_version (inode, STAT_DATA_V2);
1808
1809     /* insert the stat data into the tree */
1810 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1811     if (REISERFS_I(dir)->new_packing_locality)
1812         th->displace_new_blocks = 1;
1813 #endif
1814     retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1815     if (retval) {
1816         err = retval;
1817         reiserfs_check_path(&path_to_key) ;
1818         goto out_bad_inode;
1819     }
1820
1821 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1822     if (!th->displace_new_blocks)
1823         REISERFS_I(dir)->new_packing_locality = 0;
1824 #endif
1825     if (S_ISDIR(mode)) {
1826         /* insert item with "." and ".." */
1827         retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1828     }
1829
1830     if (S_ISLNK(mode)) {
1831         /* insert body of symlink */
1832         if (!old_format_only (sb))
1833             i_size = ROUND_UP(i_size);
1834         retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1835     }
1836     if (retval) {
1837         err = retval;
1838         reiserfs_check_path(&path_to_key) ;
1839         journal_end(th, th->t_super, th->t_blocks_allocated);
1840         goto out_inserted_sd;
1841     }
1842
1843     /* XXX CHECK THIS */
1844     if (reiserfs_posixacl (inode->i_sb)) {
1845         retval = reiserfs_inherit_default_acl (dir, dentry, inode);
1846         if (retval) {
1847             err = retval;
1848             reiserfs_check_path(&path_to_key) ;
1849             journal_end(th, th->t_super, th->t_blocks_allocated);
1850             goto out_inserted_sd;
1851         }
1852     } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1853         reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
1854                           "but vfs thinks they are!");
1855     } else if (is_reiserfs_priv_object (dir)) {
1856         reiserfs_mark_inode_private (inode);
1857     }
1858
1859     insert_inode_hash (inode);
1860     reiserfs_update_sd(th, inode);
1861     reiserfs_check_path(&path_to_key) ;
1862
1863     return 0;
1864
1865 /* it looks like you can easily compress these two goto targets into
1866  * one.  Keeping it like this doesn't actually hurt anything, and they
1867  * are place holders for what the quota code actually needs.
1868  */
1869 out_bad_inode:
1870     /* Invalidate the object, nothing was inserted yet */
1871     INODE_PKEY(inode)->k_objectid = 0;
1872
1873     /* Quota change must be inside a transaction for journaling */
1874     DQUOT_FREE_INODE(inode);
1875
1876 out_end_trans:
1877     journal_end(th, th->t_super, th->t_blocks_allocated) ;
1878     /* Drop can be outside and it needs more credits so it's better to have it outside */
1879     DQUOT_DROP(inode);
1880     inode->i_flags |= S_NOQUOTA;
1881     make_bad_inode(inode);
1882
1883 out_inserted_sd:
1884     inode->i_nlink = 0;
1885     th->t_trans_id = 0; /* so the caller can't use this handle later */
1886     iput(inode);
1887     return err;
1888 }
1889
1890 /*
1891 ** finds the tail page in the page cache,
1892 ** reads the last block in.
1893 **
1894 ** On success, page_result is set to a locked, pinned page, and bh_result
1895 ** is set to an up to date buffer for the last block in the file.  returns 0.
1896 **
1897 ** tail conversion is not done, so bh_result might not be valid for writing
1898 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1899 ** trying to write the block.
1900 **
1901 ** on failure, nonzero is returned, page_result and bh_result are untouched.
1902 */
1903 static int grab_tail_page(struct inode *p_s_inode,
1904                           struct page **page_result,
1905                           struct buffer_head **bh_result) {
1906
1907     /* we want the page with the last byte in the file,
1908     ** not the page that will hold the next byte for appending
1909     */
1910     unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
1911     unsigned long pos = 0 ;
1912     unsigned long start = 0 ;
1913     unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
1914     unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
1915     struct buffer_head *bh ;
1916     struct buffer_head *head ;
1917     struct page * page ;
1918     int error ;
1919
1920     /* we know that we are only called with inode->i_size > 0.
1921     ** we also know that a file tail can never be as big as a block
1922     ** If i_size % blocksize == 0, our file is currently block aligned
1923     ** and it won't need converting or zeroing after a truncate.
1924     */
1925     if ((offset & (blocksize - 1)) == 0) {
1926         return -ENOENT ;
1927     }
1928     page = grab_cache_page(p_s_inode->i_mapping, index) ;
1929     error = -ENOMEM ;
1930     if (!page) {
1931         goto out ;
1932     }
1933     /* start within the page of the last block in the file */
1934     start = (offset / blocksize) * blocksize ;
1935
1936     error = block_prepare_write(page, start, offset,
1937                                 reiserfs_get_block_create_0) ;
1938     if (error)
1939         goto unlock ;
1940
1941     head = page_buffers(page) ;
1942     bh = head;
1943     do {
1944         if (pos >= start) {
1945             break ;
1946         }
1947         bh = bh->b_this_page ;
1948         pos += blocksize ;
1949     } while(bh != head) ;
1950
1951     if (!buffer_uptodate(bh)) {
1952         /* note, this should never happen, prepare_write should
1953         ** be taking care of this for us.  If the buffer isn't up to date,
1954         ** I've screwed up the code to find the buffer, or the code to
1955         ** call prepare_write
1956         */
1957         reiserfs_warning (p_s_inode->i_sb,
1958                           "clm-6000: error reading block %lu on dev %s",
1959                           bh->b_blocknr,
1960                           reiserfs_bdevname (p_s_inode->i_sb)) ;
1961         error = -EIO ;
1962         goto unlock ;
1963     }
1964     *bh_result = bh ;
1965     *page_result = page ;
1966
1967 out:
1968     return error ;
1969
1970 unlock:
1971     unlock_page(page) ;
1972     page_cache_release(page) ;
1973     return error ;
1974 }
1975
1976 /*
1977 ** vfs version of truncate file.  Must NOT be called with
1978 ** a transaction already started.
1979 **
1980 ** some code taken from block_truncate_page
1981 */
1982 int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
1983     struct reiserfs_transaction_handle th ;
1984     /* we want the offset for the first byte after the end of the file */
1985     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1986     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
1987     unsigned length ;
1988     struct page *page = NULL ;
1989     int error ;
1990     struct buffer_head *bh = NULL ;
1991
1992     reiserfs_write_lock(p_s_inode->i_sb);
1993
1994     if (p_s_inode->i_size > 0) {
1995         if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
1996             // -ENOENT means we truncated past the end of the file,
1997             // and get_block_create_0 could not find a block to read in,
1998             // which is ok.
1999             if (error != -ENOENT)
2000                 reiserfs_warning (p_s_inode->i_sb,
2001                                   "clm-6001: grab_tail_page failed %d",
2002                                   error);
2003             page = NULL ;
2004             bh = NULL ;
2005         }
2006     }
2007
2008     /* so, if page != NULL, we have a buffer head for the offset at
2009     ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2010     ** then we have an unformatted node.  Otherwise, we have a direct item,
2011     ** and no zeroing is required on disk.  We zero after the truncate,
2012     ** because the truncate might pack the item anyway
2013     ** (it will unmap bh if it packs).
2014     */
2015     /* it is enough to reserve space in transaction for 2 balancings:
2016        one for "save" link adding and another for the first
2017        cut_from_item. 1 is for update_sd */
2018     error = journal_begin (&th, p_s_inode->i_sb,
2019                            JOURNAL_PER_BALANCE_CNT * 2 + 1);
2020     if (error)
2021         goto out;
2022     reiserfs_update_inode_transaction(p_s_inode) ;
2023     if (update_timestamps)
2024             /* we are doing real truncate: if the system crashes before the last
2025                transaction of truncating gets committed - on reboot the file
2026                either appears truncated properly or not truncated at all */
2027         add_save_link (&th, p_s_inode, 1);
2028     error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
2029     if (error)
2030         goto out;
2031     error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2032     if (error)
2033         goto out;
2034
2035     if (update_timestamps) {
2036         error = remove_save_link (p_s_inode, 1/* truncate */);
2037         if (error)
2038             goto out;
2039     }
2040
2041     if (page) {
2042         length = offset & (blocksize - 1) ;
2043         /* if we are not on a block boundary */
2044         if (length) {
2045             char *kaddr;
2046
2047             length = blocksize - length ;
2048             kaddr = kmap_atomic(page, KM_USER0) ;
2049             memset(kaddr + offset, 0, length) ;
2050             flush_dcache_page(page) ;
2051             kunmap_atomic(kaddr, KM_USER0) ;
2052             if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2053                 mark_buffer_dirty(bh) ;
2054             }
2055         }
2056         unlock_page(page) ;
2057         page_cache_release(page) ;
2058     }
2059
2060     reiserfs_write_unlock(p_s_inode->i_sb);
2061     return 0;
2062 out:
2063     if (page) {
2064         unlock_page (page);
2065         page_cache_release (page);
2066     }
2067     reiserfs_write_unlock(p_s_inode->i_sb);
2068     return error;
2069 }
2070
2071 static int map_block_for_writepage(struct inode *inode,
2072                                struct buffer_head *bh_result,
2073                                unsigned long block) {
2074     struct reiserfs_transaction_handle th ;
2075     int fs_gen ;
2076     struct item_head tmp_ih ;
2077     struct item_head *ih ;
2078     struct buffer_head *bh ;
2079     __le32 *item ;
2080     struct cpu_key key ;
2081     INITIALIZE_PATH(path) ;
2082     int pos_in_item ;
2083     int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
2084     loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ;
2085     int retval ;
2086     int use_get_block = 0 ;
2087     int bytes_copied = 0 ;
2088     int copy_size ;
2089     int trans_running = 0;
2090
2091     /* catch places below that try to log something without starting a trans */
2092     th.t_trans_id = 0;
2093
2094     if (!buffer_uptodate(bh_result)) {
2095         return -EIO;
2096     }
2097
2098     kmap(bh_result->b_page) ;
2099 start_over:
2100     reiserfs_write_lock(inode->i_sb);
2101     make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
2102
2103 research:
2104     retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
2105     if (retval != POSITION_FOUND) {
2106         use_get_block = 1;
2107         goto out ;
2108     }
2109
2110     bh = get_last_bh(&path) ;
2111     ih = get_ih(&path) ;
2112     item = get_item(&path) ;
2113     pos_in_item = path.pos_in_item ;
2114
2115     /* we've found an unformatted node */
2116     if (indirect_item_found(retval, ih)) {
2117         if (bytes_copied > 0) {
2118             reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
2119                               bytes_copied) ;
2120         }
2121         if (!get_block_num(item, pos_in_item)) {
2122             /* crap, we are writing to a hole */
2123             use_get_block = 1;
2124             goto out ;
2125         }
2126         set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
2127     } else if (is_direct_le_ih(ih)) {
2128         char *p ;
2129         p = page_address(bh_result->b_page) ;
2130         p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ;
2131         copy_size = ih_item_len(ih) - pos_in_item;
2132
2133         fs_gen = get_generation(inode->i_sb) ;
2134         copy_item_head(&tmp_ih, ih) ;
2135
2136         if (!trans_running) {
2137             /* vs-3050 is gone, no need to drop the path */
2138             retval = journal_begin(&th, inode->i_sb, jbegin_count) ;
2139             if (retval)
2140                 goto out;
2141             reiserfs_update_inode_transaction(inode) ;
2142             trans_running = 1;
2143             if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
2144                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2145                 goto research;
2146             }
2147         }
2148
2149         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
2150
2151         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
2152             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2153             goto research;
2154         }
2155
2156         memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
2157
2158         journal_mark_dirty(&th, inode->i_sb, bh) ;
2159         bytes_copied += copy_size ;
2160         set_block_dev_mapped(bh_result, 0, inode);
2161
2162         /* are there still bytes left? */
2163         if (bytes_copied < bh_result->b_size &&
2164             (byte_offset + bytes_copied) < inode->i_size) {
2165             set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
2166             goto research ;
2167         }
2168     } else {
2169         reiserfs_warning (inode->i_sb,
2170                           "clm-6003: bad item inode %lu, device %s",
2171                           inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
2172         retval = -EIO ;
2173         goto out ;
2174     }
2175     retval = 0 ;
2176
2177 out:
2178     pathrelse(&path) ;
2179     if (trans_running) {
2180         int err = journal_end(&th, inode->i_sb, jbegin_count) ;
2181         if (err)
2182             retval = err;
2183         trans_running = 0;
2184     }
2185     reiserfs_write_unlock(inode->i_sb);
2186
2187     /* this is where we fill in holes in the file. */
2188     if (use_get_block) {
2189         retval = reiserfs_get_block(inode, block, bh_result,
2190                                     GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
2191                                     GET_BLOCK_NO_DANGLE);
2192         if (!retval) {
2193             if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
2194                 /* get_block failed to find a mapped unformatted node. */
2195                 use_get_block = 0 ;
2196                 goto start_over ;
2197             }
2198         }
2199     }
2200     kunmap(bh_result->b_page) ;
2201
2202     if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2203         /* we've copied data from the page into the direct item, so the
2204          * buffer in the page is now clean, mark it to reflect that.
2205          */
2206         lock_buffer(bh_result);
2207         clear_buffer_dirty(bh_result);
2208         unlock_buffer(bh_result);
2209     }
2210     return retval ;
2211 }
2212
2213 /*
2214  * mason@suse.com: updated in 2.5.54 to follow the same general io
2215  * start/recovery path as __block_write_full_page, along with special
2216  * code to handle reiserfs tails.
2217  */
2218 static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
2219     struct inode *inode = page->mapping->host ;
2220     unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
2221     int error = 0;
2222     unsigned long block ;
2223     struct buffer_head *head, *bh;
2224     int partial = 0 ;
2225     int nr = 0;
2226     int checked = PageChecked(page);
2227     struct reiserfs_transaction_handle th;
2228     struct super_block *s = inode->i_sb;
2229     int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2230     th.t_trans_id = 0;
2231
2232     /* The page dirty bit is cleared before writepage is called, which
2233      * means we have to tell create_empty_buffers to make dirty buffers
2234      * The page really should be up to date at this point, so tossing
2235      * in the BH_Uptodate is just a sanity check.
2236      */
2237     if (!page_has_buffers(page)) {
2238         create_empty_buffers(page, s->s_blocksize,
2239                             (1 << BH_Dirty) | (1 << BH_Uptodate));
2240     }
2241     head = page_buffers(page) ;
2242
2243     /* last page in the file, zero out any contents past the
2244     ** last byte in the file
2245     */
2246     if (page->index >= end_index) {
2247         char *kaddr;
2248         unsigned last_offset;
2249
2250         last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
2251         /* no file contents in this page */
2252         if (page->index >= end_index + 1 || !last_offset) {
2253             unlock_page(page);
2254             return 0;
2255         }
2256         kaddr = kmap_atomic(page, KM_USER0);
2257         memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
2258         flush_dcache_page(page) ;
2259         kunmap_atomic(kaddr, KM_USER0) ;
2260     }
2261     bh = head ;
2262     block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
2263     /* first map all the buffers, logging any direct items we find */
2264     do {
2265         if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2266            (buffer_mapped(bh) && bh->b_blocknr == 0))) {
2267             /* not mapped yet, or it points to a direct item, search
2268              * the btree for the mapping info, and log any direct
2269              * items found
2270              */
2271             if ((error = map_block_for_writepage(inode, bh, block))) {
2272                 goto fail ;
2273             }
2274         }
2275         bh = bh->b_this_page;
2276         block++;
2277     } while(bh != head) ;
2278
2279     /*
2280      * we start the transaction after map_block_for_writepage,
2281      * because it can create holes in the file (an unbounded operation).
2282      * starting it here, we can make a reliable estimate for how many
2283      * blocks we're going to log
2284      */
2285     if (checked) {
2286         ClearPageChecked(page);
2287         reiserfs_write_lock(s);
2288         error = journal_begin(&th, s, bh_per_page + 1);
2289         if (error) {
2290             reiserfs_write_unlock(s);
2291             goto fail;
2292         }
2293         reiserfs_update_inode_transaction(inode);
2294     }
2295     /* now go through and lock any dirty buffers on the page */
2296     do {
2297         get_bh(bh);
2298         if (!buffer_mapped(bh))
2299             continue;
2300         if (buffer_mapped(bh) && bh->b_blocknr == 0)
2301             continue;
2302
2303         if (checked) {
2304             reiserfs_prepare_for_journal(s, bh, 1);
2305             journal_mark_dirty(&th, s, bh);
2306             continue;
2307         }
2308         /* from this point on, we know the buffer is mapped to a
2309          * real block and not a direct item
2310          */
2311         if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2312             lock_buffer(bh);
2313         } else {
2314             if (test_set_buffer_locked(bh)) {
2315                 redirty_page_for_writepage(wbc, page);
2316                 continue;
2317             }
2318         }
2319         if (test_clear_buffer_dirty(bh)) {
2320             mark_buffer_async_write(bh);
2321         } else {
2322             unlock_buffer(bh);
2323         }
2324     } while((bh = bh->b_this_page) != head);
2325
2326     if (checked) {
2327         error = journal_end(&th, s, bh_per_page + 1);
2328         reiserfs_write_unlock(s);
2329         if (error)
2330             goto fail;
2331     }
2332     BUG_ON(PageWriteback(page));
2333     set_page_writeback(page);
2334     unlock_page(page);
2335
2336     /*
2337      * since any buffer might be the only dirty buffer on the page,
2338      * the first submit_bh can bring the page out of writeback.
2339      * be careful with the buffers.
2340      */
2341     do {
2342         struct buffer_head *next = bh->b_this_page;
2343         if (buffer_async_write(bh)) {
2344             submit_bh(WRITE, bh);
2345             nr++;
2346         }
2347         put_bh(bh);
2348         bh = next;
2349     } while(bh != head);
2350
2351     error = 0;
2352 done:
2353     if (nr == 0) {
2354         /*
2355          * if this page only had a direct item, it is very possible for
2356          * no io to be required without there being an error.  Or,
2357          * someone else could have locked them and sent them down the
2358          * pipe without locking the page
2359          */
2360         bh = head ;
2361         do {
2362             if (!buffer_uptodate(bh)) {
2363                 partial = 1;
2364                 break;
2365             }
2366             bh = bh->b_this_page;
2367         } while(bh != head);
2368         if (!partial)
2369             SetPageUptodate(page);
2370         end_page_writeback(page);
2371     }
2372     return error;
2373
2374 fail:
2375     /* catches various errors, we need to make sure any valid dirty blocks
2376      * get to the media.  The page is currently locked and not marked for
2377      * writeback
2378      */
2379     ClearPageUptodate(page);
2380     bh = head;
2381     do {
2382         get_bh(bh);
2383         if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2384             lock_buffer(bh);
2385             mark_buffer_async_write(bh);
2386         } else {
2387             /*
2388              * clear any dirty bits that might have come from getting
2389              * attached to a dirty page
2390              */
2391              clear_buffer_dirty(bh);
2392         }
2393         bh = bh->b_this_page;
2394     } while(bh != head);
2395     SetPageError(page);
2396     BUG_ON(PageWriteback(page));
2397     set_page_writeback(page);
2398     unlock_page(page);
2399     do {
2400         struct buffer_head *next = bh->b_this_page;
2401         if (buffer_async_write(bh)) {
2402             clear_buffer_dirty(bh);
2403             submit_bh(WRITE, bh);
2404             nr++;
2405         }
2406         put_bh(bh);
2407         bh = next;
2408     } while(bh != head);
2409     goto done;
2410 }
2411
2412
2413 static int reiserfs_readpage (struct file *f, struct page * page)
2414 {
2415     return block_read_full_page (page, reiserfs_get_block);
2416 }
2417
2418
2419 static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
2420 {
2421     struct inode *inode = page->mapping->host ;
2422     reiserfs_wait_on_write_block(inode->i_sb) ;
2423     return reiserfs_write_full_page(page, wbc) ;
2424 }
2425
2426 static int reiserfs_prepare_write(struct file *f, struct page *page,
2427                            unsigned from, unsigned to) {
2428     struct inode *inode = page->mapping->host ;
2429     int ret;
2430     int old_ref = 0;
2431
2432     reiserfs_wait_on_write_block(inode->i_sb) ;
2433     fix_tail_page_for_writing(page) ;
2434     if (reiserfs_transaction_running(inode->i_sb)) {
2435         struct reiserfs_transaction_handle *th;
2436         th = (struct reiserfs_transaction_handle *)current->journal_info;
2437         BUG_ON (!th->t_refcount);
2438         BUG_ON (!th->t_trans_id);
2439         old_ref = th->t_refcount;
2440         th->t_refcount++;
2441     }
2442
2443     ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
2444     if (ret && reiserfs_transaction_running(inode->i_sb)) {
2445         struct reiserfs_transaction_handle *th = current->journal_info;
2446         /* this gets a little ugly.  If reiserfs_get_block returned an
2447          * error and left a transacstion running, we've got to close it,
2448          * and we've got to free handle if it was a persistent transaction.
2449          *
2450          * But, if we had nested into an existing transaction, we need
2451          * to just drop the ref count on the handle.
2452          *
2453          * If old_ref == 0, the transaction is from reiserfs_get_block,
2454          * and it was a persistent trans.  Otherwise, it was nested above.
2455          */
2456         if (th->t_refcount > old_ref) {
2457             if (old_ref)
2458                 th->t_refcount--;
2459             else {
2460                 int err;
2461                 reiserfs_write_lock(inode->i_sb);
2462                 err = reiserfs_end_persistent_transaction(th);
2463                 reiserfs_write_unlock(inode->i_sb);
2464                 if (err)
2465                     ret = err;
2466             }
2467         }
2468     }
2469     return ret;
2470
2471 }
2472
2473
2474 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
2475   return generic_block_bmap(as, block, reiserfs_bmap) ;
2476 }
2477
2478 static int reiserfs_commit_write(struct file *f, struct page *page,
2479                                  unsigned from, unsigned to) {
2480     struct inode *inode = page->mapping->host ;
2481     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2482     int ret = 0;
2483     int update_sd = 0;
2484     struct reiserfs_transaction_handle *th = NULL;
2485
2486     reiserfs_wait_on_write_block(inode->i_sb) ;
2487     if (reiserfs_transaction_running(inode->i_sb)) {
2488         th = current->journal_info;
2489     }
2490     reiserfs_commit_page(inode, page, from, to);
2491
2492     /* generic_commit_write does this for us, but does not update the
2493     ** transaction tracking stuff when the size changes.  So, we have
2494     ** to do the i_size updates here.
2495     */
2496     if (pos > inode->i_size) {
2497         struct reiserfs_transaction_handle myth ;
2498         reiserfs_write_lock(inode->i_sb);
2499         /* If the file have grown beyond the border where it
2500            can have a tail, unmark it as needing a tail
2501            packing */
2502         if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
2503              (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
2504             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
2505
2506         ret = journal_begin(&myth, inode->i_sb, 1) ;
2507         if (ret) {
2508             reiserfs_write_unlock(inode->i_sb);
2509             goto journal_error;
2510         }
2511         reiserfs_update_inode_transaction(inode) ;
2512         inode->i_size = pos ;
2513         reiserfs_update_sd(&myth, inode) ;
2514         update_sd = 1;
2515         ret = journal_end(&myth, inode->i_sb, 1) ;
2516         reiserfs_write_unlock(inode->i_sb);
2517         if (ret)
2518             goto journal_error;
2519     }
2520     if (th) {
2521         reiserfs_write_lock(inode->i_sb);
2522         if (!update_sd)
2523             reiserfs_update_sd(th, inode) ;
2524         ret = reiserfs_end_persistent_transaction(th);
2525         reiserfs_write_unlock(inode->i_sb);
2526         if (ret)
2527             goto out;
2528     }
2529
2530     /* we test for O_SYNC here so we can commit the transaction
2531     ** for any packed tails the file might have had
2532     */
2533     if (f && (f->f_flags & O_SYNC)) {
2534         reiserfs_write_lock(inode->i_sb);
2535         ret = reiserfs_commit_for_inode(inode) ;
2536         reiserfs_write_unlock(inode->i_sb);
2537     }
2538 out:
2539     return ret ;
2540
2541 journal_error:
2542     if (th) {
2543         reiserfs_write_lock(inode->i_sb);
2544         if (!update_sd)
2545             reiserfs_update_sd(th, inode) ;
2546         ret = reiserfs_end_persistent_transaction(th);
2547         reiserfs_write_unlock(inode->i_sb);
2548     }
2549
2550     return ret;
2551 }
2552
2553 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
2554 {
2555         if( reiserfs_attrs( inode -> i_sb ) ) {
2556                 if( sd_attrs & REISERFS_SYNC_FL )
2557                         inode -> i_flags |= S_SYNC;
2558                 else
2559                         inode -> i_flags &= ~S_SYNC;
2560                 if( sd_attrs & REISERFS_IMMUTABLE_FL )
2561                         inode -> i_flags |= S_IMMUTABLE;
2562                 else
2563                         inode -> i_flags &= ~S_IMMUTABLE;
2564                 if( sd_attrs & REISERFS_APPEND_FL )
2565                         inode -> i_flags |= S_APPEND;
2566                 else
2567                         inode -> i_flags &= ~S_APPEND;
2568                 if( sd_attrs & REISERFS_NOATIME_FL )
2569                         inode -> i_flags |= S_NOATIME;
2570                 else
2571                         inode -> i_flags &= ~S_NOATIME;
2572                 if( sd_attrs & REISERFS_NOTAIL_FL )
2573                         REISERFS_I(inode)->i_flags |= i_nopack_mask;
2574                 else
2575                         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2576         }
2577 }
2578
2579 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
2580 {
2581         if( reiserfs_attrs( inode -> i_sb ) ) {
2582                 if( inode -> i_flags & S_IMMUTABLE )
2583                         *sd_attrs |= REISERFS_IMMUTABLE_FL;
2584                 else
2585                         *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2586                 if( inode -> i_flags & S_SYNC )
2587                         *sd_attrs |= REISERFS_SYNC_FL;
2588                 else
2589                         *sd_attrs &= ~REISERFS_SYNC_FL;
2590                 if( inode -> i_flags & S_NOATIME )
2591                         *sd_attrs |= REISERFS_NOATIME_FL;
2592                 else
2593                         *sd_attrs &= ~REISERFS_NOATIME_FL;
2594                 if( REISERFS_I(inode)->i_flags & i_nopack_mask )
2595                         *sd_attrs |= REISERFS_NOTAIL_FL;
2596                 else
2597                         *sd_attrs &= ~REISERFS_NOTAIL_FL;
2598         }
2599 }
2600
2601 /* decide if this buffer needs to stay around for data logging or ordered
2602 ** write purposes
2603 */
2604 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2605 {
2606     int ret = 1 ;
2607     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2608
2609     spin_lock(&j->j_dirty_buffers_lock) ;
2610     if (!buffer_mapped(bh)) {
2611         goto free_jh;
2612     }
2613     /* the page is locked, and the only places that log a data buffer
2614      * also lock the page.
2615      */
2616     if (reiserfs_file_data_log(inode)) {
2617         /*
2618          * very conservative, leave the buffer pinned if
2619          * anyone might need it.
2620          */
2621         if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2622             ret = 0 ;
2623         }
2624     } else
2625     if (buffer_dirty(bh) || buffer_locked(bh)) {
2626         struct reiserfs_journal_list *jl;
2627         struct reiserfs_jh *jh = bh->b_private;
2628
2629         /* why is this safe?
2630          * reiserfs_setattr updates i_size in the on disk
2631          * stat data before allowing vmtruncate to be called.
2632          *
2633          * If buffer was put onto the ordered list for this
2634          * transaction, we know for sure either this transaction
2635          * or an older one already has updated i_size on disk,
2636          * and this ordered data won't be referenced in the file
2637          * if we crash.
2638          *
2639          * if the buffer was put onto the ordered list for an older
2640          * transaction, we need to leave it around
2641          */
2642         if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2643             ret = 0;
2644     }
2645 free_jh:
2646     if (ret && bh->b_private) {
2647         reiserfs_free_jh(bh);
2648     }
2649     spin_unlock(&j->j_dirty_buffers_lock) ;
2650     return ret ;
2651 }
2652
2653 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2654 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2655 {
2656     struct buffer_head *head, *bh, *next;
2657     struct inode *inode = page->mapping->host;
2658     unsigned int curr_off = 0;
2659     int ret = 1;
2660
2661     BUG_ON(!PageLocked(page));
2662
2663     if (offset == 0)
2664         ClearPageChecked(page);
2665
2666     if (!page_has_buffers(page))
2667         goto out;
2668
2669     head = page_buffers(page);
2670     bh = head;
2671     do {
2672         unsigned int next_off = curr_off + bh->b_size;
2673         next = bh->b_this_page;
2674
2675         /*
2676          * is this block fully invalidated?
2677          */
2678         if (offset <= curr_off) {
2679             if (invalidatepage_can_drop(inode, bh))
2680                 reiserfs_unmap_buffer(bh);
2681             else
2682                 ret = 0;
2683         }
2684         curr_off = next_off;
2685         bh = next;
2686     } while (bh != head);
2687
2688     /*
2689      * We release buffers only if the entire page is being invalidated.
2690      * The get_block cached value has been unconditionally invalidated,
2691      * so real IO is not possible anymore.
2692      */
2693     if (!offset && ret)
2694         ret = try_to_release_page(page, 0);
2695 out:
2696     return ret;
2697 }
2698
2699 static int reiserfs_set_page_dirty(struct page *page) {
2700     struct inode *inode = page->mapping->host;
2701     if (reiserfs_file_data_log(inode)) {
2702         SetPageChecked(page);
2703         return __set_page_dirty_nobuffers(page);
2704     }
2705     return __set_page_dirty_buffers(page);
2706 }
2707
2708 /*
2709  * Returns 1 if the page's buffers were dropped.  The page is locked.
2710  *
2711  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2712  * in the buffers at page_buffers(page).
2713  *
2714  * even in -o notail mode, we can't be sure an old mount without -o notail
2715  * didn't create files with tails.
2716  */
2717 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
2718 {
2719     struct inode *inode = page->mapping->host ;
2720     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2721     struct buffer_head *head ;
2722     struct buffer_head *bh ;
2723     int ret = 1 ;
2724
2725     WARN_ON(PageChecked(page));
2726     spin_lock(&j->j_dirty_buffers_lock) ;
2727     head = page_buffers(page) ;
2728     bh = head ;
2729     do {
2730         if (bh->b_private) {
2731             if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2732                 reiserfs_free_jh(bh);
2733             } else {
2734                 ret = 0 ;
2735                 break ;
2736             }
2737         }
2738         bh = bh->b_this_page ;
2739     } while (bh != head) ;
2740     if (ret)
2741         ret = try_to_free_buffers(page) ;
2742     spin_unlock(&j->j_dirty_buffers_lock) ;
2743     return ret ;
2744 }
2745
2746 /* We thank Mingming Cao for helping us understand in great detail what
2747    to do in this section of the code. */
2748 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2749                 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2750 {
2751     struct file *file = iocb->ki_filp;
2752     struct inode *inode = file->f_mapping->host;
2753
2754     return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2755                         offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
2756 }
2757
2758 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
2759     struct inode *inode = dentry->d_inode ;
2760     int error ;
2761     unsigned int ia_valid = attr->ia_valid;
2762     reiserfs_write_lock(inode->i_sb);
2763     if (attr->ia_valid & ATTR_SIZE) {
2764         /* version 2 items will be caught by the s_maxbytes check
2765         ** done for us in vmtruncate
2766         */
2767         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2768             attr->ia_size > MAX_NON_LFS) {
2769             error = -EFBIG ;
2770             goto out;
2771         }
2772         /* fill in hole pointers in the expanding truncate case. */
2773         if (attr->ia_size > inode->i_size) {
2774             error = generic_cont_expand(inode, attr->ia_size) ;
2775             if (REISERFS_I(inode)->i_prealloc_count > 0) {
2776                 int err;
2777                 struct reiserfs_transaction_handle th ;
2778                 /* we're changing at most 2 bitmaps, inode + super */
2779                 err = journal_begin(&th, inode->i_sb, 4) ;
2780                 if (!err) {
2781                     reiserfs_discard_prealloc (&th, inode);
2782                     err = journal_end(&th, inode->i_sb, 4) ;
2783                 }
2784                 if (err)
2785                     error = err;
2786             }
2787             if (error)
2788                 goto out;
2789         }
2790     }
2791
2792     if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2793          ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2794         (get_inode_sd_version (inode) == STAT_DATA_V1)) {
2795                 /* stat data of format v3.5 has 16 bit uid and gid */
2796             error = -EINVAL;
2797             goto out;
2798         }
2799
2800     error = inode_change_ok(inode, attr) ;
2801     if (!error) {
2802         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2803             (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2804                 error = reiserfs_chown_xattrs (inode, attr);
2805
2806                 if (!error) {
2807                     struct reiserfs_transaction_handle th;
2808                     int jbegin_count = 2*(REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb)+REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb))+2;
2809
2810                     /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
2811                     error = journal_begin(&th, inode->i_sb, jbegin_count);
2812                     if (error)
2813                         goto out;
2814                     error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2815                     if (error) {
2816                         journal_end(&th, inode->i_sb, jbegin_count);
2817                         goto out;
2818                     }
2819                     /* Update corresponding info in inode so that everything is in
2820                      * one transaction */
2821                     if (attr->ia_valid & ATTR_UID)
2822                         inode->i_uid = attr->ia_uid;
2823                     if (attr->ia_valid & ATTR_GID)
2824                         inode->i_gid = attr->ia_gid;
2825                     mark_inode_dirty(inode);
2826                     error = journal_end(&th, inode->i_sb, jbegin_count);
2827                 }
2828         }
2829         if (!error)
2830             error = inode_setattr(inode, attr) ;
2831     }
2832
2833
2834     if (!error && reiserfs_posixacl (inode->i_sb)) {
2835         if (attr->ia_valid & ATTR_MODE)
2836             error = reiserfs_acl_chmod (inode);
2837     }
2838
2839 out:
2840     reiserfs_write_unlock(inode->i_sb);
2841     return error ;
2842 }
2843
2844
2845
2846 struct address_space_operations reiserfs_address_space_operations = {
2847     .writepage = reiserfs_writepage,
2848     .readpage = reiserfs_readpage,
2849     .readpages = reiserfs_readpages,
2850     .releasepage = reiserfs_releasepage,
2851     .invalidatepage = reiserfs_invalidatepage,
2852     .sync_page = block_sync_page,
2853     .prepare_write = reiserfs_prepare_write,
2854     .commit_write = reiserfs_commit_write,
2855     .bmap = reiserfs_aop_bmap,
2856     .direct_IO = reiserfs_direct_IO,
2857     .set_page_dirty = reiserfs_set_page_dirty,
2858 } ;