fs/reiserfs/file.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5
   6 #include <linux/time.h>
   7 #include <linux/reiserfs_fs.h>
   8 #include <linux/reiserfs_acl.h>
   9 #include <linux/reiserfs_xattr.h>
  10 #include <linux/smp_lock.h>
  11 #include <asm/uaccess.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/swap.h>
  14 #include <linux/writeback.h>
  15 #include <linux/blkdev.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/quotaops.h>
  18
  19 /*
  20 ** We pack the tails of files on file close, not at the time they are written.
  21 ** This implies an unnecessary copy of the tail and an unnecessary indirect item
  22 ** insertion/balancing, for files that are written in one write.
  23 ** It avoids unnecessary tail packings (balances) for files that are written in
  24 ** multiple writes and are small enough to have tails.
  25 **
  26 ** file_release is called by the VFS layer when the file is closed.  If
  27 ** this is the last open file descriptor, and the file
  28 ** small enough to have a tail, and the tail is currently in an
  29 ** unformatted node, the tail is converted back into a direct item.
  30 **
  31 ** We use reiserfs_truncate_file to pack the tail, since it already has
  32 ** all the conditions coded.
  33 */
  34 static int reiserfs_file_release (struct inode * inode, struct file * filp)
  35 {
  36
  37     struct reiserfs_transaction_handle th ;
  38
  39     if (!S_ISREG (inode->i_mode))
  40         BUG ();
  41
  42     /* fast out for when nothing needs to be done */
  43     if ((atomic_read(&inode->i_count) > 1 ||
  44         !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
  45          !tail_has_to_be_packed(inode))       &&
  46         REISERFS_I(inode)->i_prealloc_count <= 0) {
  47         return 0;
  48     }
  49
  50     reiserfs_write_lock(inode->i_sb);
  51     down (&inode->i_sem);
  52     journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
  53     reiserfs_update_inode_transaction(inode) ;
  54
  55 #ifdef REISERFS_PREALLOCATE
  56     reiserfs_discard_prealloc (&th, inode);
  57 #endif
  58     journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
  59
  60     if (atomic_read(&inode->i_count) <= 1 &&
  61         (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
  62         tail_has_to_be_packed (inode)) {
  63         /* if regular file is released by last holder and it has been
  64            appended (we append by unformatted node only) or its direct
  65            item(s) had to be converted, then it may have to be
  66            indirect2direct converted */
  67         reiserfs_truncate_file(inode, 0) ;
  68     }
  69     up (&inode->i_sem);
  70     reiserfs_write_unlock(inode->i_sb);
  71     return 0;
  72 }
  73
  74 static void reiserfs_vfs_truncate_file(struct inode *inode) {
  75     reiserfs_truncate_file(inode, 1) ;
  76 }
  77
  78 /* Sync a reiserfs file. */
  79
  80 /*
  81  * FIXME: sync_mapping_buffers() never has anything to sync.  Can
  82  * be removed...
  83  */
  84
  85 static int reiserfs_sync_file(
  86                               struct file   * p_s_filp,
  87                               struct dentry * p_s_dentry,
  88                               int datasync
  89                               ) {
  90   struct inode * p_s_inode = p_s_dentry->d_inode;
  91   int n_err;
  92   int barrier_done;
  93
  94   if (!S_ISREG(p_s_inode->i_mode))
  95       BUG ();
  96   n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
  97   reiserfs_write_lock(p_s_inode->i_sb);
  98   barrier_done = reiserfs_commit_for_inode(p_s_inode);
  99   reiserfs_write_unlock(p_s_inode->i_sb);
 100   if (barrier_done != 1)
 101       blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
 102   return ( n_err < 0 ) ? -EIO : 0;
 103 }
 104
 105 /* I really do not want to play with memory shortage right now, so
 106    to simplify the code, we are not going to write more than this much pages at
 107    a time. This still should considerably improve performance compared to 4k
 108    at a time case. This is 32 pages of 4k size. */
 109 #define REISERFS_WRITE_PAGES_AT_A_TIME (128 * 1024) / PAGE_CACHE_SIZE
 110
 111 /* Allocates blocks for a file to fulfil write request.
 112    Maps all unmapped but prepared pages from the list.
 113    Updates metadata with newly allocated blocknumbers as needed */
 114 int reiserfs_allocate_blocks_for_region(
 115                                 struct reiserfs_transaction_handle *th,
 116                                 struct inode *inode, /* Inode we work with */
 117                                 loff_t pos, /* Writing position */
 118                                 int num_pages, /* number of pages write going
 119                                                   to touch */
 120                                 int write_bytes, /* amount of bytes to write */
 121                                 struct page **prepared_pages, /* array of
 122                                                                  prepared pages
 123                                                                */
 124                                 int blocks_to_allocate /* Amount of blocks we
 125                                                           need to allocate to
 126                                                           fit the data into file
 127                                                          */
 128                                 )
 129 {
 130     struct cpu_key key; // cpu key of item that we are going to deal with
 131     struct item_head *ih; // pointer to item head that we are going to deal with
 132     struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
 133     __u32 * item; // pointer to item we are going to deal with
 134     INITIALIZE_PATH(path); // path to item, that we are going to deal with.
 135     b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
 136     reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
 137     size_t res; // return value of various functions that we call.
 138     int curr_block; // current block used to keep track of unmapped blocks.
 139     int i; // loop counter
 140     int itempos; // position in item
 141     unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
 142                                                        // first page
 143     unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
 144     __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created.
 145     int modifying_this_item = 0; // Flag for items traversal code to keep track
 146                                  // of the fact that we already prepared
 147                                  // current block for journal
 148     int will_prealloc = 0;
 149
 150     RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");
 151
 152     /* only preallocate if this is a small write */
 153     if (REISERFS_I(inode)->i_prealloc_count ||
 154        (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
 155         blocks_to_allocate <
 156         REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
 157         will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
 158
 159     allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
 160                                         sizeof(b_blocknr_t), GFP_NOFS);
 161
 162     /* First we compose a key to point at the writing position, we want to do
 163        that outside of any locking region. */
 164     make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
 165
 166     /* If we came here, it means we absolutely need to open a transaction,
 167        since we need to allocate some blocks */
 168     reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
 169     journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
 170     reiserfs_update_inode_transaction(inode) ;
 171
 172     /* Look for the in-tree position of our write, need path for block allocator */
 173     res = search_for_position_by_key(inode->i_sb, &key, &path);
 174     if ( res == IO_ERROR ) {
 175         res = -EIO;
 176         goto error_exit;
 177     }
 178
 179     /* Allocate blocks */
 180     /* First fill in "hint" structure for block allocator */
 181     hint.th = th; // transaction handle.
 182     hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
 183     hint.inode = inode; // Inode is needed by block allocator too.
 184     hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
 185     hint.key = key.on_disk_key; // on disk key of file.
 186     hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
 187     hint.formatted_node = 0; // We are allocating blocks for unformatted node.
 188     hint.preallocate = will_prealloc;
 189
 190     /* Call block allocator to allocate blocks */
 191     res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
 192     if ( res != CARRY_ON ) {
 193         if ( res == NO_DISK_SPACE ) {
 194             /* We flush the transaction in case of no space. This way some
 195                blocks might become free */
 196             SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
 197             restart_transaction(th, inode, &path);
 198
 199             /* We might have scheduled, so search again */
 200             res = search_for_position_by_key(inode->i_sb, &key, &path);
 201             if ( res == IO_ERROR ) {
 202                 res = -EIO;
 203                 goto error_exit;
 204             }
 205
 206             /* update changed info for hint structure. */
 207             res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
 208             if ( res != CARRY_ON ) {
 209                 res = -ENOSPC;
 210                 pathrelse(&path);
 211                 goto error_exit;
 212             }
 213         } else {
 214             res = -ENOSPC;
 215             pathrelse(&path);
 216             goto error_exit;
 217         }
 218     }
 219
 220 #ifdef __BIG_ENDIAN
 221         // Too bad, I have not found any way to convert a given region from
 222         // cpu format to little endian format
 223     {
 224         int i;
 225         for ( i = 0; i < blocks_to_allocate ; i++)
 226             allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
 227     }
 228 #endif
 229
 230     /* Blocks allocating well might have scheduled and tree might have changed,
 231        let's search the tree again */
 232     /* find where in the tree our write should go */
 233     res = search_for_position_by_key(inode->i_sb, &key, &path);
 234     if ( res == IO_ERROR ) {
 235         res = -EIO;
 236         goto error_exit_free_blocks;
 237     }
 238
 239     bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
 240     ih = get_ih( &path );      // Get a pointer to last item head in path.
 241     item = get_item( &path );  // Get a pointer to last item in path
 242
 243     /* Let's see what we have found */
 244     if ( res != POSITION_FOUND ) { /* position not found, this means that we
 245                                       might need to append file with holes
 246                                       first */
 247         // Since we are writing past the file's end, we need to find out if
 248         // there is a hole that needs to be inserted before our writing
 249         // position, and how many blocks it is going to cover (we need to
 250         //  populate pointers to file blocks representing the hole with zeros)
 251
 252         {
 253             int item_offset = 1;
 254             /*
 255              * if ih is stat data, its offset is 0 and we don't want to
 256              * add 1 to pos in the hole_size calculation
 257              */
 258             if (is_statdata_le_ih(ih))
 259                 item_offset = 0;
 260             hole_size = (pos + item_offset -
 261                     (le_key_k_offset( get_inode_item_key_version(inode),
 262                     &(ih->ih_key)) +
 263                     op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
 264                     inode->i_sb->s_blocksize_bits;
 265         }
 266
 267         if ( hole_size > 0 ) {
 268             int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
 269             /* area filled with zeroes, to supply as list of zero blocknumbers
 270                We allocate it outside of loop just in case loop would spin for
 271                several iterations. */
 272             char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
 273             if ( !zeros ) {
 274                 res = -ENOMEM;
 275                 goto error_exit_free_blocks;
 276             }
 277             memset ( zeros, 0, to_paste*UNFM_P_SIZE);
 278             do {
 279                 to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
 280                 if ( is_indirect_le_ih(ih) ) {
 281                     /* Ok, there is existing indirect item already. Need to append it */
 282                     /* Calculate position past inserted item */
 283                     make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
 284                     res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
 285                     if ( res ) {
 286                         kfree(zeros);
 287                         goto error_exit_free_blocks;
 288                     }
 289                 } else if ( is_statdata_le_ih(ih) ) {
 290                     /* No existing item, create it */
 291                     /* item head for new item */
 292                     struct item_head ins_ih;
 293
 294                     /* create a key for our new item */
 295                     make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);
 296
 297                     /* Create new item head for our new item */
 298                     make_le_item_head (&ins_ih, &key, key.version, 1,
 299                                        TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
 300                                        0 /* free space */);
 301
 302                     /* Find where such item should live in the tree */
 303                     res = search_item (inode->i_sb, &key, &path);
 304                     if ( res != ITEM_NOT_FOUND ) {
 305                         /* item should not exist, otherwise we have error */
 306                         if ( res != -ENOSPC ) {
 307                             reiserfs_warning (inode->i_sb,
 308                                 "green-9008: search_by_key (%K) returned %d",
 309                                               &key, res);
 310                         }
 311                         res = -EIO;
 312                         kfree(zeros);
 313                         goto error_exit_free_blocks;
 314                     }
 315                     res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
 316                 } else {
 317                     reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
 318                 }
 319                 if ( res ) {
 320                     kfree(zeros);
 321                     goto error_exit_free_blocks;
 322                 }
 323                 /* Now we want to check if transaction is too full, and if it is
 324                    we restart it. This will also free the path. */
 325                 if (journal_transaction_should_end(th, th->t_blocks_allocated))
 326                     restart_transaction(th, inode, &path);
 327
 328                 /* Well, need to recalculate path and stuff */
 329                 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
 330                 res = search_for_position_by_key(inode->i_sb, &key, &path);
 331                 if ( res == IO_ERROR ) {
 332                     res = -EIO;
 333                     kfree(zeros);
 334                     goto error_exit_free_blocks;
 335                 }
 336                 bh=get_last_bh(&path);
 337                 ih=get_ih(&path);
 338                 item = get_item(&path);
 339                 hole_size -= to_paste;
 340             } while ( hole_size );
 341             kfree(zeros);
 342         }
 343     }
 344
 345     // Go through existing indirect items first
 346     // replace all zeroes with blocknumbers from list
 347     // Note that if no corresponding item was found, by previous search,
 348     // it means there are no existing in-tree representation for file area
 349     // we are going to overwrite, so there is nothing to scan through for holes.
 350     for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
 351 retry:
 352         if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
 353             /* We run out of data in this indirect item, let's look for another
 354                one. */
 355             /* First if we are already modifying current item, log it */
 356             if ( modifying_this_item ) {
 357                 journal_mark_dirty (th, inode->i_sb, bh);
 358                 modifying_this_item = 0;
 359             }
 360             /* Then set the key to look for a new indirect item (offset of old
 361                item is added to old item length */
 362             set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
 363             /* Search ofor position of new key in the tree. */
 364             res = search_for_position_by_key(inode->i_sb, &key, &path);
 365             if ( res == IO_ERROR) {
 366                 res = -EIO;
 367                 goto error_exit_free_blocks;
 368             }
 369             bh=get_last_bh(&path);
 370             ih=get_ih(&path);
 371             item = get_item(&path);
 372             itempos = path.pos_in_item;
 373             continue; // loop to check all kinds of conditions and so on.
 374         }
 375         /* Ok, we have correct position in item now, so let's see if it is
 376            representing file hole (blocknumber is zero) and fill it if needed */
 377         if ( !item[itempos] ) {
 378             /* Ok, a hole. Now we need to check if we already prepared this
 379                block to be journaled */
 380             while ( !modifying_this_item ) { // loop until succeed
 381                 /* Well, this item is not journaled yet, so we must prepare
 382                    it for journal first, before we can change it */
 383                 struct item_head tmp_ih; // We copy item head of found item,
 384                                          // here to detect if fs changed under
 385                                          // us while we were preparing for
 386                                          // journal.
 387                 int fs_gen; // We store fs generation here to find if someone
 388                             // changes fs under our feet
 389
 390                 copy_item_head (&tmp_ih, ih); // Remember itemhead
 391                 fs_gen = get_generation (inode->i_sb); // remember fs generation
 392                 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
 393                 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 394                     // Sigh, fs was changed under us, we need to look for new
 395                     // location of item we are working with
 396
 397                     /* unmark prepaerd area as journaled and search for it's
 398                        new position */
 399                     reiserfs_restore_prepared_buffer(inode->i_sb, bh);
 400                     res = search_for_position_by_key(inode->i_sb, &key, &path);
 401                     if ( res == IO_ERROR) {
 402                         res = -EIO;
 403                         goto error_exit_free_blocks;
 404                     }
 405                     bh=get_last_bh(&path);
 406                     ih=get_ih(&path);
 407                     item = get_item(&path);
 408                     itempos = path.pos_in_item;
 409                     goto retry;
 410                 }
 411                 modifying_this_item = 1;
 412             }
 413             item[itempos] = allocated_blocks[curr_block]; // Assign new block
 414             curr_block++;
 415         }
 416         itempos++;
 417     }
 418
 419     if ( modifying_this_item ) { // We need to log last-accessed block, if it
 420                                  // was modified, but not logged yet.
 421         journal_mark_dirty (th, inode->i_sb, bh);
 422     }
 423
 424     if ( curr_block < blocks_to_allocate ) {
 425         // Oh, well need to append to indirect item, or to create indirect item
 426         // if there weren't any
 427         if ( is_indirect_le_ih(ih) ) {
 428             // Existing indirect item - append. First calculate key for append
 429             // position. We do not need to recalculate path as it should
 430             // already point to correct place.
 431             make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
 432             res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
 433             if ( res ) {
 434                 goto error_exit_free_blocks;
 435             }
 436         } else if (is_statdata_le_ih(ih) ) {
 437             // Last found item was statdata. That means we need to create indirect item.
 438             struct item_head ins_ih; /* itemhead for new item */
 439
 440             /* create a key for our new item */
 441             make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
 442                                                             // because that's
 443                                                             // where first
 444                                                             // indirect item
 445                                                             // begins
 446             /* Create new item head for our new item */
 447             make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
 448                                (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
 449                                0 /* free space */);
 450             /* Find where such item should live in the tree */
 451             res = search_item (inode->i_sb, &key, &path);
 452             if ( res != ITEM_NOT_FOUND ) {
 453                 /* Well, if we have found such item already, or some error
 454                    occured, we need to warn user and return error */
 455                 if ( res != -ENOSPC ) {
 456                     reiserfs_warning (inode->i_sb,
 457                                       "green-9009: search_by_key (%K) "
 458                                       "returned %d", &key, res);
 459                 }
 460                 res = -EIO;
 461                 goto error_exit_free_blocks;
 462             }
 463             /* Insert item into the tree with the data as its body */
 464             res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
 465         } else {
 466             reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
 467         }
 468     }
 469
 470     // the caller is responsible for closing the transaction
 471     // unless we return an error, they are also responsible for logging
 472     // the inode.
 473     //
 474     pathrelse(&path);
 475     /*
 476      * cleanup prellocation from previous writes
 477      * if this is a partial block write
 478      */
 479     if (write_bytes & (inode->i_sb->s_blocksize -1))
 480         reiserfs_discard_prealloc(th, inode);
 481     reiserfs_write_unlock(inode->i_sb);
 482
 483     // go through all the pages/buffers and map the buffers to newly allocated
 484     // blocks (so that system knows where to write these pages later).
 485     curr_block = 0;
 486     for ( i = 0; i < num_pages ; i++ ) {
 487         struct page *page=prepared_pages[i]; //current page
 488         struct buffer_head *head = page_buffers(page);// first buffer for a page
 489         int block_start, block_end; // in-page offsets for buffers.
 490
 491         if (!page_buffers(page))
 492             reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
 493
 494         /* For each buffer in page */
 495         for(bh = head, block_start = 0; bh != head || !block_start;
 496             block_start=block_end, bh = bh->b_this_page) {
 497             if (!bh)
 498                 reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
 499             block_end = block_start+inode->i_sb->s_blocksize;
 500             if (i == 0 && block_end <= from )
 501                 /* if this buffer is before requested data to map, skip it */
 502                 continue;
 503             if (i == num_pages - 1 && block_start >= to)
 504                 /* If this buffer is after requested data to map, abort
 505                    processing of current page */
 506                 break;
 507
 508             if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
 509                 map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
 510                 curr_block++;
 511                 set_buffer_new(bh);
 512             }
 513         }
 514     }
 515
 516     RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");
 517
 518     kfree(allocated_blocks);
 519     return 0;
 520
 521 // Need to deal with transaction here.
 522 error_exit_free_blocks:
 523     pathrelse(&path);
 524     // free blocks
 525     for( i = 0; i < blocks_to_allocate; i++ )
 526         reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);
 527
 528 error_exit:
 529     reiserfs_update_sd(th, inode); // update any changes we made to blk count
 530     journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
 531     reiserfs_write_unlock(inode->i_sb);
 532     kfree(allocated_blocks);
 533
 534     return res;
 535 }
 536
 537 /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
 538 void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
 539                               int num_pages /* amount of pages */) {
 540     int i; // loop counter
 541
 542     for (i=0; i < num_pages ; i++) {
 543         struct page *page = prepared_pages[i];
 544
 545         try_to_free_buffers(page);
 546         unlock_page(page);
 547         page_cache_release(page);
 548     }
 549 }
 550
 551 /* This function will copy data from userspace to specified pages within
 552    supplied byte range */
 553 int reiserfs_copy_from_user_to_file_region(
 554                                 loff_t pos, /* In-file position */
 555                                 int num_pages, /* Number of pages affected */
 556                                 int write_bytes, /* Amount of bytes to write */
 557                                 struct page **prepared_pages, /* pointer to
 558                                                                  array to
 559                                                                  prepared pages
 560                                                                 */
 561                                 const char __user *buf /* Pointer to user-supplied
 562                                                    data*/
 563                                 )
 564 {
 565     long page_fault=0; // status of copy_from_user.
 566     int i; // loop counter.
 567     int offset; // offset in page
 568
 569     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 570         int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 571         struct page *page=prepared_pages[i]; // Current page we process.
 572
 573         fault_in_pages_readable( buf, count);
 574
 575         /* Copy data from userspace to the current page */
 576         kmap(page);
 577         page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
 578         /* Flush processor's dcache for this page */
 579         flush_dcache_page(page);
 580         kunmap(page);
 581         buf+=count;
 582         write_bytes-=count;
 583
 584         if (page_fault)
 585             break; // Was there a fault? abort.
 586     }
 587
 588     return page_fault?-EFAULT:0;
 589 }
 590
 591 /* taken fs/buffer.c:__block_commit_write */
 592 int reiserfs_commit_page(struct inode *inode, struct page *page,
 593                 unsigned from, unsigned to)
 594 {
 595     unsigned block_start, block_end;
 596     int partial = 0;
 597     unsigned blocksize;
 598     struct buffer_head *bh, *head;
 599     unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
 600     int new;
 601     int logit = reiserfs_file_data_log(inode);
 602     struct super_block *s = inode->i_sb;
 603     int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
 604     struct reiserfs_transaction_handle th;
 605     th.t_trans_id = 0;
 606
 607     blocksize = 1 << inode->i_blkbits;
 608
 609     if (logit) {
 610         reiserfs_write_lock(s);
 611         journal_begin(&th, s, bh_per_page + 1);
 612         reiserfs_update_inode_transaction(inode);
 613     }
 614     for(bh = head = page_buffers(page), block_start = 0;
 615         bh != head || !block_start;
 616         block_start=block_end, bh = bh->b_this_page)
 617     {
 618
 619         new = buffer_new(bh);
 620         clear_buffer_new(bh);
 621         block_end = block_start + blocksize;
 622         if (block_end <= from || block_start >= to) {
 623             if (!buffer_uptodate(bh))
 624                     partial = 1;
 625         } else {
 626             set_buffer_uptodate(bh);
 627             if (logit) {
 628                 reiserfs_prepare_for_journal(s, bh, 1);
 629                 journal_mark_dirty(&th, s, bh);
 630             } else if (!buffer_dirty(bh)) {
 631                 mark_buffer_dirty(bh);
 632                 /* do data=ordered on any page past the end
 633                  * of file and any buffer marked BH_New.
 634                  */
 635                 if (reiserfs_data_ordered(inode->i_sb) &&
 636                     (new || page->index >= i_size_index)) {
 637                     reiserfs_add_ordered_list(inode, bh);
 638                 }
 639             }
 640         }
 641     }
 642     if (logit) {
 643         journal_end(&th, s, bh_per_page + 1);
 644         reiserfs_write_unlock(s);
 645     }
 646     /*
 647      * If this is a partial write which happened to make all buffers
 648      * uptodate then we can optimize away a bogus readpage() for
 649      * the next read(). Here we 'discover' whether the page went
 650      * uptodate as a result of this (potentially partial) write.
 651      */
 652     if (!partial)
 653         SetPageUptodate(page);
 654     return 0;
 655 }
 656
 657
 658 /* Submit pages for write. This was separated from actual file copying
 659    because we might want to allocate block numbers in-between.
 660    This function assumes that caller will adjust file size to correct value. */
 661 int reiserfs_submit_file_region_for_write(
 662                                 struct reiserfs_transaction_handle *th,
 663                                 struct inode *inode,
 664                                 loff_t pos, /* Writing position offset */
 665                                 int num_pages, /* Number of pages to write */
 666                                 int write_bytes, /* number of bytes to write */
 667                                 struct page **prepared_pages /* list of pages */
 668                                 )
 669 {
 670     int status; // return status of block_commit_write.
 671     int retval = 0; // Return value we are going to return.
 672     int i; // loop counter
 673     int offset; // Writing offset in page.
 674     int orig_write_bytes = write_bytes;
 675     int sd_update = 0;
 676
 677     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 678         int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 679         struct page *page=prepared_pages[i]; // Current page we process.
 680
 681         status = reiserfs_commit_page(inode, page, offset, offset+count);
 682         if ( status )
 683             retval = status; // To not overcomplicate matters We are going to
 684                              // submit all the pages even if there was error.
 685                              // we only remember error status to report it on
 686                              // exit.
 687         write_bytes-=count;
 688     }
 689     /* now that we've gotten all the ordered buffers marked dirty,
 690      * we can safely update i_size and close any running transaction
 691      */
 692     if ( pos + orig_write_bytes > inode->i_size) {
 693         inode->i_size = pos + orig_write_bytes; // Set new size
 694         /* If the file have grown so much that tail packing is no
 695          * longer possible, reset "need to pack" flag */
 696         if ( (have_large_tails (inode->i_sb) &&
 697               inode->i_size > i_block_size (inode)*4) ||
 698              (have_small_tails (inode->i_sb) &&
 699              inode->i_size > i_block_size(inode)) )
 700             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
 701         else if ( (have_large_tails (inode->i_sb) &&
 702                   inode->i_size < i_block_size (inode)*4) ||
 703                   (have_small_tails (inode->i_sb) &&
 704                   inode->i_size < i_block_size(inode)) )
 705             REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 706
 707         if (th->t_trans_id) {
 708             reiserfs_write_lock(inode->i_sb);
 709             reiserfs_update_sd(th, inode); // And update on-disk metadata
 710             reiserfs_write_unlock(inode->i_sb);
 711         } else
 712             inode->i_sb->s_op->dirty_inode(inode);
 713
 714         sd_update = 1;
 715     }
 716     if (th->t_trans_id) {
 717         reiserfs_write_lock(inode->i_sb);
 718         if (!sd_update)
 719             reiserfs_update_sd(th, inode);
 720         journal_end(th, th->t_super, th->t_blocks_allocated);
 721         reiserfs_write_unlock(inode->i_sb);
 722     }
 723     th->t_trans_id = 0;
 724
 725     /*
 726      * we have to unlock the pages after updating i_size, otherwise
 727      * we race with writepage
 728      */
 729     for ( i = 0; i < num_pages ; i++) {
 730         struct page *page=prepared_pages[i];
 731         unlock_page(page);
 732         mark_page_accessed(page);
 733         page_cache_release(page);
 734     }
 735     return retval;
 736 }
 737
 738 /* Look if passed writing region is going to touch file's tail
 739    (if it is present). And if it is, convert the tail to unformatted node */
 740 int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
 741                                          loff_t pos, /* Writing position */
 742                                          int write_bytes /* amount of bytes to write */
 743                                         )
 744 {
 745     INITIALIZE_PATH(path); // needed for search_for_position
 746     struct cpu_key key; // Key that would represent last touched writing byte.
 747     struct item_head *ih; // item header of found block;
 748     int res; // Return value of various functions we call.
 749     int cont_expand_offset; // We will put offset for generic_cont_expand here
 750                             // This can be int just because tails are created
 751                             // only for small files.
 752
 753 /* this embodies a dependency on a particular tail policy */
 754     if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
 755         /* such a big files do not have tails, so we won't bother ourselves
 756            to look for tails, simply return */
 757         return 0;
 758     }
 759
 760     reiserfs_write_lock(inode->i_sb);
 761     /* find the item containing the last byte to be written, or if
 762      * writing past the end of the file then the last item of the
 763      * file (and then we check its type). */
 764     make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
 765     res = search_for_position_by_key(inode->i_sb, &key, &path);
 766     if ( res == IO_ERROR ) {
 767         reiserfs_write_unlock(inode->i_sb);
 768         return -EIO;
 769     }
 770     ih = get_ih(&path);
 771     res = 0;
 772     if ( is_direct_le_ih(ih) ) {
 773         /* Ok, closest item is file tail (tails are stored in "direct"
 774          * items), so we need to unpack it. */
 775         /* To not overcomplicate matters, we just call generic_cont_expand
 776            which will in turn call other stuff and finally will boil down to
 777             reiserfs_get_block() that would do necessary conversion. */
 778         cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
 779         pathrelse(&path);
 780         res = generic_cont_expand( inode, cont_expand_offset);
 781     } else
 782         pathrelse(&path);
 783
 784     reiserfs_write_unlock(inode->i_sb);
 785     return res;
 786 }
 787
 788 /* This function locks pages starting from @pos for @inode.
 789    @num_pages pages are locked and stored in
 790    @prepared_pages array. Also buffers are allocated for these pages.
 791    First and last page of the region is read if it is overwritten only
 792    partially. If last page did not exist before write (file hole or file
 793    append), it is zeroed, then.
 794    Returns number of unallocated blocks that should be allocated to cover
 795    new file data.*/
 796 int reiserfs_prepare_file_region_for_write(
 797                                 struct inode *inode /* Inode of the file */,
 798                                 loff_t pos, /* position in the file */
 799                                 int num_pages, /* number of pages to
 800                                                   prepare */
 801                                 int write_bytes, /* Amount of bytes to be
 802                                                     overwritten from
 803                                                     @pos */
 804                                 struct page **prepared_pages /* pointer to array
 805                                                                where to store
 806                                                                prepared pages */
 807                                            )
 808 {
 809     int res=0; // Return values of different functions we call.
 810     unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
 811     int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
 812     int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
 813                                          /* offset of last modified byte in last
 814                                             page */
 815     struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
 816     int i; // Simple counter
 817     int blocks = 0; /* Return value (blocks that should be allocated) */
 818     struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
 819                                    // of a page.
 820     unsigned block_start, block_end; // Starting and ending offsets of current
 821                                      // buffer in the page.
 822     struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if
 823                                                  // Page appeared to be not up
 824                                                  // to date. Note how we have
 825                                                  // at most 2 buffers, this is
 826                                                  // because we at most may
 827                                                  // partially overwrite two
 828                                                  // buffers for one page. One at                                                 // the beginning of write area
 829                                                  // and one at the end.
 830                                                  // Everything inthe middle gets                                                 // overwritten totally.
 831
 832     struct cpu_key key; // cpu key of item that we are going to deal with
 833     struct item_head *ih = NULL; // pointer to item head that we are going to deal with
 834     struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
 835     INITIALIZE_PATH(path); // path to item, that we are going to deal with.
 836     __u32 * item=NULL; // pointer to item we are going to deal with
 837     int item_pos=-1; /* Position in indirect item */
 838
 839
 840     if ( num_pages < 1 ) {
 841         reiserfs_warning (inode->i_sb,
 842                           "green-9001: reiserfs_prepare_file_region_for_write "
 843                           "called with zero number of pages to process");
 844         return -EFAULT;
 845     }
 846
 847     /* We have 2 loops for pages. In first loop we grab and lock the pages, so
 848        that nobody would touch these until we release the pages. Then
 849        we'd start to deal with mapping buffers to blocks. */
 850     for ( i = 0; i < num_pages; i++) {
 851         prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
 852         if ( !prepared_pages[i]) {
 853             res = -ENOMEM;
 854             goto failed_page_grabbing;
 855         }
 856         if (!page_has_buffers(prepared_pages[i]))
 857             create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
 858     }
 859
 860     /* Let's count amount of blocks for a case where all the blocks
 861        overwritten are new (we will substract already allocated blocks later)*/
 862     if ( num_pages > 2 )
 863         /* These are full-overwritten pages so we count all the blocks in
 864            these pages are counted as needed to be allocated */
 865         blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 866
 867     /* count blocks needed for first page (possibly partially written) */
 868     blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
 869            !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */
 870
 871     /* Now we account for last page. If last page == first page (we
 872        overwrite only one page), we substract all the blocks past the
 873        last writing position in a page out of already calculated number
 874        of blocks */
 875     blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
 876            ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
 877            /* Note how we do not roundup here since partial blocks still
 878                    should be allocated */
 879
 880     /* Now if all the write area lies past the file end, no point in
 881        maping blocks, since there is none, so we just zero out remaining
 882        parts of first and last pages in write area (if needed) */
 883     if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
 884         if ( from != 0 ) {/* First page needs to be partially zeroed */
 885             char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
 886             memset(kaddr, 0, from);
 887             kunmap_atomic( kaddr, KM_USER0);
 888         }
 889         if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
 890             char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
 891             memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
 892             kunmap_atomic( kaddr, KM_USER0);
 893         }
 894
 895         /* Since all blocks are new - use already calculated value */
 896         return blocks;
 897     }
 898
 899     /* Well, since we write somewhere into the middle of a file, there is
 900        possibility we are writing over some already allocated blocks, so
 901        let's map these blocks and substract number of such blocks out of blocks
 902        we need to allocate (calculated above) */
 903     /* Mask write position to start on blocksize, we do it out of the
 904        loop for performance reasons */
 905     pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
 906     /* Set cpu key to the starting position in a file (on left block boundary)*/
 907     make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
 908
 909     reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
 910     for ( i = 0; i < num_pages ; i++ ) {
 911
 912         head = page_buffers(prepared_pages[i]);
 913         /* For each buffer in the page */
 914         for(bh = head, block_start = 0; bh != head || !block_start;
 915             block_start=block_end, bh = bh->b_this_page) {
 916                 if (!bh)
 917                     reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
 918                 /* Find where this buffer ends */
 919                 block_end = block_start+inode->i_sb->s_blocksize;
 920                 if (i == 0 && block_end <= from )
 921                     /* if this buffer is before requested data to map, skip it*/
 922                     continue;
 923
 924                 if (i == num_pages - 1 && block_start >= to) {
 925                     /* If this buffer is after requested data to map, abort
 926                        processing of current page */
 927                     break;
 928                 }
 929
 930                 if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
 931                     /* This is optimisation for a case where buffer is mapped
 932                        and have blocknumber assigned. In case significant amount
 933                        of such buffers are present, we may avoid some amount
 934                        of search_by_key calls.
 935                        Probably it would be possible to move parts of this code
 936                        out of BKL, but I afraid that would overcomplicate code
 937                        without any noticeable benefit.
 938                     */
 939                     item_pos++;
 940                     /* Update the key */
 941                     set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
 942                     blocks--; // Decrease the amount of blocks that need to be
 943                               // allocated
 944                     continue; // Go to the next buffer
 945                 }
 946
 947                 if ( !itembuf || /* if first iteration */
 948                      item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
 949                                              { /* or if we progressed past the
 950                                                   current unformatted_item */
 951                         /* Try to find next item */
 952                         res = search_for_position_by_key(inode->i_sb, &key, &path);
 953                         /* Abort if no more items */
 954                         if ( res != POSITION_FOUND ) {
 955                             /* make sure later loops don't use this item */
 956                             itembuf = NULL;
 957                             item = NULL;
 958                             break;
 959                         }
 960
 961                         /* Update information about current indirect item */
 962                         itembuf = get_last_bh( &path );
 963                         ih = get_ih( &path );
 964                         item = get_item( &path );
 965                         item_pos = path.pos_in_item;
 966
 967                         RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
 968                 }
 969
 970                 /* See if there is some block associated with the file
 971                    at that position, map the buffer to this block */
 972                 if ( get_block_num(item,item_pos) ) {
 973                     map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
 974                     blocks--; // Decrease the amount of blocks that need to be
 975                               // allocated
 976                 }
 977                 item_pos++;
 978                 /* Update the key */
 979                 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
 980         }
 981     }
 982     pathrelse(&path); // Free the path
 983     reiserfs_write_unlock(inode->i_sb);
 984
 985         /* Now zero out unmappend buffers for the first and last pages of
 986            write area or issue read requests if page is mapped. */
 987         /* First page, see if it is not uptodate */
 988         if ( !PageUptodate(prepared_pages[0]) ) {
 989             head = page_buffers(prepared_pages[0]);
 990
 991             /* For each buffer in page */
 992             for(bh = head, block_start = 0; bh != head || !block_start;
 993                 block_start=block_end, bh = bh->b_this_page) {
 994
 995                 if (!bh)
 996                     reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
 997                 /* Find where this buffer ends */
 998                 block_end = block_start+inode->i_sb->s_blocksize;
 999                 if ( block_end <= from )
1000                     /* if this buffer is before requested data to map, skip it*/
1001                     continue;
1002                 if ( block_start < from ) { /* Aha, our partial buffer */
1003                     if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
1004                                                   issue READ request for it to
1005                                                   not loose data */
1006                         ll_rw_block(READ, 1, &bh);
1007                         *wait_bh++=bh;
1008                     } else { /* Not mapped, zero it */
1009                         char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
1010                         memset(kaddr+block_start, 0, from-block_start);
1011                         kunmap_atomic( kaddr, KM_USER0);
1012                         set_buffer_uptodate(bh);
1013                     }
1014                 }
1015             }
1016         }
1017
1018         /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
1019         if ( !PageUptodate(prepared_pages[num_pages-1]) ||
1020             ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
1021             head = page_buffers(prepared_pages[num_pages-1]);
1022
1023             /* for each buffer in page */
1024             for(bh = head, block_start = 0; bh != head || !block_start;
1025                 block_start=block_end, bh = bh->b_this_page) {
1026
1027                 if (!bh)
1028                     reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
1029                 /* Find where this buffer ends */
1030                 block_end = block_start+inode->i_sb->s_blocksize;
1031                 if ( block_start >= to )
1032                     /* if this buffer is after requested data to map, skip it*/
1033                     break;
1034                 if ( block_end > to ) { /* Aha, our partial buffer */
1035                     if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
1036                                                   issue READ request for it to
1037                                                   not loose data */
1038                         ll_rw_block(READ, 1, &bh);
1039                         *wait_bh++=bh;
1040                     } else { /* Not mapped, zero it */
1041                         char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
1042                         memset(kaddr+to, 0, block_end-to);
1043                         kunmap_atomic( kaddr, KM_USER0);
1044                         set_buffer_uptodate(bh);
1045                     }
1046                 }
1047             }
1048         }
1049
1050     /* Wait for read requests we made to happen, if necessary */
1051     while(wait_bh > wait) {
1052         wait_on_buffer(*--wait_bh);
1053         if (!buffer_uptodate(*wait_bh)) {
1054             res = -EIO;
1055             goto failed_read;
1056         }
1057     }
1058
1059     return blocks;
1060 failed_page_grabbing:
1061     num_pages = i;
1062 failed_read:
1063     reiserfs_unprepare_pages(prepared_pages, num_pages);
1064     return res;
1065 }
1066
1067 /* Write @count bytes at position @ppos in a file indicated by @file
1068    from the buffer @buf.
1069
1070    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1071    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1072    written for (ext2/3).  This is for several reasons:
1073
1074    * It has no understanding of any filesystem specific optimizations.
1075
1076    * It enters the filesystem repeatedly for each page that is written.
1077
1078    * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
1079    * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
1080    * to reiserfs which allows for fewer tree traversals.
1081
1082    * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1083
1084    * Asking the block allocation code for blocks one at a time is slightly less efficient.
1085
1086    All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1087    use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
1088    things right finally.
1089
1090    Future Features: providing search_by_key with hints.
1091
1092 */
1093 ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
1094                              const char __user *buf, /*  pointer to user supplied data
1095 (in userspace) */
1096                              size_t count, /* amount of bytes to write */
1097                              loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
1098                                            * new current position before returning. */ )
1099 {
1100     size_t already_written = 0; // Number of bytes already written to the file.
1101     loff_t pos; // Current position in the file.
1102     ssize_t res; // return value of various functions that we call.
1103     struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
1104                                 /* To simplify coding at this time, we store
1105                                    locked pages in array for now */
1106     struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1107     struct reiserfs_transaction_handle th;
1108     th.t_trans_id = 0;
1109
1110     if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
1111         ssize_t result, after_file_end = 0;
1112         if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
1113             /* If we are appending a file, we need to put this savelink in here.
1114                If we will crash while doing direct io, finish_unfinished will
1115                cut the garbage from the file end. */
1116             reiserfs_write_lock(inode->i_sb);
1117             journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
1118             reiserfs_update_inode_transaction(inode);
1119             add_save_link (&th, inode, 1 /* Truncate */);
1120             journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
1121             reiserfs_write_unlock(inode->i_sb);
1122             after_file_end = 1;
1123         }
1124         result = generic_file_write(file, buf, count, ppos);
1125
1126         if ( after_file_end ) { /* Now update i_size and remove the savelink */
1127             struct reiserfs_transaction_handle th;
1128             reiserfs_write_lock(inode->i_sb);
1129             journal_begin(&th, inode->i_sb, 1);
1130             reiserfs_update_inode_transaction(inode);
1131             reiserfs_update_sd(&th, inode);
1132             journal_end(&th, inode->i_sb, 1);
1133             remove_save_link (inode, 1/* truncate */);
1134             reiserfs_write_unlock(inode->i_sb);
1135         }
1136
1137         return result;
1138     }
1139
1140     if ( unlikely((ssize_t) count < 0 ))
1141         return -EINVAL;
1142
1143     if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1144         return -EFAULT;
1145
1146     down(&inode->i_sem); // locks the entire file for just us
1147
1148     pos = *ppos;
1149
1150     /* Check if we can write to specified region of file, file
1151        is not overly big and this kind of stuff. Adjust pos and
1152        count, if needed */
1153     res = generic_write_checks(file, &pos, &count, 0);
1154     if (res)
1155         goto out;
1156
1157     if ( count == 0 )
1158         goto out;
1159
1160     res = remove_suid(file->f_dentry);
1161     if (res)
1162         goto out;
1163
1164     inode_update_time(inode, 1); /* Both mtime and ctime */
1165
1166     // Ok, we are done with all the checks.
1167
1168     // Now we should start real work
1169
1170     /* If we are going to write past the file's packed tail or if we are going
1171        to overwrite part of the tail, we need that tail to be converted into
1172        unformatted node */
1173     res = reiserfs_check_for_tail_and_convert( inode, pos, count);
1174     if (res)
1175         goto out;
1176
1177     while ( count > 0) {
1178         /* This is the main loop in which we running until some error occures
1179            or until we write all of the data. */
1180         int num_pages;/* amount of pages we are going to write this iteration */
1181         int write_bytes; /* amount of bytes to write during this iteration */
1182         int blocks_to_allocate; /* how much blocks we need to allocate for
1183                                    this iteration */
1184
1185         /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
1186         num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
1187                                                           pages */
1188                     ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
1189                                                 /* convert size to amount of
1190                                                    pages */
1191         reiserfs_write_lock(inode->i_sb);
1192         if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1193                 || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
1194             /* If we were asked to write more data than we want to or if there
1195                is not that much space, then we shorten amount of data to write
1196                for this iteration. */
1197             num_pages = min_t(int, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
1198             /* Also we should not forget to set size in bytes accordingly */
1199             write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1200                             (pos & (PAGE_CACHE_SIZE-1));
1201                                          /* If position is not on the
1202                                             start of the page, we need
1203                                             to substract the offset
1204                                             within page */
1205         } else
1206             write_bytes = count;
1207
1208         /* reserve the blocks to be allocated later, so that later on
1209            we still have the space to write the blocks to */
1210         reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1211         reiserfs_write_unlock(inode->i_sb);
1212
1213         if ( !num_pages ) { /* If we do not have enough space even for */
1214             res = -ENOSPC;  /* single page, return -ENOSPC */
1215             if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1)))
1216                 break; // In case we are writing past the file end, break.
1217             // Otherwise we are possibly overwriting the file, so
1218             // let's set write size to be equal or less than blocksize.
1219             // This way we get it correctly for file holes.
1220             // But overwriting files on absolutelly full volumes would not
1221             // be very efficient. Well, people are not supposed to fill
1222             // 100% of disk space anyway.
1223             write_bytes = min_t(int, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
1224             num_pages = 1;
1225             // No blocks were claimed before, so do it now.
1226             reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1227         }
1228
1229         /* Prepare for writing into the region, read in all the
1230            partially overwritten pages, if needed. And lock the pages,
1231            so that nobody else can access these until we are done.
1232            We get number of actual blocks needed as a result.*/
1233         blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
1234         if ( blocks_to_allocate < 0 ) {
1235             res = blocks_to_allocate;
1236             reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1237             break;
1238         }
1239
1240         /* First we correct our estimate of how many blocks we need */
1241         reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
1242
1243         if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
1244             /* Fill in all the possible holes and append the file if needed */
1245             res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
1246         }
1247
1248         /* well, we have allocated the blocks, so it is time to free
1249            the reservation we made earlier. */
1250         reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
1251         if ( res ) {
1252             reiserfs_unprepare_pages(prepared_pages, num_pages);
1253             break;
1254         }
1255
1256 /* NOTE that allocating blocks and filling blocks can be done in reverse order
1257    and probably we would do that just to get rid of garbage in files after a
1258    crash */
1259
1260         /* Copy data from user-supplied buffer to file's pages */
1261         res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
1262         if ( res ) {
1263             reiserfs_unprepare_pages(prepared_pages, num_pages);
1264             break;
1265         }
1266
1267         /* Send the pages to disk and unlock them. */
1268         res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
1269                                                     write_bytes,prepared_pages);
1270         if ( res )
1271             break;
1272
1273         already_written += write_bytes;
1274         buf += write_bytes;
1275         *ppos = pos += write_bytes;
1276         count -= write_bytes;
1277         balance_dirty_pages_ratelimited(inode->i_mapping);
1278     }
1279
1280     /* this is only true on error */
1281     if (th.t_trans_id) {
1282         reiserfs_write_lock(inode->i_sb);
1283         journal_end(&th, th.t_super, th.t_blocks_allocated);
1284         reiserfs_write_unlock(inode->i_sb);
1285     }
1286
1287     if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1288         res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
1289
1290     up(&inode->i_sem);
1291     reiserfs_async_progress_wait(inode->i_sb);
1292     return (already_written != 0)?already_written:res;
1293
1294 out:
1295     up(&inode->i_sem); // unlock the file on exit.
1296     return res;
1297 }
1298
/*
 * reiserfs_aio_write - handler placed in the .aio_write slot of
 * reiserfs_file_operations.
 *
 * Adds no reiserfs-specific processing: all four arguments are handed to
 * generic_file_aio_write() unchanged, and its return value is returned as is.
 *
 * iocb:  I/O control block of the request (passed through untouched).
 * buf:   source buffer; the __user annotation marks it as a user-space pointer.
 * count: passed through; presumably the number of bytes to write -- confirm
 *        against generic_file_aio_write().
 * pos:   passed through; presumably the file offset to write at -- confirm
 *        against generic_file_aio_write().
 *
 * Returns whatever generic_file_aio_write() returns.
 */
1299 static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
1300                                size_t count, loff_t pos)
1301 {
1302     return generic_file_aio_write(iocb, buf, count, pos);
1303 }
1304
1305
1306
/*
 * File method table for reiserfs.  Presumably installed on regular files
 * only (reiserfs_file_release() BUG()s on anything that is not S_ISREG);
 * the code that assigns this table is not part of this section -- confirm
 * at the assignment site.  The table is not static, so it is reachable
 * from other translation units.
 *
 * The entries fall into two groups:
 *   - generic_* helpers used as-is: read, mmap, sendfile, aio_read;
 *   - reiserfs_* handlers: write, ioctl, release, fsync, aio_write.
 *
 * Notes on individual slots:
 *   - release points at reiserfs_file_release(), defined at the top of this
 *     file; per its header comment it packs the file tail on close.
 *   - aio_write points at reiserfs_aio_write(), which is only a pass-through
 *     to generic_file_aio_write(); it does not call reiserfs_file_write
 *     directly.
 */
1307 struct file_operations reiserfs_file_operations = {
1308     .read       = generic_file_read,
1309     .write      = reiserfs_file_write,
1310     .ioctl      = reiserfs_ioctl,
1311     .mmap       = generic_file_mmap,
1312     .release    = reiserfs_file_release,
1313     .fsync      = reiserfs_sync_file,
1314     .sendfile   = generic_file_sendfile,
1315     .aio_read   = generic_file_aio_read,
1316     .aio_write  = reiserfs_aio_write,
1317 };
1318
1319
/*
 * Inode method table for reiserfs files (the name suggests regular files;
 * the code that assigns it is not part of this section -- confirm at the
 * assignment site).  The table is not static, so it is reachable from other
 * translation units.
 *
 * Every slot filled in here points at a reiserfs_* handler; none uses a
 * generic helper:
 *   - truncate:   reiserfs_vfs_truncate_file
 *   - setattr:    reiserfs_setattr
 *   - setxattr / getxattr / listxattr / removexattr: the four extended
 *     attribute handlers (presumably declared via <linux/reiserfs_xattr.h>,
 *     which this file includes -- confirm)
 *   - permission: reiserfs_permission
 *
 * Slots not named in the initializer are left zero-initialized, as for any
 * designated initializer.
 */
1320 struct  inode_operations reiserfs_file_inode_operations = {
1321     .truncate   = reiserfs_vfs_truncate_file,
1322     .setattr    = reiserfs_setattr,
1323     .setxattr   = reiserfs_setxattr,
1324     .getxattr   = reiserfs_getxattr,
1325     .listxattr  = reiserfs_listxattr,
1326     .removexattr = reiserfs_removexattr,
1327     .permission = reiserfs_permission,
1328 };
1329
1330