fs/ntfs/aops.c

   1 /**
   2  * aops.c - NTFS kernel address space operations and page cache handling.
   3  *          Part of the Linux-NTFS project.
   4  *
   5  * Copyright (c) 2001-2004 Anton Altaparmakov
   6  * Copyright (c) 2002 Richard Russon
   7  *
   8  * This program/include file is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as published
  10  * by the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program/include file is distributed in the hope that it will be
  14  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  15  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program (in the main directory of the Linux-NTFS
  20  * distribution in the file COPYING); if not, write to the Free Software
  21  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  22  */
  23
  24 #include <linux/errno.h>
  25 #include <linux/mm.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/swap.h>
  28 #include <linux/buffer_head.h>
  29 #include <linux/writeback.h>
  30
  31 #include "aops.h"
  32 #include "attrib.h"
  33 #include "debug.h"
  34 #include "inode.h"
  35 #include "mft.h"
  36 #include "runlist.h"
  37 #include "types.h"
  38 #include "ntfs.h"
  39
  40 /**
  41  * ntfs_end_buffer_async_read - async io completion for reading attributes
  42  * @bh:         buffer head on which io is completed
  43  * @uptodate:   whether @bh is now uptodate or not
  44  *
  45  * Asynchronous I/O completion handler for reading pages belonging to the
  46  * attribute address space of an inode. The inodes can either be files or
  47  * directories or they can be fake inodes describing some attribute.
  48  *
  49  * If NInoMstProtected(), perform the post read mst fixups when all IO on the
  50  * page has been completed and mark the page uptodate or set the error bit on
  51  * the page. To determine the size of the records that need fixing up, we cheat
  52  * a little bit by setting the index_block_size in ntfs_inode to the ntfs
  53  * record size, and index_block_size_bits, to the log(base 2) of the ntfs
  54  * record size.
  55  */
  56 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
  57 {
  58         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
  59         unsigned long flags;
  60         struct buffer_head *tmp;
  61         struct page *page;
  62         ntfs_inode *ni;
  63         int page_uptodate = 1;
  64
  65         page = bh->b_page;
  66         ni = NTFS_I(page->mapping->host);
  67
  68         if (likely(uptodate)) {
  69                 s64 file_ofs;
  70
  71                 set_buffer_uptodate(bh);
  72
  73                 file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
  74                                 bh_offset(bh);
  75                 /* Check for the current buffer head overflowing. */
  76                 if (file_ofs + bh->b_size > ni->initialized_size) {
  77                         char *addr;
  78                         int ofs = 0;
  79
  80                         if (file_ofs < ni->initialized_size)
  81                                 ofs = ni->initialized_size - file_ofs;
  82                         addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
  83                         memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
  84                         flush_dcache_page(page);
  85                         kunmap_atomic(addr, KM_BIO_SRC_IRQ);
  86                 }
  87         } else {
  88                 clear_buffer_uptodate(bh);
  89                 ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
  90                                 (unsigned long long)bh->b_blocknr);
  91                 SetPageError(page);
  92         }
  93
  94         spin_lock_irqsave(&page_uptodate_lock, flags);
  95         clear_buffer_async_read(bh);
  96         unlock_buffer(bh);
  97         tmp = bh;
  98         do {
  99                 if (!buffer_uptodate(tmp))
 100                         page_uptodate = 0;
 101                 if (buffer_async_read(tmp)) {
 102                         if (likely(buffer_locked(tmp)))
 103                                 goto still_busy;
 104                         /* Async buffers must be locked. */
 105                         BUG();
 106                 }
 107                 tmp = tmp->b_this_page;
 108         } while (tmp != bh);
 109         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 110         /*
 111          * If none of the buffers had errors then we can set the page uptodate,
 112          * but we first have to perform the post read mst fixups, if the
 113          * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
 114          */
 115         if (!NInoMstProtected(ni)) {
 116                 if (likely(page_uptodate && !PageError(page)))
 117                         SetPageUptodate(page);
 118         } else {
 119                 char *addr;
 120                 unsigned int i, recs, nr_err;
 121                 u32 rec_size;
 122
 123                 rec_size = ni->itype.index.block_size;
 124                 recs = PAGE_CACHE_SIZE / rec_size;
 125                 addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
 126                 for (i = nr_err = 0; i < recs; i++) {
 127                         if (likely(!post_read_mst_fixup((NTFS_RECORD*)(addr +
 128                                         i * rec_size), rec_size)))
 129                                 continue;
 130                         nr_err++;
 131                         ntfs_error(ni->vol->sb, "post_read_mst_fixup() failed, "
 132                                         "corrupt %s record 0x%llx. Run chkdsk.",
 133                                         ni->mft_no ? "index" : "mft",
 134                                         (unsigned long long)(((s64)page->index
 135                                         << PAGE_CACHE_SHIFT >>
 136                                         ni->itype.index.block_size_bits) + i));
 137                 }
 138                 flush_dcache_page(page);
 139                 kunmap_atomic(addr, KM_BIO_SRC_IRQ);
 140                 if (likely(!PageError(page))) {
 141                         if (likely(!nr_err && recs)) {
 142                                 if (likely(page_uptodate))
 143                                         SetPageUptodate(page);
 144                         } else {
 145                                 ntfs_error(ni->vol->sb, "Setting page error, "
 146                                                 "index 0x%lx.", page->index);
 147                                 SetPageError(page);
 148                         }
 149                 }
 150         }
 151         unlock_page(page);
 152         return;
 153 still_busy:
 154         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 155         return;
 156 }
 157
 158 /**
 159  * ntfs_read_block - fill a @page of an address space with data
 160  * @page:       page cache page to fill with data
 161  *
 162  * Fill the page @page of the address space belonging to the @page->host inode.
 163  * We read each buffer asynchronously and when all buffers are read in, our io
 164  * completion handler ntfs_end_buffer_read_async(), if required, automatically
 165  * applies the mst fixups to the page before finally marking it uptodate and
 166  * unlocking it.
 167  *
 168  * We only enforce allocated_size limit because i_size is checked for in
 169  * generic_file_read().
 170  *
 171  * Return 0 on success and -errno on error.
 172  *
 173  * Contains an adapted version of fs/buffer.c::block_read_full_page().
 174  */
 175 static int ntfs_read_block(struct page *page)
 176 {
 177         VCN vcn;
 178         LCN lcn;
 179         ntfs_inode *ni;
 180         ntfs_volume *vol;
 181         runlist_element *rl;
 182         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 183         sector_t iblock, lblock, zblock;
 184         unsigned int blocksize, vcn_ofs;
 185         int i, nr;
 186         unsigned char blocksize_bits;
 187
 188         ni = NTFS_I(page->mapping->host);
 189         vol = ni->vol;
 190
 191         blocksize_bits = VFS_I(ni)->i_blkbits;
 192         blocksize = 1 << blocksize_bits;
 193
 194         if (!page_has_buffers(page))
 195                 create_empty_buffers(page, blocksize, 0);
 196         bh = head = page_buffers(page);
 197         if (unlikely(!bh)) {
 198                 unlock_page(page);
 199                 return -ENOMEM;
 200         }
 201
 202         iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 203         lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
 204         zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
 205
 206 #ifdef DEBUG
 207         if (unlikely(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)))
 208                 panic("NTFS: $MFT/$DATA runlist has been unmapped! This is a "
 209                                 "very serious bug! Cannot continue...");
 210 #endif
 211
 212         /* Loop through all the buffers in the page. */
 213         rl = NULL;
 214         nr = i = 0;
 215         do {
 216                 u8 *kaddr;
 217
 218                 if (unlikely(buffer_uptodate(bh)))
 219                         continue;
 220                 if (unlikely(buffer_mapped(bh))) {
 221                         arr[nr++] = bh;
 222                         continue;
 223                 }
 224                 bh->b_bdev = vol->sb->s_bdev;
 225                 /* Is the block within the allowed limits? */
 226                 if (iblock < lblock) {
 227                         BOOL is_retry = FALSE;
 228
 229                         /* Convert iblock into corresponding vcn and offset. */
 230                         vcn = (VCN)iblock << blocksize_bits >>
 231                                         vol->cluster_size_bits;
 232                         vcn_ofs = ((VCN)iblock << blocksize_bits) &
 233                                         vol->cluster_size_mask;
 234                         if (!rl) {
 235 lock_retry_remap:
 236                                 down_read(&ni->runlist.lock);
 237                                 rl = ni->runlist.rl;
 238                         }
 239                         if (likely(rl != NULL)) {
 240                                 /* Seek to element containing target vcn. */
 241                                 while (rl->length && rl[1].vcn <= vcn)
 242                                         rl++;
 243                                 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
 244                         } else
 245                                 lcn = LCN_RL_NOT_MAPPED;
 246                         /* Successful remap. */
 247                         if (lcn >= 0) {
 248                                 /* Setup buffer head to correct block. */
 249                                 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
 250                                                 + vcn_ofs) >> blocksize_bits;
 251                                 set_buffer_mapped(bh);
 252                                 /* Only read initialized data blocks. */
 253                                 if (iblock < zblock) {
 254                                         arr[nr++] = bh;
 255                                         continue;
 256                                 }
 257                                 /* Fully non-initialized data block, zero it. */
 258                                 goto handle_zblock;
 259                         }
 260                         /* It is a hole, need to zero it. */
 261                         if (lcn == LCN_HOLE)
 262                                 goto handle_hole;
 263                         /* If first try and runlist unmapped, map and retry. */
 264                         if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
 265                                 is_retry = TRUE;
 266                                 /*
 267                                  * Attempt to map runlist, dropping lock for
 268                                  * the duration.
 269                                  */
 270                                 up_read(&ni->runlist.lock);
 271                                 if (!ntfs_map_runlist(ni, vcn))
 272                                         goto lock_retry_remap;
 273                                 rl = NULL;
 274                         }
 275                         /* Hard error, zero out region. */
 276                         SetPageError(page);
 277                         ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = 0x%llx) "
 278                                         "failed with error code 0x%llx%s.",
 279                                         (unsigned long long)vcn,
 280                                         (unsigned long long)-lcn,
 281                                         is_retry ? " even after retrying" : "");
 282                         // FIXME: Depending on vol->on_errors, do something.
 283                 }
 284                 /*
 285                  * Either iblock was outside lblock limits or
 286                  * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
 287                  * of the page and set the buffer uptodate.
 288                  */
 289 handle_hole:
 290                 bh->b_blocknr = -1UL;
 291                 clear_buffer_mapped(bh);
 292 handle_zblock:
 293                 kaddr = kmap_atomic(page, KM_USER0);
 294                 memset(kaddr + i * blocksize, 0, blocksize);
 295                 flush_dcache_page(page);
 296                 kunmap_atomic(kaddr, KM_USER0);
 297                 set_buffer_uptodate(bh);
 298         } while (i++, iblock++, (bh = bh->b_this_page) != head);
 299
 300         /* Release the lock if we took it. */
 301         if (rl)
 302                 up_read(&ni->runlist.lock);
 303
 304         /* Check we have at least one buffer ready for i/o. */
 305         if (nr) {
 306                 struct buffer_head *tbh;
 307
 308                 /* Lock the buffers. */
 309                 for (i = 0; i < nr; i++) {
 310                         tbh = arr[i];
 311                         lock_buffer(tbh);
 312                         tbh->b_end_io = ntfs_end_buffer_async_read;
 313                         set_buffer_async_read(tbh);
 314                 }
 315                 /* Finally, start i/o on the buffers. */
 316                 for (i = 0; i < nr; i++) {
 317                         tbh = arr[i];
 318                         if (likely(!buffer_uptodate(tbh)))
 319                                 submit_bh(READ, tbh);
 320                         else
 321                                 ntfs_end_buffer_async_read(tbh, 1);
 322                 }
 323                 return 0;
 324         }
 325         /* No i/o was scheduled on any of the buffers. */
 326         if (likely(!PageError(page)))
 327                 SetPageUptodate(page);
 328         else /* Signal synchronous i/o error. */
 329                 nr = -EIO;
 330         unlock_page(page);
 331         return nr;
 332 }
 333
 334 /**
 335  * ntfs_readpage - fill a @page of a @file with data from the device
 336  * @file:       open file to which the page @page belongs or NULL
 337  * @page:       page cache page to fill with data
 338  *
 339  * For non-resident attributes, ntfs_readpage() fills the @page of the open
 340  * file @file by calling the ntfs version of the generic block_read_full_page()
 341  * function, ntfs_read_block(), which in turn creates and reads in the buffers
 342  * associated with the page asynchronously.
 343  *
 344  * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
 345  * data from the mft record (which at this stage is most likely in memory) and
 346  * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
 347  * even if the mft record is not cached at this point in time, we need to wait
 348  * for it to be read in before we can do the copy.
 349  *
 350  * Return 0 on success and -errno on error.
 351  *
 352  * WARNING: Do not make this function static! It is used by mft.c!
 353  */
 354 int ntfs_readpage(struct file *file, struct page *page)
 355 {
 356         s64 attr_pos;
 357         ntfs_inode *ni, *base_ni;
 358         u8 *kaddr;
 359         ntfs_attr_search_ctx *ctx;
 360         MFT_RECORD *mrec;
 361         u32 attr_len;
 362         int err = 0;
 363
 364         BUG_ON(!PageLocked(page));
 365
 366         /*
 367          * This can potentially happen because we clear PageUptodate() during
 368          * ntfs_writepage() of MstProtected() attributes.
 369          */
 370         if (PageUptodate(page)) {
 371                 unlock_page(page);
 372                 return 0;
 373         }
 374
 375         ni = NTFS_I(page->mapping->host);
 376
 377         /* NInoNonResident() == NInoIndexAllocPresent() */
 378         if (NInoNonResident(ni)) {
 379                 /*
 380                  * Only unnamed $DATA attributes can be compressed or
 381                  * encrypted.
 382                  */
 383                 if (ni->type == AT_DATA && !ni->name_len) {
 384                         /* If file is encrypted, deny access, just like NT4. */
 385                         if (NInoEncrypted(ni)) {
 386                                 err = -EACCES;
 387                                 goto err_out;
 388                         }
 389                         /* Compressed data streams are handled in compress.c. */
 390                         if (NInoCompressed(ni))
 391                                 return ntfs_read_compressed_block(page);
 392                 }
 393                 /* Normal data stream. */
 394                 return ntfs_read_block(page);
 395         }
 396         /* Attribute is resident, implying it is not compressed or encrypted. */
 397         if (!NInoAttr(ni))
 398                 base_ni = ni;
 399         else
 400                 base_ni = ni->ext.base_ntfs_ino;
 401
 402         /* Map, pin, and lock the mft record. */
 403         mrec = map_mft_record(base_ni);
 404         if (IS_ERR(mrec)) {
 405                 err = PTR_ERR(mrec);
 406                 goto err_out;
 407         }
 408         ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
 409         if (unlikely(!ctx)) {
 410                 err = -ENOMEM;
 411                 goto unm_err_out;
 412         }
 413         err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 414                         CASE_SENSITIVE, 0, NULL, 0, ctx);
 415         if (unlikely(err))
 416                 goto put_unm_err_out;
 417
 418         /* Starting position of the page within the attribute value. */
 419         attr_pos = page->index << PAGE_CACHE_SHIFT;
 420
 421         /* The total length of the attribute value. */
 422         attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
 423
 424         kaddr = kmap_atomic(page, KM_USER0);
 425         /* Copy over in bounds data, zeroing the remainder of the page. */
 426         if (attr_pos < attr_len) {
 427                 u32 bytes = attr_len - attr_pos;
 428                 if (bytes > PAGE_CACHE_SIZE)
 429                         bytes = PAGE_CACHE_SIZE;
 430                 else if (bytes < PAGE_CACHE_SIZE)
 431                         memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
 432                 /* Copy the data to the page. */
 433                 memcpy(kaddr, attr_pos + (char*)ctx->attr +
 434                                 le16_to_cpu(
 435                                 ctx->attr->data.resident.value_offset), bytes);
 436         } else
 437                 memset(kaddr, 0, PAGE_CACHE_SIZE);
 438         flush_dcache_page(page);
 439         kunmap_atomic(kaddr, KM_USER0);
 440
 441         SetPageUptodate(page);
 442 put_unm_err_out:
 443         ntfs_attr_put_search_ctx(ctx);
 444 unm_err_out:
 445         unmap_mft_record(base_ni);
 446 err_out:
 447         unlock_page(page);
 448         return err;
 449 }
 450
 451 #ifdef NTFS_RW
 452
 453 /**
 454  * ntfs_write_block - write a @page to the backing store
 455  * @wbc:        writeback control structure
 456  * @page:       page cache page to write out
 457  *
 458  * This function is for writing pages belonging to non-resident, non-mst
 459  * protected attributes to their backing store.
 460  *
 461  * For a page with buffers, map and write the dirty buffers asynchronously
 462  * under page writeback. For a page without buffers, create buffers for the
 463  * page, then proceed as above.
 464  *
 465  * If a page doesn't have buffers the page dirty state is definitive. If a page
 466  * does have buffers, the page dirty state is just a hint, and the buffer dirty
 467  * state is definitive. (A hint which has rules: dirty buffers against a clean
 468  * page is illegal. Other combinations are legal and need to be handled. In
 469  * particular a dirty page containing clean buffers for example.)
 470  *
 471  * Return 0 on success and -errno on error.
 472  *
 473  * Based on ntfs_read_block() and __block_write_full_page().
 474  */
 475 static int ntfs_write_block(struct writeback_control *wbc, struct page *page)
 476 {
 477         VCN vcn;
 478         LCN lcn;
 479         sector_t block, dblock, iblock;
 480         struct inode *vi;
 481         ntfs_inode *ni;
 482         ntfs_volume *vol;
 483         runlist_element *rl;
 484         struct buffer_head *bh, *head;
 485         unsigned int blocksize, vcn_ofs;
 486         int err;
 487         BOOL need_end_writeback;
 488         unsigned char blocksize_bits;
 489
 490         vi = page->mapping->host;
 491         ni = NTFS_I(vi);
 492         vol = ni->vol;
 493
 494         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
 495                         "0x%lx.", vi->i_ino, ni->type, page->index);
 496
 497         BUG_ON(!NInoNonResident(ni));
 498         BUG_ON(NInoMstProtected(ni));
 499
 500         blocksize_bits = vi->i_blkbits;
 501         blocksize = 1 << blocksize_bits;
 502
 503         if (!page_has_buffers(page)) {
 504                 BUG_ON(!PageUptodate(page));
 505                 create_empty_buffers(page, blocksize,
 506                                 (1 << BH_Uptodate) | (1 << BH_Dirty));
 507         }
 508         bh = head = page_buffers(page);
 509         if (unlikely(!bh)) {
 510                 ntfs_warning(vol->sb, "Error allocating page buffers. "
 511                                 "Redirtying page so we try again later.");
 512                 /*
 513                  * Put the page back on mapping->dirty_pages, but leave its
 514                  * buffer's dirty state as-is.
 515                  */
 516                 redirty_page_for_writepage(wbc, page);
 517                 unlock_page(page);
 518                 return 0;
 519         }
 520
 521         /* NOTE: Different naming scheme to ntfs_read_block()! */
 522
 523         /* The first block in the page. */
 524         block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 525
 526         /* The first out of bounds block for the data size. */
 527         dblock = (vi->i_size + blocksize - 1) >> blocksize_bits;
 528
 529         /* The last (fully or partially) initialized block. */
 530         iblock = ni->initialized_size >> blocksize_bits;
 531
 532         /*
 533          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
 534          * here, and the (potentially unmapped) buffers may become dirty at
 535          * any time.  If a buffer becomes dirty here after we've inspected it
 536          * then we just miss that fact, and the page stays dirty.
 537          *
 538          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
 539          * handle that here by just cleaning them.
 540          */
 541
 542         /*
 543          * Loop through all the buffers in the page, mapping all the dirty
 544          * buffers to disk addresses and handling any aliases from the
 545          * underlying block device's mapping.
 546          */
 547         rl = NULL;
 548         err = 0;
 549         do {
 550                 BOOL is_retry = FALSE;
 551
 552                 if (unlikely(block >= dblock)) {
 553                         /*
 554                          * Mapped buffers outside i_size will occur, because
 555                          * this page can be outside i_size when there is a
 556                          * truncate in progress. The contents of such buffers
 557                          * were zeroed by ntfs_writepage().
 558                          *
 559                          * FIXME: What about the small race window where
 560                          * ntfs_writepage() has not done any clearing because
 561                          * the page was within i_size but before we get here,
 562                          * vmtruncate() modifies i_size?
 563                          */
 564                         clear_buffer_dirty(bh);
 565                         set_buffer_uptodate(bh);
 566                         continue;
 567                 }
 568
 569                 /* Clean buffers are not written out, so no need to map them. */
 570                 if (!buffer_dirty(bh))
 571                         continue;
 572
 573                 /* Make sure we have enough initialized size. */
 574                 if (unlikely((block >= iblock) &&
 575                                 (ni->initialized_size < vi->i_size))) {
 576                         /*
 577                          * If this page is fully outside initialized size, zero
 578                          * out all pages between the current initialized size
 579                          * and the current page. Just use ntfs_readpage() to do
 580                          * the zeroing transparently.
 581                          */
 582                         if (block > iblock) {
 583                                 // TODO:
 584                                 // For each page do:
 585                                 // - read_cache_page()
 586                                 // Again for each page do:
 587                                 // - wait_on_page_locked()
 588                                 // - Check (PageUptodate(page) &&
 589                                 //                      !PageError(page))
 590                                 // Update initialized size in the attribute and
 591                                 // in the inode.
 592                                 // Again, for each page do:
 593                                 //      __set_page_dirty_buffers();
 594                                 // page_cache_release()
 595                                 // We don't need to wait on the writes.
 596                                 // Update iblock.
 597                         }
 598                         /*
 599                          * The current page straddles initialized size. Zero
 600                          * all non-uptodate buffers and set them uptodate (and
 601                          * dirty?). Note, there aren't any non-uptodate buffers
 602                          * if the page is uptodate.
 603                          * FIXME: For an uptodate page, the buffers may need to
 604                          * be written out because they were not initialized on
 605                          * disk before.
 606                          */
 607                         if (!PageUptodate(page)) {
 608                                 // TODO:
 609                                 // Zero any non-uptodate buffers up to i_size.
 610                                 // Set them uptodate and dirty.
 611                         }
 612                         // TODO:
 613                         // Update initialized size in the attribute and in the
 614                         // inode (up to i_size).
 615                         // Update iblock.
 616                         // FIXME: This is inefficient. Try to batch the two
 617                         // size changes to happen in one go.
 618                         ntfs_error(vol->sb, "Writing beyond initialized size "
 619                                         "is not supported yet. Sorry.");
 620                         err = -EOPNOTSUPP;
 621                         break;
 622                         // Do NOT set_buffer_new() BUT DO clear buffer range
 623                         // outside write request range.
 624                         // set_buffer_uptodate() on complete buffers as well as
 625                         // set_buffer_dirty().
 626                 }
 627
 628                 /* No need to map buffers that are already mapped. */
 629                 if (buffer_mapped(bh))
 630                         continue;
 631
 632                 /* Unmapped, dirty buffer. Need to map it. */
 633                 bh->b_bdev = vol->sb->s_bdev;
 634
 635                 /* Convert block into corresponding vcn and offset. */
 636                 vcn = (VCN)block << blocksize_bits >> vol->cluster_size_bits;
 637                 vcn_ofs = ((VCN)block << blocksize_bits) &
 638                                 vol->cluster_size_mask;
 639                 if (!rl) {
 640 lock_retry_remap:
 641                         down_read(&ni->runlist.lock);
 642                         rl = ni->runlist.rl;
 643                 }
 644                 if (likely(rl != NULL)) {
 645                         /* Seek to element containing target vcn. */
 646                         while (rl->length && rl[1].vcn <= vcn)
 647                                 rl++;
 648                         lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
 649                 } else
 650                         lcn = LCN_RL_NOT_MAPPED;
 651                 /* Successful remap. */
 652                 if (lcn >= 0) {
 653                         /* Setup buffer head to point to correct block. */
 654                         bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
 655                                         vcn_ofs) >> blocksize_bits;
 656                         set_buffer_mapped(bh);
 657                         continue;
 658                 }
 659                 /* It is a hole, need to instantiate it. */
 660                 if (lcn == LCN_HOLE) {
 661                         // TODO: Instantiate the hole.
 662                         // clear_buffer_new(bh);
 663                         // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 664                         ntfs_error(vol->sb, "Writing into sparse regions is "
 665                                         "not supported yet. Sorry.");
 666                         err = -EOPNOTSUPP;
 667                         break;
 668                 }
 669                 /* If first try and runlist unmapped, map and retry. */
 670                 if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
 671                         is_retry = TRUE;
 672                         /*
 673                          * Attempt to map runlist, dropping lock for
 674                          * the duration.
 675                          */
 676                         up_read(&ni->runlist.lock);
 677                         err = ntfs_map_runlist(ni, vcn);
 678                         if (likely(!err))
 679                                 goto lock_retry_remap;
 680                         rl = NULL;
 681                 }
 682                 /* Failed to map the buffer, even after retrying. */
 683                 bh->b_blocknr = -1UL;
 684                 ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = 0x%llx) failed "
 685                                 "with error code 0x%llx%s.",
 686                                 (unsigned long long)vcn,
 687                                 (unsigned long long)-lcn,
 688                                 is_retry ? " even after retrying" : "");
 689                 // FIXME: Depending on vol->on_errors, do something.
 690                 if (!err)
 691                         err = -EIO;
 692                 break;
 693         } while (block++, (bh = bh->b_this_page) != head);
 694
 695         /* Release the lock if we took it. */
 696         if (rl)
 697                 up_read(&ni->runlist.lock);
 698
 699         /* For the error case, need to reset bh to the beginning. */
 700         bh = head;
 701
 702         /* Just an optimization, so ->readpage() isn't called later. */
 703         if (unlikely(!PageUptodate(page))) {
 704                 int uptodate = 1;
 705                 do {
 706                         if (!buffer_uptodate(bh)) {
 707                                 uptodate = 0;
 708                                 bh = head;
 709                                 break;
 710                         }
 711                 } while ((bh = bh->b_this_page) != head);
 712                 if (uptodate)
 713                         SetPageUptodate(page);
 714         }
 715
 716         /* Setup all mapped, dirty buffers for async write i/o. */
 717         do {
 718                 get_bh(bh);
 719                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
 720                         lock_buffer(bh);
 721                         if (test_clear_buffer_dirty(bh)) {
 722                                 BUG_ON(!buffer_uptodate(bh));
 723                                 mark_buffer_async_write(bh);
 724                         } else
 725                                 unlock_buffer(bh);
 726                 } else if (unlikely(err)) {
 727                         /*
 728                          * For the error case. The buffer may have been set
 729                          * dirty during attachment to a dirty page.
 730                          */
 731                         if (err != -ENOMEM)
 732                                 clear_buffer_dirty(bh);
 733                 }
 734         } while ((bh = bh->b_this_page) != head);
 735
 736         if (unlikely(err)) {
 737                 // TODO: Remove the -EOPNOTSUPP check later on...
 738                 if (unlikely(err == -EOPNOTSUPP))
 739                         err = 0;
 740                 else if (err == -ENOMEM) {
 741                         ntfs_warning(vol->sb, "Error allocating memory. "
 742                                         "Redirtying page so we try again "
 743                                         "later.");
 744                         /*
 745                          * Put the page back on mapping->dirty_pages, but
 746                          * leave its buffer's dirty state as-is.
 747                          */
 748                         redirty_page_for_writepage(wbc, page);
 749                         err = 0;
 750                 } else
 751                         SetPageError(page);
 752         }
 753
 754         BUG_ON(PageWriteback(page));
 755         set_page_writeback(page);       /* Keeps try_to_free_buffers() away. */
 756         unlock_page(page);
 757
 758         /*
 759          * Submit the prepared buffers for i/o. Note the page is unlocked,
 760          * and the async write i/o completion handler can end_page_writeback()
 761          * at any time after the *first* submit_bh(). So the buffers can then
 762          * disappear...
 763          */
 764         need_end_writeback = TRUE;
 765         do {
 766                 struct buffer_head *next = bh->b_this_page;
 767                 if (buffer_async_write(bh)) {
 768                         submit_bh(WRITE, bh);
 769                         need_end_writeback = FALSE;
 770                 }
 771                 put_bh(bh);
 772                 bh = next;
 773         } while (bh != head);
 774
 775         /* If no i/o was started, need to end_page_writeback(). */
 776         if (unlikely(need_end_writeback))
 777                 end_page_writeback(page);
 778
 779         ntfs_debug("Done.");
 780         return err;
 781 }
 782
 783 /**
 784  * ntfs_write_mst_block - write a @page to the backing store
 785  * @wbc:        writeback control structure
 786  * @page:       page cache page to write out
 787  *
 788  * This function is for writing pages belonging to non-resident, mst protected
 789  * attributes to their backing store.  The only supported attributes are index
 790  * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
 791  * supported for the index allocation case.
 792  *
 793  * The page must remain locked for the duration of the write because we apply
 794  * the mst fixups, write, and then undo the fixups, so if we were to unlock the
 795  * page before undoing the fixups, any other user of the page will see the
 796  * page contents as corrupt.
 797  *
 798  * We clear the page uptodate flag for the duration of the function to ensure
 799  * exclusion for the $MFT/$DATA case against someone mapping an mft record we
 800  * are about to apply the mst fixups to.
 801  *
 802  * Return 0 on success and -errno on error.
 803  *
 804  * Based on ntfs_write_block(), ntfs_mft_writepage(), and
 805  * write_mft_record_nolock().
 806  */
 807 static int ntfs_write_mst_block(struct writeback_control *wbc,
 808                 struct page *page)
 809 {
 810         sector_t block, dblock, rec_block;
 811         struct inode *vi = page->mapping->host;
 812         ntfs_inode *ni = NTFS_I(vi);
 813         ntfs_volume *vol = ni->vol;
 814         u8 *kaddr;
 815         unsigned int bh_size = 1 << vi->i_blkbits;
 816         unsigned int rec_size = ni->itype.index.block_size;
 817         ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
 818         struct buffer_head *bh, *head, *tbh;
 819         int max_bhs = PAGE_CACHE_SIZE / bh_size;
 820         struct buffer_head *bhs[max_bhs];
 821         int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err;
 822         unsigned char bh_size_bits, rec_size_bits;
 823         BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
 824
 825         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
 826                         "0x%lx.", vi->i_ino, ni->type, page->index);
 827         BUG_ON(!NInoNonResident(ni));
 828         BUG_ON(!NInoMstProtected(ni));
 829         is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
 830         BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
 831                         (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
 832         BUG_ON(!max_bhs);
 833
 834         /* Were we called for sync purposes? */
 835         sync = (wbc->sync_mode == WB_SYNC_ALL);
 836
 837         /* Make sure we have mapped buffers. */
 838         BUG_ON(!page_has_buffers(page));
 839         bh = head = page_buffers(page);
 840         BUG_ON(!bh);
 841
 842         bh_size_bits = vi->i_blkbits;
 843         rec_size_bits = ni->itype.index.block_size_bits;
 844         BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
 845         bhs_per_rec = rec_size >> bh_size_bits;
 846         BUG_ON(!bhs_per_rec);
 847
 848         /* The first block in the page. */
 849         rec_block = block = (sector_t)page->index <<
 850                         (PAGE_CACHE_SHIFT - bh_size_bits);
 851
 852         /* The first out of bounds block for the data size. */
 853         dblock = (vi->i_size + bh_size - 1) >> bh_size_bits;
 854
 855         err = nr_bhs = nr_recs = nr_locked_nis = 0;
 856         page_is_dirty = rec_is_dirty = FALSE;
 857         do {
 858                 if (unlikely(block >= dblock)) {
 859                         /*
 860                          * Mapped buffers outside i_size will occur, because
 861                          * this page can be outside i_size when there is a
 862                          * truncate in progress.  The contents of such buffers
 863                          * were zeroed by ntfs_writepage().
 864                          *
 865                          * FIXME: What about the small race window where
 866                          * ntfs_writepage() has not done any clearing because
 867                          * the page was within i_size but before we get here,
 868                          * vmtruncate() modifies i_size?
 869                          */
 870                         clear_buffer_dirty(bh);
 871                         continue;
 872                 }
 873                 if (likely(block < rec_block)) {
 874                         /*
 875                          * This block is not the first one in the record.  We
 876                          * ignore the buffer's dirty state because we could
 877                          * have raced with a parallel mark_ntfs_record_dirty().
 878                          */
 879                         if (!rec_is_dirty)
 880                                 continue;
 881                 } else /* if (block == rec_block) */ {
 882                         BUG_ON(block > rec_block);
 883                         /* This block is the first one in the record. */
 884                         rec_block += bhs_per_rec;
 885                         if (!buffer_dirty(bh)) {
 886                                 /* Clean records are not written out. */
 887                                 rec_is_dirty = FALSE;
 888                                 continue;
 889                         }
 890                         rec_is_dirty = TRUE;
 891                 }
 892                 BUG_ON(!buffer_mapped(bh));
 893                 BUG_ON(!buffer_uptodate(bh));
 894                 bhs[nr_bhs++] = bh;
 895                 BUG_ON(nr_bhs > max_bhs);
 896         } while (block++, (bh = bh->b_this_page) != head);
 897         /* If there were no dirty buffers, we are done. */
 898         if (!nr_bhs)
 899                 goto done;
 900         /* Map the page so we can access its contents. */
 901         kaddr = kmap(page);
 902         /* Clear the page uptodate flag whilst the mst fixups are applied. */
 903         BUG_ON(!PageUptodate(page));
 904         ClearPageUptodate(page);
 905         for (i = 0; i < nr_bhs; i++) {
 906                 unsigned int ofs;
 907
 908                 /* Skip buffers which are not at the beginning of records. */
 909                 if (i % bhs_per_rec)
 910                         continue;
 911                 tbh = bhs[i];
 912                 ofs = bh_offset(tbh);
 913                 if (is_mft) {
 914                         ntfs_inode *tni;
 915                         unsigned long mft_no;
 916
 917                         /* Get the mft record number. */
 918                         mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
 919                                         >> rec_size_bits;
 920                         /* Check whether to write this mft record. */
 921                         tni = NULL;
 922                         if (!ntfs_may_write_mft_record(vol, mft_no,
 923                                         (MFT_RECORD*)(kaddr + ofs), &tni)) {
 924                                 /*
 925                                  * The record should not be written.  This
 926                                  * means we need to redirty the page before
 927                                  * returning.
 928                                  */
 929                                 page_is_dirty = TRUE;
 930                                 /*
 931                                  * Remove the buffers in this mft record from
 932                                  * the list of buffers to write.
 933                                  */
 934                                 do {
 935                                         bhs[i] = NULL;
 936                                 } while (++i % bhs_per_rec);
 937                                 continue;
 938                         }
 939                         /*
 940                          * The record should be written.  If a locked ntfs
 941                          * inode was returned, add it to the array of locked
 942                          * ntfs inodes.
 943                          */
 944                         if (tni)
 945                                 locked_nis[nr_locked_nis++] = tni;
 946                 }
 947                 /* Apply the mst protection fixups. */
 948                 err = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
 949                                 rec_size);
 950                 if (unlikely(err)) {
 951                         ntfs_error(vol->sb, "Failed to apply mst fixups "
 952                                         "(inode 0x%lx, attribute type 0x%x, "
 953                                         "page index 0x%lx, page offset 0x%x)!"
 954                                         "  Unmount and run chkdsk.", vi->i_ino,
 955                                         ni->type, page->index, ofs);
 956                         /*
 957                          * Mark all the buffers in this record clean as we do
 958                          * not want to write corrupt data to disk.
 959                          */
 960                         do {
 961                                 clear_buffer_dirty(bhs[i]);
 962                                 bhs[i] = NULL;
 963                         } while (++i % bhs_per_rec);
 964                         continue;
 965                 }
 966                 nr_recs++;
 967         }
 968         /* If no records are to be written out, we are done. */
 969         if (!nr_recs)
 970                 goto unm_done;
 971         flush_dcache_page(page);
 972         /* Lock buffers and start synchronous write i/o on them. */
 973         for (i = 0; i < nr_bhs; i++) {
 974                 tbh = bhs[i];
 975                 if (!tbh)
 976                         continue;
 977                 if (unlikely(test_set_buffer_locked(tbh)))
 978                         BUG();
 979                 /* The buffer dirty state is now irrelevant, just clean it. */
 980                 clear_buffer_dirty(tbh);
 981                 BUG_ON(!buffer_uptodate(tbh));
 982                 BUG_ON(!buffer_mapped(tbh));
 983                 get_bh(tbh);
 984                 tbh->b_end_io = end_buffer_write_sync;
 985                 submit_bh(WRITE, tbh);
 986         }
 987         /* Synchronize the mft mirror now if not @sync. */
 988         if (is_mft && !sync)
 989                 goto do_mirror;
 990 do_wait:
 991         /* Wait on i/o completion of buffers. */
 992         for (i = 0; i < nr_bhs; i++) {
 993                 tbh = bhs[i];
 994                 if (!tbh)
 995                         continue;
 996                 wait_on_buffer(tbh);
 997                 if (unlikely(!buffer_uptodate(tbh))) {
 998                         ntfs_error(vol->sb, "I/O error while writing ntfs "
 999                                         "record buffer (inode 0x%lx, "
1000                                         "attribute type 0x%x, page index "
1001                                         "0x%lx, page offset 0x%lx)!  Unmount "
1002                                         "and run chkdsk.", vi->i_ino, ni->type,
1003                                         page->index, bh_offset(tbh));
1004                         err = -EIO;
1005                         /*
1006                          * Set the buffer uptodate so the page and buffer
1007                          * states do not become out of sync.
1008                          */
1009                         set_buffer_uptodate(tbh);
1010                 }
1011         }
1012         /* If @sync, now synchronize the mft mirror. */
1013         if (is_mft && sync) {
1014 do_mirror:
1015                 for (i = 0; i < nr_bhs; i++) {
1016                         unsigned long mft_no;
1017                         unsigned int ofs;
1018
1019                         /*
1020                          * Skip buffers which are not at the beginning of
1021                          * records.
1022                          */
1023                         if (i % bhs_per_rec)
1024                                 continue;
1025                         tbh = bhs[i];
1026                         /* Skip removed buffers (and hence records). */
1027                         if (!tbh)
1028                                 continue;
1029                         ofs = bh_offset(tbh);
1030                         /* Get the mft record number. */
1031                         mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
1032                                         >> rec_size_bits;
1033                         if (mft_no < vol->mftmirr_size)
1034                                 ntfs_sync_mft_mirror(vol, mft_no,
1035                                                 (MFT_RECORD*)(kaddr + ofs),
1036                                                 sync);
1037                 }
1038                 if (!sync)
1039                         goto do_wait;
1040         }
1041         /* Remove the mst protection fixups again. */
1042         for (i = 0; i < nr_bhs; i++) {
1043                 if (!(i % bhs_per_rec)) {
1044                         tbh = bhs[i];
1045                         if (!tbh)
1046                                 continue;
1047                         post_write_mst_fixup((NTFS_RECORD*)(kaddr +
1048                                         bh_offset(tbh)));
1049                 }
1050         }
1051         flush_dcache_page(page);
1052 unm_done:
1053         /* Unlock any locked inodes. */
1054         while (nr_locked_nis-- > 0) {
1055                 ntfs_inode *tni, *base_tni;
1056
1057                 tni = locked_nis[nr_locked_nis];
1058                 /* Get the base inode. */
1059                 down(&tni->extent_lock);
1060                 if (tni->nr_extents >= 0)
1061                         base_tni = tni;
1062                 else {
1063                         base_tni = tni->ext.base_ntfs_ino;
1064                         BUG_ON(!base_tni);
1065                 }
1066                 up(&tni->extent_lock);
1067                 ntfs_debug("Unlocking %s inode 0x%lx.",
1068                                 tni == base_tni ? "base" : "extent",
1069                                 tni->mft_no);
1070                 up(&tni->mrec_lock);
1071                 atomic_dec(&tni->count);
1072                 iput(VFS_I(base_tni));
1073         }
1074         if (unlikely(err)) {
1075                 SetPageError(page);
1076                 NVolSetErrors(vol);
1077         }
1078         SetPageUptodate(page);
1079         kunmap(page);
1080 done:
1081         if (page_is_dirty) {
1082                 ntfs_debug("Page still contains one or more dirty ntfs "
1083                                 "records.  Redirtying the page starting at "
1084                                 "record 0x%lx.", page->index <<
1085                                 (PAGE_CACHE_SHIFT - rec_size_bits));
1086                 redirty_page_for_writepage(wbc, page);
1087                 unlock_page(page);
1088         } else {
1089                 /*
1090                  * Keep the VM happy.  This must be done otherwise the
1091                  * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1092                  * the page is clean.
1093                  */
1094                 BUG_ON(PageWriteback(page));
1095                 set_page_writeback(page);
1096                 unlock_page(page);
1097                 end_page_writeback(page);
1098         }
1099         if (likely(!err))
1100                 ntfs_debug("Done.");
1101         return err;
1102 }
1103
1104 /**
1105  * ntfs_writepage - write a @page to the backing store
1106  * @page:       page cache page to write out
1107  * @wbc:        writeback control structure
1108  *
1109  * This is called from the VM when it wants to have a dirty ntfs page cache
1110  * page cleaned.  The VM has already locked the page and marked it clean.
1111  *
1112  * For non-resident attributes, ntfs_writepage() writes the @page by calling
1113  * the ntfs version of the generic block_write_full_page() function,
1114  * ntfs_write_block(), which in turn if necessary creates and writes the
1115  * buffers associated with the page asynchronously.
1116  *
1117  * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1118  * the data to the mft record (which at this stage is most likely in memory).
1119  * The mft record is then marked dirty and written out asynchronously via the
1120  * vfs inode dirty code path.
1121  *
1122  * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
1123  *
1124  * Return 0 on success and -errno on error.
1125  */
1126 static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1127 {
1128         s64 attr_pos;
1129         struct inode *vi;
1130         ntfs_inode *ni, *base_ni;
1131         char *kaddr;
1132         ntfs_attr_search_ctx *ctx;
1133         MFT_RECORD *m;
1134         u32 attr_len, bytes;
1135         int err;
1136
1137         BUG_ON(!PageLocked(page));
1138
1139         vi = page->mapping->host;
1140
1141         /* Is the page fully outside i_size? (truncate in progress) */
1142         if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >>
1143                         PAGE_CACHE_SHIFT)) {
1144                 unlock_page(page);
1145                 ntfs_debug("Write outside i_size - truncated?");
1146                 return 0;
1147         }
1148
1149         ni = NTFS_I(vi);
1150
1151         /* NInoNonResident() == NInoIndexAllocPresent() */
1152         if (NInoNonResident(ni)) {
1153                 /*
1154                  * Only unnamed $DATA attributes can be compressed, encrypted,
1155                  * and/or sparse.
1156                  */
1157                 if (ni->type == AT_DATA && !ni->name_len) {
1158                         /* If file is encrypted, deny access, just like NT4. */
1159                         if (NInoEncrypted(ni)) {
1160                                 unlock_page(page);
1161                                 ntfs_debug("Denying write access to encrypted "
1162                                                 "file.");
1163                                 return -EACCES;
1164                         }
1165                         /* Compressed data streams are handled in compress.c. */
1166                         if (NInoCompressed(ni)) {
1167                                 // TODO: Implement and replace this check with
1168                                 // return ntfs_write_compressed_block(page);
1169                                 unlock_page(page);
1170                                 ntfs_error(vi->i_sb, "Writing to compressed "
1171                                                 "files is not supported yet. "
1172                                                 "Sorry.");
1173                                 return -EOPNOTSUPP;
1174                         }
1175                         // TODO: Implement and remove this check.
1176                         if (NInoSparse(ni)) {
1177                                 unlock_page(page);
1178                                 ntfs_error(vi->i_sb, "Writing to sparse files "
1179                                                 "is not supported yet. Sorry.");
1180                                 return -EOPNOTSUPP;
1181                         }
1182                 }
1183                 /* We have to zero every time due to mmap-at-end-of-file. */
1184                 if (page->index >= (vi->i_size >> PAGE_CACHE_SHIFT)) {
1185                         /* The page straddles i_size. */
1186                         unsigned int ofs = vi->i_size & ~PAGE_CACHE_MASK;
1187                         kaddr = kmap_atomic(page, KM_USER0);
1188                         memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
1189                         flush_dcache_page(page);
1190                         kunmap_atomic(kaddr, KM_USER0);
1191                 }
1192                 /* Handle mst protected attributes. */
1193                 if (NInoMstProtected(ni))
1194                         return ntfs_write_mst_block(wbc, page);
1195                 /* Normal data stream. */
1196                 return ntfs_write_block(wbc, page);
1197         }
1198
1199         /*
1200          * Attribute is resident, implying it is not compressed, encrypted, or
1201          * mst protected.
1202          */
1203         BUG_ON(page_has_buffers(page));
1204         BUG_ON(!PageUptodate(page));
1205
1206         if (!NInoAttr(ni))
1207                 base_ni = ni;
1208         else
1209                 base_ni = ni->ext.base_ntfs_ino;
1210
1211         /* Map, pin, and lock the mft record. */
1212         m = map_mft_record(base_ni);
1213         if (IS_ERR(m)) {
1214                 err = PTR_ERR(m);
1215                 m = NULL;
1216                 ctx = NULL;
1217                 goto err_out;
1218         }
1219         ctx = ntfs_attr_get_search_ctx(base_ni, m);
1220         if (unlikely(!ctx)) {
1221                 err = -ENOMEM;
1222                 goto err_out;
1223         }
1224         err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1225                         CASE_SENSITIVE, 0, NULL, 0, ctx);
1226         if (unlikely(err))
1227                 goto err_out;
1228
1229         /* Starting position of the page within the attribute value. */
1230         attr_pos = page->index << PAGE_CACHE_SHIFT;
1231
1232         /* The total length of the attribute value. */
1233         attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1234
1235         if (unlikely(vi->i_size != attr_len)) {
1236                 ntfs_error(vi->i_sb, "BUG()! i_size (0x%llx) doesn't match "
1237                                 "attr_len (0x%x). Aborting write.", vi->i_size,
1238                                 attr_len);
1239                 err = -EIO;
1240                 goto err_out;
1241         }
1242         if (unlikely(attr_pos >= attr_len)) {
1243                 ntfs_error(vi->i_sb, "BUG()! attr_pos (0x%llx) > attr_len "
1244                                 "(0x%x). Aborting write.",
1245                                 (unsigned long long)attr_pos, attr_len);
1246                 err = -EIO;
1247                 goto err_out;
1248         }
1249
1250         bytes = attr_len - attr_pos;
1251         if (unlikely(bytes > PAGE_CACHE_SIZE))
1252                 bytes = PAGE_CACHE_SIZE;
1253
1254         /*
1255          * Keep the VM happy.  This must be done otherwise the radix-tree tag
1256          * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
1257          */
1258         BUG_ON(PageWriteback(page));
1259         set_page_writeback(page);
1260         unlock_page(page);
1261
1262         /*
1263          * Here, we don't need to zero the out of bounds area everytime because
1264          * the below memcpy() already takes care of the mmap-at-end-of-file
1265          * requirements. If the file is converted to a non-resident one, then
1266          * the code path use is switched to the non-resident one where the
1267          * zeroing happens on each ntfs_writepage() invocation.
1268          *
1269          * The above also applies nicely when i_size is decreased.
1270          *
1271          * When i_size is increased, the memory between the old and new i_size
1272          * _must_ be zeroed (or overwritten with new data). Otherwise we will
1273          * expose data to userspace/disk which should never have been exposed.
1274          *
1275          * FIXME: Ensure that i_size increases do the zeroing/overwriting and
1276          * if we cannot guarantee that, then enable the zeroing below.  If the
1277          * zeroing below is enabled, we MUST move the unlock_page() from above
1278          * to after the kunmap_atomic(), i.e. just before the
1279          * end_page_writeback().
1280          */
1281
1282         kaddr = kmap_atomic(page, KM_USER0);
1283         /* Copy the data from the page to the mft record. */
1284         memcpy((u8*)ctx->attr + le16_to_cpu(
1285                         ctx->attr->data.resident.value_offset) + attr_pos,
1286                         kaddr, bytes);
1287         flush_dcache_mft_record_page(ctx->ntfs_ino);
1288 #if 0
1289         /* Zero out of bounds area. */
1290         if (likely(bytes < PAGE_CACHE_SIZE)) {
1291                 memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
1292                 flush_dcache_page(page);
1293         }
1294 #endif
1295         kunmap_atomic(kaddr, KM_USER0);
1296
1297         end_page_writeback(page);
1298
1299         /* Mark the mft record dirty, so it gets written back. */
1300         mark_mft_record_dirty(ctx->ntfs_ino);
1301
1302         ntfs_attr_put_search_ctx(ctx);
1303         unmap_mft_record(base_ni);
1304         return 0;
1305 err_out:
1306         if (err == -ENOMEM) {
1307                 ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1308                                 "page so we try again later.");
1309                 /*
1310                  * Put the page back on mapping->dirty_pages, but leave its
1311                  * buffer's dirty state as-is.
1312                  */
1313                 redirty_page_for_writepage(wbc, page);
1314                 err = 0;
1315         } else {
1316                 ntfs_error(vi->i_sb, "Resident attribute write failed with "
1317                                 "error %i. Setting page error flag.", -err);
1318                 SetPageError(page);
1319         }
1320         unlock_page(page);
1321         if (ctx)
1322                 ntfs_attr_put_search_ctx(ctx);
1323         if (m)
1324                 unmap_mft_record(base_ni);
1325         return err;
1326 }
1327
1328 /**
1329  * ntfs_prepare_nonresident_write - prepare page buffers for a write
      * @page:       page of a non-resident attribute that is about to be written
      * @from:       offset in @page of the first byte that will be written
      * @to:         offset in @page of the first byte after the write
1330  *
      * Attach buffers to @page if it has none, then walk all buffers in the page.
      * Buffers lying completely outside [@from, @to) are only marked uptodate if
      * the page itself is uptodate. Every buffer touched by the write is mapped
      * to its on-disk block using the runlist of the ntfs inode (the runlist is
      * mapped on demand and the lookup is retried once). A mapped buffer that is
      * not uptodate and is only partially overwritten is read in, and the reads
      * are waited for before returning.
      *
      * Writing beyond the allocated size, beyond the initialized size (when the
      * initialized size is below i_size), and into holes is not implemented; all
      * three fail with -EOPNOTSUPP.
      *
      * The attribute must be non-resident and must not be MST protected (both
      * are enforced with BUG_ON()).
      *
      * Return 0 on success, -ENOMEM if the page has no buffer head, -EOPNOTSUPP
      * for the unimplemented cases above, the error from ntfs_map_runlist() if
      * mapping the runlist failed, or -EIO if the block could not be mapped for
      * another reason or a read did not leave the buffer uptodate.
1331  */
1332 static int ntfs_prepare_nonresident_write(struct page *page,
1333                 unsigned from, unsigned to)
1334 {
1335         VCN vcn;
1336         LCN lcn;
1337         sector_t block, ablock, iblock;
1338         struct inode *vi;
1339         ntfs_inode *ni;
1340         ntfs_volume *vol;
1341         runlist_element *rl;
             /*
              * wait[] collects the buffers for which a read was issued. Only
              * buffers that the write covers partially are read, and one
              * contiguous range [from, to) can partially cover at most its first
              * and its last buffer, so at most two entries are ever stored.
              */
1342         struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1343         unsigned int vcn_ofs, block_start, block_end, blocksize;
1344         int err;
1345         BOOL is_retry;
1346         unsigned char blocksize_bits;
1347
1348         vi = page->mapping->host;
1349         ni = NTFS_I(vi);
1350         vol = ni->vol;
1351
1352         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1353                         "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1354                         page->index, from, to);
1355
1356         BUG_ON(!NInoNonResident(ni));
1357         BUG_ON(NInoMstProtected(ni));
1358
1359         blocksize_bits = vi->i_blkbits;
1360         blocksize = 1 << blocksize_bits;
1361
1362         /*
1363          * create_empty_buffers() will create uptodate/dirty buffers if the
1364          * page is uptodate/dirty.
1365          */
1366         if (!page_has_buffers(page))
1367                 create_empty_buffers(page, blocksize, 0);
1368         bh = head = page_buffers(page);
1369         if (unlikely(!bh))
1370                 return -ENOMEM;
1371
1372         /* The first block in the page. */
1373         block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
1374
1375         /*
1376          * The first out of bounds block for the allocated size. No need to
1377          * round up as allocated_size is in multiples of cluster size and the
1378          * minimum cluster size is 512 bytes, which is equal to the smallest
1379          * blocksize.
1380          */
1381         ablock = ni->allocated_size >> blocksize_bits;
1382
1383         /* The last (fully or partially) initialized block. */
1384         iblock = ni->initialized_size >> blocksize_bits;
1385
1386         /*
              * Loop through all the buffers in the page. Note that every
              * "continue" below jumps to the loop condition at the bottom, which
              * is what advances @block, @block_start and @bh to the next buffer.
              * @rl doubles as the "runlist lock is held" indicator: it is only
              * non-NULL while ni->runlist.lock is held for reading.
              */
1387         block_start = 0;
1388         rl = NULL;
1389         err = 0;
1390         do {
1391                 block_end = block_start + blocksize;
1392                 /*
1393                  * If buffer @bh is outside the write, just mark it uptodate
1394                  * if the page is uptodate and continue with the next buffer.
1395                  */
1396                 if (block_end <= from || block_start >= to) {
1397                         if (PageUptodate(page)) {
1398                                 if (!buffer_uptodate(bh))
1399                                         set_buffer_uptodate(bh);
1400                         }
1401                         continue;
1402                 }
1403                 /*
1404                  * @bh is at least partially being written to.
1405                  * Make sure it is not marked as new.
1406                  */
1407                 //if (buffer_new(bh))
1408                 //      clear_buffer_new(bh);
1409
1410                 if (block >= ablock) {
1411                         // TODO: block is above allocated_size, need to
1412                         // allocate it. Best done in one go to accommodate not
1413                         // only block but all above blocks up to and including:
1414                         // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
1415                         // - 1) >> blocksize_bits. Obviously will need to round
1416                         // up to next cluster boundary, too. This should be
1417                         // done with a helper function, so it can be reused.
1418                         ntfs_error(vol->sb, "Writing beyond allocated size "
1419                                         "is not supported yet. Sorry.");
1420                         err = -EOPNOTSUPP;
1421                         goto err_out;
1422                         // Need to update ablock.
1423                         // Need to set_buffer_new() on all block bhs that are
1424                         // newly allocated.
1425                 }
1426                 /*
1427                  * Now we have enough allocated size to fulfill the whole
1428                  * request, i.e. block < ablock is true.
1429                  */
1430                 if (unlikely((block >= iblock) &&
1431                                 (ni->initialized_size < vi->i_size))) {
1432                         /*
1433                          * If this page is fully outside initialized size, zero
1434                          * out all pages between the current initialized size
1435                          * and the current page. Just use ntfs_readpage() to do
1436                          * the zeroing transparently.
1437                          */
1438                         if (block > iblock) {
1439                                 // TODO:
1440                                 // For each page do:
1441                                 // - read_cache_page()
1442                                 // Again for each page do:
1443                                 // - wait_on_page_locked()
1444                                 // - Check (PageUptodate(page) &&
1445                                 //                      !PageError(page))
1446                                 // Update initialized size in the attribute and
1447                                 // in the inode.
1448                                 // Again, for each page do:
1449                                 //      __set_page_dirty_buffers();
1450                                 // page_cache_release()
1451                                 // We don't need to wait on the writes.
1452                                 // Update iblock.
1453                         }
1454                         /*
1455                          * The current page straddles initialized size. Zero
1456                          * all non-uptodate buffers and set them uptodate (and
1457                          * dirty?). Note, there aren't any non-uptodate buffers
1458                          * if the page is uptodate.
1459                          * FIXME: For an uptodate page, the buffers may need to
1460                          * be written out because they were not initialized on
1461                          * disk before.
1462                          */
1463                         if (!PageUptodate(page)) {
1464                                 // TODO:
1465                                 // Zero any non-uptodate buffers up to i_size.
1466                                 // Set them uptodate and dirty.
1467                         }
1468                         // TODO:
1469                         // Update initialized size in the attribute and in the
1470                         // inode (up to i_size).
1471                         // Update iblock.
1472                         // FIXME: This is inefficient. Try to batch the two
1473                         // size changes to happen in one go.
1474                         ntfs_error(vol->sb, "Writing beyond initialized size "
1475                                         "is not supported yet. Sorry.");
1476                         err = -EOPNOTSUPP;
1477                         goto err_out;
1478                         // Do NOT set_buffer_new() BUT DO clear buffer range
1479                         // outside write request range.
1480                         // set_buffer_uptodate() on complete buffers as well as
1481                         // set_buffer_dirty().
1482                 }
1483
1484                 /* Need to map unmapped buffers. */
1485                 if (!buffer_mapped(bh)) {
1486                         /* Unmapped buffer. Need to map it. */
1487                         bh->b_bdev = vol->sb->s_bdev;
1488
1489                         /* Convert block into corresponding vcn and offset. */
1490                         vcn = (VCN)block << blocksize_bits >>
1491                                         vol->cluster_size_bits;
1492                         vcn_ofs = ((VCN)block << blocksize_bits) &
1493                                         vol->cluster_size_mask;
1494
                             /*
                              * Take the runlist lock the first time a buffer
                              * needs mapping and keep it for the rest of the
                              * loop. The label sits inside the if so that the
                              * retry path below, which has dropped the lock,
                              * re-takes it and reloads @rl.
                              */
1495                         is_retry = FALSE;
1496                         if (!rl) {
1497 lock_retry_remap:
1498                                 down_read(&ni->runlist.lock);
1499                                 rl = ni->runlist.rl;
1500                         }
1501                         if (likely(rl != NULL)) {
1502                                 /* Seek to element containing target vcn. */
1503                                 while (rl->length && rl[1].vcn <= vcn)
1504                                         rl++;
1505                                 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1506                         } else
1507                                 lcn = LCN_RL_NOT_MAPPED;
1508                         if (unlikely(lcn < 0)) {
1509                                 /*
1510                                  * We extended the attribute allocation above.
1511                                  * If we hit an ENOENT here it means that the
1512                                  * allocation was insufficient which is a bug.
1513                                  */
1514                                 BUG_ON(lcn == LCN_ENOENT);
1515
1516                                 /* It is a hole, need to instantiate it. */
1517                                 if (lcn == LCN_HOLE) {
1518                                         // TODO: Instantiate the hole.
1519                                         // clear_buffer_new(bh);
1520                                         // unmap_underlying_metadata(bh->b_bdev,
1521                                         //              bh->b_blocknr);
1522                                         // For non-uptodate buffers, need to
1523                                         // zero out the region outside the
1524                                         // request in this bh or all bhs,
1525                                         // depending on what we implemented
1526                                         // above.
1527                                         // Need to flush_dcache_page().
1528                                         // Or could use set_buffer_new()
1529                                         // instead?
1530                                         ntfs_error(vol->sb, "Writing into "
1531                                                         "sparse regions is "
1532                                                         "not supported yet. "
1533                                                         "Sorry.");
1534                                         err = -EOPNOTSUPP;
1535                                         goto err_out;
1536                                 } else if (!is_retry &&
1537                                                 lcn == LCN_RL_NOT_MAPPED) {
1538                                         is_retry = TRUE;
1539                                         /*
1540                                          * Attempt to map runlist, dropping
1541                                          * lock for the duration.
1542                                          */
1543                                         up_read(&ni->runlist.lock);
1544                                         err = ntfs_map_runlist(ni, vcn);
1545                                         if (likely(!err))
1546                                                 goto lock_retry_remap;
                                             /*
                                              * Mapping failed and the lock is
                                              * not held; clear @rl so err_out
                                              * does not release it again.
                                              */
1547                                         rl = NULL;
1548                                 }
1549                                 /*
1550                                  * Failed to map the buffer, even after
1551                                  * retrying.
                                      * NOTE(review): if ni->runlist.rl is still
                                      * NULL after a successful retry, the lock is
                                      * held here while @rl is NULL, so err_out
                                      * would not release it - confirm that
                                      * ntfs_map_runlist() cannot succeed and
                                      * leave the runlist pointer NULL.
1552                                  */
1553                                 bh->b_blocknr = -1UL;
1554                                 ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = "
1555                                                 "0x%llx) failed with error "
1556                                                 "code 0x%llx%s.",
1557                                                 (unsigned long long)vcn,
1558                                                 (unsigned long long)-lcn,
1559                                                 is_retry ? " even after "
1560                                                 "retrying" : "");
1561                                 // FIXME: Depending on vol->on_errors, do
1562                                 // something.
                                     /* Keep the ntfs_map_runlist() error if any. */
1563                                 if (!err)
1564                                         err = -EIO;
1565                                 goto err_out;
1566                         }
1567                         /* We now have a successful remap, i.e. lcn >= 0. */
1568
1569                         /* Setup buffer head to correct block. */
1570                         bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1571                                         + vcn_ofs) >> blocksize_bits;
1572                         set_buffer_mapped(bh);
1573
1574                         // FIXME: Something analogous to this is needed for
1575                         // each newly allocated block, i.e. BH_New.
1576                         // FIXME: Might need to take this out of the
1577                         // if (!buffer_mapped(bh)) {}, depending on how we
1578                         // implement things during the allocated_size and
1579                         // initialized_size extension code above.
1580                         if (buffer_new(bh)) {
1581                                 clear_buffer_new(bh);
1582                                 unmap_underlying_metadata(bh->b_bdev,
1583                                                 bh->b_blocknr);
1584                                 if (PageUptodate(page)) {
1585                                         set_buffer_uptodate(bh);
1586                                         continue;
1587                                 }
1588                                 /*
1589                                  * Page is _not_ uptodate, zero surrounding
1590                                  * region. NOTE: This is how we decide if to
1591                                  * zero or not!
1592                                  */
1593                                 if (block_end > to || block_start < from) {
1594                                         void *kaddr;
1595
1596                                         kaddr = kmap_atomic(page, KM_USER0);
1597                                         if (block_end > to)
1598                                                 memset(kaddr + to, 0,
1599                                                                 block_end - to);
1600                                         if (block_start < from)
1601                                                 memset(kaddr + block_start, 0,
1602                                                                 from -
1603                                                                 block_start);
1604                                         flush_dcache_page(page);
1605                                         kunmap_atomic(kaddr, KM_USER0);
1606                                 }
1607                                 continue;
1608                         }
1609                 }
1610                 /* @bh is mapped, set it uptodate if the page is uptodate. */
1611                 if (PageUptodate(page)) {
1612                         if (!buffer_uptodate(bh))
1613                                 set_buffer_uptodate(bh);
1614                         continue;
1615                 }
1616                 /*
1617                  * The page is not uptodate. The buffer is mapped. If it is not
1618                  * uptodate, and it is only partially being written to, we need
1619                  * to read the buffer in before the write, i.e. right now.
1620                  */
1621                 if (!buffer_uptodate(bh) &&
1622                                 (block_start < from || block_end > to)) {
1623                         ll_rw_block(READ, 1, &bh);
1624                         *wait_bh++ = bh;
1625                 }
1626         } while (block++, block_start = block_end,
1627                         (bh = bh->b_this_page) != head);
1628
1629         /* Release the lock if we took it. */
1630         if (rl) {
1631                 up_read(&ni->runlist.lock);
1632                 rl = NULL;
1633         }
1634
1635         /*
              * If we issued read requests, let them complete.
              * NOTE(review): this returns on the first buffer that is not
              * uptodate without waiting for a remaining one, if any - confirm
              * that is intended.
              */
1636         while (wait_bh > wait) {
1637                 wait_on_buffer(*--wait_bh);
1638                 if (!buffer_uptodate(*wait_bh))
1639                         return -EIO;
1640         }
1641
1642         ntfs_debug("Done.");
1643         return 0;
1644 err_out:
1645         /*
1646          * Zero out any newly allocated blocks to avoid exposing stale data.
1647          * If BH_New is set, we know that the block was newly allocated in the
1648          * above loop.
1649          * FIXME: What about initialized_size increments? Have we done all the
1650          * required zeroing above? If not this error handling is broken, and
1651          * in particular the if (block_end <= from) check is completely bogus.
              *
              * Note @is_retry is reused below as a "did we zero anything" flag so
              * that flush_dcache_page() is only called when page data changed.
1652          */
1653         bh = head;
1654         block_start = 0;
1655         is_retry = FALSE;
1656         do {
1657                 block_end = block_start + blocksize;
1658                 if (block_end <= from)
1659                         continue;
1660                 if (block_start >= to)
1661                         break;
1662                 if (buffer_new(bh)) {
1663                         void *kaddr;
1664
1665                         clear_buffer_new(bh);
1666                         kaddr = kmap_atomic(page, KM_USER0);
1667                         memset(kaddr + block_start, 0, bh->b_size);
1668                         kunmap_atomic(kaddr, KM_USER0);
1669                         set_buffer_uptodate(bh);
1670                         mark_buffer_dirty(bh);
1671                         is_retry = TRUE;
1672                 }
1673         } while (block_start = block_end, (bh = bh->b_this_page) != head);
1674         if (is_retry)
1675                 flush_dcache_page(page);
             /* @rl is only non-NULL here if the runlist lock is still held. */
1676         if (rl)
1677                 up_read(&ni->runlist.lock);
1678         return err;
1679 }
1680
1681 /**
1682  * ntfs_prepare_write - prepare a page for receiving data
      * @file:       file being written to (not used by this function)
      * @page:       locked page cache page that is about to receive data
      * @from:       offset in @page of the first byte that will be written
      * @to:         offset in @page of the first byte after the write
1683  *
1684  * This is called from generic_file_write() with i_sem held on the inode
1685  * (@page->mapping->host). The @page is locked and kmap()ped so page_address()
1686  * can simply be used. The source data has not yet been copied into the @page.
1687  *
1688  * Need to extend the attribute/fill in holes if necessary, create blocks and
1689  * make partially overwritten blocks uptodate.
1690  *
1691  * i_size is not to be modified yet.
1692  *
1693  * Return 0 on success or -errno on error. In particular, -EACCES is returned
      * for an encrypted unnamed $DATA attribute and -EOPNOTSUPP for the cases that
      * are not implemented yet: compressed or sparse unnamed $DATA attributes,
      * MST protected attributes, and writes to a resident attribute that end
      * beyond i_size. For all other non-resident attributes the result of
      * ntfs_prepare_nonresident_write() is returned.
1694  *
1695  * Should be using block_prepare_write() [support for sparse files] or
1696  * cont_prepare_write() [no support for sparse files]. Can't do that due to
1697  * ntfs specifics but can look at them for implementation guidance.
1698  *
1699  * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1700  * the first byte in the page that will be written to and @to is the first byte
1701  * after the last byte that will be written to.
1702  */
1703 static int ntfs_prepare_write(struct file *file, struct page *page,
1704                 unsigned from, unsigned to)
1705 {
1706         struct inode *vi = page->mapping->host;
1707         ntfs_inode   *ni = NTFS_I(vi);
1708
1709         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1710                         "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1711                         page->index, from, to);
1712
             /* Sanity check the page state and the byte range within the page. */
1713         BUG_ON(!PageLocked(page));
1714         BUG_ON(from > PAGE_CACHE_SIZE);
1715         BUG_ON(to > PAGE_CACHE_SIZE);
1716         BUG_ON(from > to);
1717
1718         if (NInoNonResident(ni)) {
1719                 /*
1720                  * Only unnamed $DATA attributes can be compressed, encrypted,
1721                  * and/or sparse.
1722                  */
1723                 if (ni->type == AT_DATA && !ni->name_len) {
1724                         /* If file is encrypted, deny access, just like NT4. */
1725                         if (NInoEncrypted(ni)) {
1726                                 ntfs_debug("Denying write access to encrypted "
1727                                                 "file.");
1728                                 return -EACCES;
1729                         }
1730                         /* Compressed data streams are handled in compress.c. */
1731                         if (NInoCompressed(ni)) {
1732                                 // TODO: Implement and replace this check with
1733                                 // return ntfs_write_compressed_block(page);
1734                                 ntfs_error(vi->i_sb, "Writing to compressed "
1735                                                 "files is not supported yet. "
1736                                                 "Sorry.");
1737                                 return -EOPNOTSUPP;
1738                         }
1739                         // TODO: Implement and remove this check.
1740                         if (NInoSparse(ni)) {
1741                                 ntfs_error(vi->i_sb, "Writing to sparse files "
1742                                                 "is not supported yet. Sorry.");
1743                                 return -EOPNOTSUPP;
1744                         }
1745                 }
1746
1747                 // TODO: Implement and remove this check.
1748                 if (NInoMstProtected(ni)) {
1749                         ntfs_error(vi->i_sb, "Writing to MST protected "
1750                                         "attributes is not supported yet. "
1751                                         "Sorry.");
1752                         return -EOPNOTSUPP;
1753                 }
1754
1755                 /* Normal data stream. */
1756                 return ntfs_prepare_nonresident_write(page, from, to);
1757         }
1758
1759         /*
1760          * Attribute is resident, implying it is not compressed, encrypted, or
1761          * mst protected.
1762          */
1763         BUG_ON(page_has_buffers(page));
1764
1765         /*
              * Do we need to resize the attribute? The end of the write is
              * computed as a byte offset within the attribute (in 64 bits) and
              * compared against the current i_size.
              */
1766         if (((s64)page->index << PAGE_CACHE_SHIFT) + to > vi->i_size) {
1767                 // TODO: Implement resize...
1768                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1769                                 "not supported yet. Sorry.");
1770                 return -EOPNOTSUPP;
1771         }
1772
1773         /*
1774          * Because resident attributes are handled by memcpy() to/from the
1775          * corresponding MFT record, and because this form of i/o is byte
1776          * aligned rather than block aligned, there is no need to bring the
1777          * page uptodate here as in the non-resident case where we need to
1778          * bring the buffers straddled by the write uptodate before
1779          * generic_file_write() does the copying from userspace.
1780          *
1781          * We thus defer the uptodate bringing of the page region outside the
1782          * region written to to ntfs_commit_write(). The reason for doing this
1783          * is that we save one round of:
1784          *      map_mft_record(), ntfs_attr_get_search_ctx(),
1785          *      ntfs_attr_lookup(), kmap_atomic(), kunmap_atomic(),
1786          *      ntfs_attr_put_search_ctx(), unmap_mft_record().
1787          * Which is obviously a very worthwhile save.
1788          *
1789          * Thus we just return success now...
1790          */
1791         ntfs_debug("Done.");
1792         return 0;
1793 }
1794
1795 /*
1796  * NOTES: There is a disparity between the apparent need to extend the
1797  * attribute in prepare write but to update i_size only in commit write.
1798  * Need to make sure i_sem protection is sufficient. And if not will need to
1799  * handle this in some way or another.
1800  */
1801
1802 /**
1803  * ntfs_commit_nonresident_write - commit data written to page buffers
      * @page:       page of a non-resident attribute that has received the data
      * @from:       offset in @page of the first byte that was written
      * @to:         offset in @page of the first byte after the write
1804  *
      * Walk all buffers of @page. Every buffer overlapping [@from, @to) is marked
      * uptodate and dirty. If none of the buffers outside that range is found to
      * be not uptodate, the whole page is marked uptodate.
      *
      * Extending the file is not implemented: if the write ends beyond i_size,
      * -EOPNOTSUPP is returned. Note that this check is done last, i.e. after the
      * buffers have already been marked dirty.
      *
      * Return 0 on success and -EOPNOTSUPP if the write ends beyond i_size.
1805  */
1806 static int ntfs_commit_nonresident_write(struct page *page,
1807                 unsigned from, unsigned to)
1808 {
             /* Byte offset within the attribute of the end of the write. */
1809         s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1810         struct inode *vi;
1811         struct buffer_head *bh, *head;
1812         unsigned int block_start, block_end, blocksize;
1813         BOOL partial;
1814
1815         vi = page->mapping->host;
1816
1817         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1818                         "0x%lx, from = %u, to = %u.", vi->i_ino,
1819                         NTFS_I(vi)->type, page->index, from, to);
1820
1821         blocksize = 1 << vi->i_blkbits;
1822
1823         // FIXME: We need a whole slew of special cases in here for MST
1824         // protected attributes for example. For compressed files, too...
1825         // For now, we know ntfs_prepare_write() would have failed so we can't
1826         // get here in any of the cases which we have to special case, so we
1827         // are just a ripped off unrolled generic_commit_write() at present.
1828
             /*
              * @partial records whether a buffer outside the written range is
              * not uptodate, in which case the page as a whole is not uptodate.
              */
1829         bh = head = page_buffers(page);
1830         block_start = 0;
1831         partial = FALSE;
1832         do {
1833                 block_end = block_start + blocksize;
1834                 if (block_end <= from || block_start >= to) {
1835                         if (!buffer_uptodate(bh))
1836                                 partial = TRUE;
1837                 } else {
1838                         set_buffer_uptodate(bh);
1839                         mark_buffer_dirty(bh);
1840                 }
1841         } while (block_start = block_end, (bh = bh->b_this_page) != head);
1842
1843         /*
1844          * If this is a partial write which happened to make all buffers
1845          * uptodate then we can optimize away a bogus ->readpage() for the next
1846          * read(). Here we 'discover' whether the page went uptodate as a
1847          * result of this (potentially partial) write.
1848          */
1849         if (!partial)
1850                 SetPageUptodate(page);
1851
1852         /*
1853          * Not convinced about this at all. See disparity comment above. For
1854          * now we know ntfs_prepare_write() would have failed in the write
1855          * exceeds i_size case, so this will never trigger which is fine.
1856          */
1857         if (pos > vi->i_size) {
1858                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1859                                 "not supported yet. Sorry.");
1860                 return -EOPNOTSUPP;
1861                 // vi->i_size = pos;
1862                 // mark_inode_dirty(vi);
1863         }
1864         ntfs_debug("Done.");
1865         return 0;
1866 }
1867
1868 /**
1869  * ntfs_commit_write - commit the received data
1870  *
1871  * This is called from generic_file_write() with i_sem held on the inode
1872  * (@page->mapping->host). The @page is locked and kmap()ped so page_address()
1873  * can simply be used. The source data has already been copied into the @page.
1874  *
1875  * Need to mark modified blocks dirty so they get written out later when
1876  * ntfs_writepage() is invoked by the VM.
1877  *
1878  * Return 0 on success or -errno on error.
1879  *
1880  * Should be using generic_commit_write(). This marks buffers uptodate and
1881  * dirty, sets the page uptodate if all buffers in the page are uptodate, and
1882  * updates i_size if the end of io is beyond i_size. In that case, it also
1883  * marks the inode dirty. - We could still use this (obviously except for
1884  * NInoMstProtected() attributes, where we will need to duplicate the core code
1885  * because we need our own async_io completion handler) but we could just do
1886  * the i_size update in prepare write, when we resize the attribute. Then
1887  * we would avoid the i_size update and mark_inode_dirty() happening here.
1888  *
1889  * Can't use generic_commit_write() due to ntfs specialities but can look at
1890  * it for implementation guidance.
1891  *
1892  * If things have gone as outlined in ntfs_prepare_write(), then we do not
1893  * need to do any page content modifications here at all, except in the write
1894  * to resident attribute case, where we need to do the uptodate bringing here
1895  * which we combine with the copying into the mft record which means we only
1896  * need to map the mft record and find the attribute record in it only once.
1897  */
1898 static int ntfs_commit_write(struct file *file, struct page *page,
1899                 unsigned from, unsigned to)
1900 {
1901         s64 attr_pos;
1902         struct inode *vi;
1903         ntfs_inode *ni, *base_ni;
1904         char *kaddr, *kattr;
1905         ntfs_attr_search_ctx *ctx;
1906         MFT_RECORD *m;
1907         u32 attr_len, bytes;
1908         int err;
1909
1910         vi = page->mapping->host;
1911         ni = NTFS_I(vi);
1912
1913         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1914                         "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1915                         page->index, from, to);
1916
1917         if (NInoNonResident(ni)) {
1918                 /*
1919                  * Only unnamed $DATA attributes can be compressed, encrypted,
1920                  * and/or sparse.
1921                  */
1922                 if (ni->type == AT_DATA && !ni->name_len) {
1923                         /* If file is encrypted, deny access, just like NT4. */
1924                         if (NInoEncrypted(ni)) {
1925                                 // Should never get here!
1926                                 ntfs_debug("Denying write access to encrypted "
1927                                                 "file.");
1928                                 return -EACCES;
1929                         }
1930                         /* Compressed data streams are handled in compress.c. */
1931                         if (NInoCompressed(ni)) {
1932                                 // TODO: Implement and replace this check with
1933                                 // return ntfs_write_compressed_block(page);
1934                                 // Should never get here!
1935                                 ntfs_error(vi->i_sb, "Writing to compressed "
1936                                                 "files is not supported yet. "
1937                                                 "Sorry.");
1938                                 return -EOPNOTSUPP;
1939                         }
1940                         // TODO: Implement and remove this check.
1941                         if (NInoSparse(ni)) {
1942                                 // Should never get here!
1943                                 ntfs_error(vi->i_sb, "Writing to sparse files "
1944                                                 "is not supported yet. Sorry.");
1945                                 return -EOPNOTSUPP;
1946                         }
1947                 }
1948
1949                 // TODO: Implement and remove this check.
1950                 if (NInoMstProtected(ni)) {
1951                         // Should never get here!
1952                         ntfs_error(vi->i_sb, "Writing to MST protected "
1953                                         "attributes is not supported yet. "
1954                                         "Sorry.");
1955                         return -EOPNOTSUPP;
1956                 }
1957
1958                 /* Normal data stream. */
1959                 return ntfs_commit_nonresident_write(page, from, to);
1960         }
1961
1962         /*
1963          * Attribute is resident, implying it is not compressed, encrypted, or
1964          * mst protected.
1965          */
1966
1967         /* Do we need to resize the attribute? */
1968         if (((s64)page->index << PAGE_CACHE_SHIFT) + to > vi->i_size) {
1969                 // TODO: Implement resize...
1970                 // pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1971                 // vi->i_size = pos;
1972                 // mark_inode_dirty(vi);
1973                 // Should never get here!
1974                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1975                                 "not supported yet. Sorry.");
1976                 return -EOPNOTSUPP;
1977         }
1978
1979         if (!NInoAttr(ni))
1980                 base_ni = ni;
1981         else
1982                 base_ni = ni->ext.base_ntfs_ino;
1983
1984         /* Map, pin, and lock the mft record. */
1985         m = map_mft_record(base_ni);
1986         if (IS_ERR(m)) {
1987                 err = PTR_ERR(m);
1988                 m = NULL;
1989                 ctx = NULL;
1990                 goto err_out;
1991         }
1992         ctx = ntfs_attr_get_search_ctx(base_ni, m);
1993         if (unlikely(!ctx)) {
1994                 err = -ENOMEM;
1995                 goto err_out;
1996         }
1997         err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1998                         CASE_SENSITIVE, 0, NULL, 0, ctx);
1999         if (unlikely(err))
2000                 goto err_out;
2001
2002         /* Starting position of the page within the attribute value. */
2003         attr_pos = page->index << PAGE_CACHE_SHIFT;
2004
2005         /* The total length of the attribute value. */
2006         attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
2007
2008         if (unlikely(vi->i_size != attr_len)) {
2009                 ntfs_error(vi->i_sb, "BUG()! i_size (0x%llx) doesn't match "
2010                                 "attr_len (0x%x). Aborting write.", vi->i_size,
2011                                 attr_len);
2012                 err = -EIO;
2013                 goto err_out;
2014         }
2015         if (unlikely(attr_pos >= attr_len)) {
2016                 ntfs_error(vi->i_sb, "BUG()! attr_pos (0x%llx) > attr_len "
2017                                 "(0x%x). Aborting write.",
2018                                 (unsigned long long)attr_pos, attr_len);
2019                 err = -EIO;
2020                 goto err_out;
2021         }
2022
2023         bytes = attr_len - attr_pos;
2024         if (unlikely(bytes > PAGE_CACHE_SIZE))
2025                 bytes = PAGE_CACHE_SIZE;
2026
2027         /*
2028          * Calculate the address of the attribute value corresponding to the
2029          * beginning of the current data @page.
2030          */
2031         kattr = (u8*)ctx->attr + le16_to_cpu(
2032                         ctx->attr->data.resident.value_offset) + attr_pos;
2033
2034         kaddr = kmap_atomic(page, KM_USER0);
2035
2036         /* Copy the received data from the page to the mft record. */
2037         memcpy(kattr + from, kaddr + from, to - from);
2038         flush_dcache_mft_record_page(ctx->ntfs_ino);
2039
2040         if (!PageUptodate(page)) {
2041                 /*
2042                  * Bring the out of bounds area(s) uptodate by copying data
2043                  * from the mft record to the page.
2044                  */
2045                 if (from > 0)
2046                         memcpy(kaddr, kattr, from);
2047                 if (to < bytes)
2048                         memcpy(kaddr + to, kattr + to, bytes - to);
2049
2050                 /* Zero the region outside the end of the attribute value. */
2051                 if (likely(bytes < PAGE_CACHE_SIZE))
2052                         memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
2053
2054                 /*
2055                  * The probability of not having done any of the above is
2056                  * extremely small, so we just flush unconditionally.
2057                  */
2058                 flush_dcache_page(page);
2059                 SetPageUptodate(page);
2060         }
2061         kunmap_atomic(kaddr, KM_USER0);
2062
2063         /* Mark the mft record dirty, so it gets written back. */
2064         mark_mft_record_dirty(ctx->ntfs_ino);
2065
2066         ntfs_attr_put_search_ctx(ctx);
2067         unmap_mft_record(base_ni);
2068         ntfs_debug("Done.");
2069         return 0;
2070 err_out:
2071         if (err == -ENOMEM) {
2072                 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2073                                 "commit the write.");
2074                 if (PageUptodate(page)) {
2075                         ntfs_warning(vi->i_sb, "Page is uptodate, setting "
2076                                         "dirty so the write will be retried "
2077                                         "later on by the VM.");
2078                         /*
2079                          * Put the page on mapping->dirty_pages, but leave its
2080                          * buffer's dirty state as-is.
2081                          */
2082                         __set_page_dirty_nobuffers(page);
2083                         err = 0;
2084                 } else
2085                         ntfs_error(vi->i_sb, "Page is not uptodate. Written "
2086                                         "data has been lost. )-:");
2087         } else {
2088                 ntfs_error(vi->i_sb, "Resident attribute write failed with "
2089                                 "error %i. Setting page error flag.", -err);
2090                 SetPageError(page);
2091         }
2092         if (ctx)
2093                 ntfs_attr_put_search_ctx(ctx);
2094         if (m)
2095                 unmap_mft_record(base_ni);
2096         return err;
2097 }
2098
2099 #endif  /* NTFS_RW */
2100
2101 /**
2102  * ntfs_aops - general address space operations for inodes and attributes
2103  */
2104 struct address_space_operations ntfs_aops = {
2105         .readpage       = ntfs_readpage,        /* Fill page with data. */
2106         .sync_page      = block_sync_page,      /* Currently, just unplugs the
2107                                                    disk request queue. */
2108 #ifdef NTFS_RW
2109         .writepage      = ntfs_writepage,       /* Write dirty page to disk. */
2110         .prepare_write  = ntfs_prepare_write,   /* Prepare page and buffers
2111                                                    ready to receive data. */
2112         .commit_write   = ntfs_commit_write,    /* Commit received data. */
2113 #endif /* NTFS_RW */
2114 };
2115
2116 /**
2117  * ntfs_mst_aops - general address space operations for mst protecteed inodes
2118  *                 and attributes
2119  */
2120 struct address_space_operations ntfs_mst_aops = {
2121         .readpage       = ntfs_readpage,        /* Fill page with data. */
2122         .sync_page      = block_sync_page,      /* Currently, just unplugs the
2123                                                    disk request queue. */
2124 #ifdef NTFS_RW
2125         .writepage      = ntfs_writepage,       /* Write dirty page to disk. */
2126         .set_page_dirty = __set_page_dirty_nobuffers,   /* Set the page dirty
2127                                                    without touching the buffers
2128                                                    belonging to the page. */
2129 #endif /* NTFS_RW */
2130 };
2131
2132 #ifdef NTFS_RW
2133
2134 /**
2135  * mark_ntfs_record_dirty - mark an ntfs record dirty
2136  * @page:       page containing the ntfs record to mark dirty
2137  * @ofs:        byte offset within @page at which the ntfs record begins
2138  *
2139  * If the ntfs record is the same size as the page cache page @page, set all
2140  * buffers in the page dirty.  Otherwise, set only the buffers in which the
2141  * ntfs record is located dirty.
2142  *
2143  * Also, set the page containing the ntfs record dirty, which also marks the
2144  * vfs inode the ntfs record belongs to dirty (I_DIRTY_PAGES).
2145  */
2146 void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
2147         ntfs_inode *ni;
2148         struct buffer_head *bh, *head;
2149         unsigned int end, bh_size, bh_ofs;
2150
2151         BUG_ON(!page);
2152         BUG_ON(!page_has_buffers(page));
2153         ni = NTFS_I(page->mapping->host);
2154         BUG_ON(!ni);
2155         if (ni->itype.index.block_size == PAGE_CACHE_SIZE) {
2156                 __set_page_dirty_buffers(page);
2157                 return;
2158         }
2159         end = ofs + ni->itype.index.block_size;
2160         bh_size = ni->vol->sb->s_blocksize;
2161         bh = head = page_buffers(page);
2162         do {
2163                 bh_ofs = bh_offset(bh);
2164                 if (bh_ofs + bh_size <= ofs)
2165                         continue;
2166                 if (unlikely(bh_ofs >= end))
2167                         break;
2168                 set_buffer_dirty(bh);
2169         } while ((bh = bh->b_this_page) != head);
2170         __set_page_dirty_nobuffers(page);
2171 }
2172
2173 #endif /* NTFS_RW */