2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #include <linux/swap.h>
28 * __format_mft_record - initialize an empty mft record
29 * @m: mapped, pinned and locked for writing mft record
30 * @size: size of the mft record
31 * @rec_no: mft record number / inode number
33 * Private function to initialize an empty mft record. Use one of the two
34 * provided format_mft_record() functions instead.
36 static void __format_mft_record(MFT_RECORD
*m
, const int size
,
37 const unsigned long rec_no
)
42 m
->magic
= magic_FILE
;
43 /* Aligned to 2-byte boundary. */
44 m
->usa_ofs
= cpu_to_le16((sizeof(MFT_RECORD
) + 1) & ~1);
45 m
->usa_count
= cpu_to_le16(size
/ NTFS_BLOCK_SIZE
+ 1);
46 /* Set the update sequence number to 1. */
47 *(le16
*)((char*)m
+ ((sizeof(MFT_RECORD
) + 1) & ~1)) = cpu_to_le16(1);
48 m
->lsn
= cpu_to_le64(0LL);
49 m
->sequence_number
= cpu_to_le16(1);
51 /* Aligned to 8-byte boundary. */
52 m
->attrs_offset
= cpu_to_le16((le16_to_cpu(m
->usa_ofs
) +
53 (le16_to_cpu(m
->usa_count
) << 1) + 7) & ~7);
56 * Using attrs_offset plus eight bytes (for the termination attribute),
57 * aligned to 8-byte boundary.
59 m
->bytes_in_use
= cpu_to_le32((le16_to_cpu(m
->attrs_offset
) + 8 + 7) &
61 m
->bytes_allocated
= cpu_to_le32(size
);
62 m
->base_mft_record
= cpu_to_le64((MFT_REF
)0);
63 m
->next_attr_instance
= 0;
64 a
= (ATTR_RECORD
*)((char*)m
+ le16_to_cpu(m
->attrs_offset
));
70 * format_mft_record - initialize an empty mft record
71 * @ni: ntfs inode of mft record
72 * @mft_rec: mapped, pinned and locked mft record (optional)
74 * Initialize an empty mft record. This is used when extending the MFT.
76 * If @mft_rec is NULL, we call map_mft_record() to obtain the
77 * record and we unmap it again when finished.
79 * We return 0 on success or -errno on error.
81 int format_mft_record(ntfs_inode
*ni
, MFT_RECORD
*mft_rec
)
88 m
= map_mft_record(ni
);
92 __format_mft_record(m
, ni
->vol
->mft_record_size
, ni
->mft_no
);
94 // FIXME: Need to set the mft record dirty!
101 * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
103 extern int ntfs_readpage(struct file
*, struct page
*);
107 * ntfs_mft_writepage - forward declaration, function is further below
109 static int ntfs_mft_writepage(struct page
*page
, struct writeback_control
*wbc
);
113 * ntfs_mft_aops - address space operations for access to $MFT
115 * Address space operations for access to $MFT. This allows us to simply use
116 * ntfs_map_page() in map_mft_record_page().
118 struct address_space_operations ntfs_mft_aops
= {
119 .readpage
= ntfs_readpage
, /* Fill page with data. */
120 .sync_page
= block_sync_page
, /* Currently, just unplugs the
121 disk request queue. */
123 .writepage
= ntfs_mft_writepage
, /* Write out the dirty mft
124 records in a page. */
125 .set_page_dirty
= __set_page_dirty_nobuffers
, /* Set the page dirty
126 without touching the buffers
127 belonging to the page. */
132 * map_mft_record_page - map the page in which a specific mft record resides
133 * @ni: ntfs inode whose mft record page to map
135 * This maps the page in which the mft record of the ntfs inode @ni is situated
136 * and returns a pointer to the mft record within the mapped page.
138 * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
139 * contains the negative error code returned.
141 static inline MFT_RECORD
*map_mft_record_page(ntfs_inode
*ni
)
143 ntfs_volume
*vol
= ni
->vol
;
144 struct inode
*mft_vi
= vol
->mft_ino
;
146 unsigned long index
, ofs
, end_index
;
150 * The index into the page cache and the offset within the page cache
151 * page of the wanted mft record. FIXME: We need to check for
152 * overflowing the unsigned long, but I don't think we would ever get
153 * here if the volume was that big...
155 index
= ni
->mft_no
<< vol
->mft_record_size_bits
>> PAGE_CACHE_SHIFT
;
156 ofs
= (ni
->mft_no
<< vol
->mft_record_size_bits
) & ~PAGE_CACHE_MASK
;
158 /* The maximum valid index into the page cache for $MFT's data. */
159 end_index
= mft_vi
->i_size
>> PAGE_CACHE_SHIFT
;
161 /* If the wanted index is out of bounds the mft record doesn't exist. */
162 if (unlikely(index
>= end_index
)) {
163 if (index
> end_index
|| (mft_vi
->i_size
& ~PAGE_CACHE_MASK
) <
164 ofs
+ vol
->mft_record_size
) {
165 page
= ERR_PTR(-ENOENT
);
169 /* Read, map, and pin the page. */
170 page
= ntfs_map_page(mft_vi
->i_mapping
, index
);
171 if (likely(!IS_ERR(page
))) {
174 return page_address(page
) + ofs
;
179 ntfs_error(vol
->sb
, "Failed with error code %lu.", -PTR_ERR(page
));
184 * map_mft_record - map, pin and lock an mft record
185 * @ni: ntfs inode whose MFT record to map
187 * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
188 * for the semaphore if it was already locked by someone else.
190 * The page of the record is mapped using map_mft_record_page() before being
191 * returned to the caller.
193 * This in turn uses ntfs_map_page() to get the page containing the wanted mft
194 * record (it in turn calls read_cache_page() which reads it in from disk if
195 * necessary, increments the use count on the page so that it cannot disappear
196 * under us and returns a reference to the page cache page).
198 * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
199 * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
200 * and the post-read mst fixups on each mft record in the page have been
201 * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
202 * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
203 * ntfs_map_page() waits for PG_locked to become clear and checks if
204 * PG_uptodate is set and returns an error code if not. This provides
205 * sufficient protection against races when reading/using the page.
207 * However there is the write mapping to think about. Doing the above described
208 * checking here will be fine, because when initiating the write we will set
209 * PG_locked and clear PG_uptodate making sure nobody is touching the page
210 * contents. Doing the locking this way means that the commit to disk code in
211 * the page cache code paths is automatically sufficiently locked with us as
212 * we will not touch a page that has been locked or is not uptodate. The only
213 * locking problem then is them locking the page while we are accessing it.
215 * So that code will end up having to own the mrec_lock of all mft
216 * records/inodes present in the page before I/O can proceed. In that case we
217 * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
218 * accessing anything without owning the mrec_lock semaphore. But we do need
219 * to use them because of the read_cache_page() invocation and the code becomes
220 * so much simpler this way that it is well worth it.
222 * The mft record is now ours and we return a pointer to it. You need to check
223 * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
226 * NOTE: Caller is responsible for setting the mft record dirty before calling
227 * unmap_mft_record(). This is obviously only necessary if the caller really
228 * modified the mft record...
229 * Q: Do we want to recycle one of the VFS inode state bits instead?
230 * A: No, the inode ones mean we want to change the mft record, not we want to
233 MFT_RECORD
*map_mft_record(ntfs_inode
*ni
)
237 ntfs_debug("Entering for mft_no 0x%lx.", ni
->mft_no
);
239 /* Make sure the ntfs inode doesn't go away. */
240 atomic_inc(&ni
->count
);
242 /* Serialize access to this mft record. */
243 down(&ni
->mrec_lock
);
245 m
= map_mft_record_page(ni
);
246 if (likely(!IS_ERR(m
)))
250 atomic_dec(&ni
->count
);
251 ntfs_error(ni
->vol
->sb
, "Failed with error code %lu.", -PTR_ERR(m
));
256 * unmap_mft_record_page - unmap the page in which a specific mft record resides
257 * @ni: ntfs inode whose mft record page to unmap
259 * This unmaps the page in which the mft record of the ntfs inode @ni is
260 * situated and returns. This is a NOOP if highmem is not configured.
262 * The unmap happens via ntfs_unmap_page() which in turn decrements the use
263 * count on the page thus releasing it from the pinned state.
265 * We do not actually unmap the page from memory of course, as that will be
266 * done by the page cache code itself when memory pressure increases or
269 static inline void unmap_mft_record_page(ntfs_inode
*ni
)
273 // TODO: If dirty, blah...
274 ntfs_unmap_page(ni
->page
);
281 * unmap_mft_record - release a mapped mft record
282 * @ni: ntfs inode whose MFT record to unmap
284 * We release the page mapping and the mrec_lock mutex which unmaps the mft
285 * record and releases it for others to get hold of. We also release the ntfs
286 * inode by decrementing the ntfs inode reference count.
288 * NOTE: If caller has modified the mft record, it is imperative to set the mft
289 * record dirty BEFORE calling unmap_mft_record().
291 void unmap_mft_record(ntfs_inode
*ni
)
293 struct page
*page
= ni
->page
;
297 ntfs_debug("Entering for mft_no 0x%lx.", ni
->mft_no
);
299 unmap_mft_record_page(ni
);
301 atomic_dec(&ni
->count
);
303 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
304 * ntfs_clear_extent_inode() in the extent inode case, and to the
305 * caller in the non-extent, yet pure ntfs inode case, to do the actual
306 * tear down of all structures and freeing of all allocated memory.
312 * map_extent_mft_record - load an extent inode and attach it to its base
313 * @base_ni: base ntfs inode
314 * @mref: mft reference of the extent inode to load
315 * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
317 * Load the extent mft record @mref and attach it to its base inode @base_ni.
318 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
319 * PTR_ERR(result) gives the negative error code.
321 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
322 * structure of the mapped extent inode.
324 MFT_RECORD
*map_extent_mft_record(ntfs_inode
*base_ni
, MFT_REF mref
,
325 ntfs_inode
**ntfs_ino
)
328 ntfs_inode
*ni
= NULL
;
329 ntfs_inode
**extent_nis
= NULL
;
331 unsigned long mft_no
= MREF(mref
);
332 u16 seq_no
= MSEQNO(mref
);
333 BOOL destroy_ni
= FALSE
;
335 ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
336 mft_no
, base_ni
->mft_no
);
337 /* Make sure the base ntfs inode doesn't go away. */
338 atomic_inc(&base_ni
->count
);
340 * Check if this extent inode has already been added to the base inode,
341 * in which case just return it. If not found, add it to the base
342 * inode before returning it.
344 down(&base_ni
->extent_lock
);
345 if (base_ni
->nr_extents
> 0) {
346 extent_nis
= base_ni
->ext
.extent_ntfs_inos
;
347 for (i
= 0; i
< base_ni
->nr_extents
; i
++) {
348 if (mft_no
!= extent_nis
[i
]->mft_no
)
351 /* Make sure the ntfs inode doesn't go away. */
352 atomic_inc(&ni
->count
);
356 if (likely(ni
!= NULL
)) {
357 up(&base_ni
->extent_lock
);
358 atomic_dec(&base_ni
->count
);
359 /* We found the record; just have to map and return it. */
360 m
= map_mft_record(ni
);
361 /* map_mft_record() has incremented this on success. */
362 atomic_dec(&ni
->count
);
363 if (likely(!IS_ERR(m
))) {
364 /* Verify the sequence number. */
365 if (likely(le16_to_cpu(m
->sequence_number
) == seq_no
)) {
366 ntfs_debug("Done 1.");
370 unmap_mft_record(ni
);
371 ntfs_error(base_ni
->vol
->sb
, "Found stale extent mft "
372 "reference! Corrupt file system. "
374 return ERR_PTR(-EIO
);
377 ntfs_error(base_ni
->vol
->sb
, "Failed to map extent "
378 "mft record, error code %ld.", -PTR_ERR(m
));
381 /* Record wasn't there. Get a new ntfs inode and initialize it. */
382 ni
= ntfs_new_extent_inode(base_ni
->vol
->sb
, mft_no
);
384 up(&base_ni
->extent_lock
);
385 atomic_dec(&base_ni
->count
);
386 return ERR_PTR(-ENOMEM
);
388 ni
->vol
= base_ni
->vol
;
391 ni
->ext
.base_ntfs_ino
= base_ni
;
392 /* Now map the record. */
393 m
= map_mft_record(ni
);
395 up(&base_ni
->extent_lock
);
396 atomic_dec(&base_ni
->count
);
397 ntfs_clear_extent_inode(ni
);
400 /* Verify the sequence number. */
401 if (unlikely(le16_to_cpu(m
->sequence_number
) != seq_no
)) {
402 ntfs_error(base_ni
->vol
->sb
, "Found stale extent mft "
403 "reference! Corrupt file system. Run chkdsk.");
408 /* Attach extent inode to base inode, reallocating memory if needed. */
409 if (!(base_ni
->nr_extents
& 3)) {
411 int new_size
= (base_ni
->nr_extents
+ 4) * sizeof(ntfs_inode
*);
413 tmp
= (ntfs_inode
**)kmalloc(new_size
, GFP_NOFS
);
414 if (unlikely(!tmp
)) {
415 ntfs_error(base_ni
->vol
->sb
, "Failed to allocate "
418 m
= ERR_PTR(-ENOMEM
);
421 if (base_ni
->nr_extents
) {
422 BUG_ON(!base_ni
->ext
.extent_ntfs_inos
);
423 memcpy(tmp
, base_ni
->ext
.extent_ntfs_inos
, new_size
-
424 4 * sizeof(ntfs_inode
*));
425 kfree(base_ni
->ext
.extent_ntfs_inos
);
427 base_ni
->ext
.extent_ntfs_inos
= tmp
;
429 base_ni
->ext
.extent_ntfs_inos
[base_ni
->nr_extents
++] = ni
;
430 up(&base_ni
->extent_lock
);
431 atomic_dec(&base_ni
->count
);
432 ntfs_debug("Done 2.");
436 unmap_mft_record(ni
);
437 up(&base_ni
->extent_lock
);
438 atomic_dec(&base_ni
->count
);
440 * If the extent inode was not attached to the base inode we need to
441 * release it or we will leak memory.
444 ntfs_clear_extent_inode(ni
);
451 * __mark_mft_record_dirty - set the mft record and the page containing it dirty
452 * @ni: ntfs inode describing the mapped mft record
454 * Internal function. Users should call mark_mft_record_dirty() instead.
456 * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
457 * as well as the page containing the mft record, dirty. Also, mark the base
458 * vfs inode dirty. This ensures that any changes to the mft record are
459 * written out to disk.
461 * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
462 * on the base vfs inode, because even though file data may have been modified,
463 * it is dirty in the inode meta data rather than the data page cache of the
464 * inode, and thus there are no data pages that need writing out. Therefore, a
465 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
466 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
467 * ensure ->write_inode is called from generic_osync_inode() and this needs to
468 * happen or the file data would not necessarily hit the device synchronously,
469 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
470 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
471 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
474 void __mark_mft_record_dirty(ntfs_inode
*ni
)
476 struct page
*page
= ni
->page
;
479 ntfs_debug("Entering for inode 0x%lx.", ni
->mft_no
);
481 BUG_ON(NInoAttr(ni
));
484 * Set the page containing the mft record dirty. This also marks the
485 * $MFT inode dirty (I_DIRTY_PAGES).
487 __set_page_dirty_nobuffers(page
);
489 /* Determine the base vfs inode and mark it dirty, too. */
490 down(&ni
->extent_lock
);
491 if (likely(ni
->nr_extents
>= 0))
494 base_ni
= ni
->ext
.base_ntfs_ino
;
495 up(&ni
->extent_lock
);
496 __mark_inode_dirty(VFS_I(base_ni
), I_DIRTY_SYNC
| I_DIRTY_DATASYNC
);
/* Boilerplate suffix for error/warning messages about unimplemented cases. */
static const char *ntfs_please_email = "Please email "
		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
		"this message. Thank you.";
504 * sync_mft_mirror_umount - synchronise an mft record to the mft mirror
505 * @ni: ntfs inode whose mft record to synchronize
506 * @m: mapped, mst protected (extent) mft record to synchronize
508 * Write the mapped, mst protected (extent) mft record @m described by the
509 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr) bypassing
510 * the page cache and the $MFTMirr inode itself.
512 * This function is only for use at umount time when the mft mirror inode has
513 * already been disposed off. We BUG() if we are called while the mft mirror
514 * inode is still attached to the volume.
516 * On success return 0. On error return -errno.
518 * NOTE: This function is not implemented yet as I am not convinced it can
519 * actually be triggered considering the sequence of commits we do in super.c::
520 * ntfs_put_super(). But just in case we provide this place holder as the
521 * alternative would be either to BUG() or to get a NULL pointer dereference
524 static int sync_mft_mirror_umount(ntfs_inode
*ni
, MFT_RECORD
*m
)
526 ntfs_volume
*vol
= ni
->vol
;
528 BUG_ON(vol
->mftmirr_ino
);
529 ntfs_error(vol
->sb
, "Umount time mft mirror syncing is not "
530 "implemented yet. %s", ntfs_please_email
);
535 * sync_mft_mirror - synchronize an mft record to the mft mirror
536 * @ni: ntfs inode whose mft record to synchronize
537 * @m: mapped, mst protected (extent) mft record to synchronize
538 * @sync: if true, wait for i/o completion
540 * Write the mapped, mst protected (extent) mft record @m described by the
541 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
543 * On success return 0. On error return -errno and set the volume errors flag
544 * in the ntfs_volume to which @ni belongs.
546 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
548 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
549 * schedule i/o via ->writepage or do it via kntfsd or whatever.
551 static int sync_mft_mirror(ntfs_inode
*ni
, MFT_RECORD
*m
, int sync
)
553 ntfs_volume
*vol
= ni
->vol
;
555 unsigned int blocksize
= vol
->sb
->s_blocksize
;
556 int max_bhs
= vol
->mft_record_size
/ blocksize
;
557 struct buffer_head
*bhs
[max_bhs
];
558 struct buffer_head
*bh
, *head
;
560 unsigned int block_start
, block_end
, m_start
, m_end
;
561 int i_bhs
, nr_bhs
, err
= 0;
563 ntfs_debug("Entering for inode 0x%lx.", ni
->mft_no
);
565 if (unlikely(!vol
->mftmirr_ino
)) {
566 /* This could happen during umount... */
567 err
= sync_mft_mirror_umount(ni
, m
);
572 /* Get the page containing the mirror copy of the mft record @m. */
573 page
= ntfs_map_page(vol
->mftmirr_ino
->i_mapping
, ni
->mft_no
>>
574 (PAGE_CACHE_SHIFT
- vol
->mft_record_size_bits
));
576 ntfs_error(vol
->sb
, "Failed to map mft mirror page.");
581 * Exclusion against other writers. This should never be a problem
582 * since the page in which the mft record @m resides is also locked and
583 * hence any other writers would be held up there but it is better to
584 * make sure no one is writing from elsewhere.
587 /* The address in the page of the mirror copy of the mft record @m. */
588 kmirr
= page_address(page
) + ((ni
->mft_no
<< vol
->mft_record_size_bits
)
590 /* Copy the mst protected mft record to the mirror. */
591 memcpy(kmirr
, m
, vol
->mft_record_size
);
592 /* Make sure we have mapped buffers. */
593 if (!page_has_buffers(page
)) {
595 ntfs_error(vol
->sb
, "Writing mft mirror records without "
596 "existing buffers is not implemented yet. %s",
601 bh
= head
= page_buffers(page
);
603 goto no_buffers_err_out
;
606 m_start
= kmirr
- (u8
*)page_address(page
);
607 m_end
= m_start
+ vol
->mft_record_size
;
609 block_end
= block_start
+ blocksize
;
611 * If the buffer is outside the mft record, just skip it,
612 * clearing it if it is dirty to make sure it is not written
613 * out. It should never be marked dirty but better be safe.
615 if ((block_end
<= m_start
) || (block_start
>= m_end
)) {
616 if (buffer_dirty(bh
)) {
617 ntfs_warning(vol
->sb
, "Clearing dirty mft "
618 "record page buffer. %s",
620 clear_buffer_dirty(bh
);
624 if (!buffer_mapped(bh
)) {
625 ntfs_error(vol
->sb
, "Writing mft mirror records "
626 "without existing mapped buffers is "
627 "not implemented yet. %s",
632 if (!buffer_uptodate(bh
)) {
633 ntfs_error(vol
->sb
, "Writing mft mirror records "
634 "without existing uptodate buffers is "
635 "not implemented yet. %s",
640 BUG_ON(!nr_bhs
&& (m_start
!= block_start
));
641 BUG_ON(nr_bhs
>= max_bhs
);
643 BUG_ON((nr_bhs
>= max_bhs
) && (m_end
!= block_end
));
644 } while (block_start
= block_end
, (bh
= bh
->b_this_page
) != head
);
646 /* Lock buffers and start synchronous write i/o on them. */
647 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++) {
648 struct buffer_head
*tbh
= bhs
[i_bhs
];
650 if (unlikely(test_set_buffer_locked(tbh
)))
652 BUG_ON(!buffer_uptodate(tbh
));
653 if (buffer_dirty(tbh
))
654 clear_buffer_dirty(tbh
);
656 tbh
->b_end_io
= end_buffer_write_sync
;
657 submit_bh(WRITE
, tbh
);
659 /* Wait on i/o completion of buffers. */
660 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++) {
661 struct buffer_head
*tbh
= bhs
[i_bhs
];
664 if (unlikely(!buffer_uptodate(tbh
))) {
667 * Set the buffer uptodate so the page & buffer
668 * states don't become out of sync.
670 if (PageUptodate(page
))
671 set_buffer_uptodate(tbh
);
674 } else /* if (unlikely(err)) */ {
675 /* Clean the buffers. */
676 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++)
677 clear_buffer_dirty(bhs
[i_bhs
]);
680 /* Current state: all buffers are clean, unlocked, and uptodate. */
681 /* Remove the mst protection fixups again. */
682 post_write_mst_fixup((NTFS_RECORD
*)kmirr
);
683 flush_dcache_page(page
);
685 ntfs_unmap_page(page
);
687 /* I/O error during writing. This is really bad! */
688 ntfs_error(vol
->sb
, "I/O error while writing mft mirror "
689 "record 0x%lx! You should unmount the volume "
690 "and run chkdsk or ntfsfix.", ni
->mft_no
);
696 ntfs_error(vol
->sb
, "Failed to synchronize $MFTMirr (error code %i). "
697 "Volume will be left marked dirty on umount. Run "
698 "ntfsfix on the partition after umounting to correct "
700 /* We don't want to clear the dirty bit on umount. */
706 * write_mft_record_nolock - write out a mapped (extent) mft record
707 * @ni: ntfs inode describing the mapped (extent) mft record
708 * @m: mapped (extent) mft record to write
709 * @sync: if true, wait for i/o completion
711 * Write the mapped (extent) mft record @m described by the (regular or extent)
712 * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
713 * the mft mirror, that is also updated.
715 * On success, clean the mft record and return 0. On error, leave the mft
716 * record dirty and return -errno. The caller should call make_bad_inode() on
717 * the base inode to ensure no more access happens to this inode. We do not do
718 * it here as the caller may want to finish writing other extent mft records
719 * first to minimize on-disk metadata inconsistencies.
721 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
722 * However, if the mft record has a counterpart in the mft mirror and @sync is
723 * true, we write the mft record, wait for i/o completion, and only then write
724 * the mft mirror copy. This ensures that if the system crashes either the mft
725 * or the mft mirror will contain a self-consistent mft record @m. If @sync is
726 * false on the other hand, we start i/o on both and then wait for completion
727 * on them. This provides a speedup but no longer guarantees that you will end
728 * up with a self-consistent mft record in the case of a crash but if you asked
729 * for asynchronous writing you probably do not care about that anyway.
731 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
732 * schedule i/o via ->writepage or do it via kntfsd or whatever.
734 int write_mft_record_nolock(ntfs_inode
*ni
, MFT_RECORD
*m
, int sync
)
736 ntfs_volume
*vol
= ni
->vol
;
737 struct page
*page
= ni
->page
;
738 unsigned int blocksize
= vol
->sb
->s_blocksize
;
739 int max_bhs
= vol
->mft_record_size
/ blocksize
;
740 struct buffer_head
*bhs
[max_bhs
];
741 struct buffer_head
*bh
, *head
;
742 unsigned int block_start
, block_end
, m_start
, m_end
;
743 int i_bhs
, nr_bhs
, err
= 0;
745 ntfs_debug("Entering for inode 0x%lx.", ni
->mft_no
);
746 BUG_ON(NInoAttr(ni
));
748 BUG_ON(!PageLocked(page
));
750 * If the ntfs_inode is clean no need to do anything. If it is dirty,
751 * mark it as clean now so that it can be redirtied later on if needed.
752 * There is no danger of races since the caller is holding the locks
753 * for the mft record @m and the page it is in.
755 if (!NInoTestClearDirty(ni
))
757 /* Make sure we have mapped buffers. */
758 if (!page_has_buffers(page
)) {
760 ntfs_error(vol
->sb
, "Writing mft records without existing "
761 "buffers is not implemented yet. %s",
766 bh
= head
= page_buffers(page
);
768 goto no_buffers_err_out
;
771 m_start
= ni
->page_ofs
;
772 m_end
= m_start
+ vol
->mft_record_size
;
774 block_end
= block_start
+ blocksize
;
776 * If the buffer is outside the mft record, just skip it,
777 * clearing it if it is dirty to make sure it is not written
778 * out. It should never be marked dirty but better be safe.
780 if ((block_end
<= m_start
) || (block_start
>= m_end
)) {
781 if (buffer_dirty(bh
)) {
782 ntfs_warning(vol
->sb
, "Clearing dirty mft "
783 "record page buffer. %s",
785 clear_buffer_dirty(bh
);
789 if (!buffer_mapped(bh
)) {
790 ntfs_error(vol
->sb
, "Writing mft records without "
791 "existing mapped buffers is not "
792 "implemented yet. %s",
797 if (!buffer_uptodate(bh
)) {
798 ntfs_error(vol
->sb
, "Writing mft records without "
799 "existing uptodate buffers is not "
800 "implemented yet. %s",
805 BUG_ON(!nr_bhs
&& (m_start
!= block_start
));
806 BUG_ON(nr_bhs
>= max_bhs
);
808 BUG_ON((nr_bhs
>= max_bhs
) && (m_end
!= block_end
));
809 } while (block_start
= block_end
, (bh
= bh
->b_this_page
) != head
);
812 /* Apply the mst protection fixups. */
813 err
= pre_write_mst_fixup((NTFS_RECORD
*)m
, vol
->mft_record_size
);
815 ntfs_error(vol
->sb
, "Failed to apply mst fixups!");
818 flush_dcache_mft_record_page(ni
);
819 /* Lock buffers and start synchronous write i/o on them. */
820 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++) {
821 struct buffer_head
*tbh
= bhs
[i_bhs
];
823 if (unlikely(test_set_buffer_locked(tbh
)))
825 BUG_ON(!buffer_uptodate(tbh
));
826 if (buffer_dirty(tbh
))
827 clear_buffer_dirty(tbh
);
829 tbh
->b_end_io
= end_buffer_write_sync
;
830 submit_bh(WRITE
, tbh
);
832 /* Synchronize the mft mirror now if not @sync. */
833 if (!sync
&& ni
->mft_no
< vol
->mftmirr_size
)
834 sync_mft_mirror(ni
, m
, sync
);
835 /* Wait on i/o completion of buffers. */
836 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++) {
837 struct buffer_head
*tbh
= bhs
[i_bhs
];
840 if (unlikely(!buffer_uptodate(tbh
))) {
843 * Set the buffer uptodate so the page & buffer states
844 * don't become out of sync.
846 if (PageUptodate(page
))
847 set_buffer_uptodate(tbh
);
850 /* If @sync, now synchronize the mft mirror. */
851 if (sync
&& ni
->mft_no
< vol
->mftmirr_size
)
852 sync_mft_mirror(ni
, m
, sync
);
853 /* Remove the mst protection fixups again. */
854 post_write_mst_fixup((NTFS_RECORD
*)m
);
855 flush_dcache_mft_record_page(ni
);
857 /* I/O error during writing. This is really bad! */
858 ntfs_error(vol
->sb
, "I/O error while writing mft record "
859 "0x%lx! Marking base inode as bad. You "
860 "should unmount the volume and run chkdsk.",
868 /* Clean the buffers. */
869 for (i_bhs
= 0; i_bhs
< nr_bhs
; i_bhs
++)
870 clear_buffer_dirty(bhs
[i_bhs
]);
873 * Current state: all buffers are clean, unlocked, and uptodate.
874 * The caller should mark the base inode as bad so that no more i/o
875 * happens. ->clear_inode() will still be invoked so all extent inodes
876 * and other allocated memory will be freed.
878 if (err
== -ENOMEM
) {
879 ntfs_error(vol
->sb
, "Not enough memory to write mft record. "
880 "Redirtying so the write is retried later.");
881 mark_mft_record_dirty(ni
);
888 * ntfs_mft_writepage - check if a metadata page contains dirty mft records
889 * @page: metadata page possibly containing dirty mft records
890 * @wbc: writeback control structure
892 * This is called from the VM when it wants to have a dirty $MFT/$DATA metadata
893 * page cache page cleaned. The VM has already locked the page and marked it
894 * clean. Instead of writing the page as a conventional ->writepage function
895 * would do, we check if the page still contains any dirty mft records (it must
896 * have done at some point in the past since the page was marked dirty) and if
897 * none are found, i.e. all mft records are clean, we unlock the page and
898 * return. The VM is then free to do with the page as it pleases. If on the
899 * other hand we do find any dirty mft records in the page, we redirty the page
900 * before unlocking it and returning so the VM knows that the page is still
901 * busy and cannot be thrown out.
903 * Note, we do not actually write any dirty mft records here because they are
904 * dirty inodes and hence will be written by the VFS inode dirty code paths.
905 * There is no need to write them from the VM page dirty code paths, too and in
906 * fact once we implement journalling it would be a complete nightmare having
907 * two code paths leading to mft record writeout.
909 static int ntfs_mft_writepage(struct page
*page
, struct writeback_control
*wbc
)
911 struct inode
*mft_vi
= page
->mapping
->host
;
912 struct super_block
*sb
= mft_vi
->i_sb
;
913 ntfs_volume
*vol
= NTFS_SB(sb
);
916 ntfs_inode
**extent_nis
;
917 unsigned long mft_no
;
919 BOOL is_dirty
= FALSE
;
921 BUG_ON(!PageLocked(page
));
922 BUG_ON(PageWriteback(page
));
923 BUG_ON(mft_vi
!= vol
->mft_ino
);
924 /* The first mft record number in the page. */
925 mft_no
= page
->index
<< (PAGE_CACHE_SHIFT
- vol
->mft_record_size_bits
);
926 /* Number of mft records in the page. */
927 nr
= PAGE_CACHE_SIZE
>> vol
->mft_record_size_bits
;
929 ntfs_debug("Entering for %i inodes starting at 0x%lx.", nr
, mft_no
);
930 /* Iterate over the mft records in the page looking for a dirty one. */
931 maddr
= (u8
*)kmap(page
);
932 for (i
= 0; i
< nr
; ++i
, ++mft_no
, maddr
+= vol
->mft_record_size
) {
934 ntfs_inode
*ni
, *eni
;
942 * Check if the inode corresponding to this mft record is in
943 * the VFS inode cache and obtain a reference to it if it is.
945 ntfs_debug("Looking for inode 0x%lx in icache.", mft_no
);
947 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from
948 * here or we deadlock because the inode is already locked by
949 * the kernel (fs/fs-writeback.c::__sync_single_inode()) and
950 * ilookup5() waits until the inode is unlocked before
951 * returning it and it never gets unlocked because
952 * ntfs_mft_writepage() never returns. )-: Fortunately, we
953 * have inode 0 pinned in icache for the duration of the mount
954 * so we can access it directly.
957 /* Balance the below iput(). */
959 BUG_ON(vi
!= mft_vi
);
961 vi
= ilookup5(sb
, mft_no
, (test_t
)ntfs_test_inode
, &na
);
963 ntfs_debug("Inode 0x%lx is in icache.", mft_no
);
964 /* The inode is in icache. Check if it is dirty. */
966 if (!NInoDirty(ni
)) {
967 /* The inode is not dirty, skip this record. */
968 ntfs_debug("Inode 0x%lx is not dirty, "
969 "continuing search.", mft_no
);
973 ntfs_debug("Inode 0x%lx is dirty, aborting search.",
975 /* The inode is dirty, no need to search further. */
980 ntfs_debug("Inode 0x%lx is not in icache.", mft_no
);
981 /* The inode is not in icache. */
982 /* Skip the record if it is not a mft record (type "FILE"). */
983 if (!ntfs_is_mft_recordp((le32
*)maddr
)) {
984 ntfs_debug("Mft record 0x%lx is not a FILE record, "
985 "continuing search.", mft_no
);
988 m
= (MFT_RECORD
*)maddr
;
990 * Skip the mft record if it is not in use. FIXME: What about
991 * deleted/deallocated (extent) inodes? (AIA)
993 if (!(m
->flags
& MFT_RECORD_IN_USE
)) {
994 ntfs_debug("Mft record 0x%lx is not in use, "
995 "continuing search.", mft_no
);
998 /* Skip the mft record if it is a base inode. */
999 if (!m
->base_mft_record
) {
1000 ntfs_debug("Mft record 0x%lx is a base record, "
1001 "continuing search.", mft_no
);
1005 * This is an extent mft record. Check if the inode
1006 * corresponding to its base mft record is in icache.
1008 na
.mft_no
= MREF_LE(m
->base_mft_record
);
1009 ntfs_debug("Mft record 0x%lx is an extent record. Looking "
1010 "for base inode 0x%lx in icache.", mft_no
,
1012 vi
= ilookup5(sb
, na
.mft_no
, (test_t
)ntfs_test_inode
,
1016 * The base inode is not in icache. Skip this extent
1019 ntfs_debug("Base inode 0x%lx is not in icache, "
1020 "continuing search.", na
.mft_no
);
1023 ntfs_debug("Base inode 0x%lx is in icache.", na
.mft_no
);
1025 * The base inode is in icache. Check if it has the extent
1026 * inode corresponding to this extent mft record attached.
1029 down(&ni
->extent_lock
);
1030 if (ni
->nr_extents
<= 0) {
1032 * The base inode has no attached extent inodes. Skip
1033 * this extent mft record.
1035 up(&ni
->extent_lock
);
1039 /* Iterate over the attached extent inodes. */
1040 extent_nis
= ni
->ext
.extent_ntfs_inos
;
1041 for (eni
= NULL
, j
= 0; j
< ni
->nr_extents
; ++j
) {
1042 if (mft_no
== extent_nis
[j
]->mft_no
) {
1044 * Found the extent inode corresponding to this
1045 * extent mft record.
1047 eni
= extent_nis
[j
];
1052 * If the extent inode was not attached to the base inode, skip
1053 * this extent mft record.
1056 up(&ni
->extent_lock
);
1061 * Found the extent inode corrsponding to this extent mft
1062 * record. If it is dirty, no need to search further.
1064 if (NInoDirty(eni
)) {
1065 up(&ni
->extent_lock
);
1070 /* The extent inode is not dirty, so do the next record. */
1071 up(&ni
->extent_lock
);
1075 /* If a dirty mft record was found, redirty the page. */
1077 ntfs_debug("Inode 0x%lx is dirty. Redirtying the page "
1078 "starting at inode 0x%lx.", mft_no
,
1079 page
->index
<< (PAGE_CACHE_SHIFT
-
1080 vol
->mft_record_size_bits
));
1081 redirty_page_for_writepage(wbc
, page
);
1085 * Keep the VM happy. This must be done otherwise the
1086 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1087 * the page is clean.
1089 BUG_ON(PageWriteback(page
));
1090 set_page_writeback(page
);
1092 end_page_writeback(page
);
1094 ntfs_debug("Done.");
1098 #endif /* NTFS_RW */