/*
 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2003 Anton Altaparmakov
 * Copyright (c) 2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
23 #include <linux/swap.h>
28 * __format_mft_record - initialize an empty mft record
29 * @m: mapped, pinned and locked for writing mft record
30 * @size: size of the mft record
31 * @rec_no: mft record number / inode number
33 * Private function to initialize an empty mft record. Use one of the two
34 * provided format_mft_record() functions instead.
36 static void __format_mft_record(MFT_RECORD
*m
, const int size
,
37 const unsigned long rec_no
)
42 m
->magic
= magic_FILE
;
43 /* Aligned to 2-byte boundary. */
44 m
->usa_ofs
= cpu_to_le16((sizeof(MFT_RECORD
) + 1) & ~1);
45 m
->usa_count
= cpu_to_le16(size
/ NTFS_BLOCK_SIZE
+ 1);
46 /* Set the update sequence number to 1. */
47 *(u16
*)((char*)m
+ ((sizeof(MFT_RECORD
) + 1) & ~1)) = cpu_to_le16(1);
48 m
->lsn
= cpu_to_le64(0LL);
49 m
->sequence_number
= cpu_to_le16(1);
50 m
->link_count
= cpu_to_le16(0);
51 /* Aligned to 8-byte boundary. */
52 m
->attrs_offset
= cpu_to_le16((le16_to_cpu(m
->usa_ofs
) +
53 (le16_to_cpu(m
->usa_count
) << 1) + 7) & ~7);
54 m
->flags
= cpu_to_le16(0);
56 * Using attrs_offset plus eight bytes (for the termination attribute),
57 * aligned to 8-byte boundary.
59 m
->bytes_in_use
= cpu_to_le32((le16_to_cpu(m
->attrs_offset
) + 8 + 7) &
61 m
->bytes_allocated
= cpu_to_le32(size
);
62 m
->base_mft_record
= cpu_to_le64((MFT_REF
)0);
63 m
->next_attr_instance
= cpu_to_le16(0);
64 a
= (ATTR_RECORD
*)((char*)m
+ le16_to_cpu(m
->attrs_offset
));
66 a
->length
= cpu_to_le32(0);
70 * format_mft_record - initialize an empty mft record
71 * @ni: ntfs inode of mft record
72 * @mft_rec: mapped, pinned and locked mft record (optional)
74 * Initialize an empty mft record. This is used when extending the MFT.
76 * If @mft_rec is NULL, we call map_mft_record() to obtain the
77 * record and we unmap it again when finished.
79 * We return 0 on success or -errno on error.
81 int format_mft_record(ntfs_inode
*ni
, MFT_RECORD
*mft_rec
)
88 m
= map_mft_record(ni
);
92 __format_mft_record(m
, ni
->vol
->mft_record_size
, ni
->mft_no
);
94 // FIXME: Need to set the mft record dirty!
/**
 * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
 */
extern int ntfs_readpage(struct file *file, struct page *page);
106 * ntfs_mft_aops - address space operations for access to $MFT
108 * Address space operations for access to $MFT. This allows us to simply use
109 * ntfs_map_page() in map_mft_record_page().
111 struct address_space_operations ntfs_mft_aops
= {
112 .readpage
= ntfs_readpage
, /* Fill page with data. */
113 .sync_page
= block_sync_page
, /* Currently, just unplugs the
114 disk request queue. */
118 * map_mft_record_page - map the page in which a specific mft record resides
119 * @ni: ntfs inode whose mft record page to map
121 * This maps the page in which the mft record of the ntfs inode @ni is situated
122 * and returns a pointer to the mft record within the mapped page.
124 * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
125 * contains the negative error code returned.
127 static inline MFT_RECORD
*map_mft_record_page(ntfs_inode
*ni
)
129 ntfs_volume
*vol
= ni
->vol
;
130 struct inode
*mft_vi
= vol
->mft_ino
;
132 unsigned long index
, ofs
, end_index
;
136 * The index into the page cache and the offset within the page cache
137 * page of the wanted mft record. FIXME: We need to check for
138 * overflowing the unsigned long, but I don't think we would ever get
139 * here if the volume was that big...
141 index
= ni
->mft_no
<< vol
->mft_record_size_bits
>> PAGE_CACHE_SHIFT
;
142 ofs
= (ni
->mft_no
<< vol
->mft_record_size_bits
) & ~PAGE_CACHE_MASK
;
144 /* The maximum valid index into the page cache for $MFT's data. */
145 end_index
= mft_vi
->i_size
>> PAGE_CACHE_SHIFT
;
147 /* If the wanted index is out of bounds the mft record doesn't exist. */
148 if (unlikely(index
>= end_index
)) {
149 if (index
> end_index
|| (mft_vi
->i_size
& ~PAGE_CACHE_MASK
) <
150 ofs
+ vol
->mft_record_size
) {
151 page
= ERR_PTR(-ENOENT
);
155 /* Read, map, and pin the page. */
156 page
= ntfs_map_page(mft_vi
->i_mapping
, index
);
157 if (likely(!IS_ERR(page
))) {
160 return page_address(page
) + ofs
;
165 ntfs_error(vol
->sb
, "Failed with error code %lu.", -PTR_ERR(page
));
170 * map_mft_record - map, pin and lock an mft record
171 * @ni: ntfs inode whose MFT record to map
173 * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
174 * for the semaphore if it was already locked by someone else.
176 * The page of the record is mapped using map_mft_record_page() before being
177 * returned to the caller.
179 * This in turn uses ntfs_map_page() to get the page containing the wanted mft
180 * record (it in turn calls read_cache_page() which reads it in from disk if
181 * necessary, increments the use count on the page so that it cannot disappear
182 * under us and returns a reference to the page cache page).
184 * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
185 * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
186 * and the post-read mst fixups on each mft record in the page have been
187 * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
188 * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
189 * ntfs_map_page() waits for PG_locked to become clear and checks if
190 * PG_uptodate is set and returns an error code if not. This provides
191 * sufficient protection against races when reading/using the page.
193 * However there is the write mapping to think about. Doing the above described
194 * checking here will be fine, because when initiating the write we will set
195 * PG_locked and clear PG_uptodate making sure nobody is touching the page
196 * contents. Doing the locking this way means that the commit to disk code in
197 * the page cache code paths is automatically sufficiently locked with us as
198 * we will not touch a page that has been locked or is not uptodate. The only
199 * locking problem then is them locking the page while we are accessing it.
201 * So that code will end up having to own the mrec_lock of all mft
202 * records/inodes present in the page before I/O can proceed. In that case we
203 * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
204 * accessing anything without owning the mrec_lock semaphore. But we do need
205 * to use them because of the read_cache_page() invocation and the code becomes
206 * so much simpler this way that it is well worth it.
208 * The mft record is now ours and we return a pointer to it. You need to check
209 * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
212 * NOTE: Caller is responsible for setting the mft record dirty before calling
213 * unmap_mft_record(). This is obviously only necessary if the caller really
214 * modified the mft record...
215 * Q: Do we want to recycle one of the VFS inode state bits instead?
216 * A: No, the inode ones mean we want to change the mft record, not we want to
219 MFT_RECORD
*map_mft_record(ntfs_inode
*ni
)
223 ntfs_debug("Entering for mft_no 0x%lx.", ni
->mft_no
);
225 /* Make sure the ntfs inode doesn't go away. */
226 atomic_inc(&ni
->count
);
228 /* Serialize access to this mft record. */
229 down(&ni
->mrec_lock
);
231 m
= map_mft_record_page(ni
);
232 if (likely(!IS_ERR(m
)))
236 atomic_dec(&ni
->count
);
237 ntfs_error(ni
->vol
->sb
, "Failed with error code %lu.", -PTR_ERR(m
));
242 * unmap_mft_record_page - unmap the page in which a specific mft record resides
243 * @ni: ntfs inode whose mft record page to unmap
245 * This unmaps the page in which the mft record of the ntfs inode @ni is
246 * situated and returns. This is a NOOP if highmem is not configured.
248 * The unmap happens via ntfs_unmap_page() which in turn decrements the use
249 * count on the page thus releasing it from the pinned state.
251 * We do not actually unmap the page from memory of course, as that will be
252 * done by the page cache code itself when memory pressure increases or
255 static inline void unmap_mft_record_page(ntfs_inode
*ni
)
259 // TODO: If dirty, blah...
260 ntfs_unmap_page(ni
->page
);
267 * unmap_mft_record - release a mapped mft record
268 * @ni: ntfs inode whose MFT record to unmap
270 * We release the page mapping and the mrec_lock mutex which unmaps the mft
271 * record and releases it for others to get hold of. We also release the ntfs
272 * inode by decrementing the ntfs inode reference count.
274 * NOTE: If caller has modified the mft record, it is imperative to set the mft
275 * record dirty BEFORE calling unmap_mft_record().
277 void unmap_mft_record(ntfs_inode
*ni
)
279 struct page
*page
= ni
->page
;
283 ntfs_debug("Entering for mft_no 0x%lx.", ni
->mft_no
);
285 unmap_mft_record_page(ni
);
287 atomic_dec(&ni
->count
);
289 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
290 * ntfs_clear_extent_inode() in the extent inode case, and to the
291 * caller in the non-extent, yet pure ntfs inode case, to do the actual
292 * tear down of all structures and freeing of all allocated memory.
298 * map_extent_mft_record - load an extent inode and attach it to its base
299 * @base_ni: base ntfs inode
300 * @mref: mft reference of the extent inode to load (in little endian)
301 * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
303 * Load the extent mft record @mref and attach it to its base inode @base_ni.
304 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
305 * PTR_ERR(result) gives the negative error code.
307 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
308 * structure of the mapped extent inode.
310 MFT_RECORD
*map_extent_mft_record(ntfs_inode
*base_ni
, MFT_REF mref
,
311 ntfs_inode
**ntfs_ino
)
314 ntfs_inode
*ni
= NULL
;
315 ntfs_inode
**extent_nis
= NULL
;
317 unsigned long mft_no
= MREF_LE(mref
);
318 u16 seq_no
= MSEQNO_LE(mref
);
319 BOOL destroy_ni
= FALSE
;
321 ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
322 mft_no
, base_ni
->mft_no
);
323 /* Make sure the base ntfs inode doesn't go away. */
324 atomic_inc(&base_ni
->count
);
326 * Check if this extent inode has already been added to the base inode,
327 * in which case just return it. If not found, add it to the base
328 * inode before returning it.
330 down(&base_ni
->extent_lock
);
331 if (base_ni
->nr_extents
> 0) {
332 extent_nis
= base_ni
->ext
.extent_ntfs_inos
;
333 for (i
= 0; i
< base_ni
->nr_extents
; i
++) {
334 if (mft_no
!= extent_nis
[i
]->mft_no
)
337 /* Make sure the ntfs inode doesn't go away. */
338 atomic_inc(&ni
->count
);
342 if (likely(ni
!= NULL
)) {
343 up(&base_ni
->extent_lock
);
344 atomic_dec(&base_ni
->count
);
345 /* We found the record; just have to map and return it. */
346 m
= map_mft_record(ni
);
347 /* map_mft_record() has incremented this on success. */
348 atomic_dec(&ni
->count
);
349 if (likely(!IS_ERR(m
))) {
350 /* Verify the sequence number. */
351 if (likely(le16_to_cpu(m
->sequence_number
) == seq_no
)) {
352 ntfs_debug("Done 1.");
356 unmap_mft_record(ni
);
357 ntfs_error(base_ni
->vol
->sb
, "Found stale extent mft "
358 "reference! Corrupt file system. "
360 return ERR_PTR(-EIO
);
363 ntfs_error(base_ni
->vol
->sb
, "Failed to map extent "
364 "mft record, error code %ld.", -PTR_ERR(m
));
367 /* Record wasn't there. Get a new ntfs inode and initialize it. */
368 ni
= ntfs_new_extent_inode(base_ni
->vol
->sb
, mft_no
);
370 up(&base_ni
->extent_lock
);
371 atomic_dec(&base_ni
->count
);
372 return ERR_PTR(-ENOMEM
);
374 ni
->vol
= base_ni
->vol
;
377 ni
->ext
.base_ntfs_ino
= base_ni
;
378 /* Now map the record. */
379 m
= map_mft_record(ni
);
380 if (unlikely(IS_ERR(m
))) {
381 up(&base_ni
->extent_lock
);
382 atomic_dec(&base_ni
->count
);
383 ntfs_clear_extent_inode(ni
);
386 /* Verify the sequence number. */
387 if (unlikely(le16_to_cpu(m
->sequence_number
) != seq_no
)) {
388 ntfs_error(base_ni
->vol
->sb
, "Found stale extent mft "
389 "reference! Corrupt file system. Run chkdsk.");
394 /* Attach extent inode to base inode, reallocating memory if needed. */
395 if (!(base_ni
->nr_extents
& 3)) {
397 int new_size
= (base_ni
->nr_extents
+ 4) * sizeof(ntfs_inode
*);
399 tmp
= (ntfs_inode
**)kmalloc(new_size
, GFP_NOFS
);
400 if (unlikely(!tmp
)) {
401 ntfs_error(base_ni
->vol
->sb
, "Failed to allocate "
404 m
= ERR_PTR(-ENOMEM
);
407 if (base_ni
->ext
.extent_ntfs_inos
) {
408 memcpy(tmp
, base_ni
->ext
.extent_ntfs_inos
, new_size
-
409 4 * sizeof(ntfs_inode
*));
410 kfree(base_ni
->ext
.extent_ntfs_inos
);
412 base_ni
->ext
.extent_ntfs_inos
= tmp
;
414 base_ni
->ext
.extent_ntfs_inos
[base_ni
->nr_extents
++] = ni
;
415 up(&base_ni
->extent_lock
);
416 atomic_dec(&base_ni
->count
);
417 ntfs_debug("Done 2.");
421 unmap_mft_record(ni
);
422 up(&base_ni
->extent_lock
);
423 atomic_dec(&base_ni
->count
);
425 * If the extent inode was not attached to the base inode we need to
426 * release it or we will leak memory.
429 ntfs_clear_extent_inode(ni
);