fs/ntfs/mft.c

   1 /**
   2  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
   3  *
   4  * Copyright (c) 2001-2003 Anton Altaparmakov
   5  * Copyright (c) 2002 Richard Russon
   6  *
   7  * This program/include file is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as published
   9  * by the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program/include file is distributed in the hope that it will be
  13  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program (in the main directory of the Linux-NTFS
  19  * distribution in the file COPYING); if not, write to the Free Software
  20  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #include <linux/swap.h>
  24
  25 #include "ntfs.h"
  26
  27 /**
  28  * __format_mft_record - initialize an empty mft record
  29  * @m:          mapped, pinned and locked for writing mft record
  30  * @size:       size of the mft record
  31  * @rec_no:     mft record number / inode number
  32  *
  33  * Private function to initialize an empty mft record. Use one of the two
  34  * provided format_mft_record() functions instead.
  35  */
  36 static void __format_mft_record(MFT_RECORD *m, const int size,
  37                 const unsigned long rec_no)
  38 {
  39         ATTR_RECORD *a;
  40
  41         memset(m, 0, size);
  42         m->magic = magic_FILE;
  43         /* Aligned to 2-byte boundary. */
  44         m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
  45         m->usa_count = cpu_to_le16(size / NTFS_BLOCK_SIZE + 1);
  46         /* Set the update sequence number to 1. */
  47         *(u16*)((char*)m + ((sizeof(MFT_RECORD) + 1) & ~1)) = cpu_to_le16(1);
  48         m->lsn = cpu_to_le64(0LL);
  49         m->sequence_number = cpu_to_le16(1);
  50         m->link_count = cpu_to_le16(0);
  51         /* Aligned to 8-byte boundary. */
  52         m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
  53                         (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
  54         m->flags = cpu_to_le16(0);
  55         /*
  56          * Using attrs_offset plus eight bytes (for the termination attribute),
  57          * aligned to 8-byte boundary.
  58          */
  59         m->bytes_in_use = cpu_to_le32((le16_to_cpu(m->attrs_offset) + 8 + 7) &
  60                         ~7);
  61         m->bytes_allocated = cpu_to_le32(size);
  62         m->base_mft_record = cpu_to_le64((MFT_REF)0);
  63         m->next_attr_instance = cpu_to_le16(0);
  64         a = (ATTR_RECORD*)((char*)m + le16_to_cpu(m->attrs_offset));
  65         a->type = AT_END;
  66         a->length = cpu_to_le32(0);
  67 }
  68
  69 /**
  70  * format_mft_record - initialize an empty mft record
  71  * @ni:         ntfs inode of mft record
  72  * @mft_rec:    mapped, pinned and locked mft record (optional)
  73  *
  74  * Initialize an empty mft record. This is used when extending the MFT.
  75  *
  76  * If @mft_rec is NULL, we call map_mft_record() to obtain the
  77  * record and we unmap it again when finished.
  78  *
  79  * We return 0 on success or -errno on error.
  80  */
  81 int format_mft_record(ntfs_inode *ni, MFT_RECORD *mft_rec)
  82 {
  83         MFT_RECORD *m;
  84
  85         if (mft_rec)
  86                 m = mft_rec;
  87         else {
  88                 m = map_mft_record(ni);
  89                 if (IS_ERR(m))
  90                         return PTR_ERR(m);
  91         }
  92         __format_mft_record(m, ni->vol->mft_record_size, ni->mft_no);
  93         if (!mft_rec) {
  94                 // FIXME: Need to set the mft record dirty!
  95                 unmap_mft_record(ni);
  96         }
  97         return 0;
  98 }
  99
 100 /**
 101  * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
 102  */
 103 extern int ntfs_readpage(struct file *, struct page *);
 104
 105 /**
 106  * ntfs_mft_aops - address space operations for access to $MFT
 107  *
 108  * Address space operations for access to $MFT. This allows us to simply use
 109  * ntfs_map_page() in map_mft_record_page().
 110  */
 111 struct address_space_operations ntfs_mft_aops = {
 112         .readpage       = ntfs_readpage,        /* Fill page with data. */
 113         .sync_page      = block_sync_page,      /* Currently, just unplugs the
 114                                                    disk request queue. */
 115 };
 116
 117 /**
 118  * map_mft_record_page - map the page in which a specific mft record resides
 119  * @ni:         ntfs inode whose mft record page to map
 120  *
 121  * This maps the page in which the mft record of the ntfs inode @ni is situated
 122  * and returns a pointer to the mft record within the mapped page.
 123  *
 124  * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
 125  * contains the negative error code returned.
 126  */
 127 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 128 {
 129         ntfs_volume *vol = ni->vol;
 130         struct inode *mft_vi = vol->mft_ino;
 131         struct page *page;
 132         unsigned long index, ofs, end_index;
 133
 134         BUG_ON(ni->page);
 135         /*
 136          * The index into the page cache and the offset within the page cache
 137          * page of the wanted mft record. FIXME: We need to check for
 138          * overflowing the unsigned long, but I don't think we would ever get
 139          * here if the volume was that big...
 140          */
 141         index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
 142         ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 143
 144         /* The maximum valid index into the page cache for $MFT's data. */
 145         end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
 146
 147         /* If the wanted index is out of bounds the mft record doesn't exist. */
 148         if (unlikely(index >= end_index)) {
 149                 if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
 150                                 ofs + vol->mft_record_size) {
 151                         page = ERR_PTR(-ENOENT);
 152                         goto err_out;
 153                 }
 154         }
 155         /* Read, map, and pin the page. */
 156         page = ntfs_map_page(mft_vi->i_mapping, index);
 157         if (likely(!IS_ERR(page))) {
 158                 ni->page = page;
 159                 ni->page_ofs = ofs;
 160                 return page_address(page) + ofs;
 161         }
 162 err_out:
 163         ni->page = NULL;
 164         ni->page_ofs = 0;
 165         ntfs_error(vol->sb, "Failed with error code %lu.", -PTR_ERR(page));
 166         return (void*)page;
 167 }
 168
 169 /**
 170  * map_mft_record - map, pin and lock an mft record
 171  * @ni:         ntfs inode whose MFT record to map
 172  *
 173  * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
 174  * for the semaphore if it was already locked by someone else.
 175  *
 176  * The page of the record is mapped using map_mft_record_page() before being
 177  * returned to the caller.
 178  *
 179  * This in turn uses ntfs_map_page() to get the page containing the wanted mft
 180  * record (it in turn calls read_cache_page() which reads it in from disk if
 181  * necessary, increments the use count on the page so that it cannot disappear
 182  * under us and returns a reference to the page cache page).
 183  *
 184  * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
 185  * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
 186  * and the post-read mst fixups on each mft record in the page have been
 187  * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
 188  * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
 189  * ntfs_map_page() waits for PG_locked to become clear and checks if
 190  * PG_uptodate is set and returns an error code if not. This provides
 191  * sufficient protection against races when reading/using the page.
 192  *
 193  * However there is the write mapping to think about. Doing the above described
 194  * checking here will be fine, because when initiating the write we will set
 195  * PG_locked and clear PG_uptodate making sure nobody is touching the page
 196  * contents. Doing the locking this way means that the commit to disk code in
 197  * the page cache code paths is automatically sufficiently locked with us as
 198  * we will not touch a page that has been locked or is not uptodate. The only
 199  * locking problem then is them locking the page while we are accessing it.
 200  *
 201  * So that code will end up having to own the mrec_lock of all mft
 202  * records/inodes present in the page before I/O can proceed. In that case we
 203  * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
 204  * accessing anything without owning the mrec_lock semaphore. But we do need
 205  * to use them because of the read_cache_page() invocation and the code becomes
 206  * so much simpler this way that it is well worth it.
 207  *
 208  * The mft record is now ours and we return a pointer to it. You need to check
 209  * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
 210  * the error code.
 211  *
 212  * NOTE: Caller is responsible for setting the mft record dirty before calling
 213  * unmap_mft_record(). This is obviously only necessary if the caller really
 214  * modified the mft record...
 215  * Q: Do we want to recycle one of the VFS inode state bits instead?
 216  * A: No, the inode ones mean we want to change the mft record, not we want to
 217  * write it out.
 218  */
 219 MFT_RECORD *map_mft_record(ntfs_inode *ni)
 220 {
 221         MFT_RECORD *m;
 222
 223         ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
 224
 225         /* Make sure the ntfs inode doesn't go away. */
 226         atomic_inc(&ni->count);
 227
 228         /* Serialize access to this mft record. */
 229         down(&ni->mrec_lock);
 230
 231         m = map_mft_record_page(ni);
 232         if (likely(!IS_ERR(m)))
 233                 return m;
 234
 235         up(&ni->mrec_lock);
 236         atomic_dec(&ni->count);
 237         ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
 238         return m;
 239 }
 240
 241 /**
 242  * unmap_mft_record_page - unmap the page in which a specific mft record resides
 243  * @ni:         ntfs inode whose mft record page to unmap
 244  *
 245  * This unmaps the page in which the mft record of the ntfs inode @ni is
 246  * situated and returns. This is a NOOP if highmem is not configured.
 247  *
 248  * The unmap happens via ntfs_unmap_page() which in turn decrements the use
 249  * count on the page thus releasing it from the pinned state.
 250  *
 251  * We do not actually unmap the page from memory of course, as that will be
 252  * done by the page cache code itself when memory pressure increases or
 253  * whatever.
 254  */
 255 static inline void unmap_mft_record_page(ntfs_inode *ni)
 256 {
 257         BUG_ON(!ni->page);
 258
 259         // TODO: If dirty, blah...
 260         ntfs_unmap_page(ni->page);
 261         ni->page = NULL;
 262         ni->page_ofs = 0;
 263         return;
 264 }
 265
 266 /**
 267  * unmap_mft_record - release a mapped mft record
 268  * @ni:         ntfs inode whose MFT record to unmap
 269  *
 270  * We release the page mapping and the mrec_lock mutex which unmaps the mft
 271  * record and releases it for others to get hold of. We also release the ntfs
 272  * inode by decrementing the ntfs inode reference count.
 273  *
 274  * NOTE: If caller has modified the mft record, it is imperative to set the mft
 275  * record dirty BEFORE calling unmap_mft_record().
 276  */
 277 void unmap_mft_record(ntfs_inode *ni)
 278 {
 279         struct page *page = ni->page;
 280
 281         BUG_ON(!page);
 282
 283         ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
 284
 285         unmap_mft_record_page(ni);
 286         up(&ni->mrec_lock);
 287         atomic_dec(&ni->count);
 288         /*
 289          * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
 290          * ntfs_clear_extent_inode() in the extent inode case, and to the
 291          * caller in the non-extent, yet pure ntfs inode case, to do the actual
 292          * tear down of all structures and freeing of all allocated memory.
 293          */
 294         return;
 295 }
 296
 297 /**
 298  * map_extent_mft_record - load an extent inode and attach it to its base
 299  * @base_ni:    base ntfs inode
 300  * @mref:       mft reference of the extent inode to load (in little endian)
 301  * @ntfs_ino:   on successful return, pointer to the ntfs_inode structure
 302  *
 303  * Load the extent mft record @mref and attach it to its base inode @base_ni.
 304  * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
 305  * PTR_ERR(result) gives the negative error code.
 306  *
 307  * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
 308  * structure of the mapped extent inode.
 309  */
 310 MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 311                 ntfs_inode **ntfs_ino)
 312 {
 313         MFT_RECORD *m;
 314         ntfs_inode *ni = NULL;
 315         ntfs_inode **extent_nis = NULL;
 316         int i;
 317         unsigned long mft_no = MREF_LE(mref);
 318         u16 seq_no = MSEQNO_LE(mref);
 319         BOOL destroy_ni = FALSE;
 320
 321         ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
 322                         mft_no, base_ni->mft_no);
 323         /* Make sure the base ntfs inode doesn't go away. */
 324         atomic_inc(&base_ni->count);
 325         /*
 326          * Check if this extent inode has already been added to the base inode,
 327          * in which case just return it. If not found, add it to the base
 328          * inode before returning it.
 329          */
 330         down(&base_ni->extent_lock);
 331         if (base_ni->nr_extents > 0) {
 332                 extent_nis = base_ni->ext.extent_ntfs_inos;
 333                 for (i = 0; i < base_ni->nr_extents; i++) {
 334                         if (mft_no != extent_nis[i]->mft_no)
 335                                 continue;
 336                         ni = extent_nis[i];
 337                         /* Make sure the ntfs inode doesn't go away. */
 338                         atomic_inc(&ni->count);
 339                         break;
 340                 }
 341         }
 342         if (likely(ni != NULL)) {
 343                 up(&base_ni->extent_lock);
 344                 atomic_dec(&base_ni->count);
 345                 /* We found the record; just have to map and return it. */
 346                 m = map_mft_record(ni);
 347                 /* map_mft_record() has incremented this on success. */
 348                 atomic_dec(&ni->count);
 349                 if (likely(!IS_ERR(m))) {
 350                         /* Verify the sequence number. */
 351                         if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
 352                                 ntfs_debug("Done 1.");
 353                                 *ntfs_ino = ni;
 354                                 return m;
 355                         }
 356                         unmap_mft_record(ni);
 357                         ntfs_error(base_ni->vol->sb, "Found stale extent mft "
 358                                         "reference! Corrupt file system. "
 359                                         "Run chkdsk.");
 360                         return ERR_PTR(-EIO);
 361                 }
 362 map_err_out:
 363                 ntfs_error(base_ni->vol->sb, "Failed to map extent "
 364                                 "mft record, error code %ld.", -PTR_ERR(m));
 365                 return m;
 366         }
 367         /* Record wasn't there. Get a new ntfs inode and initialize it. */
 368         ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
 369         if (unlikely(!ni)) {
 370                 up(&base_ni->extent_lock);
 371                 atomic_dec(&base_ni->count);
 372                 return ERR_PTR(-ENOMEM);
 373         }
 374         ni->vol = base_ni->vol;
 375         ni->seq_no = seq_no;
 376         ni->nr_extents = -1;
 377         ni->ext.base_ntfs_ino = base_ni;
 378         /* Now map the record. */
 379         m = map_mft_record(ni);
 380         if (unlikely(IS_ERR(m))) {
 381                 up(&base_ni->extent_lock);
 382                 atomic_dec(&base_ni->count);
 383                 ntfs_clear_extent_inode(ni);
 384                 goto map_err_out;
 385         }
 386         /* Verify the sequence number. */
 387         if (unlikely(le16_to_cpu(m->sequence_number) != seq_no)) {
 388                 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
 389                                 "reference! Corrupt file system. Run chkdsk.");
 390                 destroy_ni = TRUE;
 391                 m = ERR_PTR(-EIO);
 392                 goto unm_err_out;
 393         }
 394         /* Attach extent inode to base inode, reallocating memory if needed. */
 395         if (!(base_ni->nr_extents & 3)) {
 396                 ntfs_inode **tmp;
 397                 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
 398
 399                 tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
 400                 if (unlikely(!tmp)) {
 401                         ntfs_error(base_ni->vol->sb, "Failed to allocate "
 402                                         "internal buffer.");
 403                         destroy_ni = TRUE;
 404                         m = ERR_PTR(-ENOMEM);
 405                         goto unm_err_out;
 406                 }
 407                 if (base_ni->ext.extent_ntfs_inos) {
 408                         memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
 409                                         4 * sizeof(ntfs_inode *));
 410                         kfree(base_ni->ext.extent_ntfs_inos);
 411                 }
 412                 base_ni->ext.extent_ntfs_inos = tmp;
 413         }
 414         base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
 415         up(&base_ni->extent_lock);
 416         atomic_dec(&base_ni->count);
 417         ntfs_debug("Done 2.");
 418         *ntfs_ino = ni;
 419         return m;
 420 unm_err_out:
 421         unmap_mft_record(ni);
 422         up(&base_ni->extent_lock);
 423         atomic_dec(&base_ni->count);
 424         /*
 425          * If the extent inode was not attached to the base inode we need to
 426          * release it or we will leak memory.
 427          */
 428         if (destroy_ni)
 429                 ntfs_clear_extent_inode(ni);
 430         return m;
 431 }
 432