fs/namei.c

   1 /*
   2  *  linux/fs/namei.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  * Some corrections by tytso.
   9  */
  10
  11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12  * lookup logic.
  13  */
  14
  15 #include <linux/mm.h>
  16 #include <linux/proc_fs.h>
  17 #include <linux/smp_lock.h>
  18 #include <linux/quotaops.h>
  19
  20 #include <asm/uaccess.h>
  21 #include <asm/unaligned.h>
  22 #include <asm/semaphore.h>
  23 #include <asm/page.h>
  24 #include <asm/pgtable.h>
  25
  26 #include <asm/namei.h>
  27
  28 /* This can be removed after the beta phase. */
  29 #define CACHE_SUPERVISE /* debug the correctness of dcache entries */
  30 #undef DEBUG            /* some other debugging */
  31
  32
  33 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
  34
  35 /* [Feb-1997 T. Schoebel-Theuer]
  36  * Fundamental changes in the pathname lookup mechanisms (namei)
  37  * were necessary because of omirr.  The reason is that omirr needs
  38  * to know the _real_ pathname, not the user-supplied one, in case
  39  * of symlinks (and also when transname replacements occur).
  40  *
  41  * The new code replaces the old recursive symlink resolution with
  42  * an iterative one (in case of non-nested symlink chains).  It does
  43  * this with calls to <fs>_follow_link().
  44  * As a side effect, dir_namei(), _namei() and follow_link() are now
  45  * replaced with a single function lookup_dentry() that can handle all
  46  * the special cases of the former code.
  47  *
  48  * With the new dcache, the pathname is stored at each inode, at least as
  49  * long as the refcount of the inode is positive.  As a side effect, the
  50  * size of the dcache depends on the inode cache and thus is dynamic.
  51  *
  52  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  53  * resolution to correspond with current state of the code.
  54  *
  55  * Note that the symlink resolution is not *completely* iterative.
  56  * There is still a significant amount of tail- and mid- recursion in
  57  * the algorithm.  Also, note that <fs>_readlink() is not used in
  58  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  59  * may return different results than <fs>_follow_link().  Many virtual
  60  * filesystems (including /proc) exhibit this behavior.
  61  */
  62
  63 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  64  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  65  * and the name already exists in form of a symlink, try to create the new
  66  * name indicated by the symlink. The old code always complained that the
  67  * name already exists, due to not following the symlink even if its target
  68  * is nonexistent.  The new semantics affects also mknod() and link() when
   69  * the name is a symlink pointing to a non-existent name.
  70  *
  71  * I don't know which semantics is the right one, since I have no access
  72  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  73  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  74  * "old" one. Personally, I think the new semantics is much more logical.
  75  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
   76  * file does succeed in both HP-UX and SunOS, but not in Solaris
  77  * and in the old Linux semantics.
  78  */
  79
  80 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  81  * semantics.  See the comments in "open_namei" and "do_link" below.
  82  *
  83  * [10-Sep-98 Alan Modra] Another symlink change.
  84  */
  85
  86 /* In order to reduce some races, while at the same time doing additional
  87  * checking and hopefully speeding things up, we copy filenames to the
  88  * kernel data space before using them..
  89  *
  90  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
  91  */
  92 static inline int do_getname(const char *filename, char *page)
  93 {
  94         int retval;
  95         unsigned long len = PAGE_SIZE;
  96
  97         if ((unsigned long) filename >= TASK_SIZE) {
  98                 if (!segment_eq(get_fs(), KERNEL_DS))
  99                         return -EFAULT;
 100         } else if (TASK_SIZE - (unsigned long) filename < PAGE_SIZE)
 101                 len = TASK_SIZE - (unsigned long) filename;
 102
 103         retval = strncpy_from_user((char *)page, filename, len);
 104         if (retval > 0) {
 105                 if (retval < len)
 106                         return 0;
 107                 return -ENAMETOOLONG;
 108         } else if (!retval)
 109                 retval = -ENOENT;
 110         return retval;
 111 }
 112
 113 char * getname(const char * filename)
 114 {
 115         char *tmp, *result;
 116
 117         result = ERR_PTR(-ENOMEM);
 118         tmp = __getname();
 119         if (tmp)  {
 120                 int retval = do_getname(filename, tmp);
 121
 122                 result = tmp;
 123                 if (retval < 0) {
 124                         putname(tmp);
 125                         result = ERR_PTR(retval);
 126                 }
 127         }
 128         return result;
 129 }
 130
 131 /*
 132  *      permission()
 133  *
 134  * is used to check for read/write/execute permissions on a file.
 135  * We use "fsuid" for this, letting us set arbitrary permissions
 136  * for filesystem access without changing the "normal" uids which
 137  * are used for other things..
 138  */
 139 int permission(struct inode * inode,int mask)
 140 {
 141         int mode = inode->i_mode;
 142
 143         if (inode->i_op && inode->i_op->permission)
 144                 return inode->i_op->permission(inode, mask);
 145         else if ((mask & S_IWOTH) && IS_RDONLY(inode) &&
 146                  (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 147                 return -EROFS; /* Nobody gets write access to a read-only fs */
 148         else if ((mask & S_IWOTH) && IS_IMMUTABLE(inode))
 149                 return -EACCES; /* Nobody gets write access to an immutable file */
 150         else if (current->fsuid == inode->i_uid)
 151                 mode >>= 6;
 152         else if (in_group_p(inode->i_gid))
 153                 mode >>= 3;
 154         if (((mode & mask & S_IRWXO) == mask) || capable(CAP_DAC_OVERRIDE))
 155                 return 0;
 156         /* read and search access */
 157         if ((mask == S_IROTH) ||
 158             (S_ISDIR(mode)  && !(mask & ~(S_IROTH | S_IXOTH))))
 159                 if (capable(CAP_DAC_READ_SEARCH))
 160                         return 0;
 161         return -EACCES;
 162 }
 163
 164 /*
 165  * get_write_access() gets write permission for a file.
 166  * put_write_access() releases this write permission.
 167  * This is used for regular files.
 168  * We cannot support write (and maybe mmap read-write shared) accesses and
 169  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 170  * can have the following values:
 171  * 0: no writers, no VM_DENYWRITE mappings
 172  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 173  * > 0: (i_writecount) users are writing to the file.
 174  *
 175  * WARNING: as soon as we will move get_write_access(), do_mmap() or
 176  * prepare_binfmt() out of the big lock we will need a spinlock protecting
 177  * the checks in all 3. For the time being it is not needed.
 178  */
 179 int get_write_access(struct inode * inode)
 180 {
 181         if (atomic_read(&inode->i_writecount) < 0)
 182                 return -ETXTBSY;
 183         atomic_inc(&inode->i_writecount);
 184         return 0;
 185 }
 186
 187 void put_write_access(struct inode * inode)
 188 {
 189         atomic_dec(&inode->i_writecount);
 190 }
 191
 192 /*
 193  * "." and ".." are special - ".." especially so because it has to be able
 194  * to know about the current root directory and parent relationships
 195  */
 196 static struct dentry * reserved_lookup(struct dentry * parent, struct qstr * name)
 197 {
 198         struct dentry *result = NULL;
 199         if (name->name[0] == '.') {
 200                 switch (name->len) {
 201                 default:
 202                         break;
 203                 case 2:
 204                         if (name->name[1] != '.')
 205                                 break;
 206
 207                         if (parent != current->fs->root)
 208                                 parent = parent->d_covers->d_parent;
 209                         /* fallthrough */
 210                 case 1:
 211                         result = parent;
 212                 }
 213         }
 214         return dget(result);
 215 }
 216
 217 /*
 218  * Internal lookup() using the new generic dcache.
 219  */
 220 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
 221 {
 222         struct dentry * dentry = d_lookup(parent, name);
 223
 224         if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
 225                 if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
 226                         dput(dentry);
 227                         dentry = NULL;
 228                 }
 229         }
 230         return dentry;
 231 }
 232
 233 /*
 234  * This is called when everything else fails, and we actually have
 235  * to go to the low-level filesystem to find out what we should do..
 236  *
 237  * We get the directory semaphore, and after getting that we also
 238  * make sure that nobody added the entry to the dcache in the meantime..
 239  */
 240 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
 241 {
 242         struct dentry * result;
 243         struct inode *dir = parent->d_inode;
 244
 245         down(&dir->i_sem);
 246         /*
 247          * First re-do the cached lookup just in case it was created
 248          * while we waited for the directory semaphore..
 249          *
 250          * FIXME! This could use version numbering or similar to
 251          * avoid unnecessary cache lookups.
 252          */
 253         result = d_lookup(parent, name);
 254         if (!result) {
 255                 struct dentry * dentry = d_alloc(parent, name);
 256                 result = ERR_PTR(-ENOMEM);
 257                 if (dentry) {
 258                         result = dir->i_op->lookup(dir, dentry);
 259                         if (result)
 260                                 dput(dentry);
 261                         else
 262                                 result = dentry;
 263                 }
 264                 up(&dir->i_sem);
 265                 return result;
 266         }
 267
 268         /*
 269          * Uhhuh! Nasty case: the cache was re-populated while
 270          * we waited on the semaphore. Need to revalidate, but
 271          * we're going to return this entry regardless (same
 272          * as if it was busy).
 273          */
 274         up(&dir->i_sem);
 275         if (result->d_op && result->d_op->d_revalidate)
 276                 result->d_op->d_revalidate(result, flags);
 277         return result;
 278 }
 279
 280 static struct dentry * do_follow_link(struct dentry *base, struct dentry *dentry, unsigned int follow)
 281 {
 282         struct inode * inode = dentry->d_inode;
 283
 284         if ((follow & LOOKUP_FOLLOW)
 285             && inode && inode->i_op && inode->i_op->follow_link) {
 286                 if (current->link_count < 5) {
 287                         struct dentry * result;
 288
 289                         current->link_count++;
 290                         /* This eats the base */
 291                         result = inode->i_op->follow_link(dentry, base, follow);
 292                         current->link_count--;
 293                         dput(dentry);
 294                         return result;
 295                 }
 296                 dput(dentry);
 297                 dentry = ERR_PTR(-ELOOP);
 298         }
 299         dput(base);
 300         return dentry;
 301 }
 302
 303 static inline struct dentry * follow_mount(struct dentry * dentry)
 304 {
 305         struct dentry * mnt = dentry->d_mounts;
 306
 307         if (mnt != dentry) {
 308                 dget(mnt);
 309                 dput(dentry);
 310                 dentry = mnt;
 311         }
 312         return dentry;
 313 }
 314
 315 /*
 316  * Name resolution.
 317  *
 318  * This is the basic name resolution function, turning a pathname
 319  * into the final dentry.
 320  */
 321 struct dentry * lookup_dentry(const char * name, struct dentry * base, unsigned int lookup_flags)
 322 {
 323         struct dentry * dentry;
 324         struct inode *inode;
 325
 326         if (*name == '/') {
 327                 if (base)
 328                         dput(base);
 329                 do {
 330                         name++;
 331                 } while (*name == '/');
 332                 __prefix_lookup_dentry(name, lookup_flags);
 333                 base = dget(current->fs->root);
 334         } else if (!base) {
 335                 base = dget(current->fs->pwd);
 336         }
 337
 338         if (!*name)
 339                 goto return_base;
 340
 341         inode = base->d_inode;
 342         lookup_flags &= LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_SLASHOK;
 343
 344         /* At this point we know we have a real path component. */
 345         for(;;) {
 346                 int err;
 347                 unsigned long hash;
 348                 struct qstr this;
 349                 unsigned int flags;
 350                 unsigned int c;
 351
 352                 err = permission(inode, MAY_EXEC);
 353                 dentry = ERR_PTR(err);
 354                 if (err)
 355                         break;
 356
 357                 this.name = name;
 358                 c = *(const unsigned char *)name;
 359
 360                 hash = init_name_hash();
 361                 do {
 362                         name++;
 363                         hash = partial_name_hash(c, hash);
 364                         c = *(const unsigned char *)name;
 365                 } while (c && (c != '/'));
 366                 this.len = name - (const char *) this.name;
 367                 this.hash = end_name_hash(hash);
 368
 369                 /* remove trailing slashes? */
 370                 flags = lookup_flags;
 371                 if (c) {
 372                         char tmp;
 373
 374                         flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 375                         do {
 376                                 tmp = *++name;
 377                         } while (tmp == '/');
 378                         if (tmp)
 379                                 flags |= LOOKUP_CONTINUE;
 380                 }
 381
 382                 /*
 383                  * See if the low-level filesystem might want
 384                  * to use its own hash..
 385                  */
 386                 if (base->d_op && base->d_op->d_hash) {
 387                         int error;
 388                         error = base->d_op->d_hash(base, &this);
 389                         if (error < 0) {
 390                                 dentry = ERR_PTR(error);
 391                                 break;
 392                         }
 393                 }
 394
 395                 /* This does the actual lookups.. */
 396                 dentry = reserved_lookup(base, &this);
 397                 if (!dentry) {
 398                         dentry = cached_lookup(base, &this, flags);
 399                         if (!dentry) {
 400                                 dentry = real_lookup(base, &this, flags);
 401                                 if (IS_ERR(dentry))
 402                                         break;
 403                         }
 404                 }
 405
 406                 /* Check mountpoints.. */
 407                 dentry = follow_mount(dentry);
 408
 409                 base = do_follow_link(base, dentry, flags);
 410                 if (IS_ERR(base))
 411                         goto return_base;
 412
 413                 inode = base->d_inode;
 414                 if (flags & LOOKUP_DIRECTORY) {
 415                         if (!inode)
 416                                 goto no_inode;
 417                         dentry = ERR_PTR(-ENOTDIR);
 418                         if (!inode->i_op || !inode->i_op->lookup)
 419                                 break;
 420                         if (flags & LOOKUP_CONTINUE)
 421                                 continue;
 422                 }
 423 return_base:
 424                 return base;
 425 /*
 426  * The case of a nonexisting file is special.
 427  *
 428  * In the middle of a pathname lookup (ie when
 429  * LOOKUP_CONTINUE is set), it's an obvious
 430  * error and returns ENOENT.
 431  *
 432  * At the end of a pathname lookup it's legal,
 433  * and we return a negative dentry. However, we
 434  * get here only if there were trailing slashes,
 435  * which is legal only if we know it's supposed
 436  * to be a directory (ie "mkdir"). Thus the
 437  * LOOKUP_SLASHOK flag.
 438  */
 439 no_inode:
 440                 dentry = ERR_PTR(-ENOENT);
 441                 if (flags & LOOKUP_CONTINUE)
 442                         break;
 443                 if (flags & LOOKUP_SLASHOK)
 444                         goto return_base;
 445                 break;
 446         }
 447         dput(base);
 448         return dentry;
 449 }
 450
 451 /*
 452  *      namei()
 453  *
 454  * is used by most simple commands to get the inode of a specified name.
 455  * Open, link etc use their own routines, but this is enough for things
 456  * like 'chmod' etc.
 457  *
 458  * namei exists in two versions: namei/lnamei. The only difference is
 459  * that namei follows links, while lnamei does not.
 460  */
 461 struct dentry * __namei(const char *pathname, unsigned int lookup_flags)
 462 {
 463         char *name;
 464         struct dentry *dentry;
 465
 466         name = getname(pathname);
 467         dentry = (struct dentry *) name;
 468         if (!IS_ERR(name)) {
 469                 dentry = lookup_dentry(name, NULL, lookup_flags);
 470                 putname(name);
 471                 if (!IS_ERR(dentry)) {
 472                         if (!dentry->d_inode) {
 473                                 dput(dentry);
 474                                 dentry = ERR_PTR(-ENOENT);
 475                         }
 476                 }
 477         }
 478         return dentry;
 479 }
 480
 481 /*
 482  * It's inline, so penalty for filesystems that don't use sticky bit is
 483  * minimal.
 484  */
 485 static inline int check_sticky(struct inode *dir, struct inode *inode)
 486 {
 487         if (!(dir->i_mode & S_ISVTX))
 488                 return 0;
 489         if (inode->i_uid == current->fsuid)
 490                 return 0;
 491         if (dir->i_uid == current->fsuid)
 492                 return 0;
 493         return !capable(CAP_FOWNER);
 494 }
 495
 496 /*
 497  *      Check whether we can remove a link victim from directory dir, check
 498  *  whether the type of victim is right.
 499  *  1. We can't do it if dir is read-only (done in permission())
 500  *  2. We should have write and exec permissions on dir
 501  *  3. We can't remove anything from append-only dir
 502  *  4. We can't do anything with immutable dir (done in permission())
 503  *  5. If the sticky bit on dir is set we should either
 504  *      a. be owner of dir, or
 505  *      b. be owner of victim, or
 506  *      c. have CAP_FOWNER capability
 507  *  6. If the victim is append-only or immutable we can't do antyhing with
 508  *     links pointing to it.
 509  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 510  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 511  *  9. We can't remove a root or mountpoint.
 512  */
 513 static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir)
 514 {
 515         int error;
 516         if (!victim->d_inode || victim->d_parent->d_inode != dir)
 517                 return -ENOENT;
 518         error = permission(dir,MAY_WRITE | MAY_EXEC);
 519         if (error)
 520                 return error;
 521         if (IS_APPEND(dir))
 522                 return -EPERM;
 523         if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
 524             IS_IMMUTABLE(victim->d_inode))
 525                 return -EPERM;
 526         if (isdir) {
 527                 if (!S_ISDIR(victim->d_inode->i_mode))
 528                         return -ENOTDIR;
 529                 if (IS_ROOT(victim))
 530                         return -EBUSY;
 531                 if (victim->d_mounts != victim->d_covers)
 532                         return -EBUSY;
 533         } else if (S_ISDIR(victim->d_inode->i_mode))
 534                 return -EISDIR;
 535         return 0;
 536 }
 537
 538 /*      Check whether we can create an object with dentry child in directory
 539  *  dir.
 540  *  1. We can't do it if child already exists (open has special treatment for
 541  *     this case, but since we are inlined it's OK)
 542  *  2. We can't do it if dir is read-only (done in permission())
 543  *  3. We should have write and exec permissions on dir
 544  *  4. We can't do it if dir is immutable (done in permission())
 545  */
 546 static inline int may_create(struct inode *dir, struct dentry *child) {
 547         if (child->d_inode)
 548                 return -EEXIST;
 549         return permission(dir,MAY_WRITE | MAY_EXEC);
 550 }
 551
 552 static inline struct dentry *get_parent(struct dentry *dentry)
 553 {
 554         return dget(dentry->d_parent);
 555 }
 556
 557 static inline void unlock_dir(struct dentry *dir)
 558 {
 559         up(&dir->d_inode->i_sem);
 560         dput(dir);
 561 }
 562
 563 /*
 564  * We need to do a check-parent every time
 565  * after we have locked the parent - to verify
 566  * that the parent is still our parent and
 567  * that we are still hashed onto it..
 568  *
 569  * This is requied in case two processes race
 570  * on removing (or moving) the same entry: the
 571  * parent lock will serialize them, but the
 572  * other process will be too late..
 573  */
 574 #define check_parent(dir, dentry) \
 575         ((dir) == (dentry)->d_parent && !list_empty(&dentry->d_hash))
 576
 577 /*
 578  * Locking the parent is needed to:
 579  *  - serialize directory operations
 580  *  - make sure the parent doesn't change from
 581  *    under us in the middle of an operation.
 582  *
 583  * NOTE! Right now we'd rather use a "struct inode"
 584  * for this, but as I expect things to move toward
 585  * using dentries instead for most things it is
 586  * probably better to start with the conceptually
 587  * better interface of relying on a path of dentries.
 588  */
 589 static inline struct dentry *lock_parent(struct dentry *dentry)
 590 {
 591         struct dentry *dir = dget(dentry->d_parent);
 592
 593         down(&dir->d_inode->i_sem);
 594         return dir;
 595 }
 596
 597 /*
 598  * Whee.. Deadlock country. Happily there are only two VFS
 599  * operations that do this..
 600  */
 601 static inline void double_lock(struct dentry *d1, struct dentry *d2)
 602 {
 603         struct semaphore *s1 = &d1->d_inode->i_sem;
 604         struct semaphore *s2 = &d2->d_inode->i_sem;
 605
 606         if (s1 != s2) {
 607                 if ((unsigned long) s1 < (unsigned long) s2) {
 608                         struct semaphore *tmp = s2;
 609                         s2 = s1; s1 = tmp;
 610                 }
 611                 down(s1);
 612         }
 613         down(s2);
 614 }
 615
 616 static inline void double_unlock(struct dentry *d1, struct dentry *d2)
 617 {
 618         struct semaphore *s1 = &d1->d_inode->i_sem;
 619         struct semaphore *s2 = &d2->d_inode->i_sem;
 620
 621         up(s1);
 622         if (s1 != s2)
 623                 up(s2);
 624         dput(d1);
 625         dput(d2);
 626 }
 627
 628
 629 /*
 630  * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
 631  * reasons.
 632  *
 633  * O_DIRECTORY translates into forcing a directory lookup.
 634  */
 635 static inline int lookup_flags(unsigned int f)
 636 {
 637         unsigned long retval = LOOKUP_FOLLOW;
 638
 639         if (f & O_NOFOLLOW)
 640                 retval &= ~LOOKUP_FOLLOW;
 641
 642         if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
 643                 retval &= ~LOOKUP_FOLLOW;
 644
 645         if (f & O_DIRECTORY)
 646                 retval |= LOOKUP_DIRECTORY;
 647
 648         return retval;
 649 }
 650
 651 int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
 652 {
 653         int error;
 654
 655         error = may_create(dir, dentry);
 656         if (error)
 657                 goto exit_lock;
 658
 659         error = -EACCES;        /* shouldn't it be ENOSYS? */
 660         if (!dir->i_op || !dir->i_op->create)
 661                 goto exit_lock;
 662
 663         DQUOT_INIT(dir);
 664         error = dir->i_op->create(dir, dentry, mode);
 665 exit_lock:
 666         return error;
 667 }
 668
 669 /*
 670  *      open_namei()
 671  *
 672  * namei for open - this is in fact almost the whole open-routine.
 673  *
 674  * Note that the low bits of "flag" aren't the same as in the open
 675  * system call - they are 00 - no permissions needed
 676  *                        01 - read permission needed
 677  *                        10 - write permission needed
 678  *                        11 - read/write permissions needed
 679  * which is a lot more logical, and also allows the "no perm" needed
 680  * for symlinks (where the permissions are checked later).
 681  */
 682 struct dentry * open_namei(const char * pathname, int flag, int mode)
 683 {
 684         int acc_mode, error;
 685         struct inode *inode;
 686         struct dentry *dentry;
 687
 688         mode &= S_IALLUGO & ~current->fs->umask;
 689         mode |= S_IFREG;
 690
 691         dentry = lookup_dentry(pathname, NULL, lookup_flags(flag));
 692         if (IS_ERR(dentry))
 693                 return dentry;
 694
 695         acc_mode = ACC_MODE(flag);
 696         if (flag & O_CREAT) {
 697                 struct dentry *dir;
 698
 699                 if (dentry->d_inode) {
 700                         if (!(flag & O_EXCL))
 701                                 goto nocreate;
 702                         error = -EEXIST;
 703                         goto exit;
 704                 }
 705
 706                 dir = lock_parent(dentry);
 707                 if (!check_parent(dir, dentry)) {
 708                         /*
 709                          * Really nasty race happened. What's the
 710                          * right error code? We had a dentry, but
 711                          * before we could use it it was removed
 712                          * by somebody else. We could just re-try
 713                          * everything, I guess.
 714                          *
 715                          * ENOENT is definitely wrong.
 716                          */
 717                         error = -ENOENT;
 718                         unlock_dir(dir);
 719                         goto exit;
 720                 }
 721
 722                 /*
 723                  * Somebody might have created the file while we
 724                  * waited for the directory lock.. So we have to
 725                  * re-do the existence test.
 726                  */
 727                 if (dentry->d_inode) {
 728                         error = 0;
 729                         if (flag & O_EXCL)
 730                                 error = -EEXIST;
 731                 } else {
 732                         error = vfs_create(dir->d_inode, dentry,mode);
 733                         /* Don't check for write permission, don't truncate */
 734                         acc_mode = 0;
 735                         flag &= ~O_TRUNC;
 736                 }
 737                 unlock_dir(dir);
 738                 if (error)
 739                         goto exit;
 740         }
 741
 742 nocreate:
 743         error = -ENOENT;
 744         inode = dentry->d_inode;
 745         if (!inode)
 746                 goto exit;
 747
 748         error = -ELOOP;
 749         if (S_ISLNK(inode->i_mode))
 750                 goto exit;
 751
 752         error = -EISDIR;
 753         if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
 754                 goto exit;
 755
 756         error = permission(inode,acc_mode);
 757         if (error)
 758                 goto exit;
 759
 760         /*
 761          * FIFO's, sockets and device files are special: they don't
 762          * actually live on the filesystem itself, and as such you
 763          * can write to them even if the filesystem is read-only.
 764          */
 765         if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 766                 flag &= ~O_TRUNC;
 767         } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
 768                 error = -EACCES;
 769                 if (IS_NODEV(inode))
 770                         goto exit;
 771
 772                 flag &= ~O_TRUNC;
 773         } else {
 774                 error = -EROFS;
 775                 if (IS_RDONLY(inode) && (flag & 2))
 776                         goto exit;
 777         }
 778         /*
 779          * An append-only file must be opened in append mode for writing.
 780          */
 781         error = -EPERM;
 782         if (IS_APPEND(inode)) {
 783                 if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
 784                         goto exit;
 785                 if (flag & O_TRUNC)
 786                         goto exit;
 787         }
 788
 789         if (flag & O_TRUNC) {
 790                 error = get_write_access(inode);
 791                 if (error)
 792                         goto exit;
 793
 794                 /*
 795                  * Refuse to truncate files with mandatory locks held on them.
 796                  */
 797                 error = locks_verify_locked(inode);
 798                 if (!error) {
 799                         DQUOT_INIT(inode);
 800
 801                         error = do_truncate(dentry, 0);
 802                 }
 803                 put_write_access(inode);
 804                 if (error)
 805                         goto exit;
 806         } else
 807                 if (flag & FMODE_WRITE)
 808                         DQUOT_INIT(inode);
 809
 810         return dentry;
 811
 812 exit:
 813         dput(dentry);
 814         return ERR_PTR(error);
 815 }
 816
 817 struct dentry * do_mknod(const char * filename, int mode, dev_t dev)
 818 {
 819         int error;
 820         struct dentry *dir;
 821         struct dentry *dentry, *retval;
 822
 823         mode &= ~current->fs->umask;
 824         dentry = lookup_dentry(filename, NULL, LOOKUP_FOLLOW);
 825         if (IS_ERR(dentry))
 826                 return dentry;
 827
 828         dir = lock_parent(dentry);
 829         error = -ENOENT;
 830         if (!check_parent(dir, dentry))
 831                 goto exit_lock;
 832
 833         error = may_create(dir->d_inode, dentry);
 834         if (error)
 835                 goto exit_lock;
 836
 837         error = -EPERM;
 838         if (!dir->d_inode->i_op || !dir->d_inode->i_op->mknod)
 839                 goto exit_lock;
 840
 841         DQUOT_INIT(dir->d_inode);
 842         error = dir->d_inode->i_op->mknod(dir->d_inode, dentry, mode, dev);
 843 exit_lock:
 844         retval = ERR_PTR(error);
 845         if (!error)
 846                 retval = dget(dentry);
 847         unlock_dir(dir);
 848         dput(dentry);
 849         return retval;
 850 }
 851
 852 asmlinkage long sys_mknod(const char * filename, int mode, dev_t dev)
 853 {
 854         int error;
 855         char * tmp;
 856         struct dentry * dentry;
 857
 858         lock_kernel();
 859         error = -EPERM;
 860         if (S_ISDIR(mode) || (!S_ISFIFO(mode) && !capable(CAP_MKNOD)))
 861                 goto out;
 862         tmp = getname(filename);
 863         error = PTR_ERR(tmp);
 864         if (IS_ERR(tmp))
 865                 goto out;
 866
 867         error = -EINVAL;
 868         switch (mode & S_IFMT) {
 869         case 0:
 870                 mode |= S_IFREG;        /* fallthrough */
 871         case S_IFREG:
 872                 mode &= ~current->fs->umask;
 873                 dentry = lookup_dentry(filename, NULL, LOOKUP_FOLLOW);
 874                 if (IS_ERR(dentry))
 875                         error = PTR_ERR(dentry);
 876                 else {
 877                         struct dentry *dir = lock_parent(dentry);
 878                         error = -ENOENT;
 879                         if (check_parent(dir, dentry))
 880                                 error = vfs_create(dir->d_inode, dentry, mode);
 881                         dput(dentry);
 882                 }
 883                 break;
 884         case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
 885                 dentry = do_mknod(tmp,mode,dev);
 886                 error = PTR_ERR(dentry);
 887                 if (!IS_ERR(dentry)) {
 888                         dput(dentry);
 889                         error = 0;
 890                 }
 891                 break;
 892         }
 893         putname(tmp);
 894
 895 out:
 896         unlock_kernel();
 897         return error;
 898 }
 899
 900 static inline int do_mkdir(const char * pathname, int mode)
 901 {
 902         int error;
 903         struct dentry *dir;
 904         struct dentry *dentry;
 905
 906         dentry = lookup_dentry(pathname, NULL, LOOKUP_SLASHOK);
 907         error = PTR_ERR(dentry);
 908         if (IS_ERR(dentry))
 909                 goto exit;
 910
 911         /*
 912          * EEXIST is kind of a strange error code to
 913          * return, but basically if the dentry was moved
 914          * or unlinked while we locked the parent, we
 915          * do know that it _did_ exist before, and as
 916          * such it makes perfect sense.. In contrast,
 917          * ENOENT doesn't make sense for mkdir.
 918          */
 919         dir = lock_parent(dentry);
 920         error = -EEXIST;
 921         if (!check_parent(dir, dentry))
 922                 goto exit_lock;
 923
 924         error = may_create(dir->d_inode, dentry);
 925         if (error)
 926                 goto exit_lock;
 927
 928         error = -EPERM;
 929         if (!dir->d_inode->i_op || !dir->d_inode->i_op->mkdir)
 930                 goto exit_lock;
 931
 932         DQUOT_INIT(dir->d_inode);
 933         mode &= (S_IRWXUGO|S_ISVTX) & ~current->fs->umask;
 934         error = dir->d_inode->i_op->mkdir(dir->d_inode, dentry, mode);
 935
 936 exit_lock:
 937         unlock_dir(dir);
 938         dput(dentry);
 939 exit:
 940         return error;
 941 }
 942
 943 asmlinkage long sys_mkdir(const char * pathname, int mode)
 944 {
 945         int error;
 946         char * tmp;
 947
 948         lock_kernel();
 949         tmp = getname(pathname);
 950         error = PTR_ERR(tmp);
 951         if (!IS_ERR(tmp)) {
 952                 error = do_mkdir(tmp,mode);
 953                 putname(tmp);
 954         }
 955         unlock_kernel();
 956         return error;
 957 }
 958
/*
 * Remove the directory @dentry from the parent directory inode @dir
 * using the filesystem's ->rmdir() inode operation.
 *
 * Returns the error from may_delete() if it refuses the operation,
 * -EPERM if the filesystem provides no rmdir operation, and otherwise
 * whatever ->rmdir() returns.
 *
 * NOTE(review): the third argument to may_delete() is 1 here while
 * vfs_unlink() passes 0; presumably it flags the victim as a
 * directory -- confirm against may_delete().
 */
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
        int error;

        error = may_delete(dir, dentry, 1);
        if (error)
                return error;

        if (!dir->i_op || !dir->i_op->rmdir)
                return -EPERM;

        DQUOT_INIT(dir);

        /*
         * We try to drop the dentry early: we should have
         * a usage count of 2 if we're the only user of this
         * dentry, and if that is true (possibly after pruning
         * the dcache), then we drop the dentry now.
         *
         * A low-level filesystem can, if it chooses, legally
         * do a
         *
         *      if (!list_empty(&dentry->d_hash))
         *              return -EBUSY;
         *
         * if it cannot handle the case of removing a directory
         * that is still in use by something else..
         */
        switch (dentry->d_count) {
        default:
                /*
                 * Any count other than 2: prune the dcache below this
                 * dentry and re-check.  Only if the count is now
                 * exactly 2 do we fall through and drop the dentry.
                 */
                shrink_dcache_parent(dentry);
                if (dentry->d_count != 2)
                        break;
        case 2:
                d_drop(dentry);
        }

        error = dir->i_op->rmdir(dir, dentry);

        return error;
}
1000
/*
 * Look up @name (already a kernel string) and remove the directory it
 * refers to through vfs_rmdir().
 *
 * Returns 0 on success or a negative errno value: the lookup error,
 * -ENOENT if the dentry has no inode or fails check_parent(), or
 * whatever vfs_rmdir() returns.
 */
static inline int do_rmdir(const char * name)
{
        int error;
        struct dentry *dir;
        struct dentry *dentry;

        dentry = lookup_dentry(name, NULL, 0);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit;

        /* A negative dentry (no inode) means there is nothing to remove. */
        error = -ENOENT;
        if (!dentry->d_inode)
                goto exit_dput;

        /* Take our own reference on the parent before locking it. */
        dir = dget(dentry->d_parent);

        /*
         * The dentry->d_count stuff confuses d_delete() enough to
         * not kill the inode from under us while it is locked. This
         * wouldn't be needed, except the dentry semaphore is really
         * in the inode, not in the dentry..
         */
        dentry->d_count++;
        double_lock(dir, dentry);

        error = -ENOENT;
        if (check_parent(dir, dentry))
                error = vfs_rmdir(dir->d_inode, dentry);

        /*
         * NOTE(review): double_unlock() presumably also drops the two
         * references taken above (the dget() on the parent and the
         * manual d_count++) -- confirm against its definition.
         */
        double_unlock(dentry, dir);
exit_dput:
        /* Drops the reference obtained from lookup_dentry(). */
        dput(dentry);
exit:
        return error;
}
1037
1038 asmlinkage long sys_rmdir(const char * pathname)
1039 {
1040         int error;
1041         char * tmp;
1042
1043         lock_kernel();
1044         tmp = getname(pathname);
1045         error = PTR_ERR(tmp);
1046         if (!IS_ERR(tmp)) {
1047                 error = do_rmdir(tmp);
1048                 putname(tmp);
1049         }
1050         unlock_kernel();
1051         return error;
1052 }
1053
1054 int vfs_unlink(struct inode *dir, struct dentry *dentry)
1055 {
1056         int error;
1057
1058         error = may_delete(dir, dentry, 0);
1059         if (!error) {
1060                 error = -EPERM;
1061                 if (dir->i_op && dir->i_op->unlink) {
1062                         DQUOT_INIT(dir);
1063                         error = dir->i_op->unlink(dir, dentry);
1064                 }
1065         }
1066         return error;
1067 }
1068
1069 static inline int do_unlink(const char * name)
1070 {
1071         int error;
1072         struct dentry *dir;
1073         struct dentry *dentry;
1074
1075         dentry = lookup_dentry(name, NULL, 0);
1076         error = PTR_ERR(dentry);
1077         if (IS_ERR(dentry))
1078                 goto exit;
1079
1080         dir = lock_parent(dentry);
1081         error = -ENOENT;
1082         if (check_parent(dir, dentry))
1083                 error = vfs_unlink(dir->d_inode, dentry);
1084
1085         unlock_dir(dir);
1086         dput(dentry);
1087 exit:
1088         return error;
1089 }
1090
1091 asmlinkage long sys_unlink(const char * pathname)
1092 {
1093         int error;
1094         char * tmp;
1095
1096         lock_kernel();
1097         tmp = getname(pathname);
1098         error = PTR_ERR(tmp);
1099         if (!IS_ERR(tmp)) {
1100                 error = do_unlink(tmp);
1101                 putname(tmp);
1102         }
1103         unlock_kernel();
1104         return error;
1105 }
1106
1107 static inline int do_symlink(const char * oldname, const char * newname)
1108 {
1109         int error;
1110         struct dentry *dir;
1111         struct dentry *dentry;
1112
1113         dentry = lookup_dentry(newname, NULL, 0);
1114
1115         error = PTR_ERR(dentry);
1116         if (IS_ERR(dentry))
1117                 goto exit;
1118
1119         dir = lock_parent(dentry);
1120         error = -ENOENT;
1121         if (!check_parent(dir, dentry))
1122                 goto exit_lock;
1123
1124         error = may_create(dir->d_inode, dentry);
1125         if (error)
1126                 goto exit_lock;
1127
1128         error = -EPERM;
1129         if (!dir->d_inode->i_op || !dir->d_inode->i_op->symlink)
1130                 goto exit_lock;
1131
1132         DQUOT_INIT(dir->d_inode);
1133         error = dir->d_inode->i_op->symlink(dir->d_inode, dentry, oldname);
1134
1135 exit_lock:
1136         unlock_dir(dir);
1137         dput(dentry);
1138 exit:
1139         return error;
1140 }
1141
1142 asmlinkage long sys_symlink(const char * oldname, const char * newname)
1143 {
1144         int error;
1145         char * from;
1146
1147         lock_kernel();
1148         from = getname(oldname);
1149         error = PTR_ERR(from);
1150         if (!IS_ERR(from)) {
1151                 char * to;
1152                 to = getname(newname);
1153                 error = PTR_ERR(to);
1154                 if (!IS_ERR(to)) {
1155                         error = do_symlink(from,to);
1156                         putname(to);
1157                 }
1158                 putname(from);
1159         }
1160         unlock_kernel();
1161         return error;
1162 }
1163
1164 static inline int do_link(const char * oldname, const char * newname)
1165 {
1166         struct dentry *old_dentry, *new_dentry, *dir;
1167         struct inode *inode;
1168         int error;
1169
1170         /*
1171          * Hardlinks are often used in delicate situations.  We avoid
1172          * security-related surprises by not following symlinks on the
1173          * newname.  --KAB
1174          *
1175          * We don't follow them on the oldname either to be compatible
1176          * with linux 2.0, and to avoid hard-linking to directories
1177          * and other special files.  --ADM
1178          */
1179         old_dentry = lookup_dentry(oldname, NULL, 0);
1180         error = PTR_ERR(old_dentry);
1181         if (IS_ERR(old_dentry))
1182                 goto exit;
1183
1184         new_dentry = lookup_dentry(newname, NULL, 0);
1185         error = PTR_ERR(new_dentry);
1186         if (IS_ERR(new_dentry))
1187                 goto exit_old;
1188
1189         dir = lock_parent(new_dentry);
1190         error = -ENOENT;
1191         if (!check_parent(dir, new_dentry))
1192                 goto exit_lock;
1193
1194         error = -ENOENT;
1195         inode = old_dentry->d_inode;
1196         if (!inode)
1197                 goto exit_lock;
1198
1199         error = may_create(dir->d_inode, new_dentry);
1200         if (error)
1201                 goto exit_lock;
1202
1203         error = -EXDEV;
1204         if (dir->d_inode->i_dev != inode->i_dev)
1205                 goto exit_lock;
1206
1207         /*
1208          * A link to an append-only or immutable file cannot be created.
1209          */
1210         error = -EPERM;
1211         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1212                 goto exit_lock;
1213
1214         error = -EPERM;
1215         if (!dir->d_inode->i_op || !dir->d_inode->i_op->link)
1216                 goto exit_lock;
1217
1218         DQUOT_INIT(dir->d_inode);
1219         error = dir->d_inode->i_op->link(old_dentry, dir->d_inode, new_dentry);
1220
1221 exit_lock:
1222         unlock_dir(dir);
1223         dput(new_dentry);
1224 exit_old:
1225         dput(old_dentry);
1226 exit:
1227         return error;
1228 }
1229
1230 asmlinkage long sys_link(const char * oldname, const char * newname)
1231 {
1232         int error;
1233         char * from;
1234
1235         lock_kernel();
1236         from = getname(oldname);
1237         error = PTR_ERR(from);
1238         if (!IS_ERR(from)) {
1239                 char * to;
1240                 to = getname(newname);
1241                 error = PTR_ERR(to);
1242                 if (!IS_ERR(to)) {
1243                         error = do_link(from,to);
1244                         putname(to);
1245                 }
1246                 putname(from);
1247         }
1248         unlock_kernel();
1249         return error;
1250 }
1251
/*
 * Rename the directory @old_dentry (living in @old_dir) to @new_dentry
 * (living in @new_dir) using the filesystem's ->rename() operation.
 *
 * Returns 0 immediately when both dentries already refer to the same
 * inode.  Otherwise returns 0 on success or a negative errno value:
 * a may_delete()/may_create()/permission() failure, -EXDEV when the
 * two directories are on different devices, -EPERM when the filesystem
 * has no rename operation, -EINVAL when is_subdir(new_dentry,
 * old_dentry) is true, -EBUSY when an existing target cannot be
 * invalidated, or whatever ->rename() returns.
 */
int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
               struct inode *new_dir, struct dentry *new_dentry)
{
        int error;
        int need_rehash = 0;

        if (old_dentry->d_inode == new_dentry->d_inode)
                return 0;

        error = may_delete(old_dir, old_dentry, 1);
        if (error)
                return error;

        if (new_dir->i_dev != old_dir->i_dev)
                return -EXDEV;

        /* A missing target is created; an existing one gets replaced. */
        if (!new_dentry->d_inode)
                error = may_create(new_dir, new_dentry);
        else
                error = may_delete(new_dir, new_dentry, 1);
        if (error)
                return error;

        if (!old_dir->i_op || !old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                error = permission(old_dentry->d_inode, MAY_WRITE);
        }
        /* error is still 0 here when the parent does not change. */
        if (error)
                return error;

        DQUOT_INIT(old_dir);
        DQUOT_INIT(new_dir);
        /* The rest runs under the superblock's s_vfs_rename_sem. */
        down(&old_dir->i_sb->s_vfs_rename_sem);
        error = -EINVAL;
        if (is_subdir(new_dentry, old_dentry))
                goto out_unlock;
        if (new_dentry->d_inode) {
                /*
                 * An existing target is invalidated before the rename
                 * and rehashed afterwards, whether or not the rename
                 * itself succeeds.
                 */
                error = -EBUSY;
                if (d_invalidate(new_dentry)<0)
                        goto out_unlock;
                need_rehash = 1;
        }
        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (need_rehash)
                d_rehash(new_dentry);
        /* Update the dcache only when the filesystem rename succeeded. */
        if (!error)
                d_move(old_dentry,new_dentry);
out_unlock:
        up(&old_dir->i_sb->s_vfs_rename_sem);
        return error;
}
1309
1310 int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
1311                struct inode *new_dir, struct dentry *new_dentry)
1312 {
1313         int error;
1314
1315         if (old_dentry->d_inode == new_dentry->d_inode)
1316                 return 0;
1317
1318         error = may_delete(old_dir, old_dentry, 0);
1319         if (error)
1320                 return error;
1321
1322         if (new_dir->i_dev != old_dir->i_dev)
1323                 return -EXDEV;
1324
1325         if (!new_dentry->d_inode)
1326                 error = may_create(new_dir, new_dentry);
1327         else
1328                 error = may_delete(new_dir, new_dentry, 0);
1329         if (error)
1330                 return error;
1331
1332         if (!old_dir->i_op || !old_dir->i_op->rename)
1333                 return -EPERM;
1334
1335         DQUOT_INIT(old_dir);
1336         DQUOT_INIT(new_dir);
1337         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
1338         if (error)
1339                 return error;
1340         /* The following d_move() should become unconditional */
1341         if (!(old_dir->i_sb->s_flags & MS_ODD_RENAME)) {
1342                 d_move(old_dentry, new_dentry);
1343         }
1344         return 0;
1345 }
1346
1347 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1348                struct inode *new_dir, struct dentry *new_dentry)
1349 {
1350         if (S_ISDIR(old_dentry->d_inode->i_mode))
1351                 return vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
1352         else
1353                 return vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
1354 }
1355
1356 static inline int do_rename(const char * oldname, const char * newname)
1357 {
1358         int error;
1359         struct dentry * old_dir, * new_dir;
1360         struct dentry * old_dentry, *new_dentry;
1361
1362         old_dentry = lookup_dentry(oldname, NULL, 0);
1363
1364         error = PTR_ERR(old_dentry);
1365         if (IS_ERR(old_dentry))
1366                 goto exit;
1367
1368         error = -ENOENT;
1369         if (!old_dentry->d_inode)
1370                 goto exit_old;
1371
1372         {
1373                 unsigned int flags = 0;
1374                 if (S_ISDIR(old_dentry->d_inode->i_mode))
1375                         flags = LOOKUP_SLASHOK;
1376                 new_dentry = lookup_dentry(newname, NULL, flags);
1377         }
1378
1379         error = PTR_ERR(new_dentry);
1380         if (IS_ERR(new_dentry))
1381                 goto exit_old;
1382
1383         new_dir = get_parent(new_dentry);
1384         old_dir = get_parent(old_dentry);
1385
1386         double_lock(new_dir, old_dir);
1387
1388         error = -ENOENT;
1389         if (check_parent(old_dir, old_dentry) && check_parent(new_dir, new_dentry))
1390                 error = vfs_rename(old_dir->d_inode, old_dentry,
1391                                    new_dir->d_inode, new_dentry);
1392
1393         double_unlock(new_dir, old_dir);
1394         dput(new_dentry);
1395 exit_old:
1396         dput(old_dentry);
1397 exit:
1398         return error;
1399 }
1400
1401 asmlinkage long sys_rename(const char * oldname, const char * newname)
1402 {
1403         int error;
1404         char * from;
1405
1406         lock_kernel();
1407         from = getname(oldname);
1408         error = PTR_ERR(from);
1409         if (!IS_ERR(from)) {
1410                 char * to;
1411                 to = getname(newname);
1412                 error = PTR_ERR(to);
1413                 if (!IS_ERR(to)) {
1414                         error = do_rename(from,to);
1415                         putname(to);
1416                 }
1417                 putname(from);
1418         }
1419         unlock_kernel();
1420         return error;
1421 }