fs/namei.c

   1 /*
   2  *  linux/fs/namei.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  * Some corrections by tytso.
   9  */
  10
  11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12  * lookup logic.
  13  */
  14
  15 #include <linux/mm.h>
  16 #include <linux/proc_fs.h>
  17 #include <linux/smp_lock.h>
  18 #include <linux/quotaops.h>
  19
  20 #include <asm/uaccess.h>
  21 #include <asm/unaligned.h>
  22 #include <asm/semaphore.h>
  23 #include <asm/page.h>
  24 #include <asm/pgtable.h>
  25
  26 #include <asm/namei.h>
  27
  28 /* This can be removed after the beta phase. */
  29 #define CACHE_SUPERVISE /* debug the correctness of dcache entries */
  30 #undef DEBUG            /* some other debugging */
  31
  32
  33 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
  34
  35 /* [Feb-1997 T. Schoebel-Theuer]
  36  * Fundamental changes in the pathname lookup mechanisms (namei)
  37  * were necessary because of omirr.  The reason is that omirr needs
  38  * to know the _real_ pathname, not the user-supplied one, in case
  39  * of symlinks (and also when transname replacements occur).
  40  *
  41  * The new code replaces the old recursive symlink resolution with
  42  * an iterative one (in case of non-nested symlink chains).  It does
  43  * this with calls to <fs>_follow_link().
  44  * As a side effect, dir_namei(), _namei() and follow_link() are now
  45  * replaced with a single function lookup_dentry() that can handle all
  46  * the special cases of the former code.
  47  *
  48  * With the new dcache, the pathname is stored at each inode, at least as
  49  * long as the refcount of the inode is positive.  As a side effect, the
  50  * size of the dcache depends on the inode cache and thus is dynamic.
  51  *
  52  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  53  * resolution to correspond with current state of the code.
  54  *
  55  * Note that the symlink resolution is not *completely* iterative.
  56  * There is still a significant amount of tail- and mid- recursion in
  57  * the algorithm.  Also, note that <fs>_readlink() is not used in
  58  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  59  * may return different results than <fs>_follow_link().  Many virtual
  60  * filesystems (including /proc) exhibit this behavior.
  61  */
  62
  63 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  64  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  65  * and the name already exists in form of a symlink, try to create the new
  66  * name indicated by the symlink. The old code always complained that the
  67  * name already exists, due to not following the symlink even if its target
  68  * is nonexistent.  The new semantics affects also mknod() and link() when
  69  * the name is a symlink pointing to a non-existent name.
  70  *
  71  * I don't know which semantics is the right one, since I have no access
  72  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  73  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  74  * "old" one. Personally, I think the new semantics is much more logical.
  75  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  76  * file does succeed in both HP-UX and SunOs, but not in Solaris
  77  * and in the old Linux semantics.
  78  */
  79
  80 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  81  * semantics.  See the comments in "open_namei" and "do_link" below.
  82  *
  83  * [10-Sep-98 Alan Modra] Another symlink change.
  84  */
  85
  86 /* In order to reduce some races, while at the same time doing additional
  87  * checking and hopefully speeding things up, we copy filenames to the
  88  * kernel data space before using them..
  89  *
  90  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
  91  */
  92 static inline int do_getname(const char *filename, char *page)
  93 {
  94         int retval;
  95         unsigned long len = PAGE_SIZE;
  96
  97         if ((unsigned long) filename >= TASK_SIZE) {
  98                 if (!segment_eq(get_fs(), KERNEL_DS))
  99                         return -EFAULT;
 100         } else if (TASK_SIZE - (unsigned long) filename < PAGE_SIZE)
 101                 len = TASK_SIZE - (unsigned long) filename;
 102
 103         retval = strncpy_from_user((char *)page, filename, len);
 104         if (retval > 0) {
 105                 if (retval < len)
 106                         return 0;
 107                 return -ENAMETOOLONG;
 108         } else if (!retval)
 109                 retval = -ENOENT;
 110         return retval;
 111 }
 112
 113 char * getname(const char * filename)
 114 {
 115         char *tmp, *result;
 116
 117         result = ERR_PTR(-ENOMEM);
 118         tmp = __getname();
 119         if (tmp)  {
 120                 int retval = do_getname(filename, tmp);
 121
 122                 result = tmp;
 123                 if (retval < 0) {
 124                         putname(tmp);
 125                         result = ERR_PTR(retval);
 126                 }
 127         }
 128         return result;
 129 }
 130
 131 /*
 132  *      permission()
 133  *
 134  * is used to check for read/write/execute permissions on a file.
 135  * We use "fsuid" for this, letting us set arbitrary permissions
 136  * for filesystem access without changing the "normal" uids which
 137  * are used for other things..
 138  */
 139 int permission(struct inode * inode,int mask)
 140 {
 141         int mode = inode->i_mode;
 142
 143         if (inode->i_op && inode->i_op->permission)
 144                 return inode->i_op->permission(inode, mask);
 145         else if ((mask & S_IWOTH) && IS_RDONLY(inode) &&
 146                  (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 147                 return -EROFS; /* Nobody gets write access to a read-only fs */
 148         else if ((mask & S_IWOTH) && IS_IMMUTABLE(inode))
 149                 return -EACCES; /* Nobody gets write access to an immutable file */
 150         else if (current->fsuid == inode->i_uid)
 151                 mode >>= 6;
 152         else if (in_group_p(inode->i_gid))
 153                 mode >>= 3;
 154         if (((mode & mask & S_IRWXO) == mask) || capable(CAP_DAC_OVERRIDE))
 155                 return 0;
 156         /* read and search access */
 157         if ((mask == S_IROTH) ||
 158             (S_ISDIR(mode)  && !(mask & ~(S_IROTH | S_IXOTH))))
 159                 if (capable(CAP_DAC_READ_SEARCH))
 160                         return 0;
 161         return -EACCES;
 162 }
 163
 164 /*
 165  * get_write_access() gets write permission for a file.
 166  * put_write_access() releases this write permission.
 167  * This is used for regular files.
 168  * We cannot support write (and maybe mmap read-write shared) accesses and
 169  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 170  * can have the following values:
 171  * 0: no writers, no VM_DENYWRITE mappings
 172  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 173  * > 0: (i_writecount) users are writing to the file.
 174  */
 175 int get_write_access(struct inode * inode)
 176 {
 177         if (inode->i_writecount < 0)
 178                 return -ETXTBSY;
 179         inode->i_writecount++;
 180         return 0;
 181 }
 182
 183 void put_write_access(struct inode * inode)
 184 {
 185         inode->i_writecount--;
 186 }
 187
 188 /*
 189  * "." and ".." are special - ".." especially so because it has to be able
 190  * to know about the current root directory and parent relationships
 191  */
 192 static struct dentry * reserved_lookup(struct dentry * parent, struct qstr * name)
 193 {
 194         struct dentry *result = NULL;
 195         if (name->name[0] == '.') {
 196                 switch (name->len) {
 197                 default:
 198                         break;
 199                 case 2:
 200                         if (name->name[1] != '.')
 201                                 break;
 202
 203                         if (parent != current->fs->root)
 204                                 parent = parent->d_covers->d_parent;
 205                         /* fallthrough */
 206                 case 1:
 207                         result = parent;
 208                 }
 209         }
 210         return dget(result);
 211 }
 212
 213 /*
 214  * Internal lookup() using the new generic dcache.
 215  */
 216 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
 217 {
 218         struct dentry * dentry = d_lookup(parent, name);
 219
 220         if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
 221                 if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
 222                         dput(dentry);
 223                         dentry = NULL;
 224                 }
 225         }
 226         return dentry;
 227 }
 228
 229 /*
 230  * This is called when everything else fails, and we actually have
 231  * to go to the low-level filesystem to find out what we should do..
 232  *
 233  * We get the directory semaphore, and after getting that we also
 234  * make sure that nobody added the entry to the dcache in the meantime..
 235  */
 236 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
 237 {
 238         struct dentry * result;
 239         struct inode *dir = parent->d_inode;
 240
 241         down(&dir->i_sem);
 242         /*
 243          * First re-do the cached lookup just in case it was created
 244          * while we waited for the directory semaphore..
 245          *
 246          * FIXME! This could use version numbering or similar to
 247          * avoid unnecessary cache lookups.
 248          */
 249         result = cached_lookup(parent, name, flags);
 250         if (!result) {
 251                 struct dentry * dentry = d_alloc(parent, name);
 252                 result = ERR_PTR(-ENOMEM);
 253                 if (dentry) {
 254                         result = dir->i_op->lookup(dir, dentry);
 255                         if (result)
 256                                 dput(dentry);
 257                         else
 258                                 result = dentry;
 259                 }
 260         }
 261         up(&dir->i_sem);
 262         return result;
 263 }
 264
 265 static struct dentry * do_follow_link(struct dentry *base, struct dentry *dentry, unsigned int follow)
 266 {
 267         struct inode * inode = dentry->d_inode;
 268
 269         if ((follow & LOOKUP_FOLLOW)
 270             && inode && inode->i_op && inode->i_op->follow_link) {
 271                 if (current->link_count < 5) {
 272                         struct dentry * result;
 273
 274                         current->link_count++;
 275                         /* This eats the base */
 276                         result = inode->i_op->follow_link(dentry, base, follow);
 277                         current->link_count--;
 278                         dput(dentry);
 279                         return result;
 280                 }
 281                 dput(dentry);
 282                 dentry = ERR_PTR(-ELOOP);
 283         }
 284         dput(base);
 285         return dentry;
 286 }
 287
 288 static inline struct dentry * follow_mount(struct dentry * dentry)
 289 {
 290         struct dentry * mnt = dentry->d_mounts;
 291
 292         if (mnt != dentry) {
 293                 dget(mnt);
 294                 dput(dentry);
 295                 dentry = mnt;
 296         }
 297         return dentry;
 298 }
 299
 300 /*
 301  * Name resolution.
 302  *
 303  * This is the basic name resolution function, turning a pathname
 304  * into the final dentry.
 305  */
 306 struct dentry * lookup_dentry(const char * name, struct dentry * base, unsigned int lookup_flags)
 307 {
 308         struct dentry * dentry;
 309         struct inode *inode;
 310
 311         if (*name == '/') {
 312                 if (base)
 313                         dput(base);
 314                 do {
 315                         name++;
 316                 } while (*name == '/');
 317                 __prefix_lookup_dentry(name, lookup_flags);
 318                 base = dget(current->fs->root);
 319         } else if (!base) {
 320                 base = dget(current->fs->pwd);
 321         }
 322
 323         if (!*name)
 324                 goto return_base;
 325
 326         inode = base->d_inode;
 327         lookup_flags &= LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_SLASHOK;
 328
 329         /* At this point we know we have a real path component. */
 330         for(;;) {
 331                 int err;
 332                 unsigned long hash;
 333                 struct qstr this;
 334                 unsigned int flags;
 335                 unsigned int c;
 336
 337                 err = permission(inode, MAY_EXEC);
 338                 dentry = ERR_PTR(err);
 339                 if (err)
 340                         break;
 341
 342                 this.name = name;
 343                 c = *(const unsigned char *)name;
 344
 345                 hash = init_name_hash();
 346                 do {
 347                         name++;
 348                         hash = partial_name_hash(c, hash);
 349                         c = *(const unsigned char *)name;
 350                 } while (c && (c != '/'));
 351                 this.len = name - (const char *) this.name;
 352                 this.hash = end_name_hash(hash);
 353
 354                 /* remove trailing slashes? */
 355                 flags = lookup_flags;
 356                 if (c) {
 357                         char tmp;
 358
 359                         flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 360                         do {
 361                                 tmp = *++name;
 362                         } while (tmp == '/');
 363                         if (tmp)
 364                                 flags |= LOOKUP_CONTINUE;
 365                 }
 366
 367                 /*
 368                  * See if the low-level filesystem might want
 369                  * to use its own hash..
 370                  */
 371                 if (base->d_op && base->d_op->d_hash) {
 372                         int error;
 373                         error = base->d_op->d_hash(base, &this);
 374                         if (error < 0) {
 375                                 dentry = ERR_PTR(error);
 376                                 break;
 377                         }
 378                 }
 379
 380                 /* This does the actual lookups.. */
 381                 dentry = reserved_lookup(base, &this);
 382                 if (!dentry) {
 383                         dentry = cached_lookup(base, &this, flags);
 384                         if (!dentry) {
 385                                 dentry = real_lookup(base, &this, flags);
 386                                 if (IS_ERR(dentry))
 387                                         break;
 388                         }
 389                 }
 390
 391                 /* Check mountpoints.. */
 392                 dentry = follow_mount(dentry);
 393
 394                 base = do_follow_link(base, dentry, flags);
 395                 if (IS_ERR(base))
 396                         goto return_base;
 397
 398                 inode = base->d_inode;
 399                 if (flags & LOOKUP_DIRECTORY) {
 400                         if (!inode)
 401                                 goto no_inode;
 402                         dentry = ERR_PTR(-ENOTDIR);
 403                         if (!inode->i_op || !inode->i_op->lookup)
 404                                 break;
 405                         if (flags & LOOKUP_CONTINUE)
 406                                 continue;
 407                 }
 408 return_base:
 409                 return base;
 410 /*
 411  * The case of a nonexisting file is special.
 412  *
 413  * In the middle of a pathname lookup (ie when
 414  * LOOKUP_CONTINUE is set), it's an obvious
 415  * error and returns ENOENT.
 416  *
 417  * At the end of a pathname lookup it's legal,
 418  * and we return a negative dentry. However, we
 419  * get here only if there were trailing slashes,
 420  * which is legal only if we know it's supposed
 421  * to be a directory (ie "mkdir"). Thus the
 422  * LOOKUP_SLASHOK flag.
 423  */
 424 no_inode:
 425                 dentry = ERR_PTR(-ENOENT);
 426                 if (flags & LOOKUP_CONTINUE)
 427                         break;
 428                 if (flags & LOOKUP_SLASHOK)
 429                         goto return_base;
 430                 break;
 431         }
 432         dput(base);
 433         return dentry;
 434 }
 435
 436 /*
 437  *      namei()
 438  *
 439  * is used by most simple commands to get the inode of a specified name.
 440  * Open, link etc use their own routines, but this is enough for things
 441  * like 'chmod' etc.
 442  *
 443  * namei exists in two versions: namei/lnamei. The only difference is
 444  * that namei follows links, while lnamei does not.
 445  */
 446 struct dentry * __namei(const char *pathname, unsigned int lookup_flags)
 447 {
 448         char *name;
 449         struct dentry *dentry;
 450
 451         name = getname(pathname);
 452         dentry = (struct dentry *) name;
 453         if (!IS_ERR(name)) {
 454                 dentry = lookup_dentry(name, NULL, lookup_flags);
 455                 putname(name);
 456                 if (!IS_ERR(dentry)) {
 457                         if (!dentry->d_inode) {
 458                                 dput(dentry);
 459                                 dentry = ERR_PTR(-ENOENT);
 460                         }
 461                 }
 462         }
 463         return dentry;
 464 }
 465
 466 /*
 467  * It's inline, so penalty for filesystems that don't use sticky bit is
 468  * minimal.
 469  */
 470 static inline int check_sticky(struct inode *dir, struct inode *inode)
 471 {
 472         if (!(dir->i_mode & S_ISVTX))
 473                 return 0;
 474         if (inode->i_uid == current->fsuid)
 475                 return 0;
 476         if (dir->i_uid == current->fsuid)
 477                 return 0;
 478         return !capable(CAP_FOWNER);
 479 }
 480
 481 /*
 482  *      Check whether we can remove a link victim from directory dir, check
 483  *  whether the type of victim is right.
 484  *  1. We can't do it if dir is read-only (done in permission())
 485  *  2. We should have write and exec permissions on dir
 486  *  3. We can't remove anything from append-only dir
 487  *  4. We can't do anything with immutable dir (done in permission())
 488  *  5. If the sticky bit on dir is set we should either
 489  *      a. be owner of dir, or
 490  *      b. be owner of victim, or
 491  *      c. have CAP_FOWNER capability
 492  *  6. If the victim is append-only or immutable we can't do anything with
 493  *     links pointing to it.
 494  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 495  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 496  *  9. We can't remove a root or mountpoint.
 497  */
 498 static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir)
 499 {
 500         int error;
 501         if (!victim->d_inode || victim->d_parent->d_inode != dir)
 502                 return -ENOENT;
 503         error = permission(dir,MAY_WRITE | MAY_EXEC);
 504         if (error)
 505                 return error;
 506         if (IS_APPEND(dir))
 507                 return -EPERM;
 508         if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
 509             IS_IMMUTABLE(victim->d_inode))
 510                 return -EPERM;
 511         if (isdir) {
 512                 if (!S_ISDIR(victim->d_inode->i_mode))
 513                         return -ENOTDIR;
 514                 if (IS_ROOT(victim))
 515                         return -EBUSY;
 516                 if (victim->d_mounts != victim->d_covers)
 517                         return -EBUSY;
 518         } else if (S_ISDIR(victim->d_inode->i_mode))
 519                 return -EISDIR;
 520         return 0;
 521 }
 522
 523 /*      Check whether we can create an object with dentry child in directory
 524  *  dir.
 525  *  1. We can't do it if child already exists (open has special treatment for
 526  *     this case, but since we are inlined it's OK)
 527  *  2. We can't do it if dir is read-only (done in permission())
 528  *  3. We should have write and exec permissions on dir
 529  *  4. We can't do it if dir is immutable (done in permission())
 530  */
 531 static inline int may_create(struct inode *dir, struct dentry *child) {
 532         if (child->d_inode)
 533                 return -EEXIST;
 534         return permission(dir,MAY_WRITE | MAY_EXEC);
 535 }
 536
 537 static inline struct dentry *get_parent(struct dentry *dentry)
 538 {
 539         return dget(dentry->d_parent);
 540 }
 541
 542 static inline void unlock_dir(struct dentry *dir)
 543 {
 544         up(&dir->d_inode->i_sem);
 545         dput(dir);
 546 }
 547
 548 /*
 549  * We need to do a check-parent every time
 550  * after we have locked the parent - to verify
 551  * that the parent is still our parent and
 552  * that we are still hashed onto it..
 553  *
 554  * This is required in case two processes race
 555  * on removing (or moving) the same entry: the
 556  * parent lock will serialize them, but the
 557  * other process will be too late..
 558  */
 559 #define check_parent(dir, dentry) \
 560         ((dir) == (dentry)->d_parent && !list_empty(&dentry->d_hash))
 561
 562 /*
 563  * Locking the parent is needed to:
 564  *  - serialize directory operations
 565  *  - make sure the parent doesn't change from
 566  *    under us in the middle of an operation.
 567  *
 568  * NOTE! Right now we'd rather use a "struct inode"
 569  * for this, but as I expect things to move toward
 570  * using dentries instead for most things it is
 571  * probably better to start with the conceptually
 572  * better interface of relying on a path of dentries.
 573  */
 574 static inline struct dentry *lock_parent(struct dentry *dentry)
 575 {
 576         struct dentry *dir = dget(dentry->d_parent);
 577
 578         down(&dir->d_inode->i_sem);
 579         return dir;
 580 }
 581
 582 /*
 583  * Whee.. Deadlock country. Happily there are only two VFS
 584  * operations that do this..
 585  */
 586 static inline void double_lock(struct dentry *d1, struct dentry *d2)
 587 {
 588         struct semaphore *s1 = &d1->d_inode->i_sem;
 589         struct semaphore *s2 = &d2->d_inode->i_sem;
 590
 591         if (s1 != s2) {
 592                 if ((unsigned long) s1 < (unsigned long) s2) {
 593                         struct semaphore *tmp = s2;
 594                         s2 = s1; s1 = tmp;
 595                 }
 596                 down(s1);
 597         }
 598         down(s2);
 599 }
 600
 601 static inline void double_unlock(struct dentry *d1, struct dentry *d2)
 602 {
 603         struct semaphore *s1 = &d1->d_inode->i_sem;
 604         struct semaphore *s2 = &d2->d_inode->i_sem;
 605
 606         up(s1);
 607         if (s1 != s2)
 608                 up(s2);
 609         dput(d1);
 610         dput(d2);
 611 }
 612
 613
 614 /*
 615  * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
 616  * reasons.
 617  *
 618  * O_DIRECTORY translates into forcing a directory lookup.
 619  */
 620 static inline int lookup_flags(unsigned int f)
 621 {
 622         unsigned long retval = LOOKUP_FOLLOW;
 623
 624         if (f & O_NOFOLLOW)
 625                 retval &= ~LOOKUP_FOLLOW;
 626
 627         if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
 628                 retval &= ~LOOKUP_FOLLOW;
 629
 630         if (f & O_DIRECTORY)
 631                 retval |= LOOKUP_DIRECTORY;
 632
 633         return retval;
 634 }
 635
 636 int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
 637 {
 638         int error;
 639
 640         error = may_create(dir, dentry);
 641         if (error)
 642                 goto exit_lock;
 643
 644         error = -EACCES;        /* shouldn't it be ENOSYS? */
 645         if (!dir->i_op || !dir->i_op->create)
 646                 goto exit_lock;
 647
 648         DQUOT_INIT(dir);
 649         error = dir->i_op->create(dir, dentry, mode);
 650 exit_lock:
 651         return error;
 652 }
 653
 654 /*
 655  *      open_namei()
 656  *
 657  * namei for open - this is in fact almost the whole open-routine.
 658  *
 659  * Note that the low bits of "flag" aren't the same as in the open
 660  * system call - they are 00 - no permissions needed
 661  *                        01 - read permission needed
 662  *                        10 - write permission needed
 663  *                        11 - read/write permissions needed
 664  * which is a lot more logical, and also allows the "no perm" needed
 665  * for symlinks (where the permissions are checked later).
 666  */
 667 struct dentry * open_namei(const char * pathname, int flag, int mode)
 668 {
 669         int acc_mode, error;
 670         struct inode *inode;
 671         struct dentry *dentry;
 672
 673         mode &= S_IALLUGO & ~current->fs->umask;
 674         mode |= S_IFREG;
 675
 676         dentry = lookup_dentry(pathname, NULL, lookup_flags(flag));
 677         if (IS_ERR(dentry))
 678                 return dentry;
 679
 680         acc_mode = ACC_MODE(flag);
 681         if (flag & O_CREAT) {
 682                 struct dentry *dir;
 683
 684                 if (dentry->d_inode) {
 685                         if (!(flag & O_EXCL))
 686                                 goto nocreate;
 687                         error = -EEXIST;
 688                         goto exit;
 689                 }
 690
 691                 dir = lock_parent(dentry);
 692                 if (!check_parent(dir, dentry)) {
 693                         /*
 694                          * Really nasty race happened. What's the
 695                          * right error code? We had a dentry, but
 696                          * before we could use it it was removed
 697                          * by somebody else. We could just re-try
 698                          * everything, I guess.
 699                          *
 700                          * ENOENT is definitely wrong.
 701                          */
 702                         error = -ENOENT;
 703                         unlock_dir(dir);
 704                         goto exit;
 705                 }
 706
 707                 /*
 708                  * Somebody might have created the file while we
 709                  * waited for the directory lock.. So we have to
 710                  * re-do the existence test.
 711                  */
 712                 if (dentry->d_inode) {
 713                         error = 0;
 714                         if (flag & O_EXCL)
 715                                 error = -EEXIST;
 716                 } else {
 717                         error = vfs_create(dir->d_inode, dentry,mode);
 718                         /* Don't check for write permission, don't truncate */
 719                         acc_mode = 0;
 720                         flag &= ~O_TRUNC;
 721                 }
 722                 unlock_dir(dir);
 723                 if (error)
 724                         goto exit;
 725         }
 726
 727 nocreate:
 728         error = -ENOENT;
 729         inode = dentry->d_inode;
 730         if (!inode)
 731                 goto exit;
 732
 733         error = -ELOOP;
 734         if (S_ISLNK(inode->i_mode))
 735                 goto exit;
 736
 737         error = -EISDIR;
 738         if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
 739                 goto exit;
 740
 741         error = permission(inode,acc_mode);
 742         if (error)
 743                 goto exit;
 744
 745         /*
 746          * FIFO's, sockets and device files are special: they don't
 747          * actually live on the filesystem itself, and as such you
 748          * can write to them even if the filesystem is read-only.
 749          */
 750         if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 751                 flag &= ~O_TRUNC;
 752         } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
 753                 error = -EACCES;
 754                 if (IS_NODEV(inode))
 755                         goto exit;
 756
 757                 flag &= ~O_TRUNC;
 758         } else {
 759                 error = -EROFS;
 760                 if (IS_RDONLY(inode) && (flag & 2))
 761                         goto exit;
 762         }
 763         /*
 764          * An append-only file must be opened in append mode for writing.
 765          */
 766         error = -EPERM;
 767         if (IS_APPEND(inode)) {
 768                 if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
 769                         goto exit;
 770                 if (flag & O_TRUNC)
 771                         goto exit;
 772         }
 773
 774         if (flag & O_TRUNC) {
 775                 error = get_write_access(inode);
 776                 if (error)
 777                         goto exit;
 778
 779                 /*
 780                  * Refuse to truncate files with mandatory locks held on them.
 781                  */
 782                 error = locks_verify_locked(inode);
 783                 if (!error) {
 784                         DQUOT_INIT(inode);
 785
 786                         error = do_truncate(dentry, 0);
 787                 }
 788                 put_write_access(inode);
 789                 if (error)
 790                         goto exit;
 791         } else
 792                 if (flag & FMODE_WRITE)
 793                         DQUOT_INIT(inode);
 794
 795         return dentry;
 796
 797 exit:
 798         dput(dentry);
 799         return ERR_PTR(error);
 800 }
 801
 802 struct dentry * do_mknod(const char * filename, int mode, dev_t dev)
 803 {
 804         int error;
 805         struct dentry *dir;
 806         struct dentry *dentry, *retval;
 807
 808         mode &= ~current->fs->umask;
 809         dentry = lookup_dentry(filename, NULL, LOOKUP_FOLLOW);
 810         if (IS_ERR(dentry))
 811                 return dentry;
 812
 813         dir = lock_parent(dentry);
 814         error = -ENOENT;
 815         if (!check_parent(dir, dentry))
 816                 goto exit_lock;
 817
 818         error = may_create(dir->d_inode, dentry);
 819         if (error)
 820                 goto exit_lock;
 821
 822         error = -EPERM;
 823         if (!dir->d_inode->i_op || !dir->d_inode->i_op->mknod)
 824                 goto exit_lock;
 825
 826         DQUOT_INIT(dir->d_inode);
 827         error = dir->d_inode->i_op->mknod(dir->d_inode, dentry, mode, dev);
 828 exit_lock:
 829         retval = ERR_PTR(error);
 830         if (!error)
 831                 retval = dget(dentry);
 832         unlock_dir(dir);
 833         dput(dentry);
 834         return retval;
 835 }
 836
 837 asmlinkage int sys_mknod(const char * filename, int mode, dev_t dev)
 838 {
 839         int error;
 840         char * tmp;
 841         struct dentry * dentry;
 842
 843         lock_kernel();
 844         error = -EPERM;
 845         if (S_ISDIR(mode) || (!S_ISFIFO(mode) && !capable(CAP_SYS_ADMIN)))
 846                 goto out;
 847         tmp = getname(filename);
 848         error = PTR_ERR(tmp);
 849         if (IS_ERR(tmp))
 850                 goto out;
 851
 852         error = -EINVAL;
 853         switch (mode & S_IFMT) {
 854         case 0:
 855                 mode |= S_IFREG;        /* fallthrough */
 856         case S_IFREG:
 857                 mode &= ~current->fs->umask;
 858                 dentry = lookup_dentry(filename, NULL, LOOKUP_FOLLOW);
 859                 if (IS_ERR(dentry))
 860                         error = PTR_ERR(dentry);
 861                 else {
 862                         struct dentry *dir = lock_parent(dentry);
 863                         error = -ENOENT;
 864                         if (check_parent(dir, dentry))
 865                                 error = vfs_create(dir->d_inode, dentry, mode);
 866                         dput(dentry);
 867                 }
 868                 break;
 869         case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
 870                 dentry = do_mknod(tmp,mode,dev);
 871                 error = PTR_ERR(dentry);
 872                 if (!IS_ERR(dentry)) {
 873                         dput(dentry);
 874                         error = 0;
 875                 }
 876                 break;
 877         }
 878         putname(tmp);
 879
 880 out:
 881         unlock_kernel();
 882         return error;
 883 }
 884
 885 static inline int do_mkdir(const char * pathname, int mode)
 886 {
 887         int error;
 888         struct dentry *dir;
 889         struct dentry *dentry;
 890
 891         dentry = lookup_dentry(pathname, NULL, LOOKUP_SLASHOK);
 892         error = PTR_ERR(dentry);
 893         if (IS_ERR(dentry))
 894                 goto exit;
 895
 896         /*
 897          * EEXIST is kind of a strange error code to
 898          * return, but basically if the dentry was moved
 899          * or unlinked while we locked the parent, we
 900          * do know that it _did_ exist before, and as
 901          * such it makes perfect sense.. In contrast,
 902          * ENOENT doesn't make sense for mkdir.
 903          */
 904         dir = lock_parent(dentry);
 905         error = -EEXIST;
 906         if (!check_parent(dir, dentry))
 907                 goto exit_lock;
 908
 909         error = may_create(dir->d_inode, dentry);
 910         if (error)
 911                 goto exit_lock;
 912
 913         error = -EPERM;
 914         if (!dir->d_inode->i_op || !dir->d_inode->i_op->mkdir)
 915                 goto exit_lock;
 916
 917         DQUOT_INIT(dir->d_inode);
 918         mode &= (S_IRWXUGO|S_ISVTX) & ~current->fs->umask;
 919         error = dir->d_inode->i_op->mkdir(dir->d_inode, dentry, mode);
 920
 921 exit_lock:
 922         unlock_dir(dir);
 923         dput(dentry);
 924 exit:
 925         return error;
 926 }
 927
 928 asmlinkage int sys_mkdir(const char * pathname, int mode)
 929 {
 930         int error;
 931         char * tmp;
 932
 933         lock_kernel();
 934         tmp = getname(pathname);
 935         error = PTR_ERR(tmp);
 936         if (!IS_ERR(tmp)) {
 937                 error = do_mkdir(tmp,mode);
 938                 putname(tmp);
 939         }
 940         unlock_kernel();
 941         return error;
 942 }
 943
 944 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 945 {
 946         int error;
 947
 948         error = may_delete(dir, dentry, 1);
 949         if (error)
 950                 return error;
 951
 952         if (!dir->i_op || !dir->i_op->rmdir)
 953                 return -EPERM;
 954
 955         DQUOT_INIT(dir);
 956
 957         /*
 958          * We try to drop the dentry early: we should have
 959          * a usage count of 2 if we're the only user of this
 960          * dentry, and if that is true (possibly after pruning
 961          * the dcache), then we drop the dentry now.
 962          *
 963          * A low-level filesystem can, if it choses, legally
 964          * do a
 965          *
 966          *      if (!list_empty(&dentry->d_hash))
 967          *              return -EBUSY;
 968          *
 969          * if it cannot handle the case of removing a directory
 970          * that is still in use by something else..
 971          */
 972         switch (dentry->d_count) {
 973         default:
 974                 shrink_dcache_parent(dentry);
 975                 if (dentry->d_count != 2)
 976                         break;
 977         case 2:
 978                 d_drop(dentry);
 979         }
 980
 981         error = dir->i_op->rmdir(dir, dentry);
 982
 983         return error;
 984 }
 985
 986 static inline int do_rmdir(const char * name)
 987 {
 988         int error;
 989         struct dentry *dir;
 990         struct dentry *dentry;
 991
 992         dentry = lookup_dentry(name, NULL, 0);
 993         error = PTR_ERR(dentry);
 994         if (IS_ERR(dentry))
 995                 goto exit;
 996
 997         error = -ENOENT;
 998         if (!dentry->d_inode)
 999                 goto exit_dput;
1000
1001         dir = dget(dentry->d_parent);
1002
1003         /*
1004          * The dentry->d_count stuff confuses d_delete() enough to
1005          * not kill the inode from under us while it is locked. This
1006          * wouldn't be needed, except the dentry semaphore is really
1007          * in the inode, not in the dentry..
1008          */
1009         dentry->d_count++;
1010         double_lock(dir, dentry);
1011
1012         error = -ENOENT;
1013         if (check_parent(dir, dentry))
1014                 error = vfs_rmdir(dir->d_inode, dentry);
1015
1016         double_unlock(dentry, dir);
1017 exit_dput:
1018         dput(dentry);
1019 exit:
1020         return error;
1021 }
1022
1023 asmlinkage int sys_rmdir(const char * pathname)
1024 {
1025         int error;
1026         char * tmp;
1027
1028         lock_kernel();
1029         tmp = getname(pathname);
1030         error = PTR_ERR(tmp);
1031         if (!IS_ERR(tmp)) {
1032                 error = do_rmdir(tmp);
1033                 putname(tmp);
1034         }
1035         unlock_kernel();
1036         return error;
1037 }
1038
1039 int vfs_unlink(struct inode *dir, struct dentry *dentry)
1040 {
1041         int error;
1042
1043         error = may_delete(dir, dentry, 0);
1044         if (!error) {
1045                 error = -EPERM;
1046                 if (dir->i_op && dir->i_op->unlink) {
1047                         DQUOT_INIT(dir);
1048                         error = dir->i_op->unlink(dir, dentry);
1049                 }
1050         }
1051         return error;
1052 }
1053
1054 static inline int do_unlink(const char * name)
1055 {
1056         int error;
1057         struct dentry *dir;
1058         struct dentry *dentry;
1059
1060         dentry = lookup_dentry(name, NULL, 0);
1061         error = PTR_ERR(dentry);
1062         if (IS_ERR(dentry))
1063                 goto exit;
1064
1065         dir = lock_parent(dentry);
1066         error = -ENOENT;
1067         if (check_parent(dir, dentry))
1068                 error = vfs_unlink(dir->d_inode, dentry);
1069
1070         unlock_dir(dir);
1071         dput(dentry);
1072 exit:
1073         return error;
1074 }
1075
1076 asmlinkage int sys_unlink(const char * pathname)
1077 {
1078         int error;
1079         char * tmp;
1080
1081         lock_kernel();
1082         tmp = getname(pathname);
1083         error = PTR_ERR(tmp);
1084         if (!IS_ERR(tmp)) {
1085                 error = do_unlink(tmp);
1086                 putname(tmp);
1087         }
1088         unlock_kernel();
1089         return error;
1090 }
1091
1092 static inline int do_symlink(const char * oldname, const char * newname)
1093 {
1094         int error;
1095         struct dentry *dir;
1096         struct dentry *dentry;
1097
1098         dentry = lookup_dentry(newname, NULL, 0);
1099
1100         error = PTR_ERR(dentry);
1101         if (IS_ERR(dentry))
1102                 goto exit;
1103
1104         dir = lock_parent(dentry);
1105         error = -ENOENT;
1106         if (!check_parent(dir, dentry))
1107                 goto exit_lock;
1108
1109         error = may_create(dir->d_inode, dentry);
1110         if (error)
1111                 goto exit_lock;
1112
1113         error = -EPERM;
1114         if (!dir->d_inode->i_op || !dir->d_inode->i_op->symlink)
1115                 goto exit_lock;
1116
1117         DQUOT_INIT(dir->d_inode);
1118         error = dir->d_inode->i_op->symlink(dir->d_inode, dentry, oldname);
1119
1120 exit_lock:
1121         unlock_dir(dir);
1122         dput(dentry);
1123 exit:
1124         return error;
1125 }
1126
1127 asmlinkage int sys_symlink(const char * oldname, const char * newname)
1128 {
1129         int error;
1130         char * from;
1131
1132         lock_kernel();
1133         from = getname(oldname);
1134         error = PTR_ERR(from);
1135         if (!IS_ERR(from)) {
1136                 char * to;
1137                 to = getname(newname);
1138                 error = PTR_ERR(to);
1139                 if (!IS_ERR(to)) {
1140                         error = do_symlink(from,to);
1141                         putname(to);
1142                 }
1143                 putname(from);
1144         }
1145         unlock_kernel();
1146         return error;
1147 }
1148
1149 static inline int do_link(const char * oldname, const char * newname)
1150 {
1151         struct dentry *old_dentry, *new_dentry, *dir;
1152         struct inode *inode;
1153         int error;
1154
1155         /*
1156          * Hardlinks are often used in delicate situations.  We avoid
1157          * security-related surprises by not following symlinks on the
1158          * newname.  --KAB
1159          *
1160          * We don't follow them on the oldname either to be compatible
1161          * with linux 2.0, and to avoid hard-linking to directories
1162          * and other special files.  --ADM
1163          */
1164         old_dentry = lookup_dentry(oldname, NULL, 0);
1165         error = PTR_ERR(old_dentry);
1166         if (IS_ERR(old_dentry))
1167                 goto exit;
1168
1169         new_dentry = lookup_dentry(newname, NULL, 0);
1170         error = PTR_ERR(new_dentry);
1171         if (IS_ERR(new_dentry))
1172                 goto exit_old;
1173
1174         dir = lock_parent(new_dentry);
1175         error = -ENOENT;
1176         if (!check_parent(dir, new_dentry))
1177                 goto exit_lock;
1178
1179         error = -ENOENT;
1180         inode = old_dentry->d_inode;
1181         if (!inode)
1182                 goto exit_lock;
1183
1184         error = may_create(dir->d_inode, new_dentry);
1185         if (error)
1186                 goto exit_lock;
1187
1188         error = -EXDEV;
1189         if (dir->d_inode->i_dev != inode->i_dev)
1190                 goto exit_lock;
1191
1192         /*
1193          * A link to an append-only or immutable file cannot be created.
1194          */
1195         error = -EPERM;
1196         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1197                 goto exit_lock;
1198
1199         error = -EPERM;
1200         if (!dir->d_inode->i_op || !dir->d_inode->i_op->link)
1201                 goto exit_lock;
1202
1203         DQUOT_INIT(dir->d_inode);
1204         error = dir->d_inode->i_op->link(old_dentry, dir->d_inode, new_dentry);
1205
1206 exit_lock:
1207         unlock_dir(dir);
1208         dput(new_dentry);
1209 exit_old:
1210         dput(old_dentry);
1211 exit:
1212         return error;
1213 }
1214
1215 asmlinkage int sys_link(const char * oldname, const char * newname)
1216 {
1217         int error;
1218         char * from;
1219
1220         lock_kernel();
1221         from = getname(oldname);
1222         error = PTR_ERR(from);
1223         if (!IS_ERR(from)) {
1224                 char * to;
1225                 to = getname(newname);
1226                 error = PTR_ERR(to);
1227                 if (!IS_ERR(to)) {
1228                         error = do_link(from,to);
1229                         putname(to);
1230                 }
1231                 putname(from);
1232         }
1233         unlock_kernel();
1234         return error;
1235 }
1236
1237 int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
1238                struct inode *new_dir, struct dentry *new_dentry)
1239 {
1240         int error;
1241         int need_rehash = 0;
1242
1243         if (old_dentry->d_inode == new_dentry->d_inode)
1244                 return 0;
1245
1246         error = may_delete(old_dir, old_dentry, 1);
1247         if (error)
1248                 return error;
1249
1250         if (new_dir->i_dev != old_dir->i_dev)
1251                 return -EXDEV;
1252
1253         if (!new_dentry->d_inode)
1254                 error = may_create(new_dir, new_dentry);
1255         else
1256                 error = may_delete(new_dir, new_dentry, 1);
1257         if (error)
1258                 return error;
1259
1260         if (!old_dir->i_op || !old_dir->i_op->rename)
1261                 return -EPERM;
1262
1263         /*
1264          * If we are going to change the parent - check write permissions,
1265          * we'll need to flip '..'.
1266          */
1267         if (new_dir != old_dir) {
1268                 error = permission(old_dentry->d_inode, MAY_WRITE);
1269         }
1270         if (error)
1271                 return error;
1272
1273         DQUOT_INIT(old_dir);
1274         DQUOT_INIT(new_dir);
1275         down(&old_dir->i_sb->s_vfs_rename_sem);
1276         error = -EINVAL;
1277         if (is_subdir(new_dentry, old_dentry))
1278                 goto out_unlock;
1279         if (new_dentry->d_inode) {
1280                 error = -EBUSY;
1281                 if (d_invalidate(new_dentry)<0)
1282                         goto out_unlock;
1283                 need_rehash = 1;
1284         }
1285         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
1286         if (need_rehash)
1287                 d_rehash(new_dentry);
1288         if (!error)
1289                 d_move(old_dentry,new_dentry);
1290 out_unlock:
1291         up(&old_dir->i_sb->s_vfs_rename_sem);
1292         return error;
1293 }
1294
1295 int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
1296                struct inode *new_dir, struct dentry *new_dentry)
1297 {
1298         int error;
1299
1300         if (old_dentry->d_inode == new_dentry->d_inode)
1301                 return 0;
1302
1303         error = may_delete(old_dir, old_dentry, 0);
1304         if (error)
1305                 return error;
1306
1307         if (new_dir->i_dev != old_dir->i_dev)
1308                 return -EXDEV;
1309
1310         if (!new_dentry->d_inode)
1311                 error = may_create(new_dir, new_dentry);
1312         else
1313                 error = may_delete(new_dir, new_dentry, 0);
1314         if (error)
1315                 return error;
1316
1317         if (!old_dir->i_op || !old_dir->i_op->rename)
1318                 return -EPERM;
1319
1320         DQUOT_INIT(old_dir);
1321         DQUOT_INIT(new_dir);
1322         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
1323         if (error)
1324                 return error;
1325         /* The following d_move() should become unconditional */
1326         if (!(old_dir->i_sb->s_flags & MS_ODD_RENAME)) {
1327                 d_move(old_dentry, new_dentry);
1328         }
1329         return 0;
1330 }
1331
1332 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1333                struct inode *new_dir, struct dentry *new_dentry)
1334 {
1335         if (S_ISDIR(old_dentry->d_inode->i_mode))
1336                 return vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
1337         else
1338                 return vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
1339 }
1340
1341 static inline int do_rename(const char * oldname, const char * newname)
1342 {
1343         int error;
1344         struct dentry * old_dir, * new_dir;
1345         struct dentry * old_dentry, *new_dentry;
1346
1347         old_dentry = lookup_dentry(oldname, NULL, 0);
1348
1349         error = PTR_ERR(old_dentry);
1350         if (IS_ERR(old_dentry))
1351                 goto exit;
1352
1353         error = -ENOENT;
1354         if (!old_dentry->d_inode)
1355                 goto exit_old;
1356
1357         {
1358                 unsigned int flags = 0;
1359                 if (S_ISDIR(old_dentry->d_inode->i_mode))
1360                         flags = LOOKUP_SLASHOK;
1361                 new_dentry = lookup_dentry(newname, NULL, flags);
1362         }
1363
1364         error = PTR_ERR(new_dentry);
1365         if (IS_ERR(new_dentry))
1366                 goto exit_old;
1367
1368         new_dir = get_parent(new_dentry);
1369         old_dir = get_parent(old_dentry);
1370
1371         double_lock(new_dir, old_dir);
1372
1373         error = -ENOENT;
1374         if (check_parent(old_dir, old_dentry) && check_parent(new_dir, new_dentry))
1375                 error = vfs_rename(old_dir->d_inode, old_dentry,
1376                                    new_dir->d_inode, new_dentry);
1377
1378         double_unlock(new_dir, old_dir);
1379         dput(new_dentry);
1380 exit_old:
1381         dput(old_dentry);
1382 exit:
1383         return error;
1384 }
1385
1386 asmlinkage int sys_rename(const char * oldname, const char * newname)
1387 {
1388         int error;
1389         char * from;
1390
1391         lock_kernel();
1392         from = getname(oldname);
1393         error = PTR_ERR(from);
1394         if (!IS_ERR(from)) {
1395                 char * to;
1396                 to = getname(newname);
1397                 error = PTR_ERR(to);
1398                 if (!IS_ERR(to)) {
1399                         error = do_rename(from,to);
1400                         putname(to);
1401                 }
1402                 putname(from);
1403         }
1404         unlock_kernel();
1405         return error;
1406 }