fs/file.c

   1 /*
   2  *  linux/fs/file.c
   3  *
   4  *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
   5  *
   6  *  Manage the dynamic fd arrays in the process files_struct.
   7  */
   8
   9 #include <linux/fs.h>
  10 #include <linux/mm.h>
  11 #include <linux/time.h>
  12 #include <linux/slab.h>
  13 #include <linux/vmalloc.h>
  14 #include <linux/file.h>
  15 #include <linux/fdtable.h>
  16 #include <linux/bitops.h>
  17 #include <linux/interrupt.h>
  18 #include <linux/spinlock.h>
  19 #include <linux/rcupdate.h>
  20 #include <linux/workqueue.h>
  21
  22 struct fdtable_defer {
  23         spinlock_t lock;
  24         struct work_struct wq;
  25         struct fdtable *next;
  26 };
  27
  28 int sysctl_nr_open __read_mostly = 1024*1024;
  29 int sysctl_nr_open_min = BITS_PER_LONG;
  30 int sysctl_nr_open_max = 1024 * 1024; /* raised later */
  31
  32 /*
  33  * We use this list to defer free fdtables that have vmalloced
  34  * sets/arrays. By keeping a per-cpu list, we avoid having to embed
  35  * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
  36  * this per-task structure.
  37  */
  38 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
  39
  40 static inline void * alloc_fdmem(unsigned int size)
  41 {
  42         if (size <= PAGE_SIZE)
  43                 return kmalloc(size, GFP_KERNEL);
  44         else
  45                 return vmalloc(size);
  46 }
  47
  48 static inline void free_fdarr(struct fdtable *fdt)
  49 {
  50         if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
  51                 kfree(fdt->fd);
  52         else
  53                 vfree(fdt->fd);
  54 }
  55
  56 static inline void free_fdset(struct fdtable *fdt)
  57 {
  58         if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
  59                 kfree(fdt->open_fds);
  60         else
  61                 vfree(fdt->open_fds);
  62 }
  63
  64 static void free_fdtable_work(struct work_struct *work)
  65 {
  66         struct fdtable_defer *f =
  67                 container_of(work, struct fdtable_defer, wq);
  68         struct fdtable *fdt;
  69
  70         spin_lock_bh(&f->lock);
  71         fdt = f->next;
  72         f->next = NULL;
  73         spin_unlock_bh(&f->lock);
  74         while(fdt) {
  75                 struct fdtable *next = fdt->next;
  76                 vfree(fdt->fd);
  77                 free_fdset(fdt);
  78                 kfree(fdt);
  79                 fdt = next;
  80         }
  81 }
  82
  83 void free_fdtable_rcu(struct rcu_head *rcu)
  84 {
  85         struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
  86         struct fdtable_defer *fddef;
  87
  88         BUG_ON(!fdt);
  89
  90         if (fdt->max_fds <= NR_OPEN_DEFAULT) {
  91                 /*
  92                  * This fdtable is embedded in the files structure and that
  93                  * structure itself is getting destroyed.
  94                  */
  95                 kmem_cache_free(files_cachep,
  96                                 container_of(fdt, struct files_struct, fdtab));
  97                 return;
  98         }
  99         if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
 100                 kfree(fdt->fd);
 101                 kfree(fdt->open_fds);
 102                 kfree(fdt);
 103         } else {
 104                 fddef = &get_cpu_var(fdtable_defer_list);
 105                 spin_lock(&fddef->lock);
 106                 fdt->next = fddef->next;
 107                 fddef->next = fdt;
 108                 /* vmallocs are handled from the workqueue context */
 109                 schedule_work(&fddef->wq);
 110                 spin_unlock(&fddef->lock);
 111                 put_cpu_var(fdtable_defer_list);
 112         }
 113 }
 114
 115 /*
 116  * Expand the fdset in the files_struct.  Called with the files spinlock
 117  * held for write.
 118  */
 119 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 120 {
 121         unsigned int cpy, set;
 122
 123         BUG_ON(nfdt->max_fds < ofdt->max_fds);
 124
 125         cpy = ofdt->max_fds * sizeof(struct file *);
 126         set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 127         memcpy(nfdt->fd, ofdt->fd, cpy);
 128         memset((char *)(nfdt->fd) + cpy, 0, set);
 129
 130         cpy = ofdt->max_fds / BITS_PER_BYTE;
 131         set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
 132         memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
 133         memset((char *)(nfdt->open_fds) + cpy, 0, set);
 134         memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
 135         memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
 136 }
 137
 138 static struct fdtable * alloc_fdtable(unsigned int nr)
 139 {
 140         struct fdtable *fdt;
 141         char *data;
 142
 143         /*
 144          * Figure out how many fds we actually want to support in this fdtable.
 145          * Allocation steps are keyed to the size of the fdarray, since it
 146          * grows far faster than any of the other dynamic data. We try to fit
 147          * the fdarray into comfortable page-tuned chunks: starting at 1024B
 148          * and growing in powers of two from there on.
 149          */
 150         nr /= (1024 / sizeof(struct file *));
 151         nr = roundup_pow_of_two(nr + 1);
 152         nr *= (1024 / sizeof(struct file *));
 153         /*
 154          * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 155          * had been set lower between the check in expand_files() and here.  Deal
 156          * with that in caller, it's cheaper that way.
 157          *
 158          * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
 159          * bitmaps handling below becomes unpleasant, to put it mildly...
 160          */
 161         if (unlikely(nr > sysctl_nr_open))
 162                 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 163
 164         fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 165         if (!fdt)
 166                 goto out;
 167         fdt->max_fds = nr;
 168         data = alloc_fdmem(nr * sizeof(struct file *));
 169         if (!data)
 170                 goto out_fdt;
 171         fdt->fd = (struct file **)data;
 172         data = alloc_fdmem(max_t(unsigned int,
 173                                  2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
 174         if (!data)
 175                 goto out_arr;
 176         fdt->open_fds = (fd_set *)data;
 177         data += nr / BITS_PER_BYTE;
 178         fdt->close_on_exec = (fd_set *)data;
 179         INIT_RCU_HEAD(&fdt->rcu);
 180         fdt->next = NULL;
 181
 182         return fdt;
 183
 184 out_arr:
 185         free_fdarr(fdt);
 186 out_fdt:
 187         kfree(fdt);
 188 out:
 189         return NULL;
 190 }
 191
 192 /*
 193  * Expand the file descriptor table.
 194  * This function will allocate a new fdtable and both fd array and fdset, of
 195  * the given size.
 196  * Return <0 error code on error; 1 on successful completion.
 197  * The files->file_lock should be held on entry, and will be held on exit.
 198  */
 199 static int expand_fdtable(struct files_struct *files, int nr)
 200         __releases(files->file_lock)
 201         __acquires(files->file_lock)
 202 {
 203         struct fdtable *new_fdt, *cur_fdt;
 204
 205         spin_unlock(&files->file_lock);
 206         new_fdt = alloc_fdtable(nr);
 207         spin_lock(&files->file_lock);
 208         if (!new_fdt)
 209                 return -ENOMEM;
 210         /*
 211          * extremely unlikely race - sysctl_nr_open decreased between the check in
 212          * caller and alloc_fdtable().  Cheaper to catch it here...
 213          */
 214         if (unlikely(new_fdt->max_fds <= nr)) {
 215                 free_fdarr(new_fdt);
 216                 free_fdset(new_fdt);
 217                 kfree(new_fdt);
 218                 return -EMFILE;
 219         }
 220         /*
 221          * Check again since another task may have expanded the fd table while
 222          * we dropped the lock
 223          */
 224         cur_fdt = files_fdtable(files);
 225         if (nr >= cur_fdt->max_fds) {
 226                 /* Continue as planned */
 227                 copy_fdtable(new_fdt, cur_fdt);
 228                 rcu_assign_pointer(files->fdt, new_fdt);
 229                 if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
 230                         free_fdtable(cur_fdt);
 231         } else {
 232                 /* Somebody else expanded, so undo our attempt */
 233                 free_fdarr(new_fdt);
 234                 free_fdset(new_fdt);
 235                 kfree(new_fdt);
 236         }
 237         return 1;
 238 }
 239
 240 /*
 241  * Expand files.
 242  * This function will expand the file structures, if the requested size exceeds
 243  * the current capacity and there is room for expansion.
 244  * Return <0 error code on error; 0 when nothing done; 1 when files were
 245  * expanded and execution may have blocked.
 246  * The files->file_lock should be held on entry, and will be held on exit.
 247  */
 248 int expand_files(struct files_struct *files, int nr)
 249 {
 250         struct fdtable *fdt;
 251
 252         fdt = files_fdtable(files);
 253
 254         /*
 255          * N.B. For clone tasks sharing a files structure, this test
 256          * will limit the total number of files that can be opened.
 257          */
 258         if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 259                 return -EMFILE;
 260
 261         /* Do we need to expand? */
 262         if (nr < fdt->max_fds)
 263                 return 0;
 264
 265         /* Can we expand? */
 266         if (nr >= sysctl_nr_open)
 267                 return -EMFILE;
 268
 269         /* All good, so we try */
 270         return expand_fdtable(files, nr);
 271 }
 272
 273 static int count_open_files(struct fdtable *fdt)
 274 {
 275         int size = fdt->max_fds;
 276         int i;
 277
 278         /* Find the last open fd */
 279         for (i = size/(8*sizeof(long)); i > 0; ) {
 280                 if (fdt->open_fds->fds_bits[--i])
 281                         break;
 282         }
 283         i = (i+1) * 8 * sizeof(long);
 284         return i;
 285 }
 286
 287 /*
 288  * Allocate a new files structure and copy contents from the
 289  * passed in files structure.
 290  * errorp will be valid only when the returned files_struct is NULL.
 291  */
 292 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 293 {
 294         struct files_struct *newf;
 295         struct file **old_fds, **new_fds;
 296         int open_files, size, i;
 297         struct fdtable *old_fdt, *new_fdt;
 298
 299         *errorp = -ENOMEM;
 300         newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 301         if (!newf)
 302                 goto out;
 303
 304         atomic_set(&newf->count, 1);
 305
 306         spin_lock_init(&newf->file_lock);
 307         newf->next_fd = 0;
 308         new_fdt = &newf->fdtab;
 309         new_fdt->max_fds = NR_OPEN_DEFAULT;
 310         new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 311         new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
 312         new_fdt->fd = &newf->fd_array[0];
 313         INIT_RCU_HEAD(&new_fdt->rcu);
 314         new_fdt->next = NULL;
 315
 316         spin_lock(&oldf->file_lock);
 317         old_fdt = files_fdtable(oldf);
 318         open_files = count_open_files(old_fdt);
 319
 320         /*
 321          * Check whether we need to allocate a larger fd array and fd set.
 322          */
 323         while (unlikely(open_files > new_fdt->max_fds)) {
 324                 spin_unlock(&oldf->file_lock);
 325
 326                 if (new_fdt != &newf->fdtab) {
 327                         free_fdarr(new_fdt);
 328                         free_fdset(new_fdt);
 329                         kfree(new_fdt);
 330                 }
 331
 332                 new_fdt = alloc_fdtable(open_files - 1);
 333                 if (!new_fdt) {
 334                         *errorp = -ENOMEM;
 335                         goto out_release;
 336                 }
 337
 338                 /* beyond sysctl_nr_open; nothing to do */
 339                 if (unlikely(new_fdt->max_fds < open_files)) {
 340                         free_fdarr(new_fdt);
 341                         free_fdset(new_fdt);
 342                         kfree(new_fdt);
 343                         *errorp = -EMFILE;
 344                         goto out_release;
 345                 }
 346
 347                 /*
 348                  * Reacquire the oldf lock and a pointer to its fd table
 349                  * who knows it may have a new bigger fd table. We need
 350                  * the latest pointer.
 351                  */
 352                 spin_lock(&oldf->file_lock);
 353                 old_fdt = files_fdtable(oldf);
 354                 open_files = count_open_files(old_fdt);
 355         }
 356
 357         old_fds = old_fdt->fd;
 358         new_fds = new_fdt->fd;
 359
 360         memcpy(new_fdt->open_fds->fds_bits,
 361                 old_fdt->open_fds->fds_bits, open_files/8);
 362         memcpy(new_fdt->close_on_exec->fds_bits,
 363                 old_fdt->close_on_exec->fds_bits, open_files/8);
 364
 365         for (i = open_files; i != 0; i--) {
 366                 struct file *f = *old_fds++;
 367                 if (f) {
 368                         get_file(f);
 369                 } else {
 370                         /*
 371                          * The fd may be claimed in the fd bitmap but not yet
 372                          * instantiated in the files array if a sibling thread
 373                          * is partway through open().  So make sure that this
 374                          * fd is available to the new process.
 375                          */
 376                         FD_CLR(open_files - i, new_fdt->open_fds);
 377                 }
 378                 rcu_assign_pointer(*new_fds++, f);
 379         }
 380         spin_unlock(&oldf->file_lock);
 381
 382         /* compute the remainder to be cleared */
 383         size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 384
 385         /* This is long word aligned thus could use a optimized version */
 386         memset(new_fds, 0, size);
 387
 388         if (new_fdt->max_fds > open_files) {
 389                 int left = (new_fdt->max_fds-open_files)/8;
 390                 int start = open_files / (8 * sizeof(unsigned long));
 391
 392                 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
 393                 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 394         }
 395
 396         rcu_assign_pointer(newf->fdt, new_fdt);
 397
 398         return newf;
 399
 400 out_release:
 401         kmem_cache_free(files_cachep, newf);
 402 out:
 403         return NULL;
 404 }
 405
 406 static void __devinit fdtable_defer_list_init(int cpu)
 407 {
 408         struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
 409         spin_lock_init(&fddef->lock);
 410         INIT_WORK(&fddef->wq, free_fdtable_work);
 411         fddef->next = NULL;
 412 }
 413
 414 void __init files_defer_init(void)
 415 {
 416         int i;
 417         for_each_possible_cpu(i)
 418                 fdtable_defer_list_init(i);
 419         sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
 420                              -BITS_PER_LONG;
 421 }
 422
 423 struct files_struct init_files = {
 424         .count          = ATOMIC_INIT(1),
 425         .fdt            = &init_files.fdtab,
 426         .fdtab          = {
 427                 .max_fds        = NR_OPEN_DEFAULT,
 428                 .fd             = &init_files.fd_array[0],
 429                 .close_on_exec  = (fd_set *)&init_files.close_on_exec_init,
 430                 .open_fds       = (fd_set *)&init_files.open_fds_init,
 431                 .rcu            = RCU_HEAD_INIT,
 432         },
 433         .file_lock      = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 434 };