kernel/slow-work.c

   1 /* Worker thread pool for slow items, such as filesystem lookups or mkdirs
   2  *
   3  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
   4  * Written by David Howells (dhowells@redhat.com)
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public Licence
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the Licence, or (at your option) any later version.
  10  *
  11  * See Documentation/slow-work.txt
  12  */
  13
  14 #include <linux/module.h>
  15 #include <linux/slow-work.h>
  16 #include <linux/kthread.h>
  17 #include <linux/freezer.h>
  18 #include <linux/wait.h>
  19
/*
 * Delay, in jiffies, used when arming the cull timer (see
 * slow_work_schedule_cull()).
 */
#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
                                         * things to do */
/*
 * Delay, in jiffies, used when arming the OOM timer after a failed thread
 * creation (see slow_work_new_thread_execute()).
 */
#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)  /* can't start new threads for 5s after
                                         * OOM */

/* Timer callbacks; both are defined further down in this file. */
static void slow_work_cull_timeout(unsigned long);
static void slow_work_oom_timeout(unsigned long);

#ifdef CONFIG_SYSCTL
/* Handlers referenced by the sysctl table for min-threads and max-threads. */
static int slow_work_min_threads_sysctl(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);

static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
                                        void __user *, size_t *, loff_t *);
#endif
  35
/*
 * The pool of threads has at least min threads in it as long as someone is
 * using the facility, and may have as many as max.
 *
 * A portion of the pool may be processing very slow operations.
 *
 * When CONFIG_SYSCTL is enabled all three values are exposed through the
 * sysctl table in this file; init_slow_work() may additionally raise the
 * maximum to the number of possible CPUs.
 */
static unsigned slow_work_min_threads = 2;  /* threads started on first
                                             * registration */
static unsigned slow_work_max_threads = 4;  /* ceiling on the thread count */
static unsigned vslow_work_proportion = 50; /* % of threads that may process
                                             * very slow work */
  46
#ifdef CONFIG_SYSCTL
/*
 * Bounds handed to the sysctl handlers through .extra1 (lower bound) and
 * .extra2 (upper bound).  slow_work_max_max_threads is not const because
 * init_slow_work() may raise it to twice the number of possible CPUs.
 */
static const int slow_work_min_min_threads = 2;
static int slow_work_max_max_threads = 255;
static const int slow_work_min_vslow = 1;
static const int slow_work_max_vslow = 99;

/*
 * The sysctl table for the facility.
 *
 * min-threads and max-threads bound each other: the upper bound of
 * min-threads is the current max-threads value and the lower bound of
 * max-threads is the current min-threads value.  Both use wrapper handlers
 * that adjust the running pool after the value has been stored.
 *
 * NOTE(review): the .data objects are declared unsigned but are parsed with
 * proc_dointvec_minmax, which operates on int - confirm this is intended.
 */
ctl_table slow_work_sysctls[] = {
        {
                .procname       = "min-threads",
                .data           = &slow_work_min_threads,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
                .proc_handler   = slow_work_min_threads_sysctl,
                .extra1         = (void *) &slow_work_min_min_threads,
                .extra2         = &slow_work_max_threads,
        },
        {
                .procname       = "max-threads",
                .data           = &slow_work_max_threads,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
                .proc_handler   = slow_work_max_threads_sysctl,
                .extra1         = &slow_work_min_threads,
                .extra2         = (void *) &slow_work_max_max_threads,
        },
        {
                .procname       = "vslow-percentage",
                .data           = &vslow_work_proportion,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_minmax,
                .extra1         = (void *) &slow_work_min_vslow,
                .extra2         = (void *) &slow_work_max_vslow,
        },
        {}      /* empty terminating entry */
};
#endif
  84
/*
 * The active state of the thread pool
 *
 * slow_work_thread_count is incremented before each kthread_run() of a worker
 * and decremented when a worker leaves its main loop (or when the creation
 * failed).  vslow_work_executing_count is raised while a worker is running an
 * item taken from the very slow queue.
 */
static atomic_t slow_work_thread_count;
static atomic_t vslow_work_executing_count;

/* set whilst thread creation is being rate-limited or backed off after a
 * failure; cleared again by slow_work_oom_timeout() */
static bool slow_work_may_not_start_new_thread;
static bool slow_work_cull; /* cull a thread due to lack of activity */
static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
static struct slow_work slow_work_new_thread; /* new thread starter */

/*
 * The queues of work items and the lock governing access to them.  These are
 * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
 * as the number of threads bears no relation to the number of CPUs.
 *
 * There are two queues of work items: one for slow work items, and one for
 * very slow work items.
 */
static LIST_HEAD(slow_work_queue);
static LIST_HEAD(vslow_work_queue);
static DEFINE_SPINLOCK(slow_work_queue_lock);

/*
 * The thread controls.  A variable used to signal to the threads that they
 * should exit when the queue is empty, a waitqueue used by the threads to wait
 * for signals, and a completion set by the last thread to exit.
 */
static bool slow_work_threads_should_exit;
static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
static DECLARE_COMPLETION(slow_work_last_thread_exited);

/*
 * The number of users of the thread pool and its lock.  Whilst this is zero we
 * have no threads hanging around, and when this reaches zero, we wait for all
 * active or queued work items to complete and kill all the threads we do have.
 *
 * slow_work_user_lock is also held across thread creation in
 * slow_work_new_thread_execute() and across the pool adjustments made by the
 * sysctl handlers.
 */
static int slow_work_user_count;
static DEFINE_MUTEX(slow_work_user_lock);
 125
 126 /*
 127  * Calculate the maximum number of active threads in the pool that are
 128  * permitted to process very slow work items.
 129  *
 130  * The answer is rounded up to at least 1, but may not equal or exceed the
 131  * maximum number of the threads in the pool.  This means we always have at
 132  * least one thread that can process slow work items, and we always have at
 133  * least one thread that won't get tied up doing so.
 134  */
 135 static unsigned slow_work_calc_vsmax(void)
 136 {
 137         unsigned vsmax;
 138
 139         vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
 140         vsmax /= 100;
 141         vsmax = max(vsmax, 1U);
 142         return min(vsmax, slow_work_max_threads - 1);
 143 }
 144
/*
 * Attempt to execute stuff queued on a slow thread.  Return true if we managed
 * it, false if there was nothing to do.
 *
 * The very slow queue is tried first, but only whilst fewer than
 * slow_work_calc_vsmax() very slow items are being executed; otherwise the
 * ordinary slow queue is tried.  The item is dequeued and marked EXECUTING
 * under slow_work_queue_lock, and its execute op is then called with the lock
 * dropped.
 *
 * Afterwards the reference held on behalf of the queue is either released
 * through the put_ref op, or - if an enqueue request was deferred whilst the
 * item was executing - handed back to the queue by requeueing the item.
 */
static bool slow_work_execute(void)
{
        struct slow_work *work = NULL;
        unsigned vsmax;
        bool very_slow;

        vsmax = slow_work_calc_vsmax();

        /* see if we can schedule a new thread to be started if we're not
         * keeping up with the work */
        if (!waitqueue_active(&slow_work_thread_wq) &&
            (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
            atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
            !slow_work_may_not_start_new_thread)
                slow_work_enqueue(&slow_work_new_thread);

        /* find something to execute */
        spin_lock_irq(&slow_work_queue_lock);
        if (!list_empty(&vslow_work_queue) &&
            atomic_read(&vslow_work_executing_count) < vsmax) {
                work = list_entry(vslow_work_queue.next,
                                  struct slow_work, link);
                /* a queued item must never already be executing */
                if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
                        BUG();
                list_del_init(&work->link);
                atomic_inc(&vslow_work_executing_count);
                very_slow = true;
        } else if (!list_empty(&slow_work_queue)) {
                work = list_entry(slow_work_queue.next,
                                  struct slow_work, link);
                /* a queued item must never already be executing */
                if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
                        BUG();
                list_del_init(&work->link);
                very_slow = false;
        } else {
                very_slow = false; /* avoid the compiler warning */
        }
        spin_unlock_irq(&slow_work_queue_lock);

        if (!work)
                return false;

        /* anything taken off a queue must have been marked pending by
         * slow_work_enqueue(); clearing the bit here allows a fresh enqueue
         * request to be recorded whilst the item runs */
        if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
                BUG();

        work->ops->execute(work);

        if (very_slow)
                atomic_dec(&vslow_work_executing_count);
        clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);

        /* if someone tried to enqueue the item whilst we were executing it,
         * then it'll be left unenqueued to avoid multiple threads trying to
         * execute it simultaneously
         *
         * there is, however, a race between us testing the pending flag and
         * getting the spinlock, and between the enqueuer setting the pending
         * flag and getting the spinlock, so we use a deferral bit to tell us
         * if the enqueuer got there first
         */
        if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
                spin_lock_irq(&slow_work_queue_lock);

                if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
                    test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
                        goto auto_requeue;

                spin_unlock_irq(&slow_work_queue_lock);
        }

        /* nothing was deferred to us, so release the reference */
        work->ops->put_ref(work);
        return true;

auto_requeue:
        /* we must complete the enqueue operation
         * - we transfer our ref on the item back to the appropriate queue
         * - don't wake another thread up as we're awake already
         */
        if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
                list_add_tail(&work->link, &vslow_work_queue);
        else
                list_add_tail(&work->link, &slow_work_queue);
        spin_unlock_irq(&slow_work_queue_lock);
        return true;
}
 234
 235 /**
 236  * slow_work_enqueue - Schedule a slow work item for processing
 237  * @work: The work item to queue
 238  *
 239  * Schedule a slow work item for processing.  If the item is already undergoing
 240  * execution, this guarantees not to re-enter the execution routine until the
 241  * first execution finishes.
 242  *
 243  * The item is pinned by this function as it retains a reference to it, managed
 244  * through the item operations.  The item is unpinned once it has been
 245  * executed.
 246  *
 247  * An item may hog the thread that is running it for a relatively large amount
 248  * of time, sufficient, for example, to perform several lookup, mkdir, create
 249  * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks.
 250  *
 251  * Conversely, if a number of items are awaiting processing, it may take some
 252  * time before any given item is given attention.  The number of threads in the
 253  * pool may be increased to deal with demand, but only up to a limit.
 254  *
 255  * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
 256  * the very slow queue, from which only a portion of the threads will be
 257  * allowed to pick items to execute.  This ensures that very slow items won't
 258  * overly block ones that are just ordinarily slow.
 259  *
 260  * Returns 0 if successful, -EAGAIN if not.
 261  */
 262 int slow_work_enqueue(struct slow_work *work)
 263 {
 264         unsigned long flags;
 265
 266         BUG_ON(slow_work_user_count <= 0);
 267         BUG_ON(!work);
 268         BUG_ON(!work->ops);
 269         BUG_ON(!work->ops->get_ref);
 270
 271         /* when honouring an enqueue request, we only promise that we will run
 272          * the work function in the future; we do not promise to run it once
 273          * per enqueue request
 274          *
 275          * we use the PENDING bit to merge together repeat requests without
 276          * having to disable IRQs and take the spinlock, whilst still
 277          * maintaining our promise
 278          */
 279         if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
 280                 spin_lock_irqsave(&slow_work_queue_lock, flags);
 281
 282                 /* we promise that we will not attempt to execute the work
 283                  * function in more than one thread simultaneously
 284                  *
 285                  * this, however, leaves us with a problem if we're asked to
 286                  * enqueue the work whilst someone is executing the work
 287                  * function as simply queueing the work immediately means that
 288                  * another thread may try executing it whilst it is already
 289                  * under execution
 290                  *
 291                  * to deal with this, we set the ENQ_DEFERRED bit instead of
 292                  * enqueueing, and the thread currently executing the work
 293                  * function will enqueue the work item when the work function
 294                  * returns and it has cleared the EXECUTING bit
 295                  */
 296                 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 297                         set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 298                 } else {
 299                         if (work->ops->get_ref(work) < 0)
 300                                 goto cant_get_ref;
 301                         if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 302                                 list_add_tail(&work->link, &vslow_work_queue);
 303                         else
 304                                 list_add_tail(&work->link, &slow_work_queue);
 305                         wake_up(&slow_work_thread_wq);
 306                 }
 307
 308                 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 309         }
 310         return 0;
 311
 312 cant_get_ref:
 313         spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 314         return -EAGAIN;
 315 }
 316 EXPORT_SYMBOL(slow_work_enqueue);
 317
 318 /*
 319  * Schedule a cull of the thread pool at some time in the near future
 320  */
 321 static void slow_work_schedule_cull(void)
 322 {
 323         mod_timer(&slow_work_cull_timer,
 324                   round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
 325 }
 326
 327 /*
 328  * Worker thread culling algorithm
 329  */
 330 static bool slow_work_cull_thread(void)
 331 {
 332         unsigned long flags;
 333         bool do_cull = false;
 334
 335         spin_lock_irqsave(&slow_work_queue_lock, flags);
 336
 337         if (slow_work_cull) {
 338                 slow_work_cull = false;
 339
 340                 if (list_empty(&slow_work_queue) &&
 341                     list_empty(&vslow_work_queue) &&
 342                     atomic_read(&slow_work_thread_count) >
 343                     slow_work_min_threads) {
 344                         slow_work_schedule_cull();
 345                         do_cull = true;
 346                 }
 347         }
 348
 349         spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 350         return do_cull;
 351 }
 352
 353 /*
 354  * Determine if there is slow work available for dispatch
 355  */
 356 static inline bool slow_work_available(int vsmax)
 357 {
 358         return !list_empty(&slow_work_queue) ||
 359                 (!list_empty(&vslow_work_queue) &&
 360                  atomic_read(&vslow_work_executing_count) < vsmax);
 361 }
 362
 363 /*
 364  * Worker thread dispatcher
 365  */
 366 static int slow_work_thread(void *_data)
 367 {
 368         int vsmax;
 369
 370         DEFINE_WAIT(wait);
 371
 372         set_freezable();
 373         set_user_nice(current, -5);
 374
 375         for (;;) {
 376                 vsmax = vslow_work_proportion;
 377                 vsmax *= atomic_read(&slow_work_thread_count);
 378                 vsmax /= 100;
 379
 380                 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
 381                                           TASK_INTERRUPTIBLE);
 382                 if (!freezing(current) &&
 383                     !slow_work_threads_should_exit &&
 384                     !slow_work_available(vsmax) &&
 385                     !slow_work_cull)
 386                         schedule();
 387                 finish_wait(&slow_work_thread_wq, &wait);
 388
 389                 try_to_freeze();
 390
 391                 vsmax = vslow_work_proportion;
 392                 vsmax *= atomic_read(&slow_work_thread_count);
 393                 vsmax /= 100;
 394
 395                 if (slow_work_available(vsmax) && slow_work_execute()) {
 396                         cond_resched();
 397                         if (list_empty(&slow_work_queue) &&
 398                             list_empty(&vslow_work_queue) &&
 399                             atomic_read(&slow_work_thread_count) >
 400                             slow_work_min_threads)
 401                                 slow_work_schedule_cull();
 402                         continue;
 403                 }
 404
 405                 if (slow_work_threads_should_exit)
 406                         break;
 407
 408                 if (slow_work_cull && slow_work_cull_thread())
 409                         break;
 410         }
 411
 412         if (atomic_dec_and_test(&slow_work_thread_count))
 413                 complete_and_exit(&slow_work_last_thread_exited, 0);
 414         return 0;
 415 }
 416
 417 /*
 418  * Handle thread cull timer expiration
 419  */
 420 static void slow_work_cull_timeout(unsigned long data)
 421 {
 422         slow_work_cull = true;
 423         wake_up(&slow_work_thread_wq);
 424 }
 425
 426 /*
 427  * Get a reference on slow work thread starter
 428  */
 429 static int slow_work_new_thread_get_ref(struct slow_work *work)
 430 {
 431         return 0;
 432 }
 433
 434 /*
 435  * Drop a reference on slow work thread starter
 436  */
 437 static void slow_work_new_thread_put_ref(struct slow_work *work)
 438 {
 439 }
 440
 441 /*
 442  * Start a new slow work thread
 443  */
 444 static void slow_work_new_thread_execute(struct slow_work *work)
 445 {
 446         struct task_struct *p;
 447
 448         if (slow_work_threads_should_exit)
 449                 return;
 450
 451         if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
 452                 return;
 453
 454         if (!mutex_trylock(&slow_work_user_lock))
 455                 return;
 456
 457         slow_work_may_not_start_new_thread = true;
 458         atomic_inc(&slow_work_thread_count);
 459         p = kthread_run(slow_work_thread, NULL, "kslowd");
 460         if (IS_ERR(p)) {
 461                 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
 462                 if (atomic_dec_and_test(&slow_work_thread_count))
 463                         BUG(); /* we're running on a slow work thread... */
 464                 mod_timer(&slow_work_oom_timer,
 465                           round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
 466         } else {
 467                 /* ratelimit the starting of new threads */
 468                 mod_timer(&slow_work_oom_timer, jiffies + 1);
 469         }
 470
 471         mutex_unlock(&slow_work_user_lock);
 472 }
 473
/*
 * Operations for the internal thread starter item (slow_work_new_thread);
 * slow_work_register_user() attaches them with slow_work_init().
 */
static const struct slow_work_ops slow_work_new_thread_ops = {
        .get_ref        = slow_work_new_thread_get_ref,
        .put_ref        = slow_work_new_thread_put_ref,
        .execute        = slow_work_new_thread_execute,
};
 479
 480 /*
 481  * post-OOM new thread start suppression expiration
 482  */
 483 static void slow_work_oom_timeout(unsigned long data)
 484 {
 485         slow_work_may_not_start_new_thread = false;
 486 }
 487
 488 #ifdef CONFIG_SYSCTL
 489 /*
 490  * Handle adjustment of the minimum number of threads
 491  */
 492 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
 493                                         void __user *buffer,
 494                                         size_t *lenp, loff_t *ppos)
 495 {
 496         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 497         int n;
 498
 499         if (ret == 0) {
 500                 mutex_lock(&slow_work_user_lock);
 501                 if (slow_work_user_count > 0) {
 502                         /* see if we need to start or stop threads */
 503                         n = atomic_read(&slow_work_thread_count) -
 504                                 slow_work_min_threads;
 505
 506                         if (n < 0 && !slow_work_may_not_start_new_thread)
 507                                 slow_work_enqueue(&slow_work_new_thread);
 508                         else if (n > 0)
 509                                 slow_work_schedule_cull();
 510                 }
 511                 mutex_unlock(&slow_work_user_lock);
 512         }
 513
 514         return ret;
 515 }
 516
 517 /*
 518  * Handle adjustment of the maximum number of threads
 519  */
 520 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
 521                                         void __user *buffer,
 522                                         size_t *lenp, loff_t *ppos)
 523 {
 524         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 525         int n;
 526
 527         if (ret == 0) {
 528                 mutex_lock(&slow_work_user_lock);
 529                 if (slow_work_user_count > 0) {
 530                         /* see if we need to stop threads */
 531                         n = slow_work_max_threads -
 532                                 atomic_read(&slow_work_thread_count);
 533
 534                         if (n < 0)
 535                                 slow_work_schedule_cull();
 536                 }
 537                 mutex_unlock(&slow_work_user_lock);
 538         }
 539
 540         return ret;
 541 }
 542 #endif /* CONFIG_SYSCTL */
 543
 544 /**
 545  * slow_work_register_user - Register a user of the facility
 546  *
 547  * Register a user of the facility, starting up the initial threads if there
 548  * aren't any other users at this point.  This will return 0 if successful, or
 549  * an error if not.
 550  */
 551 int slow_work_register_user(void)
 552 {
 553         struct task_struct *p;
 554         int loop;
 555
 556         mutex_lock(&slow_work_user_lock);
 557
 558         if (slow_work_user_count == 0) {
 559                 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
 560                 init_completion(&slow_work_last_thread_exited);
 561
 562                 slow_work_threads_should_exit = false;
 563                 slow_work_init(&slow_work_new_thread,
 564                                &slow_work_new_thread_ops);
 565                 slow_work_may_not_start_new_thread = false;
 566                 slow_work_cull = false;
 567
 568                 /* start the minimum number of threads */
 569                 for (loop = 0; loop < slow_work_min_threads; loop++) {
 570                         atomic_inc(&slow_work_thread_count);
 571                         p = kthread_run(slow_work_thread, NULL, "kslowd");
 572                         if (IS_ERR(p))
 573                                 goto error;
 574                 }
 575                 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
 576         }
 577
 578         slow_work_user_count++;
 579         mutex_unlock(&slow_work_user_lock);
 580         return 0;
 581
 582 error:
 583         if (atomic_dec_and_test(&slow_work_thread_count))
 584                 complete(&slow_work_last_thread_exited);
 585         if (loop > 0) {
 586                 printk(KERN_ERR "Slow work thread pool:"
 587                        " Aborting startup on ENOMEM\n");
 588                 slow_work_threads_should_exit = true;
 589                 wake_up_all(&slow_work_thread_wq);
 590                 wait_for_completion(&slow_work_last_thread_exited);
 591                 printk(KERN_ERR "Slow work thread pool: Aborted\n");
 592         }
 593         mutex_unlock(&slow_work_user_lock);
 594         return PTR_ERR(p);
 595 }
 596 EXPORT_SYMBOL(slow_work_register_user);
 597
 598 /**
 599  * slow_work_unregister_user - Unregister a user of the facility
 600  *
 601  * Unregister a user of the facility, killing all the threads if this was the
 602  * last one.
 603  */
 604 void slow_work_unregister_user(void)
 605 {
 606         mutex_lock(&slow_work_user_lock);
 607
 608         BUG_ON(slow_work_user_count <= 0);
 609
 610         slow_work_user_count--;
 611         if (slow_work_user_count == 0) {
 612                 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
 613                 slow_work_threads_should_exit = true;
 614                 del_timer_sync(&slow_work_cull_timer);
 615                 del_timer_sync(&slow_work_oom_timer);
 616                 wake_up_all(&slow_work_thread_wq);
 617                 wait_for_completion(&slow_work_last_thread_exited);
 618                 printk(KERN_NOTICE "Slow work thread pool:"
 619                        " Shut down complete\n");
 620         }
 621
 622         mutex_unlock(&slow_work_user_lock);
 623 }
 624 EXPORT_SYMBOL(slow_work_unregister_user);
 625
 626 /*
 627  * Initialise the slow work facility
 628  */
 629 static int __init init_slow_work(void)
 630 {
 631         unsigned nr_cpus = num_possible_cpus();
 632
 633         if (slow_work_max_threads < nr_cpus)
 634                 slow_work_max_threads = nr_cpus;
 635 #ifdef CONFIG_SYSCTL
 636         if (slow_work_max_max_threads < nr_cpus * 2)
 637                 slow_work_max_max_threads = nr_cpus * 2;
 638 #endif
 639         return 0;
 640 }
 641
 642 subsys_initcall(init_slow_work);