block/cfq-iosched.c

   1 /*
   2  *  CFQ, or complete fairness queueing, disk scheduler.
   3  *
   4  *  Based on ideas from a previously unfinished io
   5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6  *
   7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8  */
   9 #include <linux/module.h>
  10 #include <linux/slab.h>
  11 #include <linux/sched/clock.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/elevator.h>
  14 #include <linux/ktime.h>
  15 #include <linux/rbtree.h>
  16 #include <linux/ioprio.h>
  17 #include <linux/blktrace_api.h>
  18 #include <linux/blk-cgroup.h>
  19 #include "blk.h"
  20 #include "blk-wbt.h"
  21
/*
 * tunables
 *
 * Built-in defaults.  struct cfq_data carries similarly named per-device
 * fields (see its "tunables" section).  Values derived from NSEC_PER_SEC
 * are in nanoseconds.
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
/* 250 ms and 125 ms; NOTE(review): index is presumably async/sync - confirm */
static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
/* 100 ms */
static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
/* 40 ms; deliberately not const, unlike cfq_slice_sync */
static u64 cfq_slice_async = NSEC_PER_SEC / 25;
static const int cfq_slice_async_rq = 2;
/* 8 ms each; both are non-const as well */
static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
static u64 cfq_group_idle = NSEC_PER_SEC / 125;
static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
  39
  40 /*
  41  * offset from end of queue service tree for idle class
  42  */
  43 #define CFQ_IDLE_DELAY          (NSEC_PER_SEC / 5)
  44 /* offset from end of group service tree under time slice mode */
  45 #define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
  46 /* offset from end of group service under IOPS mode */
  47 #define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
  48
  49 /*
  50  * below this threshold, we consider thinktime immediate
  51  */
  52 #define CFQ_MIN_TT              (2 * NSEC_PER_SEC / HZ)
  53
  54 #define CFQ_SLICE_SCALE         (5)
  55 #define CFQ_HW_QUEUE_MIN        (5)
  56 #define CFQ_SERVICE_SHIFT       12
  57
  58 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
  59 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
  60 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
  61 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
  62
  63 #define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
  64 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
  65 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
  66
/* NOTE(review): presumably the slab cache cfq_queue objects come from - confirm */
static struct kmem_cache *cfq_pool;

/* number of entries in cfq_data.prio_trees[] */
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
/* tests on a queue's current ioprio_class */
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

/* a think-time mean is only trusted once the sample count exceeds 80 */
#define sample_valid(samples)	((samples) > 80)
/* map a group service-tree rb_node back to its containing cfq_group */
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000
  80
/*
 * Think-time bookkeeping.  Embedded in cfq_rb_root, cfq_group and
 * cfq_io_cq.
 */
struct cfq_ttime {
	/* ktime_get_ns() timestamp; CFQ_RB_ROOT seeds it with the current time */
	u64 last_end_request;

	u64 ttime_total;
	/* compared against the idle window in cfq_io_thinktime_big() */
	u64 ttime_mean;
	/* has to pass sample_valid() before ttime_mean is relied upon */
	unsigned long ttime_samples;
};
  88
/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	/* the tree itself, in its leftmost-caching flavour */
	struct rb_root_cached rb;
	/* NULL in a freshly initialized root (see CFQ_RB_ROOT) */
	struct rb_node *rb_rightmost;
	unsigned count;
	u64 min_vdisktime;
	struct cfq_ttime ttime;
};
/*
 * Initializer for an empty cfq_rb_root.  It is a compound literal that
 * calls ktime_get_ns(), so it is evaluated at the point of use rather than
 * being a compile-time constant; fields not named here are zeroed.
 */
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
			.rb_rightmost = NULL,			     \
			.ttime = {.last_end_request = ktime_get_ns(),},}
 105
/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	int ref;
	/* state bits, indexed by enum cfqq_state_flags (CFQ_CFQQ_FLAG_*) */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	u64 rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	u64 dispatch_start;
	u64 allocated_slice;
	u64 slice_dispatch;
	/* time when first request from queue completed and slice started. */
	u64 slice_start;
	u64 slice_end;
	/* signed, unlike the other slice fields */
	s64 slice_resid;

	/* pending priority requests */
	int prio_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/*
	 * io prio and io prio class of this queue.
	 * NOTE(review): org_* presumably keep the original values so a
	 * temporary change can be undone - confirm against the users.
	 */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	pid_t pid;

	/* 32-bit history word; CFQQ_SEEKY() counts its set bits */
	u32 seek_history;
	sector_t last_request_pos;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	/* the group this queue belongs to */
	struct cfq_group *cfqg;
	/* Number of sectors dispatched from queue in single dispatch round */
	unsigned long nr_sectors;
};
 164
/*
 * First index in the service_trees.
 * IDLE is handled separately: st_for() maps IDLE_WORKLOAD to
 * cfq_group.service_tree_idle, so only BE and RT are used to index
 * service_trees[][].  CFQ_PRIO_NR is the number of classes.
 */
enum wl_class_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};
 175
/*
 * Second index in the service_trees.
 * Three values, matching the inner dimension of
 * cfq_group.service_trees[2][3].
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};
 184
/*
 * Per-group statistics.  The structure is empty unless
 * CONFIG_CFQ_GROUP_IOSCHED is set; the second half additionally needs
 * CONFIG_DEBUG_BLK_CGROUP.
 */
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time, fed by cfqg_stats_update_timeslice_used() */
	struct blkg_stat		time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat		unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat		avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat		avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	/* ktime_get_ns() stamps taken when the matching flag bit is set */
	u64				start_group_wait_time;
	u64				start_idle_time;
	u64				start_empty_time;
	/* CFQG_stats_* bits (waiting / idling / empty) */
	uint16_t			flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};
 220
/*
 * Per-cgroup data.  cpd_to_cfqgd() recovers this structure from its
 * embedded blkcg_policy_data via container_of().
 */
struct cfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data cpd;

	unsigned int weight;
	unsigned int leaf_weight;
};
 229
/*
 * This is per cgroup per device grouping structure.  pd_to_cfqg()
 * recovers it from the embedded blkg_policy_data via container_of().
 */
struct cfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;

	/*
	 * The number of active cfqgs and sum of their weights under this
	 * cfqg.  This covers this cfqg's leaf_weight and all children's
	 * weights, but does not cover weights of further descendants.
	 *
	 * If a cfqg is on the service tree, it's active.  An active cfqg
	 * also activates its parent and contributes to the children_weight
	 * of the parent.
	 */
	int nr_active;
	unsigned int children_weight;

	/*
	 * vfraction is the fraction of vdisktime that the tasks in this
	 * cfqg are entitled to.  This is determined by compounding the
	 * ratios walking up from this cfqg to the root.
	 *
	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
	 * vfractions on a service tree is approximately 1.  The sum may
	 * deviate a bit due to rounding errors and fluctuations caused by
	 * cfqgs entering and leaving the service tree.
	 */
	unsigned int vfraction;

	/*
	 * There are two weights - (internal) weight is the weight of this
	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
	 * this cfqg against the child cfqgs.  For the root cfqg, both
	 * weights are kept in sync for backward compatibility.
	 */
	unsigned int weight;
	unsigned int new_weight;
	unsigned int dev_weight;

	unsigned int leaf_weight;
	unsigned int new_leaf_weight;
	unsigned int dev_leaf_weight;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/*
	 * Per group busy queues average. Useful for workload slice calc. We
	 * create the array for each prio class but at run time it is used
	 * only for RT and BE class and slot for IDLE class remains unused.
	 * This is primarily done to avoid confusion and a gcc warning.
	 */
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
	/*
	 * rr lists of queues with requests. We maintain service trees for
	 * RT and BE classes. These trees are subdivided in subclasses
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
	 * class there is no subclassification and all the cfq queues go on
	 * a single tree service_tree_idle.
	 * Counts are embedded in the cfq_rb_root.
	 * Indexed as service_trees[enum wl_class_t][enum wl_type_t]; see
	 * st_for().
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	u64 saved_wl_slice;
	enum wl_type_t saved_wl_type;
	enum wl_class_t saved_wl_class;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;
	struct cfq_ttime ttime;
	struct cfqg_stats stats;	/* stats for this cfqg */

	/* async queue for each priority case */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

};
 314
/*
 * CFQ's extension of struct io_cq.  RQ_CIC() obtains it from a request's
 * elv.icq through icq_to_cic().
 */
struct cfq_io_cq {
	struct io_cq		icq;		/* must be the first member */
	/* NOTE(review): presumably indexed async/sync - confirm */
	struct cfq_queue	*cfqq[2];
	struct cfq_ttime	ttime;
	int			ioprio;		/* the current ioprio */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
#endif
};
 324
/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group *root_group;

	/*
	 * The priority class and workload type currently being served,
	 * and the group they belong to
	 */
	enum wl_class_t serving_wl_class;
	enum wl_type_t serving_wl_type;
	u64 workload_expires;
	struct cfq_group *serving_group;

	/*
	 * Each priority tree is sorted by next_request position.  These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;
	unsigned int busy_sync_queues;

	int rq_in_driver;
	int rq_in_flight[2];

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag;
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct hrtimer idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_cq *active_cic;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_latency;
	u64 cfq_fifo_expire[2];
	u64 cfq_slice[2];
	/* idle windows; cfq_io_thinktime_big() compares think time to these */
	u64 cfq_slice_idle;
	u64 cfq_group_idle;
	u64 cfq_target_latency;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	u64 last_delayed_sync;
};
 401
 402 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 403 static void cfq_put_queue(struct cfq_queue *cfqq);
 404
 405 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
 406                                             enum wl_class_t class,
 407                                             enum wl_type_t type)
 408 {
 409         if (!cfqg)
 410                 return NULL;
 411
 412         if (class == IDLE_WORKLOAD)
 413                 return &cfqg->service_tree_idle;
 414
 415         return &cfqg->service_trees[class][type];
 416 }
 417
/*
 * Bit positions within cfq_queue.flags.  The accessors generated by
 * CFQ_CFQQ_FNS() shift 1 by these values, so each enumerator is a bit
 * index, not a mask.
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};
 433
 434 #define CFQ_CFQQ_FNS(name)                                              \
 435 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 436 {                                                                       \
 437         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 438 }                                                                       \
 439 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 440 {                                                                       \
 441         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 442 }                                                                       \
 443 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 444 {                                                                       \
 445         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 446 }
 447
 448 CFQ_CFQQ_FNS(on_rr);
 449 CFQ_CFQQ_FNS(wait_request);
 450 CFQ_CFQQ_FNS(must_dispatch);
 451 CFQ_CFQQ_FNS(must_alloc_slice);
 452 CFQ_CFQQ_FNS(fifo_expire);
 453 CFQ_CFQQ_FNS(idle_window);
 454 CFQ_CFQQ_FNS(prio_changed);
 455 CFQ_CFQQ_FNS(slice_new);
 456 CFQ_CFQQ_FNS(sync);
 457 CFQ_CFQQ_FNS(coop);
 458 CFQ_CFQQ_FNS(split_coop);
 459 CFQ_CFQQ_FNS(deep);
 460 CFQ_CFQQ_FNS(wait_busy);
 461 #undef CFQ_CFQQ_FNS
 462
 463 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 464
 465 /* cfqg stats flags */
 466 enum cfqg_stats_flags {
 467         CFQG_stats_waiting = 0,
 468         CFQG_stats_idling,
 469         CFQG_stats_empty,
 470 };
 471
 472 #define CFQG_FLAG_FNS(name)                                             \
 473 static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
 474 {                                                                       \
 475         stats->flags |= (1 << CFQG_stats_##name);                       \
 476 }                                                                       \
 477 static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
 478 {                                                                       \
 479         stats->flags &= ~(1 << CFQG_stats_##name);                      \
 480 }                                                                       \
 481 static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
 482 {                                                                       \
 483         return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
 484 }                                                                       \
 485
 486 CFQG_FLAG_FNS(waiting)
 487 CFQG_FLAG_FNS(idling)
 488 CFQG_FLAG_FNS(empty)
 489 #undef CFQG_FLAG_FNS
 490
 491 /* This should be called with the queue_lock held. */
 492 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 493 {
 494         u64 now;
 495
 496         if (!cfqg_stats_waiting(stats))
 497                 return;
 498
 499         now = ktime_get_ns();
 500         if (now > stats->start_group_wait_time)
 501                 blkg_stat_add(&stats->group_wait_time,
 502                               now - stats->start_group_wait_time);
 503         cfqg_stats_clear_waiting(stats);
 504 }
 505
 506 /* This should be called with the queue_lock held. */
 507 static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 508                                                  struct cfq_group *curr_cfqg)
 509 {
 510         struct cfqg_stats *stats = &cfqg->stats;
 511
 512         if (cfqg_stats_waiting(stats))
 513                 return;
 514         if (cfqg == curr_cfqg)
 515                 return;
 516         stats->start_group_wait_time = ktime_get_ns();
 517         cfqg_stats_mark_waiting(stats);
 518 }
 519
 520 /* This should be called with the queue_lock held. */
 521 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 522 {
 523         u64 now;
 524
 525         if (!cfqg_stats_empty(stats))
 526                 return;
 527
 528         now = ktime_get_ns();
 529         if (now > stats->start_empty_time)
 530                 blkg_stat_add(&stats->empty_time,
 531                               now - stats->start_empty_time);
 532         cfqg_stats_clear_empty(stats);
 533 }
 534
 535 static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 536 {
 537         blkg_stat_add(&cfqg->stats.dequeue, 1);
 538 }
 539
 540 static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 541 {
 542         struct cfqg_stats *stats = &cfqg->stats;
 543
 544         if (blkg_rwstat_total(&stats->queued))
 545                 return;
 546
 547         /*
 548          * group is already marked empty. This can happen if cfqq got new
 549          * request in parent group and moved to this group while being added
 550          * to service tree. Just ignore the event and move on.
 551          */
 552         if (cfqg_stats_empty(stats))
 553                 return;
 554
 555         stats->start_empty_time = ktime_get_ns();
 556         cfqg_stats_mark_empty(stats);
 557 }
 558
 559 static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 560 {
 561         struct cfqg_stats *stats = &cfqg->stats;
 562
 563         if (cfqg_stats_idling(stats)) {
 564                 u64 now = ktime_get_ns();
 565
 566                 if (now > stats->start_idle_time)
 567                         blkg_stat_add(&stats->idle_time,
 568                                       now - stats->start_idle_time);
 569                 cfqg_stats_clear_idling(stats);
 570         }
 571 }
 572
 573 static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 574 {
 575         struct cfqg_stats *stats = &cfqg->stats;
 576
 577         BUG_ON(cfqg_stats_idling(stats));
 578
 579         stats->start_idle_time = ktime_get_ns();
 580         cfqg_stats_mark_idling(stats);
 581 }
 582
 583 static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 584 {
 585         struct cfqg_stats *stats = &cfqg->stats;
 586
 587         blkg_stat_add(&stats->avg_queue_size_sum,
 588                       blkg_rwstat_total(&stats->queued));
 589         blkg_stat_add(&stats->avg_queue_size_samples, 1);
 590         cfqg_stats_update_group_wait_time(stats);
 591 }
 592
 593 #else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 594
/*
 * Empty stand-ins used unless both CONFIG_CFQ_GROUP_IOSCHED and
 * CONFIG_DEBUG_BLK_CGROUP are set, so callers need no #ifdefs.
 */
static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 602
 603 #endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 604
 605 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 606
 607 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 608 {
 609         return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 610 }
 611
 612 static struct cfq_group_data
 613 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 614 {
 615         return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 616 }
 617
 618 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 619 {
 620         return pd_to_blkg(&cfqg->pd);
 621 }
 622
 623 static struct blkcg_policy blkcg_policy_cfq;
 624
 625 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 626 {
 627         return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 628 }
 629
 630 static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 631 {
 632         return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 633 }
 634
 635 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 636 {
 637         struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
 638
 639         return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 640 }
 641
 642 static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
 643                                       struct cfq_group *ancestor)
 644 {
 645         return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
 646                                     cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
 647 }
 648
 649 static inline void cfqg_get(struct cfq_group *cfqg)
 650 {
 651         return blkg_get(cfqg_to_blkg(cfqg));
 652 }
 653
 654 static inline void cfqg_put(struct cfq_group *cfqg)
 655 {
 656         return blkg_put(cfqg_to_blkg(cfqg));
 657 }
 658
/*
 * Emit a trace message for @cfqq, tagged with the blkcg of the queue's
 * group.  The message is prefixed "cfq<pid><S|A><N| > ": 'S' for a sync
 * queue and 'A' otherwise, followed by 'N' when cfqq_type() reports
 * SYNC_NOIDLE_WORKLOAD and a blank otherwise.  @cfqq is evaluated several
 * times, so pass an expression without side effects.
 */
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg((cfqq)->cfqg)->blkcg,		\
			"cfq%d%c%c " fmt, (cfqq)->pid,			\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
			  ##args);					\
} while (0)

/* Emit a trace message tagged with the blkcg of @cfqg; no prefix is added. */
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((cfqd)->queue,				\
			cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);	\
} while (0)
 672
 673 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 674                                             struct cfq_group *curr_cfqg,
 675                                             unsigned int op)
 676 {
 677         blkg_rwstat_add(&cfqg->stats.queued, op, 1);
 678         cfqg_stats_end_empty_time(&cfqg->stats);
 679         cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 680 }
 681
 682 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 683                         uint64_t time, unsigned long unaccounted_time)
 684 {
 685         blkg_stat_add(&cfqg->stats.time, time);
 686 #ifdef CONFIG_DEBUG_BLK_CGROUP
 687         blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 688 #endif
 689 }
 690
 691 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 692                                                unsigned int op)
 693 {
 694         blkg_rwstat_add(&cfqg->stats.queued, op, -1);
 695 }
 696
 697 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 698                                                unsigned int op)
 699 {
 700         blkg_rwstat_add(&cfqg->stats.merged, op, 1);
 701 }
 702
 703 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 704                                                 u64 start_time_ns,
 705                                                 u64 io_start_time_ns,
 706                                                 unsigned int op)
 707 {
 708         struct cfqg_stats *stats = &cfqg->stats;
 709         u64 now = ktime_get_ns();
 710
 711         if (now > io_start_time_ns)
 712                 blkg_rwstat_add(&stats->service_time, op,
 713                                 now - io_start_time_ns);
 714         if (io_start_time_ns > start_time_ns)
 715                 blkg_rwstat_add(&stats->wait_time, op,
 716                                 io_start_time_ns - start_time_ns);
 717 }
 718
 719 /* @stats = 0 */
 720 static void cfqg_stats_reset(struct cfqg_stats *stats)
 721 {
 722         /* queued stats shouldn't be cleared */
 723         blkg_rwstat_reset(&stats->merged);
 724         blkg_rwstat_reset(&stats->service_time);
 725         blkg_rwstat_reset(&stats->wait_time);
 726         blkg_stat_reset(&stats->time);
 727 #ifdef CONFIG_DEBUG_BLK_CGROUP
 728         blkg_stat_reset(&stats->unaccounted_time);
 729         blkg_stat_reset(&stats->avg_queue_size_sum);
 730         blkg_stat_reset(&stats->avg_queue_size_samples);
 731         blkg_stat_reset(&stats->dequeue);
 732         blkg_stat_reset(&stats->group_wait_time);
 733         blkg_stat_reset(&stats->idle_time);
 734         blkg_stat_reset(&stats->empty_time);
 735 #endif
 736 }
 737
 738 /* @to += @from */
 739 static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 740 {
 741         /* queued stats shouldn't be cleared */
 742         blkg_rwstat_add_aux(&to->merged, &from->merged);
 743         blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 744         blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
 745         blkg_stat_add_aux(&from->time, &from->time);
 746 #ifdef CONFIG_DEBUG_BLK_CGROUP
 747         blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
 748         blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 749         blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
 750         blkg_stat_add_aux(&to->dequeue, &from->dequeue);
 751         blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
 752         blkg_stat_add_aux(&to->idle_time, &from->idle_time);
 753         blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 754 #endif
 755 }
 756
 757 /*
 758  * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
 759  * recursive stats can still account for the amount used by this cfqg after
 760  * it's gone.
 761  */
 762 static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 763 {
 764         struct cfq_group *parent = cfqg_parent(cfqg);
 765
 766         lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
 767
 768         if (unlikely(!parent))
 769                 return;
 770
 771         cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
 772         cfqg_stats_reset(&cfqg->stats);
 773 }
 774
 775 #else   /* CONFIG_CFQ_GROUP_IOSCHED */
 776
/*
 * Stand-ins used when CONFIG_CFQ_GROUP_IOSCHED is off: no group has a
 * parent, every group counts as a descendant of every other, and
 * reference counting and stats updates do nothing.
 */
static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				      struct cfq_group *ancestor)
{
	return true;
}
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }

/*
 * Same "cfq<pid><S|A><N| > " prefix as the cgroup-aware variant, but the
 * message goes through blk_add_trace_msg() without a blkcg tag.  Group
 * logging expands to an empty statement.
 */
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, unsigned int op) { }
static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			uint64_t time, unsigned long unaccounted_time) { }
static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
			unsigned int op) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
						u64 start_time_ns,
						u64 io_start_time_ns,
						unsigned int op) { }
 805
 806 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 807
/*
 * Emit a blktrace message on @cfqd's request queue, prefixed with "cfq ".
 * @fmt must be a string literal since it is pasted onto the prefix.
 */
#define cfq_log(cfqd, fmt, args...)     \
        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 810
 811 /* Traverses through cfq group service trees */
 812 #define for_each_cfqg_st(cfqg, i, j, st) \
 813         for (i = 0; i <= IDLE_WORKLOAD; i++) \
 814                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
 815                         : &cfqg->service_tree_idle; \
 816                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
 817                         (i == IDLE_WORKLOAD && j == 0); \
 818                         j++, st = i < IDLE_WORKLOAD ? \
 819                         &cfqg->service_trees[i][j]: NULL) \
 820
 821 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
 822         struct cfq_ttime *ttime, bool group_idle)
 823 {
 824         u64 slice;
 825         if (!sample_valid(ttime->ttime_samples))
 826                 return false;
 827         if (group_idle)
 828                 slice = cfqd->cfq_group_idle;
 829         else
 830                 slice = cfqd->cfq_slice_idle;
 831         return ttime->ttime_mean > slice;
 832 }
 833
 834 static inline bool iops_mode(struct cfq_data *cfqd)
 835 {
 836         /*
 837          * If we are not idling on queues and it is a NCQ drive, parallel
 838          * execution of requests is on and measuring time is not possible
 839          * in most of the cases until and unless we drive shallower queue
 840          * depths and that becomes a performance bottleneck. In such cases
 841          * switch to start providing fairness in terms of number of IOs.
 842          */
 843         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
 844                 return true;
 845         else
 846                 return false;
 847 }
 848
 849 static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
 850 {
 851         if (cfq_class_idle(cfqq))
 852                 return IDLE_WORKLOAD;
 853         if (cfq_class_rt(cfqq))
 854                 return RT_WORKLOAD;
 855         return BE_WORKLOAD;
 856 }
 857
 858
 859 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 860 {
 861         if (!cfq_cfqq_sync(cfqq))
 862                 return ASYNC_WORKLOAD;
 863         if (!cfq_cfqq_idle_window(cfqq))
 864                 return SYNC_NOIDLE_WORKLOAD;
 865         return SYNC_WORKLOAD;
 866 }
 867
 868 static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 869                                         struct cfq_data *cfqd,
 870                                         struct cfq_group *cfqg)
 871 {
 872         if (wl_class == IDLE_WORKLOAD)
 873                 return cfqg->service_tree_idle.count;
 874
 875         return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
 876                 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
 877                 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 878 }
 879
 880 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 881                                         struct cfq_group *cfqg)
 882 {
 883         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
 884                 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 885 }
 886
/*
 * Forward declarations of static functions whose definitions are not part
 * of this section of the file.
 */
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
                                       struct cfq_io_cq *cic, struct bio *bio);
 890
 891 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 892 {
 893         /* cic->icq is the first member, %NULL will convert to %NULL */
 894         return container_of(icq, struct cfq_io_cq, icq);
 895 }
 896
 897 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
 898                                                struct io_context *ioc)
 899 {
 900         if (ioc)
 901                 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
 902         return NULL;
 903 }
 904
 905 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 906 {
 907         return cic->cfqq[is_sync];
 908 }
 909
 910 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
 911                                 bool is_sync)
 912 {
 913         cic->cfqq[is_sync] = cfqq;
 914 }
 915
 916 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 917 {
 918         return cic->icq.q->elevator->elevator_data;
 919 }
 920
 921 /*
 922  * scheduler run of queue, if there are requests pending and no one in the
 923  * driver that will restart queueing
 924  */
 925 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 926 {
 927         if (cfqd->busy_queues) {
 928                 cfq_log(cfqd, "schedule dispatch");
 929                 kblockd_schedule_work(&cfqd->unplug_work);
 930         }
 931 }
 932
 933 /*
 934  * Scale schedule slice based on io priority. Use the sync time slice only
 935  * if a queue is marked sync and has sync io queued. A sync queue with async
 936  * io only, should not get full sync slice length.
 937  */
 938 static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 939                                  unsigned short prio)
 940 {
 941         u64 base_slice = cfqd->cfq_slice[sync];
 942         u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
 943
 944         WARN_ON(prio >= IOPRIO_BE_NR);
 945
 946         return base_slice + (slice * (4 - prio));
 947 }
 948
 949 static inline u64
 950 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 951 {
 952         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 953 }
 954
 955 /**
 956  * cfqg_scale_charge - scale disk time charge according to cfqg weight
 957  * @charge: disk time being charged
 958  * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 959  *
 960  * Scale @charge according to @vfraction, which is in range (0, 1].  The
 961  * scaling is inversely proportional.
 962  *
 963  * scaled = charge / vfraction
 964  *
 965  * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 966  */
 967 static inline u64 cfqg_scale_charge(u64 charge,
 968                                     unsigned int vfraction)
 969 {
 970         u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
 971
 972         /* charge / vfraction */
 973         c <<= CFQ_SERVICE_SHIFT;
 974         return div_u64(c, vfraction);
 975 }
 976
 977 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 978 {
 979         s64 delta = (s64)(vdisktime - min_vdisktime);
 980         if (delta > 0)
 981                 min_vdisktime = vdisktime;
 982
 983         return min_vdisktime;
 984 }
 985
 986 static void update_min_vdisktime(struct cfq_rb_root *st)
 987 {
 988         if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
 989                 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
 990
 991                 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
 992                                                   cfqg->vdisktime);
 993         }
 994 }
 995
 996 /*
 997  * get averaged number of queues of RT/BE priority.
 998  * average is updated, with a formula that gives more weight to higher numbers,
 999  * to quickly follows sudden increases and decrease slowly
1000  */
1001
1002 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1003                                         struct cfq_group *cfqg, bool rt)
1004 {
1005         unsigned min_q, max_q;
1006         unsigned mult  = cfq_hist_divisor - 1;
1007         unsigned round = cfq_hist_divisor / 2;
1008         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1009
1010         min_q = min(cfqg->busy_queues_avg[rt], busy);
1011         max_q = max(cfqg->busy_queues_avg[rt], busy);
1012         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1013                 cfq_hist_divisor;
1014         return cfqg->busy_queues_avg[rt];
1015 }
1016
1017 static inline u64
1018 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1019 {
1020         return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1021 }
1022
1023 static inline u64
1024 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1025 {
1026         u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1027         if (cfqd->cfq_latency) {
1028                 /*
1029                  * interested queues (we consider only the ones with the same
1030                  * priority class in the cfq group)
1031                  */
1032                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1033                                                 cfq_class_rt(cfqq));
1034                 u64 sync_slice = cfqd->cfq_slice[1];
1035                 u64 expect_latency = sync_slice * iq;
1036                 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1037
1038                 if (expect_latency > group_slice) {
1039                         u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1040                         u64 low_slice;
1041
1042                         /* scale low_slice according to IO priority
1043                          * and sync vs async */
1044                         low_slice = div64_u64(base_low_slice*slice, sync_slice);
1045                         low_slice = min(slice, low_slice);
1046                         /* the adapted slice value is scaled to fit all iqs
1047                          * into the target latency */
1048                         slice = div64_u64(slice*group_slice, expect_latency);
1049                         slice = max(slice, low_slice);
1050                 }
1051         }
1052         return slice;
1053 }
1054
1055 static inline void
1056 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1057 {
1058         u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1059         u64 now = ktime_get_ns();
1060
1061         cfqq->slice_start = now;
1062         cfqq->slice_end = now + slice;
1063         cfqq->allocated_slice = slice;
1064         cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1065 }
1066
1067 /*
1068  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1069  * isn't valid until the first request from the dispatch is activated
1070  * and the slice time set.
1071  */
1072 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1073 {
1074         if (cfq_cfqq_slice_new(cfqq))
1075                 return false;
1076         if (ktime_get_ns() < cfqq->slice_end)
1077                 return false;
1078
1079         return true;
1080 }
1081
1082 /*
1083  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1084  * We choose the request that is closest to the head right now. Distance
1085  * behind the head is penalized and only allowed to a certain extent.
1086  */
1087 static struct request *
1088 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1089 {
1090         sector_t s1, s2, d1 = 0, d2 = 0;
1091         unsigned long back_max;
1092 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1093 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1094         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1095
1096         if (rq1 == NULL || rq1 == rq2)
1097                 return rq2;
1098         if (rq2 == NULL)
1099                 return rq1;
1100
1101         if (rq_is_sync(rq1) != rq_is_sync(rq2))
1102                 return rq_is_sync(rq1) ? rq1 : rq2;
1103
1104         if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1105                 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1106
1107         s1 = blk_rq_pos(rq1);
1108         s2 = blk_rq_pos(rq2);
1109
1110         /*
1111          * by definition, 1KiB is 2 sectors
1112          */
1113         back_max = cfqd->cfq_back_max * 2;
1114
1115         /*
1116          * Strict one way elevator _except_ in the case where we allow
1117          * short backward seeks which are biased as twice the cost of a
1118          * similar forward seek.
1119          */
1120         if (s1 >= last)
1121                 d1 = s1 - last;
1122         else if (s1 + back_max >= last)
1123                 d1 = (last - s1) * cfqd->cfq_back_penalty;
1124         else
1125                 wrap |= CFQ_RQ1_WRAP;
1126
1127         if (s2 >= last)
1128                 d2 = s2 - last;
1129         else if (s2 + back_max >= last)
1130                 d2 = (last - s2) * cfqd->cfq_back_penalty;
1131         else
1132                 wrap |= CFQ_RQ2_WRAP;
1133
1134         /* Found required data */
1135
1136         /*
1137          * By doing switch() on the bit mask "wrap" we avoid having to
1138          * check two variables for all permutations: --> faster!
1139          */
1140         switch (wrap) {
1141         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1142                 if (d1 < d2)
1143                         return rq1;
1144                 else if (d2 < d1)
1145                         return rq2;
1146                 else {
1147                         if (s1 >= s2)
1148                                 return rq1;
1149                         else
1150                                 return rq2;
1151                 }
1152
1153         case CFQ_RQ2_WRAP:
1154                 return rq1;
1155         case CFQ_RQ1_WRAP:
1156                 return rq2;
1157         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1158         default:
1159                 /*
1160                  * Since both rqs are wrapped,
1161                  * start with the one that's further behind head
1162                  * (--> only *one* back seek required),
1163                  * since back seek takes more time than forward.
1164                  */
1165                 if (s1 <= s2)
1166                         return rq1;
1167                 else
1168                         return rq2;
1169         }
1170 }
1171
1172 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1173 {
1174         /* Service tree is empty */
1175         if (!root->count)
1176                 return NULL;
1177
1178         return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1179 }
1180
1181 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1182 {
1183         return rb_entry_cfqg(rb_first_cached(&root->rb));
1184 }
1185
1186 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1187 {
1188         if (root->rb_rightmost == n)
1189                 root->rb_rightmost = rb_prev(n);
1190
1191         rb_erase_cached(n, &root->rb);
1192         RB_CLEAR_NODE(n);
1193
1194         --root->count;
1195 }
1196
1197 /*
1198  * would be nice to take fifo expire time into account as well
1199  */
1200 static struct request *
1201 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1202                   struct request *last)
1203 {
1204         struct rb_node *rbnext = rb_next(&last->rb_node);
1205         struct rb_node *rbprev = rb_prev(&last->rb_node);
1206         struct request *next = NULL, *prev = NULL;
1207
1208         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1209
1210         if (rbprev)
1211                 prev = rb_entry_rq(rbprev);
1212
1213         if (rbnext)
1214                 next = rb_entry_rq(rbnext);
1215         else {
1216                 rbnext = rb_first(&cfqq->sort_list);
1217                 if (rbnext && rbnext != &last->rb_node)
1218                         next = rb_entry_rq(rbnext);
1219         }
1220
1221         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1222 }
1223
1224 static u64 cfq_slice_offset(struct cfq_data *cfqd,
1225                             struct cfq_queue *cfqq)
1226 {
1227         /*
1228          * just an approximation, should be ok.
1229          */
1230         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1231                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1232 }
1233
1234 static inline s64
1235 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1236 {
1237         return cfqg->vdisktime - st->min_vdisktime;
1238 }
1239
1240 static void
1241 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1242 {
1243         struct rb_node **node = &st->rb.rb_root.rb_node;
1244         struct rb_node *parent = NULL;
1245         struct cfq_group *__cfqg;
1246         s64 key = cfqg_key(st, cfqg);
1247         bool leftmost = true, rightmost = true;
1248
1249         while (*node != NULL) {
1250                 parent = *node;
1251                 __cfqg = rb_entry_cfqg(parent);
1252
1253                 if (key < cfqg_key(st, __cfqg)) {
1254                         node = &parent->rb_left;
1255                         rightmost = false;
1256                 } else {
1257                         node = &parent->rb_right;
1258                         leftmost = false;
1259                 }
1260         }
1261
1262         if (rightmost)
1263                 st->rb_rightmost = &cfqg->rb_node;
1264
1265         rb_link_node(&cfqg->rb_node, parent, node);
1266         rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1267 }
1268
1269 /*
1270  * This has to be called only on activation of cfqg
1271  */
1272 static void
1273 cfq_update_group_weight(struct cfq_group *cfqg)
1274 {
1275         if (cfqg->new_weight) {
1276                 cfqg->weight = cfqg->new_weight;
1277                 cfqg->new_weight = 0;
1278         }
1279 }
1280
1281 static void
1282 cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1283 {
1284         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1285
1286         if (cfqg->new_leaf_weight) {
1287                 cfqg->leaf_weight = cfqg->new_leaf_weight;
1288                 cfqg->new_leaf_weight = 0;
1289         }
1290 }
1291
/*
 * Put @cfqg on group service tree @st, activate it and compute the
 * vfraction it is entitled to.
 *
 * @cfqg must not be linked into a tree yet.  Any pending leaf weight is
 * applied before insertion.  Activation bumps nr_active and
 * children_weight on @cfqg and, as long as each level was inactive before,
 * on its ancestors.  The resulting vfraction is stored in @cfqg->vfraction
 * and is never smaller than 1.
 */
static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
        unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
        struct cfq_group *pos = cfqg;
        struct cfq_group *parent;
        bool propagate;

        /* add to the service tree */
        BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

        /*
         * Update leaf_weight.  We cannot update weight at this point
         * because cfqg might already have been activated and is
         * contributing its current weight to the parent's child_weight.
         */
        cfq_update_group_leaf_weight(cfqg);
        __cfq_group_service_tree_add(st, cfqg);

        /*
         * Activate @cfqg and calculate the portion of vfraction @cfqg is
         * entitled to.  vfraction is calculated by walking the tree
         * towards the root calculating the fraction it has at each level.
         * The compounded ratio is how much vfraction @cfqg owns.
         *
         * Start with the proportion tasks in this cfqg has against active
         * children cfqgs - its leaf_weight against children_weight.
         */
        /* propagate is true only if @pos had no active users before this */
        propagate = !pos->nr_active++;
        pos->children_weight += pos->leaf_weight;
        vfr = vfr * pos->leaf_weight / pos->children_weight;

        /*
         * Compound ->weight walking up the tree.  Both activation and
         * vfraction calculation are done in the same loop.  Propagation
         * stops once an already activated node is met.  vfraction
         * calculation should always continue to the root.
         */
        while ((parent = cfqg_parent(pos))) {
                if (propagate) {
                        /* pending weight is applied only on fresh activation */
                        cfq_update_group_weight(pos);
                        propagate = !parent->nr_active++;
                        parent->children_weight += pos->weight;
                }
                vfr = vfr * pos->weight / parent->children_weight;
                pos = parent;
        }

        /* integer division may have rounded down to 0; keep at least 1 */
        cfqg->vfraction = max_t(unsigned, vfr, 1);
}
1342
1343 static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1344 {
1345         if (!iops_mode(cfqd))
1346                 return CFQ_SLICE_MODE_GROUP_DELAY;
1347         else
1348                 return CFQ_IOPS_MODE_GROUP_DELAY;
1349 }
1350
1351 static void
1352 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1353 {
1354         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1355         struct cfq_group *__cfqg;
1356         struct rb_node *n;
1357
1358         cfqg->nr_cfqq++;
1359         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360                 return;
1361
1362         /*
1363          * Currently put the group at the end. Later implement something
1364          * so that groups get lesser vtime based on their weights, so that
1365          * if group does not loose all if it was not continuously backlogged.
1366          */
1367         n = st->rb_rightmost;
1368         if (n) {
1369                 __cfqg = rb_entry_cfqg(n);
1370                 cfqg->vdisktime = __cfqg->vdisktime +
1371                         cfq_get_cfqg_vdisktime_delay(cfqd);
1372         } else
1373                 cfqg->vdisktime = st->min_vdisktime;
1374         cfq_group_service_tree_add(st, cfqg);
1375 }
1376
/*
 * Deactivate @cfqg and take it off group service tree @st.
 *
 * Reverses the nr_active and children_weight accounting done on
 * activation, walking towards the root for as long as each level drops to
 * zero active users; every level that becomes inactive gets its vfraction
 * cleared.  Finally @cfqg is erased from @st if it is still linked.
 */
static void
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
        struct cfq_group *pos = cfqg;
        bool propagate;

        /*
         * Undo activation from cfq_group_service_tree_add().  Deactivate
         * @cfqg and propagate deactivation upwards.
         */
        /* propagate is true only if this was @pos's last active user */
        propagate = !--pos->nr_active;
        pos->children_weight -= pos->leaf_weight;

        while (propagate) {
                struct cfq_group *parent = cfqg_parent(pos);

                /* @pos has 0 nr_active at this point */
                WARN_ON_ONCE(pos->children_weight);
                pos->vfraction = 0;

                /* reached a group without parent, nothing left to update */
                if (!parent)
                        break;

                propagate = !--parent->nr_active;
                parent->children_weight -= pos->weight;
                pos = parent;
        }

        /* remove from the service tree */
        if (!RB_EMPTY_NODE(&cfqg->rb_node))
                cfq_rb_erase(&cfqg->rb_node, st);
}
1409
1410 static void
1411 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1412 {
1413         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1414
1415         BUG_ON(cfqg->nr_cfqq < 1);
1416         cfqg->nr_cfqq--;
1417
1418         /* If there are other cfq queues under this group, don't delete it */
1419         if (cfqg->nr_cfqq)
1420                 return;
1421
1422         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1423         cfq_group_service_tree_del(st, cfqg);
1424         cfqg->saved_wl_slice = 0;
1425         cfqg_stats_update_dequeue(cfqg);
1426 }
1427
1428 static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1429                                        u64 *unaccounted_time)
1430 {
1431         u64 slice_used;
1432         u64 now = ktime_get_ns();
1433
1434         /*
1435          * Queue got expired before even a single request completed or
1436          * got expired immediately after first request completion.
1437          */
1438         if (!cfqq->slice_start || cfqq->slice_start == now) {
1439                 /*
1440                  * Also charge the seek time incurred to the group, otherwise
1441                  * if there are mutiple queues in the group, each can dispatch
1442                  * a single request on seeky media and cause lots of seek time
1443                  * and group will never know it.
1444                  */
1445                 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1446                                         jiffies_to_nsecs(1));
1447         } else {
1448                 slice_used = now - cfqq->slice_start;
1449                 if (slice_used > cfqq->allocated_slice) {
1450                         *unaccounted_time = slice_used - cfqq->allocated_slice;
1451                         slice_used = cfqq->allocated_slice;
1452                 }
1453                 if (cfqq->slice_start > cfqq->dispatch_start)
1454                         *unaccounted_time += cfqq->slice_start -
1455                                         cfqq->dispatch_start;
1456         }
1457
1458         return slice_used;
1459 }
1460
/*
 * Charge @cfqg for the service @cfqq just received and re-sort the group
 * on the group service tree.
 *
 * The charge is normally the slice time used by @cfqq.  In IOPS mode it is
 * the number of requests dispatched in the slice instead; for an async
 * queue in a group without sync queues it is the whole allocated slice.
 * The charge is scaled by the group's vfraction and added to its
 * vdisktime.  The remaining workload slice, if any, is saved in @cfqg, and
 * the group's time slice stats are updated.
 */
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
                                struct cfq_queue *cfqq)
{
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
        u64 used_sl, charge, unaccounted_sl = 0;
        /* queues of the group that sit neither on an async nor the idle tree */
        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
                        - cfqg->service_tree_idle.count;
        unsigned int vfr;
        u64 now = ktime_get_ns();

        BUG_ON(nr_sync < 0);
        used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);

        if (iops_mode(cfqd))
                charge = cfqq->slice_dispatch;
        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
                charge = cfqq->allocated_slice;

        /*
         * Can't update vdisktime while on service tree and cfqg->vfraction
         * is valid only while on it.  Cache vfr, leave the service tree,
         * update vdisktime and go back on.  The re-addition to the tree
         * will also update the weights as necessary.
         */
        vfr = cfqg->vfraction;
        cfq_group_service_tree_del(st, cfqg);
        cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
        cfq_group_service_tree_add(st, cfqg);

        /* This group is being expired. Save the context */
        if (cfqd->workload_expires > now) {
                cfqg->saved_wl_slice = cfqd->workload_expires - now;
                cfqg->saved_wl_type = cfqd->serving_wl_type;
                cfqg->saved_wl_class = cfqd->serving_wl_class;
        } else
                cfqg->saved_wl_slice = 0;

        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                                        st->min_vdisktime);
        cfq_log_cfqq(cfqq->cfqd, cfqq,
                     "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
                     used_sl, cfqq->slice_dispatch, charge,
                     iops_mode(cfqd), cfqq->nr_sectors);
        /* stats always record the measured usage, not the adjusted charge */
        cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
        cfqg_stats_set_start_empty_time(cfqg);
}
1507
1508 /**
1509  * cfq_init_cfqg_base - initialize base part of a cfq_group
1510  * @cfqg: cfq_group to initialize
1511  *
1512  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1513  * is enabled or not.
1514  */
1515 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1516 {
1517         struct cfq_rb_root *st;
1518         int i, j;
1519
1520         for_each_cfqg_st(cfqg, i, j, st)
1521                 *st = CFQ_RB_ROOT;
1522         RB_CLEAR_NODE(&cfqg->rb_node);
1523
1524         cfqg->ttime.last_end_request = ktime_get_ns();
1525 }
1526
1527 #ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
 * Forward declaration; the definition is not part of this section of the
 * file.
 */
static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
                            bool on_dfl, bool reset_dev, bool is_leaf_weight);
1530
/*
 * Release every counter embedded in @stats.  Counterpart of
 * cfqg_stats_init(), which also calls this on its error path, i.e. on a
 * @stats whose counters may only be partially initialized.
 */
static void cfqg_stats_exit(struct cfqg_stats *stats)
{
        /* counters that are always present */
        blkg_rwstat_exit(&stats->merged);
        blkg_rwstat_exit(&stats->service_time);
        blkg_rwstat_exit(&stats->wait_time);
        blkg_rwstat_exit(&stats->queued);
        blkg_stat_exit(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
        /* counters that only exist in debug builds */
        blkg_stat_exit(&stats->unaccounted_time);
        blkg_stat_exit(&stats->avg_queue_size_sum);
        blkg_stat_exit(&stats->avg_queue_size_samples);
        blkg_stat_exit(&stats->dequeue);
        blkg_stat_exit(&stats->group_wait_time);
        blkg_stat_exit(&stats->idle_time);
        blkg_stat_exit(&stats->empty_time);
#endif
}
1548
1549 static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1550 {
1551         if (blkg_rwstat_init(&stats->merged, gfp) ||
1552             blkg_rwstat_init(&stats->service_time, gfp) ||
1553             blkg_rwstat_init(&stats->wait_time, gfp) ||
1554             blkg_rwstat_init(&stats->queued, gfp) ||
1555             blkg_stat_init(&stats->time, gfp))
1556                 goto err;
1557
1558 #ifdef CONFIG_DEBUG_BLK_CGROUP
1559         if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1560             blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1561             blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1562             blkg_stat_init(&stats->dequeue, gfp) ||
1563             blkg_stat_init(&stats->group_wait_time, gfp) ||
1564             blkg_stat_init(&stats->idle_time, gfp) ||
1565             blkg_stat_init(&stats->empty_time, gfp))
1566                 goto err;
1567 #endif
1568         return 0;
1569 err:
1570         cfqg_stats_exit(stats);
1571         return -ENOMEM;
1572 }
1573
1574 static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1575 {
1576         struct cfq_group_data *cgd;
1577
1578         cgd = kzalloc(sizeof(*cgd), gfp);
1579         if (!cgd)
1580                 return NULL;
1581         return &cgd->cpd;
1582 }
1583
1584 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1585 {
1586         struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1587         unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1588                               CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1589
1590         if (cpd_to_blkcg(cpd) == &blkcg_root)
1591                 weight *= 2;
1592
1593         cgd->weight = weight;
1594         cgd->leaf_weight = weight;
1595 }
1596
/* Free the cfq_group_data that embeds @cpd. */
static void cfq_cpd_free(struct blkcg_policy_data *cpd)
{
        struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);

        kfree(cgd);
}
1601
1602 static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1603 {
1604         struct blkcg *blkcg = cpd_to_blkcg(cpd);
1605         bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1606         unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1607
1608         if (blkcg == &blkcg_root)
1609                 weight *= 2;
1610
1611         WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1612         WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1613 }
1614
1615 static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1616 {
1617         struct cfq_group *cfqg;
1618
1619         cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1620         if (!cfqg)
1621                 return NULL;
1622
1623         cfq_init_cfqg_base(cfqg);
1624         if (cfqg_stats_init(&cfqg->stats, gfp)) {
1625                 kfree(cfqg);
1626                 return NULL;
1627         }
1628
1629         return &cfqg->pd;
1630 }
1631
1632 static void cfq_pd_init(struct blkg_policy_data *pd)
1633 {
1634         struct cfq_group *cfqg = pd_to_cfqg(pd);
1635         struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1636
1637         cfqg->weight = cgd->weight;
1638         cfqg->leaf_weight = cgd->leaf_weight;
1639 }
1640
1641 static void cfq_pd_offline(struct blkg_policy_data *pd)
1642 {
1643         struct cfq_group *cfqg = pd_to_cfqg(pd);
1644         int i;
1645
1646         for (i = 0; i < IOPRIO_BE_NR; i++) {
1647                 if (cfqg->async_cfqq[0][i])
1648                         cfq_put_queue(cfqg->async_cfqq[0][i]);
1649                 if (cfqg->async_cfqq[1][i])
1650                         cfq_put_queue(cfqg->async_cfqq[1][i]);
1651         }
1652
1653         if (cfqg->async_idle_cfqq)
1654                 cfq_put_queue(cfqg->async_idle_cfqq);
1655
1656         /*
1657          * @blkg is going offline and will be ignored by
1658          * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
1659          * that they don't get lost.  If IOs complete after this point, the
1660          * stats for them will be lost.  Oh well...
1661          */
1662         cfqg_stats_xfer_dead(cfqg);
1663 }
1664
1665 static void cfq_pd_free(struct blkg_policy_data *pd)
1666 {
1667         struct cfq_group *cfqg = pd_to_cfqg(pd);
1668
1669         cfqg_stats_exit(&cfqg->stats);
1670         return kfree(cfqg);
1671 }
1672
1673 static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1674 {
1675         struct cfq_group *cfqg = pd_to_cfqg(pd);
1676
1677         cfqg_stats_reset(&cfqg->stats);
1678 }
1679
1680 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1681                                          struct blkcg *blkcg)
1682 {
1683         struct blkcg_gq *blkg;
1684
1685         blkg = blkg_lookup(blkcg, cfqd->queue);
1686         if (likely(blkg))
1687                 return blkg_to_cfqg(blkg);
1688         return NULL;
1689 }
1690
1691 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1692 {
1693         cfqq->cfqg = cfqg;
1694         /* cfqq reference on cfqg */
1695         cfqg_get(cfqg);
1696 }
1697
1698 static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1699                                      struct blkg_policy_data *pd, int off)
1700 {
1701         struct cfq_group *cfqg = pd_to_cfqg(pd);
1702
1703         if (!cfqg->dev_weight)
1704                 return 0;
1705         return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1706 }
1707
1708 static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1709 {
1710         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1711                           cfqg_prfill_weight_device, &blkcg_policy_cfq,
1712                           0, false);
1713         return 0;
1714 }
1715
1716 static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1717                                           struct blkg_policy_data *pd, int off)
1718 {
1719         struct cfq_group *cfqg = pd_to_cfqg(pd);
1720
1721         if (!cfqg->dev_leaf_weight)
1722                 return 0;
1723         return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1724 }
1725
1726 static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1727 {
1728         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1729                           cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1730                           0, false);
1731         return 0;
1732 }
1733
1734 static int cfq_print_weight(struct seq_file *sf, void *v)
1735 {
1736         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1737         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1738         unsigned int val = 0;
1739
1740         if (cgd)
1741                 val = cgd->weight;
1742
1743         seq_printf(sf, "%u\n", val);
1744         return 0;
1745 }
1746
1747 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1748 {
1749         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1750         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1751         unsigned int val = 0;
1752
1753         if (cgd)
1754                 val = cgd->leaf_weight;
1755
1756         seq_printf(sf, "%u\n", val);
1757         return 0;
1758 }
1759
1760 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1761                                         char *buf, size_t nbytes, loff_t off,
1762                                         bool on_dfl, bool is_leaf_weight)
1763 {
1764         unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1765         unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1766         struct blkcg *blkcg = css_to_blkcg(of_css(of));
1767         struct blkg_conf_ctx ctx;
1768         struct cfq_group *cfqg;
1769         struct cfq_group_data *cfqgd;
1770         int ret;
1771         u64 v;
1772
1773         ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1774         if (ret)
1775                 return ret;
1776
1777         if (sscanf(ctx.body, "%llu", &v) == 1) {
1778                 /* require "default" on dfl */
1779                 ret = -ERANGE;
1780                 if (!v && on_dfl)
1781                         goto out_finish;
1782         } else if (!strcmp(strim(ctx.body), "default")) {
1783                 v = 0;
1784         } else {
1785                 ret = -EINVAL;
1786                 goto out_finish;
1787         }
1788
1789         cfqg = blkg_to_cfqg(ctx.blkg);
1790         cfqgd = blkcg_to_cfqgd(blkcg);
1791
1792         ret = -ERANGE;
1793         if (!v || (v >= min && v <= max)) {
1794                 if (!is_leaf_weight) {
1795                         cfqg->dev_weight = v;
1796                         cfqg->new_weight = v ?: cfqgd->weight;
1797                 } else {
1798                         cfqg->dev_leaf_weight = v;
1799                         cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1800                 }
1801                 ret = 0;
1802         }
1803 out_finish:
1804         blkg_conf_finish(&ctx);
1805         return ret ?: nbytes;
1806 }
1807
1808 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1809                                       char *buf, size_t nbytes, loff_t off)
1810 {
1811         return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1812 }
1813
1814 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1815                                            char *buf, size_t nbytes, loff_t off)
1816 {
1817         return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1818 }
1819
1820 static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1821                             bool on_dfl, bool reset_dev, bool is_leaf_weight)
1822 {
1823         unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1824         unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1825         struct blkcg *blkcg = css_to_blkcg(css);
1826         struct blkcg_gq *blkg;
1827         struct cfq_group_data *cfqgd;
1828         int ret = 0;
1829
1830         if (val < min || val > max)
1831                 return -ERANGE;
1832
1833         spin_lock_irq(&blkcg->lock);
1834         cfqgd = blkcg_to_cfqgd(blkcg);
1835         if (!cfqgd) {
1836                 ret = -EINVAL;
1837                 goto out;
1838         }
1839
1840         if (!is_leaf_weight)
1841                 cfqgd->weight = val;
1842         else
1843                 cfqgd->leaf_weight = val;
1844
1845         hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1846                 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1847
1848                 if (!cfqg)
1849                         continue;
1850
1851                 if (!is_leaf_weight) {
1852                         if (reset_dev)
1853                                 cfqg->dev_weight = 0;
1854                         if (!cfqg->dev_weight)
1855                                 cfqg->new_weight = cfqgd->weight;
1856                 } else {
1857                         if (reset_dev)
1858                                 cfqg->dev_leaf_weight = 0;
1859                         if (!cfqg->dev_leaf_weight)
1860                                 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1861                 }
1862         }
1863
1864 out:
1865         spin_unlock_irq(&blkcg->lock);
1866         return ret;
1867 }
1868
1869 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1870                           u64 val)
1871 {
1872         return __cfq_set_weight(css, val, false, false, false);
1873 }
1874
1875 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1876                                struct cftype *cft, u64 val)
1877 {
1878         return __cfq_set_weight(css, val, false, false, true);
1879 }
1880
1881 static int cfqg_print_stat(struct seq_file *sf, void *v)
1882 {
1883         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1884                           &blkcg_policy_cfq, seq_cft(sf)->private, false);
1885         return 0;
1886 }
1887
1888 static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1889 {
1890         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1891                           &blkcg_policy_cfq, seq_cft(sf)->private, true);
1892         return 0;
1893 }
1894
1895 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1896                                       struct blkg_policy_data *pd, int off)
1897 {
1898         u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1899                                           &blkcg_policy_cfq, off);
1900         return __blkg_prfill_u64(sf, pd, sum);
1901 }
1902
1903 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1904                                         struct blkg_policy_data *pd, int off)
1905 {
1906         struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1907                                                         &blkcg_policy_cfq, off);
1908         return __blkg_prfill_rwstat(sf, pd, &sum);
1909 }
1910
1911 static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1912 {
1913         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1914                           cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1915                           seq_cft(sf)->private, false);
1916         return 0;
1917 }
1918
1919 static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1920 {
1921         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1922                           cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1923                           seq_cft(sf)->private, true);
1924         return 0;
1925 }
1926
1927 static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1928                                int off)
1929 {
1930         u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1931
1932         return __blkg_prfill_u64(sf, pd, sum >> 9);
1933 }
1934
1935 static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1936 {
1937         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1938                           cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1939         return 0;
1940 }
1941
1942 static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1943                                          struct blkg_policy_data *pd, int off)
1944 {
1945         struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1946                                         offsetof(struct blkcg_gq, stat_bytes));
1947         u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1948                 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1949
1950         return __blkg_prfill_u64(sf, pd, sum >> 9);
1951 }
1952
1953 static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1954 {
1955         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1956                           cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1957                           false);
1958         return 0;
1959 }
1960
1961 #ifdef CONFIG_DEBUG_BLK_CGROUP
1962 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1963                                       struct blkg_policy_data *pd, int off)
1964 {
1965         struct cfq_group *cfqg = pd_to_cfqg(pd);
1966         u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1967         u64 v = 0;
1968
1969         if (samples) {
1970                 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1971                 v = div64_u64(v, samples);
1972         }
1973         __blkg_prfill_u64(sf, pd, v);
1974         return 0;
1975 }
1976
1977 /* print avg_queue_size */
1978 static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1979 {
1980         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1981                           cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1982                           0, false);
1983         return 0;
1984 }
1985 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
1986
1987 static struct cftype cfq_blkcg_legacy_files[] = {
1988         /* on root, weight is mapped to leaf_weight */
1989         {
1990                 .name = "weight_device",
1991                 .flags = CFTYPE_ONLY_ON_ROOT,
1992                 .seq_show = cfqg_print_leaf_weight_device,
1993                 .write = cfqg_set_leaf_weight_device,
1994         },
1995         {
1996                 .name = "weight",
1997                 .flags = CFTYPE_ONLY_ON_ROOT,
1998                 .seq_show = cfq_print_leaf_weight,
1999                 .write_u64 = cfq_set_leaf_weight,
2000         },
2001
2002         /* no such mapping necessary for !roots */
2003         {
2004                 .name = "weight_device",
2005                 .flags = CFTYPE_NOT_ON_ROOT,
2006                 .seq_show = cfqg_print_weight_device,
2007                 .write = cfqg_set_weight_device,
2008         },
2009         {
2010                 .name = "weight",
2011                 .flags = CFTYPE_NOT_ON_ROOT,
2012                 .seq_show = cfq_print_weight,
2013                 .write_u64 = cfq_set_weight,
2014         },
2015
2016         {
2017                 .name = "leaf_weight_device",
2018                 .seq_show = cfqg_print_leaf_weight_device,
2019                 .write = cfqg_set_leaf_weight_device,
2020         },
2021         {
2022                 .name = "leaf_weight",
2023                 .seq_show = cfq_print_leaf_weight,
2024                 .write_u64 = cfq_set_leaf_weight,
2025         },
2026
2027         /* statistics, covers only the tasks in the cfqg */
2028         {
2029                 .name = "time",
2030                 .private = offsetof(struct cfq_group, stats.time),
2031                 .seq_show = cfqg_print_stat,
2032         },
2033         {
2034                 .name = "sectors",
2035                 .seq_show = cfqg_print_stat_sectors,
2036         },
2037         {
2038                 .name = "io_service_bytes",
2039                 .private = (unsigned long)&blkcg_policy_cfq,
2040                 .seq_show = blkg_print_stat_bytes,
2041         },
2042         {
2043                 .name = "io_serviced",
2044                 .private = (unsigned long)&blkcg_policy_cfq,
2045                 .seq_show = blkg_print_stat_ios,
2046         },
2047         {
2048                 .name = "io_service_time",
2049                 .private = offsetof(struct cfq_group, stats.service_time),
2050                 .seq_show = cfqg_print_rwstat,
2051         },
2052         {
2053                 .name = "io_wait_time",
2054                 .private = offsetof(struct cfq_group, stats.wait_time),
2055                 .seq_show = cfqg_print_rwstat,
2056         },
2057         {
2058                 .name = "io_merged",
2059                 .private = offsetof(struct cfq_group, stats.merged),
2060                 .seq_show = cfqg_print_rwstat,
2061         },
2062         {
2063                 .name = "io_queued",
2064                 .private = offsetof(struct cfq_group, stats.queued),
2065                 .seq_show = cfqg_print_rwstat,
2066         },
2067
2068         /* the same statictics which cover the cfqg and its descendants */
2069         {
2070                 .name = "time_recursive",
2071                 .private = offsetof(struct cfq_group, stats.time),
2072                 .seq_show = cfqg_print_stat_recursive,
2073         },
2074         {
2075                 .name = "sectors_recursive",
2076                 .seq_show = cfqg_print_stat_sectors_recursive,
2077         },
2078         {
2079                 .name = "io_service_bytes_recursive",
2080                 .private = (unsigned long)&blkcg_policy_cfq,
2081                 .seq_show = blkg_print_stat_bytes_recursive,
2082         },
2083         {
2084                 .name = "io_serviced_recursive",
2085                 .private = (unsigned long)&blkcg_policy_cfq,
2086                 .seq_show = blkg_print_stat_ios_recursive,
2087         },
2088         {
2089                 .name = "io_service_time_recursive",
2090                 .private = offsetof(struct cfq_group, stats.service_time),
2091                 .seq_show = cfqg_print_rwstat_recursive,
2092         },
2093         {
2094                 .name = "io_wait_time_recursive",
2095                 .private = offsetof(struct cfq_group, stats.wait_time),
2096                 .seq_show = cfqg_print_rwstat_recursive,
2097         },
2098         {
2099                 .name = "io_merged_recursive",
2100                 .private = offsetof(struct cfq_group, stats.merged),
2101                 .seq_show = cfqg_print_rwstat_recursive,
2102         },
2103         {
2104                 .name = "io_queued_recursive",
2105                 .private = offsetof(struct cfq_group, stats.queued),
2106                 .seq_show = cfqg_print_rwstat_recursive,
2107         },
2108 #ifdef CONFIG_DEBUG_BLK_CGROUP
2109         {
2110                 .name = "avg_queue_size",
2111                 .seq_show = cfqg_print_avg_queue_size,
2112         },
2113         {
2114                 .name = "group_wait_time",
2115                 .private = offsetof(struct cfq_group, stats.group_wait_time),
2116                 .seq_show = cfqg_print_stat,
2117         },
2118         {
2119                 .name = "idle_time",
2120                 .private = offsetof(struct cfq_group, stats.idle_time),
2121                 .seq_show = cfqg_print_stat,
2122         },
2123         {
2124                 .name = "empty_time",
2125                 .private = offsetof(struct cfq_group, stats.empty_time),
2126                 .seq_show = cfqg_print_stat,
2127         },
2128         {
2129                 .name = "dequeue",
2130                 .private = offsetof(struct cfq_group, stats.dequeue),
2131                 .seq_show = cfqg_print_stat,
2132         },
2133         {
2134                 .name = "unaccounted_time",
2135                 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2136                 .seq_show = cfqg_print_stat,
2137         },
2138 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
2139         { }     /* terminate */
2140 };
2141
2142 static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2143 {
2144         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2145         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2146
2147         seq_printf(sf, "default %u\n", cgd->weight);
2148         blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2149                           &blkcg_policy_cfq, 0, false);
2150         return 0;
2151 }
2152
2153 static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2154                                      char *buf, size_t nbytes, loff_t off)
2155 {
2156         char *endp;
2157         int ret;
2158         u64 v;
2159
2160         buf = strim(buf);
2161
2162         /* "WEIGHT" or "default WEIGHT" sets the default weight */
2163         v = simple_strtoull(buf, &endp, 0);
2164         if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2165                 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2166                 return ret ?: nbytes;
2167         }
2168
2169         /* "MAJ:MIN WEIGHT" */
2170         return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2171 }
2172
2173 static struct cftype cfq_blkcg_files[] = {
2174         {
2175                 .name = "weight",
2176                 .flags = CFTYPE_NOT_ON_ROOT,
2177                 .seq_show = cfq_print_weight_on_dfl,
2178                 .write = cfq_set_weight_on_dfl,
2179         },
2180         { }     /* terminate */
2181 };
2182
2183 #else /* GROUP_IOSCHED */
2184 static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2185                                          struct blkcg *blkcg)
2186 {
2187         return cfqd->root_group;
2188 }
2189
2190 static inline void
2191 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2192         cfqq->cfqg = cfqg;
2193 }
2194
2195 #endif /* GROUP_IOSCHED */
2196
2197 /*
2198  * The cfqd->service_trees holds all pending cfq_queue's that have
2199  * requests waiting to be processed. It is sorted in the order that
2200  * we will service the queues.
2201  */
2202 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2203                                  bool add_front)
2204 {
2205         struct rb_node **p, *parent;
2206         struct cfq_queue *__cfqq;
2207         u64 rb_key;
2208         struct cfq_rb_root *st;
2209         bool leftmost = true;
2210         int new_cfqq = 1;
2211         u64 now = ktime_get_ns();
2212
2213         st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2214         if (cfq_class_idle(cfqq)) {
2215                 rb_key = CFQ_IDLE_DELAY;
2216                 parent = st->rb_rightmost;
2217                 if (parent && parent != &cfqq->rb_node) {
2218                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2219                         rb_key += __cfqq->rb_key;
2220                 } else
2221                         rb_key += now;
2222         } else if (!add_front) {
2223                 /*
2224                  * Get our rb key offset. Subtract any residual slice
2225                  * value carried from last service. A negative resid
2226                  * count indicates slice overrun, and this should position
2227                  * the next service time further away in the tree.
2228                  */
2229                 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2230                 rb_key -= cfqq->slice_resid;
2231                 cfqq->slice_resid = 0;
2232         } else {
2233                 rb_key = -NSEC_PER_SEC;
2234                 __cfqq = cfq_rb_first(st);
2235                 rb_key += __cfqq ? __cfqq->rb_key : now;
2236         }
2237
2238         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2239                 new_cfqq = 0;
2240                 /*
2241                  * same position, nothing more to do
2242                  */
2243                 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2244                         return;
2245
2246                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2247                 cfqq->service_tree = NULL;
2248         }
2249
2250         parent = NULL;
2251         cfqq->service_tree = st;
2252         p = &st->rb.rb_root.rb_node;
2253         while (*p) {
2254                 parent = *p;
2255                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2256
2257                 /*
2258                  * sort by key, that represents service time.
2259                  */
2260                 if (rb_key < __cfqq->rb_key)
2261                         p = &parent->rb_left;
2262                 else {
2263                         p = &parent->rb_right;
2264                         leftmost = false;
2265                 }
2266         }
2267
2268         cfqq->rb_key = rb_key;
2269         rb_link_node(&cfqq->rb_node, parent, p);
2270         rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2271         st->count++;
2272         if (add_front || !new_cfqq)
2273                 return;
2274         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2275 }
2276
2277 static struct cfq_queue *
2278 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2279                      sector_t sector, struct rb_node **ret_parent,
2280                      struct rb_node ***rb_link)
2281 {
2282         struct rb_node **p, *parent;
2283         struct cfq_queue *cfqq = NULL;
2284
2285         parent = NULL;
2286         p = &root->rb_node;
2287         while (*p) {
2288                 struct rb_node **n;
2289
2290                 parent = *p;
2291                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2292
2293                 /*
2294                  * Sort strictly based on sector.  Smallest to the left,
2295                  * largest to the right.
2296                  */
2297                 if (sector > blk_rq_pos(cfqq->next_rq))
2298                         n = &(*p)->rb_right;
2299                 else if (sector < blk_rq_pos(cfqq->next_rq))
2300                         n = &(*p)->rb_left;
2301                 else
2302                         break;
2303                 p = n;
2304                 cfqq = NULL;
2305         }
2306
2307         *ret_parent = parent;
2308         if (rb_link)
2309                 *rb_link = p;
2310         return cfqq;
2311 }
2312
2313 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2314 {
2315         struct rb_node **p, *parent;
2316         struct cfq_queue *__cfqq;
2317
2318         if (cfqq->p_root) {
2319                 rb_erase(&cfqq->p_node, cfqq->p_root);
2320                 cfqq->p_root = NULL;
2321         }
2322
2323         if (cfq_class_idle(cfqq))
2324                 return;
2325         if (!cfqq->next_rq)
2326                 return;
2327
2328         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2329         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2330                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
2331         if (!__cfqq) {
2332                 rb_link_node(&cfqq->p_node, parent, p);
2333                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2334         } else
2335                 cfqq->p_root = NULL;
2336 }
2337
2338 /*
2339  * Update cfqq's position in the service tree.
2340  */
2341 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2342 {
2343         /*
2344          * Resorting requires the cfqq to be on the RR list already.
2345          */
2346         if (cfq_cfqq_on_rr(cfqq)) {
2347                 cfq_service_tree_add(cfqd, cfqq, 0);
2348                 cfq_prio_tree_add(cfqd, cfqq);
2349         }
2350 }
2351
2352 /*
2353  * add to busy list of queues for service, trying to be fair in ordering
2354  * the pending list according to last request service
2355  */
2356 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2357 {
2358         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2359         BUG_ON(cfq_cfqq_on_rr(cfqq));
2360         cfq_mark_cfqq_on_rr(cfqq);
2361         cfqd->busy_queues++;
2362         if (cfq_cfqq_sync(cfqq))
2363                 cfqd->busy_sync_queues++;
2364
2365         cfq_resort_rr_list(cfqd, cfqq);
2366 }
2367
2368 /*
2369  * Called when the cfqq no longer has requests pending, remove it from
2370  * the service tree.
2371  */
2372 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2373 {
2374         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2375         BUG_ON(!cfq_cfqq_on_rr(cfqq));
2376         cfq_clear_cfqq_on_rr(cfqq);
2377
2378         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2379                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2380                 cfqq->service_tree = NULL;
2381         }
2382         if (cfqq->p_root) {
2383                 rb_erase(&cfqq->p_node, cfqq->p_root);
2384                 cfqq->p_root = NULL;
2385         }
2386
2387         cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2388         BUG_ON(!cfqd->busy_queues);
2389         cfqd->busy_queues--;
2390         if (cfq_cfqq_sync(cfqq))
2391                 cfqd->busy_sync_queues--;
2392 }
2393
2394 /*
2395  * rb tree support functions
2396  */
2397 static void cfq_del_rq_rb(struct request *rq)
2398 {
2399         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2400         const int sync = rq_is_sync(rq);
2401
2402         BUG_ON(!cfqq->queued[sync]);
2403         cfqq->queued[sync]--;
2404
2405         elv_rb_del(&cfqq->sort_list, rq);
2406
2407         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2408                 /*
2409                  * Queue will be deleted from service tree when we actually
2410                  * expire it later. Right now just remove it from prio tree
2411                  * as it is empty.
2412                  */
2413                 if (cfqq->p_root) {
2414                         rb_erase(&cfqq->p_node, cfqq->p_root);
2415                         cfqq->p_root = NULL;
2416                 }
2417         }
2418 }
2419
2420 static void cfq_add_rq_rb(struct request *rq)
2421 {
2422         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2423         struct cfq_data *cfqd = cfqq->cfqd;
2424         struct request *prev;
2425
2426         cfqq->queued[rq_is_sync(rq)]++;
2427
2428         elv_rb_add(&cfqq->sort_list, rq);
2429
2430         if (!cfq_cfqq_on_rr(cfqq))
2431                 cfq_add_cfqq_rr(cfqd, cfqq);
2432
2433         /*
2434          * check if this request is a better next-serve candidate
2435          */
2436         prev = cfqq->next_rq;
2437         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2438
2439         /*
2440          * adjust priority tree position, if ->next_rq changes
2441          */
2442         if (prev != cfqq->next_rq)
2443                 cfq_prio_tree_add(cfqd, cfqq);
2444
2445         BUG_ON(!cfqq->next_rq);
2446 }
2447
2448 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2449 {
2450         elv_rb_del(&cfqq->sort_list, rq);
2451         cfqq->queued[rq_is_sync(rq)]--;
2452         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2453         cfq_add_rq_rb(rq);
2454         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2455                                  rq->cmd_flags);
2456 }
2457
2458 static struct request *
2459 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2460 {
2461         struct task_struct *tsk = current;
2462         struct cfq_io_cq *cic;
2463         struct cfq_queue *cfqq;
2464
2465         cic = cfq_cic_lookup(cfqd, tsk->io_context);
2466         if (!cic)
2467                 return NULL;
2468
2469         cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2470         if (cfqq)
2471                 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2472
2473         return NULL;
2474 }
2475
2476 static void cfq_activate_request(struct request_queue *q, struct request *rq)
2477 {
2478         struct cfq_data *cfqd = q->elevator->elevator_data;
2479
2480         cfqd->rq_in_driver++;
2481         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2482                                                 cfqd->rq_in_driver);
2483
2484         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2485 }
2486
2487 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2488 {
2489         struct cfq_data *cfqd = q->elevator->elevator_data;
2490
2491         WARN_ON(!cfqd->rq_in_driver);
2492         cfqd->rq_in_driver--;
2493         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2494                                                 cfqd->rq_in_driver);
2495 }
2496
2497 static void cfq_remove_request(struct request *rq)
2498 {
2499         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2500
2501         if (cfqq->next_rq == rq)
2502                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2503
2504         list_del_init(&rq->queuelist);
2505         cfq_del_rq_rb(rq);
2506
2507         cfqq->cfqd->rq_queued--;
2508         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2509         if (rq->cmd_flags & REQ_PRIO) {
2510                 WARN_ON(!cfqq->prio_pending);
2511                 cfqq->prio_pending--;
2512         }
2513 }
2514
2515 static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2516                      struct bio *bio)
2517 {
2518         struct cfq_data *cfqd = q->elevator->elevator_data;
2519         struct request *__rq;
2520
2521         __rq = cfq_find_rq_fmerge(cfqd, bio);
2522         if (__rq && elv_bio_merge_ok(__rq, bio)) {
2523                 *req = __rq;
2524                 return ELEVATOR_FRONT_MERGE;
2525         }
2526
2527         return ELEVATOR_NO_MERGE;
2528 }
2529
2530 static void cfq_merged_request(struct request_queue *q, struct request *req,
2531                                enum elv_merge type)
2532 {
2533         if (type == ELEVATOR_FRONT_MERGE) {
2534                 struct cfq_queue *cfqq = RQ_CFQQ(req);
2535
2536                 cfq_reposition_rq_rb(cfqq, req);
2537         }
2538 }
2539
2540 static void cfq_bio_merged(struct request_queue *q, struct request *req,
2541                                 struct bio *bio)
2542 {
2543         cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2544 }
2545
2546 static void
2547 cfq_merged_requests(struct request_queue *q, struct request *rq,
2548                     struct request *next)
2549 {
2550         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2551         struct cfq_data *cfqd = q->elevator->elevator_data;
2552
2553         /*
2554          * reposition in fifo if next is older than rq
2555          */
2556         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2557             next->fifo_time < rq->fifo_time &&
2558             cfqq == RQ_CFQQ(next)) {
2559                 list_move(&rq->queuelist, &next->queuelist);
2560                 rq->fifo_time = next->fifo_time;
2561         }
2562
2563         if (cfqq->next_rq == next)
2564                 cfqq->next_rq = rq;
2565         cfq_remove_request(next);
2566         cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2567
2568         cfqq = RQ_CFQQ(next);
2569         /*
2570          * all requests of this queue are merged to other queues, delete it
2571          * from the service tree. If it's the active_queue,
2572          * cfq_dispatch_requests() will choose to expire it or do idle
2573          */
2574         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2575             cfqq != cfqd->active_queue)
2576                 cfq_del_cfqq_rr(cfqd, cfqq);
2577 }
2578
2579 static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2580                                struct bio *bio)
2581 {
2582         struct cfq_data *cfqd = q->elevator->elevator_data;
2583         bool is_sync = op_is_sync(bio->bi_opf);
2584         struct cfq_io_cq *cic;
2585         struct cfq_queue *cfqq;
2586
2587         /*
2588          * Disallow merge of a sync bio into an async request.
2589          */
2590         if (is_sync && !rq_is_sync(rq))
2591                 return false;
2592
2593         /*
2594          * Lookup the cfqq that this bio will be queued with and allow
2595          * merge only if rq is queued there.
2596          */
2597         cic = cfq_cic_lookup(cfqd, current->io_context);
2598         if (!cic)
2599                 return false;
2600
2601         cfqq = cic_to_cfqq(cic, is_sync);
2602         return cfqq == RQ_CFQQ(rq);
2603 }
2604
/*
 * Two requests may only be merged with each other when both belong to
 * the same cfq queue.
 */
static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
                              struct request *next)
{
        struct cfq_queue *rq_cfqq = RQ_CFQQ(rq);

        return rq_cfqq == RQ_CFQQ(next);
}
2610
2611 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2612 {
2613         hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2614         cfqg_stats_update_idle_time(cfqq->cfqg);
2615 }
2616
2617 static void __cfq_set_active_queue(struct cfq_data *cfqd,
2618                                    struct cfq_queue *cfqq)
2619 {
2620         if (cfqq) {
2621                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2622                                 cfqd->serving_wl_class, cfqd->serving_wl_type);
2623                 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2624                 cfqq->slice_start = 0;
2625                 cfqq->dispatch_start = ktime_get_ns();
2626                 cfqq->allocated_slice = 0;
2627                 cfqq->slice_end = 0;
2628                 cfqq->slice_dispatch = 0;
2629                 cfqq->nr_sectors = 0;
2630
2631                 cfq_clear_cfqq_wait_request(cfqq);
2632                 cfq_clear_cfqq_must_dispatch(cfqq);
2633                 cfq_clear_cfqq_must_alloc_slice(cfqq);
2634                 cfq_clear_cfqq_fifo_expire(cfqq);
2635                 cfq_mark_cfqq_slice_new(cfqq);
2636
2637                 cfq_del_timer(cfqd, cfqq);
2638         }
2639
2640         cfqd->active_queue = cfqq;
2641 }
2642
2643 /*
2644  * current cfqq expired its slice (or was too idle), select new one
2645  */
2646 static void
2647 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2648                     bool timed_out)
2649 {
2650         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2651
2652         if (cfq_cfqq_wait_request(cfqq))
2653                 cfq_del_timer(cfqd, cfqq);
2654
2655         cfq_clear_cfqq_wait_request(cfqq);
2656         cfq_clear_cfqq_wait_busy(cfqq);
2657
2658         /*
2659          * If this cfqq is shared between multiple processes, check to
2660          * make sure that those processes are still issuing I/Os within
2661          * the mean seek distance.  If not, it may be time to break the
2662          * queues apart again.
2663          */
2664         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2665                 cfq_mark_cfqq_split_coop(cfqq);
2666
2667         /*
2668          * store what was left of this slice, if the queue idled/timed out
2669          */
2670         if (timed_out) {
2671                 if (cfq_cfqq_slice_new(cfqq))
2672                         cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2673                 else
2674                         cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2675                 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2676         }
2677
2678         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2679
2680         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2681                 cfq_del_cfqq_rr(cfqd, cfqq);
2682
2683         cfq_resort_rr_list(cfqd, cfqq);
2684
2685         if (cfqq == cfqd->active_queue)
2686                 cfqd->active_queue = NULL;
2687
2688         if (cfqd->active_cic) {
2689                 put_io_context(cfqd->active_cic->icq.ioc);
2690                 cfqd->active_cic = NULL;
2691         }
2692 }
2693
2694 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2695 {
2696         struct cfq_queue *cfqq = cfqd->active_queue;
2697
2698         if (cfqq)
2699                 __cfq_slice_expired(cfqd, cfqq, timed_out);
2700 }
2701
2702 /*
2703  * Get next queue for service. Unless we have a queue preemption,
2704  * we'll simply select the first cfqq in the service tree.
2705  */
2706 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2707 {
2708         struct cfq_rb_root *st = st_for(cfqd->serving_group,
2709                         cfqd->serving_wl_class, cfqd->serving_wl_type);
2710
2711         if (!cfqd->rq_queued)
2712                 return NULL;
2713
2714         /* There is nothing to dispatch */
2715         if (!st)
2716                 return NULL;
2717         if (RB_EMPTY_ROOT(&st->rb.rb_root))
2718                 return NULL;
2719         return cfq_rb_first(st);
2720 }
2721
2722 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2723 {
2724         struct cfq_group *cfqg;
2725         struct cfq_queue *cfqq;
2726         int i, j;
2727         struct cfq_rb_root *st;
2728
2729         if (!cfqd->rq_queued)
2730                 return NULL;
2731
2732         cfqg = cfq_get_next_cfqg(cfqd);
2733         if (!cfqg)
2734                 return NULL;
2735
2736         for_each_cfqg_st(cfqg, i, j, st) {
2737                 cfqq = cfq_rb_first(st);
2738                 if (cfqq)
2739                         return cfqq;
2740         }
2741         return NULL;
2742 }
2743
/*
 * Make a queue active and return it.  If the caller does not supply one
 * (@cfqq is NULL), the next queue is obtained from cfq_get_next_queue();
 * the result may still be NULL, in which case the active queue is
 * cleared.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
                                              struct cfq_queue *cfqq)
{
        struct cfq_queue *chosen = cfqq ? cfqq : cfq_get_next_queue(cfqd);

        __cfq_set_active_queue(cfqd, chosen);

        return chosen;
}
2756
2757 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2758                                           struct request *rq)
2759 {
2760         if (blk_rq_pos(rq) >= cfqd->last_position)
2761                 return blk_rq_pos(rq) - cfqd->last_position;
2762         else
2763                 return cfqd->last_position - blk_rq_pos(rq);
2764 }
2765
2766 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2767                                struct request *rq)
2768 {
2769         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2770 }
2771
2772 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2773                                     struct cfq_queue *cur_cfqq)
2774 {
2775         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2776         struct rb_node *parent, *node;
2777         struct cfq_queue *__cfqq;
2778         sector_t sector = cfqd->last_position;
2779
2780         if (RB_EMPTY_ROOT(root))
2781                 return NULL;
2782
2783         /*
2784          * First, if we find a request starting at the end of the last
2785          * request, choose it.
2786          */
2787         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2788         if (__cfqq)
2789                 return __cfqq;
2790
2791         /*
2792          * If the exact sector wasn't found, the parent of the NULL leaf
2793          * will contain the closest sector.
2794          */
2795         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2796         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2797                 return __cfqq;
2798
2799         if (blk_rq_pos(__cfqq->next_rq) < sector)
2800                 node = rb_next(&__cfqq->p_node);
2801         else
2802                 node = rb_prev(&__cfqq->p_node);
2803         if (!node)
2804                 return NULL;
2805
2806         __cfqq = rb_entry(node, struct cfq_queue, p_node);
2807         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2808                 return __cfqq;
2809
2810         return NULL;
2811 }
2812
2813 /*
2814  * cfqd - obvious
2815  * cur_cfqq - passed in so that we don't decide that the current queue is
2816  *            closely cooperating with itself.
2817  *
2818  * So, basically we're assuming that that cur_cfqq has dispatched at least
2819  * one request, and that cfqd->last_position reflects a position on the disk
2820  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2821  * assumption.
2822  */
2823 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2824                                               struct cfq_queue *cur_cfqq)
2825 {
2826         struct cfq_queue *cfqq;
2827
2828         if (cfq_class_idle(cur_cfqq))
2829                 return NULL;
2830         if (!cfq_cfqq_sync(cur_cfqq))
2831                 return NULL;
2832         if (CFQQ_SEEKY(cur_cfqq))
2833                 return NULL;
2834
2835         /*
2836          * Don't search priority tree if it's the only queue in the group.
2837          */
2838         if (cur_cfqq->cfqg->nr_cfqq == 1)
2839                 return NULL;
2840
2841         /*
2842          * We should notice if some of the queues are cooperating, eg
2843          * working closely on the same area of the disk. In that case,
2844          * we can group them together and don't waste time idling.
2845          */
2846         cfqq = cfqq_close(cfqd, cur_cfqq);
2847         if (!cfqq)
2848                 return NULL;
2849
2850         /* If new queue belongs to different cfq_group, don't choose it */
2851         if (cur_cfqq->cfqg != cfqq->cfqg)
2852                 return NULL;
2853
2854         /*
2855          * It only makes sense to merge sync queues.
2856          */
2857         if (!cfq_cfqq_sync(cfqq))
2858                 return NULL;
2859         if (CFQQ_SEEKY(cfqq))
2860                 return NULL;
2861
2862         /*
2863          * Do not merge queues of different priority classes
2864          */
2865         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2866                 return NULL;
2867
2868         return cfqq;
2869 }
2870
2871 /*
2872  * Determine whether we should enforce idle window for this queue.
2873  */
2874
2875 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2876 {
2877         enum wl_class_t wl_class = cfqq_class(cfqq);
2878         struct cfq_rb_root *st = cfqq->service_tree;
2879
2880         BUG_ON(!st);
2881         BUG_ON(!st->count);
2882
2883         if (!cfqd->cfq_slice_idle)
2884                 return false;
2885
2886         /* We never do for idle class queues. */
2887         if (wl_class == IDLE_WORKLOAD)
2888                 return false;
2889
2890         /* We do for queues that were marked with idle window flag. */
2891         if (cfq_cfqq_idle_window(cfqq) &&
2892            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2893                 return true;
2894
2895         /*
2896          * Otherwise, we do only if they are the last ones
2897          * in their service tree.
2898          */
2899         if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2900            !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2901                 return true;
2902         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2903         return false;
2904 }
2905
2906 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2907 {
2908         struct cfq_queue *cfqq = cfqd->active_queue;
2909         struct cfq_rb_root *st = cfqq->service_tree;
2910         struct cfq_io_cq *cic;
2911         u64 sl, group_idle = 0;
2912         u64 now = ktime_get_ns();
2913
2914         /*
2915          * SSD device without seek penalty, disable idling. But only do so
2916          * for devices that support queuing, otherwise we still have a problem
2917          * with sync vs async workloads.
2918          */
2919         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2920                 !cfqd->cfq_group_idle)
2921                 return;
2922
2923         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2924         WARN_ON(cfq_cfqq_slice_new(cfqq));
2925
2926         /*
2927          * idle is disabled, either manually or by past process history
2928          */
2929         if (!cfq_should_idle(cfqd, cfqq)) {
2930                 /* no queue idling. Check for group idling */
2931                 if (cfqd->cfq_group_idle)
2932                         group_idle = cfqd->cfq_group_idle;
2933                 else
2934                         return;
2935         }
2936
2937         /*
2938          * still active requests from this queue, don't idle
2939          */
2940         if (cfqq->dispatched)
2941                 return;
2942
2943         /*
2944          * task has exited, don't wait
2945          */
2946         cic = cfqd->active_cic;
2947         if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2948                 return;
2949
2950         /*
2951          * If our average think time is larger than the remaining time
2952          * slice, then don't idle. This avoids overrunning the allotted
2953          * time slice.
2954          */
2955         if (sample_valid(cic->ttime.ttime_samples) &&
2956             (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2957                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2958                              cic->ttime.ttime_mean);
2959                 return;
2960         }
2961
2962         /*
2963          * There are other queues in the group or this is the only group and
2964          * it has too big thinktime, don't do group idle.
2965          */
2966         if (group_idle &&
2967             (cfqq->cfqg->nr_cfqq > 1 ||
2968              cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2969                 return;
2970
2971         cfq_mark_cfqq_wait_request(cfqq);
2972
2973         if (group_idle)
2974                 sl = cfqd->cfq_group_idle;
2975         else
2976                 sl = cfqd->cfq_slice_idle;
2977
2978         hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2979                       HRTIMER_MODE_REL);
2980         cfqg_stats_set_start_idle_time(cfqq->cfqg);
2981         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2982                         group_idle ? 1 : 0);
2983 }
2984
2985 /*
2986  * Move request from internal lists to the request queue dispatch list.
2987  */
2988 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2989 {
2990         struct cfq_data *cfqd = q->elevator->elevator_data;
2991         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2992
2993         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2994
2995         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2996         cfq_remove_request(rq);
2997         cfqq->dispatched++;
2998         (RQ_CFQG(rq))->dispatched++;
2999         elv_dispatch_sort(q, rq);
3000
3001         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3002         cfqq->nr_sectors += blk_rq_sectors(rq);
3003 }
3004
3005 /*
3006  * return expired entry, or NULL to just start from scratch in rbtree
3007  */
3008 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3009 {
3010         struct request *rq = NULL;
3011
3012         if (cfq_cfqq_fifo_expire(cfqq))
3013                 return NULL;
3014
3015         cfq_mark_cfqq_fifo_expire(cfqq);
3016
3017         if (list_empty(&cfqq->fifo))
3018                 return NULL;
3019
3020         rq = rq_entry_fifo(cfqq->fifo.next);
3021         if (ktime_get_ns() < rq->fifo_time)
3022                 rq = NULL;
3023
3024         return rq;
3025 }
3026
3027 static inline int
3028 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3029 {
3030         const int base_rq = cfqd->cfq_slice_async_rq;
3031
3032         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3033
3034         return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3035 }
3036
3037 /*
3038  * Must be called with the queue_lock held.
3039  */
3040 static int cfqq_process_refs(struct cfq_queue *cfqq)
3041 {
3042         int process_refs, io_refs;
3043
3044         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3045         process_refs = cfqq->ref - io_refs;
3046         BUG_ON(process_refs < 0);
3047         return process_refs;
3048 }
3049
3050 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3051 {
3052         int process_refs, new_process_refs;
3053         struct cfq_queue *__cfqq;
3054
3055         /*
3056          * If there are no process references on the new_cfqq, then it is
3057          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3058          * chain may have dropped their last reference (not just their
3059          * last process reference).
3060          */
3061         if (!cfqq_process_refs(new_cfqq))
3062                 return;
3063
3064         /* Avoid a circular list and skip interim queue merges */
3065         while ((__cfqq = new_cfqq->new_cfqq)) {
3066                 if (__cfqq == cfqq)
3067                         return;
3068                 new_cfqq = __cfqq;
3069         }
3070
3071         process_refs = cfqq_process_refs(cfqq);
3072         new_process_refs = cfqq_process_refs(new_cfqq);
3073         /*
3074          * If the process for the cfqq has gone away, there is no
3075          * sense in merging the queues.
3076          */
3077         if (process_refs == 0 || new_process_refs == 0)
3078                 return;
3079
3080         /*
3081          * Merge in the direction of the lesser amount of work.
3082          */
3083         if (new_process_refs >= process_refs) {
3084                 cfqq->new_cfqq = new_cfqq;
3085                 new_cfqq->ref += process_refs;
3086         } else {
3087                 new_cfqq->new_cfqq = cfqq;
3088                 cfqq->ref += new_process_refs;
3089         }
3090 }
3091
3092 static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3093                         struct cfq_group *cfqg, enum wl_class_t wl_class)
3094 {
3095         struct cfq_queue *queue;
3096         int i;
3097         bool key_valid = false;
3098         u64 lowest_key = 0;
3099         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3100
3101         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3102                 /* select the one with lowest rb_key */
3103                 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3104                 if (queue &&
3105                     (!key_valid || queue->rb_key < lowest_key)) {
3106                         lowest_key = queue->rb_key;
3107                         cur_best = i;
3108                         key_valid = true;
3109                 }
3110         }
3111
3112         return cur_best;
3113 }
3114
3115 static void
3116 choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3117 {
3118         u64 slice;
3119         unsigned count;
3120         struct cfq_rb_root *st;
3121         u64 group_slice;
3122         enum wl_class_t original_class = cfqd->serving_wl_class;
3123         u64 now = ktime_get_ns();
3124
3125         /* Choose next priority. RT > BE > IDLE */
3126         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3127                 cfqd->serving_wl_class = RT_WORKLOAD;
3128         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3129                 cfqd->serving_wl_class = BE_WORKLOAD;
3130         else {
3131                 cfqd->serving_wl_class = IDLE_WORKLOAD;
3132                 cfqd->workload_expires = now + jiffies_to_nsecs(1);
3133                 return;
3134         }
3135
3136         if (original_class != cfqd->serving_wl_class)
3137                 goto new_workload;
3138
3139         /*
3140          * For RT and BE, we have to choose also the type
3141          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3142          * expiration time
3143          */
3144         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3145         count = st->count;
3146
3147         /*
3148          * check workload expiration, and that we still have other queues ready
3149          */
3150         if (count && !(now > cfqd->workload_expires))
3151                 return;
3152
3153 new_workload:
3154         /* otherwise select new workload type */
3155         cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3156                                         cfqd->serving_wl_class);
3157         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3158         count = st->count;
3159
3160         /*
3161          * the workload slice is computed as a fraction of target latency
3162          * proportional to the number of queues in that workload, over
3163          * all the queues in the same priority class
3164          */
3165         group_slice = cfq_group_slice(cfqd, cfqg);
3166
3167         slice = div_u64(group_slice * count,
3168                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3169                       cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3170                                         cfqg)));
3171
3172         if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3173                 u64 tmp;
3174
3175                 /*
3176                  * Async queues are currently system wide. Just taking
3177                  * proportion of queues with-in same group will lead to higher
3178                  * async ratio system wide as generally root group is going
3179                  * to have higher weight. A more accurate thing would be to
3180                  * calculate system wide asnc/sync ratio.
3181                  */
3182                 tmp = cfqd->cfq_target_latency *
3183                         cfqg_busy_async_queues(cfqd, cfqg);
3184                 tmp = div_u64(tmp, cfqd->busy_queues);
3185                 slice = min_t(u64, slice, tmp);
3186
3187                 /* async workload slice is scaled down according to
3188                  * the sync/async slice ratio. */
3189                 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3190         } else
3191                 /* sync workload slice is at least 2 * cfq_slice_idle */
3192                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3193
3194         slice = max_t(u64, slice, CFQ_MIN_TT);
3195         cfq_log(cfqd, "workload slice:%llu", slice);
3196         cfqd->workload_expires = now + slice;
3197 }
3198
3199 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3200 {
3201         struct cfq_rb_root *st = &cfqd->grp_service_tree;
3202         struct cfq_group *cfqg;
3203
3204         if (RB_EMPTY_ROOT(&st->rb.rb_root))
3205                 return NULL;
3206         cfqg = cfq_rb_first_group(st);
3207         update_min_vdisktime(st);
3208         return cfqg;
3209 }
3210
3211 static void cfq_choose_cfqg(struct cfq_data *cfqd)
3212 {
3213         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3214         u64 now = ktime_get_ns();
3215
3216         cfqd->serving_group = cfqg;
3217
3218         /* Restore the workload type data */
3219         if (cfqg->saved_wl_slice) {
3220                 cfqd->workload_expires = now + cfqg->saved_wl_slice;
3221                 cfqd->serving_wl_type = cfqg->saved_wl_type;
3222                 cfqd->serving_wl_class = cfqg->saved_wl_class;
3223         } else
3224                 cfqd->workload_expires = now - 1;
3225
3226         choose_wl_class_and_type(cfqd, cfqg);
3227 }
3228
3229 /*
3230  * Select a queue for service. If we have a current active queue,
3231  * check whether to continue servicing it, or retrieve and set a new one.
3232  */
3233 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3234 {
3235         struct cfq_queue *cfqq, *new_cfqq = NULL;
3236         u64 now = ktime_get_ns();
3237
3238         cfqq = cfqd->active_queue;
3239         if (!cfqq)
3240                 goto new_queue;
3241
3242         if (!cfqd->rq_queued)
3243                 return NULL;
3244
3245         /*
3246          * We were waiting for group to get backlogged. Expire the queue
3247          */
3248         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3249                 goto expire;
3250
3251         /*
3252          * The active queue has run out of time, expire it and select new.
3253          */
3254         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3255                 /*
3256                  * If slice had not expired at the completion of last request
3257                  * we might not have turned on wait_busy flag. Don't expire
3258                  * the queue yet. Allow the group to get backlogged.
3259                  *
3260                  * The very fact that we have used the slice, that means we
3261                  * have been idling all along on this queue and it should be
3262                  * ok to wait for this request to complete.
3263                  */
3264                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3265                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3266                         cfqq = NULL;
3267                         goto keep_queue;
3268                 } else
3269                         goto check_group_idle;
3270         }
3271
3272         /*
3273          * The active queue has requests and isn't expired, allow it to
3274          * dispatch.
3275          */
3276         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3277                 goto keep_queue;
3278
3279         /*
3280          * If another queue has a request waiting within our mean seek
3281          * distance, let it run.  The expire code will check for close
3282          * cooperators and put the close queue at the front of the service
3283          * tree.  If possible, merge the expiring queue with the new cfqq.
3284          */
3285         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3286         if (new_cfqq) {
3287                 if (!cfqq->new_cfqq)
3288                         cfq_setup_merge(cfqq, new_cfqq);
3289                 goto expire;
3290         }
3291
3292         /*
3293          * No requests pending. If the active queue still has requests in
3294          * flight or is idling for a new request, allow either of these
3295          * conditions to happen (or time out) before selecting a new queue.
3296          */
3297         if (hrtimer_active(&cfqd->idle_slice_timer)) {
3298                 cfqq = NULL;
3299                 goto keep_queue;
3300         }
3301
3302         /*
3303          * This is a deep seek queue, but the device is much faster than
3304          * the queue can deliver, don't idle
3305          **/
3306         if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3307             (cfq_cfqq_slice_new(cfqq) ||
3308             (cfqq->slice_end - now > now - cfqq->slice_start))) {
3309                 cfq_clear_cfqq_deep(cfqq);
3310                 cfq_clear_cfqq_idle_window(cfqq);
3311         }
3312
3313         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3314                 cfqq = NULL;
3315                 goto keep_queue;
3316         }
3317
3318         /*
3319          * If group idle is enabled and there are requests dispatched from
3320          * this group, wait for requests to complete.
3321          */
3322 check_group_idle:
3323         if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3324             cfqq->cfqg->dispatched &&
3325             !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3326                 cfqq = NULL;
3327                 goto keep_queue;
3328         }
3329
3330 expire:
3331         cfq_slice_expired(cfqd, 0);
3332 new_queue:
3333         /*
3334          * Current queue expired. Check if we have to switch to a new
3335          * service tree
3336          */
3337         if (!new_cfqq)
3338                 cfq_choose_cfqg(cfqd);
3339
3340         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3341 keep_queue:
3342         return cfqq;
3343 }
3344
3345 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3346 {
3347         int nr_moved;
3348
3349         /* Hand every pending request of cfqq over, one next_rq at a time. */
3350         for (nr_moved = 0; cfqq->next_rq; nr_moved++)
3351                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3352
3353         /* With next_rq exhausted, the fifo list has to be empty as well. */
3354         BUG_ON(!list_empty(&cfqq->fifo));
3355
3356         /* An empty cfqq is not expired by default, so expire it here. */
3357         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3358         return nr_moved;
3359 }
3360
3361 /*
3362  * Drain all of our current requests (used for barriers and when switching
3363  * io schedulers on-the-fly). Returns how many requests were dispatched.
3364  */
3365 static int cfq_forced_dispatch(struct cfq_data *cfqd)
3366 {
3367         struct cfq_queue *next;
3368         int total = 0;
3369
3370         /* The currently active queue gives up its timeslice first */
3371         cfq_slice_expired(cfqd, 0);
3372         for (next = cfq_get_next_queue_forced(cfqd); next;
3373              next = cfq_get_next_queue_forced(cfqd)) {
3374                 __cfq_set_active_queue(cfqd, next);
3375                 total += __cfq_forced_dispatch_cfqq(next);
3376         }
3377
3378         BUG_ON(cfqd->busy_queues);
3379         cfq_log(cfqd, "forced_dispatch=%d", total);
3380         return total;
3381 }
3382
3383 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3384                                        struct cfq_queue *cfqq)
3385 {
3386         u64 projected_end = ktime_get_ns();
3387
3388         /* the queue has not finished a request yet, nothing to estimate from */
3389         if (cfq_cfqq_slice_new(cfqq))
3390                 return true;
3391
3392         /* charge one cfq_slice_idle interval per request still dispatched */
3393         projected_end += cfqd->cfq_slice_idle * cfqq->dispatched;
3394         return projected_end > cfqq->slice_end;
3395 }
3396
3397 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3398 {
             /*
              * Decide whether cfqq may dispatch one more request right now.
              * Returns true if the dispatch is allowed, false if cfqq has to
              * wait.
              */
3399         unsigned int max_dispatch;
3400
             /* a queue flagged must_dispatch bypasses every limit below */
3401         if (cfq_cfqq_must_dispatch(cfqq))
3402                 return true;
3403
3404         /*
3405          * Drain async requests before we start sync IO
3406          */
3407         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3408                 return false;
3409
3410         /*
3411          * If this is an async queue and we have sync IO in flight, let it wait
3412          */
3413         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3414                 return false;
3415
             /*
              * Baseline limit: half of cfq_quantum but never less than one
              * request; idle class queues are limited to a single request.
              */
3416         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3417         if (cfq_class_idle(cfqq))
3418                 max_dispatch = 1;
3419
3420         /*
3421          * Does this cfqq already have too much IO in flight?
3422          */
3423         if (cfqq->dispatched >= max_dispatch) {
3424                 bool promote_sync = false;
3425                 /*
3426                  * idle queue must always only have a single IO in flight
3427                  */
3428                 if (cfq_class_idle(cfqq))
3429                         return false;
3430
3431                 /*
3432                  * If there is only one sync queue
3433                  * we can ignore async queue here and give the sync
3434                  * queue no dispatch limit. The reason is a sync queue can
3435                  * preempt async queue, limiting the sync queue doesn't make
3436                  * sense. This is useful for aiostress test.
3437                  */
3438                 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3439                         promote_sync = true;
3440
3441                 /*
3442                  * We have other queues, don't allow more IO from this one
3443                  */
3444                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3445                                 !promote_sync)
3446                         return false;
3447
3448                 /*
3449                  * Sole queue user, no limit
3450                  */
3451                 if (cfqd->busy_queues == 1 || promote_sync)
                             /* -1 stored in an unsigned int is UINT_MAX: no limit */
3452                         max_dispatch = -1;
3453                 else
3454                         /*
3455                          * Normally we start throttling cfqq when cfq_quantum/2
3456                          * requests have been dispatched. But we can drive
3457                          * deeper queue depths at the beginning of slice
3458                          * subjected to upper limit of cfq_quantum.
3459                          */
3460                         max_dispatch = cfqd->cfq_quantum;
3461         }
3462
3463         /*
3464          * Async queues must wait a bit before being allowed dispatch.
3465          * We also ramp up the dispatch depth gradually for async IO,
3466          * based on the last sync IO we serviced
3467          */
3468         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3469                 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3470                 unsigned int depth;
3471
                     /* one unit of depth per cfq_slice[1] since last_delayed_sync */
3472                 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
                     /* with nothing in flight the queue may still send one request */
3473                 if (!depth && !cfqq->dispatched)
3474                         depth = 1;
3475                 if (depth < max_dispatch)
3476                         max_dispatch = depth;
3477         }
3478
3479         /*
3480          * If we're below the current max, allow a dispatch
3481          */
3482         return cfqq->dispatched < max_dispatch;
3483 }
3484
3485 /*
3486  * Dispatch a request from cfqq, moving it to the request queue
3487  * dispatch list.
      *
      * Returns true if a request was dispatched, false if cfq_may_dispatch()
      * refused. cfqq->sort_list must not be empty on entry.
3488  */
3489 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3490 {
3491         struct request *rq;
3492
3493         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3494
             /*
              * A request handed back by the fifo check flags the queue
              * must_dispatch, which makes cfq_may_dispatch() below say yes.
              */
3495         rq = cfq_check_fifo(cfqq);
3496         if (rq)
3497                 cfq_mark_cfqq_must_dispatch(cfqq);
3498
3499         if (!cfq_may_dispatch(cfqd, cfqq))
3500                 return false;
3501
3502         /*
3503          * follow expired path, else get first next available
3504          */
3505         if (!rq)
3506                 rq = cfqq->next_rq;
3507         else
3508                 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3509
3510         /*
3511          * insert request into driver dispatch list
3512          */
3513         cfq_dispatch_insert(cfqd->queue, rq);
3514
             /*
              * No active cic recorded yet: take a reference on the io context
              * behind this request's cic and remember the cic as active.
              */
3515         if (!cfqd->active_cic) {
3516                 struct cfq_io_cq *cic = RQ_CIC(rq);
3517
3518                 atomic_long_inc(&cic->icq.ioc->refcount);
3519                 cfqd->active_cic = cic;
3520         }
3521
3522         return true;
3523 }
3524
3525 /*
3526  * Find the cfqq that we need to service and move a request from that to the
3527  * dispatch list
      *
      * With force set, everything queued is drained via cfq_forced_dispatch().
      * Returns the number of requests dispatched; the normal path moves at
      * most one request per call. Returns 0 right away when no queue is busy.
3528  */
3529 static int cfq_dispatch_requests(struct request_queue *q, int force)
3530 {
3531         struct cfq_data *cfqd = q->elevator->elevator_data;
3532         struct cfq_queue *cfqq;
3533
3534         if (!cfqd->busy_queues)
3535                 return 0;
3536
3537         if (unlikely(force))
3538                 return cfq_forced_dispatch(cfqd);
3539
3540         cfqq = cfq_select_queue(cfqd);
3541         if (!cfqq)
3542                 return 0;
3543
3544         /*
3545          * Dispatch a request from this cfqq, if it is allowed
3546          */
3547         if (!cfq_dispatch_request(cfqd, cfqq))
3548                 return 0;
3549
             /* count the dispatch against this slice and drop must_dispatch */
3550         cfqq->slice_dispatch++;
3551         cfq_clear_cfqq_must_dispatch(cfqq);
3552
3553         /*
3554          * expire an async queue immediately if it has used up its slice. idle
3555          * queue always expire after 1 dispatch round.
3556          */
3557         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3558             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3559             cfq_class_idle(cfqq))) {
3560                 cfqq->slice_end = ktime_get_ns() + 1;
3561                 cfq_slice_expired(cfqd, 0);
3562         }
3563
3564         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3565         return 1;
3566 }
3567
3568 /*
3569  * task holds one reference to the queue, dropped when task exits. each rq
3570  * in-flight on this queue also holds a reference, dropped when rq is freed.
3571  *
3572  * Each cfq queue took a reference on the parent group. Drop it now.
3573  * queue lock must be held here.
      *
      * Drops one reference on cfqq. If it was the last one, the queue is freed
      * and its reference on the group is put as well.
3574  */
3575 static void cfq_put_queue(struct cfq_queue *cfqq)
3576 {
3577         struct cfq_data *cfqd = cfqq->cfqd;
3578         struct cfq_group *cfqg;
3579
3580         BUG_ON(cfqq->ref <= 0);
3581
3582         cfqq->ref--;
3583         if (cfqq->ref)
3584                 return;
3585
             /*
              * Last reference gone: the sort list must be empty and no
              * requests may still be allocated against this queue.
              */
3586         cfq_log_cfqq(cfqd, cfqq, "put_queue");
3587         BUG_ON(rb_first(&cfqq->sort_list));
3588         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
             /* remember the group now: cfqq is freed before the group is put */
3589         cfqg = cfqq->cfqg;
3590
             /* a queue being freed must not stay the active queue */
3591         if (unlikely(cfqd->active_queue == cfqq)) {
3592                 __cfq_slice_expired(cfqd, cfqq, 0);
3593                 cfq_schedule_dispatch(cfqd);
3594         }
3595
3596         BUG_ON(cfq_cfqq_on_rr(cfqq));
3597         kmem_cache_free(cfq_pool, cfqq);
3598         cfqg_put(cfqg);
3599 }
3600
3601 static void cfq_put_cooperator(struct cfq_queue *cfqq)
3602 {
3603         struct cfq_queue *cur, *following;
3604
3605         /*
3606          * Walk the chain of queues this one was scheduled to merge with
3607          * and drop the reference taken on each of them.  See
3608          * cfq_setup_merge and cfq_merge_cfqqs.
3609          */
3610         for (cur = cfqq->new_cfqq; cur; cur = following) {
3611                 /* a chain that leads back to cfqq would never terminate */
3612                 if (cur == cfqq) {
3613                         WARN(1, "cfqq->new_cfqq loop detected\n");
3614                         break;
3615                 }
3616                 /* read the link first: the put below may free cur */
3617                 following = cur->new_cfqq;
3618                 cfq_put_queue(cur);
3619         }
3620 }
3621
3622 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3623 {
3624         /* if the exiting queue is the active one, end its slice right away */
3625         if (unlikely(cfqd->active_queue == cfqq)) {
3626                 __cfq_slice_expired(cfqd, cfqq, 0);
3627                 cfq_schedule_dispatch(cfqd);
3628         }
3629         /* drop the merge-chain references first, then the one on cfqq */
3630         cfq_put_cooperator(cfqq);
3631         cfq_put_queue(cfqq);
3632 }
3633
3634 static void cfq_init_icq(struct io_cq *icq)
3635 {
3636         struct cfq_ttime *ttime = &icq_to_cic(icq)->ttime;
3637         /* start the last_end_request stamp of this icq at the current time */
3638         ttime->last_end_request = ktime_get_ns();
3639 }
3640
3641 static void cfq_exit_icq(struct io_cq *icq)
3642 {
3643         struct cfq_io_cq *cic = icq_to_cic(icq);
3644         struct cfq_data *cfqd = cic_to_cfqd(cic);
3645         int sync;
3646
3647         /* release and detach the async queue (0), then the sync queue (1) */
3648         for (sync = 0; sync <= 1; sync++) {
3649                 struct cfq_queue *cfqq = cic_to_cfqq(cic, sync);
3650                 if (!cfqq)
3651                         continue;
3652                 cfq_exit_cfqq(cfqd, cfqq);
3653                 cic_set_cfqq(cic, NULL, sync);
3654         }
3655 }
3656
3657 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3658 {
             /*
              * Derive cfqq's ioprio and ioprio_class from cic->ioprio. Does
              * nothing unless cfqq is flagged prio_changed; the flag is
              * cleared once the new values are in place.
              */
3659         struct task_struct *tsk = current;
3660         int ioprio_class;
3661
3662         if (!cfq_cfqq_prio_changed(cfqq))
3663                 return;
3664
3665         ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3666         switch (ioprio_class) {
3667         default:
                     /* unknown class: complain, then handle it like "none" */
3668                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3669                 /* fall through */
3670         case IOPRIO_CLASS_NONE:
3671                 /*
3672                  * no prio set, inherit CPU scheduling settings
3673                  */
3674                 cfqq->ioprio = task_nice_ioprio(tsk);
3675                 cfqq->ioprio_class = task_nice_ioclass(tsk);
3676                 break;
3677         case IOPRIO_CLASS_RT:
3678                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3679                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3680                 break;
3681         case IOPRIO_CLASS_BE:
3682                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3683                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3684                 break;
3685         case IOPRIO_CLASS_IDLE:
                     /* idle class: fixed ioprio of 7 and no idle window */
3686                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3687                 cfqq->ioprio = 7;
3688                 cfq_clear_cfqq_idle_window(cfqq);
3689                 break;
3690         }
3691
3692         /*
3693          * keep track of original prio settings in case we have to temporarily
3694          * elevate the priority of this queue
3695          */
3696         cfqq->org_ioprio = cfqq->ioprio;
3697         cfqq->org_ioprio_class = cfqq->ioprio_class;
3698         cfq_clear_cfqq_prio_changed(cfqq);
3699 }
3700
3701 static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3702 {
             /*
              * If the io context's ioprio differs from the value cached in cic,
              * re-fetch the async queue, flag the sync queue prio_changed and
              * cache the new ioprio in cic.
              */
3703         int ioprio = cic->icq.ioc->ioprio;
3704         struct cfq_data *cfqd = cic_to_cfqd(cic);
3705         struct cfq_queue *cfqq;
3706
3707         /*
3708          * Check whether ioprio has changed.  The condition may trigger
3709          * spuriously on a newly created cic but there's no harm.
3710          */
3711         if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3712                 return;
3713
             /*
              * Drop the reference on the current async queue and look one up
              * again.
              * NOTE(review): cfq_get_queue() reads cic->ioprio, which is only
              * updated at the end of this function - confirm that looking the
              * queue up with the previous value is intended.
              */
3714         cfqq = cic_to_cfqq(cic, false);
3715         if (cfqq) {
3716                 cfq_put_queue(cfqq);
3717                 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3718                 cic_set_cfqq(cic, cfqq, false);
3719         }
3720
             /* the sync queue is kept; its prio data is redone on next init */
3721         cfqq = cic_to_cfqq(cic, true);
3722         if (cfqq)
3723                 cfq_mark_cfqq_prio_changed(cfqq);
3724
3725         cic->ioprio = ioprio;
3726 }
3727
3728 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3729                           pid_t pid, bool is_sync)
3730 {
3731         cfqq->cfqd = cfqd;
3732         cfqq->pid = pid;
3733         cfqq->ref = 0;
3734
3735         /* start with an empty fifo list and both rb nodes cleared */
3736         INIT_LIST_HEAD(&cfqq->fifo);
3737         RB_CLEAR_NODE(&cfqq->rb_node);
3738         RB_CLEAR_NODE(&cfqq->p_node);
3739         cfq_mark_cfqq_prio_changed(cfqq);
3740         if (!is_sync)
3741                 return;
3742         /* sync queues are flagged sync and, unless idle class, may idle */
3743         cfq_mark_cfqq_sync(cfqq);
3744         if (!cfq_class_idle(cfqq))
3745                 cfq_mark_cfqq_idle_window(cfqq);
3746 }
3747
3748 #ifdef CONFIG_CFQ_GROUP_IOSCHED
     /*
      * Detect that the blkcg of bio differs from the one cached in cic (the two
      * are compared by css serial number) and, if so, detach both of cic's
      * queues and drop the references held on them.
      */
3749 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3750 {
3751         struct cfq_data *cfqd = cic_to_cfqd(cic);
3752         struct cfq_queue *cfqq;
3753         uint64_t serial_nr;
3754
             /* bio_blkcg() is dereferenced under the RCU read lock */
3755         rcu_read_lock();
3756         serial_nr = bio_blkcg(bio)->css.serial_nr;
3757         rcu_read_unlock();
3758
3759         /*
3760          * Check whether blkcg has changed.  The condition may trigger
3761          * spuriously on a newly created cic but there's no harm.
3762          */
3763         if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3764                 return;
3765
3766         /*
3767          * Drop reference to queues.  New queues will be assigned in new
3768          * group upon arrival of fresh requests.
3769          */
3770         cfqq = cic_to_cfqq(cic, false);
3771         if (cfqq) {
3772                 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3773                 cic_set_cfqq(cic, NULL, false);
3774                 cfq_put_queue(cfqq);
3775         }
3776
3777         cfqq = cic_to_cfqq(cic, true);
3778         if (cfqq) {
3779                 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3780                 cic_set_cfqq(cic, NULL, true);
3781                 cfq_put_queue(cfqq);
3782         }
3783
3784         cic->blkcg_serial_nr = serial_nr;
3785 }
3786 #else
     /* built without CONFIG_CFQ_GROUP_IOSCHED: nothing to check */
3787 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3788 {
3789 }
3790 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3791
3792 static struct cfq_queue **
3793 cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3794 {
3795         /* the idle class has a single async queue slot per group */
3796         if (ioprio_class == IOPRIO_CLASS_IDLE)
3797                 return &cfqg->async_idle_cfqq;
3798         /* row 0 of async_cfqq[] is used for RT, indexed by ioprio */
3799         if (ioprio_class == IOPRIO_CLASS_RT)
3800                 return &cfqg->async_cfqq[0][ioprio];
3801
3802         /* row 1 is used for BE; class NONE is treated as BE at IOPRIO_NORM */
3803         if (ioprio_class == IOPRIO_CLASS_NONE)
3804                 ioprio = IOPRIO_NORM;
3805         else if (ioprio_class != IOPRIO_CLASS_BE)
3806                 BUG();          /* unknown class */
3807         return &cfqg->async_cfqq[1][ioprio];
3808 }
3809
3810 static struct cfq_queue *
3811 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3812               struct bio *bio)
3813 {
             /*
              * Return a cfq_queue for this cic/bio with one reference taken
              * for the caller. An async queue is kept per group and priority
              * and is reused when it already exists; otherwise a new queue is
              * allocated. Falls back to cfqd->oom_cfqq when the group lookup
              * or the allocation fails.
              */
3814         int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3815         int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3816         struct cfq_queue **async_cfqq = NULL;
3817         struct cfq_queue *cfqq;
3818         struct cfq_group *cfqg;
3819
3820         rcu_read_lock();
3821         cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3822         if (!cfqg) {
3823                 cfqq = &cfqd->oom_cfqq;
3824                 goto out;
3825         }
3826
3827         if (!is_sync) {
                     /* no valid ioprio in the cic: derive it from the task */
3828                 if (!ioprio_valid(cic->ioprio)) {
3829                         struct task_struct *tsk = current;
3830                         ioprio = task_nice_ioprio(tsk);
3831                         ioprio_class = task_nice_ioclass(tsk);
3832                 }
                     /* reuse the group's async queue for this prio if present */
3833                 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3834                 cfqq = *async_cfqq;
3835                 if (cfqq)
3836                         goto out;
3837         }
3838
             /* non-blocking, zeroed allocation; failure falls back to oom_cfqq */
3839         cfqq = kmem_cache_alloc_node(cfq_pool,
3840                                      GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3841                                      cfqd->queue->node);
3842         if (!cfqq) {
3843                 cfqq = &cfqd->oom_cfqq;
3844                 goto out;
3845         }
3846
3847         /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3848         cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3849         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3850         cfq_init_prio_data(cfqq, cic);
3851         cfq_link_cfqq_cfqg(cfqq, cfqg);
3852         cfq_log_cfqq(cfqd, cfqq, "alloced");
3853
3854         if (async_cfqq) {
3855                 /* a new async queue is created, pin and remember */
3856                 cfqq->ref++;
3857                 *async_cfqq = cfqq;
3858         }
3859 out:
             /* every path, including the oom fallback, takes the caller's ref */
3860         cfqq->ref++;
3861         rcu_read_unlock();
3862         return cfqq;
3863 }
3864
3865 static void
3866 __cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3867 {
3868         u64 sample = ktime_get_ns() - ttime->last_end_request;
3869         /* cap the sample at 2 * slice_idle, then age the history by 7/8 */
3870         sample = sample > 2UL * slice_idle ? 2UL * slice_idle : sample;
3871         ttime->ttime_samples = (256 + 7 * ttime->ttime_samples) / 8;
3872         ttime->ttime_total = div_u64(256 * sample + 7 * ttime->ttime_total, 8);
3873         ttime->ttime_mean =
3874                 div64_ul(128 + ttime->ttime_total, ttime->ttime_samples);
3875 }
3876
3877 static void
3878 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3879                         struct cfq_io_cq *cic)
3880 {
3881         const u64 idle = cfqd->cfq_slice_idle;
3882         if (cfq_cfqq_sync(cfqq)) {      /* cic and service tree: sync only */
3883                 __cfq_update_io_thinktime(&cic->ttime, idle);
3884                 __cfq_update_io_thinktime(&cfqq->service_tree->ttime, idle);
3885         }
3886 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3887         __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3888 #endif
3889 }
3890
3891 static void
3892 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3893                        struct request *rq)
3894 {
3895         const sector_t prev = cfqq->last_request_pos;
3896         const sector_t pos = blk_rq_pos(rq);
3897         const sector_t nr = blk_rq_sectors(rq);
3898         sector_t dist = 0;
3899         bool seeky;
3900
3901         /* gap between this request and the last recorded position, if any */
3902         if (prev)
3903                 dist = prev < pos ? pos - prev : prev - pos;
3904
3905         /* non-rotational: judge by request size; otherwise by the distance */
3906         seeky = blk_queue_nonrot(cfqd->queue) ?
3907                 nr < CFQQ_SECT_THR_NONROT : dist > CFQQ_SEEK_THR;
3908         cfqq->seek_history = (cfqq->seek_history << 1) | seeky;
3909 }
3910
3911 static inline bool req_noidle(struct request *req)
3912 {
3913         const u64 sync_idle_bits = req->cmd_flags & (REQ_SYNC | REQ_IDLE);
3914         return sync_idle_bits == REQ_SYNC && req_op(req) == REQ_OP_WRITE;
3915 }
3916
3917 /*
3918  * Disable idle window if the process thinks too long or seeks so much that
3919  * it doesn't matter
      *
      * Re-evaluates the idle_window flag of a sync, non-idle-class cfqq and
      * touches the flag only if the decision changed.
3920  */
3921 static void
3922 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3923                        struct cfq_io_cq *cic)
3924 {
3925         int old_idle, enable_idle;
3926
3927         /*
3928          * Don't idle for async or idle io prio class
3929          */
3930         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3931                 return;
3932
             /* keep the current setting unless one of the rules below applies */
3933         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3934
             /* four or more queued requests flag the queue as deep */
3935         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3936                 cfq_mark_cfqq_deep(cfqq);
3937
             /*
              * Idling is turned off if the next request is a noidle one, the io
              * context has no active reference, slice_idle is zero, or the
              * queue is seeky without being deep. Otherwise, once the sample
              * count passes sample_valid(), idle only while the mean think
              * time does not exceed slice_idle.
              */
3938         if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3939                 enable_idle = 0;
3940         else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3941                  !cfqd->cfq_slice_idle ||
3942                  (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3943                 enable_idle = 0;
3944         else if (sample_valid(cic->ttime.ttime_samples)) {
3945                 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3946                         enable_idle = 0;
3947                 else
3948                         enable_idle = 1;
3949         }
3950
3951         if (old_idle != enable_idle) {
3952                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3953                 if (enable_idle)
3954                         cfq_mark_cfqq_idle_window(cfqq);
3955                 else
3956                         cfq_clear_cfqq_idle_window(cfqq);
3957         }
3958 }
3959
3960 /*
3961  * Check if new_cfqq should preempt the currently active queue. Return false
3962  * for no or if we aren't sure, true will cause a preempt.
      *
      * rq is the request that was just queued on new_cfqq. The checks below
      * are evaluated in order and the first one that matches decides.
3963  */
3964 static bool
3965 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3966                    struct request *rq)
3967 {
3968         struct cfq_queue *cfqq;
3969
             /* without an active queue there is nothing to preempt */
3970         cfqq = cfqd->active_queue;
3971         if (!cfqq)
3972                 return false;
3973
             /* an idle class queue never preempts anybody ... */
3974         if (cfq_class_idle(new_cfqq))
3975                 return false;
3976
             /* ... and an active idle class queue is preempted by anybody else */
3977         if (cfq_class_idle(cfqq))
3978                 return true;
3979
3980         /*
3981          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3982          */
3983         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3984                 return false;
3985
3986         /*
3987          * if the new request is sync, but the currently running queue is
3988          * not, let the sync request have priority.
3989          */
3990         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
3991                 return true;
3992
3993         /*
3994          * Treat ancestors of current cgroup the same way as current cgroup.
3995          * For anybody else we disallow preemption to guarantee service
3996          * fairness among cgroups.
3997          */
3998         if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
3999                 return false;
4000
             /* the active queue has used up its slice anyway */
4001         if (cfq_slice_used(cfqq))
4002                 return true;
4003
4004         /*
4005          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
4006          */
4007         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4008                 return true;
4009
4010         WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4011         /* Allow preemption only if we are idling on sync-noidle tree */
4012         if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4013             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4014             RB_EMPTY_ROOT(&cfqq->sort_list))
4015                 return true;
4016
4017         /*
4018          * So both queues are sync. Let the new request get disk time if
4019          * it's a metadata request and the current queue is doing regular IO.
4020          */
4021         if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4022                 return true;
4023
4024         /* An idle queue should not be idle now for some reason */
4025         if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4026                 return true;
4027
             /* the closeness check below only applies while waiting on cfqq */
4028         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4029                 return false;
4030
4031         /*
4032          * if this request is as-good as one we would expect from the
4033          * current cfqq, let it preempt
4034          */
4035         if (cfq_rq_close(cfqd, cfqq, rq))
4036                 return true;
4037
4038         return false;
4039 }
4040
4041 /*
4042  * cfqq preempts the active queue. if we allowed preempt with no slice left,
4043  * let it have half of its nominal slice.
      *
      * cfqq must already be on the service tree (on_rr); it is re-added at the
      * front and starts with a fresh slice.
4044  */
4045 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4046 {
             /* read the active queue's workload type before it is expired */
4047         enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4048
4049         cfq_log_cfqq(cfqd, cfqq, "preempt");
4050         cfq_slice_expired(cfqd, 1);
4051
4052         /*
4053          * workload type is changed, don't save slice, otherwise preempt
4054          * doesn't happen
4055          */
4056         if (old_type != cfqq_type(cfqq))
4057                 cfqq->cfqg->saved_wl_slice = 0;
4058
4059         /*
4060          * Put the new queue at the front of the current list,
4061          * so we know that it will be selected next.
4062          */
4063         BUG_ON(!cfq_cfqq_on_rr(cfqq));
4064
4065         cfq_service_tree_add(cfqd, cfqq, 1);
4066
             /* no slice end yet: the queue is flagged as starting a new slice */
4067         cfqq->slice_end = 0;
4068         cfq_mark_cfqq_slice_new(cfqq);
4069 }
4070
4071 /*
4072  * Called when a new fs request (rq) is added (to cfqq). Check if there's
4073  * something we should do about it
      *
      * Updates the queued-request accounting and the think time, seek and idle
      * window statistics, then either restarts dispatching on the active queue
      * or lets cfqq preempt the active queue.
4074  */
4075 static void
4076 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4077                 struct request *rq)
4078 {
4079         struct cfq_io_cq *cic = RQ_CIC(rq);
4080
4081         cfqd->rq_queued++;
4082         if (rq->cmd_flags & REQ_PRIO)
4083                 cfqq->prio_pending++;
4084
4085         cfq_update_io_thinktime(cfqd, cfqq, cic);
4086         cfq_update_io_seektime(cfqd, cfqq, rq);
4087         cfq_update_idle_window(cfqd, cfqq, cic);
4088
             /* remember where this request ends for the next seek distance */
4089         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4090
4091         if (cfqq == cfqd->active_queue) {
4092                 /*
4093                  * Remember that we saw a request from this process, but
4094                  * don't start queuing just yet. Otherwise we risk seeing lots
4095                  * of tiny requests, because we disrupt the normal plugging
4096                  * and merging. If the request is already larger than a single
4097                  * page, let it rip immediately. For that case we assume that
4098                  * merging is already done. Ditto for a busy system that
4099                  * has other work pending, don't risk delaying until the
4100                  * idle timer unplug to continue working.
4101                  */
4102                 if (cfq_cfqq_wait_request(cfqq)) {
4103                         if (blk_rq_bytes(rq) > PAGE_SIZE ||
4104                             cfqd->busy_queues > 1) {
4105                                 cfq_del_timer(cfqd, cfqq);
4106                                 cfq_clear_cfqq_wait_request(cfqq);
4107                                 __blk_run_queue(cfqd->queue);
4108                         } else {
4109                                 cfqg_stats_update_idle_time(cfqq->cfqg);
4110                                 cfq_mark_cfqq_must_dispatch(cfqq);
4111                         }
4112                 }
4113         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4114                 /*
4115                  * not the active queue - expire current slice if it is
4116                  * idle and has expired its mean thinktime or this new queue
4117                  * has some old slice time left and is of higher priority or
4118                  * this new queue is RT and the current one is BE
4119                  */
4120                 cfq_preempt_queue(cfqd, cfqq);
4121                 __blk_run_queue(cfqd->queue);
4122         }
4123 }
4124
4125 static void cfq_insert_request(struct request_queue *q, struct request *rq)
4126 {
4127         struct cfq_data *cfqd = q->elevator->elevator_data;
4128         struct cfq_queue *cfqq = RQ_CFQQ(rq);
4129         const u64 ttl = cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4130
4131         cfq_log_cfqq(cfqd, cfqq, "insert_request");
4132         cfq_init_prio_data(cfqq, RQ_CIC(rq));
4133         rq->fifo_time = ttl + ktime_get_ns();   /* fifo expiry time of rq */
4134         list_add_tail(&rq->queuelist, &cfqq->fifo);
4135         cfq_add_rq_rb(rq);
4136         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4137                                  rq->cmd_flags);
4138         cfq_rq_enqueued(cfqd, cfqq, rq);
4139 }
4140
4141 /*
4142  * Track the deepest driver queue depth seen and, after 50 samples taken
4143  * under sufficient load, set hw_tag from that peak depth.
4144  */
4145 static void cfq_update_hw_tag(struct cfq_data *cfqd)
4146 {
4147         struct cfq_queue *active = cfqd->active_queue;
4148         int pending;
4149
4150         if (cfqd->hw_tag_est_depth < cfqd->rq_in_driver)
4151                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4152
4153         /* already settled at 1, or too little load to take a sample */
4154         if (cfqd->hw_tag == 1)
4155                 return;
4156         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4157             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4158                 return;
4159
4160         /*
4161          * An active queue that can idle and has too few requests may keep
4162          * cfq from dispatching enough to the hardware; skip the sample.
4163          */
4164         if (active && cfq_cfqq_idle_window(active)) {
4165                 pending = active->dispatched + active->queued[0] +
4166                           active->queued[1];
4167                 if (pending < CFQ_HW_QUEUE_MIN &&
4168                     cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4169                         return;
4170         }
4171
4172         /* after 50 samples, decide from the peak depth observed */
4173         if (cfqd->hw_tag_samples++ < 50)
4174                 return;
4175         cfqd->hw_tag = cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN;
4176 }
4177
4178 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4179 {
             /*
              * Returns true when cfqq is empty, is the only queue in its group,
              * the group's think time is not big, and the slice is already used
              * up or about to run out; returns false otherwise.
              */
4180         struct cfq_io_cq *cic = cfqd->active_cic;
4181         u64 now = ktime_get_ns();
4182
4183         /* If the queue already has requests, don't wait */
4184         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4185                 return false;
4186
4187         /* If there are other queues in the group, don't wait */
4188         if (cfqq->cfqg->nr_cfqq > 1)
4189                 return false;
4190
4191         /* the only queue in the group, but think time is big */
4192         if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4193                 return false;
4194
4195         if (cfq_slice_used(cfqq))
4196                 return true;
4197
4198         /* if slice left is less than think time, wait busy */
4199         if (cic && sample_valid(cic->ttime.ttime_samples)
4200             && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4201                 return true;
4202
4203         /*
4204          * If think time is less than a jiffy then ttime_mean=0 and the above
4205          * will not be true. It might happen that the slice has not expired
4206          * yet but will expire soon (4-5 ns) during select_queue(). To cover
4207          * the case where think time is less than a jiffy, mark the queue wait
4208          * busy if only 1 jiffy is left in the slice.
4209          */
4210         if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4211                 return true;
4212
4213         return false;
4214 }
4215
4216 static void cfq_completed_request(struct request_queue *q, struct request *rq)
4217 {
4218         struct cfq_queue *cfqq = RQ_CFQQ(rq);
4219         struct cfq_data *cfqd = cfqq->cfqd;
4220         const int sync = rq_is_sync(rq);
4221         u64 now = ktime_get_ns();
4222
4223         cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4224
4225         cfq_update_hw_tag(cfqd);
4226
4227         WARN_ON(!cfqd->rq_in_driver);
4228         WARN_ON(!cfqq->dispatched);
4229         cfqd->rq_in_driver--;
4230         cfqq->dispatched--;
4231         (RQ_CFQG(rq))->dispatched--;
4232         cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
4233                                      rq->io_start_time_ns, rq->cmd_flags);
4234
4235         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4236
4237         if (sync) {
4238                 struct cfq_rb_root *st;
4239
4240                 RQ_CIC(rq)->ttime.last_end_request = now;
4241
4242                 if (cfq_cfqq_on_rr(cfqq))
4243                         st = cfqq->service_tree;
4244                 else
4245                         st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4246                                         cfqq_type(cfqq));
4247
4248                 st->ttime.last_end_request = now;
4249                 if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
4250                         cfqd->last_delayed_sync = now;
4251         }
4252
4253 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4254         cfqq->cfqg->ttime.last_end_request = now;
4255 #endif
4256
4257         /*
4258          * If this is the active queue, check if it needs to be expired,
4259          * or if we want to idle in case it has no pending requests.
4260          */
4261         if (cfqd->active_queue == cfqq) {
4262                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4263
4264                 if (cfq_cfqq_slice_new(cfqq)) {
4265                         cfq_set_prio_slice(cfqd, cfqq);
4266                         cfq_clear_cfqq_slice_new(cfqq);
4267                 }
4268
4269                 /*
4270                  * Should we wait for next request to come in before we expire
4271                  * the queue.
4272                  */
4273                 if (cfq_should_wait_busy(cfqd, cfqq)) {
4274                         u64 extend_sl = cfqd->cfq_slice_idle;
4275                         if (!cfqd->cfq_slice_idle)
4276                                 extend_sl = cfqd->cfq_group_idle;
4277                         cfqq->slice_end = now + extend_sl;
4278                         cfq_mark_cfqq_wait_busy(cfqq);
4279                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4280                 }
4281
4282                 /*
4283                  * Idling is not enabled on:
4284                  * - expired queues
4285                  * - idle-priority queues
4286                  * - async queues
4287                  * - queues with still some requests queued
4288                  * - when there is a close cooperator
4289                  */
4290                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4291                         cfq_slice_expired(cfqd, 1);
4292                 else if (sync && cfqq_empty &&
4293                          !cfq_close_cooperator(cfqd, cfqq)) {
4294                         cfq_arm_slice_timer(cfqd);
4295                 }
4296         }
4297
4298         if (!cfqd->rq_in_driver)
4299                 cfq_schedule_dispatch(cfqd);
4300 }
4301
4302 static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4303 {
4304         /*
4305          * If REQ_PRIO is set, boost class and prio level, if it's below
4306          * BE/NORM. If prio is not set, restore the potentially boosted
4307          * class/prio level.
4308          */
4309         if (!(op & REQ_PRIO)) {
4310                 cfqq->ioprio_class = cfqq->org_ioprio_class;
4311                 cfqq->ioprio = cfqq->org_ioprio;
4312         } else {
4313                 if (cfq_class_idle(cfqq))
4314                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
4315                 if (cfqq->ioprio > IOPRIO_NORM)
4316                         cfqq->ioprio = IOPRIO_NORM;
4317         }
4318 }
4319
4320 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4321 {
4322         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4323                 cfq_mark_cfqq_must_alloc_slice(cfqq);
4324                 return ELV_MQUEUE_MUST;
4325         }
4326
4327         return ELV_MQUEUE_MAY;
4328 }
4329
4330 static int cfq_may_queue(struct request_queue *q, unsigned int op)
4331 {
4332         struct cfq_data *cfqd = q->elevator->elevator_data;
4333         struct task_struct *tsk = current;
4334         struct cfq_io_cq *cic;
4335         struct cfq_queue *cfqq;
4336
4337         /*
4338          * don't force setup of a queue from here, as a call to may_queue
4339          * does not necessarily imply that a request actually will be queued.
4340          * so just lookup a possibly existing queue, or return 'may queue'
4341          * if that fails
4342          */
4343         cic = cfq_cic_lookup(cfqd, tsk->io_context);
4344         if (!cic)
4345                 return ELV_MQUEUE_MAY;
4346
4347         cfqq = cic_to_cfqq(cic, op_is_sync(op));
4348         if (cfqq) {
4349                 cfq_init_prio_data(cfqq, cic);
4350                 cfqq_boost_on_prio(cfqq, op);
4351
4352                 return __cfq_may_queue(cfqq);
4353         }
4354
4355         return ELV_MQUEUE_MAY;
4356 }
4357
4358 /*
4359  * queue lock held here
4360  */
4361 static void cfq_put_request(struct request *rq)
4362 {
4363         struct cfq_queue *cfqq = RQ_CFQQ(rq);
4364
4365         if (cfqq) {
4366                 const int rw = rq_data_dir(rq);
4367
4368                 BUG_ON(!cfqq->allocated[rw]);
4369                 cfqq->allocated[rw]--;
4370
4371                 /* Put down rq reference on cfqg */
4372                 cfqg_put(RQ_CFQG(rq));
4373                 rq->elv.priv[0] = NULL;
4374                 rq->elv.priv[1] = NULL;
4375
4376                 cfq_put_queue(cfqq);
4377         }
4378 }
4379
4380 static struct cfq_queue *
4381 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4382                 struct cfq_queue *cfqq)
4383 {
4384         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4385         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4386         cfq_mark_cfqq_coop(cfqq->new_cfqq);
4387         cfq_put_queue(cfqq);
4388         return cic_to_cfqq(cic, 1);
4389 }
4390
4391 /*
4392  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4393  * was the last process referring to said cfqq.
4394  */
4395 static struct cfq_queue *
4396 split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4397 {
4398         if (cfqq_process_refs(cfqq) == 1) {
4399                 cfqq->pid = current->pid;
4400                 cfq_clear_cfqq_coop(cfqq);
4401                 cfq_clear_cfqq_split_coop(cfqq);
4402                 return cfqq;
4403         }
4404
4405         cic_set_cfqq(cic, NULL, 1);
4406
4407         cfq_put_cooperator(cfqq);
4408
4409         cfq_put_queue(cfqq);
4410         return NULL;
4411 }
4412 /*
4413  * Allocate cfq data structures associated with this request.
4414  */
4415 static int
4416 cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4417                 gfp_t gfp_mask)
4418 {
4419         struct cfq_data *cfqd = q->elevator->elevator_data;
4420         struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4421         const int rw = rq_data_dir(rq);
4422         const bool is_sync = rq_is_sync(rq);
4423         struct cfq_queue *cfqq;
4424
4425         spin_lock_irq(q->queue_lock);
4426
4427         check_ioprio_changed(cic, bio);
4428         check_blkcg_changed(cic, bio);
4429 new_queue:
4430         cfqq = cic_to_cfqq(cic, is_sync);
4431         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4432                 if (cfqq)
4433                         cfq_put_queue(cfqq);
4434                 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4435                 cic_set_cfqq(cic, cfqq, is_sync);
4436         } else {
4437                 /*
4438                  * If the queue was seeky for too long, break it apart.
4439                  */
4440                 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4441                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4442                         cfqq = split_cfqq(cic, cfqq);
4443                         if (!cfqq)
4444                                 goto new_queue;
4445                 }
4446
4447                 /*
4448                  * Check to see if this queue is scheduled to merge with
4449                  * another, closely cooperating queue.  The merging of
4450                  * queues happens here as it must be done in process context.
4451                  * The reference on new_cfqq was taken in merge_cfqqs.
4452                  */
4453                 if (cfqq->new_cfqq)
4454                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4455         }
4456
4457         cfqq->allocated[rw]++;
4458
4459         cfqq->ref++;
4460         cfqg_get(cfqq->cfqg);
4461         rq->elv.priv[0] = cfqq;
4462         rq->elv.priv[1] = cfqq->cfqg;
4463         spin_unlock_irq(q->queue_lock);
4464
4465         return 0;
4466 }
4467
4468 static void cfq_kick_queue(struct work_struct *work)
4469 {
4470         struct cfq_data *cfqd =
4471                 container_of(work, struct cfq_data, unplug_work);
4472         struct request_queue *q = cfqd->queue;
4473
4474         spin_lock_irq(q->queue_lock);
4475         __blk_run_queue(cfqd->queue);
4476         spin_unlock_irq(q->queue_lock);
4477 }
4478
4479 /*
4480  * Timer running if the active_queue is currently idling inside its time slice
4481  */
4482 static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4483 {
4484         struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4485                                              idle_slice_timer);
4486         struct cfq_queue *cfqq;
4487         unsigned long flags;
4488         int timed_out = 1;
4489
4490         cfq_log(cfqd, "idle timer fired");
4491
4492         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4493
4494         cfqq = cfqd->active_queue;
4495         if (cfqq) {
4496                 timed_out = 0;
4497
4498                 /*
4499                  * We saw a request before the queue expired, let it through
4500                  */
4501                 if (cfq_cfqq_must_dispatch(cfqq))
4502                         goto out_kick;
4503
4504                 /*
4505                  * expired
4506                  */
4507                 if (cfq_slice_used(cfqq))
4508                         goto expire;
4509
4510                 /*
4511                  * only expire and reinvoke request handler, if there are
4512                  * other queues with pending requests
4513                  */
4514                 if (!cfqd->busy_queues)
4515                         goto out_cont;
4516
4517                 /*
4518                  * not expired and it has a request pending, let it dispatch
4519                  */
4520                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4521                         goto out_kick;
4522
4523                 /*
4524                  * Queue depth flag is reset only when the idle didn't succeed
4525                  */
4526                 cfq_clear_cfqq_deep(cfqq);
4527         }
4528 expire:
4529         cfq_slice_expired(cfqd, timed_out);
4530 out_kick:
4531         cfq_schedule_dispatch(cfqd);
4532 out_cont:
4533         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4534         return HRTIMER_NORESTART;
4535 }
4536
4537 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4538 {
4539         hrtimer_cancel(&cfqd->idle_slice_timer);
4540         cancel_work_sync(&cfqd->unplug_work);
4541 }
4542
4543 static void cfq_exit_queue(struct elevator_queue *e)
4544 {
4545         struct cfq_data *cfqd = e->elevator_data;
4546         struct request_queue *q = cfqd->queue;
4547
4548         cfq_shutdown_timer_wq(cfqd);
4549
4550         spin_lock_irq(q->queue_lock);
4551
4552         if (cfqd->active_queue)
4553                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4554
4555         spin_unlock_irq(q->queue_lock);
4556
4557         cfq_shutdown_timer_wq(cfqd);
4558
4559 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4560         blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4561 #else
4562         kfree(cfqd->root_group);
4563 #endif
4564         kfree(cfqd);
4565 }
4566
4567 static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4568 {
4569         struct cfq_data *cfqd;
4570         struct blkcg_gq *blkg __maybe_unused;
4571         int i, ret;
4572         struct elevator_queue *eq;
4573
4574         eq = elevator_alloc(q, e);
4575         if (!eq)
4576                 return -ENOMEM;
4577
4578         cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4579         if (!cfqd) {
4580                 kobject_put(&eq->kobj);
4581                 return -ENOMEM;
4582         }
4583         eq->elevator_data = cfqd;
4584
4585         cfqd->queue = q;
4586         spin_lock_irq(q->queue_lock);
4587         q->elevator = eq;
4588         spin_unlock_irq(q->queue_lock);
4589
4590         /* Init root service tree */
4591         cfqd->grp_service_tree = CFQ_RB_ROOT;
4592
4593         /* Init root group and prefer root group over other groups by default */
4594 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4595         ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4596         if (ret)
4597                 goto out_free;
4598
4599         cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4600 #else
4601         ret = -ENOMEM;
4602         cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4603                                         GFP_KERNEL, cfqd->queue->node);
4604         if (!cfqd->root_group)
4605                 goto out_free;
4606
4607         cfq_init_cfqg_base(cfqd->root_group);
4608         cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4609         cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4610 #endif
4611
4612         /*
4613          * Not strictly needed (since RB_ROOT just clears the node and we
4614          * zeroed cfqd on alloc), but better be safe in case someone decides
4615          * to add magic to the rb code
4616          */
4617         for (i = 0; i < CFQ_PRIO_LISTS; i++)
4618                 cfqd->prio_trees[i] = RB_ROOT;
4619
4620         /*
4621          * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4622          * Grab a permanent reference to it, so that the normal code flow
4623          * will not attempt to free it.  oom_cfqq is linked to root_group
4624          * but shouldn't hold a reference as it'll never be unlinked.  Lose
4625          * the reference from linking right away.
4626          */
4627         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4628         cfqd->oom_cfqq.ref++;
4629
4630         spin_lock_irq(q->queue_lock);
4631         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4632         cfqg_put(cfqd->root_group);
4633         spin_unlock_irq(q->queue_lock);
4634
4635         hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4636                      HRTIMER_MODE_REL);
4637         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4638
4639         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4640
4641         cfqd->cfq_quantum = cfq_quantum;
4642         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4643         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4644         cfqd->cfq_back_max = cfq_back_max;
4645         cfqd->cfq_back_penalty = cfq_back_penalty;
4646         cfqd->cfq_slice[0] = cfq_slice_async;
4647         cfqd->cfq_slice[1] = cfq_slice_sync;
4648         cfqd->cfq_target_latency = cfq_target_latency;
4649         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4650         cfqd->cfq_slice_idle = cfq_slice_idle;
4651         cfqd->cfq_group_idle = cfq_group_idle;
4652         cfqd->cfq_latency = 1;
4653         cfqd->hw_tag = -1;
4654         /*
4655          * we optimistically start assuming sync ops weren't delayed in last
4656          * second, in order to have larger depth for async operations.
4657          */
4658         cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4659         return 0;
4660
4661 out_free:
4662         kfree(cfqd);
4663         kobject_put(&eq->kobj);
4664         return ret;
4665 }
4666
4667 static void cfq_registered_queue(struct request_queue *q)
4668 {
4669         struct elevator_queue *e = q->elevator;
4670         struct cfq_data *cfqd = e->elevator_data;
4671
4672         /*
4673          * Default to IOPS mode with no idling for SSDs
4674          */
4675         if (blk_queue_nonrot(q))
4676                 cfqd->cfq_slice_idle = 0;
4677         wbt_disable_default(q);
4678 }
4679
4680 /*
4681  * sysfs parts below -->
4682  */
/*
 * Format @var as an unsigned decimal followed by a newline into @page.
 *
 * The value is taken as a full 64-bit quantity. The show helpers hand over a
 * u64, and the microsecond views of tunables that were stored in
 * milliseconds can exceed 32 bits, so a 32-bit parameter would silently
 * truncate what sysfs reports.
 *
 * Returns the number of characters written, excluding the terminating NUL.
 */
static ssize_t
cfq_var_show(unsigned long long var, char *page)
{
	return sprintf(page, "%llu\n", var);
}
4688
4689 static void
4690 cfq_var_store(unsigned int *var, const char *page)
4691 {
4692         char *p = (char *) page;
4693
4694         *var = simple_strtoul(p, &p, 10);
4695 }
4696
4697 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
4698 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4699 {                                                                       \
4700         struct cfq_data *cfqd = e->elevator_data;                       \
4701         u64 __data = __VAR;                                             \
4702         if (__CONV)                                                     \
4703                 __data = div_u64(__data, NSEC_PER_MSEC);                        \
4704         return cfq_var_show(__data, (page));                            \
4705 }
4706 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4707 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4708 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4709 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4710 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4711 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4712 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4713 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4714 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4715 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4716 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4717 SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4718 #undef SHOW_FUNCTION
4719
4720 #define USEC_SHOW_FUNCTION(__FUNC, __VAR)                               \
4721 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4722 {                                                                       \
4723         struct cfq_data *cfqd = e->elevator_data;                       \
4724         u64 __data = __VAR;                                             \
4725         __data = div_u64(__data, NSEC_PER_USEC);                        \
4726         return cfq_var_show(__data, (page));                            \
4727 }
4728 USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4729 USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4730 USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4731 USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4732 USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4733 #undef USEC_SHOW_FUNCTION
4734
4735 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
4736 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4737 {                                                                       \
4738         struct cfq_data *cfqd = e->elevator_data;                       \
4739         unsigned int __data, __min = (MIN), __max = (MAX);              \
4740                                                                         \
4741         cfq_var_store(&__data, (page));                                 \
4742         if (__data < __min)                                             \
4743                 __data = __min;                                         \
4744         else if (__data > __max)                                        \
4745                 __data = __max;                                         \
4746         if (__CONV)                                                     \
4747                 *(__PTR) = (u64)__data * NSEC_PER_MSEC;                 \
4748         else                                                            \
4749                 *(__PTR) = __data;                                      \
4750         return count;                                                   \
4751 }
4752 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4753 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4754                 UINT_MAX, 1);
4755 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4756                 UINT_MAX, 1);
4757 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4758 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4759                 UINT_MAX, 0);
4760 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4761 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4762 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4763 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4764 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4765                 UINT_MAX, 0);
4766 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4767 STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4768 #undef STORE_FUNCTION
4769
4770 #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                    \
4771 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4772 {                                                                       \
4773         struct cfq_data *cfqd = e->elevator_data;                       \
4774         unsigned int __data, __min = (MIN), __max = (MAX);              \
4775                                                                         \
4776         cfq_var_store(&__data, (page));                                 \
4777         if (__data < __min)                                             \
4778                 __data = __min;                                         \
4779         else if (__data > __max)                                        \
4780                 __data = __max;                                         \
4781         *(__PTR) = (u64)__data * NSEC_PER_USEC;                         \
4782         return count;                                                   \
4783 }
4784 USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4785 USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4786 USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4787 USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4788 USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4789 #undef USEC_STORE_FUNCTION
4790
4791 #define CFQ_ATTR(name) \
4792         __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
4793
4794 static struct elv_fs_entry cfq_attrs[] = {
4795         CFQ_ATTR(quantum),
4796         CFQ_ATTR(fifo_expire_sync),
4797         CFQ_ATTR(fifo_expire_async),
4798         CFQ_ATTR(back_seek_max),
4799         CFQ_ATTR(back_seek_penalty),
4800         CFQ_ATTR(slice_sync),
4801         CFQ_ATTR(slice_sync_us),
4802         CFQ_ATTR(slice_async),
4803         CFQ_ATTR(slice_async_us),
4804         CFQ_ATTR(slice_async_rq),
4805         CFQ_ATTR(slice_idle),
4806         CFQ_ATTR(slice_idle_us),
4807         CFQ_ATTR(group_idle),
4808         CFQ_ATTR(group_idle_us),
4809         CFQ_ATTR(low_latency),
4810         CFQ_ATTR(target_latency),
4811         CFQ_ATTR(target_latency_us),
4812         __ATTR_NULL
4813 };
4814
4815 static struct elevator_type iosched_cfq = {
4816         .ops.sq = {
4817                 .elevator_merge_fn =            cfq_merge,
4818                 .elevator_merged_fn =           cfq_merged_request,
4819                 .elevator_merge_req_fn =        cfq_merged_requests,
4820                 .elevator_allow_bio_merge_fn =  cfq_allow_bio_merge,
4821                 .elevator_allow_rq_merge_fn =   cfq_allow_rq_merge,
4822                 .elevator_bio_merged_fn =       cfq_bio_merged,
4823                 .elevator_dispatch_fn =         cfq_dispatch_requests,
4824                 .elevator_add_req_fn =          cfq_insert_request,
4825                 .elevator_activate_req_fn =     cfq_activate_request,
4826                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
4827                 .elevator_completed_req_fn =    cfq_completed_request,
4828                 .elevator_former_req_fn =       elv_rb_former_request,
4829                 .elevator_latter_req_fn =       elv_rb_latter_request,
4830                 .elevator_init_icq_fn =         cfq_init_icq,
4831                 .elevator_exit_icq_fn =         cfq_exit_icq,
4832                 .elevator_set_req_fn =          cfq_set_request,
4833                 .elevator_put_req_fn =          cfq_put_request,
4834                 .elevator_may_queue_fn =        cfq_may_queue,
4835                 .elevator_init_fn =             cfq_init_queue,
4836                 .elevator_exit_fn =             cfq_exit_queue,
4837                 .elevator_registered_fn =       cfq_registered_queue,
4838         },
4839         .icq_size       =       sizeof(struct cfq_io_cq),
4840         .icq_align      =       __alignof__(struct cfq_io_cq),
4841         .elevator_attrs =       cfq_attrs,
4842         .elevator_name  =       "cfq",
4843         .elevator_owner =       THIS_MODULE,
4844 };
4845
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
 * blkcg policy descriptor for cfq, only built with group scheduling support:
 * the control files plus the per-cgroup (cpd_*) and per-group (pd_*)
 * callbacks.
 */
static struct blkcg_policy blkcg_policy_cfq = {
	/* control files */
	.dfl_cftypes		= cfq_blkcg_files,
	.legacy_cftypes		= cfq_blkcg_legacy_files,

	/* per-cgroup policy data */
	.cpd_alloc_fn		= cfq_cpd_alloc,
	.cpd_init_fn		= cfq_cpd_init,
	.cpd_free_fn		= cfq_cpd_free,
	.cpd_bind_fn		= cfq_cpd_bind,

	/* per-group policy data */
	.pd_alloc_fn		= cfq_pd_alloc,
	.pd_init_fn		= cfq_pd_init,
	.pd_offline_fn		= cfq_pd_offline,
	.pd_free_fn		= cfq_pd_free,
	.pd_reset_stats_fn	= cfq_pd_reset_stats,
};
#endif
4863
4864 static int __init cfq_init(void)
4865 {
4866         int ret;
4867
4868 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4869         ret = blkcg_policy_register(&blkcg_policy_cfq);
4870         if (ret)
4871                 return ret;
4872 #else
4873         cfq_group_idle = 0;
4874 #endif
4875
4876         ret = -ENOMEM;
4877         cfq_pool = KMEM_CACHE(cfq_queue, 0);
4878         if (!cfq_pool)
4879                 goto err_pol_unreg;
4880
4881         ret = elv_register(&iosched_cfq);
4882         if (ret)
4883                 goto err_free_pool;
4884
4885         return 0;
4886
4887 err_free_pool:
4888         kmem_cache_destroy(cfq_pool);
4889 err_pol_unreg:
4890 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4891         blkcg_policy_unregister(&blkcg_policy_cfq);
4892 #endif
4893         return ret;
4894 }
4895
4896 static void __exit cfq_exit(void)
4897 {
4898 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4899         blkcg_policy_unregister(&blkcg_policy_cfq);
4900 #endif
4901         elv_unregister(&iosched_cfq);
4902         kmem_cache_destroy(cfq_pool);
4903 }
4904
4905 module_init(cfq_init);
4906 module_exit(cfq_exit);
4907
4908 MODULE_AUTHOR("Jens Axboe");
4909 MODULE_LICENSE("GPL");
4910 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");