block/blk-barrier.c

   1 /*
   2  * Functions related to barrier IO handling
   3  */
   4 #include <linux/kernel.h>
   5 #include <linux/module.h>
   6 #include <linux/bio.h>
   7 #include <linux/blkdev.h>
   8
   9 #include "blk.h"
  10
  11 /**
  12  * blk_queue_ordered - does this queue support ordered writes
  13  * @q:        the request queue
  14  * @ordered:  one of QUEUE_ORDERED_*
  15  * @prepare_flush_fn: rq setup helper for cache flush ordered writes
  16  *
  17  * Description:
  18  *   For journalled file systems, doing ordered writes on a commit
  19  *   block instead of explicitly doing wait_on_buffer (which is bad
  20  *   for performance) can be a big win. Block drivers supporting this
  21  *   feature should call this function and indicate so.
  22  *
  23  **/
  24 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
  25                       prepare_flush_fn *prepare_flush_fn)
  26 {
  27         if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
  28                                              QUEUE_ORDERED_DO_POSTFLUSH))) {
  29                 printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
  30                 return -EINVAL;
  31         }
  32
  33         if (ordered != QUEUE_ORDERED_NONE &&
  34             ordered != QUEUE_ORDERED_DRAIN &&
  35             ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
  36             ordered != QUEUE_ORDERED_DRAIN_FUA &&
  37             ordered != QUEUE_ORDERED_TAG &&
  38             ordered != QUEUE_ORDERED_TAG_FLUSH &&
  39             ordered != QUEUE_ORDERED_TAG_FUA) {
  40                 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
  41                 return -EINVAL;
  42         }
  43
  44         q->ordered = ordered;
  45         q->next_ordered = ordered;
  46         q->prepare_flush_fn = prepare_flush_fn;
  47
  48         return 0;
  49 }
  50 EXPORT_SYMBOL(blk_queue_ordered);
  51
  52 /*
  53  * Cache flushing for ordered writes handling
  54  */
  55 unsigned blk_ordered_cur_seq(struct request_queue *q)
  56 {
  57         if (!q->ordseq)
  58                 return 0;
  59         return 1 << ffz(q->ordseq);
  60 }
  61
  62 unsigned blk_ordered_req_seq(struct request *rq)
  63 {
  64         struct request_queue *q = rq->q;
  65
  66         BUG_ON(q->ordseq == 0);
  67
  68         if (rq == &q->pre_flush_rq)
  69                 return QUEUE_ORDSEQ_PREFLUSH;
  70         if (rq == &q->bar_rq)
  71                 return QUEUE_ORDSEQ_BAR;
  72         if (rq == &q->post_flush_rq)
  73                 return QUEUE_ORDSEQ_POSTFLUSH;
  74
  75         /*
  76          * !fs requests don't need to follow barrier ordering.  Always
  77          * put them at the front.  This fixes the following deadlock.
  78          *
  79          * http://thread.gmane.org/gmane.linux.kernel/537473
  80          */
  81         if (!blk_fs_request(rq))
  82                 return QUEUE_ORDSEQ_DRAIN;
  83
  84         if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
  85             (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
  86                 return QUEUE_ORDSEQ_DRAIN;
  87         else
  88                 return QUEUE_ORDSEQ_DONE;
  89 }
  90
  91 bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
  92 {
  93         struct request *rq;
  94
  95         if (error && !q->orderr)
  96                 q->orderr = error;
  97
  98         BUG_ON(q->ordseq & seq);
  99         q->ordseq |= seq;
 100
 101         if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
 102                 return false;
 103
 104         /*
 105          * Okay, sequence complete.
 106          */
 107         q->ordseq = 0;
 108         rq = q->orig_bar_rq;
 109         __blk_end_request_all(rq, q->orderr);
 110         return true;
 111 }
 112
 113 static void pre_flush_end_io(struct request *rq, int error)
 114 {
 115         elv_completed_request(rq->q, rq);
 116         blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
 117 }
 118
 119 static void bar_end_io(struct request *rq, int error)
 120 {
 121         elv_completed_request(rq->q, rq);
 122         blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
 123 }
 124
 125 static void post_flush_end_io(struct request *rq, int error)
 126 {
 127         elv_completed_request(rq->q, rq);
 128         blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
 129 }
 130
 131 static void queue_flush(struct request_queue *q, unsigned which)
 132 {
 133         struct request *rq;
 134         rq_end_io_fn *end_io;
 135
 136         if (which == QUEUE_ORDERED_DO_PREFLUSH) {
 137                 rq = &q->pre_flush_rq;
 138                 end_io = pre_flush_end_io;
 139         } else {
 140                 rq = &q->post_flush_rq;
 141                 end_io = post_flush_end_io;
 142         }
 143
 144         blk_rq_init(q, rq);
 145         rq->cmd_flags = REQ_HARDBARRIER;
 146         rq->rq_disk = q->bar_rq.rq_disk;
 147         rq->end_io = end_io;
 148         q->prepare_flush_fn(q, rq);
 149
 150         elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 151 }
 152
 153 static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 154 {
 155         struct request *rq = *rqp;
 156         unsigned skip = 0;
 157
 158         q->orderr = 0;
 159         q->ordered = q->next_ordered;
 160         q->ordseq |= QUEUE_ORDSEQ_STARTED;
 161
 162         /*
 163          * For an empty barrier, there's no actual BAR request, which
 164          * in turn makes POSTFLUSH unnecessary.  Mask them off.
 165          */
 166         if (!blk_rq_sectors(rq)) {
 167                 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 168                                 QUEUE_ORDERED_DO_POSTFLUSH);
 169                 /*
 170                  * Empty barrier on a write-through device w/ ordered
 171                  * tag has no command to issue and without any command
 172                  * to issue, ordering by tag can't be used.  Drain
 173                  * instead.
 174                  */
 175                 if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
 176                     !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
 177                         q->ordered &= ~QUEUE_ORDERED_BY_TAG;
 178                         q->ordered |= QUEUE_ORDERED_BY_DRAIN;
 179                 }
 180         }
 181
 182         /* stash away the original request */
 183         blk_dequeue_request(rq);
 184         q->orig_bar_rq = rq;
 185         rq = NULL;
 186
 187         /*
 188          * Queue ordered sequence.  As we stack them at the head, we
 189          * need to queue in reverse order.  Note that we rely on that
 190          * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
 191          * request gets inbetween ordered sequence.
 192          */
 193         if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
 194                 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 195                 rq = &q->post_flush_rq;
 196         } else
 197                 skip |= QUEUE_ORDSEQ_POSTFLUSH;
 198
 199         if (q->ordered & QUEUE_ORDERED_DO_BAR) {
 200                 rq = &q->bar_rq;
 201
 202                 /* initialize proxy request and queue it */
 203                 blk_rq_init(q, rq);
 204                 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
 205                         rq->cmd_flags |= REQ_RW;
 206                 if (q->ordered & QUEUE_ORDERED_DO_FUA)
 207                         rq->cmd_flags |= REQ_FUA;
 208                 init_request_from_bio(rq, q->orig_bar_rq->bio);
 209                 rq->end_io = bar_end_io;
 210
 211                 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 212         } else
 213                 skip |= QUEUE_ORDSEQ_BAR;
 214
 215         if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
 216                 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
 217                 rq = &q->pre_flush_rq;
 218         } else
 219                 skip |= QUEUE_ORDSEQ_PREFLUSH;
 220
 221         if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
 222                 rq = NULL;
 223         else
 224                 skip |= QUEUE_ORDSEQ_DRAIN;
 225
 226         *rqp = rq;
 227
 228         /*
 229          * Complete skipped sequences.  If whole sequence is complete,
 230          * return false to tell elevator that this request is gone.
 231          */
 232         return !blk_ordered_complete_seq(q, skip, 0);
 233 }
 234
 235 bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 236 {
 237         struct request *rq = *rqp;
 238         const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 239
 240         if (!q->ordseq) {
 241                 if (!is_barrier)
 242                         return true;
 243
 244                 if (q->next_ordered != QUEUE_ORDERED_NONE)
 245                         return start_ordered(q, rqp);
 246                 else {
 247                         /*
 248                          * Queue ordering not supported.  Terminate
 249                          * with prejudice.
 250                          */
 251                         blk_dequeue_request(rq);
 252                         __blk_end_request_all(rq, -EOPNOTSUPP);
 253                         *rqp = NULL;
 254                         return false;
 255                 }
 256         }
 257
 258         /*
 259          * Ordered sequence in progress
 260          */
 261
 262         /* Special requests are not subject to ordering rules. */
 263         if (!blk_fs_request(rq) &&
 264             rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 265                 return true;
 266
 267         if (q->ordered & QUEUE_ORDERED_BY_TAG) {
 268                 /* Ordered by tag.  Blocking the next barrier is enough. */
 269                 if (is_barrier && rq != &q->bar_rq)
 270                         *rqp = NULL;
 271         } else {
 272                 /* Ordered by draining.  Wait for turn. */
 273                 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
 274                 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
 275                         *rqp = NULL;
 276         }
 277
 278         return true;
 279 }
 280
 281 static void bio_end_empty_barrier(struct bio *bio, int err)
 282 {
 283         if (err) {
 284                 if (err == -EOPNOTSUPP)
 285                         set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 286                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
 287         }
 288
 289         complete(bio->bi_private);
 290 }
 291
 292 /**
 293  * blkdev_issue_flush - queue a flush
 294  * @bdev:       blockdev to issue flush for
 295  * @error_sector:       error sector
 296  *
 297  * Description:
 298  *    Issue a flush for the block device in question. Caller can supply
 299  *    room for storing the error offset in case of a flush error, if they
 300  *    wish to.
 301  */
 302 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 303 {
 304         DECLARE_COMPLETION_ONSTACK(wait);
 305         struct request_queue *q;
 306         struct bio *bio;
 307         int ret;
 308
 309         if (bdev->bd_disk == NULL)
 310                 return -ENXIO;
 311
 312         q = bdev_get_queue(bdev);
 313         if (!q)
 314                 return -ENXIO;
 315
 316         bio = bio_alloc(GFP_KERNEL, 0);
 317         bio->bi_end_io = bio_end_empty_barrier;
 318         bio->bi_private = &wait;
 319         bio->bi_bdev = bdev;
 320         submit_bio(WRITE_BARRIER, bio);
 321
 322         wait_for_completion(&wait);
 323
 324         /*
 325          * The driver must store the error location in ->bi_sector, if
 326          * it supports it. For non-stacked drivers, this should be copied
 327          * from blk_rq_pos(rq).
 328          */
 329         if (error_sector)
 330                 *error_sector = bio->bi_sector;
 331
 332         ret = 0;
 333         if (bio_flagged(bio, BIO_EOPNOTSUPP))
 334                 ret = -EOPNOTSUPP;
 335         else if (!bio_flagged(bio, BIO_UPTODATE))
 336                 ret = -EIO;
 337
 338         bio_put(bio);
 339         return ret;
 340 }
 341 EXPORT_SYMBOL(blkdev_issue_flush);
 342
 343 static void blkdev_discard_end_io(struct bio *bio, int err)
 344 {
 345         if (err) {
 346                 if (err == -EOPNOTSUPP)
 347                         set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 348                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
 349         }
 350
 351         if (bio->bi_private)
 352                 complete(bio->bi_private);
 353         __free_page(bio_page(bio));
 354
 355         bio_put(bio);
 356 }
 357
 358 /**
 359  * blkdev_issue_discard - queue a discard
 360  * @bdev:       blockdev to issue discard for
 361  * @sector:     start sector
 362  * @nr_sects:   number of sectors to discard
 363  * @gfp_mask:   memory allocation flags (for bio_alloc)
 364  * @flags:      DISCARD_FL_* flags to control behaviour
 365  *
 366  * Description:
 367  *    Issue a discard request for the sectors in question.
 368  */
 369 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 370                 sector_t nr_sects, gfp_t gfp_mask, int flags)
 371 {
 372         DECLARE_COMPLETION_ONSTACK(wait);
 373         struct request_queue *q = bdev_get_queue(bdev);
 374         int type = flags & DISCARD_FL_BARRIER ?
 375                 DISCARD_BARRIER : DISCARD_NOBARRIER;
 376         struct bio *bio;
 377         struct page *page;
 378         int ret = 0;
 379
 380         if (!q)
 381                 return -ENXIO;
 382
 383         if (!blk_queue_discard(q))
 384                 return -EOPNOTSUPP;
 385
 386         while (nr_sects && !ret) {
 387                 unsigned int sector_size = q->limits.logical_block_size;
 388                 unsigned int max_discard_sectors =
 389                         min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 390
 391                 bio = bio_alloc(gfp_mask, 1);
 392                 if (!bio)
 393                         goto out;
 394                 bio->bi_sector = sector;
 395                 bio->bi_end_io = blkdev_discard_end_io;
 396                 bio->bi_bdev = bdev;
 397                 if (flags & DISCARD_FL_WAIT)
 398                         bio->bi_private = &wait;
 399
 400                 /*
 401                  * Add a zeroed one-sector payload as that's what
 402                  * our current implementations need.  If we'll ever need
 403                  * more the interface will need revisiting.
 404                  */
 405                 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 406                 if (!page)
 407                         goto out_free_bio;
 408                 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
 409                         goto out_free_page;
 410
 411                 /*
 412                  * And override the bio size - the way discard works we
 413                  * touch many more blocks on disk than the actual payload
 414                  * length.
 415                  */
 416                 if (nr_sects > max_discard_sectors) {
 417                         bio->bi_size = max_discard_sectors << 9;
 418                         nr_sects -= max_discard_sectors;
 419                         sector += max_discard_sectors;
 420                 } else {
 421                         bio->bi_size = nr_sects << 9;
 422                         nr_sects = 0;
 423                 }
 424
 425                 bio_get(bio);
 426                 submit_bio(type, bio);
 427
 428                 if (flags & DISCARD_FL_WAIT)
 429                         wait_for_completion(&wait);
 430
 431                 if (bio_flagged(bio, BIO_EOPNOTSUPP))
 432                         ret = -EOPNOTSUPP;
 433                 else if (!bio_flagged(bio, BIO_UPTODATE))
 434                         ret = -EIO;
 435                 bio_put(bio);
 436         }
 437         return ret;
 438 out_free_page:
 439         __free_page(page);
 440 out_free_bio:
 441         bio_put(bio);
 442 out:
 443         return -ENOMEM;
 444 }
 445 EXPORT_SYMBOL(blkdev_issue_discard);