// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to segment and merge handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
	*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
	struct bvec_iter iter = bio->bi_iter;
	int idx;

	bio_get_first_bvec(bio, bv);
	if (bv->bv_len == bio->bi_iter.bi_size)
		return;		/* this bio only has a single bvec */

	bio_advance_iter(bio, &iter, iter.bi_size);

	if (!iter.bi_bvec_done)
		idx = iter.bi_idx - 1;
	else	/* in the middle of bvec */
		idx = iter.bi_idx;

	*bv = bio->bi_io_vec[idx];

	/*
	 * iter.bi_bvec_done records actual length of the last bvec
	 * if this bio ends in the middle of one io vector
	 */
	if (iter.bi_bvec_done)
		bv->bv_len = iter.bi_bvec_done;
}
static inline bool bio_will_gap(struct request_queue *q,
		struct request *prev_rq, struct bio *prev, struct bio *next)
{
	struct bio_vec pb, nb;

	if (!bio_has_data(prev) || !queue_virt_boundary(q))
		return false;

	/*
	 * Don't merge if the 1st bio starts with non-zero offset, otherwise it
	 * is quite difficult to respect the sg gap limit. We work hard to
	 * merge a huge number of small single bios in case of mkfs.
	 */
	if (prev_rq)
		bio_get_first_bvec(prev_rq->bio, &pb);
	else
		bio_get_first_bvec(prev, &pb);
	if (pb.bv_offset & queue_virt_boundary(q))
		return true;

	/*
	 * We don't need to worry about the situation that the merged segment
	 * ends in unaligned virt boundary:
	 *
	 * - if 'pb' ends aligned, the merged segment ends aligned
	 * - if 'pb' ends unaligned, the next bio must include
	 *   one single bvec of 'nb', otherwise the 'nb' can't
	 *   merge with 'pb'
	 */
	bio_get_last_bvec(prev, &pb);
	bio_get_first_bvec(next, &nb);
	if (biovec_phys_mergeable(q, &pb, &nb))
		return false;
	return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
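
/*
 * Example: with a virt boundary mask of 0xfff (4K), two bios merge only if
 * the last bvec of the first ends 4K aligned and the first bvec of the
 * second starts 4K aligned, or the two bvecs are physically contiguous and
 * can be merged into one segment.
 */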
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
{
	return bio_will_gap(req->q, req, req->biotail, bio);
}

static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
{
	return bio_will_gap(req->q, NULL, bio, req->bio);
}
/*
 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
 * is defined as 'unsigned int', meanwhile it has to be aligned to the
 * logical block size, which is the minimum accepted unit by hardware.
 */
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
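
/*
 * For example, with a 4096 byte logical block size this evaluates to
 * round_down(4294967295, 4096) >> 9 == 4294963200 >> 9 == 8388600 sectors,
 * i.e. just under 4 GiB of data per bio.
 */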
static struct bio *bio_split_discard(struct bio *bio,
		const struct queue_limits *lim,
		unsigned *nsegs, struct bio_set *bs)
{
	unsigned int max_discard_sectors, granularity;
	sector_t tmp;
	unsigned split_sectors;

	*nsegs = 1;

	/* Zero-sector (unknown) and one-sector granularities are the same. */
	granularity = max(lim->discard_granularity >> 9, 1U);

	max_discard_sectors =
		min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
	max_discard_sectors -= max_discard_sectors % granularity;
	if (unlikely(!max_discard_sectors))
		return NULL;

	if (bio_sectors(bio) <= max_discard_sectors)
		return NULL;

	split_sectors = max_discard_sectors;

	/*
	 * If the next starting sector would be misaligned, stop the discard at
	 * the previous aligned sector.
	 */
	tmp = bio->bi_iter.bi_sector + split_sectors -
		((lim->discard_alignment >> 9) % granularity);
	tmp = sector_div(tmp, granularity);

	if (split_sectors > tmp)
		split_sectors -= tmp;

	return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
		const struct queue_limits *lim,
		unsigned *nsegs, struct bio_set *bs)
{
	*nsegs = 0;
	if (!lim->max_write_zeroes_sectors)
		return NULL;
	if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
		return NULL;
	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
						bool is_atomic)
{
	/*
	 * chunk_sectors must be a multiple of atomic_write_boundary_sectors if
	 * both non-zero.
	 */
	if (is_atomic && lim->atomic_write_boundary_sectors)
		return lim->atomic_write_boundary_sectors;

	return lim->chunk_sectors;
}
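
/*
 * E.g. a device with a 64K atomic write boundary (128 sectors) has its
 * atomic writes split so they never cross a 64K boundary, while all other
 * I/O keeps splitting on chunk_sectors, which must then be a multiple of
 * those 128 sectors.
 */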
/*
 * Return the maximum number of sectors from the start of a bio that may be
 * submitted as a single request to a block device. If enough sectors remain,
 * align the end to the physical block size. Otherwise align the end to the
 * logical block size. This approach minimizes the number of non-aligned
 * requests that are submitted to a block device if the start of a bio is not
 * aligned to a physical block boundary.
 */
static inline unsigned get_max_io_size(struct bio *bio,
				       const struct queue_limits *lim)
{
	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
	bool is_atomic = bio->bi_opf & REQ_ATOMIC;
	unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic);
	unsigned max_sectors, start, end;

	/*
	 * We ignore lim->max_sectors for atomic writes because it may be
	 * less than the actual bio size, which we cannot tolerate.
	 */
	if (is_atomic)
		max_sectors = lim->atomic_write_max_sectors;
	else
		max_sectors = lim->max_sectors;

	if (boundary_sectors) {
		max_sectors = min(max_sectors,
			blk_boundary_sectors_left(bio->bi_iter.bi_sector,
						  boundary_sectors));
	}

	start = bio->bi_iter.bi_sector & (pbs - 1);
	end = (start + max_sectors) & ~(pbs - 1);
	if (end > start)
		return end - start;
	return max_sectors & ~(lbs - 1);
}
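
/*
 * Example: with 4096 byte physical and 512 byte logical blocks (pbs == 8,
 * lbs == 1), a bio starting at sector 3 with max_sectors == 1024 gives
 * start == 3 and end == 1024, so 1021 sectors are returned and the split
 * ends physically aligned at sector 1024.
 */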
/**
 * get_max_segment_size() - maximum number of bytes to add as a single segment
 * @lim: Request queue limits.
 * @paddr: address of the range to add
 * @len: maximum length available to add at @paddr
 *
 * Returns the maximum number of bytes of the range starting at @paddr that can
 * be added to a single segment.
 */
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
		phys_addr_t paddr, unsigned int len)
{
	/*
	 * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
	 * after having calculated the minimum.
	 */
	return min_t(unsigned long, len,
		min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
		    (unsigned long)lim->max_segment_size - 1) + 1);
}
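
/*
 * Example: paddr == 0x1000 with a 64K segment boundary (seg_boundary_mask
 * == 0xffff) leaves 0xefff + 1 == 0xf000 bytes until the boundary. Adding
 * the 1 only after taking the min() also keeps seg_boundary_mask ==
 * ULONG_MAX (i.e. no boundary) from overflowing the sum to 0.
 */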
/**
 * bvec_split_segs - verify whether or not a bvec should be split in the middle
 * @lim:      [in] queue limits to split based on
 * @bv:       [in] bvec to examine
 * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
 *            by the number of segments from @bv that may be appended to that
 *            bio without exceeding @max_segs
 * @bytes:    [in,out] Number of bytes in the bio being built. Incremented
 *            by the number of bytes from @bv that may be appended to that
 *            bio without exceeding @max_bytes
 * @max_segs: [in] upper bound for *@nsegs
 * @max_bytes: [in] upper bound for *@bytes
 *
 * When splitting a bio, it can happen that a bvec is encountered that is too
 * big to fit in a single segment and hence that it has to be split in the
 * middle. This function verifies whether or not that should happen. The value
 * %true is returned if and only if appending the entire @bv to a bio with
 * *@nsegs segments and *@bytes bytes would make that bio unacceptable for the
 * queue limits.
 */
static bool bvec_split_segs(const struct queue_limits *lim,
		const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
		unsigned max_segs, unsigned max_bytes)
{
	unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
	unsigned len = min(bv->bv_len, max_len);
	unsigned total_len = 0;
	unsigned seg_size = 0;

	while (len && *nsegs < max_segs) {
		seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len);

		(*nsegs)++;
		total_len += seg_size;
		len -= seg_size;

		if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
			break;
	}

	*bytes += total_len;

	/* tell the caller to split the bvec if it is too big to fit */
	return len > 0 || bv->bv_len > max_len;
}
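
/*
 * Example: a 128K bvec on a queue with a 64K max_segment_size consumes two
 * segments; if only one segment (or less than 128K of byte budget) is
 * left, the loop stops early and %true tells the caller to split.
 */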
/**
 * bio_split_rw - split a bio in two bios
 * @bio:  [in] bio to be split
 * @lim:  [in] queue limits to split based on
 * @segs: [out] number of segments in the bio with the first half of the sectors
 * @bs:	  [in] bio set to allocate the clone from
 * @max_bytes: [in] maximum number of bytes per bio
 *
 * Clone @bio, update the bi_iter of the clone to represent the first sectors
 * of @bio and update @bio->bi_iter to represent the remaining sectors. The
 * following is guaranteed for the cloned bio:
 * - That it has at most @max_bytes worth of data
 * - That it has at most queue_max_segments(@q) segments.
 *
 * Except for discard requests the cloned bio will point at the bi_io_vec of
 * the original bio. It is the responsibility of the caller to ensure that the
 * original bio is not freed before the cloned bio. The caller is also
 * responsible for ensuring that @bs is only destroyed after processing of the
 * split bio has finished.
 */
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
	struct bio_vec bv, bvprv, *bvprvp = NULL;
	struct bvec_iter iter;
	unsigned nsegs = 0, bytes = 0;

	bio_for_each_bvec(bv, bio, iter) {
		/*
		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, disallow it.
		 */
		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
			goto split;

		if (nsegs < lim->max_segments &&
		    bytes + bv.bv_len <= max_bytes &&
		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
			nsegs++;
			bytes += bv.bv_len;
		} else {
			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
					lim->max_segments, max_bytes))
				goto split;
		}

		bvprv = bv;
		bvprvp = &bvprv;
	}

	*segs = nsegs;
	return NULL;
split:
	if (bio->bi_opf & REQ_ATOMIC) {
		bio->bi_status = BLK_STS_INVAL;
		bio_endio(bio);
		return ERR_PTR(-EINVAL);
	}

	/*
	 * We can't sanely support splitting for a REQ_NOWAIT bio. End it
	 * with EAGAIN if splitting is required and return an error pointer.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_status = BLK_STS_AGAIN;
		bio_endio(bio);
		return ERR_PTR(-EAGAIN);
	}

	*segs = nsegs;

	/*
	 * Individual bvecs might not be logical block aligned. Round down the
	 * split size so that each bio is properly block size aligned, even if
	 * we do not use the full hardware limits.
	 */
	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);

	/*
	 * Bio splitting may cause subtle trouble such as hang when doing sync
	 * iopoll in direct IO routine. Given performance gain of iopoll for
	 * big IO can be trivial, disable iopoll when split needed.
	 */
	bio_clear_polled(bio);
	return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
EXPORT_SYMBOL_GPL(bio_split_rw);
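
/*
 * Note the bio_split_rw() return convention: NULL means no split was
 * needed, an ERR_PTR() means @bio was failed and ended, and any other
 * value is the split-off front part that fits the limits.
 */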
/**
 * __bio_split_to_limits - split a bio to fit the queue limits
 * @bio:     bio to be split
 * @lim:     queue limits to split based on
 * @nr_segs: returns the number of segments in the returned bio
 *
 * Check if @bio needs splitting based on the queue limits, and if so split off
 * a bio fitting the limits from the beginning of @bio and return it. @bio is
 * shortened to the remainder and re-submitted.
 *
 * The split bio is allocated from @q->bio_split, which is provided by the
 * block layer.
 */
struct bio *__bio_split_to_limits(struct bio *bio,
				  const struct queue_limits *lim,
				  unsigned int *nr_segs)
{
	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
	struct bio *split;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
		split = bio_split_discard(bio, lim, nr_segs, bs);
		break;
	case REQ_OP_WRITE_ZEROES:
		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
		break;
	default:
		split = bio_split_rw(bio, lim, nr_segs, bs,
				get_max_io_size(bio, lim) << SECTOR_SHIFT);
		if (IS_ERR(split))
			return NULL;
		break;
	}

	if (split) {
		/* there is no chance to merge the split bio */
		split->bi_opf |= REQ_NOMERGE;

		blkcg_bio_issue_init(split);
		bio_chain(split, bio);
		trace_block_split(split, bio->bi_iter.bi_sector);
		WARN_ON_ONCE(bio_zone_write_plugging(bio));
		submit_bio_noacct(bio);
		return split;
	}
	return bio;
}
/**
 * bio_split_to_limits - split a bio to fit the queue limits
 * @bio: bio to be split
 *
 * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
 * if so split off a bio fitting the limits from the beginning of @bio and
 * return it. @bio is shortened to the remainder and re-submitted.
 *
 * The split bio is allocated from @q->bio_split, which is provided by the
 * block layer.
 */
struct bio *bio_split_to_limits(struct bio *bio)
{
	const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
	unsigned int nr_segs;

	if (bio_may_exceed_limits(bio, lim))
		return __bio_split_to_limits(bio, lim, &nr_segs);
	return bio;
}
EXPORT_SYMBOL(bio_split_to_limits);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
	unsigned int nr_phys_segs = 0;
	unsigned int bytes = 0;
	struct req_iterator iter;
	struct bio_vec bv;

	if (!rq->bio)
		return 0;

	switch (bio_op(rq->bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
		if (queue_max_discard_segments(rq->q) > 1) {
			struct bio *bio = rq->bio;

			for_each_bio(bio)
				nr_phys_segs++;
			return nr_phys_segs;
		}
		return 1;
	case REQ_OP_WRITE_ZEROES:
		return 0;
	default:
		break;
	}

	rq_for_each_bvec(bv, rq, iter)
		bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
				UINT_MAX, UINT_MAX);
	return nr_phys_segs;
}
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
		struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}
static unsigned blk_bvec_map_sg(struct request_queue *q,
		struct bio_vec *bvec, struct scatterlist *sglist,
		struct scatterlist **sg)
{
	unsigned nbytes = bvec->bv_len;
	unsigned nsegs = 0, total = 0;

	while (nbytes > 0) {
		unsigned offset = bvec->bv_offset + total;
		unsigned len = get_max_segment_size(&q->limits,
				bvec_phys(bvec) + total, nbytes);
		struct page *page = bvec->bv_page;

		/*
		 * Unfortunately a fair number of drivers barf on scatterlists
		 * that have an offset larger than PAGE_SIZE, despite other
		 * subsystems dealing with that invariant just fine. For now
		 * stick to the legacy format where we never present those from
		 * the block layer, but the code below should be removed once
		 * these offenders (mostly MMC/SD drivers) are fixed.
		 */
		page += (offset >> PAGE_SHIFT);
		offset &= ~PAGE_MASK;

		*sg = blk_next_sg(sg, sglist);
		sg_set_page(*sg, page, len, offset);

		total += len;
		nbytes -= len;
		nsegs++;
	}

	return nsegs;
}
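
/*
 * Example of the legacy fixup above: a bvec with bv_offset == 5000 on 4K
 * pages is presented as bv_page + 1 with offset 904 (5000 - 4096), so no
 * sg entry ever carries an offset >= PAGE_SIZE.
 */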
static inline int __blk_bvec_map_sg(struct bio_vec bv,
		struct scatterlist *sglist, struct scatterlist **sg)
{
	*sg = blk_next_sg(sg, sglist);
	sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
	return 1;
}
/* only try to merge bvecs into one sg if they are from two bios */
static inline bool
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
			   struct bio_vec *bvprv, struct scatterlist **sg)
{
	int nbytes = bvec->bv_len;

	if (!*sg)
		return false;

	if ((*sg)->length + nbytes > queue_max_segment_size(q))
		return false;

	if (!biovec_phys_mergeable(q, bvprv, bvec))
		return false;

	(*sg)->length += nbytes;

	return true;
}
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
			     struct scatterlist *sglist,
			     struct scatterlist **sg)
{
	struct bio_vec bvec, bvprv = { NULL };
	struct bvec_iter iter;
	int nsegs = 0;
	bool new_bio = false;

	for_each_bio(bio) {
		bio_for_each_bvec(bvec, bio, iter) {
			/*
			 * Only try to merge bvecs from two bios given we
			 * have done bio internal merge when adding pages
			 * to bio
			 */
			if (new_bio &&
			    __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
				goto next_bvec;

			if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
				nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
			else
				nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
 next_bvec:
			new_bio = false;
		}
		if (likely(bio->bi_iter.bi_size)) {
			bvprv = bvec;
			new_bio = true;
		}
	}

	return nsegs;
}
/*
 * map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries
 */
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg)
{
	int nsegs = 0;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
	else if (rq->bio)
		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have been wrong if the figured number of
	 * segment is bigger than number of req's physical segments
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
						  sector_t offset)
{
	struct request_queue *q = rq->q;
	struct queue_limits *lim = &q->limits;
	unsigned int max_sectors, boundary_sectors;
	bool is_atomic = rq->cmd_flags & REQ_ATOMIC;

	if (blk_rq_is_passthrough(rq))
		return q->limits.max_hw_sectors;

	boundary_sectors = blk_boundary_sectors(lim, is_atomic);
	max_sectors = blk_queue_get_max_sectors(rq);

	if (!boundary_sectors ||
	    req_op(rq) == REQ_OP_DISCARD ||
	    req_op(rq) == REQ_OP_SECURE_ERASE)
		return max_sectors;
	return min(max_sectors,
		   blk_boundary_sectors_left(offset, boundary_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
		unsigned int nr_phys_segs)
{
	if (!blk_cgroup_mergeable(req, bio))
		goto no_merge;

	if (blk_integrity_merge_bio(req->q, req, bio) == false)
		goto no_merge;

	/* discard request merge won't add new segment */
	if (req_op(req) == REQ_OP_DISCARD)
		return 1;

	if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
		goto no_merge;

	/*
	 * This will form the start of a new hw segment. Bump both
	 * counters.
	 */
	req->nr_phys_segments += nr_phys_segs;
	return 1;

no_merge:
	req_set_nomerge(req->q, req);
	return 0;
}
int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
	if (req_gap_back_merge(req, bio))
		return 0;
	if (blk_integrity_rq(req) &&
	    integrity_req_gap_back_merge(req, bio))
		return 0;
	if (!bio_crypt_ctx_back_mergeable(req, bio))
		return 0;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
		req_set_nomerge(req->q, req);
		return 0;
	}

	return ll_new_hw_segment(req, bio, nr_segs);
}
static int ll_front_merge_fn(struct request *req, struct bio *bio,
		unsigned int nr_segs)
{
	if (req_gap_front_merge(req, bio))
		return 0;
	if (blk_integrity_rq(req) &&
	    integrity_req_gap_front_merge(req, bio))
		return 0;
	if (!bio_crypt_ctx_front_mergeable(req, bio))
		return 0;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
		req_set_nomerge(req->q, req);
		return 0;
	}

	return ll_new_hw_segment(req, bio, nr_segs);
}
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
		struct request *next)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);

	if (segments >= queue_max_discard_segments(q))
		goto no_merge;
	if (blk_rq_sectors(req) + bio_sectors(next->bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		goto no_merge;

	req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
	return true;
no_merge:
	req_set_nomerge(q, req);
	return false;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
				struct request *next)
{
	int total_phys_segments;

	if (req_gap_back_merge(req, next->bio))
		return 0;

	/*
	 * Will it become too large?
	 */
	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		return 0;

	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
	if (total_phys_segments > blk_rq_get_max_segments(req))
		return 0;

	if (!blk_cgroup_mergeable(req, next->bio))
		return 0;

	if (blk_integrity_merge_rq(q, req, next) == false)
		return 0;

	if (!bio_crypt_ctx_merge_rq(req, next))
		return 0;

	/* Merge is OK... */
	req->nr_phys_segments = total_phys_segments;
	return 1;
}
/**
 * blk_rq_set_mixed_merge - mark a request as mixed merge
 * @rq: request to mark as mixed merge
 *
 * Description:
 *     @rq is about to be mixed merged. Make sure the attributes
 *     which can be mixed are set in each bio and mark @rq as mixed
 *     merged.
 */
static void blk_rq_set_mixed_merge(struct request *rq)
{
	blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
	struct bio *bio;

	if (rq->rq_flags & RQF_MIXED_MERGE)
		return;

	/*
	 * @rq will no longer represent mixable attributes for all the
	 * contained bios. It will just track those of the first one.
	 * Distribute the attributes to each bio.
	 */
	for (bio = rq->bio; bio; bio = bio->bi_next) {
		WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
			     (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
		bio->bi_opf |= ff;
	}
	rq->rq_flags |= RQF_MIXED_MERGE;
}
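
/*
 * For merging purposes a readahead bio is treated as if all failfast bits
 * were set, matching how blk_update_mixed_merge() below marks any new RA
 * bio as failfast once a request has become a mixed merge.
 */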
static inline blk_opf_t bio_failfast(const struct bio *bio)
{
	if (bio->bi_opf & REQ_RAHEAD)
		return REQ_FAILFAST_MASK;

	return bio->bi_opf & REQ_FAILFAST_MASK;
}
/*
 * After we are marked as MIXED_MERGE, any new RA bio has to be updated
 * as failfast, and the request's failfast has to be updated in case of
 * front merge.
 */
static inline void blk_update_mixed_merge(struct request *req,
		struct bio *bio, bool front_merge)
{
	if (req->rq_flags & RQF_MIXED_MERGE) {
		if (bio->bi_opf & REQ_RAHEAD)
			bio->bi_opf |= REQ_FAILFAST_MASK;

		if (front_merge) {
			req->cmd_flags &= ~REQ_FAILFAST_MASK;
			req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK;
		}
	}
}
static void blk_account_io_merge_request(struct request *req)
{
	if (blk_do_io_stat(req)) {
		part_stat_lock();
		part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
		part_stat_local_dec(req->part,
				    in_flight[op_is_write(req_op(req))]);
		part_stat_unlock();
	}
}
static enum elv_merge blk_try_req_merge(struct request *req,
					struct request *next)
{
	if (blk_discard_mergable(req))
		return ELEVATOR_DISCARD_MERGE;
	else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
		return ELEVATOR_BACK_MERGE;

	return ELEVATOR_NO_MERGE;
}
static bool blk_atomic_write_mergeable_rq_bio(struct request *rq,
					      struct bio *bio)
{
	return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC);
}

static bool blk_atomic_write_mergeable_rqs(struct request *rq,
					   struct request *next)
{
	return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
}
/*
 * For non-mq, this has to be called with the request spinlock acquired.
 * For mq with scheduling, the appropriate queue wide lock should be held.
 */
static struct request *attempt_merge(struct request_queue *q,
				     struct request *req, struct request *next)
{
	if (!rq_mergeable(req) || !rq_mergeable(next))
		return NULL;

	if (req_op(req) != req_op(next))
		return NULL;

	if (rq_data_dir(req) != rq_data_dir(next))
		return NULL;

	/* Don't merge requests with different write hints. */
	if (req->write_hint != next->write_hint)
		return NULL;

	if (req->ioprio != next->ioprio)
		return NULL;

	if (!blk_atomic_write_mergeable_rqs(req, next))
		return NULL;

	/*
	 * If we are allowed to merge, then append bio list
	 * from next to rq and release next. merge_requests_fn
	 * will have updated segment counts, update sector
	 * counts here. Handle DISCARDs separately, as they
	 * have separate settings.
	 */

	switch (blk_try_req_merge(req, next)) {
	case ELEVATOR_DISCARD_MERGE:
		if (!req_attempt_discard_merge(q, req, next))
			return NULL;
		break;
	case ELEVATOR_BACK_MERGE:
		if (!ll_merge_requests_fn(q, req, next))
			return NULL;
		break;
	default:
		return NULL;
	}

	/*
	 * If failfast settings disagree or any of the two is already
	 * a mixed merge, mark both as mixed before proceeding. This
	 * makes sure that all involved bios have mixable attributes
	 * set properly.
	 */
	if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) ||
	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
		blk_rq_set_mixed_merge(req);
		blk_rq_set_mixed_merge(next);
	}

	/*
	 * At this point we have either done a back merge or front merge. We
	 * need the smaller start_time_ns of the merged requests to be the
	 * current request for accounting purposes.
	 */
	if (next->start_time_ns < req->start_time_ns)
		req->start_time_ns = next->start_time_ns;

	req->biotail->bi_next = next->bio;
	req->biotail = next->biotail;

	req->__data_len += blk_rq_bytes(next);

	if (!blk_discard_mergable(req))
		elv_merge_requests(q, req, next);

	blk_crypto_rq_put_keyslot(next);

	/*
	 * 'next' is going away, so update stats accordingly
	 */
	blk_account_io_merge_request(next);

	trace_block_rq_merge(next);

	/*
	 * ownership of bio passed from next to req, return 'next' for
	 * the caller to free
	 */
	next->bio = NULL;
	return next;
}
static struct request *attempt_back_merge(struct request_queue *q,
		struct request *rq)
{
	struct request *next = elv_latter_request(q, rq);

	if (next)
		return attempt_merge(q, rq, next);

	return NULL;
}

static struct request *attempt_front_merge(struct request_queue *q,
		struct request *rq)
{
	struct request *prev = elv_former_request(q, rq);

	if (prev)
		return attempt_merge(q, prev, rq);

	return NULL;
}
/*
 * Try to merge 'next' into 'rq'. Return true if the merge happened, false
 * otherwise. The caller is responsible for freeing 'next' if the merge
 * happened.
 */
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
			   struct request *next)
{
	return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
	if (!rq_mergeable(rq) || !bio_mergeable(bio))
		return false;

	if (req_op(rq) != bio_op(bio))
		return false;

	/* different data direction or already started, don't merge */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return false;

	/* don't merge across cgroup boundaries */
	if (!blk_cgroup_mergeable(rq, bio))
		return false;

	/* only merge integrity protected bio into ditto rq */
	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
		return false;

	/* Only merge if the crypt contexts are compatible */
	if (!bio_crypt_rq_ctx_compatible(rq, bio))
		return false;

	/* Don't merge requests with different write hints. */
	if (rq->write_hint != bio->bi_write_hint)
		return false;

	if (rq->ioprio != bio_prio(bio))
		return false;

	if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
		return false;

	return true;
}
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
	if (blk_discard_mergable(rq))
		return ELEVATOR_DISCARD_MERGE;
	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
		return ELEVATOR_BACK_MERGE;
	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
		return ELEVATOR_FRONT_MERGE;
	return ELEVATOR_NO_MERGE;
}
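
/*
 * Example: for a request covering sectors [100, 200), a bio starting at
 * sector 200 is a back merge candidate and a 10 sector bio starting at
 * sector 90 is a front merge candidate; any other position cannot merge.
 */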
static void blk_account_io_merge_bio(struct request *req)
{
	if (!blk_do_io_stat(req))
		return;

	part_stat_lock();
	part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
	part_stat_unlock();
}
enum bio_merge_status bio_attempt_back_merge(struct request *req,
		struct bio *bio, unsigned int nr_segs)
{
	const blk_opf_t ff = bio_failfast(bio);

	if (!ll_back_merge_fn(req, bio, nr_segs))
		return BIO_MERGE_FAILED;

	trace_block_bio_backmerge(bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	blk_update_mixed_merge(req, bio, false);

	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
		blk_zone_write_plug_bio_merged(bio);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_free_ctx(bio);

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
		struct bio *bio, unsigned int nr_segs)
{
	const blk_opf_t ff = bio_failfast(bio);

	/*
	 * A front merge for writes to sequential zones of a zoned block device
	 * can happen only if the user submitted writes out of order. Do not
	 * merge such write to let it fail.
	 */
	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
		return BIO_MERGE_FAILED;

	if (!ll_front_merge_fn(req, bio, nr_segs))
		return BIO_MERGE_FAILED;

	trace_block_bio_frontmerge(bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	blk_update_mixed_merge(req, bio, true);

	bio->bi_next = req->bio;
	req->bio = bio;

	req->__sector = bio->bi_iter.bi_sector;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_do_front_merge(req, bio);

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
		struct request *req, struct bio *bio)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);

	if (segments >= queue_max_discard_segments(q))
		goto no_merge;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		goto no_merge;

	rq_qos_merge(q, req, bio);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;
	req->nr_phys_segments = segments + 1;

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
no_merge:
	req_set_nomerge(q, req);
	return BIO_MERGE_FAILED;
}
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
						   struct request *rq,
						   struct bio *bio,
						   unsigned int nr_segs,
						   bool sched_allow_merge)
{
	if (!blk_rq_merge_ok(rq, bio))
		return BIO_MERGE_NONE;

	switch (blk_try_merge(rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
			return bio_attempt_back_merge(rq, bio, nr_segs);
		break;
	case ELEVATOR_FRONT_MERGE:
		if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
			return bio_attempt_front_merge(rq, bio, nr_segs);
		break;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return BIO_MERGE_NONE;
	}

	return BIO_MERGE_FAILED;
}
/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 *
 * Determine whether @bio being queued on @q can be merged with the previous
 * request on %current's plugged list. Returns %true if merge was successful,
 * otherwise %false.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock. As such it's more of an issuing mechanism
 * than scheduling, and the request, while may have elvpriv data, is not
 * added on the elevator at this point. In addition, we don't have
 * reliable access to the elevator outside queue lock. Only check basic
 * merging parameters without querying the elevator.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug || rq_list_empty(plug->mq_list))
		return false;

	rq_list_for_each(&plug->mq_list, rq) {
		if (rq->q == q) {
			if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
			    BIO_MERGE_OK)
				return true;
			break;
		}

		/*
		 * Only keep iterating plug list for merges if we have multiple
		 * queues
		 */
		if (!plug->multiple_queues)
			break;
	}
	return false;
}
/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
			struct bio *bio, unsigned int nr_segs)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		if (!checked--)
			break;

		switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) {
		case BIO_MERGE_NONE:
			continue;
		case BIO_MERGE_OK:
			return true;
		case BIO_MERGE_FAILED:
			return false;
		}
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs, struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);