// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to segment and merge handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
	*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
	struct bvec_iter iter = bio->bi_iter;
	int idx;

	bio_get_first_bvec(bio, bv);
	if (bv->bv_len == bio->bi_iter.bi_size)
		return;		/* this bio only has a single bvec */

	bio_advance_iter(bio, &iter, iter.bi_size);

	if (!iter.bi_bvec_done)
		idx = iter.bi_idx - 1;
	else	/* in the middle of bvec */
		idx = iter.bi_idx;

	*bv = bio->bi_io_vec[idx];

	/*
	 * iter.bi_bvec_done records actual length of the last bvec
	 * if this bio ends in the middle of one io vector
	 */
	if (iter.bi_bvec_done)
		bv->bv_len = iter.bi_bvec_done;
}
static inline bool bio_will_gap(struct request_queue *q,
		struct request *prev_rq, struct bio *prev, struct bio *next)
{
	struct bio_vec pb, nb;

	if (!bio_has_data(prev) || !queue_virt_boundary(q))
		return false;

	/*
	 * Don't merge if the 1st bio starts with non-zero offset, otherwise it
	 * is quite difficult to respect the sg gap limit. We work hard to
	 * merge a huge number of small single bios in case of mkfs.
	 */
	if (prev_rq)
		bio_get_first_bvec(prev_rq->bio, &pb);
	else
		bio_get_first_bvec(prev, &pb);
	if (pb.bv_offset & queue_virt_boundary(q))
		return true;

	/*
	 * We don't need to worry about the situation that the merged segment
	 * ends in unaligned virt boundary:
	 *
	 * - if 'pb' ends aligned, the merged segment ends aligned
	 * - if 'pb' ends unaligned, the next bio must include
	 *   one single bvec of 'nb', otherwise the 'nb' can't
	 *   merge with 'pb'
	 */
	bio_get_last_bvec(prev, &pb);
	bio_get_first_bvec(next, &nb);
	if (biovec_phys_mergeable(q, &pb, &nb))
		return false;
	return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
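
/*
 * Example: with a virt boundary mask of 0xfff (4K), two bios merge only if
 * the last bvec of the first ends 4K aligned and the first bvec of the
 * second starts 4K aligned, or the two bvecs are physically contiguous and
 * can be merged into one segment.
 */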
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
{
	return bio_will_gap(req->q, req, req->biotail, bio);
}

static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
{
	return bio_will_gap(req->q, NULL, bio, req->bio);
}
/*
 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
 * is defined as 'unsigned int', meanwhile it has to be aligned to the
 * logical block size, which is the minimum accepted unit by hardware.
 */
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
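
/*
 * For example, with a 4096 byte logical block size this evaluates to
 * round_down(4294967295, 4096) >> 9 == 4294963200 >> 9 == 8388600 sectors,
 * i.e. just under 4 GiB of data per bio.
 */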
static struct bio *bio_split_discard(struct bio *bio,
		const struct queue_limits *lim,
		unsigned *nsegs, struct bio_set *bs)
{
	unsigned int max_discard_sectors, granularity;
	sector_t tmp;
	unsigned split_sectors;

	*nsegs = 1;

	/* Zero-sector (unknown) and one-sector granularities are the same. */
	granularity = max(lim->discard_granularity >> 9, 1U);

	max_discard_sectors =
		min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
	max_discard_sectors -= max_discard_sectors % granularity;
	if (unlikely(!max_discard_sectors))
		return NULL;

	if (bio_sectors(bio) <= max_discard_sectors)
		return NULL;

	split_sectors = max_discard_sectors;

	/*
	 * If the next starting sector would be misaligned, stop the discard at
	 * the previous aligned sector.
	 */
	tmp = bio->bi_iter.bi_sector + split_sectors -
		((lim->discard_alignment >> 9) % granularity);
	tmp = sector_div(tmp, granularity);

	if (split_sectors > tmp)
		split_sectors -= tmp;

	return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
		const struct queue_limits *lim,
		unsigned *nsegs, struct bio_set *bs)
{
	*nsegs = 0;
	if (!lim->max_write_zeroes_sectors)
		return NULL;
	if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
		return NULL;
	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
						bool is_atomic)
{
	/*
	 * chunk_sectors must be a multiple of atomic_write_boundary_sectors if
	 * both non-zero.
	 */
	if (is_atomic && lim->atomic_write_boundary_sectors)
		return lim->atomic_write_boundary_sectors;

	return lim->chunk_sectors;
}
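
/*
 * E.g. a device with a 64K atomic write boundary (128 sectors) has its
 * atomic writes split so they never cross a 64K boundary, while all other
 * I/O keeps splitting on chunk_sectors, which must then be a multiple of
 * those 128 sectors.
 */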
/*
 * Return the maximum number of sectors from the start of a bio that may be
 * submitted as a single request to a block device. If enough sectors remain,
 * align the end to the physical block size. Otherwise align the end to the
 * logical block size. This approach minimizes the number of non-aligned
 * requests that are submitted to a block device if the start of a bio is not
 * aligned to a physical block boundary.
 */
static inline unsigned get_max_io_size(struct bio *bio,
				       const struct queue_limits *lim)
{
	unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
	unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
	bool is_atomic = bio->bi_opf & REQ_ATOMIC;
	unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic);
	unsigned max_sectors, start, end;

	/*
	 * We ignore lim->max_sectors for atomic writes because it may be
	 * less than the actual bio size, which we cannot tolerate.
	 */
	if (is_atomic)
		max_sectors = lim->atomic_write_max_sectors;
	else
		max_sectors = lim->max_sectors;

	if (boundary_sectors) {
		max_sectors = min(max_sectors,
			blk_boundary_sectors_left(bio->bi_iter.bi_sector,
						  boundary_sectors));
	}

	start = bio->bi_iter.bi_sector & (pbs - 1);
	end = (start + max_sectors) & ~(pbs - 1);
	if (end > start)
		return end - start;
	return max_sectors & ~(lbs - 1);
}
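
/*
 * Example: with 4096 byte physical and 512 byte logical blocks (pbs == 8,
 * lbs == 1), a bio starting at sector 3 with max_sectors == 1024 gives
 * start == 3 and end == 1024, so 1021 sectors are returned and the split
 * ends physically aligned at sector 1024.
 */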
/**
 * get_max_segment_size() - maximum number of bytes to add as a single segment
 * @lim: Request queue limits.
 * @paddr: address of the range to add
 * @len: maximum length available to add at @paddr
 *
 * Returns the maximum number of bytes of the range starting at @paddr that can
 * be added to a single segment.
 */
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
		phys_addr_t paddr, unsigned int len)
{
	/*
	 * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
	 * after having calculated the minimum.
	 */
	return min_t(unsigned long, len,
		min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
		    (unsigned long)lim->max_segment_size - 1) + 1);
}
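
/*
 * Example: paddr == 0x1000 with a 64K segment boundary (seg_boundary_mask
 * == 0xffff) leaves 0xefff + 1 == 0xf000 bytes until the boundary. Adding
 * the 1 only after taking the min() also keeps seg_boundary_mask ==
 * ULONG_MAX (i.e. no boundary) from overflowing the sum to 0.
 */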
/**
 * bvec_split_segs - verify whether or not a bvec should be split in the middle
 * @lim:      [in] queue limits to split based on
 * @bv:       [in] bvec to examine
 * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
 *            by the number of segments from @bv that may be appended to that
 *            bio without exceeding @max_segs
 * @bytes:    [in,out] Number of bytes in the bio being built. Incremented
 *            by the number of bytes from @bv that may be appended to that
 *            bio without exceeding @max_bytes
 * @max_segs: [in] upper bound for *@nsegs
 * @max_bytes: [in] upper bound for *@bytes
 *
 * When splitting a bio, it can happen that a bvec is encountered that is too
 * big to fit in a single segment and hence that it has to be split in the
 * middle. This function verifies whether or not that should happen. The value
 * %true is returned if and only if appending the entire @bv to a bio with
 * *@nsegs segments and *@bytes bytes would make that bio unacceptable for the
 * queue limits.
 */
static bool bvec_split_segs(const struct queue_limits *lim,
		const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
		unsigned max_segs, unsigned max_bytes)
{
	unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
	unsigned len = min(bv->bv_len, max_len);
	unsigned total_len = 0;
	unsigned seg_size = 0;

	while (len && *nsegs < max_segs) {
		seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len);

		(*nsegs)++;
		total_len += seg_size;
		len -= seg_size;

		if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
			break;
	}

	*bytes += total_len;

	/* tell the caller to split the bvec if it is too big to fit */
	return len > 0 || bv->bv_len > max_len;
}
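
/*
 * Example: a 128K bvec on a queue with a 64K max_segment_size consumes two
 * segments; if only one segment (or less than 128K of byte budget) is
 * left, the loop stops early and %true tells the caller to split.
 */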
/**
 * bio_split_rw - split a bio in two bios
 * @bio:  [in] bio to be split
 * @lim:  [in] queue limits to split based on
 * @segs: [out] number of segments in the bio with the first half of the sectors
 * @bs:	  [in] bio set to allocate the clone from
 * @max_bytes: [in] maximum number of bytes per bio
 *
 * Clone @bio, update the bi_iter of the clone to represent the first sectors
 * of @bio and update @bio->bi_iter to represent the remaining sectors. The
 * following is guaranteed for the cloned bio:
 * - That it has at most @max_bytes worth of data
 * - That it has at most queue_max_segments(@q) segments.
 *
 * Except for discard requests the cloned bio will point at the bi_io_vec of
 * the original bio. It is the responsibility of the caller to ensure that the
 * original bio is not freed before the cloned bio. The caller is also
 * responsible for ensuring that @bs is only destroyed after processing of the
 * split bio has finished.
 */
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
		unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
	struct bio_vec bv, bvprv, *bvprvp = NULL;
	struct bvec_iter iter;
	unsigned nsegs = 0, bytes = 0;

	bio_for_each_bvec(bv, bio, iter) {
		/*
		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, disallow it.
		 */
		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
			goto split;

		if (nsegs < lim->max_segments &&
		    bytes + bv.bv_len <= max_bytes &&
		    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
			nsegs++;
			bytes += bv.bv_len;
		} else {
			if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
					lim->max_segments, max_bytes))
				goto split;
		}

		bvprv = bv;
		bvprvp = &bvprv;
	}

	*segs = nsegs;
	return NULL;
split:
	if (bio->bi_opf & REQ_ATOMIC) {
		bio->bi_status = BLK_STS_INVAL;
		bio_endio(bio);
		return ERR_PTR(-EINVAL);
	}

	/*
	 * We can't sanely support splitting for a REQ_NOWAIT bio. End it
	 * with EAGAIN if splitting is required and return an error pointer.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_status = BLK_STS_AGAIN;
		bio_endio(bio);
		return ERR_PTR(-EAGAIN);
	}

	*segs = nsegs;

	/*
	 * Individual bvecs might not be logical block aligned. Round down the
	 * split size so that each bio is properly block size aligned, even if
	 * we do not use the full hardware limits.
	 */
	bytes = ALIGN_DOWN(bytes, lim->logical_block_size);

	/*
	 * Bio splitting may cause subtle trouble such as hang when doing sync
	 * iopoll in direct IO routine. Given performance gain of iopoll for
	 * big IO can be trivial, disable iopoll when split needed.
	 */
	bio_clear_polled(bio);
	return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
EXPORT_SYMBOL_GPL(bio_split_rw);
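
/*
 * Note the bio_split_rw() return convention: NULL means no split was
 * needed, an ERR_PTR() means @bio was failed and ended, and any other
 * value is the split-off front part that fits the limits.
 */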
/**
 * __bio_split_to_limits - split a bio to fit the queue limits
 * @bio:     bio to be split
 * @lim:     queue limits to split based on
 * @nr_segs: returns the number of segments in the returned bio
 *
 * Check if @bio needs splitting based on the queue limits, and if so split off
 * a bio fitting the limits from the beginning of @bio and return it. @bio is
 * shortened to the remainder and re-submitted.
 *
 * The split bio is allocated from @q->bio_split, which is provided by the
 * block layer.
 */
struct bio *__bio_split_to_limits(struct bio *bio,
				  const struct queue_limits *lim,
				  unsigned int *nr_segs)
{
	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
	struct bio *split;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
		split = bio_split_discard(bio, lim, nr_segs, bs);
		break;
	case REQ_OP_WRITE_ZEROES:
		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
		break;
	default:
		split = bio_split_rw(bio, lim, nr_segs, bs,
				get_max_io_size(bio, lim) << SECTOR_SHIFT);
		if (IS_ERR(split))
			return NULL;
		break;
	}

	if (split) {
		/* there is no chance to merge the split bio */
		split->bi_opf |= REQ_NOMERGE;

		blkcg_bio_issue_init(split);
		bio_chain(split, bio);
		trace_block_split(split, bio->bi_iter.bi_sector);
		WARN_ON_ONCE(bio_zone_write_plugging(bio));
		submit_bio_noacct(bio);
		return split;
	}
	return bio;
}
/**
 * bio_split_to_limits - split a bio to fit the queue limits
 * @bio: bio to be split
 *
 * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
 * if so split off a bio fitting the limits from the beginning of @bio and
 * return it. @bio is shortened to the remainder and re-submitted.
 *
 * The split bio is allocated from @q->bio_split, which is provided by the
 * block layer.
 */
struct bio *bio_split_to_limits(struct bio *bio)
{
	const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
	unsigned int nr_segs;

	if (bio_may_exceed_limits(bio, lim))
		return __bio_split_to_limits(bio, lim, &nr_segs);
	return bio;
}
EXPORT_SYMBOL(bio_split_to_limits);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
	unsigned int nr_phys_segs = 0;
	unsigned int bytes = 0;
	struct req_iterator iter;
	struct bio_vec bv;

	if (!rq->bio)
		return 0;

	switch (bio_op(rq->bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
		if (queue_max_discard_segments(rq->q) > 1) {
			struct bio *bio = rq->bio;

			for_each_bio(bio)
				nr_phys_segs++;
			return nr_phys_segs;
		}
		return 1;
	case REQ_OP_WRITE_ZEROES:
		return 0;
	default:
		break;
	}

	rq_for_each_bvec(bv, rq, iter)
		bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
				UINT_MAX, UINT_MAX);
	return nr_phys_segs;
}
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
		struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}
static unsigned blk_bvec_map_sg(struct request_queue *q,
		struct bio_vec *bvec, struct scatterlist *sglist,
		struct scatterlist **sg)
{
	unsigned nbytes = bvec->bv_len;
	unsigned nsegs = 0, total = 0;

	while (nbytes > 0) {
		unsigned offset = bvec->bv_offset + total;
		unsigned len = get_max_segment_size(&q->limits,
				bvec_phys(bvec) + total, nbytes);
		struct page *page = bvec->bv_page;

		/*
		 * Unfortunately a fair number of drivers barf on scatterlists
		 * that have an offset larger than PAGE_SIZE, despite other
		 * subsystems dealing with that invariant just fine. For now
		 * stick to the legacy format where we never present those from
		 * the block layer, but the code below should be removed once
		 * these offenders (mostly MMC/SD drivers) are fixed.
		 */
		page += (offset >> PAGE_SHIFT);
		offset &= ~PAGE_MASK;

		*sg = blk_next_sg(sg, sglist);
		sg_set_page(*sg, page, len, offset);

		total += len;
		nbytes -= len;
		nsegs++;
	}

	return nsegs;
}
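
/*
 * Example of the legacy fixup above: a bvec with bv_offset == 5000 on 4K
 * pages is presented as bv_page + 1 with offset 904 (5000 - 4096), so no
 * sg entry ever carries an offset >= PAGE_SIZE.
 */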
static inline int __blk_bvec_map_sg(struct bio_vec bv,
		struct scatterlist *sglist, struct scatterlist **sg)
{
	*sg = blk_next_sg(sg, sglist);
	sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
	return 1;
}
/* only try to merge bvecs into one sg if they are from two bios */
static inline bool
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
			   struct bio_vec *bvprv, struct scatterlist **sg)
{
	int nbytes = bvec->bv_len;

	if (!*sg)
		return false;

	if ((*sg)->length + nbytes > queue_max_segment_size(q))
		return false;

	if (!biovec_phys_mergeable(q, bvprv, bvec))
		return false;

	(*sg)->length += nbytes;

	return true;
}
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
			     struct scatterlist *sglist,
			     struct scatterlist **sg)
{
	struct bio_vec bvec, bvprv = { NULL };
	struct bvec_iter iter;
	int nsegs = 0;
	bool new_bio = false;

	for_each_bio(bio) {
		bio_for_each_bvec(bvec, bio, iter) {
			/*
			 * Only try to merge bvecs from two bios given we
			 * have done bio internal merge when adding pages
			 * to bio
			 */
			if (new_bio &&
			    __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
				goto next_bvec;

			if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
				nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
			else
				nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
 next_bvec:
			new_bio = false;
		}
		if (likely(bio->bi_iter.bi_size)) {
			bvprv = bvec;
			new_bio = true;
		}
	}

	return nsegs;
}
/*
 * map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries
 */
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg)
{
	int nsegs = 0;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
	else if (rq->bio)
		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have been wrong if the figured number of
	 * segment is bigger than number of req's physical segments
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
						  sector_t offset)
{
	struct request_queue *q = rq->q;
	struct queue_limits *lim = &q->limits;
	unsigned int max_sectors, boundary_sectors;
	bool is_atomic = rq->cmd_flags & REQ_ATOMIC;

	if (blk_rq_is_passthrough(rq))
		return q->limits.max_hw_sectors;

	boundary_sectors = blk_boundary_sectors(lim, is_atomic);
	max_sectors = blk_queue_get_max_sectors(rq);

	if (!boundary_sectors ||
	    req_op(rq) == REQ_OP_DISCARD ||
	    req_op(rq) == REQ_OP_SECURE_ERASE)
		return max_sectors;
	return min(max_sectors,
		   blk_boundary_sectors_left(offset, boundary_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
		unsigned int nr_phys_segs)
{
	if (!blk_cgroup_mergeable(req, bio))
		goto no_merge;

	if (blk_integrity_merge_bio(req->q, req, bio) == false)
		goto no_merge;

	/* discard request merge won't add new segment */
	if (req_op(req) == REQ_OP_DISCARD)
		return 1;

	if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
		goto no_merge;

	/*
	 * This will form the start of a new hw segment. Bump both
	 * counters.
	 */
	req->nr_phys_segments += nr_phys_segs;
	return 1;

no_merge:
	req_set_nomerge(req->q, req);
	return 0;
}
int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
	if (req_gap_back_merge(req, bio))
		return 0;
	if (blk_integrity_rq(req) &&
	    integrity_req_gap_back_merge(req, bio))
		return 0;
	if (!bio_crypt_ctx_back_mergeable(req, bio))
		return 0;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
		req_set_nomerge(req->q, req);
		return 0;
	}

	return ll_new_hw_segment(req, bio, nr_segs);
}
static int ll_front_merge_fn(struct request *req, struct bio *bio,
		unsigned int nr_segs)
{
	if (req_gap_front_merge(req, bio))
		return 0;
	if (blk_integrity_rq(req) &&
	    integrity_req_gap_front_merge(req, bio))
		return 0;
	if (!bio_crypt_ctx_front_mergeable(req, bio))
		return 0;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
		req_set_nomerge(req->q, req);
		return 0;
	}

	return ll_new_hw_segment(req, bio, nr_segs);
}
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
		struct request *next)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);

	if (segments >= queue_max_discard_segments(q))
		goto no_merge;
	if (blk_rq_sectors(req) + bio_sectors(next->bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		goto no_merge;

	req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
	return true;
no_merge:
	req_set_nomerge(q, req);
	return false;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
				struct request *next)
{
	int total_phys_segments;

	if (req_gap_back_merge(req, next->bio))
		return 0;

	/*
	 * Will it become too large?
	 */
	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		return 0;

	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
	if (total_phys_segments > blk_rq_get_max_segments(req))
		return 0;

	if (!blk_cgroup_mergeable(req, next->bio))
		return 0;

	if (blk_integrity_merge_rq(q, req, next) == false)
		return 0;

	if (!bio_crypt_ctx_merge_rq(req, next))
		return 0;

	/* Merge is OK... */
	req->nr_phys_segments = total_phys_segments;
	return 1;
}
/**
 * blk_rq_set_mixed_merge - mark a request as mixed merge
 * @rq: request to mark as mixed merge
 *
 * Description:
 *     @rq is about to be mixed merged. Make sure the attributes
 *     which can be mixed are set in each bio and mark @rq as mixed
 *     merged.
 */
static void blk_rq_set_mixed_merge(struct request *rq)
{
	blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
	struct bio *bio;

	if (rq->rq_flags & RQF_MIXED_MERGE)
		return;

	/*
	 * @rq will no longer represent mixable attributes for all the
	 * contained bios. It will just track those of the first one.
	 * Distribute the attributes to each bio.
	 */
	for (bio = rq->bio; bio; bio = bio->bi_next) {
		WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
			     (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
		bio->bi_opf |= ff;
	}
	rq->rq_flags |= RQF_MIXED_MERGE;
}
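
/*
 * For merging purposes a readahead bio is treated as if all failfast bits
 * were set, matching how blk_update_mixed_merge() below marks any new RA
 * bio as failfast once a request has become a mixed merge.
 */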
static inline blk_opf_t bio_failfast(const struct bio *bio)
{
	if (bio->bi_opf & REQ_RAHEAD)
		return REQ_FAILFAST_MASK;

	return bio->bi_opf & REQ_FAILFAST_MASK;
}
/*
 * After we are marked as MIXED_MERGE, any new RA bio has to be updated
 * as failfast, and the request's failfast has to be updated in case of
 * front merge.
 */
static inline void blk_update_mixed_merge(struct request *req,
		struct bio *bio, bool front_merge)
{
	if (req->rq_flags & RQF_MIXED_MERGE) {
		if (bio->bi_opf & REQ_RAHEAD)
			bio->bi_opf |= REQ_FAILFAST_MASK;

		if (front_merge) {
			req->cmd_flags &= ~REQ_FAILFAST_MASK;
			req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK;
		}
	}
}
static void blk_account_io_merge_request(struct request *req)
{
	if (blk_do_io_stat(req)) {
		part_stat_lock();
		part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
		part_stat_local_dec(req->part,
				    in_flight[op_is_write(req_op(req))]);
		part_stat_unlock();
	}
}
static enum elv_merge blk_try_req_merge(struct request *req,
					struct request *next)
{
	if (blk_discard_mergable(req))
		return ELEVATOR_DISCARD_MERGE;
	else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
		return ELEVATOR_BACK_MERGE;

	return ELEVATOR_NO_MERGE;
}
static bool blk_atomic_write_mergeable_rq_bio(struct request *rq,
					      struct bio *bio)
{
	return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC);
}

static bool blk_atomic_write_mergeable_rqs(struct request *rq,
					   struct request *next)
{
	return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
}
/*
 * For non-mq, this has to be called with the request spinlock acquired.
 * For mq with scheduling, the appropriate queue wide lock should be held.
 */
static struct request *attempt_merge(struct request_queue *q,
				     struct request *req, struct request *next)
{
	if (!rq_mergeable(req) || !rq_mergeable(next))
		return NULL;

	if (req_op(req) != req_op(next))
		return NULL;

	if (rq_data_dir(req) != rq_data_dir(next))
		return NULL;

	/* Don't merge requests with different write hints. */
	if (req->write_hint != next->write_hint)
		return NULL;

	if (req->ioprio != next->ioprio)
		return NULL;

	if (!blk_atomic_write_mergeable_rqs(req, next))
		return NULL;

	/*
	 * If we are allowed to merge, then append bio list
	 * from next to rq and release next. merge_requests_fn
	 * will have updated segment counts, update sector
	 * counts here. Handle DISCARDs separately, as they
	 * have separate settings.
	 */

	switch (blk_try_req_merge(req, next)) {
	case ELEVATOR_DISCARD_MERGE:
		if (!req_attempt_discard_merge(q, req, next))
			return NULL;
		break;
	case ELEVATOR_BACK_MERGE:
		if (!ll_merge_requests_fn(q, req, next))
			return NULL;
		break;
	default:
		return NULL;
	}

	/*
	 * If failfast settings disagree or any of the two is already
	 * a mixed merge, mark both as mixed before proceeding. This
	 * makes sure that all involved bios have mixable attributes
	 * set properly.
	 */
	if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) ||
	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
		blk_rq_set_mixed_merge(req);
		blk_rq_set_mixed_merge(next);
	}

	/*
	 * At this point we have either done a back merge or front merge. We
	 * need the smaller start_time_ns of the merged requests to be the
	 * current request for accounting purposes.
	 */
	if (next->start_time_ns < req->start_time_ns)
		req->start_time_ns = next->start_time_ns;

	req->biotail->bi_next = next->bio;
	req->biotail = next->biotail;

	req->__data_len += blk_rq_bytes(next);

	if (!blk_discard_mergable(req))
		elv_merge_requests(q, req, next);

	blk_crypto_rq_put_keyslot(next);

	/*
	 * 'next' is going away, so update stats accordingly
	 */
	blk_account_io_merge_request(next);

	trace_block_rq_merge(next);

	/*
	 * ownership of bio passed from next to req, return 'next' for
	 * the caller to free
	 */
	next->bio = NULL;
	return next;
}
static struct request *attempt_back_merge(struct request_queue *q,
		struct request *rq)
{
	struct request *next = elv_latter_request(q, rq);

	if (next)
		return attempt_merge(q, rq, next);

	return NULL;
}

static struct request *attempt_front_merge(struct request_queue *q,
		struct request *rq)
{
	struct request *prev = elv_former_request(q, rq);

	if (prev)
		return attempt_merge(q, prev, rq);

	return NULL;
}
/*
 * Try to merge 'next' into 'rq'. Return true if the merge happened, false
 * otherwise. The caller is responsible for freeing 'next' if the merge
 * happened.
 */
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
			   struct request *next)
{
	return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
	if (!rq_mergeable(rq) || !bio_mergeable(bio))
		return false;

	if (req_op(rq) != bio_op(bio))
		return false;

	/* different data direction or already started, don't merge */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return false;

	/* don't merge across cgroup boundaries */
	if (!blk_cgroup_mergeable(rq, bio))
		return false;

	/* only merge integrity protected bio into ditto rq */
	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
		return false;

	/* Only merge if the crypt contexts are compatible */
	if (!bio_crypt_rq_ctx_compatible(rq, bio))
		return false;

	/* Don't merge requests with different write hints. */
	if (rq->write_hint != bio->bi_write_hint)
		return false;

	if (rq->ioprio != bio_prio(bio))
		return false;

	if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
		return false;

	return true;
}
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
	if (blk_discard_mergable(rq))
		return ELEVATOR_DISCARD_MERGE;
	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
		return ELEVATOR_BACK_MERGE;
	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
		return ELEVATOR_FRONT_MERGE;
	return ELEVATOR_NO_MERGE;
}
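
/*
 * Example: for a request covering sectors [100, 200), a bio starting at
 * sector 200 is a back merge candidate and a 10 sector bio starting at
 * sector 90 is a front merge candidate; any other position cannot merge.
 */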
static void blk_account_io_merge_bio(struct request *req)
{
	if (!blk_do_io_stat(req))
		return;

	part_stat_lock();
	part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
	part_stat_unlock();
}
enum bio_merge_status bio_attempt_back_merge(struct request *req,
		struct bio *bio, unsigned int nr_segs)
{
	const blk_opf_t ff = bio_failfast(bio);

	if (!ll_back_merge_fn(req, bio, nr_segs))
		return BIO_MERGE_FAILED;

	trace_block_bio_backmerge(bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	blk_update_mixed_merge(req, bio, false);

	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
		blk_zone_write_plug_bio_merged(bio);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_free_ctx(bio);

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
		struct bio *bio, unsigned int nr_segs)
{
	const blk_opf_t ff = bio_failfast(bio);

	/*
	 * A front merge for writes to sequential zones of a zoned block device
	 * can happen only if the user submitted writes out of order. Do not
	 * merge such write to let it fail.
	 */
	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
		return BIO_MERGE_FAILED;

	if (!ll_front_merge_fn(req, bio, nr_segs))
		return BIO_MERGE_FAILED;

	trace_block_bio_frontmerge(bio);
	rq_qos_merge(req->q, req, bio);

	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
		blk_rq_set_mixed_merge(req);

	blk_update_mixed_merge(req, bio, true);

	bio->bi_next = req->bio;
	req->bio = bio;

	req->__sector = bio->bi_iter.bi_sector;
	req->__data_len += bio->bi_iter.bi_size;

	bio_crypt_do_front_merge(req, bio);

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
		struct request *req, struct bio *bio)
{
	unsigned short segments = blk_rq_nr_discard_segments(req);

	if (segments >= queue_max_discard_segments(q))
		goto no_merge;
	if (blk_rq_sectors(req) + bio_sectors(bio) >
	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
		goto no_merge;

	rq_qos_merge(q, req, bio);

	req->biotail->bi_next = bio;
	req->biotail = bio;
	req->__data_len += bio->bi_iter.bi_size;
	req->nr_phys_segments = segments + 1;

	blk_account_io_merge_bio(req);
	return BIO_MERGE_OK;
no_merge:
	req_set_nomerge(q, req);
	return BIO_MERGE_FAILED;
}
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
						   struct request *rq,
						   struct bio *bio,
						   unsigned int nr_segs,
						   bool sched_allow_merge)
{
	if (!blk_rq_merge_ok(rq, bio))
		return BIO_MERGE_NONE;

	switch (blk_try_merge(rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
			return bio_attempt_back_merge(rq, bio, nr_segs);
		break;
	case ELEVATOR_FRONT_MERGE:
		if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
			return bio_attempt_front_merge(rq, bio, nr_segs);
		break;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return BIO_MERGE_NONE;
	}

	return BIO_MERGE_FAILED;
}
/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 *
 * Determine whether @bio being queued on @q can be merged with the previous
 * request on %current's plugged list. Returns %true if merge was successful,
 * otherwise %false.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock. As such it's more of an issuing mechanism
 * than scheduling, and the request, while may have elvpriv data, is not
 * added on the elevator at this point. In addition, we don't have
 * reliable access to the elevator outside queue lock. Only check basic
 * merging parameters without querying the elevator.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug || rq_list_empty(plug->mq_list))
		return false;

	rq_list_for_each(&plug->mq_list, rq) {
		if (rq->q == q) {
			if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
			    BIO_MERGE_OK)
				return true;
			break;
		}

		/*
		 * Only keep iterating plug list for merges if we have multiple
		 * queues
		 */
		if (!plug->multiple_queues)
			break;
	}
	return false;
}
/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
			struct bio *bio, unsigned int nr_segs)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		if (!checked--)
			break;

		switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) {
		case BIO_MERGE_NONE:
			continue;
		case BIO_MERGE_OK:
			return true;
		case BIO_MERGE_FAILED:
			return false;
		}
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs, struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);