3 * Boaz Harrosh <bharrosh@panasas.com>
5 * This file is part of the objects raid engine (ore).
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
16 #include <linux/gfp.h>
17 #include <linux/async_tx.h>
22 #define ORE_DBGMSG2 ORE_DBGMSG
/* Allocate a single page with alloc_page(GFP_KERNEL) and return it.
 * _ore_add_parity_unit uses this to obtain the parity pages.
 */
24 struct page
*_raid_page_alloc(void)
26 return alloc_page(GFP_KERNEL
);
/* Release page @p. _ore_free_raid_stuff calls this for every entry of
 * ios->parity_pages below ios->cur_par_page.
 * NOTE(review): the function body is not visible in this listing.
 */
29 void _raid_page_free(struct page
*p
)
34 /* This struct is forward declared in ore_io_state, but is private to here.
35 * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
37 * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
38 * Ascending page index access is sp2d(p-minor, c-major). But storage is
39 * sp2d[p-minor][c-major], so it can be properly presented to the async-xor API. */
42 struct __stripe_pages_2d
{
43 /* Cache some hot path repeated calculations */
46 unsigned pages_in_unit
;
50 /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
51 struct __1_page_stripe
{
/* Submit control and descriptor of this row's XOR operation; both are
 * filled in by _gen_xor_unit (init_async_submit and async_xor).
 */
54 struct async_submit_ctl submit
;
55 struct dma_async_tx_descriptor
*tx
;
57 /* The size of this array is data_devs + parity */
/* scribble is handed to init_async_submit as the addr_conv_t area */
59 struct page
**scribble
;
60 /* bool array, size of this array is data_devs */
65 /* This can get bigger than a page. So support multiple page allocations.
66 * _sp2d_free should be called even if _sp2d_alloc fails (by returning non-zero). */
// Allocate the __stripe_pages_2d descriptor together with the per-row
// pages[], scribble[] and page_is_read[] arrays, and wire every row of
// _1p_stripes[] to its arrays.
// @pages_in_unit: number of rows to set up.
// @group_width:   entries in each row's pages[] and scribble[] arrays.
// @parity:        parity device count; data_devs = group_width - parity.
// @psp2d:         receives the descriptor once the first kzalloc succeeded.
// The caller (_ore_post_alloc_raid_stuff) tests the return value for
// non-zero. NOTE(review): the return statements are not visible here.
69 static int _sp2d_alloc(unsigned pages_in_unit
, unsigned group_width
,
70 unsigned parity
, struct __stripe_pages_2d
**psp2d
)
72 struct __stripe_pages_2d
*sp2d
;
73 unsigned data_devs
= group_width
- parity
;
// _alloc_all_bytes only describes the memory layout: the descriptor with
// its row array, followed by one __alloc_1p_arrays per row.
74 struct _alloc_all_bytes
{
75 struct __alloc_stripe_pages_2d
{
76 struct __stripe_pages_2d sp2d
;
77 struct __1_page_stripe _1p_stripes
[pages_in_unit
];
79 struct __alloc_1p_arrays
{
80 struct page
*pages
[group_width
];
81 struct page
*scribble
[group_width
];
82 char page_is_read
[data_devs
];
83 } __a1pa
[pages_in_unit
];
85 struct __alloc_1p_arrays
*__a1pa
;
86 struct __alloc_1p_arrays
*__a1pa_end
;
87 const unsigned sizeof__a1pa
= sizeof(_aab
->__a1pa
[0]);
88 unsigned num_a1pa
, alloc_size
, i
;
90 /* FIXME: check these numbers in ore_verify_layout */
91 BUG_ON(sizeof(_aab
->__asp2d
) > PAGE_SIZE
);
92 BUG_ON(sizeof__a1pa
> PAGE_SIZE
);
/* If everything does not fit in one page, the first allocation holds the
 * descriptor plus as many per-row array sets as fit in the rest of the
 * page. The other assignments below size it for all pages_in_unit rows.
 */
94 if (sizeof(*_aab
) > PAGE_SIZE
) {
95 num_a1pa
= (PAGE_SIZE
- sizeof(_aab
->__asp2d
)) / sizeof__a1pa
;
96 alloc_size
= sizeof(_aab
->__asp2d
) + sizeof__a1pa
* num_a1pa
;
98 num_a1pa
= pages_in_unit
;
99 alloc_size
= sizeof(*_aab
);
102 _aab
= kzalloc(alloc_size
, GFP_KERNEL
);
103 if (unlikely(!_aab
)) {
104 ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size
);
108 sp2d
= &_aab
->__asp2d
.sp2d
;
109 *psp2d
= sp2d
; /* From here Just call _sp2d_free */
111 __a1pa
= _aab
->__a1pa
;
112 __a1pa_end
= __a1pa
+ num_a1pa
;
/* Give each row its arrays. When the current chunk is used up another
 * one is allocated, limited to PAGE_SIZE / sizeof__a1pa rows.
 */
114 for (i
= 0; i
< pages_in_unit
; ++i
) {
115 if (unlikely(__a1pa
>= __a1pa_end
)) {
116 num_a1pa
= min_t(unsigned, PAGE_SIZE
/ sizeof__a1pa
,
119 __a1pa
= kzalloc(num_a1pa
* sizeof__a1pa
, GFP_KERNEL
);
120 if (unlikely(!__a1pa
)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
125 __a1pa_end
= __a1pa
+ num_a1pa
;
126 /* First *pages is marked for kfree of the buffer */
127 sp2d
->_1p_stripes
[i
].alloc
= true;
130 sp2d
->_1p_stripes
[i
].pages
= __a1pa
->pages
;
131 sp2d
->_1p_stripes
[i
].scribble
= __a1pa
->scribble
;
132 sp2d
->_1p_stripes
[i
].page_is_read
= __a1pa
->page_is_read
;
/* Record the geometry in the descriptor for later use */
136 sp2d
->parity
= parity
;
137 sp2d
->data_devs
= data_devs
;
138 sp2d
->pages_in_unit
= pages_in_unit
;
/* Put @sp2d back into its empty state.
 * @r4w, @priv: read-for-write callbacks and the cookie passed to them; only
 *              r4w->put_page(priv, page) is used here.
 * Rows are visited one by one. Where write_count < group_width the data
 * columns are scanned, and every page flagged in page_is_read[] is handed
 * back through put_page and unflagged. The row's pages[] array is zeroed
 * and its write_count cleared. sp2d->needed is set to false.
 */
142 static void _sp2d_reset(struct __stripe_pages_2d
*sp2d
,
143 const struct _ore_r4w_op
*r4w
, void *priv
)
145 unsigned data_devs
= sp2d
->data_devs
;
146 unsigned group_width
= data_devs
+ sp2d
->parity
;
152 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
153 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
/* Only rows that were not written in full are scanned for read pages */
155 if (_1ps
->write_count
< group_width
) {
158 for (c
= 0; c
< data_devs
; c
++)
159 if (_1ps
->page_is_read
[c
]) {
160 struct page
*page
= _1ps
->pages
[c
];
162 r4w
->put_page(priv
, page
);
163 _1ps
->page_is_read
[c
] = false;
167 memset(_1ps
->pages
, 0, group_width
* sizeof(*_1ps
->pages
));
168 _1ps
->write_count
= 0;
172 sp2d
->needed
= false;
/* Free the per-row array chunks of @sp2d: the pages pointer of every row
 * flagged .alloc is passed to kfree. _sp2d_alloc sets .alloc on the row
 * whose arrays start a separately allocated chunk.
 * NOTE(review): freeing of the descriptor itself is not visible here.
 */
175 static void _sp2d_free(struct __stripe_pages_2d
*sp2d
)
182 for (i
= 0; i
< sp2d
->pages_in_unit
; ++i
) {
183 if (sp2d
->_1p_stripes
[i
].alloc
)
184 kfree(sp2d
->_1p_stripes
[i
].pages
);
/* Scan the rows of @sp2d upward from index 0, testing each row's
 * write_count. Callers use the result as the first page index in use
 * (si->cur_pg in _ore_add_parity_unit, min_p in _read_4_write).
 * NOTE(review): the return statements are not visible in this listing.
 */
190 static unsigned _sp2d_min_pg(struct __stripe_pages_2d
*sp2d
)
194 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
195 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
197 if (_1ps
->write_count
)
/* Scan the rows of @sp2d downward from pages_in_unit - 1, testing each
 * row's write_count. Callers use the result as the last page index in use.
 * NOTE(review): the p >= 0 loop test only terminates if p is a signed
 * type. Its declaration and the return statements are not visible here.
 */
204 static unsigned _sp2d_max_pg(struct __stripe_pages_2d
*sp2d
)
208 for (p
= sp2d
->pages_in_unit
- 1; p
>= 0; --p
) {
209 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
211 if (_1ps
->write_count
)
/* Generate the XOR parity page of the rows in @sp2d.
 * First loop: each row is tested for write_count == 0 (presumably such
 * rows are skipped, the branch body is not visible). For the others
 * _1ps->submit is initialized with ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK
 * and the row's scribble area, then async_xor() is started with
 * pages[data_devs] as destination and the data_devs pages at the start
 * of pages[] as sources, PAGE_SIZE bytes from offset 0.
 * Second loop: async_tx_issue_pending() is called on each row's tx.
 */
218 static void _gen_xor_unit(struct __stripe_pages_2d
*sp2d
)
221 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
222 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
224 if (!_1ps
->write_count
)
227 init_async_submit(&_1ps
->submit
,
228 ASYNC_TX_XOR_ZERO_DST
| ASYNC_TX_ACK
,
231 (addr_conv_t
*)_1ps
->scribble
);
234 _1ps
->tx
= async_xor(_1ps
->pages
[sp2d
->data_devs
], _1ps
->pages
,
235 0, sp2d
->data_devs
, PAGE_SIZE
,
239 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
240 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
241 /* NOTE: We wait for HW synchronously (I don't have such HW
242 * to test with.) Is parallelism needed with today's multi
245 async_tx_issue_pending(_1ps
->tx
);
// Record @page in the stripe-pages grid of @sp2d at row si->cur_pg,
// column si->cur_comp, then advance si->cur_pg to the next row, wrapping
// around at sp2d->pages_in_unit.
249 void _ore_add_stripe_page(struct __stripe_pages_2d
*sp2d
,
250 struct ore_striping_info
*si
, struct page
*page
)
252 struct __1_page_stripe
*_1ps
;
256 _1ps
= &sp2d
->_1p_stripes
[si
->cur_pg
];
257 _1ps
->pages
[si
->cur_comp
] = page
;
260 si
->cur_pg
= (si
->cur_pg
+ 1) % sp2d
->pages_in_unit
;
261 /* si->cur_comp is advanced outside at main loop */
/* Maintain per_dev->sglist while the IO of one device is being built.
 * @cur_len is the length being skipped over on this device.
 * First call (per_dev->cur_sg == 0): the first entry takes its offset and
 * len from per_dev->offset / per_dev->length. If no length was added yet,
 * per_dev->offset is advanced by @cur_len instead.
 * Later calls: the last entry's len is finalized as
 * per_dev->length - per_dev->last_sgs_total.
 * The next entry is then partly prepared: its offset is placed @cur_len
 * bytes past the end of the current entry, and last_sgs_total snapshots
 * per_dev->length.
 * NOTE(review): part of the parameter list (not_last, used in the debug
 * message) is not visible in this listing.
 */
264 void _ore_add_sg_seg(struct ore_per_dev_state
*per_dev
, unsigned cur_len
,
267 struct osd_sg_entry
*sge
;
269 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
270 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
271 per_dev
->dev
, cur_len
, not_last
, per_dev
->cur_sg
,
272 _LLU(per_dev
->offset
), per_dev
->length
,
273 per_dev
->last_sgs_total
);
275 if (!per_dev
->cur_sg
) {
276 sge
= per_dev
->sglist
;
278 /* First time we prepare two entries */
279 if (per_dev
->length
) {
281 sge
->offset
= per_dev
->offset
;
282 sge
->len
= per_dev
->length
;
284 /* Here the parity is the first unit of this object.
285 * This happens every time we reach a parity device on
286 * the same stripe as the per_dev->offset. We need to
287 * just skip this unit.
289 per_dev
->offset
+= cur_len
;
293 /* finalize the last one */
294 sge
= &per_dev
->sglist
[per_dev
->cur_sg
- 1];
295 sge
->len
= per_dev
->length
- per_dev
->last_sgs_total
;
299 /* Partly prepare the next one */
300 struct osd_sg_entry
*next_sge
= sge
+ 1;
303 next_sge
->offset
= sge
->offset
+ sge
->len
+ cur_len
;
304 /* Save cur len so we know how much was added next time */
305 per_dev
->last_sgs_total
= per_dev
->length
;
307 } else if (!sge
->len
) {
308 /* Optimize for when the last unit is a parity */
/* Obtain the secondary io_state used for read-for-write and store it in
 * ios->ios_read_4_write. _ore_get_io_state is called with the layout,
 * ios->oc, group_width * mirrors_p1 and sgs_per_dev, where sgs_per_dev is
 * pages_in_unit + 2.
 * NOTE(review): the return statement is not visible in this listing.
 */
313 static int _alloc_read_4_write(struct ore_io_state
*ios
)
315 struct ore_layout
*layout
= ios
->layout
;
317 /* We want to only read those pages not in cache so worst case
318 * is a stripe populated with every other page
320 unsigned sgs_per_dev
= ios
->sp2d
->pages_in_unit
+ 2;
322 ret
= _ore_get_io_state(layout
, ios
->oc
,
323 layout
->group_width
* layout
->mirrors_p1
,
324 sgs_per_dev
, 0, &ios
->ios_read_4_write
);
328 /* @si contains info of the to-be-inserted page. Update of @si should be
329 * maintained by caller. Specifically si->dev, si->obj_offset, ... */
// Queue one @page for reading through the read-for-write io_state.
// The io_state is allocated on first use (_alloc_read_4_write). The
// per-device slot is per_dev[si->dev - first_dev]. On the first page of a
// slot a bio sized for pages_in_unit pages is allocated and si->obj_offset
// and si->dev are recorded. A page that does not start where the queued
// data ends first gets an sg gap through _ore_add_sg_seg(). The page is
// then added with bio_add_pc_page() and per_dev->length grows by PAGE_SIZE.
// NOTE(review): the error returns are not visible in this listing.
331 static int _add_to_read_4_write(struct ore_io_state
*ios
,
332 struct ore_striping_info
*si
, struct page
*page
)
334 struct request_queue
*q
;
335 struct ore_per_dev_state
*per_dev
;
336 struct ore_io_state
*read_ios
;
// first_dev is si->dev rounded down to a multiple of group_width * mirrors_p1
337 unsigned first_dev
= si
->dev
- (si
->dev
%
338 (ios
->layout
->group_width
* ios
->layout
->mirrors_p1
));
339 unsigned comp
= si
->dev
- first_dev
;
342 if (!ios
->ios_read_4_write
) {
343 int ret
= _alloc_read_4_write(ios
);
349 read_ios
= ios
->ios_read_4_write
;
350 read_ios
->numdevs
= ios
->layout
->group_width
* ios
->layout
->mirrors_p1
;
352 per_dev
= &read_ios
->per_dev
[comp
];
353 if (!per_dev
->length
) {
354 per_dev
->bio
= bio_kmalloc(GFP_KERNEL
,
355 ios
->sp2d
->pages_in_unit
);
356 if (unlikely(!per_dev
->bio
)) {
357 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
358 ios
->sp2d
->pages_in_unit
);
361 per_dev
->offset
= si
->obj_offset
;
362 per_dev
->dev
= si
->dev
;
363 } else if (si
->obj_offset
!= (per_dev
->offset
+ per_dev
->length
)) {
364 u64 gap
= si
->obj_offset
- (per_dev
->offset
+ per_dev
->length
);
366 _ore_add_sg_seg(per_dev
, gap
, true);
368 q
= osd_request_queue(ore_comp_dev(read_ios
->oc
, per_dev
->dev
));
369 added_len
= bio_add_pc_page(q
, per_dev
->bio
, page
, PAGE_SIZE
, 0);
370 if (unlikely(added_len
!= PAGE_SIZE
)) {
371 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
372 per_dev
->bio
->bi_vcnt
);
376 per_dev
->length
+= PAGE_SIZE
;
// Walk every segment of every per-device bio of @ios and update the flags
// of its page. SetPageUptodate() and ClearPageError() are the calls
// visible here.
// NOTE(review): how @ret and the page state select between them is not
// visible in this listing.
380 static void _mark_read4write_pages_uptodate(struct ore_io_state
*ios
, int ret
)
385 /* loop on all devices all pages */
386 for (d
= 0; d
< ios
->numdevs
; d
++) {
387 struct bio
*bio
= ios
->per_dev
[d
].bio
;
392 __bio_for_each_segment(bv
, bio
, i
, 0) {
393 struct page
*page
= bv
->bv_page
;
395 SetPageUptodate(page
);
397 ClearPageError(page
);
402 /* read_4_write is hacked to read the start of the first stripe and/or
403 * the end of the last stripe. If needed, with an sg-gap at each device/page.
404 * It is assumed to be called after the to_be_written pages of the first stripe
405 * are populating ios->sp2d[][]
407 * NOTE: We call ios->r4w->get_page for all pages needed for parity calculations
408 * These pages are held at sp2d[p].pages[c] but with
409 * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are released
410 * with ios->r4w->put_page(). The ios->r4w->get_page might signal that the page is
411 * @uptodate=true, so we don't need to read it, only unlock, after IO.
413 * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger than
414 * to-be-written count, we should consider the xor-in-place mode.
415 * need_to_read_pages_count is the actual number of pages not present in cache.
416 * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
417 * approximation? In this mode the read pages are put in the empty places of
418 * ios->sp2d[p][*], xor is calculated the same way. These pages are
419 * allocated/freed and don't go through cache */
// Collect and read the pages that are needed for parity but are not part
// of the write itself.
// Part one covers the first stripe ahead of the written data. It is
// skipped when the write starts at ios->si.first_stripe_start.
// Part two (label read_last_stripe) covers the pages from the page-aligned
// end of the IO up to last_stripe_end.
// Pages come from ios->r4w->get_page(), are flagged in page_is_read[] and
// queued with _add_to_read_4_write(). The queued devices are then read
// with _ore_read_mirror(), ore_io_execute() runs, and the result is passed
// to _mark_read4write_pages_uptodate().
// NOTE(review): several statements, including the returns, are not visible.
421 static int _read_4_write(struct ore_io_state
*ios
)
423 struct ore_io_state
*ios_read
;
424 struct ore_striping_info read_si
;
425 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
426 u64 offset
= ios
->si
.first_stripe_start
;
428 unsigned bytes_in_stripe
= ios
->si
.bytes_in_stripe
;
429 unsigned i
, c
, p
, min_p
= sp2d
->pages_in_unit
, max_p
= -1;
432 if (offset
== ios
->offset
) /* Go to start collect $200 */
433 goto read_last_stripe
;
435 min_p
= _sp2d_min_pg(sp2d
);
436 max_p
= _sp2d_max_pg(sp2d
);
439 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
440 read_si
.obj_offset
+= min_p
* PAGE_SIZE
;
441 offset
+= min_p
* PAGE_SIZE
;
442 for (p
= min_p
; p
<= max_p
; p
++) {
443 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
444 struct page
**pp
= &_1ps
->pages
[c
];
448 /* to-be-written pages start here */
449 goto read_last_stripe
;
451 *pp
= ios
->r4w
->get_page(ios
->private, offset
,
457 _add_to_read_4_write(ios
, &read_si
, *pp
);
459 /* Mark read-pages to be cache_released */
460 _1ps
->page_is_read
[c
] = true;
461 read_si
.obj_offset
+= PAGE_SIZE
;
464 offset
+= (sp2d
->pages_in_unit
- p
) * PAGE_SIZE
;
/* Tail part: start at the end of the IO rounded up to a whole page */
468 offset
= ios
->offset
+ (ios
->length
+ PAGE_SIZE
- 1) /
469 PAGE_SIZE
* PAGE_SIZE
;
470 last_stripe_end
= div_u64(offset
+ bytes_in_stripe
- 1, bytes_in_stripe
)
472 if (offset
== last_stripe_end
) /* Optimize for the aligned case */
475 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
476 p
= read_si
.unit_off
/ PAGE_SIZE
;
477 c
= _dev_order(ios
->layout
->group_width
* ios
->layout
->mirrors_p1
,
478 ios
->layout
->mirrors_p1
, read_si
.par_dev
, read_si
.dev
);
480 BUG_ON(ios
->si
.first_stripe_start
+ bytes_in_stripe
!= last_stripe_end
);
481 /* unaligned IO must be within a single stripe */
483 if (min_p
== sp2d
->pages_in_unit
) {
484 /* Didn't do it yet */
485 min_p
= _sp2d_min_pg(sp2d
);
486 max_p
= _sp2d_max_pg(sp2d
);
489 while (offset
< last_stripe_end
) {
490 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
/* Only rows inside the [min_p, max_p] range get a page read in */
492 if ((min_p
<= p
) && (p
<= max_p
)) {
496 BUG_ON(_1ps
->pages
[c
]);
497 page
= ios
->r4w
->get_page(ios
->private, offset
,
502 _1ps
->pages
[c
] = page
;
503 /* Mark read-pages to be cache_released */
504 _1ps
->page_is_read
[c
] = true;
506 _add_to_read_4_write(ios
, &read_si
, page
);
510 if (p
== (sp2d
->pages_in_unit
- 1)) {
513 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
515 read_si
.obj_offset
+= PAGE_SIZE
;
521 ios_read
= ios
->ios_read_4_write
;
525 /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
526 * to check for per_dev->bio
528 ios_read
->pages
= ios
->pages
;
530 /* Now read these devices */
531 for (i
= 0; i
< ios_read
->numdevs
; i
+= ios_read
->layout
->mirrors_p1
) {
532 ret
= _ore_read_mirror(ios_read
, i
);
537 ret
= ore_io_execute(ios_read
); /* Synchronous execution */
539 ORE_DBGMSG("!! ore_io_execute => %d\n", ret
);
543 _mark_read4write_pages_uptodate(ios_read
, ret
);
/* Handle a parity unit of @per_dev while an IO is being built.
 * Two paths are visible. One only checks cur_sg against ios->sgs_per_dev
 * and adds an sg gap of @cur_len with _ore_add_sg_seg(). The other takes
 * the used row range from _sp2d_min_pg/_sp2d_max_pg, allocates one parity
 * page per row with _raid_page_alloc() (advancing ios->cur_par_page),
 * adds the pages with _ore_add_stripe_unit() and finally resets the sp2d.
 * NOTE(review): the condition selecting between the two paths, part of
 * the parameter list and the returns are not visible in this listing.
 */
547 /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */
548 int _ore_add_parity_unit(struct ore_io_state
*ios
,
549 struct ore_striping_info
*si
,
550 struct ore_per_dev_state
*per_dev
,
554 BUG_ON(per_dev
->cur_sg
>= ios
->sgs_per_dev
);
555 _ore_add_sg_seg(per_dev
, cur_len
, true);
557 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
558 struct page
**pages
= ios
->parity_pages
+ ios
->cur_par_page
;
560 unsigned array_start
= 0;
564 si
->cur_pg
= _sp2d_min_pg(sp2d
);
565 num_pages
= _sp2d_max_pg(sp2d
) + 1 - si
->cur_pg
;
567 if (!cur_len
) /* If last stripe operate on parity comp */
568 si
->cur_comp
= sp2d
->data_devs
;
570 if (!per_dev
->length
) {
571 per_dev
->offset
+= si
->cur_pg
* PAGE_SIZE
;
572 /* If first stripe, Read in all read4write pages
573 * (if needed) before we calculate the first parity.
578 for (i
= 0; i
< num_pages
; i
++) {
579 pages
[i
] = _raid_page_alloc();
580 if (unlikely(!pages
[i
]))
583 ++(ios
->cur_par_page
);
586 BUG_ON(si
->cur_comp
!= sp2d
->data_devs
);
587 BUG_ON(si
->cur_pg
+ num_pages
> sp2d
->pages_in_unit
);
589 ret
= _ore_add_stripe_unit(ios
, &array_start
, 0, pages
,
590 per_dev
, num_pages
* PAGE_SIZE
);
594 /* TODO: raid6 if (last_parity_dev) */
596 _sp2d_reset(sp2d
, ios
->r4w
, ios
->private);
/* RAID specific setup of @ios after it was allocated.
 * When ios->parity_pages is set, ios->sp2d is allocated with
 * pages_in_unit = layout->stripe_unit / PAGE_SIZE, and ios->offset is
 * required to be page aligned (BUG_ON). If the IO ends in a later stripe
 * than it starts in, ios->length is cut back to end on that stripe
 * boundary. ios->nr_pages and ios->si.length are also assigned here.
 * NOTE(review): the return statements are not visible in this listing.
 */
601 int _ore_post_alloc_raid_stuff(struct ore_io_state
*ios
)
603 struct ore_layout
*layout
= ios
->layout
;
605 if (ios
->parity_pages
) {
606 unsigned pages_in_unit
= layout
->stripe_unit
/ PAGE_SIZE
;
607 unsigned stripe_size
= ios
->si
.bytes_in_stripe
;
608 u64 last_stripe
, first_stripe
;
610 if (_sp2d_alloc(pages_in_unit
, layout
->group_width
,
611 layout
->parity
, &ios
->sp2d
)) {
615 BUG_ON(ios
->offset
% PAGE_SIZE
);
617 /* Round io down to last full stripe */
618 first_stripe
= div_u64(ios
->offset
, stripe_size
);
619 last_stripe
= div_u64(ios
->offset
+ ios
->length
, stripe_size
);
621 /* If an IO spans more than a single stripe it must end at
622 * a stripe boundary. The remainder at the end is pushed into the
625 if (last_stripe
!= first_stripe
) {
626 ios
->length
= last_stripe
* stripe_size
- ios
->offset
;
628 BUG_ON(!ios
->length
);
629 ios
->nr_pages
= (ios
->length
+ PAGE_SIZE
- 1) /
631 ios
->si
.length
= ios
->length
; /*make it consistent */
/* Release the RAID resources held by @ios.
 * With ios->sp2d set (writing and raid): the parity pages below
 * ios->cur_par_page are passed to _raid_page_free(), ios->parity_pages is
 * kfreed when ios->extra_part_alloc is set, and the sp2d is reset and then
 * freed.
 * On the other path ios->per_dev[0].sglist is kfreed when
 * ios->extra_part_alloc is set.
 * An existing ios->ios_read_4_write is dropped with ore_put_io_state().
 */
637 void _ore_free_raid_stuff(struct ore_io_state
*ios
)
639 if (ios
->sp2d
) { /* writing and raid */
642 for (i
= 0; i
< ios
->cur_par_page
; i
++) {
643 struct page
*page
= ios
->parity_pages
[i
];
646 _raid_page_free(page
);
648 if (ios
->extra_part_alloc
)
649 kfree(ios
->parity_pages
);
650 /* If IO returned an error pages might need unlocking */
651 _sp2d_reset(ios
->sp2d
, ios
->r4w
, ios
->private);
652 _sp2d_free(ios
->sp2d
);
654 /* Will only be set if raid reading && sglist is big */
655 if (ios
->extra_part_alloc
)
656 kfree(ios
->per_dev
[0].sglist
);
658 if (ios
->ios_read_4_write
)
659 ore_put_io_state(ios
->ios_read_4_write
);