3 * Boaz Harrosh <bharrosh@panasas.com>
5 * This file is part of the objects raid engine (ore).
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
16 #include <linux/gfp.h>
17 #include <linux/async_tx.h>
22 #define ORE_DBGMSG2 ORE_DBGMSG
24 struct page
*_raid_page_alloc(void)
26 return alloc_page(GFP_KERNEL
);
/* Counterpart of _raid_page_alloc(); _ore_free_raid_stuff() calls it on the
 * entries of ios->parity_pages.
 * NOTE(review): the function body is not present in this copy of the file;
 * presumably it releases @p - confirm against the complete source.
 */
29 void _raid_page_free(struct page
*p
)
/* Bookkeeping for RAID writes, hung on ios->sp2d. It holds one
 * __1_page_stripe per page of a stripe unit; each of those carries the
 * async-xor submit control, the tx descriptor and per-device page arrays.
 * NOTE(review): members used elsewhere in this file (parity, data_devs,
 * needed, alloc, write_count, pages, page_is_read, _1p_stripes) have no
 * visible declaration in this copy - confirm against the complete source.
 */
34 /* This struct is forward declared in ore_io_state, but is private to here.
35 * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
37 * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
38 * Ascending page index access is sp2d(p-minor, c-major). But storage is
39 * sp2d[p-minor][c-major], so it can be properly presented to the async-xor
42 struct __stripe_pages_2d
{
43 /* Cache some hot path repeated calculations */
46 unsigned pages_in_unit
;
50 /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
51 struct __1_page_stripe
{
54 struct async_submit_ctl submit
;
55 struct dma_async_tx_descriptor
*tx
;
57 /* The size of this array is data_devs + parity */
59 struct page
**scribble
;
60 /* bool array, size of this array is data_devs */
/* Allocate and wire up a __stripe_pages_2d for @pages_in_unit pages and
 * @group_width devices, of which @parity are parity devices (data_devs is
 * group_width - parity), and hand it back through @psp2d.
 *
 * The first kzalloc() is limited to at most PAGE_SIZE: when the whole layout
 * does not fit, only as many per-page array sets as fit are placed in it, and
 * further chunks are kzalloc'ed inside the loop. The _1p_stripes[] entry that
 * is first served from such an extra chunk gets .alloc = true, which is what
 * _sp2d_free() tests before calling kfree(). *psp2d is assigned as soon as the
 * first allocation has succeeded.
 * NOTE(review): the return statements and several closing lines are missing
 * from this copy of the function - confirm against the complete source.
 */
65 /* This can get bigger than a page. So support multiple page allocations
66 * _sp2d_free should be called even if _sp2d_alloc fails (by returning
69 static int _sp2d_alloc(unsigned pages_in_unit
, unsigned group_width
,
70 unsigned parity
, struct __stripe_pages_2d
**psp2d
)
72 struct __stripe_pages_2d
*sp2d
;
73 unsigned data_devs
= group_width
- parity
;
74 struct _alloc_all_bytes
{
75 struct __alloc_stripe_pages_2d
{
76 struct __stripe_pages_2d sp2d
;
77 struct __1_page_stripe _1p_stripes
[pages_in_unit
];
79 struct __alloc_1p_arrays
{
80 struct page
*pages
[group_width
];
81 struct page
*scribble
[group_width
];
82 char page_is_read
[data_devs
];
83 } __a1pa
[pages_in_unit
];
85 struct __alloc_1p_arrays
*__a1pa
;
86 struct __alloc_1p_arrays
*__a1pa_end
;
87 const unsigned sizeof__a1pa
= sizeof(_aab
->__a1pa
[0]);
88 unsigned num_a1pa
, alloc_size
, i
;
90 /* FIXME: check these numbers in ore_verify_layout */
91 BUG_ON(sizeof(_aab
->__asp2d
) > PAGE_SIZE
);
92 BUG_ON(sizeof__a1pa
> PAGE_SIZE
);
94 if (sizeof(*_aab
) > PAGE_SIZE
) {
95 num_a1pa
= (PAGE_SIZE
- sizeof(_aab
->__asp2d
)) / sizeof__a1pa
;
96 alloc_size
= sizeof(_aab
->__asp2d
) + sizeof__a1pa
* num_a1pa
;
98 num_a1pa
= pages_in_unit
;
99 alloc_size
= sizeof(*_aab
);
102 _aab
= kzalloc(alloc_size
, GFP_KERNEL
);
103 if (unlikely(!_aab
)) {
104 ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size
);
108 sp2d
= &_aab
->__asp2d
.sp2d
;
109 *psp2d
= sp2d
; /* From here Just call _sp2d_free */
111 __a1pa
= _aab
->__a1pa
;
112 __a1pa_end
= __a1pa
+ num_a1pa
;
114 for (i
= 0; i
< pages_in_unit
; ++i
) {
115 if (unlikely(__a1pa
>= __a1pa_end
)) {
116 num_a1pa
= min_t(unsigned, PAGE_SIZE
/ sizeof__a1pa
,
119 __a1pa
= kzalloc(num_a1pa
* sizeof__a1pa
, GFP_KERNEL
);
120 if (unlikely(!__a1pa
)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
125 __a1pa_end
= __a1pa
+ num_a1pa
;
126 /* First *pages is marked for kfree of the buffer */
127 sp2d
->_1p_stripes
[i
].alloc
= true;
130 sp2d
->_1p_stripes
[i
].pages
= __a1pa
->pages
;
131 sp2d
->_1p_stripes
[i
].scribble
= __a1pa
->scribble
;
132 sp2d
->_1p_stripes
[i
].page_is_read
= __a1pa
->page_is_read
;
136 sp2d
->parity
= parity
;
137 sp2d
->data_devs
= data_devs
;
138 sp2d
->pages_in_unit
= pages_in_unit
;
/* Return @sp2d to its empty state.
 * For each page-stripe the code tests write_count against the group width
 * (data_devs + parity); pages flagged in page_is_read[] are handed back with
 * r4w->put_page(priv, page) and the flag is cleared. The stripe's pages[]
 * array is zeroed over group_width entries and write_count is reset to 0.
 * Finally sp2d->needed is set to false.
 * NOTE(review): lines are missing from this copy, so the exact nesting of
 * these steps cannot be confirmed here.
 */
142 static void _sp2d_reset(struct __stripe_pages_2d
*sp2d
,
143 const struct _ore_r4w_op
*r4w
, void *priv
)
145 unsigned data_devs
= sp2d
->data_devs
;
146 unsigned group_width
= data_devs
+ sp2d
->parity
;
152 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
153 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
155 if (_1ps
->write_count
< group_width
) {
158 for (c
= 0; c
< data_devs
; c
++)
159 if (_1ps
->page_is_read
[c
]) {
160 struct page
*page
= _1ps
->pages
[c
];
162 r4w
->put_page(priv
, page
);
163 _1ps
->page_is_read
[c
] = false;
167 memset(_1ps
->pages
, 0, group_width
* sizeof(*_1ps
->pages
));
168 _1ps
->write_count
= 0;
172 sp2d
->needed
= false;
/* Release the extra array chunks of @sp2d: every _1p_stripes[] entry whose
 * .alloc flag is set has its pages pointer passed to kfree() (the flag is set
 * in _sp2d_alloc() on the first entry of each separately allocated chunk).
 * NOTE(review): any NULL check on @sp2d and the freeing of @sp2d itself are
 * not visible in this copy - confirm against the complete source.
 */
175 static void _sp2d_free(struct __stripe_pages_2d
*sp2d
)
182 for (i
= 0; i
< sp2d
->pages_in_unit
; ++i
) {
183 if (sp2d
->_1p_stripes
[i
].alloc
)
184 kfree(sp2d
->_1p_stripes
[i
].pages
);
/* Scan the page-stripes of @sp2d in ascending order, testing each one's
 * write_count.
 * NOTE(review): the return statements are missing from this copy; presumably
 * the index of the first stripe with a non-zero write_count is returned -
 * confirm, including the value returned when none is found.
 */
190 static unsigned _sp2d_min_pg(struct __stripe_pages_2d
*sp2d
)
194 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
195 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
197 if (_1ps
->write_count
)
/* Scan the page-stripes of @sp2d in descending order, starting at
 * pages_in_unit - 1, testing each one's write_count.
 * NOTE(review): the return statements and the declaration of p are missing
 * from this copy. The loop condition `p >= 0` only terminates if p is a
 * signed type - confirm against the complete source.
 */
204 static unsigned _sp2d_max_pg(struct __stripe_pages_2d
*sp2d
)
208 for (p
= sp2d
->pages_in_unit
- 1; p
>= 0; --p
) {
209 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
211 if (_1ps
->write_count
)
/* Compute XOR parity for the page-stripes of @sp2d.
 * First pass: each stripe's write_count is tested (presumably stripes with
 * no written pages are skipped - the statement after the test is missing).
 * The submit control is initialised with
 * ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK and the scribble array as the address
 * conversion buffer, then async_xor() is started with pages[data_devs] as
 * destination and the first data_devs entries of pages[] as sources, at
 * offset 0 and PAGE_SIZE bytes long. The returned descriptor is kept in tx.
 * Second pass: async_tx_issue_pending() is called on each stripe's tx.
 * NOTE(review): some arguments and statements are missing from this copy.
 */
218 static void _gen_xor_unit(struct __stripe_pages_2d
*sp2d
)
221 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
222 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
224 if (!_1ps
->write_count
)
227 init_async_submit(&_1ps
->submit
,
228 ASYNC_TX_XOR_ZERO_DST
| ASYNC_TX_ACK
,
231 (addr_conv_t
*)_1ps
->scribble
);
234 _1ps
->tx
= async_xor(_1ps
->pages
[sp2d
->data_devs
], _1ps
->pages
,
235 0, sp2d
->data_devs
, PAGE_SIZE
,
239 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
240 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
241 /* NOTE: We wait for HW synchronously (I don't have such HW
242 * to test with.) Is parallelism needed with today's multi
245 async_tx_issue_pending(_1ps
->tx
);
/* Record @page in @sp2d at page-stripe si->cur_pg, component si->cur_comp,
 * then advance si->cur_pg by one, wrapping around at sp2d->pages_in_unit.
 * si->cur_comp is left for the caller to advance.
 * NOTE(review): some statements of this function are missing from this copy
 * (original lines 254 and 258) - confirm against the complete source.
 */
249 void _ore_add_stripe_page(struct __stripe_pages_2d
*sp2d
,
250 struct ore_striping_info
*si
, struct page
*page
)
252 struct __1_page_stripe
*_1ps
;
256 _1ps
= &sp2d
->_1p_stripes
[si
->cur_pg
];
257 _1ps
->pages
[si
->cur_comp
] = page
;
260 si
->cur_pg
= (si
->cur_pg
+ 1) % sp2d
->pages_in_unit
;
261 /* si->cur_comp is advanced outside at main loop */
/* Maintain @per_dev's scatter-gather list when a unit of @cur_len bytes has
 * to be skipped.
 * When cur_sg is 0 the first sglist entry is used: with a non-zero
 * per_dev->length it is filled from per_dev->offset/length, otherwise
 * per_dev->offset is simply advanced by @cur_len. When cur_sg is non-zero the
 * previous entry (cur_sg - 1) is finalised with
 * len = length - last_sgs_total. The following entry's offset is then
 * pre-set to sge->offset + sge->len + cur_len, and last_sgs_total is updated
 * to the current length.
 * NOTE(review): the end of the parameter list (the debug message prints a
 * not_last value), the cur_sg updates and the branch structure are missing
 * from this copy - confirm against the complete source.
 */
264 void _ore_add_sg_seg(struct ore_per_dev_state
*per_dev
, unsigned cur_len
,
267 struct osd_sg_entry
*sge
;
269 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
270 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
271 per_dev
->dev
, cur_len
, not_last
, per_dev
->cur_sg
,
272 _LLU(per_dev
->offset
), per_dev
->length
,
273 per_dev
->last_sgs_total
);
275 if (!per_dev
->cur_sg
) {
276 sge
= per_dev
->sglist
;
278 /* First time we prepare two entries */
279 if (per_dev
->length
) {
281 sge
->offset
= per_dev
->offset
;
282 sge
->len
= per_dev
->length
;
284 /* Here the parity is the first unit of this object.
285 * This happens every time we reach a parity device on
286 * the same stripe as the per_dev->offset. We need to
287 * just skip this unit.
289 per_dev
->offset
+= cur_len
;
293 /* finalize the last one */
294 sge
= &per_dev
->sglist
[per_dev
->cur_sg
- 1];
295 sge
->len
= per_dev
->length
- per_dev
->last_sgs_total
;
299 /* Partly prepare the next one */
300 struct osd_sg_entry
*next_sge
= sge
+ 1;
303 next_sge
->offset
= sge
->offset
+ sge
->len
+ cur_len
;
304 /* Save cur len so we know how much was added next time */
305 per_dev
->last_sgs_total
= per_dev
->length
;
307 } else if (!sge
->len
) {
308 /* Optimize for when the last unit is a parity */
313 static int _alloc_read_4_write(struct ore_io_state
*ios
)
315 struct ore_layout
*layout
= ios
->layout
;
317 /* We want to only read those pages not in cache so worst case
318 * is a stripe populated with every other page
320 unsigned sgs_per_dev
= ios
->sp2d
->pages_in_unit
+ 2;
322 ret
= _ore_get_io_state(layout
, ios
->oc
,
323 layout
->group_width
* layout
->mirrors_p1
,
324 sgs_per_dev
, 0, &ios
->ios_read_4_write
);
/* Queue @page (@pg_len bytes) for reading on the read-4-write io state of
 * @ios, creating that state with _alloc_read_4_write() when it does not exist
 * yet. The per-device slot is si->dev relative to the first device of its
 * group (group_width * mirrors_p1 devices per group).
 * An empty slot (length == 0) gets a bio from bio_kmalloc() sized for
 * pages_in_unit pages and takes its offset and dev from @si. A slot whose
 * next expected offset differs from si->obj_offset gets an sg gap through
 * _ore_add_sg_seg(). The page is then added with bio_add_pc_page() at
 * in-page offset si->obj_offset % PAGE_SIZE and per_dev->length grows by
 * @pg_len.
 * NOTE(review): error returns and some statements are missing from this copy
 * - confirm against the complete source.
 */
328 /* @si contains info of the to-be-inserted page. Update of @si should be
329 * maintained by caller. Specifically si->dev, si->obj_offset, ...
331 static int _add_to_r4w(struct ore_io_state
*ios
, struct ore_striping_info
*si
,
332 struct page
*page
, unsigned pg_len
)
334 struct request_queue
*q
;
335 struct ore_per_dev_state
*per_dev
;
336 struct ore_io_state
*read_ios
;
337 unsigned first_dev
= si
->dev
- (si
->dev
%
338 (ios
->layout
->group_width
* ios
->layout
->mirrors_p1
));
339 unsigned comp
= si
->dev
- first_dev
;
342 if (!ios
->ios_read_4_write
) {
343 int ret
= _alloc_read_4_write(ios
);
349 read_ios
= ios
->ios_read_4_write
;
350 read_ios
->numdevs
= ios
->layout
->group_width
* ios
->layout
->mirrors_p1
;
352 per_dev
= &read_ios
->per_dev
[comp
];
353 if (!per_dev
->length
) {
354 per_dev
->bio
= bio_kmalloc(GFP_KERNEL
,
355 ios
->sp2d
->pages_in_unit
);
356 if (unlikely(!per_dev
->bio
)) {
357 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
358 ios
->sp2d
->pages_in_unit
);
361 per_dev
->offset
= si
->obj_offset
;
362 per_dev
->dev
= si
->dev
;
363 } else if (si
->obj_offset
!= (per_dev
->offset
+ per_dev
->length
)) {
364 u64 gap
= si
->obj_offset
- (per_dev
->offset
+ per_dev
->length
);
366 _ore_add_sg_seg(per_dev
, gap
, true);
368 q
= osd_request_queue(ore_comp_dev(read_ios
->oc
, per_dev
->dev
));
369 added_len
= bio_add_pc_page(q
, per_dev
->bio
, page
, pg_len
,
370 si
->obj_offset
% PAGE_SIZE
);
371 if (unlikely(added_len
!= pg_len
)) {
372 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
373 per_dev
->bio
->bi_vcnt
);
377 per_dev
->length
+= pg_len
;
381 /* read the beginning of an unaligned first page */
382 static int _add_to_r4w_first_page(struct ore_io_state
*ios
, struct page
*page
)
384 struct ore_striping_info si
;
387 ore_calc_stripe_info(ios
->layout
, ios
->offset
, 0, &si
);
389 pg_len
= si
.obj_offset
% PAGE_SIZE
;
390 si
.obj_offset
-= pg_len
;
392 ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
393 _LLU(si
.obj_offset
), pg_len
, page
->index
, si
.dev
);
395 return _add_to_r4w(ios
, &si
, page
, pg_len
);
/* Queue the tail of the page that contains *@offset for reading.
 * The striping info for *@offset gives the page-stripe index
 * (unit_off / PAGE_SIZE) and, via _dev_order(), the component index; the page
 * already stored at that sp2d slot is used. The length read is the distance
 * from unit_off to the end of its page. Returns the result of _add_to_r4w().
 * NOTE(review): some statements are missing from this copy (the caller
 * expects *@offset to end up page aligned) - confirm against the complete
 * source.
 */
398 /* read the end of an incomplete last page */
399 static int _add_to_r4w_last_page(struct ore_io_state
*ios
, u64
*offset
)
401 struct ore_striping_info si
;
403 unsigned pg_len
, p
, c
;
405 ore_calc_stripe_info(ios
->layout
, *offset
, 0, &si
);
407 p
= si
.unit_off
/ PAGE_SIZE
;
408 c
= _dev_order(ios
->layout
->group_width
* ios
->layout
->mirrors_p1
,
409 ios
->layout
->mirrors_p1
, si
.par_dev
, si
.dev
);
410 page
= ios
->sp2d
->_1p_stripes
[p
].pages
[c
];
412 pg_len
= PAGE_SIZE
- (si
.unit_off
% PAGE_SIZE
);
415 ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
416 p
, c
, _LLU(*offset
), pg_len
, si
.dev
, si
.par_dev
);
420 return _add_to_r4w(ios
, &si
, page
, pg_len
);
/* Walk every segment of every per-device bio of @ios and update the page
 * flags of its page with SetPageUptodate() and ClearPageError().
 * NOTE(review): any conditions guarding these calls, the handling of a
 * missing bio and the use of @ret are not visible in this copy - confirm
 * against the complete source.
 */
423 static void _mark_read4write_pages_uptodate(struct ore_io_state
*ios
, int ret
)
428 /* loop on all devices all pages */
429 for (d
= 0; d
< ios
->numdevs
; d
++) {
430 struct bio
*bio
= ios
->per_dev
[d
].bio
;
435 __bio_for_each_segment(bv
, bio
, i
, 0) {
436 struct page
*page
= bv
->bv_page
;
438 SetPageUptodate(page
);
440 ClearPageError(page
);
/* Read the pages needed for parity that are not part of this write: pages of
 * the first stripe that precede ios->offset, and pages that follow the end of
 * the io up to the end of its stripe (last_stripe_end).
 * Pages come from ios->r4w->get_page(), are stored in sp2d with
 * page_is_read[] set so that _sp2d_reset() releases them, and are queued with
 * _add_to_r4w(). Unaligned first/last pages go through
 * _add_to_r4w_first_page() and _add_to_r4w_last_page(). The queued reads are
 * then set up per mirror group with _ore_read_mirror(), run with
 * ore_io_execute(), and _mark_read4write_pages_uptodate() is called with the
 * result.
 * NOTE(review): loop headers, error checks and return statements are missing
 * from this copy. Also max_p is declared unsigned yet initialised to -1.
 * Confirm both against the complete source.
 */
445 /* read_4_write is hacked to read the start of the first stripe and/or
446 * the end of the last stripe. If needed, with an sg-gap at each device/page.
447 * It is assumed to be called after the to_be_written pages of the first stripe
448 * are populating ios->sp2d[][]
450 * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
451 * These pages are held at sp2d[p].pages[c] but with
452 * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
453 * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
454 * @uptodate=true, so we don't need to read it, only unlock, after IO.
456 * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger than
457 * to-be-written count, we should consider the xor-in-place mode.
458 * need_to_read_pages_count is the actual number of pages not present in cache.
459 * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
460 * approximation? In this mode the read pages are put in the empty places of
461 * ios->sp2d[p][*], xor is calculated the same way. These pages are
462 * allocated/freed and don't go through cache
464 static int _read_4_write(struct ore_io_state
*ios
)
466 struct ore_io_state
*ios_read
;
467 struct ore_striping_info read_si
;
468 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
469 u64 offset
= ios
->si
.first_stripe_start
;
471 unsigned bytes_in_stripe
= ios
->si
.bytes_in_stripe
;
472 unsigned i
, c
, p
, min_p
= sp2d
->pages_in_unit
, max_p
= -1;
475 if (offset
== ios
->offset
) /* Go to start collect $200 */
476 goto read_last_stripe
;
478 min_p
= _sp2d_min_pg(sp2d
);
479 max_p
= _sp2d_max_pg(sp2d
);
482 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
483 read_si
.obj_offset
+= min_p
* PAGE_SIZE
;
484 offset
+= min_p
* PAGE_SIZE
;
485 for (p
= min_p
; p
<= max_p
; p
++) {
486 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
487 struct page
**pp
= &_1ps
->pages
[c
];
491 if (ios
->offset
% PAGE_SIZE
)
492 /* Read the remainder of the page */
493 _add_to_r4w_first_page(ios
, *pp
);
494 /* to-be-written pages start here */
495 goto read_last_stripe
;
498 *pp
= ios
->r4w
->get_page(ios
->private, offset
,
504 _add_to_r4w(ios
, &read_si
, *pp
, PAGE_SIZE
);
506 /* Mark read-pages to be cache_released */
507 _1ps
->page_is_read
[c
] = true;
508 read_si
.obj_offset
+= PAGE_SIZE
;
511 offset
+= (sp2d
->pages_in_unit
- p
) * PAGE_SIZE
;
515 offset
= ios
->offset
+ ios
->length
;
516 if (offset
% PAGE_SIZE
)
517 _add_to_r4w_last_page(ios
, &offset
);
518 /* offset will be aligned to next page */
520 last_stripe_end
= div_u64(offset
+ bytes_in_stripe
- 1, bytes_in_stripe
)
522 if (offset
== last_stripe_end
) /* Optimize for the aligned case */
525 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
526 p
= read_si
.unit_off
/ PAGE_SIZE
;
527 c
= _dev_order(ios
->layout
->group_width
* ios
->layout
->mirrors_p1
,
528 ios
->layout
->mirrors_p1
, read_si
.par_dev
, read_si
.dev
);
530 BUG_ON(ios
->si
.first_stripe_start
+ bytes_in_stripe
!= last_stripe_end
);
531 /* unaligned IO must be within a single stripe */
533 if (min_p
== sp2d
->pages_in_unit
) {
534 /* Didn't do it yet */
535 min_p
= _sp2d_min_pg(sp2d
);
536 max_p
= _sp2d_max_pg(sp2d
);
539 while (offset
< last_stripe_end
) {
540 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
542 if ((min_p
<= p
) && (p
<= max_p
)) {
546 BUG_ON(_1ps
->pages
[c
]);
547 page
= ios
->r4w
->get_page(ios
->private, offset
,
552 _1ps
->pages
[c
] = page
;
553 /* Mark read-pages to be cache_released */
554 _1ps
->page_is_read
[c
] = true;
556 _add_to_r4w(ios
, &read_si
, page
, PAGE_SIZE
);
560 if (p
== (sp2d
->pages_in_unit
- 1)) {
563 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
565 read_si
.obj_offset
+= PAGE_SIZE
;
571 ios_read
= ios
->ios_read_4_write
;
575 /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
576 * to check for per_dev->bio
578 ios_read
->pages
= ios
->pages
;
580 /* Now read these devices */
581 for (i
= 0; i
< ios_read
->numdevs
; i
+= ios_read
->layout
->mirrors_p1
) {
582 ret
= _ore_read_mirror(ios_read
, i
);
587 ret
= ore_io_execute(ios_read
); /* Synchronous execution */
589 ORE_DBGMSG("!! ore_io_execute => %d\n", ret
);
593 _mark_read4write_pages_uptodate(ios_read
, ret
);
/* Handle the parity unit of a stripe for @per_dev.
 * One visible path checks per_dev->cur_sg against ios->sgs_per_dev and adds
 * an sg gap of @cur_len with _ore_add_sg_seg(). The other path works on
 * ios->sp2d: si->cur_pg is set to the lowest used page-stripe, and the number
 * of parity pages spans up to the highest used one. When cur_len is 0,
 * si->cur_comp is forced to the parity component. An empty per_dev has its
 * offset advanced by cur_pg pages. The parity pages are allocated with
 * _raid_page_alloc() into ios->parity_pages (cur_par_page counts them) and
 * added with _ore_add_stripe_unit(), after which the sp2d is reset with
 * _sp2d_reset().
 * NOTE(review): the end of the parameter list, the condition selecting
 * between the two paths, error returns and some calls are missing from this
 * copy - confirm against the complete source.
 */
597 /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */
598 int _ore_add_parity_unit(struct ore_io_state
*ios
,
599 struct ore_striping_info
*si
,
600 struct ore_per_dev_state
*per_dev
,
604 if (per_dev
->cur_sg
>= ios
->sgs_per_dev
) {
605 ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
606 per_dev
->cur_sg
, ios
->sgs_per_dev
);
609 _ore_add_sg_seg(per_dev
, cur_len
, true);
611 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
612 struct page
**pages
= ios
->parity_pages
+ ios
->cur_par_page
;
614 unsigned array_start
= 0;
618 si
->cur_pg
= _sp2d_min_pg(sp2d
);
619 num_pages
= _sp2d_max_pg(sp2d
) + 1 - si
->cur_pg
;
621 if (!cur_len
) /* If last stripe operate on parity comp */
622 si
->cur_comp
= sp2d
->data_devs
;
624 if (!per_dev
->length
) {
625 per_dev
->offset
+= si
->cur_pg
* PAGE_SIZE
;
626 /* If first stripe, Read in all read4write pages
627 * (if needed) before we calculate the first parity.
632 for (i
= 0; i
< num_pages
; i
++) {
633 pages
[i
] = _raid_page_alloc();
634 if (unlikely(!pages
[i
]))
637 ++(ios
->cur_par_page
);
640 BUG_ON(si
->cur_comp
!= sp2d
->data_devs
);
641 BUG_ON(si
->cur_pg
+ num_pages
> sp2d
->pages_in_unit
);
643 ret
= _ore_add_stripe_unit(ios
, &array_start
, 0, pages
,
644 per_dev
, num_pages
* PAGE_SIZE
);
648 /* TODO: raid6 if (last_parity_dev) */
650 _sp2d_reset(sp2d
, ios
->r4w
, ios
->private);
/* Finish raid set-up of @ios after allocation. Only acts when
 * ios->parity_pages is set: the sp2d is allocated with _sp2d_alloc() for
 * stripe_unit / PAGE_SIZE pages per unit. The stripe index of the io's start
 * and of its end are then computed; when they differ, ios->length is cut so
 * the io ends on the last stripe boundary, and nr_pages and si.length are
 * recomputed to match.
 * NOTE(review): the error handling after _sp2d_alloc() and the return
 * statements are missing from this copy - confirm against the complete
 * source.
 */
655 int _ore_post_alloc_raid_stuff(struct ore_io_state
*ios
)
657 struct ore_layout
*layout
= ios
->layout
;
659 if (ios
->parity_pages
) {
660 unsigned pages_in_unit
= layout
->stripe_unit
/ PAGE_SIZE
;
661 unsigned stripe_size
= ios
->si
.bytes_in_stripe
;
662 u64 last_stripe
, first_stripe
;
664 if (_sp2d_alloc(pages_in_unit
, layout
->group_width
,
665 layout
->parity
, &ios
->sp2d
)) {
669 /* Round io down to last full stripe */
670 first_stripe
= div_u64(ios
->offset
, stripe_size
);
671 last_stripe
= div_u64(ios
->offset
+ ios
->length
, stripe_size
);
673 /* If an IO spans more than a single stripe it must end at
674 * a stripe boundary. The remainder at the end is pushed into the
677 if (last_stripe
!= first_stripe
) {
678 ios
->length
= last_stripe
* stripe_size
- ios
->offset
;
680 BUG_ON(!ios
->length
);
681 ios
->nr_pages
= (ios
->length
+ PAGE_SIZE
- 1) /
683 ios
->si
.length
= ios
->length
; /*make it consistent */
/* Release the raid resources of @ios.
 * With an sp2d present (the write path): the first cur_par_page entries of
 * ios->parity_pages are released with _raid_page_free(), and parity_pages
 * itself is kfree'd when extra_part_alloc is set. The sp2d is then reset,
 * which releases pages still held for read-4-write, and freed.
 * The other visible path kfree's per_dev[0].sglist when extra_part_alloc is
 * set. Finally a read-4-write io state, if any, is dropped with
 * ore_put_io_state().
 * NOTE(review): the branch structure and any per-page checks inside the loop
 * are missing from this copy - confirm against the complete source.
 */
689 void _ore_free_raid_stuff(struct ore_io_state
*ios
)
691 if (ios
->sp2d
) { /* writing and raid */
694 for (i
= 0; i
< ios
->cur_par_page
; i
++) {
695 struct page
*page
= ios
->parity_pages
[i
];
698 _raid_page_free(page
);
700 if (ios
->extra_part_alloc
)
701 kfree(ios
->parity_pages
);
702 /* If IO returned an error pages might need unlocking */
703 _sp2d_reset(ios
->sp2d
, ios
->r4w
, ios
->private);
704 _sp2d_free(ios
->sp2d
);
706 /* Will only be set if raid reading && sglist is big */
707 if (ios
->extra_part_alloc
)
708 kfree(ios
->per_dev
[0].sglist
);
710 if (ios
->ios_read_4_write
)
711 ore_put_io_state(ios
->ios_read_4_write
);