2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
7 * This file is part of exofs.
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include <linux/slab.h>
26 #include <scsi/scsi_device.h>
27 #include <asm/div64.h>
/*
 * EXOFS_DBGMSG2 - second-level debug print, compiled out to an empty
 * statement here. The commented-out alternative below maps it onto
 * EXOFS_DBGMSG instead.
 */
31 #define EXOFS_DBGMSG2(M...) do {} while (0)
32 /* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
/*
 * exofs_make_credential - thin wrapper around
 * osd_sec_init_nosec_doall_caps().
 *
 * @cred_a: buffer of OSD_CAP_LEN bytes handed to the helper
 * @obj:    object id handed to the helper
 *
 * The two trailing flag arguments are fixed to (false, true).
 * NOTE(review): the helper's name suggests a "no security, all
 * capabilities" credential - confirm against the OSD library.
 */
34 void exofs_make_credential(u8 cred_a
[OSD_CAP_LEN
], const struct osd_obj_id
*obj
)
36 osd_sec_init_nosec_doall_caps(cred_a
, obj
, false, true);
/*
 * exofs_read_kern - read @length bytes at @offset of @obj on @od into the
 * buffer @p.
 *
 * Visible sequence: osd_start_request() (GFP_KERNEL),
 * osd_req_read_kern(), osd_finalize_request() with @cred, then
 * osd_execute_request(). Each step has a debug message for its failure or
 * result.
 * NOTE(review): the error checks, request teardown and return statement
 * are not visible in this listing - verify them against the full source.
 */
39 int exofs_read_kern(struct osd_dev
*od
, u8
*cred
, struct osd_obj_id
*obj
,
40 u64 offset
, void *p
, unsigned length
)
42 struct osd_request
*or = osd_start_request(od
, GFP_KERNEL
);
43 /* struct osd_sense_info osi = {.key = 0};*/
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__
);
/* Describe the read into the caller's buffer */
50 ret
= osd_req_read_kern(or, obj
, offset
, p
, length
);
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__
);
/* Attach the credential before execution */
56 ret
= osd_finalize_request(or, 0, cred
, NULL
);
58 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret
);
62 ret
= osd_execute_request(or);
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret
);
65 /* osd_req_decode_sense(or, ret); */
/*
 * exofs_get_io_state - allocate an io_state sized for @layout.
 *
 * @layout: layout whose s_numdevs determines the allocation size and
 *          whose s_pid is copied into ios->obj.partition
 * @pios:   output pointer; presumably receives the new io_state - the
 *          store is not visible in this listing, confirm
 *
 * The state is allocated zeroed (kzalloc, GFP_KERNEL) with
 * exofs_io_state_size(layout->s_numdevs) bytes; a debug message reports
 * the requested size when allocation fails.
 */
72 int exofs_get_io_state(struct exofs_layout
*layout
,
73 struct exofs_io_state
**pios
)
75 struct exofs_io_state
*ios
;
77 /*TODO: Maybe use kmem_cach per sbi of size
78 * exofs_io_state_size(layout->s_numdevs)
80 ios
= kzalloc(exofs_io_state_size(layout
->s_numdevs
), GFP_KERNEL
);
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
83 exofs_io_state_size(layout
->s_numdevs
));
89 ios
->obj
.partition
= layout
->s_pid
;
/*
 * exofs_put_io_state - release the per-device resources held by @ios.
 *
 * Walks the first ios->numdevs entries of ios->per_dev, ending each
 * entry's OSD request (osd_end_request) and dropping its bio reference
 * (bio_put).
 * NOTE(review): any NULL guards around these calls and the final free of
 * @ios itself are not visible in this listing.
 */
94 void exofs_put_io_state(struct exofs_io_state
*ios
)
99 for (i
= 0; i
< ios
->numdevs
; i
++) {
100 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[i
];
103 osd_end_request(per_dev
->or);
105 bio_put(per_dev
->bio
);
/*
 * exofs_layout_od_id - map (@obj_no, @layout_index) to an index into the
 * layout's device table.
 *
 * The visible computation is
 *	(layout_index + obj_no * layout->mirrors_p1) % <divisor>
 * where obj_no is first narrowed to unsigned (dev_mod).
 * NOTE(review): the divisor line is missing from this listing; it is
 * presumably the number of devices in the layout - confirm. The
 * switch/case text around the computation looks like a disabled
 * alternative (LAYOUT_FUNC_IMPLICT lookup through layout->devs[]).
 */
112 unsigned exofs_layout_od_id(struct exofs_layout
*layout
,
113 osd_id obj_no
, unsigned layout_index
)
115 /* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
118 unsigned dev_mod
= obj_no
;
120 return (layout_index
+ dev_mod
* layout
->mirrors_p1
) %
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
/*
 * exofs_ios_od - return the osd_dev that serves @layout_index for the
 * object described by @ios.
 *
 * The index into ios->layout->s_ods[] comes from exofs_layout_od_id(),
 * called with the io_state's layout and object id.
 */
128 static inline struct osd_dev
*exofs_ios_od(struct exofs_io_state
*ios
,
129 unsigned layout_index
)
131 return ios
->layout
->s_ods
[
132 exofs_layout_od_id(ios
->layout
, ios
->obj
.id
, layout_index
)];
/*
 * _sync_done - "done" callback installed by exofs_io_execute() when the
 * caller supplied none.
 *
 * @p is the struct completion that exofs_io_execute() stores in
 * ios->private and later waits on.
 * NOTE(review): the statement that signals the completion is not visible
 * in this listing - confirm.
 */
135 static void _sync_done(struct exofs_io_state
*ios
, void *p
)
137 struct completion
*waiting
= p
;
/*
 * _last_io - kref release function for an io_state (passed to kref_put()
 * by _done_io() and exofs_io_execute()).
 *
 * Recovers the enclosing exofs_io_state from its embedded kref and
 * invokes the state's done callback with ios->private.
 */
142 static void _last_io(struct kref
*kref
)
144 struct exofs_io_state
*ios
= container_of(
145 kref
, struct exofs_io_state
, kref
);
147 ios
->done(ios
, ios
->private);
/*
 * _done_io - per-request callback handed to osd_execute_request_async().
 *
 * @p is the owning exofs_io_state. Each call drops one reference on
 * ios->kref; _last_io() is the release function.
 */
150 static void _done_io(struct osd_request
*or, void *p
)
152 struct exofs_io_state
*ios
= p
;
154 kref_put(&ios
->kref
, _last_io
);
/*
 * exofs_io_execute - finalize and launch every prepared per-device
 * request of @ios.
 *
 * "sync" is true when the caller set no ios->done callback; the visible
 * code then installs _sync_done with an on-stack completion as
 * ios->private, waits on that completion, and collects the result with
 * exofs_check_io().
 *
 * Requests are finalized with ios->cred in a first pass and started with
 * osd_execute_request_async() in a second pass. ios->kref is initialised
 * before the second pass, one reference is taken per started request, and
 * one kref_put() follows the loop.
 * NOTE(review): the conditionals (skipping empty slots, error exits, the
 * "if (sync)" guards) and the return statement are not visible in this
 * listing.
 */
157 static int exofs_io_execute(struct exofs_io_state
*ios
)
159 DECLARE_COMPLETION_ONSTACK(wait
);
160 bool sync
= (ios
->done
== NULL
);
164 ios
->done
= _sync_done
;
165 ios
->private = &wait
;
/* Pass 1: finalize each request with the io_state's credential */
168 for (i
= 0; i
< ios
->numdevs
; i
++) {
169 struct osd_request
*or = ios
->per_dev
[i
].or;
173 ret
= osd_finalize_request(or, 0, ios
->cred
, NULL
);
175 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
181 kref_init(&ios
->kref
);
/* Pass 2: take a reference per request and start it asynchronously */
183 for (i
= 0; i
< ios
->numdevs
; i
++) {
184 struct osd_request
*or = ios
->per_dev
[i
].or;
188 kref_get(&ios
->kref
);
189 osd_execute_request_async(or, _done_io
, ios
);
192 kref_put(&ios
->kref
, _last_io
);
196 wait_for_completion(&wait
);
197 ret
= exofs_check_io(ios
, NULL
);
/*
 * _clear_bio - zero the data area of every segment of @bio.
 *
 * A segment whose length equals PAGE_SIZE is cleared with
 * clear_highpage(); the zero_user() call covers bv_len bytes starting at
 * bv_offset (presumably the else-branch for partial pages - the "else"
 * line is not visible in this listing).
 */
202 static void _clear_bio(struct bio
*bio
)
207 __bio_for_each_segment(bv
, bio
, i
, 0) {
208 unsigned this_count
= bv
->bv_len
;
210 if (likely(PAGE_SIZE
== this_count
))
211 clear_highpage(bv
->bv_page
);
213 zero_user(bv
->bv_page
, bv
->bv_offset
, this_count
);
/*
 * exofs_check_io - fold the per-device request results of @ios into one
 * error code.
 *
 * @ios:   io_state whose per_dev[0..numdevs) requests are inspected
 * @resid: optional residual output; exofs_io_execute() passes NULL
 *
 * Each request's sense data is decoded with osd_req_decode_sense(). A
 * result with priority OSD_ERR_PRI_CLEAR_PAGES is treated as recovered:
 * the device's bio is zeroed with _clear_bio() and the loop continues.
 * Otherwise the result whose osd_err_pri is highest (ties go to the later
 * device, because the comparison is ">=") is remembered, and its decoded
 * return code is what the function returns.
 * NOTE(review): the handling of empty slots / successful requests and the
 * full @resid logic (only "*resid = ios->length" is visible) are missing
 * from this listing.
 */
217 int exofs_check_io(struct exofs_io_state
*ios
, u64
*resid
)
219 enum osd_err_priority acumulated_osd_err
= 0;
220 int acumulated_lin_err
= 0;
223 for (i
= 0; i
< ios
->numdevs
; i
++) {
224 struct osd_sense_info osi
;
225 struct osd_request
*or = ios
->per_dev
[i
].or;
231 ret
= osd_req_decode_sense(or, &osi
);
235 if (OSD_ERR_PRI_CLEAR_PAGES
== osi
.osd_err_pri
) {
236 /* the read started past the end of the file; zero the pages */
237 _clear_bio(ios
->per_dev
[i
].bio
);
238 EXOFS_DBGMSG("start read offset passed end of file "
239 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios
->per_dev
[i
].offset
),
241 _LLU(ios
->per_dev
[i
].length
));
243 continue; /* we recovered */
/* Keep the most severe error seen so far and its errno-style code */
246 if (osi
.osd_err_pri
>= acumulated_osd_err
) {
247 acumulated_osd_err
= osi
.osd_err_pri
;
248 acumulated_lin_err
= ret
;
252 /* TODO: raid specific residual calculations */
254 if (likely(!acumulated_lin_err
))
257 *resid
= ios
->length
;
260 return acumulated_lin_err
;
264 * L - logical offset into the file
266 * U - The number of bytes in a stripe within a group
268 * U = stripe_unit * group_width
270 * T - The number of bytes striped within a group of component objects
271 * (before advancing to the next group)
273 * T = stripe_unit * group_width * group_depth
275 * S - The number of bytes striped across all component objects
276 * before the pattern repeats
278 * S = stripe_unit * group_width * group_depth * group_count
280 * M - The "major" (i.e., across all components) stripe number
284 * G - Counts the groups from the beginning of the major stripe
286 * G = (L - (M * S)) / T [or (L % S) / T]
288 * H - The byte offset within the group
290 * H = (L - (M * S)) % T [or (L % S) % T]
292 * N - The "minor" (i.e., across the group) stripe number
296 * C - The component index corresponding to L
298 * C = (H - (N * U)) / stripe_unit + G * group_width
299 * [or (L % U) / stripe_unit + G * group_width]
301 * O - The component offset corresponding to L
303 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
/*
 * struct _striping_info - output of _calc_stripe_info().
 *
 * Only total_group_length is visible in this listing; _calc_stripe_info()
 * sets it to T, the number of bytes striped within one group. Other code
 * in this file also reads and writes the members dev, unit_off,
 * obj_offset, group_length and Major, whose declarations are not shown
 * here.
 */
305 struct _striping_info
{
308 u64 total_group_length
;
/*
 * _calc_stripe_info - translate a logical file offset into striping
 * coordinates, following the L/U/T/S/M/G/H/N/C/O definitions in the
 * comment that precedes struct _striping_info.
 *
 * @ios:         supplies the layout (stripe_unit, group_width,
 *               group_depth, group_count, mirrors_p1)
 * @file_offset: logical offset L
 * @si:          result
 *
 * Results written to @si by the visible code:
 *   dev                - component index C, multiplied by mirrors_p1
 *   unit_off           - L % stripe_unit
 *   obj_offset         - unit_off + N * stripe_unit +
 *                        M * group_depth * stripe_unit
 *   group_length       - T - H, the bytes left in the current group
 *   total_group_length - T
 *
 * 64-bit divisions go through div64_u64()/div_u64()/div_u64_rem().
 * NOTE(review): U and the product N * stripe_unit are computed in 32
 * bits; confirm that supported layouts keep them in range.
 */
314 static void _calc_stripe_info(struct exofs_io_state
*ios
, u64 file_offset
,
315 struct _striping_info
*si
)
317 u32 stripe_unit
= ios
->layout
->stripe_unit
;
318 u32 group_width
= ios
->layout
->group_width
;
319 u64 group_depth
= ios
->layout
->group_depth
;
321 u32 U
= stripe_unit
* group_width
;
322 u64 T
= U
* group_depth
;
323 u64 S
= T
* ios
->layout
->group_count
;
324 u64 M
= div64_u64(file_offset
, S
);
327 G = (L - (M * S)) / T
328 H = (L - (M * S)) % T
330 u64 LmodS
= file_offset
- M
* S
;
331 u32 G
= div64_u64(LmodS
, T
);
332 u64 H
= LmodS
- G
* T
;
334 u32 N
= div_u64(H
, U
);
336 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
337 si
->dev
= (u32
)(H
- (N
* U
)) / stripe_unit
+ G
* group_width
;
/* Each logical component occupies mirrors_p1 consecutive device slots */
338 si
->dev
*= ios
->layout
->mirrors_p1
;
340 div_u64_rem(file_offset
, stripe_unit
, &si
->unit_off
);
342 si
->obj_offset
= si
->unit_off
+ (N
* stripe_unit
) +
343 (M
* group_depth
* stripe_unit
);
345 si
->group_length
= T
- H
;
346 si
->total_group_length
= T
;
/*
 * _add_stripe_unit - append cur_len bytes of ios->pages[], starting at
 * page index *@cur_pg and in-page offset @pgbase, to @per_dev's bio.
 *
 * per_dev->length is increased by cur_len up front. If the device has no
 * bio yet, one is allocated with bio_kmalloc() sized
 *	(nr_pages + pages_in_stripe) / group_width
 * vectors, where pages_in_stripe = group_width * (stripe_unit /
 * PAGE_SIZE). Pages are then added one at a time with bio_add_pc_page()
 * against the request queue of the device's osd_dev; a short add
 * (pglen != added_len) is detected, and running past ios->nr_pages is a
 * BUG.
 * NOTE(review): the cur_len parameter declaration, the error returns, the
 * loop bookkeeping and the write-back of *@cur_pg are not visible in this
 * listing.
 */
350 static int _add_stripe_unit(struct exofs_io_state
*ios
, unsigned *cur_pg
,
351 unsigned pgbase
, struct exofs_per_dev_state
*per_dev
,
354 unsigned pg
= *cur_pg
;
355 struct request_queue
*q
=
356 osd_request_queue(exofs_ios_od(ios
, per_dev
->dev
));
358 per_dev
->length
+= cur_len
;
/* Lazily allocate the bio on the first stripe unit for this device */
360 if (per_dev
->bio
== NULL
) {
361 unsigned pages_in_stripe
= ios
->layout
->group_width
*
362 (ios
->layout
->stripe_unit
/ PAGE_SIZE
);
363 unsigned bio_size
= (ios
->nr_pages
+ pages_in_stripe
) /
364 ios
->layout
->group_width
;
366 per_dev
->bio
= bio_kmalloc(GFP_KERNEL
, bio_size
);
367 if (unlikely(!per_dev
->bio
)) {
368 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
374 while (cur_len
> 0) {
/* Never cross a page boundary in a single add */
375 unsigned pglen
= min_t(unsigned, PAGE_SIZE
- pgbase
, cur_len
);
378 BUG_ON(ios
->nr_pages
<= pg
);
381 added_len
= bio_add_pc_page(q
, per_dev
->bio
, ios
->pages
[pg
],
383 if (unlikely(pglen
!= added_len
))
/*
 * _prepare_one_group - distribute @length bytes of the IO over the
 * devices of one group, one stripe unit at a time.
 *
 * @ios:        io_state being prepared; pages_consumed and numdevs are
 *              updated at the end
 * @length:     bytes to place in this group
 * @si:         striping coordinates of the first byte
 * @first_comp: index in ios->per_dev[] of the group's first component
 *
 * "dev" walks device numbers and "comp" walks per_dev[] slots; both wrap
 * modulo devs_in_group (group_width * mirrors_p1) relative to the group's
 * first device / first component. The first time a slot is used
 * (per_dev->length == 0) its starting object offset and first chunk
 * length depend on whether the device lies before, at, or after si->dev;
 * the three assignments of per_dev->offset below are those cases. Later
 * visits add a full stripe_unit, and the chunk that reaches @length is
 * detected by "cur_len >= length".
 *
 * On exit ios->numdevs is max_comp + mirrors_p1 and ios->pages_consumed
 * is the page cursor advanced by _add_stripe_unit().
 * NOTE(review): the loop header, the "dev < si->dev" test, the max_comp
 * update, the increments of dev/comp and the error path are missing from
 * this listing; the description of those parts is inferred from the
 * fragments shown.
 */
394 static int _prepare_one_group(struct exofs_io_state
*ios
, u64 length
,
395 struct _striping_info
*si
, unsigned first_comp
)
397 unsigned stripe_unit
= ios
->layout
->stripe_unit
;
398 unsigned mirrors_p1
= ios
->layout
->mirrors_p1
;
399 unsigned devs_in_group
= ios
->layout
->group_width
* mirrors_p1
;
400 unsigned dev
= si
->dev
;
401 unsigned first_dev
= dev
- (dev
% devs_in_group
);
402 unsigned comp
= first_comp
+ (dev
- first_dev
);
403 unsigned max_comp
= ios
->numdevs
? ios
->numdevs
- mirrors_p1
: 0;
404 unsigned cur_pg
= ios
->pages_consumed
;
408 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[comp
];
409 unsigned cur_len
, page_off
= 0;
/* First use of this slot: work out where on the object the IO starts */
411 if (!per_dev
->length
) {
414 per_dev
->offset
= si
->obj_offset
+ stripe_unit
-
416 cur_len
= stripe_unit
;
417 } else if (dev
== si
->dev
) {
418 per_dev
->offset
= si
->obj_offset
;
419 cur_len
= stripe_unit
- si
->unit_off
;
420 page_off
= si
->unit_off
& ~PAGE_MASK
;
421 BUG_ON(page_off
&& (page_off
!= ios
->pgbase
));
422 } else { /* dev > si->dev */
423 per_dev
->offset
= si
->obj_offset
- si
->unit_off
;
424 cur_len
= stripe_unit
;
/* Wrap the device number inside the group */
431 dev
= (dev
% devs_in_group
) + first_dev
;
433 cur_len
= stripe_unit
;
435 if (cur_len
>= length
)
438 ret
= _add_stripe_unit(ios
, &cur_pg
, page_off
, per_dev
,
/* Wrap the per_dev[] slot inside the group */
444 comp
= (comp
% devs_in_group
) + first_comp
;
449 ios
->numdevs
= max_comp
+ mirrors_p1
;
450 ios
->pages_consumed
= cur_pg
;
/*
 * _prepare_for_striping - split the IO described by @ios (offset, length)
 * into per-device pieces.
 *
 * Striping coordinates for ios->offset come from _calc_stripe_info().
 *
 * Kernel-buffer IO (ios->kern_buff set): only per_dev[0] is filled in
 * (offset and dev), numdevs becomes mirrors_p1, and it is a BUG for the
 * buffer to cross a stripe unit when group_width > 1.
 *
 * Page-array IO: the length is consumed group by group through
 * _prepare_one_group(). After each group the visible code resets
 * si.group_length to the full group size, recomputes si.obj_offset from
 * si.Major, and advances si.dev and first_comp by devs_in_group, both
 * wrapped modulo layout->s_numdevs.
 * NOTE(review): the loop construct, the early return of the kern_buff
 * branch, the error checks and any update of si.Major are not visible in
 * this listing.
 */
454 static int _prepare_for_striping(struct exofs_io_state
*ios
)
456 u64 length
= ios
->length
;
457 struct _striping_info si
;
458 unsigned devs_in_group
= ios
->layout
->group_width
*
459 ios
->layout
->mirrors_p1
;
460 unsigned first_comp
= 0;
463 _calc_stripe_info(ios
, ios
->offset
, &si
);
466 if (ios
->kern_buff
) {
467 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[0];
469 per_dev
->offset
= si
.obj_offset
;
470 per_dev
->dev
= si
.dev
;
472 /* no cross device without page array */
473 BUG_ON((ios
->layout
->group_width
> 1) &&
474 (si
.unit_off
+ ios
->length
>
475 ios
->layout
->stripe_unit
));
477 ios
->numdevs
= ios
->layout
->mirrors_p1
;
/* Clamp this round to what is left of the IO */
482 if (length
< si
.group_length
)
483 si
.group_length
= length
;
485 ret
= _prepare_one_group(ios
, si
.group_length
, &si
, first_comp
);
489 length
-= si
.group_length
;
491 si
.group_length
= si
.total_group_length
;
494 si
.obj_offset
= si
.Major
* ios
->layout
->stripe_unit
*
495 ios
->layout
->group_depth
;
/* Step to the first device of the next group, wrapping over all devices */
497 si
.dev
= (si
.dev
- (si
.dev
% devs_in_group
)) + devs_in_group
;
498 si
.dev
%= ios
->layout
->s_numdevs
;
500 first_comp
+= devs_in_group
;
501 first_comp
%= ios
->layout
->s_numdevs
;
/*
 * exofs_sbi_create - issue a create-object request for ios->obj on every
 * device of the layout.
 *
 * One OSD request per device (0 .. layout->s_numdevs - 1) is started,
 * stored in ios->per_dev[i].or and set up with osd_req_create_object();
 * the batch is then run through exofs_io_execute().
 * NOTE(review): the failure path after osd_start_request(), the update of
 * ios->numdevs and the return statement are not visible in this listing.
 */
508 int exofs_sbi_create(struct exofs_io_state
*ios
)
512 for (i
= 0; i
< ios
->layout
->s_numdevs
; i
++) {
513 struct osd_request
*or;
515 or = osd_start_request(exofs_ios_od(ios
, i
), GFP_KERNEL
);
517 EXOFS_ERR("%s: osd_start_request failed\n", __func__
);
521 ios
->per_dev
[i
].or = or;
524 osd_req_create_object(or, &ios
->obj
);
526 ret
= exofs_io_execute(ios
);
/*
 * exofs_sbi_remove - issue a remove-object request for ios->obj on every
 * device of the layout.
 *
 * Mirrors exofs_sbi_create(): one OSD request per device
 * (0 .. layout->s_numdevs - 1) is started, stored in ios->per_dev[i].or
 * and set up with osd_req_remove_object(); the batch is then run through
 * exofs_io_execute().
 * NOTE(review): the failure path after osd_start_request(), the update of
 * ios->numdevs and the return statement are not visible in this listing.
 */
532 int exofs_sbi_remove(struct exofs_io_state
*ios
)
536 for (i
= 0; i
< ios
->layout
->s_numdevs
; i
++) {
537 struct osd_request
*or;
539 or = osd_start_request(exofs_ios_od(ios
, i
), GFP_KERNEL
);
541 EXOFS_ERR("%s: osd_start_request failed\n", __func__
);
545 ios
->per_dev
[i
].or = or;
548 osd_req_remove_object(or, &ios
->obj
);
550 ret
= exofs_io_execute(ios
);
/*
 * _sbi_write_mirror - build the write requests for one component and all
 * of its mirrors.
 *
 * @ios:      io_state being prepared
 * @cur_comp: per_dev[] index of the master copy; slots
 *            cur_comp .. cur_comp + mirrors_p1 - 1 are filled
 *
 * Returns 0 immediately when a page array is in use but the master slot
 * carries no data ("empty slot").
 *
 * For every mirror an OSD request is started on the next device (dev is
 * incremented together with cur_comp) and the master's offset is copied
 * into the slot. The visible branches are:
 *   - bio data: the master uses its own bio; every other mirror gets a
 *     freshly allocated bio cloned from the master's (__bio_clone) plus
 *     the master's length. BIO_RW is set in bi_rw before osd_req_write().
 *   - kernel buffer: osd_req_write_kern() with ios->kern_buff and
 *     ios->length.
 *   - otherwise: osd_req_set_attributes() only.
 * Set/get attribute lists from ios->out_attr / ios->in_attr are attached
 * afterwards.
 * NOTE(review): the branch conditions for the bio case and the attribute
 * lists, the error exits and the storing of "or"/"bio" in per_dev are not
 * visible in this listing.
 */
556 static int _sbi_write_mirror(struct exofs_io_state
*ios
, int cur_comp
)
558 struct exofs_per_dev_state
*master_dev
= &ios
->per_dev
[cur_comp
];
559 unsigned dev
= ios
->per_dev
[cur_comp
].dev
;
560 unsigned last_comp
= cur_comp
+ ios
->layout
->mirrors_p1
;
563 if (ios
->pages
&& !master_dev
->length
)
564 return 0; /* Just an empty slot */
566 for (; cur_comp
< last_comp
; ++cur_comp
, ++dev
) {
567 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[cur_comp
];
568 struct osd_request
*or;
570 or = osd_start_request(exofs_ios_od(ios
, dev
), GFP_KERNEL
);
572 EXOFS_ERR("%s: osd_start_request failed\n", __func__
);
577 per_dev
->offset
= master_dev
->offset
;
/* Mirrors need their own bio: clone the master's */
582 if (per_dev
!= master_dev
) {
583 bio
= bio_kmalloc(GFP_KERNEL
,
584 master_dev
->bio
->bi_max_vecs
);
585 if (unlikely(!bio
)) {
587 "Faild to allocate BIO size=%u\n",
588 master_dev
->bio
->bi_max_vecs
);
593 __bio_clone(bio
, master_dev
->bio
);
596 per_dev
->length
= master_dev
->length
;
600 bio
= master_dev
->bio
;
601 /* FIXME: bio_set_dir() */
602 bio
->bi_rw
|= (1 << BIO_RW
);
605 osd_req_write(or, &ios
->obj
, per_dev
->offset
, bio
,
607 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
608 "length=0x%llx dev=%d\n",
609 _LLU(ios
->obj
.id
), _LLU(per_dev
->offset
),
610 _LLU(per_dev
->length
), dev
);
611 } else if (ios
->kern_buff
) {
612 ret
= osd_req_write_kern(or, &ios
->obj
, per_dev
->offset
,
613 ios
->kern_buff
, ios
->length
);
616 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
617 "length=0x%llx dev=%d\n",
618 _LLU(ios
->obj
.id
), _LLU(per_dev
->offset
),
619 _LLU(ios
->length
), dev
);
621 osd_req_set_attributes(or, &ios
->obj
);
622 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
623 _LLU(ios
->obj
.id
), ios
->out_attr_len
, dev
);
627 osd_req_add_set_attr_list(or, ios
->out_attr
,
631 osd_req_add_get_attr_list(or, ios
->in_attr
,
/*
 * exofs_sbi_write - write the data and/or attributes described by @ios.
 *
 * Steps: _prepare_for_striping() lays the IO out over the devices,
 * _sbi_write_mirror() is called for every mirrors_p1-th slot below
 * ios->numdevs (i.e. once per component, covering its mirrors), and
 * exofs_io_execute() runs the resulting requests.
 * NOTE(review): the error checks between the steps and the return
 * statement are not visible in this listing.
 */
639 int exofs_sbi_write(struct exofs_io_state
*ios
)
644 ret
= _prepare_for_striping(ios
);
648 for (i
= 0; i
< ios
->numdevs
; i
+= ios
->layout
->mirrors_p1
) {
649 ret
= _sbi_write_mirror(ios
, i
);
654 ret
= exofs_io_execute(ios
);
/*
 * _sbi_read_mirror - build the read request for one component, choosing a
 * single mirror to read from.
 *
 * @ios:      io_state being prepared
 * @cur_comp: per_dev[] index of the component
 *
 * Returns 0 immediately when a page array is in use but the slot carries
 * no data ("empty slot").
 *
 * The device read from is per_dev->dev + (object id % mirrors_p1), so
 * different objects use different mirrors of the same component. The
 * visible branches are:
 *   - bio data: osd_req_read() with the slot's bio, offset and length;
 *   - kernel buffer: osd_req_read_kern() with ios->kern_buff and
 *     ios->length (its return value is only used in the debug message);
 *   - otherwise: osd_req_get_attributes() only.
 * Set/get attribute lists from ios->out_attr / ios->in_attr are attached
 * afterwards.
 * NOTE(review): the branch conditions for the bio case and the attribute
 * lists, the failure path after osd_start_request(), the store of "or"
 * into per_dev and the return statement are not visible in this listing.
 */
658 static int _sbi_read_mirror(struct exofs_io_state
*ios
, unsigned cur_comp
)
660 struct osd_request
*or;
661 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[cur_comp
];
662 unsigned first_dev
= (unsigned)ios
->obj
.id
;
664 if (ios
->pages
&& !per_dev
->length
)
665 return 0; /* Just an empty slot */
/* Pick the mirror from the object id */
667 first_dev
= per_dev
->dev
+ first_dev
% ios
->layout
->mirrors_p1
;
668 or = osd_start_request(exofs_ios_od(ios
, first_dev
), GFP_KERNEL
);
670 EXOFS_ERR("%s: osd_start_request failed\n", __func__
);
676 osd_req_read(or, &ios
->obj
, per_dev
->offset
,
677 per_dev
->bio
, per_dev
->length
);
678 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
679 " dev=%d\n", _LLU(ios
->obj
.id
),
680 _LLU(per_dev
->offset
), _LLU(per_dev
->length
),
682 } else if (ios
->kern_buff
) {
683 int ret
= osd_req_read_kern(or, &ios
->obj
, per_dev
->offset
,
684 ios
->kern_buff
, ios
->length
);
685 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
686 "length=0x%llx dev=%d ret=>%d\n",
687 _LLU(ios
->obj
.id
), _LLU(per_dev
->offset
),
688 _LLU(ios
->length
), first_dev
, ret
);
692 osd_req_get_attributes(or, &ios
->obj
);
693 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
694 _LLU(ios
->obj
.id
), ios
->in_attr_len
, first_dev
);
697 osd_req_add_set_attr_list(or, ios
->out_attr
, ios
->out_attr_len
);
700 osd_req_add_get_attr_list(or, ios
->in_attr
, ios
->in_attr_len
);
/*
 * exofs_sbi_read - read the data and/or attributes described by @ios.
 *
 * Steps: _prepare_for_striping() lays the IO out over the devices,
 * _sbi_read_mirror() is called for every mirrors_p1-th slot below
 * ios->numdevs (once per component), and exofs_io_execute() runs the
 * resulting requests.
 * NOTE(review): the error checks between the steps and the return
 * statement are not visible in this listing.
 */
705 int exofs_sbi_read(struct exofs_io_state
*ios
)
710 ret
= _prepare_for_striping(ios
);
714 for (i
= 0; i
< ios
->numdevs
; i
+= ios
->layout
->mirrors_p1
) {
715 ret
= _sbi_read_mirror(ios
, i
);
720 ret
= exofs_io_execute(ios
);
/*
 * extract_attr_from_ios - look up one attribute among those returned by
 * the request in ios->per_dev[0].
 *
 * @ios:  io_state whose first request is decoded
 * @attr: in: attr_page and attr_id to look for;
 *        out: len and val_ptr of the matching attribute
 *
 * Attributes are decoded with osd_req_decode_get_attr_list(); on a
 * page/id match the length and value pointer are copied into @attr.
 * NOTE(review): the iteration construct, the declarations of nelem/iter
 * and the return values for "found" / "not found" are not visible in this
 * listing.
 */
724 int extract_attr_from_ios(struct exofs_io_state
*ios
, struct osd_attr
*attr
)
726 struct osd_attr cur_attr
= {.attr_page
= 0}; /* start with zeros */
732 osd_req_decode_get_attr_list(ios
->per_dev
[0].or,
733 &cur_attr
, &nelem
, &iter
);
734 if ((cur_attr
.attr_page
== attr
->attr_page
) &&
735 (cur_attr
.attr_id
== attr
->attr_id
)) {
736 attr
->len
= cur_attr
.len
;
737 attr
->val_ptr
= cur_attr
.val_ptr
;
/*
 * _truncate_mirrors - build a set-attribute request carrying @attr for
 * one component and each of its mirrors.
 *
 * @ios:      io_state being prepared
 * @cur_comp: first per_dev[] slot; slots up to cur_comp + mirrors_p1 - 1
 *            are processed
 * @attr:     the single attribute added to each request
 *
 * For each slot an OSD request is started on the device selected by
 * exofs_ios_od(ios, cur_comp), then osd_req_set_attributes() and
 * osd_req_add_set_attr_list(or, attr, 1) are applied.
 * NOTE(review): last_comp is an int while cur_comp is unsigned, so the
 * loop test is a mixed-sign comparison. The failure path, the store of
 * "or" into per_dev and the return statement are not visible in this
 * listing.
 */
745 static int _truncate_mirrors(struct exofs_io_state
*ios
, unsigned cur_comp
,
746 struct osd_attr
*attr
)
748 int last_comp
= cur_comp
+ ios
->layout
->mirrors_p1
;
750 for (; cur_comp
< last_comp
; ++cur_comp
) {
751 struct exofs_per_dev_state
*per_dev
= &ios
->per_dev
[cur_comp
];
752 struct osd_request
*or;
754 or = osd_start_request(exofs_ios_od(ios
, cur_comp
), GFP_KERNEL
);
756 EXOFS_ERR("%s: osd_start_request failed\n", __func__
);
761 osd_req_set_attributes(or, &ios
->obj
);
762 osd_req_add_set_attr_list(or, attr
, 1);
/*
 * exofs_oi_truncate - set the logical length of every component object of
 * inode @oi so that the file ends at @size.
 *
 * @oi:   exofs inode info; supplies the object number and credential
 * @size: new logical file size
 *
 * An io_state is obtained for the superblock's layout and an array of
 * group_width exofs_trunc_attr entries is allocated with kcalloc().
 * _calc_stripe_info() translates @size into striping coordinates, and for
 * each component i in the group the new object size is
 *	si.obj_offset + stripe_unit - si.unit_off   (first visible case)
 *	si.obj_offset                               (i == si.dev)
 *	si.obj_offset - si.unit_off                 (i > si.dev)
 * The value is stored big-endian (cpu_to_be64) next to a copy of
 * g_attr_logical_length whose val_ptr points at it, and handed to
 * _truncate_mirrors() for slot i * mirrors_p1. The requests are run with
 * exofs_io_execute() and the io_state is released with
 * exofs_put_io_state().
 * NOTE(review): i ranges over 0 .. group_width - 1 but is compared with
 * si.dev, which _calc_stripe_info() has already scaled by mirrors_p1 and
 * offset by the group - confirm this is intended for mirrored or grouped
 * layouts. The condition of the first case, the error exits, the freeing
 * of size_attrs and the return statement are not visible in this listing.
 */
768 int exofs_oi_truncate(struct exofs_i_info
*oi
, u64 size
)
770 struct exofs_sb_info
*sbi
= oi
->vfs_inode
.i_sb
->s_fs_info
;
771 struct exofs_io_state
*ios
;
772 struct exofs_trunc_attr
{
773 struct osd_attr attr
;
776 struct _striping_info si
;
779 ret
= exofs_get_io_state(&sbi
->layout
, &ios
);
783 size_attrs
= kcalloc(ios
->layout
->group_width
, sizeof(*size_attrs
),
785 if (unlikely(!size_attrs
)) {
790 ios
->obj
.id
= exofs_oi_objno(oi
);
791 ios
->cred
= oi
->i_cred
;
793 ios
->numdevs
= ios
->layout
->s_numdevs
;
794 _calc_stripe_info(ios
, size
, &si
);
796 for (i
= 0; i
< ios
->layout
->group_width
; ++i
) {
797 struct exofs_trunc_attr
*size_attr
= &size_attrs
[i
];
801 obj_size
= si
.obj_offset
+
802 ios
->layout
->stripe_unit
- si
.unit_off
;
803 else if (i
== si
.dev
)
804 obj_size
= si
.obj_offset
;
805 else /* i > si.dev */
806 obj_size
= si
.obj_offset
- si
.unit_off
;
/* The attribute value is sent big-endian */
808 size_attr
->newsize
= cpu_to_be64(obj_size
);
809 size_attr
->attr
= g_attr_logical_length
;
810 size_attr
->attr
.val_ptr
= &size_attr
->newsize
;
812 ret
= _truncate_mirrors(ios
, i
* ios
->layout
->mirrors_p1
,
817 ret
= exofs_io_execute(ios
);
821 exofs_put_io_state(ios
);