/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"

enum al_transaction_types {
	AL_TR_UPDATE = 0,
	AL_TR_INITIALIZED = 0xffff
};

/* all fields on disc in big endian */
struct __packed al_transaction_on_disk {
	/* don't we all like magic */
	__be32	magic;

	/* to identify the most recent transaction block
	 * in the on disk ring buffer */
	__be32	tr_number;

	/* checksum on the full 4k block, with this field set to 0. */
	__be32	crc32c;

	/* type of transaction, special transaction types like:
	 * purge-all, set-all-idle, set-all-active, ... to-be-defined
	 * see also enum al_transaction_types */
	__be16	transaction_type;

	/* we currently allow only a few thousand extents,
	 * so 16bit will be enough for the slot number. */

	/* how many updates in this transaction */
	__be16	n_updates;

	/* maximum slot number, "al-extents" in drbd.conf speak.
	 * Having this in each transaction should make reconfiguration
	 * of that parameter easier. */
	__be16	context_size;

	/* slot number the context starts with */
	__be16	context_start_slot_nr;

	/* Some reserved bytes.  Expected usage is a 64bit counter of
	 * sectors-written since device creation, and other data generation tag
	 * supporting usage */
	__be32	__reserved[4];

	/* --- 36 bytes used --- */

	/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
	 * in one transaction, then use the remaining bytes in the 4k block for
	 * context information.  "Flexible" number of updates per transaction
	 * does not help, as we have to account for the case when all update
	 * slots are used anyways, so it would only complicate code without
	 * additional benefit.
	 */
	__be16	update_slot_nr[AL_UPDATES_PER_TRANSACTION];

	/* but the extent number is 32bit, which at an extent size of 4 MiB
	 * allows covering device sizes of up to 2**54 bytes (16 PiB) */
	__be32	update_extent_nr[AL_UPDATES_PER_TRANSACTION];

	/* --- 420 bytes used (36 + 64*6) --- */

	/* 4096 - 420 = 3676 = 919 * 4 */
	__be32	context[AL_CONTEXT_PER_TRANSACTION];
};

struct update_odbm_work {
	struct drbd_work w;
	unsigned int enr;
};

struct update_al_work {
	struct drbd_work w;
	struct completion event;
	int err;
};

static int al_write_transaction(struct drbd_conf *mdev);

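/**
 * drbd_md_get_buffer() - Claim exclusive use of the meta data IO buffer
 * @mdev:	DRBD device.
 *
 * Sleeps until exclusive access to the md_io page is granted, or the disk
 * state drops to D_FAILED or below. Returns the buffer address on success,
 * NULL if the disk failed while waiting.
 */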
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
	int r;

	wait_event(mdev->misc_wait,
		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
		   mdev->state.disk <= D_FAILED);

	return r ? NULL : page_address(mdev->md_io_page);
}

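/**
 * drbd_md_put_buffer() - Release the buffer claimed by drbd_md_get_buffer()
 * @mdev:	DRBD device.
 *
 * Drops the reference; the last put wakes up waiters on misc_wait.
 */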
void drbd_md_put_buffer(struct drbd_conf *mdev)
{
	if (atomic_dec_and_test(&mdev->md_io_in_use))
		wake_up(&mdev->misc_wait);
}

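/* Wait for the completion flag of a meta data IO, bounded by the configured
 * disk-timeout (given in units of 100ms; 0 means no timeout), or until a
 * forced detach is requested. A timeout is treated as a DRBD_FORCE_DETACH
 * style IO error. */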
void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
				       unsigned int *done)
{
	long dt;

	rcu_read_lock();
	dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
	rcu_read_unlock();
	dt = dt * HZ / 10;
	if (dt == 0)
		dt = MAX_SCHEDULE_TIMEOUT;

	dt = wait_event_timeout(mdev->misc_wait,
			*done || test_bit(FORCE_DETACH, &mdev->flags), dt);
	if (dt == 0) {
		dev_err(DEV, "meta-data IO operation timed out\n");
		drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
	}
}

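/* Synchronously submit one meta data IO of @size bytes at @sector on the
 * meta data block device. Writes are tagged with REQ_FUA|REQ_FLUSH unless
 * MD_NO_FUA is set. Returns 0 on success, a negative error code otherwise. */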
static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
				 struct drbd_backing_dev *bdev,
				 struct page *page, sector_t sector,
				 int rw, int size)
{
	struct bio *bio;
	int err;

	mdev->md_io.done = 0;
	mdev->md_io.error = -ENODEV;

	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
		rw |= REQ_FUA | REQ_FLUSH;
	rw |= REQ_SYNC;

	bio = bio_alloc_drbd(GFP_NOIO);
	bio->bi_bdev = bdev->md_bdev;
	bio->bi_sector = sector;
	err = -EIO;
	if (bio_add_page(bio, page, size, 0) != size)
		goto out;
	bio->bi_private = &mdev->md_io;
	bio->bi_end_io = drbd_md_io_complete;
	bio->bi_rw = rw;

	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
		err = -ENODEV;
		goto out;
	}

	bio_get(bio); /* one bio_put() is in the completion handler */
	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
		bio_endio(bio, -EIO);
	else
		submit_bio(rw, bio);
	wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
	if (bio_flagged(bio, BIO_UPTODATE))
		err = mdev->md_io.error;

 out:
	bio_put(bio);
	return err;
}

int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
			 sector_t sector, int rw)
{
	int err;
	struct page *iop = mdev->md_io_page;

	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);

	BUG_ON(!bdev->md_bdev);

	dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
	     current->comm, current->pid, __func__,
	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	if (sector < drbd_md_first_sector(bdev) ||
	    sector + 7 > drbd_md_last_sector(bdev))
		dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
		     current->comm, current->pid, __func__,
		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
	if (err) {
		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
	}
	return err;
}

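/* Try to get a reference on activity log extent @enr. Returns NULL without
 * sleeping if the matching resync extent is locked against writes; in that
 * case BME_PRIORITY is set so the resync code knows application IO is
 * waiting on it. */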
static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *al_ext;
	struct lc_element *tmp;
	int wake;

	spin_lock_irq(&mdev->al_lock);
	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
	if (unlikely(tmp != NULL)) {
		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
			wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
			spin_unlock_irq(&mdev->al_lock);
			if (wake)
				wake_up(&mdev->al_wait);
			return NULL;
		}
	}
	al_ext = lc_get(mdev->act_log, enr);
	spin_unlock_irq(&mdev->al_lock);
	return al_ext;
}

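/**
 * drbd_al_begin_io() - Mark the extents covered by @i as active
 * @mdev:	DRBD device.
 * @i:		sector and size of the upcoming write.
 *
 * Waits until all affected activity log extents are hot and, if that
 * changed the set of active extents, commits a transaction to the on-disk
 * activity log. Illustrative caller pattern (a sketch, not verbatim from
 * the request code):
 *
 *	drbd_al_begin_io(mdev, &req->i);
 *	... submit the application write ...
 *	drbd_al_complete_io(mdev, &req->i);
 */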
void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	bool locked = false;

	D_ASSERT(first <= last);
	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

	for (enr = first; enr <= last; enr++)
		wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);

	/* Serialize multiple transactions.
	 * This uses test_and_set_bit, memory barrier is implicit.
	 */
	wait_event(mdev->al_wait,
			mdev->act_log->pending_changes == 0 ||
			(locked = lc_try_lock_for_transaction(mdev->act_log)));

	if (locked) {
		/* drbd_al_write_transaction(mdev,al_ext,enr);
		 * recurses into generic_make_request(), which
		 * disallows recursion, bios being serialized on the
		 * current->bio_tail list now.
		 * we have to delegate updates to the activity log
		 * to the worker thread. */

		/* Double check: it may have been committed by someone else,
		 * while we have been waiting for the lock. */
		if (mdev->act_log->pending_changes) {
			bool write_al_updates;

			rcu_read_lock();
			write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
			rcu_read_unlock();

			if (write_al_updates) {
				al_write_transaction(mdev);
				mdev->al_writ_cnt++;
			}

			spin_lock_irq(&mdev->al_lock);
			/* FIXME
			if (err)
				we need an "lc_cancel" here;
			*/
			lc_committed(mdev->act_log);
			spin_unlock_irq(&mdev->al_lock);
		}
		lc_unlock(mdev->act_log);
		wake_up(&mdev->al_wait);
	}
}

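/**
 * drbd_al_complete_io() - Drop the references taken by drbd_al_begin_io()
 * @mdev:	DRBD device.
 * @i:		sector and size of the completed write.
 */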
void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	struct lc_element *extent;
	unsigned long flags;

	D_ASSERT(first <= last);
	spin_lock_irqsave(&mdev->al_lock, flags);

	for (enr = first; enr <= last; enr++) {
		extent = lc_find(mdev->act_log, enr);
		if (!extent) {
			dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
			continue;
		}
		lc_put(mdev->act_log, extent);
	}
	spin_unlock_irqrestore(&mdev->al_lock, flags);
	wake_up(&mdev->al_wait);
}

#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
 * are still coupled, or assume too much about their relation.
 * Code below will not work if this is violated.
 * Will be cleaned up with some followup patch.
 */
# error FIXME
#endif

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
	return al_enr >>
		/* bit to page */
		((PAGE_SHIFT + 3) -
		/* al extent number to bit */
		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}

static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
{
	return rs_enr >>
		/* bit to page */
		((PAGE_SHIFT + 3) -
		/* resync extent number to bit */
		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}

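/* Assemble the pending activity log changes, plus a rotating window of the
 * unchanged context, into one 4k transaction block, checksum it, and write
 * it to the next slot of the on-disk AL ring buffer. */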
static int
_al_write_transaction(struct drbd_conf *mdev)
{
	struct al_transaction_on_disk *buffer;
	struct lc_element *e;
	sector_t sector;
	int i, mx;
	unsigned extent_nr;
	unsigned crc = 0;
	int err = 0;

	if (!get_ldev(mdev)) {
		dev_err(DEV, "disk is %s, cannot start al transaction\n",
			drbd_disk_str(mdev->state.disk));
		return -EIO;
	}

	/* The bitmap write may have failed, causing a state change. */
	if (mdev->state.disk < D_INCONSISTENT) {
		dev_err(DEV,
			"disk is %s, cannot write al transaction\n",
			drbd_disk_str(mdev->state.disk));
		put_ldev(mdev);
		return -EIO;
	}

	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
	if (!buffer) {
		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
		put_ldev(mdev);
		return -ENODEV;
	}

	memset(buffer, 0, sizeof(*buffer));
	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);

	i = 0;

	/* Even though no one can start to change this list
	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
	 * lc_try_lock_for_transaction() --, someone may still
	 * be in the process of changing it. */
	spin_lock_irq(&mdev->al_lock);
	list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
		if (i == AL_UPDATES_PER_TRANSACTION) {
			i++;
			break;
		}
		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
		if (e->lc_number != LC_FREE)
			drbd_bm_mark_for_writeout(mdev,
					al_extent_to_bm_page(e->lc_number));
		i++;
	}
	spin_unlock_irq(&mdev->al_lock);
	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);

	buffer->n_updates = cpu_to_be16(i);
	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
		buffer->update_slot_nr[i] = cpu_to_be16(-1);
		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
	}

	buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
	buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);

	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
		   mdev->act_log->nr_elements - mdev->al_tr_cycle);
	for (i = 0; i < mx; i++) {
		unsigned idx = mdev->al_tr_cycle + i;
		extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
		buffer->context[i] = cpu_to_be32(extent_nr);
	}
	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
		buffer->context[i] = cpu_to_be32(LC_FREE);

	mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
		mdev->al_tr_cycle = 0;

	sector = mdev->ldev->md.md_offset
	       + mdev->ldev->md.al_offset
	       + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);

	crc = crc32c(0, buffer, 4096);
	buffer->crc32c = cpu_to_be32(crc);

	if (drbd_bm_write_hinted(mdev))
		err = -EIO;
		/* drbd_chk_io_error done already */
	else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
		err = -EIO;
		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
	} else {
		/* advance ringbuffer position and transaction counter */
		mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
		mdev->al_tr_number++;
	}

	drbd_md_put_buffer(mdev);
	put_ldev(mdev);

	return err;
}

static int w_al_write_transaction(struct drbd_work *w, int unused)
{
	struct update_al_work *aw = container_of(w, struct update_al_work, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	err = _al_write_transaction(mdev);
	aw->err = err;
	complete(&aw->event);

	return err != -EIO ? err : 0;
}

/* Calls from worker context (see w_restart_disk_io()) need to write the
   transaction directly. Others came through generic_make_request(),
   those need to delegate it to the worker. */
static int al_write_transaction(struct drbd_conf *mdev)
{
	struct update_al_work al_work;

	if (current == mdev->tconn->worker.task)
		return _al_write_transaction(mdev);

	init_completion(&al_work.event);
	al_work.w.cb = w_al_write_transaction;
	al_work.w.mdev = mdev;
	drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
	wait_for_completion(&al_work.event);

	return al_work.err;
}

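/* Try to remove @al_ext from the activity log. Returns nonzero if the
 * element was unreferenced and could be deleted, zero otherwise. */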
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
{
	int rv;

	spin_lock_irq(&mdev->al_lock);
	rv = (al_ext->refcnt == 0);
	if (likely(rv))
		lc_del(mdev->act_log, al_ext);
	spin_unlock_irq(&mdev->al_lock);

	return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @mdev:	DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry has dropped to 0 first, of course.
 *
 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_conf *mdev)
{
	struct lc_element *al_ext;
	int i;

	D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));

	for (i = 0; i < mdev->act_log->nr_elements; i++) {
		al_ext = lc_element_by_index(mdev->act_log, i);
		if (al_ext->lc_number == LC_FREE)
			continue;
		wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
	}

	wake_up(&mdev->al_wait);
}

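/* Worker callback: write out the on-disk bitmap page belonging to the
 * resync extent recorded in the update_odbm_work, and check whether that
 * completed the resync. */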
static int w_update_odbm(struct drbd_work *w, int unused)
{
	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
	struct drbd_conf *mdev = w->mdev;
	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };

	if (!get_ldev(mdev)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
		kfree(udw);
		return 0;
	}

	drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
	put_ldev(mdev);

	kfree(udw);

	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
		switch (mdev->state.conn) {
		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
			drbd_resync_finished(mdev);
		default:
			/* nothing to do */
			break;
		}
	}
	drbd_bcast_event(mdev, &sib);

	return 0;
}

/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
				      int count, int success)
{
	struct lc_element *e;
	struct update_odbm_work *udw;

	unsigned int enr;

	D_ASSERT(atomic_read(&mdev->local_cnt));

	/* I simply assume that a sector/size pair never crosses
	 * a 16 MB extent border. (Currently this is true...) */
	enr = BM_SECT_TO_EXT(sector);

	e = lc_get(mdev->resync, enr);
	if (e) {
		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
		if (ext->lce.lc_number == enr) {
			if (success)
				ext->rs_left -= count;
			else
				ext->rs_failed += count;
			if (ext->rs_left < ext->rs_failed) {
				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
				    "rs_failed=%d count=%d cstate=%s\n",
				     (unsigned long long)sector,
				     ext->lce.lc_number, ext->rs_left,
				     ext->rs_failed, count,
				     drbd_conn_str(mdev->state.conn));

				/* We don't expect to be able to clear more bits
				 * than have been set when we originally counted
				 * the set bits to cache that value in ext->rs_left.
				 * Whatever the reason (disconnect during resync,
				 * delayed local completion of an application write),
				 * try to fix it up by recounting here. */
				ext->rs_left = drbd_bm_e_weight(mdev, enr);
			}
		} else {
			/* Normally this element should be in the cache,
			 * since drbd_rs_begin_io() pulled it already in.
			 *
			 * But maybe an application write finished, and we set
			 * something outside the resync lru_cache in sync.
			 */
			int rs_left = drbd_bm_e_weight(mdev, enr);
			if (ext->flags != 0) {
				dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
				     " -> %d[%u;00]\n",
				     ext->lce.lc_number, ext->rs_left,
				     ext->flags, enr, rs_left);
				ext->flags = 0;
			}
			if (ext->rs_failed) {
				dev_warn(DEV, "Kicking resync_lru element enr=%u "
				     "out with rs_failed=%d\n",
				     ext->lce.lc_number, ext->rs_failed);
			}
			ext->rs_left = rs_left;
			ext->rs_failed = success ? 0 : count;
			/* we don't keep a persistent log of the resync lru,
			 * we can commit any change right away. */
			lc_committed(mdev->resync);
		}
		lc_put(mdev->resync, &ext->lce);
		/* no race, we are within the al_lock! */

		if (ext->rs_left == ext->rs_failed) {
			ext->rs_failed = 0;

			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
			if (udw) {
				udw->enr = ext->lce.lc_number;
				udw->w.cb = w_update_odbm;
				udw->w.mdev = mdev;
				drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
			} else {
				dev_warn(DEV, "Could not kmalloc a udw\n");
			}
		}
	} else {
		dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
		    mdev->resync_locked,
		    mdev->resync->nr_elements,
		    mdev->resync->flags);
	}
}

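/* Once every DRBD_SYNC_MARK_STEP, record the current time and the number
 * of bits still to sync, so the resync speed can be computed over a
 * sliding window of marks. Paused syncs do not advance the marks. */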
void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
{
	unsigned long now = jiffies;
	unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
	int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
		if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
		    mdev->state.conn != C_PAUSED_SYNC_T &&
		    mdev->state.conn != C_PAUSED_SYNC_S) {
			mdev->rs_mark_time[next] = now;
			mdev->rs_mark_left[next] = still_to_go;
			mdev->rs_last_mark = next;
		}
	}
}

/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear the bits of the
 * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
 *
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
		       const char *file, const unsigned int line)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count = 0;
	sector_t esector, nr_sectors;
	int wake_up = 0;
	unsigned long flags;

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
				(unsigned long long)sector, size);
		return;
	}

	if (!get_ldev(mdev))
		return; /* no disk, no metadata, no bitmap to clear bits in */

	nr_sectors = drbd_get_capacity(mdev->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		goto out;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	/* we clear it (in sync).
	 * round up start sector, round down end sector.  we make sure we only
	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
	if (unlikely(esector < BM_SECT_PER_BIT-1))
		goto out;
	if (unlikely(esector == (nr_sectors-1)))
		ebnr = lbnr;
	else
		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

	if (sbnr > ebnr)
		goto out;

	/*
	 * ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.
	 */
	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
	if (count) {
		drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
		spin_lock_irqsave(&mdev->al_lock, flags);
		drbd_try_clear_on_disk_bm(mdev, sector, count, true);
		spin_unlock_irqrestore(&mdev->al_lock, flags);

		/* just wake_up unconditional now, various lc_changed(),
		 * lc_put() in drbd_try_clear_on_disk_bm(). */
		wake_up = 1;
	}
out:
	put_ldev(mdev);
	if (wake_up)
		wake_up(&mdev->al_wait);
}

/*
 * this is intended to set one request worth of data out of sync.
 * affects at least 1 bit,
 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
 *
 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
 * so this can be _any_ process.
 */
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
			    const char *file, const unsigned int line)
{
	unsigned long sbnr, ebnr, flags;
	sector_t esector, nr_sectors;
	unsigned int enr, count = 0;
	struct lc_element *e;

	/* this should be an empty REQ_FLUSH */
	if (size == 0)
		return 0;

	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		dev_err(DEV, "sector: %llus, size: %d\n",
			(unsigned long long)sector, size);
		return 0;
	}

	if (!get_ldev(mdev))
		return 0; /* no disk, no metadata, no bitmap to set bits in */

	nr_sectors = drbd_get_capacity(mdev->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		goto out;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	/* we set it out of sync,
	 * we do not need to round anything here */
	sbnr = BM_SECT_TO_BIT(sector);
	ebnr = BM_SECT_TO_BIT(esector);

	/* ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.  */
	spin_lock_irqsave(&mdev->al_lock, flags);
	count = drbd_bm_set_bits(mdev, sbnr, ebnr);

	enr = BM_SECT_TO_EXT(sector);
	e = lc_find(mdev->resync, enr);
	if (e)
		lc_entry(e, struct bm_extent, lce)->rs_left += count;
	spin_unlock_irqrestore(&mdev->al_lock, flags);

out:
	put_ldev(mdev);

	return count;
}

static
struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int wakeup = 0;
	unsigned long rs_flags;

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
		spin_unlock_irq(&mdev->al_lock);
		return NULL;
	}
	e = lc_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_committed(mdev->resync);
			wakeup = 1;
		}
		if (bm_ext->lce.refcnt == 1)
			mdev->resync_locked++;
		set_bit(BME_NO_WRITES, &bm_ext->flags);
	}
	rs_flags = mdev->resync->flags;
	spin_unlock_irq(&mdev->al_lock);
	if (wakeup)
		wake_up(&mdev->al_wait);

	if (!bm_ext) {
		if (rs_flags & LC_STARVING)
			dev_warn(DEV, "Have to wait for element"
			     " (resync LRU too small?)\n");
		BUG_ON(rs_flags & LC_LOCKED);
	}

	return bm_ext;
}

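/* Returns nonzero if activity log extent @enr is currently in use;
 * takes al_lock for a consistent snapshot. */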
static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
	int rv;

	spin_lock_irq(&mdev->al_lock);
	rv = lc_is_used(mdev->act_log, enr);
	spin_unlock_irq(&mdev->al_lock);

	return rv;
}

/**
 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
 */
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct bm_extent *bm_ext;
	int i, sig;
	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
			 200 times -> 20 seconds. */

retry:
	sig = wait_event_interruptible(mdev->al_wait,
			(bm_ext = _bme_get(mdev, enr)));
	if (sig)
		return -EINTR;

	if (test_bit(BME_LOCKED, &bm_ext->flags))
		return 0;

	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		sig = wait_event_interruptible(mdev->al_wait,
					       !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
					       test_bit(BME_PRIORITY, &bm_ext->flags));

		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
			spin_lock_irq(&mdev->al_lock);
			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
				mdev->resync_locked--;
				wake_up(&mdev->al_wait);
			}
			spin_unlock_irq(&mdev->al_lock);
			if (sig)
				return -EINTR;
			if (schedule_timeout_interruptible(HZ/10))
				return -EINTR;
			if (sa && --sa == 0)
				dev_warn(DEV, "drbd_rs_begin_io() stepped aside for 20sec."
					 " Resync stalled?\n");
			goto retry;
		}
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
	return 0;
}

/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer for an undefined time if we give up
		 * the ref count when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */
		e = lc_find(mdev->resync, mdev->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			mdev->resync_wenr = LC_FREE;
			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
				mdev->resync_locked--;
			wake_up(&mdev->al_wait);
		} else {
			dev_alert(DEV, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			mdev->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			bm_ext->lce.refcnt--;
			D_ASSERT(bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (mdev->resync_locked > mdev->resync->nr_elements-3)
			goto try_again;
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(mdev->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = mdev->resync->flags;
			if (rs_flags & LC_STARVING)
				dev_warn(DEV, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_LOCKED);
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_committed(mdev->resync);
			wake_up(&mdev->al_wait);
			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(bm_ext->lce.refcnt == 1);
		mdev->resync_locked++;
		goto check_al;
	}
check_al:
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (lc_is_used(mdev->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	return 0;

try_again:
	if (bm_ext)
		mdev->resync_wenr = enr;
	spin_unlock_irq(&mdev->al_lock);
	return -EAGAIN;
}

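/**
 * drbd_rs_complete_io() - Drop the reference on the resync extent covering @sector
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Counterpart of drbd_rs_begin_io() and drbd_try_rs_begin_io(). The last
 * put clears the BME_* flags and wakes up waiters on al_wait.
 */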
void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct lc_element *e;
	struct bm_extent *bm_ext;
	unsigned long flags;

	spin_lock_irqsave(&mdev->al_lock, flags);
	e = lc_find(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (!bm_ext) {
		spin_unlock_irqrestore(&mdev->al_lock, flags);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
		return;
	}

	if (bm_ext->lce.refcnt == 0) {
		spin_unlock_irqrestore(&mdev->al_lock, flags);
		dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
		    "but refcnt is 0!?\n",
		    (unsigned long long)sector, enr);
		return;
	}

	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
		mdev->resync_locked--;
		wake_up(&mdev->al_wait);
	}

	spin_unlock_irqrestore(&mdev->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @mdev:	DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_conf *mdev)
{
	spin_lock_irq(&mdev->al_lock);

	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
		lc_reset(mdev->resync);
		put_ldev(mdev);
	}
	mdev->resync_locked = 0;
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	wake_up(&mdev->al_wait);
}

/**
 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
 * @mdev:	DRBD device.
 *
 * Returns 0 upon success, -EAGAIN if at least one reference count was
 * not zero.
 */
int drbd_rs_del_all(struct drbd_conf *mdev)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&mdev->al_lock);

	if (get_ldev_if_state(mdev, D_FAILED)) {
		/* ok, ->resync is there. */
		for (i = 0; i < mdev->resync->nr_elements; i++) {
			e = lc_element_by_index(mdev->resync, i);
			bm_ext = lc_entry(e, struct bm_extent, lce);
			if (bm_ext->lce.lc_number == LC_FREE)
				continue;
			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
				dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
				     " got 'synced' by application io\n",
				     mdev->resync_wenr);
				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
				D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
				clear_bit(BME_NO_WRITES, &bm_ext->flags);
				mdev->resync_wenr = LC_FREE;
				lc_put(mdev->resync, &bm_ext->lce);
			}
			if (bm_ext->lce.refcnt != 0) {
				dev_info(DEV, "Retrying drbd_rs_del_all() later. "
				     "refcnt=%d\n", bm_ext->lce.refcnt);
				put_ldev(mdev);
				spin_unlock_irq(&mdev->al_lock);
				return -EAGAIN;
			}
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
			lc_del(mdev->resync, &bm_ext->lce);
		}
		D_ASSERT(mdev->resync->used == 0);
		put_ldev(mdev);
	}
	spin_unlock_irq(&mdev->al_lock);
	wake_up(&mdev->al_wait);

	return 0;
}

/**
 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 * @size:	Size of failed IO operation, in bytes.
 */
void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count;
	sector_t esector, nr_sectors;
	int wake_up = 0;

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
				(unsigned long long)sector, size);
		return;
	}
	nr_sectors = drbd_get_capacity(mdev->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		return;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	/*
	 * round up start sector, round down end sector. we make sure we only
	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
	if (unlikely(esector < BM_SECT_PER_BIT-1))
		return;
	if (unlikely(esector == (nr_sectors-1)))
		ebnr = lbnr;
	else
		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

	if (sbnr > ebnr)
		return;

	/*
	 * ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.
	 */
	spin_lock_irq(&mdev->al_lock);
	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
	if (count) {
		mdev->rs_failed += count;

		if (get_ldev(mdev)) {
			drbd_try_clear_on_disk_bm(mdev, sector, count, false);
			put_ldev(mdev);
		}

		/* just wake_up unconditional now, various lc_changed(),
		 * lc_put() in drbd_try_clear_on_disk_bm(). */
		wake_up = 1;
	}
	spin_unlock_irq(&mdev->al_lock);
	if (wake_up)
		wake_up(&mdev->al_wait);
}