drivers/block/drbd/drbd_worker.c

   1 /*
   2    drbd_worker.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24 */
  25
  26 #include <linux/module.h>
  27 #include <linux/drbd.h>
  28 #include <linux/sched/signal.h>
  29 #include <linux/wait.h>
  30 #include <linux/mm.h>
  31 #include <linux/memcontrol.h>
  32 #include <linux/mm_inline.h>
  33 #include <linux/slab.h>
  34 #include <linux/random.h>
  35 #include <linux/string.h>
  36 #include <linux/scatterlist.h>
  37
  38 #include "drbd_int.h"
  39 #include "drbd_protocol.h"
  40 #include "drbd_req.h"
  41
/* Forward declarations: both request generators are defined further down in
 * this file, but are already called from w_resync_timer(). */
static int make_ov_request(struct drbd_device *, int);
static int make_resync_request(struct drbd_device *, int);
  44
  45 /* endio handlers:
  46  *   drbd_md_endio (defined here)
  47  *   drbd_request_endio (defined here)
  48  *   drbd_peer_request_endio (defined here)
  49  *   drbd_bm_endio (defined in drbd_bitmap.c)
  50  *
  51  * For all these callbacks, note the following:
  52  * The callbacks will be called in irq context by the IDE drivers,
  53  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
  54  * Try to get the locking right :)
  55  *
  56  */
  57
  58 /* used for synchronous meta data and bitmap IO
  59  * submitted by drbd_md_sync_page_io()
  60  */
  61 void drbd_md_endio(struct bio *bio)
  62 {
  63         struct drbd_device *device;
  64
  65         device = bio->bi_private;
  66         device->md_io.error = blk_status_to_errno(bio->bi_status);
  67
  68         /* special case: drbd_md_read() during drbd_adm_attach() */
  69         if (device->ldev)
  70                 put_ldev(device);
  71         bio_put(bio);
  72
  73         /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
  74          * to timeout on the lower level device, and eventually detach from it.
  75          * If this io completion runs after that timeout expired, this
  76          * drbd_md_put_buffer() may allow us to finally try and re-attach.
  77          * During normal operation, this only puts that extra reference
  78          * down to 1 again.
  79          * Make sure we first drop the reference, and only then signal
  80          * completion, or we may (in drbd_al_read_log()) cycle so fast into the
  81          * next drbd_md_sync_page_io(), that we trigger the
  82          * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
  83          */
  84         drbd_md_put_buffer(device);
  85         device->md_io.done = 1;
  86         wake_up(&device->misc_wait);
  87 }
  88
  89 /* reads on behalf of the partner,
  90  * "submitted" by the receiver
  91  */
  92 static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
  93 {
  94         unsigned long flags = 0;
  95         struct drbd_peer_device *peer_device = peer_req->peer_device;
  96         struct drbd_device *device = peer_device->device;
  97
  98         spin_lock_irqsave(&device->resource->req_lock, flags);
  99         device->read_cnt += peer_req->i.size >> 9;
 100         list_del(&peer_req->w.list);
 101         if (list_empty(&device->read_ee))
 102                 wake_up(&device->ee_wait);
 103         if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
 104                 __drbd_chk_io_error(device, DRBD_READ_ERROR);
 105         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 106
 107         drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
 108         put_ldev(device);
 109 }
 110
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.
 *
 * Moves the finished peer request onto device->done_ee, accounts the
 * written sectors, schedules the ack sender work if the connection state
 * is at least C_WF_REPORT_PARAMS, and finally drops the ldev reference.
 * Everything needed after the req_lock is released is copied into locals
 * up front, because peer_req must not be touched any more by then.
 */
void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	struct drbd_connection *connection = peer_device->connection;
	struct drbd_interval i;
	int do_wake;
	u64 block_id;
	int do_al_complete_io;

	/* after we moved peer_req to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	i = peer_req->i;
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;
	/* the flag was latched into do_al_complete_io above; clear it on the
	 * request itself */
	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;

	if (peer_req->flags & EE_WAS_ERROR) {
		/* In protocol != C, we usually do not send write acks.
		 * In case of a write error, send the neg ack anyways. */
		if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags))
			inc_unacked(device);
		/* the failed range has to be marked out of sync */
		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
	}

	spin_lock_irqsave(&device->resource->req_lock, flags);
	/* i.size is in bytes, writ_cnt counts 512 byte sectors */
	device->writ_cnt += peer_req->i.size >> 9;
	list_move_tail(&peer_req->w.list, &device->done_ee);

	/*
	 * Do not remove from the write_requests tree here: we did not send the
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removed from the tree from "drbd_process_done_ee" within the
	 * appropriate dw.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */

	/* wake waiters only if the list this request belongs to (sync_ee for
	 * resync writes, active_ee otherwise) is empty now */
	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);

	/* FIXME do we want to detach for failed REQ_DISCARD?
	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
	if (peer_req->flags & EE_WAS_ERROR)
		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);

	if (connection->cstate >= C_WF_REPORT_PARAMS) {
		kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
		/* queue_work() returned false: nothing new was queued, so
		 * give the reference back right away */
		if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
			kref_put(&device->kref, drbd_destroy_device);
	}
	spin_unlock_irqrestore(&device->resource->req_lock, flags);

	/* from here on, only the local copies (i, block_id, do_*) are used */
	if (block_id == ID_SYNCER)
		drbd_rs_complete_io(device, i.sector);

	if (do_wake)
		wake_up(&device->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(device, &i);

	put_ldev(device);
}
 178
 179 /* writes on behalf of the partner, or resync writes,
 180  * "submitted" by the receiver.
 181  */
 182 void drbd_peer_request_endio(struct bio *bio)
 183 {
 184         struct drbd_peer_request *peer_req = bio->bi_private;
 185         struct drbd_device *device = peer_req->peer_device->device;
 186         bool is_write = bio_data_dir(bio) == WRITE;
 187         bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
 188                           bio_op(bio) == REQ_OP_DISCARD;
 189
 190         if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
 191                 drbd_warn(device, "%s: error=%d s=%llus\n",
 192                                 is_write ? (is_discard ? "discard" : "write")
 193                                         : "read", bio->bi_status,
 194                                 (unsigned long long)peer_req->i.sector);
 195
 196         if (bio->bi_status)
 197                 set_bit(__EE_WAS_ERROR, &peer_req->flags);
 198
 199         bio_put(bio); /* no need for the bio anymore */
 200         if (atomic_dec_and_test(&peer_req->pending_bios)) {
 201                 if (is_write)
 202                         drbd_endio_write_sec_final(peer_req);
 203                 else
 204                         drbd_endio_read_sec_final(peer_req);
 205         }
 206 }
 207
 208 static void
 209 drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
 210 {
 211         panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
 212                 device->minor, device->resource->name, device->vnr);
 213 }
 214
 215 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 216  */
 217 void drbd_request_endio(struct bio *bio)
 218 {
 219         unsigned long flags;
 220         struct drbd_request *req = bio->bi_private;
 221         struct drbd_device *device = req->device;
 222         struct bio_and_error m;
 223         enum drbd_req_event what;
 224
 225         /* If this request was aborted locally before,
 226          * but now was completed "successfully",
 227          * chances are that this caused arbitrary data corruption.
 228          *
 229          * "aborting" requests, or force-detaching the disk, is intended for
 230          * completely blocked/hung local backing devices which do no longer
 231          * complete requests at all, not even do error completions.  In this
 232          * situation, usually a hard-reset and failover is the only way out.
 233          *
 234          * By "aborting", basically faking a local error-completion,
 235          * we allow for a more graceful swichover by cleanly migrating services.
 236          * Still the affected node has to be rebooted "soon".
 237          *
 238          * By completing these requests, we allow the upper layers to re-use
 239          * the associated data pages.
 240          *
 241          * If later the local backing device "recovers", and now DMAs some data
 242          * from disk into the original request pages, in the best case it will
 243          * just put random data into unused pages; but typically it will corrupt
 244          * meanwhile completely unrelated data, causing all sorts of damage.
 245          *
 246          * Which means delayed successful completion,
 247          * especially for READ requests,
 248          * is a reason to panic().
 249          *
 250          * We assume that a delayed *error* completion is OK,
 251          * though we still will complain noisily about it.
 252          */
 253         if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
 254                 if (__ratelimit(&drbd_ratelimit_state))
 255                         drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 256
 257                 if (!bio->bi_status)
 258                         drbd_panic_after_delayed_completion_of_aborted_request(device);
 259         }
 260
 261         /* to avoid recursion in __req_mod */
 262         if (unlikely(bio->bi_status)) {
 263                 switch (bio_op(bio)) {
 264                 case REQ_OP_WRITE_ZEROES:
 265                 case REQ_OP_DISCARD:
 266                         if (bio->bi_status == BLK_STS_NOTSUPP)
 267                                 what = DISCARD_COMPLETED_NOTSUPP;
 268                         else
 269                                 what = DISCARD_COMPLETED_WITH_ERROR;
 270                         break;
 271                 case REQ_OP_READ:
 272                         if (bio->bi_opf & REQ_RAHEAD)
 273                                 what = READ_AHEAD_COMPLETED_WITH_ERROR;
 274                         else
 275                                 what = READ_COMPLETED_WITH_ERROR;
 276                         break;
 277                 default:
 278                         what = WRITE_COMPLETED_WITH_ERROR;
 279                         break;
 280                 }
 281         } else {
 282                 what = COMPLETED_OK;
 283         }
 284
 285         bio_put(req->private_bio);
 286         req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status));
 287
 288         /* not req_mod(), we need irqsave here! */
 289         spin_lock_irqsave(&device->resource->req_lock, flags);
 290         __req_mod(req, what, &m);
 291         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 292         put_ldev(device);
 293
 294         if (m.bio)
 295                 complete_master_bio(device, &m);
 296 }
 297
 298 void drbd_csum_ee(struct crypto_ahash *tfm, struct drbd_peer_request *peer_req, void *digest)
 299 {
 300         AHASH_REQUEST_ON_STACK(req, tfm);
 301         struct scatterlist sg;
 302         struct page *page = peer_req->pages;
 303         struct page *tmp;
 304         unsigned len;
 305
 306         ahash_request_set_tfm(req, tfm);
 307         ahash_request_set_callback(req, 0, NULL, NULL);
 308
 309         sg_init_table(&sg, 1);
 310         crypto_ahash_init(req);
 311
 312         while ((tmp = page_chain_next(page))) {
 313                 /* all but the last page will be fully used */
 314                 sg_set_page(&sg, page, PAGE_SIZE, 0);
 315                 ahash_request_set_crypt(req, &sg, NULL, sg.length);
 316                 crypto_ahash_update(req);
 317                 page = tmp;
 318         }
 319         /* and now the last, possibly only partially used page */
 320         len = peer_req->i.size & (PAGE_SIZE - 1);
 321         sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
 322         ahash_request_set_crypt(req, &sg, digest, sg.length);
 323         crypto_ahash_finup(req);
 324         ahash_request_zero(req);
 325 }
 326
 327 void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
 328 {
 329         AHASH_REQUEST_ON_STACK(req, tfm);
 330         struct scatterlist sg;
 331         struct bio_vec bvec;
 332         struct bvec_iter iter;
 333
 334         ahash_request_set_tfm(req, tfm);
 335         ahash_request_set_callback(req, 0, NULL, NULL);
 336
 337         sg_init_table(&sg, 1);
 338         crypto_ahash_init(req);
 339
 340         bio_for_each_segment(bvec, bio, iter) {
 341                 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
 342                 ahash_request_set_crypt(req, &sg, NULL, sg.length);
 343                 crypto_ahash_update(req);
 344                 /* REQ_OP_WRITE_SAME has only one segment,
 345                  * checksum the payload only once. */
 346                 if (bio_op(bio) == REQ_OP_WRITE_SAME)
 347                         break;
 348         }
 349         ahash_request_set_crypt(req, NULL, digest, 0);
 350         crypto_ahash_final(req);
 351         ahash_request_zero(req);
 352 }
 353
 354 /* MAYBE merge common code with w_e_end_ov_req */
 355 static int w_e_send_csum(struct drbd_work *w, int cancel)
 356 {
 357         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
 358         struct drbd_peer_device *peer_device = peer_req->peer_device;
 359         struct drbd_device *device = peer_device->device;
 360         int digest_size;
 361         void *digest;
 362         int err = 0;
 363
 364         if (unlikely(cancel))
 365                 goto out;
 366
 367         if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
 368                 goto out;
 369
 370         digest_size = crypto_ahash_digestsize(peer_device->connection->csums_tfm);
 371         digest = kmalloc(digest_size, GFP_NOIO);
 372         if (digest) {
 373                 sector_t sector = peer_req->i.sector;
 374                 unsigned int size = peer_req->i.size;
 375                 drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
 376                 /* Free peer_req and pages before send.
 377                  * In case we block on congestion, we could otherwise run into
 378                  * some distributed deadlock, if the other side blocks on
 379                  * congestion as well, because our receiver blocks in
 380                  * drbd_alloc_pages due to pp_in_use > max_buffers. */
 381                 drbd_free_peer_req(device, peer_req);
 382                 peer_req = NULL;
 383                 inc_rs_pending(device);
 384                 err = drbd_send_drequest_csum(peer_device, sector, size,
 385                                               digest, digest_size,
 386                                               P_CSUM_RS_REQUEST);
 387                 kfree(digest);
 388         } else {
 389                 drbd_err(device, "kmalloc() of digest failed.\n");
 390                 err = -ENOMEM;
 391         }
 392
 393 out:
 394         if (peer_req)
 395                 drbd_free_peer_req(device, peer_req);
 396
 397         if (unlikely(err))
 398                 drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
 399         return err;
 400 }
 401
 402 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
 403
 404 static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
 405 {
 406         struct drbd_device *device = peer_device->device;
 407         struct drbd_peer_request *peer_req;
 408
 409         if (!get_ldev(device))
 410                 return -EIO;
 411
 412         /* GFP_TRY, because if there is no memory available right now, this may
 413          * be rescheduled for later. It is "only" background resync, after all. */
 414         peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
 415                                        size, size, GFP_TRY);
 416         if (!peer_req)
 417                 goto defer;
 418
 419         peer_req->w.cb = w_e_send_csum;
 420         spin_lock_irq(&device->resource->req_lock);
 421         list_add_tail(&peer_req->w.list, &device->read_ee);
 422         spin_unlock_irq(&device->resource->req_lock);
 423
 424         atomic_add(size >> 9, &device->rs_sect_ev);
 425         if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
 426                                      DRBD_FAULT_RS_RD) == 0)
 427                 return 0;
 428
 429         /* If it failed because of ENOMEM, retry should help.  If it failed
 430          * because bio_add_page failed (probably broken lower level driver),
 431          * retry may or may not help.
 432          * If it does not, you may need to force disconnect. */
 433         spin_lock_irq(&device->resource->req_lock);
 434         list_del(&peer_req->w.list);
 435         spin_unlock_irq(&device->resource->req_lock);
 436
 437         drbd_free_peer_req(device, peer_req);
 438 defer:
 439         put_ldev(device);
 440         return -EAGAIN;
 441 }
 442
 443 int w_resync_timer(struct drbd_work *w, int cancel)
 444 {
 445         struct drbd_device *device =
 446                 container_of(w, struct drbd_device, resync_work);
 447
 448         switch (device->state.conn) {
 449         case C_VERIFY_S:
 450                 make_ov_request(device, cancel);
 451                 break;
 452         case C_SYNC_TARGET:
 453                 make_resync_request(device, cancel);
 454                 break;
 455         }
 456
 457         return 0;
 458 }
 459
 460 void resync_timer_fn(struct timer_list *t)
 461 {
 462         struct drbd_device *device = from_timer(device, t, resync_timer);
 463
 464         drbd_queue_work_if_unqueued(
 465                 &first_peer_device(device)->connection->sender_work,
 466                 &device->resync_work);
 467 }
 468
 469 static void fifo_set(struct fifo_buffer *fb, int value)
 470 {
 471         int i;
 472
 473         for (i = 0; i < fb->size; i++)
 474                 fb->values[i] = value;
 475 }
 476
 477 static int fifo_push(struct fifo_buffer *fb, int value)
 478 {
 479         int ov;
 480
 481         ov = fb->values[fb->head_index];
 482         fb->values[fb->head_index++] = value;
 483
 484         if (fb->head_index >= fb->size)
 485                 fb->head_index = 0;
 486
 487         return ov;
 488 }
 489
 490 static void fifo_add_val(struct fifo_buffer *fb, int value)
 491 {
 492         int i;
 493
 494         for (i = 0; i < fb->size; i++)
 495                 fb->values[i] += value;
 496 }
 497
 498 struct fifo_buffer *fifo_alloc(int fifo_size)
 499 {
 500         struct fifo_buffer *fb;
 501
 502         fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
 503         if (!fb)
 504                 return NULL;
 505
 506         fb->head_index = 0;
 507         fb->size = fifo_size;
 508         fb->total = 0;
 509
 510         return fb;
 511 }
 512
/* Resync rate controller: compute how many sectors to request in this turn.
 *
 * @sect_in is the number of sectors that came in since the last turn.
 *
 * disk_conf and the plan fifo (rs_plan_s) are read with rcu_dereference();
 * the caller in this file, drbd_rs_number_requests(), holds rcu_read_lock()
 * around the call, and only calls in here if the plan size is non-zero
 * (it is used as divisor below).
 *
 * Returns the number of sectors to request, limited to the range
 * 0 .. max_sect, where max_sect is derived from c_max_rate.
 */
static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
{
	struct disk_conf *dc;
	unsigned int want;     /* The number of sectors we want in-flight */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in-flight */
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
	struct fifo_buffer *plan;

	dc = rcu_dereference(device->ldev->disk_conf);
	plan = rcu_dereference(device->rs_plan_s);

	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		/* a configured c_fill_target wins; otherwise derive the
		 * target from what came in and c_delay_target */
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	/* what is missing, given what is in flight and already planned */
	correction = want - device->rs_in_flight - plan->total;

	/* Plan ahead */
	/* spread the correction evenly over all slots of the plan */
	cps = correction / steps;
	fifo_add_val(plan, cps);
	plan->total += cps * steps;

	/* What we do in this step */
	/* take this turn's share out of the plan, leaving 0 in its slot */
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, device->rs_in_flight, want, correction,
		 steps, cps, device->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}
 564
/* Compute how many resync requests, in units of BM_BLOCK_SIZE, to generate
 * in this turn.
 *
 * Takes (and resets) the count of sectors that came in since the last turn,
 * subtracts it from rs_in_flight, and then either asks the controller
 * (if the resync plan has a non-zero size) or falls back to the configured
 * resync_rate.  device->c_sync_rate is updated in both cases.
 *
 * The result is limited so that no more than half of max-buffers is in
 * flight; it may therefore be zero or negative, which callers must handle.
 */
static int drbd_rs_number_requests(struct drbd_device *device)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	int number, mxb;

	sect_in = atomic_xchg(&device->rs_sect_in, 0);
	device->rs_in_flight -= sect_in;

	/* disk_conf and rs_plan_s are rcu_dereference()d below and inside
	 * drbd_rs_controller() */
	rcu_read_lock();
	mxb = drbd_get_max_buffers(device) / 2;
	if (rcu_dereference(device->rs_plan_s)->size) {
		/* the controller answers in sectors, convert to bitmap blocks */
		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		/* no plan: use the static resync_rate */
		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
		number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
	rcu_read_unlock();

	/* Don't have more than "max-buffers"/2 in-flight.
	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
	 * potentially causing a distributed deadlock on congestion during
	 * online-verify or (checksum-based) resync, if max-buffers,
	 * socket buffer sizes and resync rate settings are mis-configured. */

	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
	 * "number of pages" (typically also 4k),
	 * but "rs_in_flight" is in "sectors" (512 Byte). */
	if (mxb - device->rs_in_flight/8 < number)
		number = mxb - device->rs_in_flight/8;

	return number;
}
 599
/* Generate the next batch of resync requests; called from w_resync_timer().
 *
 * Walks the bitmap starting at device->bm_resync_fo, merges adjacent dirty
 * bits into larger aligned requests where possible, and for each resulting
 * range either submits a local read for checksum based resync
 * (read_for_csum()) or sends a resync data request to the peer.
 *
 * Unless the function bails out early, it adds the generated requests to
 * rs_in_flight and re-arms the resync timer ("requeue" label).
 *
 * Returns 0 in most cases (including cancel, empty resync, broken disk and
 * requeue), -EIO if read_for_csum() reported -EIO, or the error returned by
 * drbd_send_drequest().
 */
static int make_resync_request(struct drbd_device *const device, int cancel)
{
	struct drbd_peer_device *const peer_device = first_peer_device(device);
	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, requeue = 0;
	int i = 0;
	int discard_granularity = 0;

	if (unlikely(cancel))
		return 0;

	if (device->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(device);
		return 0;
	}

	if (!get_ldev(device)) {
		/* Since we only need to access device->rsync a
		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		drbd_err(device, "Disk broke down during resync!\n");
		return 0;
	}

	/* NOTE(review): per its initializer, connection may be NULL, yet it
	 * is dereferenced unconditionally from here on -- confirm that
	 * first_peer_device() cannot return NULL on this path. */
	if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
		rcu_read_lock();
		discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
		rcu_read_unlock();
	}

	/* queue limit is in sectors, max_bio_size is in bytes */
	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
	number = drbd_rs_number_requests(device);
	if (number <= 0)
		goto requeue;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests when half of the send buffer is filled,
		 * but notify TCP that we'd like to have more space. */
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket) {
			struct sock *sk = connection->data.socket->sk;
			int queued = sk->sk_wmem_queued;
			int sndbuf = sk->sk_sndbuf;
			if (queued > sndbuf / 2) {
				requeue = 1;
				if (sk->sk_socket)
					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
		} else
			requeue = 1;
		mutex_unlock(&connection->data.mutex);
		if (requeue)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(device, device->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			/* nothing left to request; note that this path does
			 * not re-arm the resync timer */
			device->bm_resync_fo = drbd_bm_bits(device);
			put_ldev(device);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(device, sector)) {
			/* could not start on this one now: remember where to
			 * resume, and try again on the next timer run */
			device->bm_resync_fo = bit;
			goto requeue;
		}
		device->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
			/* bit is no longer set, skip to the next one */
			drbd_rs_complete_io(device, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		/* NOTE(review): rollback_i is assigned only inside this #if
		 * section, but is read in the -EAGAIN case further down. */
		rollback_i = i;
		while (i < number) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			if (discard_granularity && size == discard_granularity)
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(device, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			/* every merged bit counts against "number" as well */
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			device->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		if (device->use_csums) {
			switch (read_for_csum(peer_device, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(device);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				/* undo the bookkeeping for this range, so that
				 * the next run starts over at this sector */
				drbd_rs_complete_io(device, sector);
				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(device);
			/* a request of exactly discard_granularity bytes is
			 * sent as P_RS_THIN_REQ */
			err = drbd_send_drequest(peer_device,
						 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(device);
				put_ldev(device);
				return err;
			}
		}
	}

	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(device);
		return 0;
	}

 requeue:
	/* i counts bitmap blocks, rs_in_flight is in sectors */
	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(device);
	return 0;
}
 779
 780 static int make_ov_request(struct drbd_device *device, int cancel)
 781 {
 782         int number, i, size;
 783         sector_t sector;
 784         const sector_t capacity = drbd_get_capacity(device->this_bdev);
 785         bool stop_sector_reached = false;
 786
 787         if (unlikely(cancel))
 788                 return 1;
 789
 790         number = drbd_rs_number_requests(device);
 791
 792         sector = device->ov_position;
 793         for (i = 0; i < number; i++) {
 794                 if (sector >= capacity)
 795                         return 1;
 796
 797                 /* We check for "finished" only in the reply path:
 798                  * w_e_end_ov_reply().
 799                  * We need to send at least one request out. */
 800                 stop_sector_reached = i > 0
 801                         && verify_can_do_stop_sector(device)
 802                         && sector >= device->ov_stop_sector;
 803                 if (stop_sector_reached)
 804                         break;
 805
 806                 size = BM_BLOCK_SIZE;
 807
 808                 if (drbd_try_rs_begin_io(device, sector)) {
 809                         device->ov_position = sector;
 810                         goto requeue;
 811                 }
 812
 813                 if (sector + (size>>9) > capacity)
 814                         size = (capacity-sector)<<9;
 815
 816                 inc_rs_pending(device);
 817                 if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
 818                         dec_rs_pending(device);
 819                         return 0;
 820                 }
 821                 sector += BM_SECT_PER_BIT;
 822         }
 823         device->ov_position = sector;
 824
 825  requeue:
 826         device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
 827         if (i == 0 || !stop_sector_reached)
 828                 mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
 829         return 1;
 830 }
 831
 832 int w_ov_finished(struct drbd_work *w, int cancel)
 833 {
 834         struct drbd_device_work *dw =
 835                 container_of(w, struct drbd_device_work, w);
 836         struct drbd_device *device = dw->device;
 837         kfree(dw);
 838         ov_out_of_sync_print(device);
 839         drbd_resync_finished(device);
 840
 841         return 0;
 842 }
 843
 844 static int w_resync_finished(struct drbd_work *w, int cancel)
 845 {
 846         struct drbd_device_work *dw =
 847                 container_of(w, struct drbd_device_work, w);
 848         struct drbd_device *device = dw->device;
 849         kfree(dw);
 850
 851         drbd_resync_finished(device);
 852
 853         return 0;
 854 }
 855
 856 static void ping_peer(struct drbd_device *device)
 857 {
 858         struct drbd_connection *connection = first_peer_device(device)->connection;
 859
 860         clear_bit(GOT_PING_ACK, &connection->flags);
 861         request_ping(connection);
 862         wait_event(connection->ping_wait,
 863                    test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
 864 }
 865
 866 int drbd_resync_finished(struct drbd_device *device)
 867 {
 868         struct drbd_connection *connection = first_peer_device(device)->connection;
 869         unsigned long db, dt, dbdt;
 870         unsigned long n_oos;
 871         union drbd_state os, ns;
 872         struct drbd_device_work *dw;
 873         char *khelper_cmd = NULL;
 874         int verify_done = 0;
 875
 876         /* Remove all elements from the resync LRU. Since future actions
 877          * might set bits in the (main) bitmap, then the entries in the
 878          * resync LRU would be wrong. */
 879         if (drbd_rs_del_all(device)) {
 880                 /* In case this is not possible now, most probably because
 881                  * there are P_RS_DATA_REPLY Packets lingering on the worker's
 882                  * queue (or even the read operations for those packets
 883                  * is not finished by now).   Retry in 100ms. */
 884
 885                 schedule_timeout_interruptible(HZ / 10);
 886                 dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
 887                 if (dw) {
 888                         dw->w.cb = w_resync_finished;
 889                         dw->device = device;
 890                         drbd_queue_work(&connection->sender_work, &dw->w);
 891                         return 1;
 892                 }
 893                 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
 894         }
 895
 896         dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
 897         if (dt <= 0)
 898                 dt = 1;
 899
 900         db = device->rs_total;
 901         /* adjust for verify start and stop sectors, respective reached position */
 902         if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
 903                 db -= device->ov_left;
 904
 905         dbdt = Bit2KB(db/dt);
 906         device->rs_paused /= HZ;
 907
 908         if (!get_ldev(device))
 909                 goto out;
 910
 911         ping_peer(device);
 912
 913         spin_lock_irq(&device->resource->req_lock);
 914         os = drbd_read_state(device);
 915
 916         verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
 917
 918         /* This protects us against multiple calls (that can happen in the presence
 919            of application IO), and against connectivity loss just before we arrive here. */
 920         if (os.conn <= C_CONNECTED)
 921                 goto out_unlock;
 922
 923         ns = os;
 924         ns.conn = C_CONNECTED;
 925
 926         drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
 927              verify_done ? "Online verify" : "Resync",
 928              dt + device->rs_paused, device->rs_paused, dbdt);
 929
 930         n_oos = drbd_bm_total_weight(device);
 931
 932         if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
 933                 if (n_oos) {
 934                         drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
 935                               n_oos, Bit2KB(1));
 936                         khelper_cmd = "out-of-sync";
 937                 }
 938         } else {
 939                 D_ASSERT(device, (n_oos - device->rs_failed) == 0);
 940
 941                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
 942                         khelper_cmd = "after-resync-target";
 943
 944                 if (device->use_csums && device->rs_total) {
 945                         const unsigned long s = device->rs_same_csum;
 946                         const unsigned long t = device->rs_total;
 947                         const int ratio =
 948                                 (t == 0)     ? 0 :
 949                         (t < 100000) ? ((s*100)/t) : (s/(t/100));
 950                         drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
 951                              "transferred %luK total %luK\n",
 952                              ratio,
 953                              Bit2KB(device->rs_same_csum),
 954                              Bit2KB(device->rs_total - device->rs_same_csum),
 955                              Bit2KB(device->rs_total));
 956                 }
 957         }
 958
 959         if (device->rs_failed) {
 960                 drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
 961
 962                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
 963                         ns.disk = D_INCONSISTENT;
 964                         ns.pdsk = D_UP_TO_DATE;
 965                 } else {
 966                         ns.disk = D_UP_TO_DATE;
 967                         ns.pdsk = D_INCONSISTENT;
 968                 }
 969         } else {
 970                 ns.disk = D_UP_TO_DATE;
 971                 ns.pdsk = D_UP_TO_DATE;
 972
 973                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
 974                         if (device->p_uuid) {
 975                                 int i;
 976                                 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
 977                                         _drbd_uuid_set(device, i, device->p_uuid[i]);
 978                                 drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
 979                                 _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
 980                         } else {
 981                                 drbd_err(device, "device->p_uuid is NULL! BUG\n");
 982                         }
 983                 }
 984
 985                 if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
 986                         /* for verify runs, we don't update uuids here,
 987                          * so there would be nothing to report. */
 988                         drbd_uuid_set_bm(device, 0UL);
 989                         drbd_print_uuids(device, "updated UUIDs");
 990                         if (device->p_uuid) {
 991                                 /* Now the two UUID sets are equal, update what we
 992                                  * know of the peer. */
 993                                 int i;
 994                                 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
 995                                         device->p_uuid[i] = device->ldev->md.uuid[i];
 996                         }
 997                 }
 998         }
 999
1000         _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1001 out_unlock:
1002         spin_unlock_irq(&device->resource->req_lock);
1003
1004         /* If we have been sync source, and have an effective fencing-policy,
1005          * once *all* volumes are back in sync, call "unfence". */
1006         if (os.conn == C_SYNC_SOURCE) {
1007                 enum drbd_disk_state disk_state = D_MASK;
1008                 enum drbd_disk_state pdsk_state = D_MASK;
1009                 enum drbd_fencing_p fp = FP_DONT_CARE;
1010
1011                 rcu_read_lock();
1012                 fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1013                 if (fp != FP_DONT_CARE) {
1014                         struct drbd_peer_device *peer_device;
1015                         int vnr;
1016                         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1017                                 struct drbd_device *device = peer_device->device;
1018                                 disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
1019                                 pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
1020                         }
1021                 }
1022                 rcu_read_unlock();
1023                 if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
1024                         conn_khelper(connection, "unfence-peer");
1025         }
1026
1027         put_ldev(device);
1028 out:
1029         device->rs_total  = 0;
1030         device->rs_failed = 0;
1031         device->rs_paused = 0;
1032
1033         /* reset start sector, if we reached end of device */
1034         if (verify_done && device->ov_left == 0)
1035                 device->ov_start_sector = 0;
1036
1037         drbd_md_sync(device);
1038
1039         if (khelper_cmd)
1040                 drbd_khelper(device, khelper_cmd);
1041
1042         return 1;
1043 }
1044
1045 /* helper */
1046 static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
1047 {
1048         if (drbd_peer_req_has_active_page(peer_req)) {
1049                 /* This might happen if sendpage() has not finished */
1050                 int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
1051                 atomic_add(i, &device->pp_in_use_by_net);
1052                 atomic_sub(i, &device->pp_in_use);
1053                 spin_lock_irq(&device->resource->req_lock);
1054                 list_add_tail(&peer_req->w.list, &device->net_ee);
1055                 spin_unlock_irq(&device->resource->req_lock);
1056                 wake_up(&drbd_pp_wait);
1057         } else
1058                 drbd_free_peer_req(device, peer_req);
1059 }
1060
1061 /**
1062  * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
1063  * @w:          work object.
1064  * @cancel:     The connection will be closed anyways
1065  */
1066 int w_e_end_data_req(struct drbd_work *w, int cancel)
1067 {
1068         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1069         struct drbd_peer_device *peer_device = peer_req->peer_device;
1070         struct drbd_device *device = peer_device->device;
1071         int err;
1072
1073         if (unlikely(cancel)) {
1074                 drbd_free_peer_req(device, peer_req);
1075                 dec_unacked(device);
1076                 return 0;
1077         }
1078
1079         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1080                 err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
1081         } else {
1082                 if (__ratelimit(&drbd_ratelimit_state))
1083                         drbd_err(device, "Sending NegDReply. sector=%llus.\n",
1084                             (unsigned long long)peer_req->i.sector);
1085
1086                 err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
1087         }
1088
1089         dec_unacked(device);
1090
1091         move_to_net_ee_or_free(device, peer_req);
1092
1093         if (unlikely(err))
1094                 drbd_err(device, "drbd_send_block() failed\n");
1095         return err;
1096 }
1097
1098 static bool all_zero(struct drbd_peer_request *peer_req)
1099 {
1100         struct page *page = peer_req->pages;
1101         unsigned int len = peer_req->i.size;
1102
1103         page_chain_for_each(page) {
1104                 unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
1105                 unsigned int i, words = l / sizeof(long);
1106                 unsigned long *d;
1107
1108                 d = kmap_atomic(page);
1109                 for (i = 0; i < words; i++) {
1110                         if (d[i]) {
1111                                 kunmap_atomic(d);
1112                                 return false;
1113                         }
1114                 }
1115                 kunmap_atomic(d);
1116                 len -= l;
1117         }
1118
1119         return true;
1120 }
1121
1122 /**
1123  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1124  * @w:          work object.
1125  * @cancel:     The connection will be closed anyways
1126  */
1127 int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1128 {
1129         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1130         struct drbd_peer_device *peer_device = peer_req->peer_device;
1131         struct drbd_device *device = peer_device->device;
1132         int err;
1133
1134         if (unlikely(cancel)) {
1135                 drbd_free_peer_req(device, peer_req);
1136                 dec_unacked(device);
1137                 return 0;
1138         }
1139
1140         if (get_ldev_if_state(device, D_FAILED)) {
1141                 drbd_rs_complete_io(device, peer_req->i.sector);
1142                 put_ldev(device);
1143         }
1144
1145         if (device->state.conn == C_AHEAD) {
1146                 err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
1147         } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1148                 if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1149                         inc_rs_pending(device);
1150                         if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
1151                                 err = drbd_send_rs_deallocated(peer_device, peer_req);
1152                         else
1153                                 err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1154                 } else {
1155                         if (__ratelimit(&drbd_ratelimit_state))
1156                                 drbd_err(device, "Not sending RSDataReply, "
1157                                     "partner DISKLESS!\n");
1158                         err = 0;
1159                 }
1160         } else {
1161                 if (__ratelimit(&drbd_ratelimit_state))
1162                         drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
1163                             (unsigned long long)peer_req->i.sector);
1164
1165                 err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1166
1167                 /* update resync data with failure */
1168                 drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
1169         }
1170
1171         dec_unacked(device);
1172
1173         move_to_net_ee_or_free(device, peer_req);
1174
1175         if (unlikely(err))
1176                 drbd_err(device, "drbd_send_block() failed\n");
1177         return err;
1178 }
1179
1180 int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
1181 {
1182         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1183         struct drbd_peer_device *peer_device = peer_req->peer_device;
1184         struct drbd_device *device = peer_device->device;
1185         struct digest_info *di;
1186         int digest_size;
1187         void *digest = NULL;
1188         int err, eq = 0;
1189
1190         if (unlikely(cancel)) {
1191                 drbd_free_peer_req(device, peer_req);
1192                 dec_unacked(device);
1193                 return 0;
1194         }
1195
1196         if (get_ldev(device)) {
1197                 drbd_rs_complete_io(device, peer_req->i.sector);
1198                 put_ldev(device);
1199         }
1200
1201         di = peer_req->digest;
1202
1203         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1204                 /* quick hack to try to avoid a race against reconfiguration.
1205                  * a real fix would be much more involved,
1206                  * introducing more locking mechanisms */
1207                 if (peer_device->connection->csums_tfm) {
1208                         digest_size = crypto_ahash_digestsize(peer_device->connection->csums_tfm);
1209                         D_ASSERT(device, digest_size == di->digest_size);
1210                         digest = kmalloc(digest_size, GFP_NOIO);
1211                 }
1212                 if (digest) {
1213                         drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
1214                         eq = !memcmp(digest, di->digest, digest_size);
1215                         kfree(digest);
1216                 }
1217
1218                 if (eq) {
1219                         drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
1220                         /* rs_same_csums unit is BM_BLOCK_SIZE */
1221                         device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1222                         err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
1223                 } else {
1224                         inc_rs_pending(device);
1225                         peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1226                         peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1227                         kfree(di);
1228                         err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1229                 }
1230         } else {
1231                 err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
1232                 if (__ratelimit(&drbd_ratelimit_state))
1233                         drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
1234         }
1235
1236         dec_unacked(device);
1237         move_to_net_ee_or_free(device, peer_req);
1238
1239         if (unlikely(err))
1240                 drbd_err(device, "drbd_send_block/ack() failed\n");
1241         return err;
1242 }
1243
1244 int w_e_end_ov_req(struct drbd_work *w, int cancel)
1245 {
1246         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1247         struct drbd_peer_device *peer_device = peer_req->peer_device;
1248         struct drbd_device *device = peer_device->device;
1249         sector_t sector = peer_req->i.sector;
1250         unsigned int size = peer_req->i.size;
1251         int digest_size;
1252         void *digest;
1253         int err = 0;
1254
1255         if (unlikely(cancel))
1256                 goto out;
1257
1258         digest_size = crypto_ahash_digestsize(peer_device->connection->verify_tfm);
1259         digest = kmalloc(digest_size, GFP_NOIO);
1260         if (!digest) {
1261                 err = 1;        /* terminate the connection in case the allocation failed */
1262                 goto out;
1263         }
1264
1265         if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1266                 drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1267         else
1268                 memset(digest, 0, digest_size);
1269
1270         /* Free e and pages before send.
1271          * In case we block on congestion, we could otherwise run into
1272          * some distributed deadlock, if the other side blocks on
1273          * congestion as well, because our receiver blocks in
1274          * drbd_alloc_pages due to pp_in_use > max_buffers. */
1275         drbd_free_peer_req(device, peer_req);
1276         peer_req = NULL;
1277         inc_rs_pending(device);
1278         err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
1279         if (err)
1280                 dec_rs_pending(device);
1281         kfree(digest);
1282
1283 out:
1284         if (peer_req)
1285                 drbd_free_peer_req(device, peer_req);
1286         dec_unacked(device);
1287         return err;
1288 }
1289
1290 void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
1291 {
1292         if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
1293                 device->ov_last_oos_size += size>>9;
1294         } else {
1295                 device->ov_last_oos_start = sector;
1296                 device->ov_last_oos_size = size>>9;
1297         }
1298         drbd_set_out_of_sync(device, sector, size);
1299 }
1300
1301 int w_e_end_ov_reply(struct drbd_work *w, int cancel)
1302 {
1303         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1304         struct drbd_peer_device *peer_device = peer_req->peer_device;
1305         struct drbd_device *device = peer_device->device;
1306         struct digest_info *di;
1307         void *digest;
1308         sector_t sector = peer_req->i.sector;
1309         unsigned int size = peer_req->i.size;
1310         int digest_size;
1311         int err, eq = 0;
1312         bool stop_sector_reached = false;
1313
1314         if (unlikely(cancel)) {
1315                 drbd_free_peer_req(device, peer_req);
1316                 dec_unacked(device);
1317                 return 0;
1318         }
1319
1320         /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1321          * the resync lru has been cleaned up already */
1322         if (get_ldev(device)) {
1323                 drbd_rs_complete_io(device, peer_req->i.sector);
1324                 put_ldev(device);
1325         }
1326
1327         di = peer_req->digest;
1328
1329         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1330                 digest_size = crypto_ahash_digestsize(peer_device->connection->verify_tfm);
1331                 digest = kmalloc(digest_size, GFP_NOIO);
1332                 if (digest) {
1333                         drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
1334
1335                         D_ASSERT(device, digest_size == di->digest_size);
1336                         eq = !memcmp(digest, di->digest, digest_size);
1337                         kfree(digest);
1338                 }
1339         }
1340
1341         /* Free peer_req and pages before send.
1342          * In case we block on congestion, we could otherwise run into
1343          * some distributed deadlock, if the other side blocks on
1344          * congestion as well, because our receiver blocks in
1345          * drbd_alloc_pages due to pp_in_use > max_buffers. */
1346         drbd_free_peer_req(device, peer_req);
1347         if (!eq)
1348                 drbd_ov_out_of_sync_found(device, sector, size);
1349         else
1350                 ov_out_of_sync_print(device);
1351
1352         err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
1353                                eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1354
1355         dec_unacked(device);
1356
1357         --device->ov_left;
1358
1359         /* let's advance progress step marks only for every other megabyte */
1360         if ((device->ov_left & 0x200) == 0x200)
1361                 drbd_advance_rs_marks(device, device->ov_left);
1362
1363         stop_sector_reached = verify_can_do_stop_sector(device) &&
1364                 (sector + (size>>9)) >= device->ov_stop_sector;
1365
1366         if (device->ov_left == 0 || stop_sector_reached) {
1367                 ov_out_of_sync_print(device);
1368                 drbd_resync_finished(device);
1369         }
1370
1371         return err;
1372 }
1373
1374 /* FIXME
1375  * We need to track the number of pending barrier acks,
1376  * and to be able to wait for them.
1377  * See also comment in drbd_adm_attach before drbd_suspend_io.
1378  */
1379 static int drbd_send_barrier(struct drbd_connection *connection)
1380 {
1381         struct p_barrier *p;
1382         struct drbd_socket *sock;
1383
1384         sock = &connection->data;
1385         p = conn_prepare_command(connection, sock);
1386         if (!p)
1387                 return -EIO;
1388         p->barrier = connection->send.current_epoch_nr;
1389         p->pad = 0;
1390         connection->send.current_epoch_writes = 0;
1391         connection->send.last_sent_barrier_jif = jiffies;
1392
1393         return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1394 }
1395
1396 static int pd_send_unplug_remote(struct drbd_peer_device *pd)
1397 {
1398         struct drbd_socket *sock = &pd->connection->data;
1399         if (!drbd_prepare_command(pd, sock))
1400                 return -EIO;
1401         return drbd_send_command(pd, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
1402 }
1403
1404 int w_send_write_hint(struct drbd_work *w, int cancel)
1405 {
1406         struct drbd_device *device =
1407                 container_of(w, struct drbd_device, unplug_work);
1408
1409         if (cancel)
1410                 return 0;
1411         return pd_send_unplug_remote(first_peer_device(device));
1412 }
1413
1414 static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
1415 {
1416         if (!connection->send.seen_any_write_yet) {
1417                 connection->send.seen_any_write_yet = true;
1418                 connection->send.current_epoch_nr = epoch;
1419                 connection->send.current_epoch_writes = 0;
1420                 connection->send.last_sent_barrier_jif = jiffies;
1421         }
1422 }
1423
1424 static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
1425 {
1426         /* re-init if first write on this connection */
1427         if (!connection->send.seen_any_write_yet)
1428                 return;
1429         if (connection->send.current_epoch_nr != epoch) {
1430                 if (connection->send.current_epoch_writes)
1431                         drbd_send_barrier(connection);
1432                 connection->send.current_epoch_nr = epoch;
1433         }
1434 }
1435
1436 int w_send_out_of_sync(struct drbd_work *w, int cancel)
1437 {
1438         struct drbd_request *req = container_of(w, struct drbd_request, w);
1439         struct drbd_device *device = req->device;
1440         struct drbd_peer_device *const peer_device = first_peer_device(device);
1441         struct drbd_connection *const connection = peer_device->connection;
1442         int err;
1443
1444         if (unlikely(cancel)) {
1445                 req_mod(req, SEND_CANCELED);
1446                 return 0;
1447         }
1448         req->pre_send_jif = jiffies;
1449
1450         /* this time, no connection->send.current_epoch_writes++;
1451          * If it was sent, it was the closing barrier for the last
1452          * replicated epoch, before we went into AHEAD mode.
1453          * No more barriers will be sent, until we leave AHEAD mode again. */
1454         maybe_send_barrier(connection, req->epoch);
1455
1456         err = drbd_send_out_of_sync(peer_device, req);
1457         req_mod(req, OOS_HANDED_TO_NETWORK);
1458
1459         return err;
1460 }
1461
1462 /**
1463  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1464  * @w:          work object.
1465  * @cancel:     The connection will be closed anyways
1466  */
1467 int w_send_dblock(struct drbd_work *w, int cancel)
1468 {
1469         struct drbd_request *req = container_of(w, struct drbd_request, w);
1470         struct drbd_device *device = req->device;
1471         struct drbd_peer_device *const peer_device = first_peer_device(device);
1472         struct drbd_connection *connection = peer_device->connection;
1473         bool do_send_unplug = req->rq_state & RQ_UNPLUG;
1474         int err;
1475
1476         if (unlikely(cancel)) {
1477                 req_mod(req, SEND_CANCELED);
1478                 return 0;
1479         }
1480         req->pre_send_jif = jiffies;
1481
1482         re_init_if_first_write(connection, req->epoch);
1483         maybe_send_barrier(connection, req->epoch);
1484         connection->send.current_epoch_writes++;
1485
1486         err = drbd_send_dblock(peer_device, req);
1487         req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1488
1489         if (do_send_unplug && !err)
1490                 pd_send_unplug_remote(peer_device);
1491
1492         return err;
1493 }
1494
1495 /**
1496  * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1497  * @w:          work object.
1498  * @cancel:     The connection will be closed anyways
1499  */
1500 int w_send_read_req(struct drbd_work *w, int cancel)
1501 {
1502         struct drbd_request *req = container_of(w, struct drbd_request, w);
1503         struct drbd_device *device = req->device;
1504         struct drbd_peer_device *const peer_device = first_peer_device(device);
1505         struct drbd_connection *connection = peer_device->connection;
1506         bool do_send_unplug = req->rq_state & RQ_UNPLUG;
1507         int err;
1508
1509         if (unlikely(cancel)) {
1510                 req_mod(req, SEND_CANCELED);
1511                 return 0;
1512         }
1513         req->pre_send_jif = jiffies;
1514
1515         /* Even read requests may close a write epoch,
1516          * if there was any yet. */
1517         maybe_send_barrier(connection, req->epoch);
1518
1519         err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
1520                                  (unsigned long)req);
1521
1522         req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1523
1524         if (do_send_unplug && !err)
1525                 pd_send_unplug_remote(peer_device);
1526
1527         return err;
1528 }
1529
1530 int w_restart_disk_io(struct drbd_work *w, int cancel)
1531 {
1532         struct drbd_request *req = container_of(w, struct drbd_request, w);
1533         struct drbd_device *device = req->device;
1534
1535         if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1536                 drbd_al_begin_io(device, &req->i);
1537
1538         drbd_req_make_private_bio(req, req->master_bio);
1539         bio_set_dev(req->private_bio, device->ldev->backing_bdev);
1540         generic_make_request(req->private_bio);
1541
1542         return 0;
1543 }
1544
1545 static int _drbd_may_sync_now(struct drbd_device *device)
1546 {
1547         struct drbd_device *odev = device;
1548         int resync_after;
1549
1550         while (1) {
1551                 if (!odev->ldev || odev->state.disk == D_DISKLESS)
1552                         return 1;
1553                 rcu_read_lock();
1554                 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1555                 rcu_read_unlock();
1556                 if (resync_after == -1)
1557                         return 1;
1558                 odev = minor_to_device(resync_after);
1559                 if (!odev)
1560                         return 1;
1561                 if ((odev->state.conn >= C_SYNC_SOURCE &&
1562                      odev->state.conn <= C_PAUSED_SYNC_T) ||
1563                     odev->state.aftr_isp || odev->state.peer_isp ||
1564                     odev->state.user_isp)
1565                         return 0;
1566         }
1567 }
1568
1569 /**
1570  * drbd_pause_after() - Pause resync on all devices that may not resync now
1571  * @device:     DRBD device.
1572  *
1573  * Called from process context only (admin command and after_state_ch).
1574  */
1575 static bool drbd_pause_after(struct drbd_device *device)
1576 {
1577         bool changed = false;
1578         struct drbd_device *odev;
1579         int i;
1580
1581         rcu_read_lock();
1582         idr_for_each_entry(&drbd_devices, odev, i) {
1583                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1584                         continue;
1585                 if (!_drbd_may_sync_now(odev) &&
1586                     _drbd_set_state(_NS(odev, aftr_isp, 1),
1587                                     CS_HARD, NULL) != SS_NOTHING_TO_DO)
1588                         changed = true;
1589         }
1590         rcu_read_unlock();
1591
1592         return changed;
1593 }
1594
1595 /**
1596  * drbd_resume_next() - Resume resync on all devices that may resync now
1597  * @device:     DRBD device.
1598  *
1599  * Called from process context only (admin command and worker).
1600  */
1601 static bool drbd_resume_next(struct drbd_device *device)
1602 {
1603         bool changed = false;
1604         struct drbd_device *odev;
1605         int i;
1606
1607         rcu_read_lock();
1608         idr_for_each_entry(&drbd_devices, odev, i) {
1609                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1610                         continue;
1611                 if (odev->state.aftr_isp) {
1612                         if (_drbd_may_sync_now(odev) &&
1613                             _drbd_set_state(_NS(odev, aftr_isp, 0),
1614                                             CS_HARD, NULL) != SS_NOTHING_TO_DO)
1615                                 changed = true;
1616                 }
1617         }
1618         rcu_read_unlock();
1619         return changed;
1620 }
1621
/*
 * resume_next_sg() - run drbd_resume_next() with all resources locked
 * @device:     DRBD device, handed through to drbd_resume_next().
 */
void resume_next_sg(struct drbd_device *device)
{
        lock_all_resources();
        /* whether anything changed is of no interest here */
        (void)drbd_resume_next(device);
        unlock_all_resources();
}
1628
/*
 * suspend_other_sg() - run drbd_pause_after() with all resources locked
 * @device:     DRBD device, handed through to drbd_pause_after().
 */
void suspend_other_sg(struct drbd_device *device)
{
        lock_all_resources();
        /* whether anything changed is of no interest here */
        (void)drbd_pause_after(device);
        unlock_all_resources();
}
1635
1636 /* caller must lock_all_resources() */
1637 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1638 {
1639         struct drbd_device *odev;
1640         int resync_after;
1641
1642         if (o_minor == -1)
1643                 return NO_ERROR;
1644         if (o_minor < -1 || o_minor > MINORMASK)
1645                 return ERR_RESYNC_AFTER;
1646
1647         /* check for loops */
1648         odev = minor_to_device(o_minor);
1649         while (1) {
1650                 if (odev == device)
1651                         return ERR_RESYNC_AFTER_CYCLE;
1652
1653                 /* You are free to depend on diskless, non-existing,
1654                  * or not yet/no longer existing minors.
1655                  * We only reject dependency loops.
1656                  * We cannot follow the dependency chain beyond a detached or
1657                  * missing minor.
1658                  */
1659                 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1660                         return NO_ERROR;
1661
1662                 rcu_read_lock();
1663                 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1664                 rcu_read_unlock();
1665                 /* dependency chain ends here, no cycles. */
1666                 if (resync_after == -1)
1667                         return NO_ERROR;
1668
1669                 /* follow the dependency chain */
1670                 odev = minor_to_device(resync_after);
1671         }
1672 }
1673
1674 /* caller must lock_all_resources() */
1675 void drbd_resync_after_changed(struct drbd_device *device)
1676 {
1677         int changed;
1678
1679         do {
1680                 changed  = drbd_pause_after(device);
1681                 changed |= drbd_resume_next(device);
1682         } while (changed);
1683 }
1684
1685 void drbd_rs_controller_reset(struct drbd_device *device)
1686 {
1687         struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
1688         struct fifo_buffer *plan;
1689
1690         atomic_set(&device->rs_sect_in, 0);
1691         atomic_set(&device->rs_sect_ev, 0);
1692         device->rs_in_flight = 0;
1693         device->rs_last_events =
1694                 (int)part_stat_read(&disk->part0, sectors[0]) +
1695                 (int)part_stat_read(&disk->part0, sectors[1]);
1696
1697         /* Updating the RCU protected object in place is necessary since
1698            this function gets called from atomic context.
1699            It is valid since all other updates also lead to an completely
1700            empty fifo */
1701         rcu_read_lock();
1702         plan = rcu_dereference(device->rs_plan_s);
1703         plan->total = 0;
1704         fifo_set(plan, 0);
1705         rcu_read_unlock();
1706 }
1707
1708 void start_resync_timer_fn(struct timer_list *t)
1709 {
1710         struct drbd_device *device = from_timer(device, t, start_resync_timer);
1711         drbd_device_post_work(device, RS_START);
1712 }
1713
1714 static void do_start_resync(struct drbd_device *device)
1715 {
1716         if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
1717                 drbd_warn(device, "postponing start_resync ...\n");
1718                 device->start_resync_timer.expires = jiffies + HZ/10;
1719                 add_timer(&device->start_resync_timer);
1720                 return;
1721         }
1722
1723         drbd_start_resync(device, C_SYNC_SOURCE);
1724         clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
1725 }
1726
1727 static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
1728 {
1729         bool csums_after_crash_only;
1730         rcu_read_lock();
1731         csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
1732         rcu_read_unlock();
1733         return connection->agreed_pro_version >= 89 &&          /* supported? */
1734                 connection->csums_tfm &&                        /* configured? */
1735                 (csums_after_crash_only == false                /* use for each resync? */
1736                  || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
1737 }
1738
1739 /**
1740  * drbd_start_resync() - Start the resync process
1741  * @device:     DRBD device.
1742  * @side:       Either C_SYNC_SOURCE or C_SYNC_TARGET
1743  *
1744  * This function might bring you directly into one of the
1745  * C_PAUSED_SYNC_* states.
1746  */
1747 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1748 {
1749         struct drbd_peer_device *peer_device = first_peer_device(device);
1750         struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1751         union drbd_state ns;
1752         int r;
1753
1754         if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
1755                 drbd_err(device, "Resync already running!\n");
1756                 return;
1757         }
1758
1759         if (!connection) {
1760                 drbd_err(device, "No connection to peer, aborting!\n");
1761                 return;
1762         }
1763
1764         if (!test_bit(B_RS_H_DONE, &device->flags)) {
1765                 if (side == C_SYNC_TARGET) {
1766                         /* Since application IO was locked out during C_WF_BITMAP_T and
1767                            C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1768                            we check that we might make the data inconsistent. */
1769                         r = drbd_khelper(device, "before-resync-target");
1770                         r = (r >> 8) & 0xff;
1771                         if (r > 0) {
1772                                 drbd_info(device, "before-resync-target handler returned %d, "
1773                                          "dropping connection.\n", r);
1774                                 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
1775                                 return;
1776                         }
1777                 } else /* C_SYNC_SOURCE */ {
1778                         r = drbd_khelper(device, "before-resync-source");
1779                         r = (r >> 8) & 0xff;
1780                         if (r > 0) {
1781                                 if (r == 3) {
1782                                         drbd_info(device, "before-resync-source handler returned %d, "
1783                                                  "ignoring. Old userland tools?", r);
1784                                 } else {
1785                                         drbd_info(device, "before-resync-source handler returned %d, "
1786                                                  "dropping connection.\n", r);
1787                                         conn_request_state(connection,
1788                                                            NS(conn, C_DISCONNECTING), CS_HARD);
1789                                         return;
1790                                 }
1791                         }
1792                 }
1793         }
1794
1795         if (current == connection->worker.task) {
1796                 /* The worker should not sleep waiting for state_mutex,
1797                    that can take long */
1798                 if (!mutex_trylock(device->state_mutex)) {
1799                         set_bit(B_RS_H_DONE, &device->flags);
1800                         device->start_resync_timer.expires = jiffies + HZ/5;
1801                         add_timer(&device->start_resync_timer);
1802                         return;
1803                 }
1804         } else {
1805                 mutex_lock(device->state_mutex);
1806         }
1807
1808         lock_all_resources();
1809         clear_bit(B_RS_H_DONE, &device->flags);
1810         /* Did some connection breakage or IO error race with us? */
1811         if (device->state.conn < C_CONNECTED
1812         || !get_ldev_if_state(device, D_NEGOTIATING)) {
1813                 unlock_all_resources();
1814                 goto out;
1815         }
1816
1817         ns = drbd_read_state(device);
1818
1819         ns.aftr_isp = !_drbd_may_sync_now(device);
1820
1821         ns.conn = side;
1822
1823         if (side == C_SYNC_TARGET)
1824                 ns.disk = D_INCONSISTENT;
1825         else /* side == C_SYNC_SOURCE */
1826                 ns.pdsk = D_INCONSISTENT;
1827
1828         r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1829         ns = drbd_read_state(device);
1830
1831         if (ns.conn < C_CONNECTED)
1832                 r = SS_UNKNOWN_ERROR;
1833
1834         if (r == SS_SUCCESS) {
1835                 unsigned long tw = drbd_bm_total_weight(device);
1836                 unsigned long now = jiffies;
1837                 int i;
1838
1839                 device->rs_failed    = 0;
1840                 device->rs_paused    = 0;
1841                 device->rs_same_csum = 0;
1842                 device->rs_last_sect_ev = 0;
1843                 device->rs_total     = tw;
1844                 device->rs_start     = now;
1845                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1846                         device->rs_mark_left[i] = tw;
1847                         device->rs_mark_time[i] = now;
1848                 }
1849                 drbd_pause_after(device);
1850                 /* Forget potentially stale cached per resync extent bit-counts.
1851                  * Open coded drbd_rs_cancel_all(device), we already have IRQs
1852                  * disabled, and know the disk state is ok. */
1853                 spin_lock(&device->al_lock);
1854                 lc_reset(device->resync);
1855                 device->resync_locked = 0;
1856                 device->resync_wenr = LC_FREE;
1857                 spin_unlock(&device->al_lock);
1858         }
1859         unlock_all_resources();
1860
1861         if (r == SS_SUCCESS) {
1862                 wake_up(&device->al_wait); /* for lc_reset() above */
1863                 /* reset rs_last_bcast when a resync or verify is started,
1864                  * to deal with potential jiffies wrap. */
1865                 device->rs_last_bcast = jiffies - HZ;
1866
1867                 drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1868                      drbd_conn_str(ns.conn),
1869                      (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1870                      (unsigned long) device->rs_total);
1871                 if (side == C_SYNC_TARGET) {
1872                         device->bm_resync_fo = 0;
1873                         device->use_csums = use_checksum_based_resync(connection, device);
1874                 } else {
1875                         device->use_csums = false;
1876                 }
1877
1878                 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1879                  * with w_send_oos, or the sync target will get confused as to
1880                  * how much bits to resync.  We cannot do that always, because for an
1881                  * empty resync and protocol < 95, we need to do it here, as we call
1882                  * drbd_resync_finished from here in that case.
1883                  * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1884                  * and from after_state_ch otherwise. */
1885                 if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
1886                         drbd_gen_and_send_sync_uuid(peer_device);
1887
1888                 if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
1889                         /* This still has a race (about when exactly the peers
1890                          * detect connection loss) that can lead to a full sync
1891                          * on next handshake. In 8.3.9 we fixed this with explicit
1892                          * resync-finished notifications, but the fix
1893                          * introduces a protocol change.  Sleeping for some
1894                          * time longer than the ping interval + timeout on the
1895                          * SyncSource, to give the SyncTarget the chance to
1896                          * detect connection loss, then waiting for a ping
1897                          * response (implicit in drbd_resync_finished) reduces
1898                          * the race considerably, but does not solve it. */
1899                         if (side == C_SYNC_SOURCE) {
1900                                 struct net_conf *nc;
1901                                 int timeo;
1902
1903                                 rcu_read_lock();
1904                                 nc = rcu_dereference(connection->net_conf);
1905                                 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1906                                 rcu_read_unlock();
1907                                 schedule_timeout_interruptible(timeo);
1908                         }
1909                         drbd_resync_finished(device);
1910                 }
1911
1912                 drbd_rs_controller_reset(device);
1913                 /* ns.conn may already be != device->state.conn,
1914                  * we may have been paused in between, or become paused until
1915                  * the timer triggers.
1916                  * No matter, that is handled in resync_timer_fn() */
1917                 if (ns.conn == C_SYNC_TARGET)
1918                         mod_timer(&device->resync_timer, jiffies);
1919
1920                 drbd_md_sync(device);
1921         }
1922         put_ldev(device);
1923 out:
1924         mutex_unlock(device->state_mutex);
1925 }
1926
1927 static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
1928 {
1929         struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
1930         device->rs_last_bcast = jiffies;
1931
1932         if (!get_ldev(device))
1933                 return;
1934
1935         drbd_bm_write_lazy(device, 0);
1936         if (resync_done && is_sync_state(device->state.conn))
1937                 drbd_resync_finished(device);
1938
1939         drbd_bcast_event(device, &sib);
1940         /* update timestamp, in case it took a while to write out stuff */
1941         device->rs_last_bcast = jiffies;
1942         put_ldev(device);
1943 }
1944
1945 static void drbd_ldev_destroy(struct drbd_device *device)
1946 {
1947         lc_destroy(device->resync);
1948         device->resync = NULL;
1949         lc_destroy(device->act_log);
1950         device->act_log = NULL;
1951
1952         __acquire(local);
1953         drbd_backing_dev_free(device, device->ldev);
1954         device->ldev = NULL;
1955         __release(local);
1956
1957         clear_bit(GOING_DISKLESS, &device->flags);
1958         wake_up(&device->misc_wait);
1959 }
1960
1961 static void go_diskless(struct drbd_device *device)
1962 {
1963         D_ASSERT(device, device->state.disk == D_FAILED);
1964         /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
1965          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
1966          * the protected members anymore, though, so once put_ldev reaches zero
1967          * again, it will be safe to free them. */
1968
1969         /* Try to write changed bitmap pages, read errors may have just
1970          * set some bits outside the area covered by the activity log.
1971          *
1972          * If we have an IO error during the bitmap writeout,
1973          * we will want a full sync next time, just in case.
1974          * (Do we want a specific meta data flag for this?)
1975          *
1976          * If that does not make it to stable storage either,
1977          * we cannot do anything about that anymore.
1978          *
1979          * We still need to check if both bitmap and ldev are present, we may
1980          * end up here after a failed attach, before ldev was even assigned.
1981          */
1982         if (device->bitmap && device->ldev) {
1983                 /* An interrupted resync or similar is allowed to recounts bits
1984                  * while we detach.
1985                  * Any modifications would not be expected anymore, though.
1986                  */
1987                 if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
1988                                         "detach", BM_LOCKED_TEST_ALLOWED)) {
1989                         if (test_bit(WAS_READ_ERROR, &device->flags)) {
1990                                 drbd_md_set_flag(device, MDF_FULL_SYNC);
1991                                 drbd_md_sync(device);
1992                         }
1993                 }
1994         }
1995
1996         drbd_force_state(device, NS(disk, D_DISKLESS));
1997 }
1998
/*
 * do_md_sync() - device work for MD_SYNC
 *
 * Logs that the md_sync_timer expired and has the worker call
 * drbd_md_sync().  Always returns 0.
 */
static int do_md_sync(struct drbd_device *device)
{
        drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");

        drbd_md_sync(device);

        return 0;
}
2005
2006 /* only called from drbd_worker thread, no locking */
2007 void __update_timing_details(
2008                 struct drbd_thread_timing_details *tdp,
2009                 unsigned int *cb_nr,
2010                 void *cb,
2011                 const char *fn, const unsigned int line)
2012 {
2013         unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
2014         struct drbd_thread_timing_details *td = tdp + i;
2015
2016         td->start_jif = jiffies;
2017         td->cb_addr = cb;
2018         td->caller_fn = fn;
2019         td->line = line;
2020         td->cb_nr = *cb_nr;
2021
2022         i = (i+1) % DRBD_THREAD_DETAILS_HIST;
2023         td = tdp + i;
2024         memset(td, 0, sizeof(*td));
2025
2026         ++(*cb_nr);
2027 }
2028
2029 static void do_device_work(struct drbd_device *device, const unsigned long todo)
2030 {
2031         if (test_bit(MD_SYNC, &todo))
2032                 do_md_sync(device);
2033         if (test_bit(RS_DONE, &todo) ||
2034             test_bit(RS_PROGRESS, &todo))
2035                 update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
2036         if (test_bit(GO_DISKLESS, &todo))
2037                 go_diskless(device);
2038         if (test_bit(DESTROY_DISK, &todo))
2039                 drbd_ldev_destroy(device);
2040         if (test_bit(RS_START, &todo))
2041                 do_start_resync(device);
2042 }
2043
/*
 * All bits in device->flags that request device work from the worker:
 * get_work_bits() fetches and clears them, do_device_work() acts on them.
 */
#define DRBD_DEVICE_WORK_MASK           \
        ((1UL << MD_SYNC) |             \
         (1UL << RS_DONE) |             \
         (1UL << RS_PROGRESS) |         \
         (1UL << GO_DISKLESS) |         \
         (1UL << DESTROY_DISK) |        \
         (1UL << RS_START))
2052
2053 static unsigned long get_work_bits(unsigned long *flags)
2054 {
2055         unsigned long old, new;
2056         do {
2057                 old = *flags;
2058                 new = old & ~DRBD_DEVICE_WORK_MASK;
2059         } while (cmpxchg(flags, old, new) != old);
2060         return old & DRBD_DEVICE_WORK_MASK;
2061 }
2062
2063 static void do_unqueued_work(struct drbd_connection *connection)
2064 {
2065         struct drbd_peer_device *peer_device;
2066         int vnr;
2067
2068         rcu_read_lock();
2069         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2070                 struct drbd_device *device = peer_device->device;
2071                 unsigned long todo = get_work_bits(&device->flags);
2072                 if (!todo)
2073                         continue;
2074
2075                 kref_get(&device->kref);
2076                 rcu_read_unlock();
2077                 do_device_work(device, todo);
2078                 kref_put(&device->kref, drbd_destroy_device);
2079                 rcu_read_lock();
2080         }
2081         rcu_read_unlock();
2082 }
2083
2084 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
2085 {
2086         spin_lock_irq(&queue->q_lock);
2087         list_splice_tail_init(&queue->q, work_list);
2088         spin_unlock_irq(&queue->q_lock);
2089         return !list_empty(work_list);
2090 }
2091
2092 static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
2093 {
2094         DEFINE_WAIT(wait);
2095         struct net_conf *nc;
2096         int uncork, cork;
2097
2098         dequeue_work_batch(&connection->sender_work, work_list);
2099         if (!list_empty(work_list))
2100                 return;
2101
2102         /* Still nothing to do?
2103          * Maybe we still need to close the current epoch,
2104          * even if no new requests are queued yet.
2105          *
2106          * Also, poke TCP, just in case.
2107          * Then wait for new work (or signal). */
2108         rcu_read_lock();
2109         nc = rcu_dereference(connection->net_conf);
2110         uncork = nc ? nc->tcp_cork : 0;
2111         rcu_read_unlock();
2112         if (uncork) {
2113                 mutex_lock(&connection->data.mutex);
2114                 if (connection->data.socket)
2115                         drbd_tcp_uncork(connection->data.socket);
2116                 mutex_unlock(&connection->data.mutex);
2117         }
2118
2119         for (;;) {
2120                 int send_barrier;
2121                 prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
2122                 spin_lock_irq(&connection->resource->req_lock);
2123                 spin_lock(&connection->sender_work.q_lock);     /* FIXME get rid of this one? */
2124                 if (!list_empty(&connection->sender_work.q))
2125                         list_splice_tail_init(&connection->sender_work.q, work_list);
2126                 spin_unlock(&connection->sender_work.q_lock);   /* FIXME get rid of this one? */
2127                 if (!list_empty(work_list) || signal_pending(current)) {
2128                         spin_unlock_irq(&connection->resource->req_lock);
2129                         break;
2130                 }
2131
2132                 /* We found nothing new to do, no to-be-communicated request,
2133                  * no other work item.  We may still need to close the last
2134                  * epoch.  Next incoming request epoch will be connection ->
2135                  * current transfer log epoch number.  If that is different
2136                  * from the epoch of the last request we communicated, it is
2137                  * safe to send the epoch separating barrier now.
2138                  */
2139                 send_barrier =
2140                         atomic_read(&connection->current_tle_nr) !=
2141                         connection->send.current_epoch_nr;
2142                 spin_unlock_irq(&connection->resource->req_lock);
2143
2144                 if (send_barrier)
2145                         maybe_send_barrier(connection,
2146                                         connection->send.current_epoch_nr + 1);
2147
2148                 if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
2149                         break;
2150
2151                 /* drbd_send() may have called flush_signals() */
2152                 if (get_t_state(&connection->worker) != RUNNING)
2153                         break;
2154
2155                 schedule();
2156                 /* may be woken up for other things but new work, too,
2157                  * e.g. if the current epoch got closed.
2158                  * In which case we send the barrier above. */
2159         }
2160         finish_wait(&connection->sender_work.q_wait, &wait);
2161
2162         /* someone may have changed the config while we have been waiting above. */
2163         rcu_read_lock();
2164         nc = rcu_dereference(connection->net_conf);
2165         cork = nc ? nc->tcp_cork : 0;
2166         rcu_read_unlock();
2167         mutex_lock(&connection->data.mutex);
2168         if (connection->data.socket) {
2169                 if (cork)
2170                         drbd_tcp_cork(connection->data.socket);
2171                 else if (!uncork)
2172                         drbd_tcp_uncork(connection->data.socket);
2173         }
2174         mutex_unlock(&connection->data.mutex);
2175 }
2176
2177 int drbd_worker(struct drbd_thread *thi)
2178 {
2179         struct drbd_connection *connection = thi->connection;
2180         struct drbd_work *w = NULL;
2181         struct drbd_peer_device *peer_device;
2182         LIST_HEAD(work_list);
2183         int vnr;
2184
2185         while (get_t_state(thi) == RUNNING) {
2186                 drbd_thread_current_set_cpu(thi);
2187
2188                 if (list_empty(&work_list)) {
2189                         update_worker_timing_details(connection, wait_for_work);
2190                         wait_for_work(connection, &work_list);
2191                 }
2192
2193                 if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2194                         update_worker_timing_details(connection, do_unqueued_work);
2195                         do_unqueued_work(connection);
2196                 }
2197
2198                 if (signal_pending(current)) {
2199                         flush_signals(current);
2200                         if (get_t_state(thi) == RUNNING) {
2201                                 drbd_warn(connection, "Worker got an unexpected signal\n");
2202                                 continue;
2203                         }
2204                         break;
2205                 }
2206
2207                 if (get_t_state(thi) != RUNNING)
2208                         break;
2209
2210                 if (!list_empty(&work_list)) {
2211                         w = list_first_entry(&work_list, struct drbd_work, list);
2212                         list_del_init(&w->list);
2213                         update_worker_timing_details(connection, w->cb);
2214                         if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
2215                                 continue;
2216                         if (connection->cstate >= C_WF_REPORT_PARAMS)
2217                                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
2218                 }
2219         }
2220
2221         do {
2222                 if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2223                         update_worker_timing_details(connection, do_unqueued_work);
2224                         do_unqueued_work(connection);
2225                 }
2226                 if (!list_empty(&work_list)) {
2227                         w = list_first_entry(&work_list, struct drbd_work, list);
2228                         list_del_init(&w->list);
2229                         update_worker_timing_details(connection, w->cb);
2230                         w->cb(w, 1);
2231                 } else
2232                         dequeue_work_batch(&connection->sender_work, &work_list);
2233         } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
2234
2235         rcu_read_lock();
2236         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
2237                 struct drbd_device *device = peer_device->device;
2238                 D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
2239                 kref_get(&device->kref);
2240                 rcu_read_unlock();
2241                 drbd_device_cleanup(device);
2242                 kref_put(&device->kref, drbd_destroy_device);
2243                 rcu_read_lock();
2244         }
2245         rcu_read_unlock();
2246
2247         return 0;
2248 }