net/ceph/mon_client.c

   1 #include <linux/ceph/ceph_debug.h>
   2
   3 #include <linux/module.h>
   4 #include <linux/types.h>
   5 #include <linux/slab.h>
   6 #include <linux/random.h>
   7 #include <linux/sched.h>
   8
   9 #include <linux/ceph/mon_client.h>
  10 #include <linux/ceph/libceph.h>
  11 #include <linux/ceph/debugfs.h>
  12 #include <linux/ceph/decode.h>
  13 #include <linux/ceph/auth.h>
  14
  15 /*
  16  * Interact with Ceph monitor cluster.  Handle requests for new map
  17  * versions, and periodically resend as needed.  Also implement
  18  * statfs() and umount().
  19  *
  20  * A small cluster of Ceph "monitors" are responsible for managing critical
  21  * cluster configuration and state information.  An odd number (e.g., 3, 5)
  22  * of cmon daemons use a modified version of the Paxos part-time parliament
  23  * algorithm to manage the MDS map (mds cluster membership), OSD map, and
  24  * list of clients who have mounted the file system.
  25  *
  26  * We maintain an open, active session with a monitor at all times in order to
  27  * receive timely MDSMap updates.  We periodically send a keepalive byte on the
  28  * TCP socket to ensure we detect a failure.  If the connection does break, we
  29  * randomly hunt for a new monitor.  Once the connection is reestablished, we
  30  * resend any outstanding requests.
  31  */
  32
  33 static const struct ceph_connection_operations mon_con_ops;
  34
  35 static int __validate_auth(struct ceph_mon_client *monc);
  36
  37 /*
  38  * Decode a monmap blob (e.g., during mount).
  39  */
  40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
  41 {
  42         struct ceph_monmap *m = NULL;
  43         int i, err = -EINVAL;
  44         struct ceph_fsid fsid;
  45         u32 epoch, num_mon;
  46         u16 version;
  47         u32 len;
  48
  49         ceph_decode_32_safe(&p, end, len, bad);
  50         ceph_decode_need(&p, end, len, bad);
  51
  52         dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
  53
  54         ceph_decode_16_safe(&p, end, version, bad);
  55
  56         ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
  57         ceph_decode_copy(&p, &fsid, sizeof(fsid));
  58         epoch = ceph_decode_32(&p);
  59
  60         num_mon = ceph_decode_32(&p);
  61         ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
  62
  63         if (num_mon >= CEPH_MAX_MON)
  64                 goto bad;
  65         m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
  66         if (m == NULL)
  67                 return ERR_PTR(-ENOMEM);
  68         m->fsid = fsid;
  69         m->epoch = epoch;
  70         m->num_mon = num_mon;
  71         ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
  72         for (i = 0; i < num_mon; i++)
  73                 ceph_decode_addr(&m->mon_inst[i].addr);
  74
  75         dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
  76              m->num_mon);
  77         for (i = 0; i < m->num_mon; i++)
  78                 dout("monmap_decode  mon%d is %s\n", i,
  79                      ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
  80         return m;
  81
  82 bad:
  83         dout("monmap_decode failed with %d\n", err);
  84         kfree(m);
  85         return ERR_PTR(err);
  86 }
  87
  88 /*
  89  * return true if *addr is included in the monmap.
  90  */
  91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
  92 {
  93         int i;
  94
  95         for (i = 0; i < m->num_mon; i++)
  96                 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
  97                         return 1;
  98         return 0;
  99 }
 100
 101 /*
 102  * Send an auth request.
 103  */
 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 105 {
 106         monc->pending_auth = 1;
 107         monc->m_auth->front.iov_len = len;
 108         monc->m_auth->hdr.front_len = cpu_to_le32(len);
 109         ceph_msg_revoke(monc->m_auth);
 110         ceph_msg_get(monc->m_auth);  /* keep our ref */
 111         ceph_con_send(&monc->con, monc->m_auth);
 112 }
 113
 114 /*
 115  * Close monitor session, if any.
 116  */
 117 static void __close_session(struct ceph_mon_client *monc)
 118 {
 119         dout("__close_session closing mon%d\n", monc->cur_mon);
 120         ceph_msg_revoke(monc->m_auth);
 121         ceph_msg_revoke_incoming(monc->m_auth_reply);
 122         ceph_msg_revoke(monc->m_subscribe);
 123         ceph_msg_revoke_incoming(monc->m_subscribe_ack);
 124         ceph_con_close(&monc->con);
 125         monc->cur_mon = -1;
 126         monc->pending_auth = 0;
 127         ceph_auth_reset(monc->auth);
 128 }
 129
 130 /*
 131  * Open a session with a (new) monitor.
 132  */
 133 static int __open_session(struct ceph_mon_client *monc)
 134 {
 135         char r;
 136         int ret;
 137
 138         if (monc->cur_mon < 0) {
 139                 get_random_bytes(&r, 1);
 140                 monc->cur_mon = r % monc->monmap->num_mon;
 141                 dout("open_session num=%d r=%d -> mon%d\n",
 142                      monc->monmap->num_mon, r, monc->cur_mon);
 143                 monc->sub_sent = 0;
 144                 monc->sub_renew_after = jiffies;  /* i.e., expired */
 145                 monc->want_next_osdmap = !!monc->want_next_osdmap;
 146
 147                 dout("open_session mon%d opening\n", monc->cur_mon);
 148                 ceph_con_open(&monc->con,
 149                               CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 150                               &monc->monmap->mon_inst[monc->cur_mon].addr);
 151
 152                 /* initiatiate authentication handshake */
 153                 ret = ceph_auth_build_hello(monc->auth,
 154                                             monc->m_auth->front.iov_base,
 155                                             monc->m_auth->front_alloc_len);
 156                 __send_prepared_auth_request(monc, ret);
 157         } else {
 158                 dout("open_session mon%d already open\n", monc->cur_mon);
 159         }
 160         return 0;
 161 }
 162
 163 static bool __sub_expired(struct ceph_mon_client *monc)
 164 {
 165         return time_after_eq(jiffies, monc->sub_renew_after);
 166 }
 167
 168 /*
 169  * Reschedule delayed work timer.
 170  */
 171 static void __schedule_delayed(struct ceph_mon_client *monc)
 172 {
 173         unsigned int delay;
 174
 175         if (monc->cur_mon < 0 || __sub_expired(monc))
 176                 delay = 10 * HZ;
 177         else
 178                 delay = 20 * HZ;
 179         dout("__schedule_delayed after %u\n", delay);
 180         schedule_delayed_work(&monc->delayed_work, delay);
 181 }
 182
 183 /*
 184  * Send subscribe request for mdsmap and/or osdmap.
 185  */
 186 static void __send_subscribe(struct ceph_mon_client *monc)
 187 {
 188         dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
 189              (unsigned int)monc->sub_sent, __sub_expired(monc),
 190              monc->want_next_osdmap);
 191         if ((__sub_expired(monc) && !monc->sub_sent) ||
 192             monc->want_next_osdmap == 1) {
 193                 struct ceph_msg *msg = monc->m_subscribe;
 194                 struct ceph_mon_subscribe_item *i;
 195                 void *p, *end;
 196                 int num;
 197
 198                 p = msg->front.iov_base;
 199                 end = p + msg->front_alloc_len;
 200
 201                 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
 202                 ceph_encode_32(&p, num);
 203
 204                 if (monc->want_next_osdmap) {
 205                         dout("__send_subscribe to 'osdmap' %u\n",
 206                              (unsigned int)monc->have_osdmap);
 207                         ceph_encode_string(&p, end, "osdmap", 6);
 208                         i = p;
 209                         i->have = cpu_to_le64(monc->have_osdmap);
 210                         i->onetime = 1;
 211                         p += sizeof(*i);
 212                         monc->want_next_osdmap = 2;  /* requested */
 213                 }
 214                 if (monc->want_mdsmap) {
 215                         dout("__send_subscribe to 'mdsmap' %u+\n",
 216                              (unsigned int)monc->have_mdsmap);
 217                         ceph_encode_string(&p, end, "mdsmap", 6);
 218                         i = p;
 219                         i->have = cpu_to_le64(monc->have_mdsmap);
 220                         i->onetime = 0;
 221                         p += sizeof(*i);
 222                 }
 223                 ceph_encode_string(&p, end, "monmap", 6);
 224                 i = p;
 225                 i->have = 0;
 226                 i->onetime = 0;
 227                 p += sizeof(*i);
 228
 229                 msg->front.iov_len = p - msg->front.iov_base;
 230                 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 231                 ceph_msg_revoke(msg);
 232                 ceph_con_send(&monc->con, ceph_msg_get(msg));
 233
 234                 monc->sub_sent = jiffies | 1;  /* never 0 */
 235         }
 236 }
 237
 238 static void handle_subscribe_ack(struct ceph_mon_client *monc,
 239                                  struct ceph_msg *msg)
 240 {
 241         unsigned int seconds;
 242         struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
 243
 244         if (msg->front.iov_len < sizeof(*h))
 245                 goto bad;
 246         seconds = le32_to_cpu(h->duration);
 247
 248         mutex_lock(&monc->mutex);
 249         if (monc->hunting) {
 250                 pr_info("mon%d %s session established\n",
 251                         monc->cur_mon,
 252                         ceph_pr_addr(&monc->con.peer_addr.in_addr));
 253                 monc->hunting = false;
 254         }
 255         dout("handle_subscribe_ack after %d seconds\n", seconds);
 256         monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
 257         monc->sub_sent = 0;
 258         mutex_unlock(&monc->mutex);
 259         return;
 260 bad:
 261         pr_err("got corrupt subscribe-ack msg\n");
 262         ceph_msg_dump(msg);
 263 }
 264
 265 /*
 266  * Keep track of which maps we have
 267  */
 268 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
 269 {
 270         mutex_lock(&monc->mutex);
 271         monc->have_mdsmap = got;
 272         mutex_unlock(&monc->mutex);
 273         return 0;
 274 }
 275 EXPORT_SYMBOL(ceph_monc_got_mdsmap);
 276
 277 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
 278 {
 279         mutex_lock(&monc->mutex);
 280         monc->have_osdmap = got;
 281         monc->want_next_osdmap = 0;
 282         mutex_unlock(&monc->mutex);
 283         return 0;
 284 }
 285
 286 /*
 287  * Register interest in the next osdmap
 288  */
 289 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 290 {
 291         dout("request_next_osdmap have %u\n", monc->have_osdmap);
 292         mutex_lock(&monc->mutex);
 293         if (!monc->want_next_osdmap)
 294                 monc->want_next_osdmap = 1;
 295         if (monc->want_next_osdmap < 2)
 296                 __send_subscribe(monc);
 297         mutex_unlock(&monc->mutex);
 298 }
 299 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 300
 301 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 302                           unsigned long timeout)
 303 {
 304         unsigned long started = jiffies;
 305         int ret;
 306
 307         mutex_lock(&monc->mutex);
 308         while (monc->have_osdmap < epoch) {
 309                 mutex_unlock(&monc->mutex);
 310
 311                 if (timeout != 0 && time_after_eq(jiffies, started + timeout))
 312                         return -ETIMEDOUT;
 313
 314                 ret = wait_event_interruptible_timeout(monc->client->auth_wq,
 315                                          monc->have_osdmap >= epoch, timeout);
 316                 if (ret < 0)
 317                         return ret;
 318
 319                 mutex_lock(&monc->mutex);
 320         }
 321
 322         mutex_unlock(&monc->mutex);
 323         return 0;
 324 }
 325 EXPORT_SYMBOL(ceph_monc_wait_osdmap);
 326
 327 /*
 328  *
 329  */
 330 int ceph_monc_open_session(struct ceph_mon_client *monc)
 331 {
 332         mutex_lock(&monc->mutex);
 333         __open_session(monc);
 334         __schedule_delayed(monc);
 335         mutex_unlock(&monc->mutex);
 336         return 0;
 337 }
 338 EXPORT_SYMBOL(ceph_monc_open_session);
 339
 340 /*
 341  * We require the fsid and global_id in order to initialize our
 342  * debugfs dir.
 343  */
 344 static bool have_debugfs_info(struct ceph_mon_client *monc)
 345 {
 346         dout("have_debugfs_info fsid %d globalid %lld\n",
 347              (int)monc->client->have_fsid, monc->auth->global_id);
 348         return monc->client->have_fsid && monc->auth->global_id > 0;
 349 }
 350
 351 /*
 352  * The monitor responds with mount ack indicate mount success.  The
 353  * included client ticket allows the client to talk to MDSs and OSDs.
 354  */
 355 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 356                                  struct ceph_msg *msg)
 357 {
 358         struct ceph_client *client = monc->client;
 359         struct ceph_monmap *monmap = NULL, *old = monc->monmap;
 360         void *p, *end;
 361         int had_debugfs_info, init_debugfs = 0;
 362
 363         mutex_lock(&monc->mutex);
 364
 365         had_debugfs_info = have_debugfs_info(monc);
 366
 367         dout("handle_monmap\n");
 368         p = msg->front.iov_base;
 369         end = p + msg->front.iov_len;
 370
 371         monmap = ceph_monmap_decode(p, end);
 372         if (IS_ERR(monmap)) {
 373                 pr_err("problem decoding monmap, %d\n",
 374                        (int)PTR_ERR(monmap));
 375                 goto out;
 376         }
 377
 378         if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
 379                 kfree(monmap);
 380                 goto out;
 381         }
 382
 383         client->monc.monmap = monmap;
 384         kfree(old);
 385
 386         if (!client->have_fsid) {
 387                 client->have_fsid = true;
 388                 if (!had_debugfs_info && have_debugfs_info(monc)) {
 389                         pr_info("client%lld fsid %pU\n",
 390                                 ceph_client_id(monc->client),
 391                                 &monc->client->fsid);
 392                         init_debugfs = 1;
 393                 }
 394                 mutex_unlock(&monc->mutex);
 395
 396                 if (init_debugfs) {
 397                         /*
 398                          * do debugfs initialization without mutex to avoid
 399                          * creating a locking dependency
 400                          */
 401                         ceph_debugfs_client_init(monc->client);
 402                 }
 403
 404                 goto out_unlocked;
 405         }
 406 out:
 407         mutex_unlock(&monc->mutex);
 408 out_unlocked:
 409         wake_up_all(&client->auth_wq);
 410 }
 411
 412 /*
 413  * generic requests (e.g., statfs, poolop)
 414  */
 415 static struct ceph_mon_generic_request *__lookup_generic_req(
 416         struct ceph_mon_client *monc, u64 tid)
 417 {
 418         struct ceph_mon_generic_request *req;
 419         struct rb_node *n = monc->generic_request_tree.rb_node;
 420
 421         while (n) {
 422                 req = rb_entry(n, struct ceph_mon_generic_request, node);
 423                 if (tid < req->tid)
 424                         n = n->rb_left;
 425                 else if (tid > req->tid)
 426                         n = n->rb_right;
 427                 else
 428                         return req;
 429         }
 430         return NULL;
 431 }
 432
 433 static void __insert_generic_request(struct ceph_mon_client *monc,
 434                             struct ceph_mon_generic_request *new)
 435 {
 436         struct rb_node **p = &monc->generic_request_tree.rb_node;
 437         struct rb_node *parent = NULL;
 438         struct ceph_mon_generic_request *req = NULL;
 439
 440         while (*p) {
 441                 parent = *p;
 442                 req = rb_entry(parent, struct ceph_mon_generic_request, node);
 443                 if (new->tid < req->tid)
 444                         p = &(*p)->rb_left;
 445                 else if (new->tid > req->tid)
 446                         p = &(*p)->rb_right;
 447                 else
 448                         BUG();
 449         }
 450
 451         rb_link_node(&new->node, parent, p);
 452         rb_insert_color(&new->node, &monc->generic_request_tree);
 453 }
 454
 455 static void release_generic_request(struct kref *kref)
 456 {
 457         struct ceph_mon_generic_request *req =
 458                 container_of(kref, struct ceph_mon_generic_request, kref);
 459
 460         if (req->reply)
 461                 ceph_msg_put(req->reply);
 462         if (req->request)
 463                 ceph_msg_put(req->request);
 464
 465         kfree(req);
 466 }
 467
 468 static void put_generic_request(struct ceph_mon_generic_request *req)
 469 {
 470         kref_put(&req->kref, release_generic_request);
 471 }
 472
 473 static void get_generic_request(struct ceph_mon_generic_request *req)
 474 {
 475         kref_get(&req->kref);
 476 }
 477
 478 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 479                                          struct ceph_msg_header *hdr,
 480                                          int *skip)
 481 {
 482         struct ceph_mon_client *monc = con->private;
 483         struct ceph_mon_generic_request *req;
 484         u64 tid = le64_to_cpu(hdr->tid);
 485         struct ceph_msg *m;
 486
 487         mutex_lock(&monc->mutex);
 488         req = __lookup_generic_req(monc, tid);
 489         if (!req) {
 490                 dout("get_generic_reply %lld dne\n", tid);
 491                 *skip = 1;
 492                 m = NULL;
 493         } else {
 494                 dout("get_generic_reply %lld got %p\n", tid, req->reply);
 495                 *skip = 0;
 496                 m = ceph_msg_get(req->reply);
 497                 /*
 498                  * we don't need to track the connection reading into
 499                  * this reply because we only have one open connection
 500                  * at a time, ever.
 501                  */
 502         }
 503         mutex_unlock(&monc->mutex);
 504         return m;
 505 }
 506
 507 static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
 508                                 struct ceph_mon_generic_request *req)
 509 {
 510         int err;
 511
 512         /* register request */
 513         req->tid = tid != 0 ? tid : ++monc->last_tid;
 514         req->request->hdr.tid = cpu_to_le64(req->tid);
 515         __insert_generic_request(monc, req);
 516         monc->num_generic_requests++;
 517         ceph_con_send(&monc->con, ceph_msg_get(req->request));
 518         mutex_unlock(&monc->mutex);
 519
 520         err = wait_for_completion_interruptible(&req->completion);
 521
 522         mutex_lock(&monc->mutex);
 523         rb_erase(&req->node, &monc->generic_request_tree);
 524         monc->num_generic_requests--;
 525
 526         if (!err)
 527                 err = req->result;
 528         return err;
 529 }
 530
 531 static int do_generic_request(struct ceph_mon_client *monc,
 532                               struct ceph_mon_generic_request *req)
 533 {
 534         int err;
 535
 536         mutex_lock(&monc->mutex);
 537         err = __do_generic_request(monc, 0, req);
 538         mutex_unlock(&monc->mutex);
 539
 540         return err;
 541 }
 542
 543 /*
 544  * statfs
 545  */
 546 static void handle_statfs_reply(struct ceph_mon_client *monc,
 547                                 struct ceph_msg *msg)
 548 {
 549         struct ceph_mon_generic_request *req;
 550         struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 551         u64 tid = le64_to_cpu(msg->hdr.tid);
 552
 553         if (msg->front.iov_len != sizeof(*reply))
 554                 goto bad;
 555         dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 556
 557         mutex_lock(&monc->mutex);
 558         req = __lookup_generic_req(monc, tid);
 559         if (req) {
 560                 *(struct ceph_statfs *)req->buf = reply->st;
 561                 req->result = 0;
 562                 get_generic_request(req);
 563         }
 564         mutex_unlock(&monc->mutex);
 565         if (req) {
 566                 complete_all(&req->completion);
 567                 put_generic_request(req);
 568         }
 569         return;
 570
 571 bad:
 572         pr_err("corrupt generic reply, tid %llu\n", tid);
 573         ceph_msg_dump(msg);
 574 }
 575
 576 /*
 577  * Do a synchronous statfs().
 578  */
 579 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 580 {
 581         struct ceph_mon_generic_request *req;
 582         struct ceph_mon_statfs *h;
 583         int err;
 584
 585         req = kzalloc(sizeof(*req), GFP_NOFS);
 586         if (!req)
 587                 return -ENOMEM;
 588
 589         kref_init(&req->kref);
 590         req->buf = buf;
 591         req->buf_len = sizeof(*buf);
 592         init_completion(&req->completion);
 593
 594         err = -ENOMEM;
 595         req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
 596                                     true);
 597         if (!req->request)
 598                 goto out;
 599         req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
 600                                   true);
 601         if (!req->reply)
 602                 goto out;
 603
 604         /* fill out request */
 605         h = req->request->front.iov_base;
 606         h->monhdr.have_version = 0;
 607         h->monhdr.session_mon = cpu_to_le16(-1);
 608         h->monhdr.session_mon_tid = 0;
 609         h->fsid = monc->monmap->fsid;
 610
 611         err = do_generic_request(monc, req);
 612
 613 out:
 614         kref_put(&req->kref, release_generic_request);
 615         return err;
 616 }
 617 EXPORT_SYMBOL(ceph_monc_do_statfs);
 618
 619 static void handle_get_version_reply(struct ceph_mon_client *monc,
 620                                      struct ceph_msg *msg)
 621 {
 622         struct ceph_mon_generic_request *req;
 623         u64 tid = le64_to_cpu(msg->hdr.tid);
 624         void *p = msg->front.iov_base;
 625         void *end = p + msg->front_alloc_len;
 626         u64 handle;
 627
 628         dout("%s %p tid %llu\n", __func__, msg, tid);
 629
 630         ceph_decode_need(&p, end, 2*sizeof(u64), bad);
 631         handle = ceph_decode_64(&p);
 632         if (tid != 0 && tid != handle)
 633                 goto bad;
 634
 635         mutex_lock(&monc->mutex);
 636         req = __lookup_generic_req(monc, handle);
 637         if (req) {
 638                 *(u64 *)req->buf = ceph_decode_64(&p);
 639                 req->result = 0;
 640                 get_generic_request(req);
 641         }
 642         mutex_unlock(&monc->mutex);
 643         if (req) {
 644                 complete_all(&req->completion);
 645                 put_generic_request(req);
 646         }
 647
 648         return;
 649 bad:
 650         pr_err("corrupt mon_get_version reply\n");
 651         ceph_msg_dump(msg);
 652 }
 653
 654 /*
 655  * Send MMonGetVersion and wait for the reply.
 656  *
 657  * @what: one of "mdsmap", "osdmap" or "monmap"
 658  */
 659 int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
 660                              u64 *newest)
 661 {
 662         struct ceph_mon_generic_request *req;
 663         void *p, *end;
 664         u64 tid;
 665         int err;
 666
 667         req = kzalloc(sizeof(*req), GFP_NOFS);
 668         if (!req)
 669                 return -ENOMEM;
 670
 671         kref_init(&req->kref);
 672         req->buf = newest;
 673         req->buf_len = sizeof(*newest);
 674         init_completion(&req->completion);
 675
 676         req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
 677                                     sizeof(u64) + sizeof(u32) + strlen(what),
 678                                     GFP_NOFS, true);
 679         if (!req->request) {
 680                 err = -ENOMEM;
 681                 goto out;
 682         }
 683
 684         req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
 685                                   GFP_NOFS, true);
 686         if (!req->reply) {
 687                 err = -ENOMEM;
 688                 goto out;
 689         }
 690
 691         p = req->request->front.iov_base;
 692         end = p + req->request->front_alloc_len;
 693
 694         /* fill out request */
 695         mutex_lock(&monc->mutex);
 696         tid = ++monc->last_tid;
 697         ceph_encode_64(&p, tid); /* handle */
 698         ceph_encode_string(&p, end, what, strlen(what));
 699
 700         err = __do_generic_request(monc, tid, req);
 701
 702         mutex_unlock(&monc->mutex);
 703 out:
 704         kref_put(&req->kref, release_generic_request);
 705         return err;
 706 }
 707 EXPORT_SYMBOL(ceph_monc_do_get_version);
 708
 709 /*
 710  * pool ops
 711  */
 712 static int get_poolop_reply_buf(const char *src, size_t src_len,
 713                                 char *dst, size_t dst_len)
 714 {
 715         u32 buf_len;
 716
 717         if (src_len != sizeof(u32) + dst_len)
 718                 return -EINVAL;
 719
 720         buf_len = le32_to_cpu(*(u32 *)src);
 721         if (buf_len != dst_len)
 722                 return -EINVAL;
 723
 724         memcpy(dst, src + sizeof(u32), dst_len);
 725         return 0;
 726 }
 727
 728 static void handle_poolop_reply(struct ceph_mon_client *monc,
 729                                 struct ceph_msg *msg)
 730 {
 731         struct ceph_mon_generic_request *req;
 732         struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
 733         u64 tid = le64_to_cpu(msg->hdr.tid);
 734
 735         if (msg->front.iov_len < sizeof(*reply))
 736                 goto bad;
 737         dout("handle_poolop_reply %p tid %llu\n", msg, tid);
 738
 739         mutex_lock(&monc->mutex);
 740         req = __lookup_generic_req(monc, tid);
 741         if (req) {
 742                 if (req->buf_len &&
 743                     get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
 744                                      msg->front.iov_len - sizeof(*reply),
 745                                      req->buf, req->buf_len) < 0) {
 746                         mutex_unlock(&monc->mutex);
 747                         goto bad;
 748                 }
 749                 req->result = le32_to_cpu(reply->reply_code);
 750                 get_generic_request(req);
 751         }
 752         mutex_unlock(&monc->mutex);
 753         if (req) {
 754                 complete(&req->completion);
 755                 put_generic_request(req);
 756         }
 757         return;
 758
 759 bad:
 760         pr_err("corrupt generic reply, tid %llu\n", tid);
 761         ceph_msg_dump(msg);
 762 }
 763
 764 /*
 765  * Do a synchronous pool op.
 766  */
 767 static int do_poolop(struct ceph_mon_client *monc, u32 op,
 768                         u32 pool, u64 snapid,
 769                         char *buf, int len)
 770 {
 771         struct ceph_mon_generic_request *req;
 772         struct ceph_mon_poolop *h;
 773         int err;
 774
 775         req = kzalloc(sizeof(*req), GFP_NOFS);
 776         if (!req)
 777                 return -ENOMEM;
 778
 779         kref_init(&req->kref);
 780         req->buf = buf;
 781         req->buf_len = len;
 782         init_completion(&req->completion);
 783
 784         err = -ENOMEM;
 785         req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
 786                                     true);
 787         if (!req->request)
 788                 goto out;
 789         req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
 790                                   true);
 791         if (!req->reply)
 792                 goto out;
 793
 794         /* fill out request */
 795         req->request->hdr.version = cpu_to_le16(2);
 796         h = req->request->front.iov_base;
 797         h->monhdr.have_version = 0;
 798         h->monhdr.session_mon = cpu_to_le16(-1);
 799         h->monhdr.session_mon_tid = 0;
 800         h->fsid = monc->monmap->fsid;
 801         h->pool = cpu_to_le32(pool);
 802         h->op = cpu_to_le32(op);
 803         h->auid = 0;
 804         h->snapid = cpu_to_le64(snapid);
 805         h->name_len = 0;
 806
 807         err = do_generic_request(monc, req);
 808
 809 out:
 810         kref_put(&req->kref, release_generic_request);
 811         return err;
 812 }
 813
 814 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
 815                             u32 pool, u64 *snapid)
 816 {
 817         return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
 818                                    pool, 0, (char *)snapid, sizeof(*snapid));
 819
 820 }
 821 EXPORT_SYMBOL(ceph_monc_create_snapid);
 822
 823 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 824                             u32 pool, u64 snapid)
 825 {
 826         return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
 827                                    pool, snapid, NULL, 0);
 828
 829 }
 830
 831 /*
 832  * Resend pending generic requests.
 833  */
 834 static void __resend_generic_request(struct ceph_mon_client *monc)
 835 {
 836         struct ceph_mon_generic_request *req;
 837         struct rb_node *p;
 838
 839         for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 840                 req = rb_entry(p, struct ceph_mon_generic_request, node);
 841                 ceph_msg_revoke(req->request);
 842                 ceph_msg_revoke_incoming(req->reply);
 843                 ceph_con_send(&monc->con, ceph_msg_get(req->request));
 844         }
 845 }
 846
 847 /*
 848  * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 849  * renew/retry subscription as needed (in case it is timing out, or we
 850  * got an ENOMEM).  And keep the monitor connection alive.
 851  */
 852 static void delayed_work(struct work_struct *work)
 853 {
 854         struct ceph_mon_client *monc =
 855                 container_of(work, struct ceph_mon_client, delayed_work.work);
 856
 857         dout("monc delayed_work\n");
 858         mutex_lock(&monc->mutex);
 859         if (monc->hunting) {
 860                 __close_session(monc);
 861                 __open_session(monc);  /* continue hunting */
 862         } else {
 863                 ceph_con_keepalive(&monc->con);
 864
 865                 __validate_auth(monc);
 866
 867                 if (ceph_auth_is_authenticated(monc->auth))
 868                         __send_subscribe(monc);
 869         }
 870         __schedule_delayed(monc);
 871         mutex_unlock(&monc->mutex);
 872 }
 873
 874 /*
 875  * On startup, we build a temporary monmap populated with the IPs
 876  * provided by mount(2).
 877  */
 878 static int build_initial_monmap(struct ceph_mon_client *monc)
 879 {
 880         struct ceph_options *opt = monc->client->options;
 881         struct ceph_entity_addr *mon_addr = opt->mon_addr;
 882         int num_mon = opt->num_mon;
 883         int i;
 884
 885         /* build initial monmap */
 886         monc->monmap = kzalloc(sizeof(*monc->monmap) +
 887                                num_mon*sizeof(monc->monmap->mon_inst[0]),
 888                                GFP_KERNEL);
 889         if (!monc->monmap)
 890                 return -ENOMEM;
 891         for (i = 0; i < num_mon; i++) {
 892                 monc->monmap->mon_inst[i].addr = mon_addr[i];
 893                 monc->monmap->mon_inst[i].addr.nonce = 0;
 894                 monc->monmap->mon_inst[i].name.type =
 895                         CEPH_ENTITY_TYPE_MON;
 896                 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 897         }
 898         monc->monmap->num_mon = num_mon;
 899         return 0;
 900 }
 901
 902 int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 903 {
 904         int err = 0;
 905
 906         dout("init\n");
 907         memset(monc, 0, sizeof(*monc));
 908         monc->client = cl;
 909         monc->monmap = NULL;
 910         mutex_init(&monc->mutex);
 911
 912         err = build_initial_monmap(monc);
 913         if (err)
 914                 goto out;
 915
 916         /* connection */
 917         /* authentication */
 918         monc->auth = ceph_auth_init(cl->options->name,
 919                                     cl->options->key);
 920         if (IS_ERR(monc->auth)) {
 921                 err = PTR_ERR(monc->auth);
 922                 goto out_monmap;
 923         }
 924         monc->auth->want_keys =
 925                 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
 926                 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 927
 928         /* msgs */
 929         err = -ENOMEM;
 930         monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
 931                                      sizeof(struct ceph_mon_subscribe_ack),
 932                                      GFP_NOFS, true);
 933         if (!monc->m_subscribe_ack)
 934                 goto out_auth;
 935
 936         monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
 937                                          true);
 938         if (!monc->m_subscribe)
 939                 goto out_subscribe_ack;
 940
 941         monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
 942                                           true);
 943         if (!monc->m_auth_reply)
 944                 goto out_subscribe;
 945
 946         monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
 947         monc->pending_auth = 0;
 948         if (!monc->m_auth)
 949                 goto out_auth_reply;
 950
 951         ceph_con_init(&monc->con, monc, &mon_con_ops,
 952                       &monc->client->msgr);
 953
 954         monc->cur_mon = -1;
 955         monc->hunting = true;
 956         monc->sub_renew_after = jiffies;
 957         monc->sub_sent = 0;
 958
 959         INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
 960         monc->generic_request_tree = RB_ROOT;
 961         monc->num_generic_requests = 0;
 962         monc->last_tid = 0;
 963
 964         monc->have_mdsmap = 0;
 965         monc->have_osdmap = 0;
 966         monc->want_next_osdmap = 1;
 967         return 0;
 968
 969 out_auth_reply:
 970         ceph_msg_put(monc->m_auth_reply);
 971 out_subscribe:
 972         ceph_msg_put(monc->m_subscribe);
 973 out_subscribe_ack:
 974         ceph_msg_put(monc->m_subscribe_ack);
 975 out_auth:
 976         ceph_auth_destroy(monc->auth);
 977 out_monmap:
 978         kfree(monc->monmap);
 979 out:
 980         return err;
 981 }
 982 EXPORT_SYMBOL(ceph_monc_init);
 983
 984 void ceph_monc_stop(struct ceph_mon_client *monc)
 985 {
 986         dout("stop\n");
 987         cancel_delayed_work_sync(&monc->delayed_work);
 988
 989         mutex_lock(&monc->mutex);
 990         __close_session(monc);
 991
 992         mutex_unlock(&monc->mutex);
 993
 994         /*
 995          * flush msgr queue before we destroy ourselves to ensure that:
 996          *  - any work that references our embedded con is finished.
 997          *  - any osd_client or other work that may reference an authorizer
 998          *    finishes before we shut down the auth subsystem.
 999          */
1000         ceph_msgr_flush();
1001
1002         ceph_auth_destroy(monc->auth);
1003
1004         ceph_msg_put(monc->m_auth);
1005         ceph_msg_put(monc->m_auth_reply);
1006         ceph_msg_put(monc->m_subscribe);
1007         ceph_msg_put(monc->m_subscribe_ack);
1008
1009         kfree(monc->monmap);
1010 }
1011 EXPORT_SYMBOL(ceph_monc_stop);
1012
1013 static void handle_auth_reply(struct ceph_mon_client *monc,
1014                               struct ceph_msg *msg)
1015 {
1016         int ret;
1017         int was_auth = 0;
1018         int had_debugfs_info, init_debugfs = 0;
1019
1020         mutex_lock(&monc->mutex);
1021         had_debugfs_info = have_debugfs_info(monc);
1022         was_auth = ceph_auth_is_authenticated(monc->auth);
1023         monc->pending_auth = 0;
1024         ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
1025                                      msg->front.iov_len,
1026                                      monc->m_auth->front.iov_base,
1027                                      monc->m_auth->front_alloc_len);
1028         if (ret < 0) {
1029                 monc->client->auth_err = ret;
1030                 wake_up_all(&monc->client->auth_wq);
1031         } else if (ret > 0) {
1032                 __send_prepared_auth_request(monc, ret);
1033         } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
1034                 dout("authenticated, starting session\n");
1035
1036                 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
1037                 monc->client->msgr.inst.name.num =
1038                                         cpu_to_le64(monc->auth->global_id);
1039
1040                 __send_subscribe(monc);
1041                 __resend_generic_request(monc);
1042         }
1043
1044         if (!had_debugfs_info && have_debugfs_info(monc)) {
1045                 pr_info("client%lld fsid %pU\n",
1046                         ceph_client_id(monc->client),
1047                         &monc->client->fsid);
1048                 init_debugfs = 1;
1049         }
1050         mutex_unlock(&monc->mutex);
1051
1052         if (init_debugfs) {
1053                 /*
1054                  * do debugfs initialization without mutex to avoid
1055                  * creating a locking dependency
1056                  */
1057                 ceph_debugfs_client_init(monc->client);
1058         }
1059 }
1060
1061 static int __validate_auth(struct ceph_mon_client *monc)
1062 {
1063         int ret;
1064
1065         if (monc->pending_auth)
1066                 return 0;
1067
1068         ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
1069                               monc->m_auth->front_alloc_len);
1070         if (ret <= 0)
1071                 return ret; /* either an error, or no need to authenticate */
1072         __send_prepared_auth_request(monc, ret);
1073         return 0;
1074 }
1075
1076 int ceph_monc_validate_auth(struct ceph_mon_client *monc)
1077 {
1078         int ret;
1079
1080         mutex_lock(&monc->mutex);
1081         ret = __validate_auth(monc);
1082         mutex_unlock(&monc->mutex);
1083         return ret;
1084 }
1085 EXPORT_SYMBOL(ceph_monc_validate_auth);
1086
1087 /*
1088  * handle incoming message
1089  */
1090 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1091 {
1092         struct ceph_mon_client *monc = con->private;
1093         int type = le16_to_cpu(msg->hdr.type);
1094
1095         if (!monc)
1096                 return;
1097
1098         switch (type) {
1099         case CEPH_MSG_AUTH_REPLY:
1100                 handle_auth_reply(monc, msg);
1101                 break;
1102
1103         case CEPH_MSG_MON_SUBSCRIBE_ACK:
1104                 handle_subscribe_ack(monc, msg);
1105                 break;
1106
1107         case CEPH_MSG_STATFS_REPLY:
1108                 handle_statfs_reply(monc, msg);
1109                 break;
1110
1111         case CEPH_MSG_MON_GET_VERSION_REPLY:
1112                 handle_get_version_reply(monc, msg);
1113                 break;
1114
1115         case CEPH_MSG_POOLOP_REPLY:
1116                 handle_poolop_reply(monc, msg);
1117                 break;
1118
1119         case CEPH_MSG_MON_MAP:
1120                 ceph_monc_handle_map(monc, msg);
1121                 break;
1122
1123         case CEPH_MSG_OSD_MAP:
1124                 ceph_osdc_handle_map(&monc->client->osdc, msg);
1125                 break;
1126
1127         default:
1128                 /* can the chained handler handle it? */
1129                 if (monc->client->extra_mon_dispatch &&
1130                     monc->client->extra_mon_dispatch(monc->client, msg) == 0)
1131                         break;
1132
1133                 pr_err("received unknown message type %d %s\n", type,
1134                        ceph_msg_type_name(type));
1135         }
1136         ceph_msg_put(msg);
1137 }
1138
1139 /*
1140  * Allocate memory for incoming message
1141  */
1142 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1143                                       struct ceph_msg_header *hdr,
1144                                       int *skip)
1145 {
1146         struct ceph_mon_client *monc = con->private;
1147         int type = le16_to_cpu(hdr->type);
1148         int front_len = le32_to_cpu(hdr->front_len);
1149         struct ceph_msg *m = NULL;
1150
1151         *skip = 0;
1152
1153         switch (type) {
1154         case CEPH_MSG_MON_SUBSCRIBE_ACK:
1155                 m = ceph_msg_get(monc->m_subscribe_ack);
1156                 break;
1157         case CEPH_MSG_POOLOP_REPLY:
1158         case CEPH_MSG_STATFS_REPLY:
1159                 return get_generic_reply(con, hdr, skip);
1160         case CEPH_MSG_AUTH_REPLY:
1161                 m = ceph_msg_get(monc->m_auth_reply);
1162                 break;
1163         case CEPH_MSG_MON_GET_VERSION_REPLY:
1164                 if (le64_to_cpu(hdr->tid) != 0)
1165                         return get_generic_reply(con, hdr, skip);
1166
1167                 /*
1168                  * Older OSDs don't set reply tid even if the orignal
1169                  * request had a non-zero tid.  Workaround this weirdness
1170                  * by falling through to the allocate case.
1171                  */
1172         case CEPH_MSG_MON_MAP:
1173         case CEPH_MSG_MDS_MAP:
1174         case CEPH_MSG_OSD_MAP:
1175                 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1176                 if (!m)
1177                         return NULL;    /* ENOMEM--return skip == 0 */
1178                 break;
1179         }
1180
1181         if (!m) {
1182                 pr_info("alloc_msg unknown type %d\n", type);
1183                 *skip = 1;
1184         }
1185         return m;
1186 }
1187
1188 /*
1189  * If the monitor connection resets, pick a new monitor and resubmit
1190  * any pending requests.
1191  */
1192 static void mon_fault(struct ceph_connection *con)
1193 {
1194         struct ceph_mon_client *monc = con->private;
1195
1196         if (!monc)
1197                 return;
1198
1199         dout("mon_fault\n");
1200         mutex_lock(&monc->mutex);
1201         if (!con->private)
1202                 goto out;
1203
1204         if (!monc->hunting)
1205                 pr_info("mon%d %s session lost, "
1206                         "hunting for new mon\n", monc->cur_mon,
1207                         ceph_pr_addr(&monc->con.peer_addr.in_addr));
1208
1209         __close_session(monc);
1210         if (!monc->hunting) {
1211                 /* start hunting */
1212                 monc->hunting = true;
1213                 __open_session(monc);
1214         } else {
1215                 /* already hunting, let's wait a bit */
1216                 __schedule_delayed(monc);
1217         }
1218 out:
1219         mutex_unlock(&monc->mutex);
1220 }
1221
/*
 * Connection "get" hook.
 *
 * No reference counting is done on the connection struct: every
 * reference comes from the messenger workqueue, and that workqueue is
 * drained before the mon_client is destroyed.  The connection is simply
 * handed back unchanged.
 */
static struct ceph_connection *con_get(struct ceph_connection *con)
{
	return con;
}
1231
/*
 * Connection "put" hook: intentionally empty, the counterpart of the
 * no-op con_get().
 */
static void con_put(struct ceph_connection *con)
{
}
1235
1236 static const struct ceph_connection_operations mon_con_ops = {
1237         .get = con_get,
1238         .put = con_put,
1239         .dispatch = dispatch,
1240         .fault = mon_fault,
1241         .alloc_msg = mon_alloc_msg,
1242 };