usr/src/uts/common/os/flock.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29
  30 /*
  31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  32  */
  33
  34 #include <sys/flock_impl.h>
  35 #include <sys/vfs.h>
  36 #include <sys/t_lock.h>         /* for <sys/callb.h> */
  37 #include <sys/callb.h>
  38 #include <sys/clconf.h>
  39 #include <sys/cladm.h>
  40 #include <sys/nbmlock.h>
  41 #include <sys/cred.h>
  42 #include <sys/policy.h>
  43
  44 /*
  45  * The following four variables are for statistics purposes and they are
  46  * not protected by locks. They may not be accurate but will at least be
  47  * close to the actual value.
  48  */
  49
  50 int     flk_lock_allocs;
  51 int     flk_lock_frees;
  52 int     edge_allocs;
  53 int     edge_frees;
  54 int     flk_proc_vertex_allocs;
  55 int     flk_proc_edge_allocs;
  56 int     flk_proc_vertex_frees;
  57 int     flk_proc_edge_frees;
  58
  59 static kmutex_t flock_lock;
  60
  61 #ifdef DEBUG
  62 int check_debug = 0;
  63 #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  64                                         check_active_locks(gp);
  65 #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  66                                                 check_sleeping_locks(gp);
  67 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  68                 if (check_debug)        \
  69                         check_owner_locks(gp, pid, sysid, vp);
  70 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
  71         { \
  72                 if (check_lock_transition(old_state, new_state)) { \
  73                         cmn_err(CE_PANIC, "Illegal lock transition \
  74                             from %d to %d", old_state, new_state); \
  75                 } \
  76         }
  77 #else
  78
  79 #define CHECK_ACTIVE_LOCKS(gp)
  80 #define CHECK_SLEEPING_LOCKS(gp)
  81 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  82 #define CHECK_LOCK_TRANSITION(old_state, new_state)
  83
  84 #endif /* DEBUG */
  85
  86 struct kmem_cache       *flk_edge_cache;
  87
  88 graph_t         *lock_graph[HASH_SIZE];
  89 proc_graph_t    pgraph;
  90
  91 /*
  92  * Clustering.
  93  *
  94  * NLM REGISTRY TYPE IMPLEMENTATION
  95  *
  96  * Assumptions:
  97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
  98  *      integers; maximum node id is returned by clconf_maximum_nodeid().
  99  *  2.  We use this node id to identify the node an NLM server runs on.
 100  */
 101
 102 /*
 103  * NLM registry object keeps track of NLM servers via their
 104  * nlmids (which are the node ids of the node in the cluster they run on)
 105  * that have requested locks at this LLM with which this registry is
 106  * associated.
 107  *
 108  * Representation of abstraction:
 109  *    rep = record[     states: array[nlm_state],
 110  *                      lock: mutex]
 111  *
 112  *    Representation invariants:
 113  *      1. index i of rep.states is between 0 and n - 1 where n is number
 114  *         of elements in the array, which happen to be the maximum number
 115  *         of nodes in the cluster configuration + 1.
 116  *      2. map nlmid to index i of rep.states
 117  *              0   -> 0
 118  *              1   -> 1
 119  *              2   -> 2
 120  *              n-1 -> clconf_maximum_nodeid()+1
 121  *      3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 122  *          from forgetting to subtract 1 from the index.
 123  *      4.  The reason we keep the 0th index is the following.  A legitimate
 124  *          cluster configuration includes making a UFS file system NFS
 125  *          exportable.  The code is structured so that if you're in a cluster
 126  *          you do one thing; otherwise, you do something else.  The problem
 127  *          is what to do if you think you're in a cluster with PXFS loaded,
 128  *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 129  *          encode the node id of the node where NLM server runs; these bytes
 130  *          are zero for UFS.  Since the nodeid is used to index into the
 131  *          registry, we can record the NLM server state information at index
 132  *          0 using the same mechanism used for PXFS file locks!
 133  */
 134 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
 135 static kmutex_t nlm_reg_lock;                   /* lock to protect arrary */
 136 static uint_t nlm_status_size;                  /* size of state array */
 137
 138 /*
 139  * Although we need a global lock dependency graph (and associated data
 140  * structures), we also need a per-zone notion of whether the lock manager is
 141  * running, and so whether to allow lock manager requests or not.
 142  *
 143  * Thus, on a per-zone basis we maintain a ``global'' variable
 144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 145  * manager is determined to be changing state (starting or stopping).
 146  *
 147  * Each graph/zone pair also has a copy of this variable, which is protected by
 148  * the graph's mutex.
 149  *
 150  * The per-graph copies are used to synchronize lock requests with shutdown
 151  * requests.  The global copy is used to initialize the per-graph field when a
 152  * new graph is created.
 153  */
 154 struct flock_globals {
 155         flk_lockmgr_status_t flk_lockmgr_status;
 156         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 157 };
 158
 159 zone_key_t flock_zone_key;
 160
 161 static void create_flock(lock_descriptor_t *, flock64_t *);
 162 static lock_descriptor_t        *flk_get_lock(void);
 163 static void     flk_free_lock(lock_descriptor_t *lock);
 164 static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 165 static int flk_process_request(lock_descriptor_t *);
 166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 167 static edge_t *flk_get_edge(void);
 168 static int flk_wait_execute_request(lock_descriptor_t *);
 169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 170 static void flk_insert_active_lock(lock_descriptor_t *);
 171 static void flk_delete_active_lock(lock_descriptor_t *, int);
 172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 173 static void flk_graph_uncolor(graph_t *);
 174 static void flk_wakeup(lock_descriptor_t *, int);
 175 static void flk_free_edge(edge_t *);
 176 static void flk_recompute_dependencies(lock_descriptor_t *,
 177                         lock_descriptor_t **,  int, int);
 178 static int flk_find_barriers(lock_descriptor_t *);
 179 static void flk_update_barriers(lock_descriptor_t *);
 180 static int flk_color_reachables(lock_descriptor_t *);
 181 static int flk_canceled(lock_descriptor_t *);
 182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 184 static void wait_for_lock(lock_descriptor_t *);
 185 static void unlock_lockmgr_granted(struct flock_globals *);
 186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 187
 188 /* Clustering hooks */
 189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
 191 static void cl_flk_unlock_nlm_granted(int);
 192
 193 #ifdef DEBUG
 194 static int check_lock_transition(int, int);
 195 static void check_sleeping_locks(graph_t *);
 196 static void check_active_locks(graph_t *);
 197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 198 static void path(lock_descriptor_t *, lock_descriptor_t *);
 199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 202 #endif
 203
 204 /*      proc_graph function definitions */
 205 static int flk_check_deadlock(lock_descriptor_t *);
 206 static void flk_proc_graph_uncolor(void);
 207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 208 static proc_edge_t *flk_get_proc_edge(void);
 209 static void flk_proc_release(proc_vertex_t *);
 210 static void flk_free_proc_edge(proc_edge_t *);
 211 static void flk_update_proc_graph(edge_t *, int);
 212
 213 /* Non-blocking mandatory locking */
 214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 215                         u_offset_t);
 216
 217 static struct flock_globals *
 218 flk_get_globals(void)
 219 {
 220         /*
 221          * The KLM module had better be loaded if we're attempting to handle
 222          * lockmgr requests.
 223          */
 224         ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 225         return (zone_getspecific(flock_zone_key, curproc->p_zone));
 226 }
 227
 228 static flk_lockmgr_status_t
 229 flk_get_lockmgr_status(void)
 230 {
 231         struct flock_globals *fg;
 232
 233         ASSERT(MUTEX_HELD(&flock_lock));
 234
 235         if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 236                 /*
 237                  * KLM module not loaded; lock manager definitely not running.
 238                  */
 239                 return (FLK_LOCKMGR_DOWN);
 240         }
 241         fg = flk_get_globals();
 242         return (fg->flk_lockmgr_status);
 243 }
 244
 245 /*
 246  * Routine called from fs_frlock in fs/fs_subr.c
 247  */
 248
 249 int
 250 reclock(vnode_t         *vp,
 251         flock64_t       *lckdat,
 252         int             cmd,
 253         int             flag,
 254         u_offset_t      offset,
 255         flk_callback_t  *flk_cbp)
 256 {
 257         lock_descriptor_t       stack_lock_request;
 258         lock_descriptor_t       *lock_request;
 259         int error = 0;
 260         graph_t *gp;
 261         int                     nlmid;
 262
 263         /*
 264          * Check access permissions
 265          */
 266         if ((cmd & SETFLCK) &&
 267             ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 268             (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 269                         return (EBADF);
 270
 271         /*
 272          * for query and unlock we use the stack_lock_request
 273          */
 274
 275         if ((lckdat->l_type == F_UNLCK) ||
 276             !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 277                 lock_request = &stack_lock_request;
 278                 (void) bzero((caddr_t)lock_request,
 279                     sizeof (lock_descriptor_t));
 280
 281                 /*
 282                  * following is added to make the assertions in
 283                  * flk_execute_request() to pass through
 284                  */
 285
 286                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 287                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 288                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 289                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 290                 lock_request->l_status = FLK_INITIAL_STATE;
 291         } else {
 292                 lock_request = flk_get_lock();
 293         }
 294         lock_request->l_state = 0;
 295         lock_request->l_vnode = vp;
 296         lock_request->l_zoneid = getzoneid();
 297
 298         /*
 299          * Convert the request range into the canonical start and end
 300          * values.  The NLM protocol supports locking over the entire
 301          * 32-bit range, so there's no range checking for remote requests,
 302          * but we still need to verify that local requests obey the rules.
 303          */
 304         /* Clustering */
 305         if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 306                 ASSERT(lckdat->l_whence == 0);
 307                 lock_request->l_start = lckdat->l_start;
 308                 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 309                     lckdat->l_start + (lckdat->l_len - 1);
 310         } else {
 311                 /* check the validity of the lock range */
 312                 error = flk_convert_lock_data(vp, lckdat,
 313                     &lock_request->l_start, &lock_request->l_end,
 314                     offset);
 315                 if (error) {
 316                         goto done;
 317                 }
 318                 error = flk_check_lock_data(lock_request->l_start,
 319                     lock_request->l_end, MAXEND);
 320                 if (error) {
 321                         goto done;
 322                 }
 323         }
 324
 325         ASSERT(lock_request->l_end >= lock_request->l_start);
 326
 327         lock_request->l_type = lckdat->l_type;
 328         if (cmd & INOFLCK)
 329                 lock_request->l_state |= IO_LOCK;
 330         if (cmd & SLPFLCK)
 331                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 332         if (cmd & RCMDLCK)
 333                 lock_request->l_state |= LOCKMGR_LOCK;
 334         if (cmd & NBMLCK)
 335                 lock_request->l_state |= NBMAND_LOCK;
 336         /*
 337          * Clustering: set flag for PXFS locks
 338          * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 339          * also be of type 'RCMDLCK'.
 340          * We do not _only_ check the GETPXFSID() macro because local PXFS
 341          * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 342          */
 343
 344         if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 345                 lock_request->l_state |= PXFS_LOCK;
 346         }
 347         if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 348                 if (lock_request->l_type == F_RDLCK ||
 349                     lock_request->l_type == F_WRLCK)
 350                         lock_request->l_state |= QUERY_LOCK;
 351         }
 352         lock_request->l_flock = (*lckdat);
 353         lock_request->l_callbacks = flk_cbp;
 354
 355         /*
 356          * We are ready for processing the request
 357          */
 358         if (IS_LOCKMGR(lock_request)) {
 359                 /*
 360                  * If the lock request is an NLM server request ....
 361                  */
 362                 if (nlm_status_size == 0) { /* not booted as cluster */
 363                         mutex_enter(&flock_lock);
 364                         /*
 365                          * Bail out if this is a lock manager request and the
 366                          * lock manager is not supposed to be running.
 367                          */
 368                         if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 369                                 mutex_exit(&flock_lock);
 370                                 error = ENOLCK;
 371                                 goto done;
 372                         }
 373                         mutex_exit(&flock_lock);
 374                 } else {                        /* booted as a cluster */
 375                         nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 376                         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
 377
 378                         mutex_enter(&nlm_reg_lock);
 379                         /*
 380                          * If the NLM registry does not know about this
 381                          * NLM server making the request, add its nlmid
 382                          * to the registry.
 383                          */
 384                         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 385                             nlmid)) {
 386                                 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 387                         } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 388                             nlmid)) {
 389                                 /*
 390                                  * If the NLM server is already known (has made
 391                                  * previous lock requests) and its state is
 392                                  * not NLM_UP (means that NLM server is
 393                                  * shutting down), then bail out with an
 394                                  * error to deny the lock request.
 395                                  */
 396                                 mutex_exit(&nlm_reg_lock);
 397                                 error = ENOLCK;
 398                                 goto done;
 399                         }
 400                         mutex_exit(&nlm_reg_lock);
 401                 }
 402         }
 403
 404         /* Now get the lock graph for a particular vnode */
 405         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 406
 407         /*
 408          * We drop rwlock here otherwise this might end up causing a
 409          * deadlock if this IOLOCK sleeps. (bugid # 1183392).
 410          */
 411
 412         if (IS_IO_LOCK(lock_request)) {
 413                 VOP_RWUNLOCK(vp,
 414                     (lock_request->l_type == F_RDLCK) ?
 415                     V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 416         }
 417         mutex_enter(&gp->gp_mutex);
 418
 419         lock_request->l_state |= REFERENCED_LOCK;
 420         lock_request->l_graph = gp;
 421
 422         switch (lock_request->l_type) {
 423         case F_RDLCK:
 424         case F_WRLCK:
 425                 if (IS_QUERY_LOCK(lock_request)) {
 426                         flk_get_first_blocking_lock(lock_request);
 427                         (*lckdat) = lock_request->l_flock;
 428                         break;
 429                 }
 430
 431                 /* process the request now */
 432
 433                 error = flk_process_request(lock_request);
 434                 break;
 435
 436         case F_UNLCK:
 437                 /* unlock request will not block so execute it immediately */
 438
 439                 if (IS_LOCKMGR(lock_request) &&
 440                     flk_canceled(lock_request)) {
 441                         error = 0;
 442                 } else {
 443                         error = flk_execute_request(lock_request);
 444                 }
 445                 break;
 446
 447         case F_UNLKSYS:
 448                 /*
 449                  * Recovery mechanism to release lock manager locks when
 450                  * NFS client crashes and restart. NFS server will clear
 451                  * old locks and grant new locks.
 452                  */
 453
 454                 if (lock_request->l_flock.l_sysid == 0) {
 455                         mutex_exit(&gp->gp_mutex);
 456                         return (EINVAL);
 457                 }
 458                 if (secpolicy_nfs(CRED()) != 0) {
 459                         mutex_exit(&gp->gp_mutex);
 460                         return (EPERM);
 461                 }
 462                 flk_delete_locks_by_sysid(lock_request);
 463                 lock_request->l_state &= ~REFERENCED_LOCK;
 464                 flk_set_state(lock_request, FLK_DEAD_STATE);
 465                 flk_free_lock(lock_request);
 466                 mutex_exit(&gp->gp_mutex);
 467                 return (0);
 468
 469         default:
 470                 error = EINVAL;
 471                 break;
 472         }
 473
 474         /* Clustering: For blocked PXFS locks, return */
 475         if (error == PXFS_LOCK_BLOCKED) {
 476                 lock_request->l_state &= ~REFERENCED_LOCK;
 477                 mutex_exit(&gp->gp_mutex);
 478                 return (error);
 479         }
 480
 481         /*
 482          * Now that we have seen the status of locks in the system for
 483          * this vnode we acquire the rwlock if it is an IO_LOCK.
 484          */
 485
 486         if (IS_IO_LOCK(lock_request)) {
 487                 (void) VOP_RWLOCK(vp,
 488                     (lock_request->l_type == F_RDLCK) ?
 489                     V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 490                 if (!error) {
 491                         lckdat->l_type = F_UNLCK;
 492
 493                         /*
 494                          * This wake up is needed otherwise
 495                          * if IO_LOCK has slept the dependents on this
 496                          * will not be woken up at all. (bugid # 1185482).
 497                          */
 498
 499                         flk_wakeup(lock_request, 1);
 500                         flk_set_state(lock_request, FLK_DEAD_STATE);
 501                         flk_free_lock(lock_request);
 502                 }
 503                 /*
 504                  * else if error had occurred either flk_process_request()
 505                  * has returned EDEADLK in which case there will be no
 506                  * dependents for this lock or EINTR from flk_wait_execute_
 507                  * request() in which case flk_cancel_sleeping_lock()
 508                  * would have been done. same is true with EBADF.
 509                  */
 510         }
 511
 512         if (lock_request == &stack_lock_request) {
 513                 flk_set_state(lock_request, FLK_DEAD_STATE);
 514         } else {
 515                 lock_request->l_state &= ~REFERENCED_LOCK;
 516                 if ((error != 0) || IS_DELETED(lock_request)) {
 517                         flk_set_state(lock_request, FLK_DEAD_STATE);
 518                         flk_free_lock(lock_request);
 519                 }
 520         }
 521
 522         mutex_exit(&gp->gp_mutex);
 523         return (error);
 524
 525 done:
 526         flk_set_state(lock_request, FLK_DEAD_STATE);
 527         if (lock_request != &stack_lock_request)
 528                 flk_free_lock(lock_request);
 529         return (error);
 530 }
 531
 532 /*
 533  * Invoke the callbacks in the given list.  If before sleeping, invoke in
 534  * list order.  If after sleeping, invoke in reverse order.
 535  *
 536  * CPR (suspend/resume) support: if one of the callbacks returns a
 537  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 538  * while it is sleeping.  There should be at most one callb_cpr_t for the
 539  * thread.
 540  * XXX This is unnecessarily complicated.  The CPR information should just
 541  * get passed in directly through VOP_FRLOCK and reclock, rather than
 542  * sneaking it in via a callback.
 543  */
 544
 545 callb_cpr_t *
 546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 547 {
 548         callb_cpr_t *cpr_callbackp = NULL;
 549         callb_cpr_t *one_result;
 550         flk_callback_t *cb;
 551
 552         if (cblist == NULL)
 553                 return (NULL);
 554
 555         if (when == FLK_BEFORE_SLEEP) {
 556                 cb = cblist;
 557                 do {
 558                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 559                         if (one_result != NULL) {
 560                                 ASSERT(cpr_callbackp == NULL);
 561                                 cpr_callbackp = one_result;
 562                         }
 563                         cb = cb->cb_next;
 564                 } while (cb != cblist);
 565         } else {
 566                 cb = cblist->cb_prev;
 567                 do {
 568                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 569                         if (one_result != NULL) {
 570                                 cpr_callbackp = one_result;
 571                         }
 572                         cb = cb->cb_prev;
 573                 } while (cb != cblist->cb_prev);
 574         }
 575
 576         return (cpr_callbackp);
 577 }
 578
 579 /*
 580  * Initialize a flk_callback_t to hold the given callback.
 581  */
 582
 583 void
 584 flk_init_callback(flk_callback_t *flk_cb,
 585         callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 586 {
 587         flk_cb->cb_next = flk_cb;
 588         flk_cb->cb_prev = flk_cb;
 589         flk_cb->cb_callback = cb_fcn;
 590         flk_cb->cb_data = cbdata;
 591 }
 592
 593 /*
 594  * Initialize an flk_callback_t and then link it into the head of an
 595  * existing list (which may be NULL).
 596  */
 597
 598 void
 599 flk_add_callback(flk_callback_t *newcb,
 600                 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 601                 void *cbdata, flk_callback_t *cblist)
 602 {
 603         flk_init_callback(newcb, cb_fcn, cbdata);
 604
 605         if (cblist == NULL)
 606                 return;
 607
 608         newcb->cb_prev = cblist->cb_prev;
 609         newcb->cb_next = cblist;
 610         cblist->cb_prev->cb_next = newcb;
 611         cblist->cb_prev = newcb;
 612 }
 613
 614 /*
 615  * Initialize the flk_edge_cache data structure and create the
 616  * nlm_reg_status array.
 617  */
 618
 619 void
 620 flk_init(void)
 621 {
 622         uint_t  i;
 623
 624         flk_edge_cache = kmem_cache_create("flk_edges",
 625             sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 626         if (flk_edge_cache == NULL) {
 627                 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 628         }
 629         /*
 630          * Create the NLM registry object.
 631          */
 632
 633         if (cluster_bootflags & CLUSTER_BOOTED) {
 634                 /*
 635                  * This routine tells you the maximum node id that will be used
 636                  * in the cluster.  This number will be the size of the nlm
 637                  * registry status array.  We add 1 because we will be using
 638                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 639                  * to 64, for a total of 65 entries.
 640                  */
 641                 nlm_status_size = clconf_maximum_nodeid() + 1;
 642         } else {
 643                 nlm_status_size = 0;
 644         }
 645
 646         if (nlm_status_size != 0) {     /* booted as a cluster */
 647                 nlm_reg_status = (flk_nlm_status_t *)
 648                     kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 649                     KM_SLEEP);
 650
 651                 /* initialize all NLM states in array to NLM_UNKNOWN */
 652                 for (i = 0; i < nlm_status_size; i++) {
 653                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 654                 }
 655         }
 656 }
 657
 658 /*
 659  * Zone constructor/destructor callbacks to be executed when a zone is
 660  * created/destroyed.
 661  */
 662 /* ARGSUSED */
 663 void *
 664 flk_zone_init(zoneid_t zoneid)
 665 {
 666         struct flock_globals *fg;
 667         uint_t i;
 668
 669         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 670         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 671         for (i = 0; i < HASH_SIZE; i++)
 672                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 673         return (fg);
 674 }
 675
 676 /* ARGSUSED */
 677 void
 678 flk_zone_fini(zoneid_t zoneid, void *data)
 679 {
 680         struct flock_globals *fg = data;
 681
 682         kmem_free(fg, sizeof (*fg));
 683 }
 684
 685 /*
 686  * Get a lock_descriptor structure with initialization of edge lists.
 687  */
 688
 689 static lock_descriptor_t *
 690 flk_get_lock(void)
 691 {
 692         lock_descriptor_t       *l;
 693
 694         l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
 695
 696         cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
 697         l->l_edge.edge_in_next = &l->l_edge;
 698         l->l_edge.edge_in_prev = &l->l_edge;
 699         l->l_edge.edge_adj_next = &l->l_edge;
 700         l->l_edge.edge_adj_prev = &l->l_edge;
 701         l->pvertex = -1;
 702         l->l_status = FLK_INITIAL_STATE;
 703         flk_lock_allocs++;
 704         return (l);
 705 }
 706
 707 /*
 708  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
 709  * when some thread has a reference to it as in reclock().
 710  */
 711
 712 void
 713 flk_free_lock(lock_descriptor_t *lock)
 714 {
 715         ASSERT(IS_DEAD(lock));
 716         if (IS_REFERENCED(lock)) {
 717                 lock->l_state |= DELETED_LOCK;
 718                 return;
 719         }
 720         flk_lock_frees++;
 721         kmem_free((void *)lock, sizeof (lock_descriptor_t));
 722 }
 723
 724 void
 725 flk_set_state(lock_descriptor_t *lock, int new_state)
 726 {
 727         /*
 728          * Locks in the sleeping list may be woken up in a number of ways,
 729          * and more than once.  If a sleeping lock is signaled awake more
 730          * than once, then it may or may not change state depending on its
 731          * current state.
 732          * Also note that NLM locks that are sleeping could be moved to an
 733          * interrupted state more than once if the unlock request is
 734          * retransmitted by the NLM client - the second time around, this is
 735          * just a nop.
 736          * The ordering of being signaled awake is:
 737          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
 738          * The checks below implement this ordering.
 739          */
 740         if (IS_INTERRUPTED(lock)) {
 741                 if ((new_state == FLK_CANCELLED_STATE) ||
 742                     (new_state == FLK_GRANTED_STATE) ||
 743                     (new_state == FLK_INTERRUPTED_STATE)) {
 744                         return;
 745                 }
 746         }
 747         if (IS_CANCELLED(lock)) {
 748                 if ((new_state == FLK_GRANTED_STATE) ||
 749                     (new_state == FLK_CANCELLED_STATE)) {
 750                         return;
 751                 }
 752         }
 753         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
 754         if (IS_PXFS(lock)) {
 755                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
 756         }
 757         lock->l_status = new_state;
 758 }
 759
 760 /*
 761  * Routine that checks whether there are any blocking locks in the system.
 762  *
 763  * The policy followed is if a write lock is sleeping we don't allow read
 764  * locks before this write lock even though there may not be any active
 765  * locks corresponding to the read locks' region.
 766  *
 767  * flk_add_edge() function adds an edge between l1 and l2 iff there
 768  * is no path between l1 and l2. This is done to have a "minimum
 769  * storage representation" of the dependency graph.
 770  *
 771  * Another property of the graph is since only the new request throws
 772  * edges to the existing locks in the graph, the graph is always topologically
 773  * ordered.
 774  */
 775
 776 static int
 777 flk_process_request(lock_descriptor_t *request)
 778 {
 779         graph_t *gp = request->l_graph;
 780         lock_descriptor_t *lock;
 781         int request_blocked_by_active = 0;
 782         int request_blocked_by_granted = 0;
 783         int request_blocked_by_sleeping = 0;
 784         vnode_t *vp = request->l_vnode;
 785         int     error = 0;
 786         int request_will_wait = 0;
 787         int found_covering_lock = 0;
 788         lock_descriptor_t *covered_by = NULL;
 789
 790         ASSERT(MUTEX_HELD(&gp->gp_mutex));
 791         request_will_wait = IS_WILLING_TO_SLEEP(request);
 792
 793         /*
 794          * check active locks
 795          */
 796
 797         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 798
 799
 800         if (lock) {
 801                 do {
 802                         if (BLOCKS(lock, request)) {
 803                                 if (!request_will_wait)
 804                                         return (EAGAIN);
 805                                 request_blocked_by_active = 1;
 806                                 break;
 807                         }
 808                         /*
 809                          * Grant lock if it is for the same owner holding active
 810                          * lock that covers the request.
 811                          */
 812
 813                         if (SAME_OWNER(lock, request) &&
 814                             COVERS(lock, request) &&
 815                             (request->l_type == F_RDLCK))
 816                                 return (flk_execute_request(request));
 817                         lock = lock->l_next;
 818                 } while (lock->l_vnode == vp);
 819         }
 820
 821         if (!request_blocked_by_active) {
 822                         lock_descriptor_t *lk[1];
 823                         lock_descriptor_t *first_glock = NULL;
 824                 /*
 825                  * Shall we grant this?! NO!!
 826                  * What about those locks that were just granted and still
 827                  * in sleep queue. Those threads are woken up and so locks
 828                  * are almost active.
 829                  */
 830                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 831                 if (lock) {
 832                         do {
 833                                 if (BLOCKS(lock, request)) {
 834                                         if (IS_GRANTED(lock)) {
 835                                                 request_blocked_by_granted = 1;
 836                                         } else {
 837                                                 request_blocked_by_sleeping = 1;
 838                                         }
 839                                 }
 840
 841                                 lock = lock->l_next;
 842                         } while ((lock->l_vnode == vp));
 843                         first_glock = lock->l_prev;
 844                         ASSERT(first_glock->l_vnode == vp);
 845                 }
 846
 847                 if (request_blocked_by_granted)
 848                         goto block;
 849
 850                 if (!request_blocked_by_sleeping) {
 851                         /*
 852                          * If the request isn't going to be blocked by a
 853                          * sleeping request, we know that it isn't going to
 854                          * be blocked; we can just execute the request --
 855                          * without performing costly deadlock detection.
 856                          */
 857                         ASSERT(!request_blocked_by_active);
 858                         return (flk_execute_request(request));
 859                 } else if (request->l_type == F_RDLCK) {
 860                         /*
 861                          * If we have a sleeping writer in the requested
 862                          * lock's range, block.
 863                          */
 864                         goto block;
 865                 }
 866
 867                 lk[0] = request;
 868                 request->l_state |= RECOMPUTE_LOCK;
 869                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 870                 if (lock) {
 871                         do {
 872                                 flk_recompute_dependencies(lock, lk, 1, 0);
 873                                 lock = lock->l_next;
 874                         } while (lock->l_vnode == vp);
 875                 }
 876                 lock = first_glock;
 877                 if (lock) {
 878                         do {
 879                                 if (IS_GRANTED(lock)) {
 880                                 flk_recompute_dependencies(lock, lk, 1, 0);
 881                                 }
 882                                 lock = lock->l_prev;
 883                         } while ((lock->l_vnode == vp));
 884                 }
 885                 request->l_state &= ~RECOMPUTE_LOCK;
 886                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
 887                         return (EDEADLK);
 888                 return (flk_execute_request(request));
 889         }
 890
 891 block:
 892         if (request_will_wait)
 893                 flk_graph_uncolor(gp);
 894
 895         /* check sleeping locks */
 896
 897         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 898
 899         /*
 900          * If we find a sleeping write lock that is a superset of the
 901          * region wanted by request we can be assured that by adding an
 902          * edge to this write lock we have paths to all locks in the
 903          * graph that blocks the request except in one case and that is why
 904          * another check for SAME_OWNER in the loop below. The exception
 905          * case is when this process that owns the sleeping write lock 'l1'
 906          * has other locks l2, l3, l4 that are in the system and arrived
 907          * before l1. l1 does not have path to these locks as they are from
 908          * same process. We break when we find a second covering sleeping
 909          * lock l5 owned by a process different from that owning l1, because
 910          * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
 911          * it has l1 would have produced a deadlock already.
 912          */
 913
 914         if (lock) {
 915                 do {
 916                         if (BLOCKS(lock, request)) {
 917                                 if (!request_will_wait)
 918                                         return (EAGAIN);
 919                                 if (COVERS(lock, request) &&
 920                                     lock->l_type == F_WRLCK) {
 921                                         if (found_covering_lock &&
 922                                             !SAME_OWNER(lock, covered_by)) {
 923                                                 found_covering_lock++;
 924                                                 break;
 925                                         }
 926                                         found_covering_lock = 1;
 927                                         covered_by = lock;
 928                                 }
 929                                 if (found_covering_lock &&
 930                                     !SAME_OWNER(lock, covered_by)) {
 931                                         lock = lock->l_next;
 932                                         continue;
 933                                 }
 934                                 if ((error = flk_add_edge(request, lock,
 935                                     !found_covering_lock, 0)))
 936                                         return (error);
 937                         }
 938                         lock = lock->l_next;
 939                 } while (lock->l_vnode == vp);
 940         }
 941
 942 /*
 943  * found_covering_lock == 2 iff at this point 'request' has paths
 944  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
 945  * point 'request' has paths to all locks that blocks 'request' whose owners
 946  * are not same as the one that covers 'request' (covered_by above) and
 947  * we can have locks whose owner is same as covered_by in the active list.
 948  */
 949
 950         if (request_blocked_by_active && found_covering_lock != 2) {
 951                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 952                 ASSERT(lock != NULL);
 953                 do {
 954                         if (BLOCKS(lock, request)) {
 955                                 if (found_covering_lock &&
 956                                     !SAME_OWNER(lock, covered_by)) {
 957                                         lock = lock->l_next;
 958                                         continue;
 959                                 }
 960                                 if ((error = flk_add_edge(request, lock,
 961                                     CHECK_CYCLE, 0)))
 962                                         return (error);
 963                         }
 964                         lock = lock->l_next;
 965                 } while (lock->l_vnode == vp);
 966         }
 967
 968         if (NOT_BLOCKED(request)) {
 969                 /*
 970                  * request not dependent on any other locks
 971                  * so execute this request
 972                  */
 973                 return (flk_execute_request(request));
 974         } else {
 975                 /*
 976                  * check for deadlock
 977                  */
 978                 if (flk_check_deadlock(request))
 979                         return (EDEADLK);
 980                 /*
 981                  * this thread has to sleep
 982                  */
 983                 return (flk_wait_execute_request(request));
 984         }
 985 }
 986
 987 /*
 988  * The actual execution of the request in the simple case is only to
 989  * insert the 'request' in the list of active locks if it is not an
 990  * UNLOCK.
 991  * We have to consider the existing active locks' relation to
 992  * this 'request' if they are owned by same process. flk_relation() does
 993  * this job and sees to that the dependency graph information is maintained
 994  * properly.
 995  */
 996
 997 int
 998 flk_execute_request(lock_descriptor_t *request)
 999 {
1000         graph_t *gp = request->l_graph;
1001         vnode_t *vp = request->l_vnode;
1002         lock_descriptor_t       *lock, *lock1;
1003         int done_searching = 0;
1004
1005         CHECK_SLEEPING_LOCKS(gp);
1006         CHECK_ACTIVE_LOCKS(gp);
1007
1008         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1009
1010         flk_set_state(request, FLK_START_STATE);
1011
1012         ASSERT(NOT_BLOCKED(request));
1013
1014         /* IO_LOCK requests are only to check status */
1015
1016         if (IS_IO_LOCK(request))
1017                 return (0);
1018
1019         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1020
1021         if (lock == NULL && request->l_type == F_UNLCK)
1022                 return (0);
1023         if (lock == NULL) {
1024                 flk_insert_active_lock(request);
1025                 return (0);
1026         }
1027
1028         do {
1029                 lock1 = lock->l_next;
1030                 if (SAME_OWNER(request, lock)) {
1031                         done_searching = flk_relation(lock, request);
1032                 }
1033                 lock = lock1;
1034         } while (lock->l_vnode == vp && !done_searching);
1035
1036         /*
1037          * insert in active queue
1038          */
1039
1040         if (request->l_type != F_UNLCK)
1041                 flk_insert_active_lock(request);
1042
1043         return (0);
1044 }
1045
1046 /*
1047  * 'request' is blocked by some one therefore we put it into sleep queue.
1048  */
1049 static int
1050 flk_wait_execute_request(lock_descriptor_t *request)
1051 {
1052         graph_t *gp = request->l_graph;
1053         callb_cpr_t     *cprp;          /* CPR info from callback */
1054         struct flock_globals *fg;
1055         int index;
1056
1057         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1058         ASSERT(IS_WILLING_TO_SLEEP(request));
1059
1060         flk_insert_sleeping_lock(request);
1061
1062         if (IS_LOCKMGR(request)) {
1063                 index = HASH_INDEX(request->l_vnode);
1064                 fg = flk_get_globals();
1065
1066                 if (nlm_status_size == 0) {     /* not booted as a cluster */
1067                         if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1068                                 flk_cancel_sleeping_lock(request, 1);
1069                                 return (ENOLCK);
1070                         }
1071                 } else {                        /* booted as a cluster */
1072                         /*
1073                          * If the request is an NLM server lock request,
1074                          * and the NLM state of the lock request is not
1075                          * NLM_UP (because the NLM server is shutting
1076                          * down), then cancel the sleeping lock and
1077                          * return error ENOLCK that will encourage the
1078                          * client to retransmit.
1079                          */
1080                         if (!IS_NLM_UP(request)) {
1081                                 flk_cancel_sleeping_lock(request, 1);
1082                                 return (ENOLCK);
1083                         }
1084                 }
1085         }
1086
1087         /* Clustering: For blocking PXFS locks, return */
1088         if (IS_PXFS(request)) {
1089                 /*
1090                  * PXFS locks sleep on the client side.
1091                  * The callback argument is used to wake up the sleeper
1092                  * when the lock is granted.
1093                  * We return -1 (rather than an errno value) to indicate
1094                  * the client side should sleep
1095                  */
1096                 return (PXFS_LOCK_BLOCKED);
1097         }
1098
1099         if (request->l_callbacks != NULL) {
1100                 /*
1101                  * To make sure the shutdown code works correctly, either
1102                  * the callback must happen after putting the lock on the
1103                  * sleep list, or we must check the shutdown status after
1104                  * returning from the callback (and before sleeping).  At
1105                  * least for now, we'll use the first option.  If a
1106                  * shutdown or signal or whatever happened while the graph
1107                  * mutex was dropped, that will be detected by
1108                  * wait_for_lock().
1109                  */
1110                 mutex_exit(&gp->gp_mutex);
1111
1112                 cprp = flk_invoke_callbacks(request->l_callbacks,
1113                     FLK_BEFORE_SLEEP);
1114
1115                 mutex_enter(&gp->gp_mutex);
1116
1117                 if (cprp == NULL) {
1118                         wait_for_lock(request);
1119                 } else {
1120                         mutex_enter(cprp->cc_lockp);
1121                         CALLB_CPR_SAFE_BEGIN(cprp);
1122                         mutex_exit(cprp->cc_lockp);
1123                         wait_for_lock(request);
1124                         mutex_enter(cprp->cc_lockp);
1125                         CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1126                         mutex_exit(cprp->cc_lockp);
1127                 }
1128
1129                 mutex_exit(&gp->gp_mutex);
1130                 (void) flk_invoke_callbacks(request->l_callbacks,
1131                     FLK_AFTER_SLEEP);
1132                 mutex_enter(&gp->gp_mutex);
1133         } else {
1134                 wait_for_lock(request);
1135         }
1136
1137         if (IS_LOCKMGR(request)) {
1138                 /*
1139                  * If the lock manager is shutting down, return an
1140                  * error that will encourage the client to retransmit.
1141                  */
1142                 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1143                     !IS_GRANTED(request)) {
1144                         flk_cancel_sleeping_lock(request, 1);
1145                         return (ENOLCK);
1146                 }
1147         }
1148
1149         if (IS_INTERRUPTED(request)) {
1150                 /* we got a signal, or act like we did */
1151                 flk_cancel_sleeping_lock(request, 1);
1152                 return (EINTR);
1153         }
1154
1155         /* Cancelled if some other thread has closed the file */
1156
1157         if (IS_CANCELLED(request)) {
1158                 flk_cancel_sleeping_lock(request, 1);
1159                 return (EBADF);
1160         }
1161
1162         request->l_state &= ~GRANTED_LOCK;
1163         REMOVE_SLEEP_QUEUE(request);
1164         return (flk_execute_request(request));
1165 }
1166
1167 /*
1168  * This routine adds an edge between from and to because from depends
1169  * to. If asked to check for deadlock it checks whether there are any
1170  * reachable locks from "from_lock" that is owned by the same process
1171  * as "from_lock".
1172  * NOTE: It is the caller's responsibility to make sure that the color
1173  * of the graph is consistent between the calls to flk_add_edge as done
1174  * in flk_process_request. This routine does not color and check for
1175  * deadlock explicitly.
1176  */
1177
1178 static int
1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1180                         int check_cycle, int update_graph)
1181 {
1182         edge_t  *edge;
1183         edge_t  *ep;
1184         lock_descriptor_t       *vertex;
1185         lock_descriptor_t *vertex_stack;
1186
1187         STACK_INIT(vertex_stack);
1188
1189         /*
1190          * if to vertex already has mark_color just return
1191          * don't add an edge as it is reachable from from vertex
1192          * before itself.
1193          */
1194
1195         if (COLORED(to_lock))
1196                 return (0);
1197
1198         edge = flk_get_edge();
1199
1200         /*
1201          * set the from and to vertex
1202          */
1203
1204         edge->from_vertex = from_lock;
1205         edge->to_vertex = to_lock;
1206
1207         /*
1208          * put in adjacency list of from vertex
1209          */
1210
1211         from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1212         edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1213         edge->edge_adj_prev = &from_lock->l_edge;
1214         from_lock->l_edge.edge_adj_next = edge;
1215
1216         /*
1217          * put in in list of to vertex
1218          */
1219
1220         to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1221         edge->edge_in_next = to_lock->l_edge.edge_in_next;
1222         to_lock->l_edge.edge_in_next = edge;
1223         edge->edge_in_prev = &to_lock->l_edge;
1224
1225
1226         if (update_graph) {
1227                 flk_update_proc_graph(edge, 0);
1228                 return (0);
1229         }
1230         if (!check_cycle) {
1231                 return (0);
1232         }
1233
1234         STACK_PUSH(vertex_stack, from_lock, l_stack);
1235
1236         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1237
1238                 STACK_POP(vertex_stack, l_stack);
1239
1240                 for (ep = FIRST_ADJ(vertex);
1241                     ep != HEAD(vertex);
1242                     ep = NEXT_ADJ(ep)) {
1243                         if (COLORED(ep->to_vertex))
1244                                 continue;
1245                         COLOR(ep->to_vertex);
1246                         if (SAME_OWNER(ep->to_vertex, from_lock))
1247                                 goto dead_lock;
1248                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1249                 }
1250         }
1251         return (0);
1252
1253 dead_lock:
1254
1255         /*
1256          * remove all edges
1257          */
1258
1259         ep = FIRST_ADJ(from_lock);
1260
1261         while (ep != HEAD(from_lock)) {
1262                 IN_LIST_REMOVE(ep);
1263                 from_lock->l_sedge = NEXT_ADJ(ep);
1264                 ADJ_LIST_REMOVE(ep);
1265                 flk_free_edge(ep);
1266                 ep = from_lock->l_sedge;
1267         }
1268         return (EDEADLK);
1269 }
1270
1271 /*
1272  * Get an edge structure for representing the dependency between two locks.
1273  */
1274
1275 static edge_t *
1276 flk_get_edge()
1277 {
1278         edge_t  *ep;
1279
1280         ASSERT(flk_edge_cache != NULL);
1281
1282         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1283         edge_allocs++;
1284         return (ep);
1285 }
1286
1287 /*
1288  * Free the edge structure.
1289  */
1290
1291 static void
1292 flk_free_edge(edge_t *ep)
1293 {
1294         edge_frees++;
1295         kmem_cache_free(flk_edge_cache, (void *)ep);
1296 }
1297
1298 /*
1299  * Check the relationship of request with lock and perform the
1300  * recomputation of dependencies, break lock if required, and return
1301  * 1 if request cannot have any more relationship with the next
1302  * active locks.
1303  * The 'lock' and 'request' are compared and in case of overlap we
1304  * delete the 'lock' and form new locks to represent the non-overlapped
1305  * portion of original 'lock'. This function has side effects such as
1306  * 'lock' will be freed, new locks will be added to the active list.
1307  */
1308
1309 static int
1310 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1311 {
1312         int lock_effect;
1313         lock_descriptor_t *lock1, *lock2;
1314         lock_descriptor_t *topology[3];
1315         int nvertex = 0;
1316         int i;
1317         edge_t  *ep;
1318         graph_t *gp = (lock->l_graph);
1319
1320
1321         CHECK_SLEEPING_LOCKS(gp);
1322         CHECK_ACTIVE_LOCKS(gp);
1323
1324         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1325
1326         topology[0] = topology[1] = topology[2] = NULL;
1327
1328         if (request->l_type == F_UNLCK)
1329                 lock_effect = FLK_UNLOCK;
1330         else if (request->l_type == F_RDLCK &&
1331             lock->l_type == F_WRLCK)
1332                 lock_effect = FLK_DOWNGRADE;
1333         else if (request->l_type == F_WRLCK &&
1334             lock->l_type == F_RDLCK)
1335                 lock_effect = FLK_UPGRADE;
1336         else
1337                 lock_effect = FLK_STAY_SAME;
1338
1339         if (lock->l_end < request->l_start) {
1340                 if (lock->l_end == request->l_start - 1 &&
1341                     lock_effect == FLK_STAY_SAME) {
1342                         topology[0] = request;
1343                         request->l_start = lock->l_start;
1344                         nvertex = 1;
1345                         goto recompute;
1346                 } else {
1347                         return (0);
1348                 }
1349         }
1350
1351         if (lock->l_start > request->l_end) {
1352                 if (request->l_end == lock->l_start - 1 &&
1353                     lock_effect == FLK_STAY_SAME) {
1354                         topology[0] = request;
1355                         request->l_end = lock->l_end;
1356                         nvertex = 1;
1357                         goto recompute;
1358                 } else {
1359                         return (1);
1360                 }
1361         }
1362
1363         if (request->l_end < lock->l_end) {
1364                 if (request->l_start > lock->l_start) {
1365                         if (lock_effect == FLK_STAY_SAME) {
1366                                 request->l_start = lock->l_start;
1367                                 request->l_end = lock->l_end;
1368                                 topology[0] = request;
1369                                 nvertex = 1;
1370                         } else {
1371                                 lock1 = flk_get_lock();
1372                                 lock2 = flk_get_lock();
1373                                 COPY(lock1, lock);
1374                                 COPY(lock2, lock);
1375                                 lock1->l_start = lock->l_start;
1376                                 lock1->l_end = request->l_start - 1;
1377                                 lock2->l_start = request->l_end + 1;
1378                                 lock2->l_end = lock->l_end;
1379                                 topology[0] = lock1;
1380                                 topology[1] = lock2;
1381                                 topology[2] = request;
1382                                 nvertex = 3;
1383                         }
1384                 } else if (request->l_start < lock->l_start) {
1385                         if (lock_effect == FLK_STAY_SAME) {
1386                                 request->l_end = lock->l_end;
1387                                 topology[0] = request;
1388                                 nvertex = 1;
1389                         } else {
1390                                 lock1 = flk_get_lock();
1391                                 COPY(lock1, lock);
1392                                 lock1->l_start = request->l_end + 1;
1393                                 topology[0] = lock1;
1394                                 topology[1] = request;
1395                                 nvertex = 2;
1396                         }
1397                 } else  {
1398                         if (lock_effect == FLK_STAY_SAME) {
1399                                 request->l_start = lock->l_start;
1400                                 request->l_end = lock->l_end;
1401                                 topology[0] = request;
1402                                 nvertex = 1;
1403                         } else {
1404                                 lock1 = flk_get_lock();
1405                                 COPY(lock1, lock);
1406                                 lock1->l_start = request->l_end + 1;
1407                                 topology[0] = lock1;
1408                                 topology[1] = request;
1409                                 nvertex = 2;
1410                         }
1411                 }
1412         } else if (request->l_end > lock->l_end) {
1413                 if (request->l_start > lock->l_start)  {
1414                         if (lock_effect == FLK_STAY_SAME) {
1415                                 request->l_start = lock->l_start;
1416                                 topology[0] = request;
1417                                 nvertex = 1;
1418                         } else {
1419                                 lock1 = flk_get_lock();
1420                                 COPY(lock1, lock);
1421                                 lock1->l_end = request->l_start - 1;
1422                                 topology[0] = lock1;
1423                                 topology[1] = request;
1424                                 nvertex = 2;
1425                         }
1426                 } else if (request->l_start < lock->l_start)  {
1427                         topology[0] = request;
1428                         nvertex = 1;
1429                 } else {
1430                         topology[0] = request;
1431                         nvertex = 1;
1432                 }
1433         } else {
1434                 if (request->l_start > lock->l_start) {
1435                         if (lock_effect == FLK_STAY_SAME) {
1436                                 request->l_start = lock->l_start;
1437                                 topology[0] = request;
1438                                 nvertex = 1;
1439                         } else {
1440                                 lock1 = flk_get_lock();
1441                                 COPY(lock1, lock);
1442                                 lock1->l_end = request->l_start - 1;
1443                                 topology[0] = lock1;
1444                                 topology[1] = request;
1445                                 nvertex = 2;
1446                         }
1447                 } else if (request->l_start < lock->l_start) {
1448                         topology[0] = request;
1449                         nvertex = 1;
1450                 } else {
1451                         if (lock_effect !=  FLK_UNLOCK) {
1452                                 topology[0] = request;
1453                                 nvertex = 1;
1454                         } else {
1455                                 flk_delete_active_lock(lock, 0);
1456                                 flk_wakeup(lock, 1);
1457                                 flk_free_lock(lock);
1458                                 CHECK_SLEEPING_LOCKS(gp);
1459                                 CHECK_ACTIVE_LOCKS(gp);
1460                                 return (1);
1461                         }
1462                 }
1463         }
1464
1465 recompute:
1466
1467         /*
1468          * For unlock we don't send the 'request' to for recomputing
1469          * dependencies because no lock will add an edge to this.
1470          */
1471
1472         if (lock_effect == FLK_UNLOCK) {
1473                 topology[nvertex-1] = NULL;
1474                 nvertex--;
1475         }
1476         for (i = 0; i < nvertex; i++) {
1477                 topology[i]->l_state |= RECOMPUTE_LOCK;
1478                 topology[i]->l_color = NO_COLOR;
1479         }
1480
1481         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1482
1483         /*
1484          * we remove the adjacent edges for all vertices' to this vertex
1485          * 'lock'.
1486          */
1487
1488         ep = FIRST_IN(lock);
1489         while (ep != HEAD(lock)) {
1490                 ADJ_LIST_REMOVE(ep);
1491                 ep = NEXT_IN(ep);
1492         }
1493
1494         flk_delete_active_lock(lock, 0);
1495
1496         /* We are ready for recomputing the dependencies now */
1497
1498         flk_recompute_dependencies(lock, topology, nvertex, 1);
1499
1500         for (i = 0; i < nvertex; i++) {
1501                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1502                 topology[i]->l_color = NO_COLOR;
1503         }
1504
1505
1506         if (lock_effect == FLK_UNLOCK) {
1507                 nvertex++;
1508         }
1509         for (i = 0; i < nvertex - 1; i++) {
1510                 flk_insert_active_lock(topology[i]);
1511         }
1512
1513
1514         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1515                 flk_wakeup(lock, 0);
1516         } else {
1517                 ep = FIRST_IN(lock);
1518                 while (ep != HEAD(lock)) {
1519                         lock->l_sedge = NEXT_IN(ep);
1520                         IN_LIST_REMOVE(ep);
1521                         flk_update_proc_graph(ep, 1);
1522                         flk_free_edge(ep);
1523                         ep = lock->l_sedge;
1524                 }
1525         }
1526         flk_free_lock(lock);
1527
1528         CHECK_SLEEPING_LOCKS(gp);
1529         CHECK_ACTIVE_LOCKS(gp);
1530         return (0);
1531 }
1532
1533 /*
1534  * Insert a lock into the active queue.
1535  */
1536
1537 static void
1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1539 {
1540         graph_t *gp = new_lock->l_graph;
1541         vnode_t *vp = new_lock->l_vnode;
1542         lock_descriptor_t *first_lock, *lock;
1543
1544         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1545
1546         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1547         first_lock = lock;
1548
1549         if (first_lock != NULL) {
1550                 for (; (lock->l_vnode == vp &&
1551                     lock->l_start < new_lock->l_start); lock = lock->l_next)
1552                         ;
1553         } else {
1554                 lock = ACTIVE_HEAD(gp);
1555         }
1556
1557         lock->l_prev->l_next = new_lock;
1558         new_lock->l_next = lock;
1559         new_lock->l_prev = lock->l_prev;
1560         lock->l_prev = new_lock;
1561
1562         if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1563                 vp->v_filocks = (struct filock *)new_lock;
1564         }
1565         flk_set_state(new_lock, FLK_ACTIVE_STATE);
1566         new_lock->l_state |= ACTIVE_LOCK;
1567
1568         CHECK_ACTIVE_LOCKS(gp);
1569         CHECK_SLEEPING_LOCKS(gp);
1570 }
1571
1572 /*
1573  * Delete the active lock : Performs two functions depending on the
1574  * value of second parameter. One is to remove from the active lists
1575  * only and other is to both remove and free the lock.
1576  */
1577
1578 static void
1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1580 {
1581         vnode_t *vp = lock->l_vnode;
1582         graph_t *gp = lock->l_graph;
1583
1584         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1585         if (free_lock)
1586                 ASSERT(NO_DEPENDENTS(lock));
1587         ASSERT(NOT_BLOCKED(lock));
1588         ASSERT(IS_ACTIVE(lock));
1589
1590         ASSERT((vp->v_filocks != NULL));
1591
1592         if (vp->v_filocks == (struct filock *)lock) {
1593                 vp->v_filocks = (struct filock *)
1594                     ((lock->l_next->l_vnode == vp) ? lock->l_next :
1595                     NULL);
1596         }
1597         lock->l_next->l_prev = lock->l_prev;
1598         lock->l_prev->l_next = lock->l_next;
1599         lock->l_next = lock->l_prev = NULL;
1600         flk_set_state(lock, FLK_DEAD_STATE);
1601         lock->l_state &= ~ACTIVE_LOCK;
1602
1603         if (free_lock)
1604                 flk_free_lock(lock);
1605         CHECK_ACTIVE_LOCKS(gp);
1606         CHECK_SLEEPING_LOCKS(gp);
1607 }
1608
1609 /*
1610  * Insert into the sleep queue.
1611  */
1612
1613 static void
1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1615 {
1616         graph_t *gp = request->l_graph;
1617         vnode_t *vp = request->l_vnode;
1618         lock_descriptor_t       *lock;
1619
1620         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1621         ASSERT(IS_INITIAL(request));
1622
1623         for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1624             lock->l_vnode < vp); lock = lock->l_next)
1625                 ;
1626
1627         lock->l_prev->l_next = request;
1628         request->l_prev = lock->l_prev;
1629         lock->l_prev = request;
1630         request->l_next = lock;
1631         flk_set_state(request, FLK_SLEEPING_STATE);
1632         request->l_state |= SLEEPING_LOCK;
1633 }
1634
1635 /*
1636  * Cancelling a sleeping lock implies removing a vertex from the
1637  * dependency graph and therefore we should recompute the dependencies
1638  * of all vertices that have a path  to this vertex, w.r.t. all
1639  * vertices reachable from this vertex.
1640  */
1641
1642 void
1643 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1644 {
1645         graph_t *gp = request->l_graph;
1646         vnode_t *vp = request->l_vnode;
1647         lock_descriptor_t **topology = NULL;
1648         edge_t  *ep;
1649         lock_descriptor_t *vertex, *lock;
1650         int nvertex = 0;
1651         int i;
1652         lock_descriptor_t *vertex_stack;
1653
1654         STACK_INIT(vertex_stack);
1655
1656         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1657         /*
1658          * count number of vertex pointers that has to be allocated
1659          * All vertices that are reachable from request.
1660          */
1661
1662         STACK_PUSH(vertex_stack, request, l_stack);
1663
1664         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1665                 STACK_POP(vertex_stack, l_stack);
1666                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1667                     ep = NEXT_ADJ(ep)) {
1668                         if (IS_RECOMPUTE(ep->to_vertex))
1669                                 continue;
1670                         ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1671                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1672                         nvertex++;
1673                 }
1674         }
1675
1676         /*
1677          * allocate memory for holding the vertex pointers
1678          */
1679
1680         if (nvertex) {
1681                 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1682                     KM_SLEEP);
1683         }
1684
1685         /*
1686          * one more pass to actually store the vertices in the
1687          * allocated array.
1688          * We first check sleeping locks and then active locks
1689          * so that topology array will be in a topological
1690          * order.
1691          */
1692
1693         nvertex = 0;
1694         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1695
1696         if (lock) {
1697                 do {
1698                         if (IS_RECOMPUTE(lock)) {
1699                                 lock->l_index = nvertex;
1700                                 topology[nvertex++] = lock;
1701                         }
1702                         lock->l_color = NO_COLOR;
1703                         lock = lock->l_next;
1704                 } while (lock->l_vnode == vp);
1705         }
1706
1707         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1708
1709         if (lock) {
1710                 do {
1711                         if (IS_RECOMPUTE(lock)) {
1712                                 lock->l_index = nvertex;
1713                                 topology[nvertex++] = lock;
1714                         }
1715                         lock->l_color = NO_COLOR;
1716                         lock = lock->l_next;
1717                 } while (lock->l_vnode == vp);
1718         }
1719
1720         /*
1721          * remove in and out edges of request
1722          * They are freed after updating proc_graph below.
1723          */
1724
1725         for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1726                 ADJ_LIST_REMOVE(ep);
1727         }
1728
1729
1730         if (remove_from_queue)
1731                 REMOVE_SLEEP_QUEUE(request);
1732
1733         /* we are ready to recompute */
1734
1735         flk_recompute_dependencies(request, topology, nvertex, 1);
1736
1737         ep = FIRST_ADJ(request);
1738         while (ep != HEAD(request)) {
1739                 IN_LIST_REMOVE(ep);
1740                 request->l_sedge = NEXT_ADJ(ep);
1741                 ADJ_LIST_REMOVE(ep);
1742                 flk_update_proc_graph(ep, 1);
1743                 flk_free_edge(ep);
1744                 ep = request->l_sedge;
1745         }
1746
1747
1748         /*
1749          * unset the RECOMPUTE flag in those vertices
1750          */
1751
1752         for (i = 0; i < nvertex; i++) {
1753                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1754         }
1755
1756         /*
1757          * free the topology
1758          */
1759         if (nvertex)
1760                 kmem_free((void *)topology,
1761                     (nvertex * sizeof (lock_descriptor_t *)));
1762         /*
1763          * Possibility of some locks unblocked now
1764          */
1765
1766         flk_wakeup(request, 0);
1767
1768         /*
1769          * we expect to have a correctly recomputed graph  now.
1770          */
1771         flk_set_state(request, FLK_DEAD_STATE);
1772         flk_free_lock(request);
1773         CHECK_SLEEPING_LOCKS(gp);
1774         CHECK_ACTIVE_LOCKS(gp);
1775
1776 }
1777
1778 /*
1779  * Uncoloring the graph is simply to increment the mark value of the graph
1780  * And only when wrap round takes place will we color all vertices in
1781  * the graph explicitly.
1782  */
1783
1784 static void
1785 flk_graph_uncolor(graph_t *gp)
1786 {
1787         lock_descriptor_t *lock;
1788
1789         if (gp->mark == UINT_MAX) {
1790                 gp->mark = 1;
1791         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1792             lock = lock->l_next)
1793                         lock->l_color  = 0;
1794
1795         for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1796             lock = lock->l_next)
1797                         lock->l_color  = 0;
1798         } else {
1799                 gp->mark++;
1800         }
1801 }
1802
1803 /*
1804  * Wake up locks that are blocked on the given lock.
1805  */
1806
1807 static void
1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1809 {
1810         edge_t  *ep;
1811         graph_t *gp = lock->l_graph;
1812         lock_descriptor_t       *lck;
1813
1814         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1815         if (NO_DEPENDENTS(lock))
1816                 return;
1817         ep = FIRST_IN(lock);
1818         do {
1819                 /*
1820                  * delete the edge from the adjacency list
1821                  * of from vertex. if no more adjacent edges
1822                  * for this vertex wake this process.
1823                  */
1824                 lck = ep->from_vertex;
1825                 if (adj_list_remove)
1826                         ADJ_LIST_REMOVE(ep);
1827                 flk_update_proc_graph(ep, 1);
1828                 if (NOT_BLOCKED(lck)) {
1829                         GRANT_WAKEUP(lck);
1830                 }
1831                 lock->l_sedge = NEXT_IN(ep);
1832                 IN_LIST_REMOVE(ep);
1833                 flk_free_edge(ep);
1834                 ep = lock->l_sedge;
1835         } while (ep != HEAD(lock));
1836         ASSERT(NO_DEPENDENTS(lock));
1837 }
1838
1839 /*
1840  * The dependents of request, is checked for its dependency against the
1841  * locks in topology (called topology because the array is and should be in
1842  * topological order for this algorithm, if not in topological order the
1843  * inner loop below might add more edges than necessary. Topological ordering
1844  * of vertices satisfies the property that all edges will be from left to
1845  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1846  * If lock l1 in the dependent set of request is dependent (blocked by)
1847  * on lock l2 in topology but does not have a path to it, we add an edge
1848  * in the inner loop below.
1849  *
1850  * We don't want to add an edge between l1 and l2 if there exists
1851  * already a path from l1 to l2, so care has to be taken for those vertices
1852  * that  have two paths to 'request'. These vertices are referred to here
1853  * as barrier locks.
1854  *
1855  * The barriers has to be found (those vertex that originally had two paths
1856  * to request) because otherwise we may end up adding edges unnecessarily
1857  * to vertices in topology, and thus barrier vertices can have an edge
1858  * to a vertex in topology as well a path to it.
1859  */
1860
1861 static void
1862 flk_recompute_dependencies(lock_descriptor_t *request,
1863                 lock_descriptor_t **topology,
1864                         int nvertex, int update_graph)
1865 {
1866         lock_descriptor_t *vertex, *lock;
1867         graph_t *gp = request->l_graph;
1868         int i, count;
1869         int barrier_found = 0;
1870         edge_t  *ep;
1871         lock_descriptor_t *vertex_stack;
1872
1873         STACK_INIT(vertex_stack);
1874
1875         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1876         if (nvertex == 0)
1877                 return;
1878         flk_graph_uncolor(request->l_graph);
1879         barrier_found = flk_find_barriers(request);
1880         request->l_state |= RECOMPUTE_DONE;
1881
1882         STACK_PUSH(vertex_stack, request, l_stack);
1883         request->l_sedge = FIRST_IN(request);
1884
1885
1886         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1887                 if (vertex->l_state & RECOMPUTE_DONE) {
1888                         count = 0;
1889                         goto next_in_edge;
1890                 }
1891                 if (IS_BARRIER(vertex)) {
1892                         /* decrement the barrier count */
1893                         if (vertex->l_index) {
1894                                 vertex->l_index--;
1895                                 /* this guy will be pushed again anyway ? */
1896                                 STACK_POP(vertex_stack, l_stack);
1897                                 if (vertex->l_index == 0)  {
1898                                 /*
1899                                  * barrier is over we can recompute
1900                                  * dependencies for this lock in the
1901                                  * next stack pop
1902                                  */
1903                                         vertex->l_state &= ~BARRIER_LOCK;
1904                                 }
1905                                 continue;
1906                         }
1907                 }
1908                 vertex->l_state |= RECOMPUTE_DONE;
1909                 flk_graph_uncolor(gp);
1910                 count = flk_color_reachables(vertex);
1911                 for (i = 0; i < nvertex; i++) {
1912                         lock = topology[i];
1913                         if (COLORED(lock))
1914                                 continue;
1915                         if (BLOCKS(lock, vertex)) {
1916                                 (void) flk_add_edge(vertex, lock,
1917                                     NO_CHECK_CYCLE, update_graph);
1918                                 COLOR(lock);
1919                                 count++;
1920                                 count += flk_color_reachables(lock);
1921                         }
1922
1923                 }
1924
1925 next_in_edge:
1926                 if (count == nvertex ||
1927                     vertex->l_sedge == HEAD(vertex)) {
1928                         /* prune the tree below this */
1929                         STACK_POP(vertex_stack, l_stack);
1930                         vertex->l_state &= ~RECOMPUTE_DONE;
1931                         /* update the barrier locks below this! */
1932                         if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1933                                 flk_graph_uncolor(gp);
1934                                 flk_update_barriers(vertex);
1935                         }
1936                         continue;
1937                 }
1938
1939                 ep = vertex->l_sedge;
1940                 lock = ep->from_vertex;
1941                 STACK_PUSH(vertex_stack, lock, l_stack);
1942                 lock->l_sedge = FIRST_IN(lock);
1943                 vertex->l_sedge = NEXT_IN(ep);
1944         }
1945
1946 }
1947
1948 /*
1949  * Color all reachable vertices from vertex that belongs to topology (here
1950  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1951  *
1952  * Note: we need to use a different stack_link l_stack1 because this is
1953  * called from flk_recompute_dependencies() that already uses a stack with
1954  * l_stack as stack_link.
1955  */
1956
1957 static int
1958 flk_color_reachables(lock_descriptor_t *vertex)
1959 {
1960         lock_descriptor_t *ver, *lock;
1961         int count;
1962         edge_t  *ep;
1963         lock_descriptor_t *vertex_stack;
1964
1965         STACK_INIT(vertex_stack);
1966
1967         STACK_PUSH(vertex_stack, vertex, l_stack1);
1968         count = 0;
1969         while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1970
1971                 STACK_POP(vertex_stack, l_stack1);
1972                 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1973                     ep = NEXT_ADJ(ep)) {
1974                         lock = ep->to_vertex;
1975                         if (COLORED(lock))
1976                                 continue;
1977                         COLOR(lock);
1978                         if (IS_RECOMPUTE(lock))
1979                                 count++;
1980                         STACK_PUSH(vertex_stack, lock, l_stack1);
1981                 }
1982
1983         }
1984         return (count);
1985 }
1986
1987 /*
1988  * Called from flk_recompute_dependencies() this routine decrements
1989  * the barrier count of barrier vertices that are reachable from lock.
1990  */
1991
1992 static void
1993 flk_update_barriers(lock_descriptor_t *lock)
1994 {
1995         lock_descriptor_t *vertex, *lck;
1996         edge_t  *ep;
1997         lock_descriptor_t *vertex_stack;
1998
1999         STACK_INIT(vertex_stack);
2000
2001         STACK_PUSH(vertex_stack, lock, l_stack1);
2002
2003         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2004                 STACK_POP(vertex_stack, l_stack1);
2005                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2006                     ep = NEXT_IN(ep)) {
2007                         lck = ep->from_vertex;
2008                         if (COLORED(lck)) {
2009                                 if (IS_BARRIER(lck)) {
2010                                         ASSERT(lck->l_index > 0);
2011                                         lck->l_index--;
2012                                         if (lck->l_index == 0)
2013                                                 lck->l_state &= ~BARRIER_LOCK;
2014                                 }
2015                                 continue;
2016                         }
2017                         COLOR(lck);
2018                         if (IS_BARRIER(lck)) {
2019                                 ASSERT(lck->l_index > 0);
2020                                 lck->l_index--;
2021                                 if (lck->l_index == 0)
2022                                         lck->l_state &= ~BARRIER_LOCK;
2023                         }
2024                         STACK_PUSH(vertex_stack, lck, l_stack1);
2025                 }
2026         }
2027 }
2028
2029 /*
2030  * Finds all vertices that are reachable from 'lock' more than once and
2031  * mark them as barrier vertices and increment their barrier count.
2032  * The barrier count is one minus the total number of paths from lock
2033  * to that vertex.
2034  */
2035
2036 static int
2037 flk_find_barriers(lock_descriptor_t *lock)
2038 {
2039         lock_descriptor_t *vertex, *lck;
2040         int found = 0;
2041         edge_t  *ep;
2042         lock_descriptor_t *vertex_stack;
2043
2044         STACK_INIT(vertex_stack);
2045
2046         STACK_PUSH(vertex_stack, lock, l_stack1);
2047
2048         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2049                 STACK_POP(vertex_stack, l_stack1);
2050                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2051                     ep = NEXT_IN(ep)) {
2052                         lck = ep->from_vertex;
2053                         if (COLORED(lck)) {
2054                                 /* this is a barrier */
2055                                 lck->l_state |= BARRIER_LOCK;
2056                                 /* index will have barrier count */
2057                                 lck->l_index++;
2058                                 if (!found)
2059                                         found = 1;
2060                                 continue;
2061                         }
2062                         COLOR(lck);
2063                         lck->l_index = 0;
2064                         STACK_PUSH(vertex_stack, lck, l_stack1);
2065                 }
2066         }
2067         return (found);
2068 }
2069
2070 /*
2071  * Finds the first lock that is mainly responsible for blocking this
2072  * request.  If there is no such lock, request->l_flock.l_type is set to
2073  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2074  * of the blocking lock.
2075  *
2076  * Note: It is possible a request is blocked by a sleeping lock because
2077  * of the fairness policy used in flk_process_request() to construct the
2078  * dependencies. (see comments before flk_process_request()).
2079  */
2080
2081 static void
2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2083 {
2084         graph_t *gp = request->l_graph;
2085         vnode_t *vp = request->l_vnode;
2086         lock_descriptor_t *lock, *blocker;
2087
2088         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2089         blocker = NULL;
2090         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2091
2092         if (lock) {
2093                 do {
2094                         if (BLOCKS(lock, request)) {
2095                                 blocker = lock;
2096                                 break;
2097                         }
2098                         lock = lock->l_next;
2099                 } while (lock->l_vnode == vp);
2100         }
2101
2102         if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2103                 /*
2104                  * No active lock is blocking this request, but if a read
2105                  * lock is requested, it may also get blocked by a waiting
2106                  * writer. So search all sleeping locks and see if there is
2107                  * a writer waiting.
2108                  */
2109                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2110                 if (lock) {
2111                         do {
2112                                 if (BLOCKS(lock, request)) {
2113                                         blocker = lock;
2114                                         break;
2115                                 }
2116                                 lock = lock->l_next;
2117                         } while (lock->l_vnode == vp);
2118                 }
2119         }
2120
2121         if (blocker) {
2122                 report_blocker(blocker, request);
2123         } else
2124                 request->l_flock.l_type = F_UNLCK;
2125 }
2126
2127 /*
2128  * Get the graph_t structure associated with a vnode.
2129  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2130  * not yet been initialized, then a new element is allocated and returned.
2131  */
2132 graph_t *
2133 flk_get_lock_graph(vnode_t *vp, int initialize)
2134 {
2135         graph_t *gp;
2136         graph_t *gp_alloc = NULL;
2137         int index = HASH_INDEX(vp);
2138
2139         if (initialize == FLK_USE_GRAPH) {
2140                 mutex_enter(&flock_lock);
2141                 gp = lock_graph[index];
2142                 mutex_exit(&flock_lock);
2143                 return (gp);
2144         }
2145
2146         ASSERT(initialize == FLK_INIT_GRAPH);
2147
2148         if (lock_graph[index] == NULL) {
2149
2150                 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2151
2152                 /* Initialize the graph */
2153
2154                 gp_alloc->active_locks.l_next =
2155                     gp_alloc->active_locks.l_prev =
2156                     (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2157                 gp_alloc->sleeping_locks.l_next =
2158                     gp_alloc->sleeping_locks.l_prev =
2159                     (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2160                 gp_alloc->index = index;
2161                 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2162         }
2163
2164         mutex_enter(&flock_lock);
2165
2166         gp = lock_graph[index];
2167
2168         /* Recheck the value within flock_lock */
2169         if (gp == NULL) {
2170                 struct flock_globals *fg;
2171
2172                 /* We must have previously allocated the graph_t structure */
2173                 ASSERT(gp_alloc != NULL);
2174                 lock_graph[index] = gp = gp_alloc;
2175                 /*
2176                  * The lockmgr status is only needed if KLM is loaded.
2177                  */
2178                 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2179                         fg = flk_get_globals();
2180                         fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2181                 }
2182         }
2183
2184         mutex_exit(&flock_lock);
2185
2186         if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2187                 /* There was a race to allocate the graph_t and we lost */
2188                 mutex_destroy(&gp_alloc->gp_mutex);
2189                 kmem_free(gp_alloc, sizeof (graph_t));
2190         }
2191
2192         return (gp);
2193 }
2194
2195 /*
2196  * PSARC case 1997/292
2197  */
2198 int
2199 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2200 {
2201         lock_descriptor_t *lock;
2202         int result = 0;
2203         graph_t *gp;
2204         int                     lock_nlmid;
2205
2206         /*
2207          * Check to see if node is booted as a cluster. If not, return.
2208          */
2209         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2210                 return (0);
2211         }
2212
2213         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2214         if (gp == NULL) {
2215                 return (0);
2216         }
2217
2218         mutex_enter(&gp->gp_mutex);
2219
2220         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2221
2222         if (lock) {
2223                 while (lock->l_vnode == vp) {
2224                         /* get NLM id from sysid */
2225                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2226
2227                         /*
2228                          * If NLM server request _and_ nlmid of lock matches
2229                          * nlmid of argument, then we've found a remote lock.
2230                          */
2231                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2232                                 result = 1;
2233                                 goto done;
2234                         }
2235                         lock = lock->l_next;
2236                 }
2237         }
2238
2239         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2240
2241         if (lock) {
2242                 while (lock->l_vnode == vp) {
2243                         /* get NLM id from sysid */
2244                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2245
2246                         /*
2247                          * If NLM server request _and_ nlmid of lock matches
2248                          * nlmid of argument, then we've found a remote lock.
2249                          */
2250                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2251                                 result = 1;
2252                                 goto done;
2253                         }
2254                         lock = lock->l_next;
2255                 }
2256         }
2257
2258 done:
2259         mutex_exit(&gp->gp_mutex);
2260         return (result);
2261 }
2262
2263 /*
2264  * Determine whether there are any locks for the given vnode with a remote
2265  * sysid.  Returns zero if not, non-zero if there are.
2266  *
2267  * Note that the return value from this function is potentially invalid
2268  * once it has been returned.  The caller is responsible for providing its
2269  * own synchronization mechanism to ensure that the return value is useful
2270  * (e.g., see nfs_lockcompletion()).
2271  */
2272 int
2273 flk_has_remote_locks(vnode_t *vp)
2274 {
2275         lock_descriptor_t *lock;
2276         int result = 0;
2277         graph_t *gp;
2278
2279         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2280         if (gp == NULL) {
2281                 return (0);
2282         }
2283
2284         mutex_enter(&gp->gp_mutex);
2285
2286         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2287
2288         if (lock) {
2289                 while (lock->l_vnode == vp) {
2290                         if (IS_REMOTE(lock)) {
2291                                 result = 1;
2292                                 goto done;
2293                         }
2294                         lock = lock->l_next;
2295                 }
2296         }
2297
2298         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2299
2300         if (lock) {
2301                 while (lock->l_vnode == vp) {
2302                         if (IS_REMOTE(lock)) {
2303                                 result = 1;
2304                                 goto done;
2305                         }
2306                         lock = lock->l_next;
2307                 }
2308         }
2309
2310 done:
2311         mutex_exit(&gp->gp_mutex);
2312         return (result);
2313 }
2314
2315 /*
2316  * Determine whether there are any locks for the given vnode with a remote
2317  * sysid matching given sysid.
2318  * Used by the new (open source) NFS Lock Manager (NLM)
2319  */
2320 int
2321 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2322 {
2323         lock_descriptor_t *lock;
2324         int result = 0;
2325         graph_t *gp;
2326
2327         if (sysid == 0)
2328                 return (0);
2329
2330         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2331         if (gp == NULL) {
2332                 return (0);
2333         }
2334
2335         mutex_enter(&gp->gp_mutex);
2336
2337         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2338
2339         if (lock) {
2340                 while (lock->l_vnode == vp) {
2341                         if (lock->l_flock.l_sysid == sysid) {
2342                                 result = 1;
2343                                 goto done;
2344                         }
2345                         lock = lock->l_next;
2346                 }
2347         }
2348
2349         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2350
2351         if (lock) {
2352                 while (lock->l_vnode == vp) {
2353                         if (lock->l_flock.l_sysid == sysid) {
2354                                 result = 1;
2355                                 goto done;
2356                         }
2357                         lock = lock->l_next;
2358                 }
2359         }
2360
2361 done:
2362         mutex_exit(&gp->gp_mutex);
2363         return (result);
2364 }
2365
2366 /*
2367  * Determine if there are any locks owned by the given sysid.
2368  * Returns zero if not, non-zero if there are.  Note that this return code
2369  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2370  * avoids all the memory allocations of those routines.
2371  *
2372  * This routine has the same synchronization issues as
2373  * flk_has_remote_locks.
2374  */
2375
2376 int
2377 flk_sysid_has_locks(int sysid, int lck_type)
2378 {
2379         int             has_locks = 0;
2380         lock_descriptor_t       *lock;
2381         graph_t         *gp;
2382         int             i;
2383
2384         for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2385                 mutex_enter(&flock_lock);
2386                 gp = lock_graph[i];
2387                 mutex_exit(&flock_lock);
2388                 if (gp == NULL) {
2389                         continue;
2390                 }
2391
2392                 mutex_enter(&gp->gp_mutex);
2393
2394                 if (lck_type & FLK_QUERY_ACTIVE) {
2395                         for (lock = ACTIVE_HEAD(gp)->l_next;
2396                             lock != ACTIVE_HEAD(gp) && !has_locks;
2397                             lock = lock->l_next) {
2398                                 if (lock->l_flock.l_sysid == sysid)
2399                                         has_locks = 1;
2400                         }
2401                 }
2402
2403                 if (lck_type & FLK_QUERY_SLEEPING) {
2404                         for (lock = SLEEPING_HEAD(gp)->l_next;
2405                             lock != SLEEPING_HEAD(gp) && !has_locks;
2406                             lock = lock->l_next) {
2407                                 if (lock->l_flock.l_sysid == sysid)
2408                                         has_locks = 1;
2409                         }
2410                 }
2411                 mutex_exit(&gp->gp_mutex);
2412         }
2413
2414         return (has_locks);
2415 }
2416
2417
2418 /*
2419  * PSARC case 1997/292
2420  *
2421  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2422  *  quantity, the real sysid generated by the NLM server; the upper half
2423  *  identifies the node of the cluster where the NLM server ran.
2424  *  This routine is only called by an NLM server running in a cluster.
2425  * Effects: Remove all locks held on behalf of the client identified
2426  *  by "sysid."
2427  */
2428 void
2429 cl_flk_remove_locks_by_sysid(int sysid)
2430 {
2431         graph_t *gp;
2432         int i;
2433         lock_descriptor_t *lock, *nlock;
2434
2435         /*
2436          * Check to see if node is booted as a cluster. If not, return.
2437          */
2438         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2439                 return;
2440         }
2441
2442         ASSERT(sysid != 0);
2443         for (i = 0; i < HASH_SIZE; i++) {
2444                 mutex_enter(&flock_lock);
2445                 gp = lock_graph[i];
2446                 mutex_exit(&flock_lock);
2447
2448                 if (gp == NULL)
2449                         continue;
2450
2451                 mutex_enter(&gp->gp_mutex);     /*  get mutex on lock graph */
2452
2453                 /* signal sleeping requests so that they bail out */
2454                 lock = SLEEPING_HEAD(gp)->l_next;
2455                 while (lock != SLEEPING_HEAD(gp)) {
2456                         nlock = lock->l_next;
2457                         if (lock->l_flock.l_sysid == sysid) {
2458                                 INTERRUPT_WAKEUP(lock);
2459                         }
2460                         lock = nlock;
2461                 }
2462
2463                 /* delete active locks */
2464                 lock = ACTIVE_HEAD(gp)->l_next;
2465                 while (lock != ACTIVE_HEAD(gp)) {
2466                         nlock = lock->l_next;
2467                         if (lock->l_flock.l_sysid == sysid) {
2468                                 flk_delete_active_lock(lock, 0);
2469                                 flk_wakeup(lock, 1);
2470                                 flk_free_lock(lock);
2471                         }
2472                         lock = nlock;
2473                 }
2474                 mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2475         }
2476 }
2477
2478 /*
2479  * Delete all locks in the system that belongs to the sysid of the request.
2480  */
2481
2482 static void
2483 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2484 {
2485         int     sysid  = request->l_flock.l_sysid;
2486         lock_descriptor_t *lock, *nlock;
2487         graph_t *gp;
2488         int i;
2489
2490         ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2491         ASSERT(sysid != 0);
2492
2493         mutex_exit(&request->l_graph->gp_mutex);
2494
2495         for (i = 0; i < HASH_SIZE; i++) {
2496                 mutex_enter(&flock_lock);
2497                 gp = lock_graph[i];
2498                 mutex_exit(&flock_lock);
2499
2500                 if (gp == NULL)
2501                         continue;
2502
2503                 mutex_enter(&gp->gp_mutex);
2504
2505                 /* signal sleeping requests so that they bail out */
2506                 lock = SLEEPING_HEAD(gp)->l_next;
2507                 while (lock != SLEEPING_HEAD(gp)) {
2508                         nlock = lock->l_next;
2509                         if (lock->l_flock.l_sysid == sysid) {
2510                                 INTERRUPT_WAKEUP(lock);
2511                         }
2512                         lock = nlock;
2513                 }
2514
2515                 /* delete active locks */
2516                 lock = ACTIVE_HEAD(gp)->l_next;
2517                 while (lock != ACTIVE_HEAD(gp)) {
2518                         nlock = lock->l_next;
2519                         if (lock->l_flock.l_sysid == sysid) {
2520                                 flk_delete_active_lock(lock, 0);
2521                                 flk_wakeup(lock, 1);
2522                                 flk_free_lock(lock);
2523                         }
2524                         lock = nlock;
2525                 }
2526                 mutex_exit(&gp->gp_mutex);
2527         }
2528
2529         mutex_enter(&request->l_graph->gp_mutex);
2530 }
2531
2532 /*
2533  * Clustering: Deletes PXFS locks
2534  * Effects: Delete all locks on files in the given file system and with the
2535  *  given PXFS id.
2536  */
2537 void
2538 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2539 {
2540         lock_descriptor_t *lock, *nlock;
2541         graph_t *gp;
2542         int i;
2543
2544         for (i = 0; i < HASH_SIZE; i++) {
2545                 mutex_enter(&flock_lock);
2546                 gp = lock_graph[i];
2547                 mutex_exit(&flock_lock);
2548
2549                 if (gp == NULL)
2550                         continue;
2551
2552                 mutex_enter(&gp->gp_mutex);
2553
2554                 /* signal sleeping requests so that they bail out */
2555                 lock = SLEEPING_HEAD(gp)->l_next;
2556                 while (lock != SLEEPING_HEAD(gp)) {
2557                         nlock = lock->l_next;
2558                         if (lock->l_vnode->v_vfsp == vfsp) {
2559                                 ASSERT(IS_PXFS(lock));
2560                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2561                                     pxfsid) {
2562                                         flk_set_state(lock,
2563                                             FLK_CANCELLED_STATE);
2564                                         flk_cancel_sleeping_lock(lock, 1);
2565                                 }
2566                         }
2567                         lock = nlock;
2568                 }
2569
2570                 /* delete active locks */
2571                 lock = ACTIVE_HEAD(gp)->l_next;
2572                 while (lock != ACTIVE_HEAD(gp)) {
2573                         nlock = lock->l_next;
2574                         if (lock->l_vnode->v_vfsp == vfsp) {
2575                                 ASSERT(IS_PXFS(lock));
2576                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2577                                     pxfsid) {
2578                                         flk_delete_active_lock(lock, 0);
2579                                         flk_wakeup(lock, 1);
2580                                         flk_free_lock(lock);
2581                                 }
2582                         }
2583                         lock = nlock;
2584                 }
2585                 mutex_exit(&gp->gp_mutex);
2586         }
2587 }
2588
2589 /*
2590  * Search for a sleeping lock manager lock which matches exactly this lock
2591  * request; if one is found, fake a signal to cancel it.
2592  *
2593  * Return 1 if a matching lock was found, 0 otherwise.
2594  */
2595
2596 static int
2597 flk_canceled(lock_descriptor_t *request)
2598 {
2599         lock_descriptor_t *lock, *nlock;
2600         graph_t *gp = request->l_graph;
2601         vnode_t *vp = request->l_vnode;
2602
2603         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2604         ASSERT(IS_LOCKMGR(request));
2605         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2606
2607         if (lock) {
2608                 while (lock->l_vnode == vp) {
2609                         nlock = lock->l_next;
2610                         if (SAME_OWNER(lock, request) &&
2611                             lock->l_start == request->l_start &&
2612                             lock->l_end == request->l_end) {
2613                                 INTERRUPT_WAKEUP(lock);
2614                                 return (1);
2615                         }
2616                         lock = nlock;
2617                 }
2618         }
2619         return (0);
2620 }
2621
2622 /*
2623  * Remove all the locks for the vnode belonging to the given pid and sysid.
2624  */
2625
2626 void
2627 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2628 {
2629         graph_t *gp;
2630         lock_descriptor_t *lock, *nlock;
2631         lock_descriptor_t *link_stack;
2632
2633         STACK_INIT(link_stack);
2634
2635         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2636
2637         if (gp == NULL)
2638                 return;
2639         mutex_enter(&gp->gp_mutex);
2640
2641         CHECK_SLEEPING_LOCKS(gp);
2642         CHECK_ACTIVE_LOCKS(gp);
2643
2644         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2645
2646         if (lock) {
2647                 do {
2648                         nlock = lock->l_next;
2649                         if ((lock->l_flock.l_pid == pid ||
2650                             pid == IGN_PID) &&
2651                             lock->l_flock.l_sysid == sysid) {
2652                                 CANCEL_WAKEUP(lock);
2653                         }
2654                         lock = nlock;
2655                 } while (lock->l_vnode == vp);
2656         }
2657
2658         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2659
2660         if (lock) {
2661                 do {
2662                         nlock = lock->l_next;
2663                         if ((lock->l_flock.l_pid == pid ||
2664                             pid == IGN_PID) &&
2665                             lock->l_flock.l_sysid == sysid) {
2666                                 flk_delete_active_lock(lock, 0);
2667                                 STACK_PUSH(link_stack, lock, l_stack);
2668                         }
2669                         lock = nlock;
2670                 } while (lock->l_vnode == vp);
2671         }
2672
2673         while ((lock = STACK_TOP(link_stack)) != NULL) {
2674                 STACK_POP(link_stack, l_stack);
2675                 flk_wakeup(lock, 1);
2676                 flk_free_lock(lock);
2677         }
2678
2679         CHECK_SLEEPING_LOCKS(gp);
2680         CHECK_ACTIVE_LOCKS(gp);
2681         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2682         mutex_exit(&gp->gp_mutex);
2683 }
2684
2685
2686 /*
2687  * Called from 'fs' read and write routines for files that have mandatory
2688  * locking enabled.
2689  */
2690
2691 int
2692 chklock(
2693         struct vnode    *vp,
2694         int             iomode,
2695         u_offset_t      offset,
2696         ssize_t         len,
2697         int             fmode,
2698         caller_context_t *ct)
2699 {
2700         register int    i;
2701         struct flock64  bf;
2702         int             error = 0;
2703
2704         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2705         bf.l_whence = 0;
2706         bf.l_start = offset;
2707         bf.l_len = len;
2708         if (ct == NULL) {
2709                 bf.l_pid = curproc->p_pid;
2710                 bf.l_sysid = 0;
2711         } else {
2712                 bf.l_pid = ct->cc_pid;
2713                 bf.l_sysid = ct->cc_sysid;
2714         }
2715         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2716         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2717             bf.l_type != F_UNLCK)
2718                 error = i ? i : EAGAIN;
2719         return (error);
2720 }
2721
2722 /*
2723  * convoff - converts the given data (start, whence) to the
2724  * given whence.
2725  */
2726 int
2727 convoff(vp, lckdat, whence, offset)
2728         struct vnode    *vp;
2729         struct flock64  *lckdat;
2730         int             whence;
2731         offset_t        offset;
2732 {
2733         int             error;
2734         struct vattr    vattr;
2735
2736         if ((lckdat->l_whence == 2) || (whence == 2)) {
2737                 vattr.va_mask = AT_SIZE;
2738                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2739                         return (error);
2740         }
2741
2742         switch (lckdat->l_whence) {
2743         case 1:
2744                 lckdat->l_start += offset;
2745                 break;
2746         case 2:
2747                 lckdat->l_start += vattr.va_size;
2748                 /* FALLTHRU */
2749         case 0:
2750                 break;
2751         default:
2752                 return (EINVAL);
2753         }
2754
2755         if (lckdat->l_start < 0)
2756                 return (EINVAL);
2757
2758         switch (whence) {
2759         case 1:
2760                 lckdat->l_start -= offset;
2761                 break;
2762         case 2:
2763                 lckdat->l_start -= vattr.va_size;
2764                 /* FALLTHRU */
2765         case 0:
2766                 break;
2767         default:
2768                 return (EINVAL);
2769         }
2770
2771         lckdat->l_whence = (short)whence;
2772         return (0);
2773 }
2774
2775
2776 /*      proc_graph function definitions */
2777
2778 /*
2779  * Function checks for deadlock due to the new 'lock'. If deadlock found
2780  * edges of this lock are freed and returned.
2781  */
2782
2783 static int
2784 flk_check_deadlock(lock_descriptor_t *lock)
2785 {
2786         proc_vertex_t   *start_vertex, *pvertex;
2787         proc_vertex_t *dvertex;
2788         proc_edge_t *pep, *ppep;
2789         edge_t  *ep, *nep;
2790         proc_vertex_t *process_stack;
2791
2792         STACK_INIT(process_stack);
2793
2794         mutex_enter(&flock_lock);
2795         start_vertex = flk_get_proc_vertex(lock);
2796         ASSERT(start_vertex != NULL);
2797
2798         /* construct the edges from this process to other processes */
2799
2800         ep = FIRST_ADJ(lock);
2801         while (ep != HEAD(lock)) {
2802                 proc_vertex_t *adj_proc;
2803
2804                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2805                 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2806                         if (pep->to_proc == adj_proc) {
2807                                 ASSERT(pep->refcount);
2808                                 pep->refcount++;
2809                                 break;
2810                         }
2811                 }
2812                 if (pep == NULL) {
2813                         pep = flk_get_proc_edge();
2814                         pep->to_proc = adj_proc;
2815                         pep->refcount = 1;
2816                         adj_proc->incount++;
2817                         pep->next = start_vertex->edge;
2818                         start_vertex->edge = pep;
2819                 }
2820                 ep = NEXT_ADJ(ep);
2821         }
2822
2823         ep = FIRST_IN(lock);
2824
2825         while (ep != HEAD(lock)) {
2826                 proc_vertex_t *in_proc;
2827
2828                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2829
2830                 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2831                         if (pep->to_proc == start_vertex) {
2832                                 ASSERT(pep->refcount);
2833                                 pep->refcount++;
2834                                 break;
2835                         }
2836                 }
2837                 if (pep == NULL) {
2838                         pep = flk_get_proc_edge();
2839                         pep->to_proc = start_vertex;
2840                         pep->refcount = 1;
2841                         start_vertex->incount++;
2842                         pep->next = in_proc->edge;
2843                         in_proc->edge = pep;
2844                 }
2845                 ep = NEXT_IN(ep);
2846         }
2847
2848         if (start_vertex->incount == 0) {
2849                 mutex_exit(&flock_lock);
2850                 return (0);
2851         }
2852
2853         flk_proc_graph_uncolor();
2854
2855         start_vertex->p_sedge = start_vertex->edge;
2856
2857         STACK_PUSH(process_stack, start_vertex, p_stack);
2858
2859         while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2860                 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2861                         dvertex = pep->to_proc;
2862                         if (!PROC_ARRIVED(dvertex)) {
2863                                 STACK_PUSH(process_stack, dvertex, p_stack);
2864                                 dvertex->p_sedge = dvertex->edge;
2865                                 PROC_ARRIVE(pvertex);
2866                                 pvertex->p_sedge = pep->next;
2867                                 break;
2868                         }
2869                         if (!PROC_DEPARTED(dvertex))
2870                                 goto deadlock;
2871                 }
2872                 if (pep == NULL) {
2873                         PROC_DEPART(pvertex);
2874                         STACK_POP(process_stack, p_stack);
2875                 }
2876         }
2877         mutex_exit(&flock_lock);
2878         return (0);
2879
2880 deadlock:
2881
2882         /* we remove all lock edges and proc edges */
2883
2884         ep = FIRST_ADJ(lock);
2885         while (ep != HEAD(lock)) {
2886                 proc_vertex_t *adj_proc;
2887                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2888                 nep = NEXT_ADJ(ep);
2889                 IN_LIST_REMOVE(ep);
2890                 ADJ_LIST_REMOVE(ep);
2891                 flk_free_edge(ep);
2892                 ppep = start_vertex->edge;
2893                 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2894                     pep = ppep->next) {
2895                         if (pep->to_proc == adj_proc) {
2896                                 pep->refcount--;
2897                                 if (pep->refcount == 0) {
2898                                         if (pep == ppep) {
2899                                                 start_vertex->edge = pep->next;
2900                                         } else {
2901                                                 ppep->next = pep->next;
2902                                         }
2903                                         adj_proc->incount--;
2904                                         flk_proc_release(adj_proc);
2905                                         flk_free_proc_edge(pep);
2906                                 }
2907                                 break;
2908                         }
2909                 }
2910                 ep = nep;
2911         }
2912         ep = FIRST_IN(lock);
2913         while (ep != HEAD(lock)) {
2914                 proc_vertex_t *in_proc;
2915                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2916                 nep = NEXT_IN(ep);
2917                 IN_LIST_REMOVE(ep);
2918                 ADJ_LIST_REMOVE(ep);
2919                 flk_free_edge(ep);
2920                 ppep = in_proc->edge;
2921                 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2922                     pep = ppep->next) {
2923                         if (pep->to_proc == start_vertex) {
2924                                 pep->refcount--;
2925                                 if (pep->refcount == 0) {
2926                                         if (pep == ppep) {
2927                                                 in_proc->edge = pep->next;
2928                                         } else {
2929                                                 ppep->next = pep->next;
2930                                         }
2931                                         start_vertex->incount--;
2932                                         flk_proc_release(in_proc);
2933                                         flk_free_proc_edge(pep);
2934                                 }
2935                                 break;
2936                         }
2937                 }
2938                 ep = nep;
2939         }
2940         flk_proc_release(start_vertex);
2941         mutex_exit(&flock_lock);
2942         return (1);
2943 }
2944
2945 /*
2946  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2947  * from the list we return that, otherwise we allocate one. If necessary,
2948  * we grow the list of vertices also.
2949  */
2950
2951 static proc_vertex_t *
2952 flk_get_proc_vertex(lock_descriptor_t *lock)
2953 {
2954         int i;
2955         proc_vertex_t   *pv;
2956         proc_vertex_t   **palloc;
2957
2958         ASSERT(MUTEX_HELD(&flock_lock));
2959         if (lock->pvertex != -1) {
2960                 ASSERT(lock->pvertex >= 0);
2961                 pv = pgraph.proc[lock->pvertex];
2962                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2963                         return (pv);
2964                 }
2965         }
2966         for (i = 0; i < pgraph.gcount; i++) {
2967                 pv = pgraph.proc[i];
2968                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2969                         lock->pvertex = pv->index = i;
2970                         return (pv);
2971                 }
2972         }
2973         pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2974         pv->pid = lock->l_flock.l_pid;
2975         pv->sysid = lock->l_flock.l_sysid;
2976         flk_proc_vertex_allocs++;
2977         if (pgraph.free != 0) {
2978                 for (i = 0; i < pgraph.gcount; i++) {
2979                         if (pgraph.proc[i] == NULL) {
2980                                 pgraph.proc[i] = pv;
2981                                 lock->pvertex = pv->index = i;
2982                                 pgraph.free--;
2983                                 return (pv);
2984                         }
2985                 }
2986         }
2987         palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2988             sizeof (proc_vertex_t *), KM_SLEEP);
2989
2990         if (pgraph.proc) {
2991                 bcopy(pgraph.proc, palloc,
2992                     pgraph.gcount * sizeof (proc_vertex_t *));
2993
2994                 kmem_free(pgraph.proc,
2995                     pgraph.gcount * sizeof (proc_vertex_t *));
2996         }
2997         pgraph.proc = palloc;
2998         pgraph.free += (PROC_CHUNK - 1);
2999         pv->index = lock->pvertex = pgraph.gcount;
3000         pgraph.gcount += PROC_CHUNK;
3001         pgraph.proc[pv->index] = pv;
3002         return (pv);
3003 }
3004
3005 /*
3006  * Allocate a proc edge.
3007  */
3008
3009 static proc_edge_t *
3010 flk_get_proc_edge()
3011 {
3012         proc_edge_t *pep;
3013
3014         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3015         flk_proc_edge_allocs++;
3016         return (pep);
3017 }
3018
3019 /*
3020  * Free the proc edge. Called whenever its reference count goes to zero.
3021  */
3022
3023 static void
3024 flk_free_proc_edge(proc_edge_t *pep)
3025 {
3026         ASSERT(pep->refcount == 0);
3027         kmem_free((void *)pep, sizeof (proc_edge_t));
3028         flk_proc_edge_frees++;
3029 }
3030
3031 /*
3032  * Color the graph explicitly done only when the mark value hits max value.
3033  */
3034
3035 static void
3036 flk_proc_graph_uncolor()
3037 {
3038         int i;
3039
3040         if (pgraph.mark == UINT_MAX) {
3041                 for (i = 0; i < pgraph.gcount; i++)
3042                         if (pgraph.proc[i] != NULL) {
3043                                 pgraph.proc[i]->atime = 0;
3044                                 pgraph.proc[i]->dtime = 0;
3045                         }
3046                 pgraph.mark = 1;
3047         } else {
3048                 pgraph.mark++;
3049         }
3050 }
3051
3052 /*
3053  * Release the proc vertex iff both there are no in edges and out edges
3054  */
3055
3056 static void
3057 flk_proc_release(proc_vertex_t *proc)
3058 {
3059         ASSERT(MUTEX_HELD(&flock_lock));
3060         if (proc->edge == NULL && proc->incount == 0) {
3061                 pgraph.proc[proc->index] = NULL;
3062                 pgraph.free++;
3063                 kmem_free(proc, sizeof (proc_vertex_t));
3064                 flk_proc_vertex_frees++;
3065         }
3066 }
3067
3068 /*
3069  * Updates process graph to reflect change in a lock_graph.
3070  * Note: We should call this function only after we have a correctly
3071  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3072  * eg: in function flk_relation() we call this function after flk_recompute_
3073  * dependencies() otherwise if a process tries to lock a vnode hashed
3074  * into another graph it might sleep for ever.
3075  */
3076
3077 static void
3078 flk_update_proc_graph(edge_t *ep, int delete)
3079 {
3080         proc_vertex_t *toproc, *fromproc;
3081         proc_edge_t *pep, *prevpep;
3082
3083         mutex_enter(&flock_lock);
3084         toproc = flk_get_proc_vertex(ep->to_vertex);
3085         fromproc = flk_get_proc_vertex(ep->from_vertex);
3086
3087         if (!delete)
3088                 goto add;
3089         pep = prevpep = fromproc->edge;
3090
3091         ASSERT(pep != NULL);
3092         while (pep != NULL) {
3093                 if (pep->to_proc == toproc) {
3094                         ASSERT(pep->refcount > 0);
3095                         pep->refcount--;
3096                         if (pep->refcount == 0) {
3097                                 if (pep == prevpep) {
3098                                         fromproc->edge = pep->next;
3099                                 } else {
3100                                         prevpep->next = pep->next;
3101                                 }
3102                                 toproc->incount--;
3103                                 flk_proc_release(toproc);
3104                                 flk_free_proc_edge(pep);
3105                         }
3106                         break;
3107                 }
3108                 prevpep = pep;
3109                 pep = pep->next;
3110         }
3111         flk_proc_release(fromproc);
3112         mutex_exit(&flock_lock);
3113         return;
3114 add:
3115
3116         pep = fromproc->edge;
3117
3118         while (pep != NULL) {
3119                 if (pep->to_proc == toproc) {
3120                         ASSERT(pep->refcount > 0);
3121                         pep->refcount++;
3122                         break;
3123                 }
3124                 pep = pep->next;
3125         }
3126         if (pep == NULL) {
3127                 pep = flk_get_proc_edge();
3128                 pep->to_proc = toproc;
3129                 pep->refcount = 1;
3130                 toproc->incount++;
3131                 pep->next = fromproc->edge;
3132                 fromproc->edge = pep;
3133         }
3134         mutex_exit(&flock_lock);
3135 }
3136
3137 /*
3138  * Set the control status for lock manager requests.
3139  *
3140  */
3141
3142 /*
3143  * PSARC case 1997/292
3144  *
3145  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3146  * Effects: Set the state of the NLM server identified by "nlmid"
3147  *   in the NLM registry to state "nlm_state."
3148  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3149  *   NLM server to this LLM.
3150  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3151  *   may be locks requests that have gotten started but not finished.  In
3152  *   particular, there may be blocking requests that are in the callback code
3153  *   before sleeping (so they're not holding the lock for the graph).  If
3154  *   such a thread reacquires the graph's lock (to go to sleep) after
3155  *   NLM state in the NLM registry  is set to a non-up value,
3156  *   it will notice the status and bail out.  If the request gets
3157  *   granted before the thread can check the NLM registry, let it
3158  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3159  *
3160  * Modifies: nlm_reg_obj (global)
3161  * Arguments:
3162  *    nlmid     (IN):    id uniquely identifying an NLM server
3163  *    nlm_state (IN):    NLM server state to change "nlmid" to
3164  */
3165 void
3166 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3167 {
3168         /*
3169          * Check to see if node is booted as a cluster. If not, return.
3170          */
3171         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3172                 return;
3173         }
3174
3175         /*
3176          * Check for development/debugging.  It is possible to boot a node
3177          * in non-cluster mode, and then run a special script, currently
3178          * available only to developers, to bring up the node as part of a
3179          * cluster.  The problem is that running such a script does not
3180          * result in the routine flk_init() being called and hence global array
3181          * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3182          * but the LLM needs to do an additional check to see if the global
3183          * array has been created or not. If nlm_reg_status is NULL, then
3184          * return, else continue.
3185          */
3186         if (nlm_reg_status == NULL) {
3187                 return;
3188         }
3189
3190         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3191         mutex_enter(&nlm_reg_lock);
3192
3193         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3194                 /*
3195                  * If the NLM server "nlmid" is unknown in the NLM registry,
3196                  * add it to the registry in the nlm shutting down state.
3197                  */
3198                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3199                     FLK_NLM_SHUTTING_DOWN);
3200         } else {
3201                 /*
3202                  * Change the state of the NLM server identified by "nlmid"
3203                  * in the NLM registry to the argument "nlm_state."
3204                  */
3205                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3206                     nlm_state);
3207         }
3208
3209         /*
3210          *  The reason we must register the NLM server that is shutting down
3211          *  with an LLM that doesn't already know about it (never sent a lock
3212          *  request) is to handle correctly a race between shutdown and a new
3213          *  lock request.  Suppose that a shutdown request from the NLM server
3214          *  invokes this routine at the LLM, and a thread is spawned to
3215          *  service the request. Now suppose a new lock request is in
3216          *  progress and has already passed the first line of defense in
3217          *  reclock(), which denies new locks requests from NLM servers
3218          *  that are not in the NLM_UP state.  After the current routine
3219          *  is invoked for both phases of shutdown, the routine will return,
3220          *  having done nothing, and the lock request will proceed and
3221          *  probably be granted.  The problem is that the shutdown was ignored
3222          *  by the lock request because there was no record of that NLM server
3223          *  shutting down.   We will be in the peculiar position of thinking
3224          *  that we've shutdown the NLM server and all locks at all LLMs have
3225          *  been discarded, but in fact there's still one lock held.
3226          *  The solution is to record the existence of NLM server and change
3227          *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3228          *  progress may proceed because the next phase NLM_DOWN will catch
3229          *  this lock and discard it.
3230          */
3231         mutex_exit(&nlm_reg_lock);
3232
3233         switch (nlm_state) {
3234         case FLK_NLM_UP:
3235                 /*
3236                  * Change the NLM state of all locks still held on behalf of
3237                  * the NLM server identified by "nlmid" to NLM_UP.
3238                  */
3239                 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3240                 break;
3241
3242         case FLK_NLM_SHUTTING_DOWN:
3243                 /*
3244                  * Wake up all sleeping locks for the NLM server identified
3245                  * by "nlmid." Note that eventually all woken threads will
3246                  * have their lock requests cancelled and descriptors
3247                  * removed from the sleeping lock list.  Note that the NLM
3248                  * server state associated with each lock descriptor is
3249                  * changed to FLK_NLM_SHUTTING_DOWN.
3250                  */
3251                 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3252                 break;
3253
3254         case FLK_NLM_DOWN:
3255                 /*
3256                  * Discard all active, granted locks for this NLM server
3257                  * identified by "nlmid."
3258                  */
3259                 cl_flk_unlock_nlm_granted(nlmid);
3260                 break;
3261
3262         default:
3263                 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3264         }
3265 }
3266
3267 /*
3268  * Set the control status for lock manager requests.
3269  *
3270  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3271  * may be locks requests that have gotten started but not finished.  In
3272  * particular, there may be blocking requests that are in the callback code
3273  * before sleeping (so they're not holding the lock for the graph).  If
3274  * such a thread reacquires the graph's lock (to go to sleep) after
3275  * flk_lockmgr_status is set to a non-up value, it will notice the status
3276  * and bail out.  If the request gets granted before the thread can check
3277  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3278  * we are called with FLK_LOCKMGR_DOWN.
3279  */
3280
3281 void
3282 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3283 {
3284         int i;
3285         graph_t *gp;
3286         struct flock_globals *fg;
3287
3288         fg = flk_get_globals();
3289         ASSERT(fg != NULL);
3290
3291         mutex_enter(&flock_lock);
3292         fg->flk_lockmgr_status = status;
3293         mutex_exit(&flock_lock);
3294
3295         /*
3296          * If the lock manager is coming back up, all that's needed is to
3297          * propagate this information to the graphs.  If the lock manager
3298          * is going down, additional action is required, and each graph's
3299          * copy of the state is updated atomically with this other action.
3300          */
3301         switch (status) {
3302         case FLK_LOCKMGR_UP:
3303                 for (i = 0; i < HASH_SIZE; i++) {
3304                         mutex_enter(&flock_lock);
3305                         gp = lock_graph[i];
3306                         mutex_exit(&flock_lock);
3307                         if (gp == NULL)
3308                                 continue;
3309                         mutex_enter(&gp->gp_mutex);
3310                         fg->lockmgr_status[i] = status;
3311                         mutex_exit(&gp->gp_mutex);
3312                 }
3313                 break;
3314         case FLK_WAKEUP_SLEEPERS:
3315                 wakeup_sleeping_lockmgr_locks(fg);
3316                 break;
3317         case FLK_LOCKMGR_DOWN:
3318                 unlock_lockmgr_granted(fg);
3319                 break;
3320         default:
3321                 panic("flk_set_lockmgr_status: bad status (%d)", status);
3322                 break;
3323         }
3324 }
3325
3326 /*
3327  * This routine returns all the locks that are active or sleeping and are
3328  * associated with a particular set of identifiers.  If lock_state != 0, then
3329  * only locks that match the lock_state are returned. If lock_state == 0, then
3330  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3331  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3332  * vnode pointer is ignored.
3333  *
3334  * A list containing the vnode pointer and an flock structure
3335  * describing the lock is returned.  Each element in the list is
3336  * dynamically allocated and must be freed by the caller.  The
3337  * last item in the list is denoted by a NULL value in the ll_next
3338  * field.
3339  *
3340  * The vnode pointers returned are held.  The caller is responsible
3341  * for releasing these.  Note that the returned list is only a snapshot of
3342  * the current lock information, and that it is a snapshot of a moving
3343  * target (only one graph is locked at a time).
3344  */
3345
3346 locklist_t *
3347 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3348                 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3349 {
3350         lock_descriptor_t       *lock;
3351         lock_descriptor_t       *graph_head;
3352         locklist_t              listhead;
3353         locklist_t              *llheadp;
3354         locklist_t              *llp;
3355         locklist_t              *lltp;
3356         graph_t                 *gp;
3357         int                     i;
3358         int                     first_index; /* graph index */
3359         int                     num_indexes; /* graph index */
3360
3361         ASSERT((list_type == FLK_ACTIVE_STATE) ||
3362             (list_type == FLK_SLEEPING_STATE));
3363
3364         /*
3365          * Get a pointer to something to use as a list head while building
3366          * the rest of the list.
3367          */
3368         llheadp = &listhead;
3369         lltp = llheadp;
3370         llheadp->ll_next = (locklist_t *)NULL;
3371
3372         /* Figure out which graphs we want to look at. */
3373         if (vp == NULL) {
3374                 first_index = 0;
3375                 num_indexes = HASH_SIZE;
3376         } else {
3377                 first_index = HASH_INDEX(vp);
3378                 num_indexes = 1;
3379         }
3380
3381         for (i = first_index; i < first_index + num_indexes; i++) {
3382                 mutex_enter(&flock_lock);
3383                 gp = lock_graph[i];
3384                 mutex_exit(&flock_lock);
3385                 if (gp == NULL) {
3386                         continue;
3387                 }
3388
3389                 mutex_enter(&gp->gp_mutex);
3390                 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3391                     ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3392                 for (lock = graph_head->l_next;
3393                     lock != graph_head;
3394                     lock = lock->l_next) {
3395                         if (use_sysid && lock->l_flock.l_sysid != sysid)
3396                                 continue;
3397                         if (pid != NOPID && lock->l_flock.l_pid != pid)
3398                                 continue;
3399                         if (vp != NULL && lock->l_vnode != vp)
3400                                 continue;
3401                         if (lock_state && !(lock_state & lock->l_state))
3402                                 continue;
3403                         if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3404                                 continue;
3405                         /*
3406                          * A matching lock was found.  Allocate
3407                          * space for a new locklist entry and fill
3408                          * it in.
3409                          */
3410                         llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3411                         lltp->ll_next = llp;
3412                         VN_HOLD(lock->l_vnode);
3413                         llp->ll_vp = lock->l_vnode;
3414                         create_flock(lock, &(llp->ll_flock));
3415                         llp->ll_next = (locklist_t *)NULL;
3416                         lltp = llp;
3417                 }
3418                 mutex_exit(&gp->gp_mutex);
3419         }
3420
3421         llp = llheadp->ll_next;
3422         return (llp);
3423 }
3424
3425 /*
3426  * These two functions are simply interfaces to get_lock_list.  They return
3427  * a list of sleeping or active locks for the given sysid and pid.  See
3428  * get_lock_list for details.
3429  *
3430  * In either case we don't particularly care to specify the zone of interest;
3431  * the sysid-space is global across zones, so the sysid will map to exactly one
3432  * zone, and we'll return information for that zone.
3433  */
3434
3435 locklist_t *
3436 flk_get_sleeping_locks(int sysid, pid_t pid)
3437 {
3438         return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3439             ALL_ZONES));
3440 }
3441
3442 locklist_t *
3443 flk_get_active_locks(int sysid, pid_t pid)
3444 {
3445         return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3446             ALL_ZONES));
3447 }
3448
3449 /*
3450  * Another interface to get_lock_list.  This one returns all the active
3451  * locks for a given vnode.  Again, see get_lock_list for details.
3452  *
3453  * We don't need to specify which zone's locks we're interested in.  The matter
3454  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3455  * be used by multiple zones, so the list of locks will all be from the right
3456  * zone.
3457  */
3458
3459 locklist_t *
3460 flk_active_locks_for_vp(const vnode_t *vp)
3461 {
3462         return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3463             ALL_ZONES));
3464 }
3465
3466 /*
3467  * Another interface to get_lock_list.  This one returns all the active
3468  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3469  *
3470  * See the comment for flk_active_locks_for_vp() for why we don't care to
3471  * specify the particular zone of interest.
3472  */
3473 locklist_t *
3474 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3475 {
3476         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3477             NOPID, vp, ALL_ZONES));
3478 }
3479
3480 /*
3481  * Another interface to get_lock_list.  This one returns all the active
3482  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3483  *
3484  * The zone doesn't need to be specified here; the locks held by a
3485  * particular process will either be local (ie, non-NFS) or from the zone
3486  * the process is executing in.  This is because other parts of the system
3487  * ensure that an NFS vnode can't be used in a zone other than that in
3488  * which it was opened.
3489  */
3490 locklist_t *
3491 flk_active_nbmand_locks(pid_t pid)
3492 {
3493         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3494             pid, NULL, ALL_ZONES));
3495 }
3496
3497 /*
3498  * Free up all entries in the locklist.
3499  */
3500 void
3501 flk_free_locklist(locklist_t *llp)
3502 {
3503         locklist_t *next_llp;
3504
3505         while (llp) {
3506                 next_llp = llp->ll_next;
3507                 VN_RELE(llp->ll_vp);
3508                 kmem_free(llp, sizeof (*llp));
3509                 llp = next_llp;
3510         }
3511 }
3512
3513 static void
3514 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3515 {
3516         /*
3517          * For each graph "lg" in the hash table lock_graph do
3518          * a.  Get the list of sleeping locks
3519          * b.  For each lock descriptor in the list do
3520          *      i.   If the requested lock is an NLM server request AND
3521          *              the nlmid is the same as the routine argument then
3522          *              change the lock descriptor's state field to
3523          *              "nlm_state."
3524          * c.  Get the list of active locks
3525          * d.  For each lock descriptor in the list do
3526          *      i.   If the requested lock is an NLM server request AND
3527          *              the nlmid is the same as the routine argument then
3528          *              change the lock descriptor's state field to
3529          *              "nlm_state."
3530          */
3531
3532         int                     i;
3533         graph_t                 *gp;                    /* lock graph */
3534         lock_descriptor_t       *lock;                  /* lock */
3535         lock_descriptor_t       *nlock = NULL;          /* next lock */
3536         int                     lock_nlmid;
3537
3538         for (i = 0; i < HASH_SIZE; i++) {
3539                 mutex_enter(&flock_lock);
3540                 gp = lock_graph[i];
3541                 mutex_exit(&flock_lock);
3542                 if (gp == NULL) {
3543                         continue;
3544                 }
3545
3546                 /* Get list of sleeping locks in current lock graph. */
3547                 mutex_enter(&gp->gp_mutex);
3548                 for (lock = SLEEPING_HEAD(gp)->l_next;
3549                     lock != SLEEPING_HEAD(gp);
3550                     lock = nlock) {
3551                         nlock = lock->l_next;
3552                         /* get NLM id */
3553                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3554
3555                         /*
3556                          * If NLM server request AND nlmid of lock matches
3557                          * nlmid of argument, then set the NLM state of the
3558                          * lock to "nlm_state."
3559                          */
3560                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3561                                 SET_NLM_STATE(lock, nlm_state);
3562                         }
3563                 }
3564
3565                 /* Get list of active locks in current lock graph. */
3566                 for (lock = ACTIVE_HEAD(gp)->l_next;
3567                     lock != ACTIVE_HEAD(gp);
3568                     lock = nlock) {
3569                         nlock = lock->l_next;
3570                         /* get NLM id */
3571                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3572
3573                         /*
3574                          * If NLM server request AND nlmid of lock matches
3575                          * nlmid of argument, then set the NLM state of the
3576                          * lock to "nlm_state."
3577                          */
3578                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3579                                 ASSERT(IS_ACTIVE(lock));
3580                                 SET_NLM_STATE(lock, nlm_state);
3581                         }
3582                 }
3583                 mutex_exit(&gp->gp_mutex);
3584         }
3585 }
3586
3587 /*
3588  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3589  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3590  *   identified by "nlmid." Poke those lock requests.
3591  */
3592 static void
3593 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3594 {
3595         lock_descriptor_t *lock;
3596         lock_descriptor_t *nlock = NULL; /* next lock */
3597         int i;
3598         graph_t *gp;
3599         int     lock_nlmid;
3600
3601         for (i = 0; i < HASH_SIZE; i++) {
3602                 mutex_enter(&flock_lock);
3603                 gp = lock_graph[i];
3604                 mutex_exit(&flock_lock);
3605                 if (gp == NULL) {
3606                         continue;
3607                 }
3608
3609                 mutex_enter(&gp->gp_mutex);
3610                 for (lock = SLEEPING_HEAD(gp)->l_next;
3611                     lock != SLEEPING_HEAD(gp);
3612                     lock = nlock) {
3613                         nlock = lock->l_next;
3614                         /*
3615                          * If NLM server request _and_ nlmid of lock matches
3616                          * nlmid of argument, then set the NLM state of the
3617                          * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3618                          * request.
3619                          */
3620                         if (IS_LOCKMGR(lock)) {
3621                                 /* get NLM id */
3622                                 lock_nlmid =
3623                                     GETNLMID(lock->l_flock.l_sysid);
3624                                 if (nlmid == lock_nlmid) {
3625                                         SET_NLM_STATE(lock,
3626                                             FLK_NLM_SHUTTING_DOWN);
3627                                         INTERRUPT_WAKEUP(lock);
3628                                 }
3629                         }
3630                 }
3631                 mutex_exit(&gp->gp_mutex);
3632         }
3633 }
3634
3635 /*
3636  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3637  * Effects:  Find all active (granted) lock manager locks _only_ for the
3638  *   NLM server identified by "nlmid" and release them.
3639  */
3640 static void
3641 cl_flk_unlock_nlm_granted(int nlmid)
3642 {
3643         lock_descriptor_t *lock;
3644         lock_descriptor_t *nlock = NULL; /* next lock */
3645         int i;
3646         graph_t *gp;
3647         int     lock_nlmid;
3648
3649         for (i = 0; i < HASH_SIZE; i++) {
3650                 mutex_enter(&flock_lock);
3651                 gp = lock_graph[i];
3652                 mutex_exit(&flock_lock);
3653                 if (gp == NULL) {
3654                         continue;
3655                 }
3656
3657                 mutex_enter(&gp->gp_mutex);
3658                 for (lock = ACTIVE_HEAD(gp)->l_next;
3659                     lock != ACTIVE_HEAD(gp);
3660                     lock = nlock) {
3661                         nlock = lock->l_next;
3662                         ASSERT(IS_ACTIVE(lock));
3663
3664                         /*
3665                          * If it's an  NLM server request _and_ nlmid of
3666                          * the lock matches nlmid of argument, then
3667                          * remove the active lock the list, wakup blocked
3668                          * threads, and free the storage for the lock.
3669                          * Note that there's no need to mark the NLM state
3670                          * of this lock to NLM_DOWN because the lock will
3671                          * be deleted anyway and its storage freed.
3672                          */
3673                         if (IS_LOCKMGR(lock)) {
3674                                 /* get NLM id */
3675                                 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3676                                 if (nlmid == lock_nlmid) {
3677                                         flk_delete_active_lock(lock, 0);
3678                                         flk_wakeup(lock, 1);
3679                                         flk_free_lock(lock);
3680                                 }
3681                         }
3682                 }
3683                 mutex_exit(&gp->gp_mutex);
3684         }
3685 }
3686
3687 /*
3688  * Find all sleeping lock manager requests and poke them.
3689  */
3690 static void
3691 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3692 {
3693         lock_descriptor_t *lock;
3694         lock_descriptor_t *nlock = NULL; /* next lock */
3695         int i;
3696         graph_t *gp;
3697         zoneid_t zoneid = getzoneid();
3698
3699         for (i = 0; i < HASH_SIZE; i++) {
3700                 mutex_enter(&flock_lock);
3701                 gp = lock_graph[i];
3702                 mutex_exit(&flock_lock);
3703                 if (gp == NULL) {
3704                         continue;
3705                 }
3706
3707                 mutex_enter(&gp->gp_mutex);
3708                 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3709                 for (lock = SLEEPING_HEAD(gp)->l_next;
3710                     lock != SLEEPING_HEAD(gp);
3711                     lock = nlock) {
3712                         nlock = lock->l_next;
3713                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3714                                 INTERRUPT_WAKEUP(lock);
3715                         }
3716                 }
3717                 mutex_exit(&gp->gp_mutex);
3718         }
3719 }
3720
3721
3722 /*
3723  * Find all active (granted) lock manager locks and release them.
3724  */
3725 static void
3726 unlock_lockmgr_granted(struct flock_globals *fg)
3727 {
3728         lock_descriptor_t *lock;
3729         lock_descriptor_t *nlock = NULL; /* next lock */
3730         int i;
3731         graph_t *gp;
3732         zoneid_t zoneid = getzoneid();
3733
3734         for (i = 0; i < HASH_SIZE; i++) {
3735                 mutex_enter(&flock_lock);
3736                 gp = lock_graph[i];
3737                 mutex_exit(&flock_lock);
3738                 if (gp == NULL) {
3739                         continue;
3740                 }
3741
3742                 mutex_enter(&gp->gp_mutex);
3743                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3744                 for (lock = ACTIVE_HEAD(gp)->l_next;
3745                     lock != ACTIVE_HEAD(gp);
3746                     lock = nlock) {
3747                         nlock = lock->l_next;
3748                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3749                                 ASSERT(IS_ACTIVE(lock));
3750                                 flk_delete_active_lock(lock, 0);
3751                                 flk_wakeup(lock, 1);
3752                                 flk_free_lock(lock);
3753                         }
3754                 }
3755                 mutex_exit(&gp->gp_mutex);
3756         }
3757 }
3758
3759
3760 /*
3761  * Wait until a lock is granted, cancelled, or interrupted.
3762  */
3763
3764 static void
3765 wait_for_lock(lock_descriptor_t *request)
3766 {
3767         graph_t *gp = request->l_graph;
3768
3769         ASSERT(MUTEX_HELD(&gp->gp_mutex));
3770
3771         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3772             !(IS_INTERRUPTED(request))) {
3773                 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3774                         flk_set_state(request, FLK_INTERRUPTED_STATE);
3775                         request->l_state |= INTERRUPTED_LOCK;
3776                 }
3777         }
3778 }
3779
3780 /*
3781  * Create an flock structure from the existing lock information
3782  *
3783  * This routine is used to create flock structures for the lock manager
3784  * to use in a reclaim request.  Since the lock was originated on this
3785  * host, it must be conforming to UNIX semantics, so no checking is
3786  * done to make sure it falls within the lower half of the 32-bit range.
3787  */
3788
3789 static void
3790 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3791 {
3792         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3793         ASSERT(lp->l_end >= lp->l_start);
3794
3795         flp->l_type = lp->l_type;
3796         flp->l_whence = 0;
3797         flp->l_start = lp->l_start;
3798         flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3799             (lp->l_end - lp->l_start + 1);
3800         flp->l_sysid = lp->l_flock.l_sysid;
3801         flp->l_pid = lp->l_flock.l_pid;
3802 }
3803
3804 /*
3805  * Convert flock_t data describing a lock range into unsigned long starting
3806  * and ending points, which are put into lock_request.  Returns 0 or an
3807  * errno value.
3808  * Large Files: max is passed by the caller and we return EOVERFLOW
3809  * as defined by LFS API.
3810  */
3811
3812 int
3813 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3814     u_offset_t *start, u_offset_t *end, offset_t offset)
3815 {
3816         struct vattr    vattr;
3817         int     error;
3818
3819         /*
3820          * Determine the starting point of the request
3821          */
3822         switch (flp->l_whence) {
3823         case 0:         /* SEEK_SET */
3824                 *start = (u_offset_t)flp->l_start;
3825                 break;
3826         case 1:         /* SEEK_CUR */
3827                 *start = (u_offset_t)(flp->l_start + offset);
3828                 break;
3829         case 2:         /* SEEK_END */
3830                 vattr.va_mask = AT_SIZE;
3831                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3832                         return (error);
3833                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3834                 break;
3835         default:
3836                 return (EINVAL);
3837         }
3838
3839         /*
3840          * Determine the range covered by the request.
3841          */
3842         if (flp->l_len == 0)
3843                 *end = MAX_U_OFFSET_T;
3844         else if ((offset_t)flp->l_len > 0) {
3845                 *end = (u_offset_t)(*start + (flp->l_len - 1));
3846         } else {
3847                 /*
3848                  * Negative length; why do we even allow this ?
3849                  * Because this allows easy specification of
3850                  * the last n bytes of the file.
3851                  */
3852                 *end = *start;
3853                 *start += (u_offset_t)flp->l_len;
3854                 (*start)++;
3855         }
3856         return (0);
3857 }
3858
3859 /*
3860  * Check the validity of lock data.  This can used by the NFS
3861  * frlock routines to check data before contacting the server.  The
3862  * server must support semantics that aren't as restrictive as
3863  * the UNIX API, so the NFS client is required to check.
3864  * The maximum is now passed in by the caller.
3865  */
3866
3867 int
3868 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3869 {
3870         /*
3871          * The end (length) for local locking should never be greater
3872          * than MAXEND. However, the representation for
3873          * the entire file is MAX_U_OFFSET_T.
3874          */
3875         if ((start > max) ||
3876             ((end > max) && (end != MAX_U_OFFSET_T))) {
3877                 return (EINVAL);
3878         }
3879         if (start > end) {
3880                 return (EINVAL);
3881         }
3882         return (0);
3883 }
3884
3885 /*
3886  * Fill in request->l_flock with information about the lock blocking the
3887  * request.  The complexity here is that lock manager requests are allowed
3888  * to see into the upper part of the 32-bit address range, whereas local
3889  * requests are only allowed to see signed values.
3890  *
3891  * What should be done when "blocker" is a lock manager lock that uses the
3892  * upper portion of the 32-bit range, but "request" is local?  Since the
3893  * request has already been determined to have been blocked by the blocker,
3894  * at least some portion of "blocker" must be in the range of the request,
3895  * or the request extends to the end of file.  For the first case, the
3896  * portion in the lower range is returned with the indication that it goes
3897  * "to EOF."  For the second case, the last byte of the lower range is
3898  * returned with the indication that it goes "to EOF."
3899  */
3900
3901 static void
3902 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3903 {
3904         flock64_t *flrp;                        /* l_flock portion of request */
3905
3906         ASSERT(blocker != NULL);
3907
3908         flrp = &request->l_flock;
3909         flrp->l_whence = 0;
3910         flrp->l_type = blocker->l_type;
3911         flrp->l_pid = blocker->l_flock.l_pid;
3912         flrp->l_sysid = blocker->l_flock.l_sysid;
3913
3914         if (IS_LOCKMGR(request)) {
3915                 flrp->l_start = blocker->l_start;
3916                 if (blocker->l_end == MAX_U_OFFSET_T)
3917                         flrp->l_len = 0;
3918                 else
3919                         flrp->l_len = blocker->l_end - blocker->l_start + 1;
3920         } else {
3921                 if (blocker->l_start > MAXEND) {
3922                         flrp->l_start = MAXEND;
3923                         flrp->l_len = 0;
3924                 } else {
3925                         flrp->l_start = blocker->l_start;
3926                         if (blocker->l_end == MAX_U_OFFSET_T)
3927                                 flrp->l_len = 0;
3928                         else
3929                                 flrp->l_len = blocker->l_end -
3930                                     blocker->l_start + 1;
3931                 }
3932         }
3933 }
3934
3935 /*
3936  * PSARC case 1997/292
3937  */
3938 /*
3939  * This is the public routine exported by flock.h.
3940  */
3941 void
3942 cl_flk_change_nlm_state_to_unknown(int nlmid)
3943 {
3944         /*
3945          * Check to see if node is booted as a cluster. If not, return.
3946          */
3947         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3948                 return;
3949         }
3950
3951         /*
3952          * See comment in cl_flk_set_nlm_status().
3953          */
3954         if (nlm_reg_status == NULL) {
3955                 return;
3956         }
3957
3958         /*
3959          * protect NLM registry state with a mutex.
3960          */
3961         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3962         mutex_enter(&nlm_reg_lock);
3963         FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3964         mutex_exit(&nlm_reg_lock);
3965 }
3966
3967 /*
3968  * Return non-zero if the given I/O request conflicts with an active NBMAND
3969  * lock.
3970  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3971  * locks.
3972  */
3973
3974 int
3975 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3976                 ssize_t length, int svmand, caller_context_t *ct)
3977 {
3978         int conflict = 0;
3979         graph_t                 *gp;
3980         lock_descriptor_t       *lock;
3981         pid_t pid;
3982         int sysid;
3983
3984         if (ct == NULL) {
3985                 pid = curproc->p_pid;
3986                 sysid = 0;
3987         } else {
3988                 pid = ct->cc_pid;
3989                 sysid = ct->cc_sysid;
3990         }
3991
3992         mutex_enter(&flock_lock);
3993         gp = lock_graph[HASH_INDEX(vp)];
3994         mutex_exit(&flock_lock);
3995         if (gp == NULL)
3996                 return (0);
3997
3998         mutex_enter(&gp->gp_mutex);
3999         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4000
4001         for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4002                 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4003                     (lock->l_flock.l_sysid != sysid ||
4004                     lock->l_flock.l_pid != pid) &&
4005                     lock_blocks_io(op, offset, length,
4006                     lock->l_type, lock->l_start, lock->l_end)) {
4007                         conflict = 1;
4008                         break;
4009                 }
4010         }
4011         mutex_exit(&gp->gp_mutex);
4012
4013         return (conflict);
4014 }
4015
4016 /*
4017  * Return non-zero if the given I/O request conflicts with the given lock.
4018  */
4019
4020 static int
4021 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4022             int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4023 {
4024         ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4025         ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4026
4027         if (op == NBL_READ && lock_type == F_RDLCK)
4028                 return (0);
4029
4030         if (offset <= lock_start && lock_start < offset + length)
4031                 return (1);
4032         if (lock_start <= offset && offset <= lock_end)
4033                 return (1);
4034
4035         return (0);
4036 }
4037
4038 #ifdef DEBUG
4039 static void
4040 check_active_locks(graph_t *gp)
4041 {
4042         lock_descriptor_t *lock, *lock1;
4043         edge_t  *ep;
4044
4045         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4046             lock = lock->l_next) {
4047                 ASSERT(IS_ACTIVE(lock));
4048                 ASSERT(NOT_BLOCKED(lock));
4049                 ASSERT(!IS_BARRIER(lock));
4050
4051                 ep = FIRST_IN(lock);
4052
4053                 while (ep != HEAD(lock)) {
4054                         ASSERT(IS_SLEEPING(ep->from_vertex));
4055                         ASSERT(!NOT_BLOCKED(ep->from_vertex));
4056                         ep = NEXT_IN(ep);
4057                 }
4058
4059                 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4060                     lock1 = lock1->l_next) {
4061                         if (lock1->l_vnode == lock->l_vnode) {
4062                         if (BLOCKS(lock1, lock)) {
4063                                 cmn_err(CE_PANIC,
4064                                     "active lock %p blocks %p",
4065                                     (void *)lock1, (void *)lock);
4066                         } else if (BLOCKS(lock, lock1)) {
4067                                 cmn_err(CE_PANIC,
4068                                     "active lock %p blocks %p",
4069                                     (void *)lock, (void *)lock1);
4070                         }
4071                         }
4072                 }
4073         }
4074 }
4075
4076 /*
4077  * Effect: This functions checks to see if the transition from 'old_state' to
4078  *      'new_state' is a valid one.  It returns 0 if the transition is valid
4079  *      and 1 if it is not.
4080  *      For a map of valid transitions, see sys/flock_impl.h
4081  */
4082 static int
4083 check_lock_transition(int old_state, int new_state)
4084 {
4085         switch (old_state) {
4086         case FLK_INITIAL_STATE:
4087                 if ((new_state == FLK_START_STATE) ||
4088                     (new_state == FLK_SLEEPING_STATE) ||
4089                     (new_state == FLK_ACTIVE_STATE) ||
4090                     (new_state == FLK_DEAD_STATE)) {
4091                         return (0);
4092                 } else {
4093                         return (1);
4094                 }
4095         case FLK_START_STATE:
4096                 if ((new_state == FLK_ACTIVE_STATE) ||
4097                     (new_state == FLK_DEAD_STATE)) {
4098                         return (0);
4099                 } else {
4100                         return (1);
4101                 }
4102         case FLK_ACTIVE_STATE:
4103                 if (new_state == FLK_DEAD_STATE) {
4104                         return (0);
4105                 } else {
4106                         return (1);
4107                 }
4108         case FLK_SLEEPING_STATE:
4109                 if ((new_state == FLK_GRANTED_STATE) ||
4110                     (new_state == FLK_INTERRUPTED_STATE) ||
4111                     (new_state == FLK_CANCELLED_STATE)) {
4112                         return (0);
4113                 } else {
4114                         return (1);
4115                 }
4116         case FLK_GRANTED_STATE:
4117                 if ((new_state == FLK_START_STATE) ||
4118                     (new_state == FLK_INTERRUPTED_STATE) ||
4119                     (new_state == FLK_CANCELLED_STATE)) {
4120                         return (0);
4121                 } else {
4122                         return (1);
4123                 }
4124         case FLK_CANCELLED_STATE:
4125                 if ((new_state == FLK_INTERRUPTED_STATE) ||
4126                     (new_state == FLK_DEAD_STATE)) {
4127                         return (0);
4128                 } else {
4129                         return (1);
4130                 }
4131         case FLK_INTERRUPTED_STATE:
4132                 if (new_state == FLK_DEAD_STATE) {
4133                         return (0);
4134                 } else {
4135                         return (1);
4136                 }
4137         case FLK_DEAD_STATE:
4138                 /* May be set more than once */
4139                 if (new_state == FLK_DEAD_STATE) {
4140                         return (0);
4141                 } else {
4142                         return (1);
4143                 }
4144         default:
4145                 return (1);
4146         }
4147 }
4148
4149 static void
4150 check_sleeping_locks(graph_t *gp)
4151 {
4152         lock_descriptor_t *lock1, *lock2;
4153         edge_t *ep;
4154         for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4155             lock1 = lock1->l_next) {
4156                                 ASSERT(!IS_BARRIER(lock1));
4157         for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4158             lock2 = lock2->l_next) {
4159                 if (lock1->l_vnode == lock2->l_vnode) {
4160                         if (BLOCKS(lock2, lock1)) {
4161                                 ASSERT(!IS_GRANTED(lock1));
4162                                 ASSERT(!NOT_BLOCKED(lock1));
4163                                 path(lock1, lock2);
4164                         }
4165                 }
4166         }
4167
4168         for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4169             lock2 = lock2->l_next) {
4170                                 ASSERT(!IS_BARRIER(lock1));
4171                 if (lock1->l_vnode == lock2->l_vnode) {
4172                         if (BLOCKS(lock2, lock1)) {
4173                                 ASSERT(!IS_GRANTED(lock1));
4174                                 ASSERT(!NOT_BLOCKED(lock1));
4175                                 path(lock1, lock2);
4176                         }
4177                 }
4178         }
4179         ep = FIRST_ADJ(lock1);
4180         while (ep != HEAD(lock1)) {
4181                 ASSERT(BLOCKS(ep->to_vertex, lock1));
4182                 ep = NEXT_ADJ(ep);
4183         }
4184         }
4185 }
4186
4187 static int
4188 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4189 {
4190         edge_t  *ep;
4191         lock_descriptor_t       *vertex;
4192         lock_descriptor_t *vertex_stack;
4193
4194         STACK_INIT(vertex_stack);
4195
4196         flk_graph_uncolor(lock1->l_graph);
4197         ep = FIRST_ADJ(lock1);
4198         ASSERT(ep != HEAD(lock1));
4199         while (ep != HEAD(lock1)) {
4200                 if (no_path)
4201                         ASSERT(ep->to_vertex != lock2);
4202                 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4203                 COLOR(ep->to_vertex);
4204                 ep = NEXT_ADJ(ep);
4205         }
4206
4207         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4208                 STACK_POP(vertex_stack, l_dstack);
4209                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4210                     ep = NEXT_ADJ(ep)) {
4211                         if (COLORED(ep->to_vertex))
4212                                 continue;
4213                         COLOR(ep->to_vertex);
4214                         if (ep->to_vertex == lock2)
4215                                 return (1);
4216
4217                         STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4218                 }
4219         }
4220         return (0);
4221 }
4222
4223 static void
4224 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4225 {
4226         lock_descriptor_t *lock;
4227
4228         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4229
4230         if (lock) {
4231                 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4232                         if (lock->l_flock.l_pid == pid &&
4233                             lock->l_flock.l_sysid == sysid)
4234                                 cmn_err(CE_PANIC,
4235                                     "owner pid %d's lock %p in active queue",
4236                                     pid, (void *)lock);
4237                         lock = lock->l_next;
4238                 }
4239         }
4240         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4241
4242         if (lock) {
4243                 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4244                         if (lock->l_flock.l_pid == pid &&
4245                             lock->l_flock.l_sysid == sysid)
4246                                 cmn_err(CE_PANIC,
4247                                     "owner pid %d's lock %p in sleep queue",
4248                                     pid, (void *)lock);
4249                         lock = lock->l_next;
4250                 }
4251         }
4252 }
4253
4254 static int
4255 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4256 {
4257         edge_t *ep = FIRST_ADJ(lock1);
4258
4259         while (ep != HEAD(lock1)) {
4260                 if (ep->to_vertex == lock2)
4261                         return (1);
4262                 else
4263                         ep = NEXT_ADJ(ep);
4264         }
4265         return (0);
4266 }
4267
4268 static int
4269 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4270 {
4271         return (!level_two_path(lock1, lock2, 1));
4272 }
4273
4274 static void
4275 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4276 {
4277         if (level_one_path(lock1, lock2)) {
4278                 if (level_two_path(lock1, lock2, 0) != 0) {
4279                         cmn_err(CE_WARN,
4280                             "one edge one path from lock1 %p lock2 %p",
4281                             (void *)lock1, (void *)lock2);
4282                 }
4283         } else if (no_path(lock1, lock2)) {
4284                 cmn_err(CE_PANIC,
4285                     "No path from  lock1 %p to lock2 %p",
4286                     (void *)lock1, (void *)lock2);
4287         }
4288 }
4289 #endif /* DEBUG */