kernel/os/flock.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
31 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 * Copyright 2015 Joyent, Inc.
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/nbmlock.h>
40 #include <sys/cred.h>
41 #include <sys/policy.h>
44 * The following variables are for statistics purposes and they are
45 * not protected by locks. They may not be accurate but will at least be
46 * close to the actual value.
49 int flk_lock_allocs;
50 int flk_lock_frees;
51 int edge_allocs;
52 int edge_frees;
53 int flk_proc_vertex_allocs;
54 int flk_proc_edge_allocs;
55 int flk_proc_vertex_frees;
56 int flk_proc_edge_frees;
58 static kmutex_t flock_lock;
60 #ifdef DEBUG
61 int check_debug = 0;
62 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
63 check_active_locks(gp);
64 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
65 check_sleeping_locks(gp);
66 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
67 if (check_debug) \
68 check_owner_locks(gp, pid, sysid, vp);
69 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
70 { \
71 if (check_lock_transition(old_state, new_state)) { \
72 cmn_err(CE_PANIC, "Illegal lock transition \
73 from %d to %d", old_state, new_state); \
74 } \
76 #else
78 #define CHECK_ACTIVE_LOCKS(gp)
79 #define CHECK_SLEEPING_LOCKS(gp)
80 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
81 #define CHECK_LOCK_TRANSITION(old_state, new_state)
83 #endif /* DEBUG */
85 struct kmem_cache *flk_edge_cache;
87 graph_t *lock_graph[HASH_SIZE];
88 proc_graph_t pgraph;
91 * Clustering.
93 * NLM REGISTRY TYPE IMPLEMENTATION
95 * Assumptions:
96 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
97 * integers.
98 * 2. We use this node id to identify the node an NLM server runs on.
102 * NLM registry object keeps track of NLM servers via their
103 * nlmids (which are the node ids of the node in the cluster they run on)
104 * that have requested locks at this LLM with which this registry is
105 * associated.
107 * Representation of abstraction:
108 * rep = record[ states: array[nlm_state],
109 * lock: mutex]
111 * Representation invariants:
112 * 1. index i of rep.states is between 0 and n - 1 where n is the number
113 * of elements in the array, which happens to be the maximum number
114 * of nodes in the cluster configuration + 1.
115 * 2. map nlmid to index i of rep.states
116 * 0 -> 0
117 * 1 -> 1
118 * 2 -> 2
119 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
120 * from forgetting to subtract 1 from the index.
121 * 4. The reason we keep the 0th index is the following. A legitimate
122 * cluster configuration includes making a UFS file system NFS
123 * exportable. The code is structured so that if you're in a cluster
124 * you do one thing; otherwise, you do something else. The problem
125 * is what to do if you think you're in a cluster with PXFS loaded,
126 * but you're using UFS not PXFS? The upper two bytes of the sysid
127 * encode the node id of the node where the NLM server runs; these bytes
128 * are zero for UFS. Since the nodeid is used to index into the
129 * registry, we can record the NLM server state information at index
130 * 0 using the same mechanism used for PXFS file locks!
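 *
 * For example (illustrative values only): a sysid of 0x00020005 has its
 * upper two bytes set to 0x0002, so it came from the NLM server on cluster
 * node 2 and is recorded at index 2 of the registry, while a local UFS
 * sysid such as 0x00000005 has those bytes zero and maps to index 0.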
132 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
133 static kmutex_t nlm_reg_lock;			/* lock to protect array */
134 static uint_t nlm_status_size; /* size of state array */
137 * Although we need a global lock dependency graph (and associated data
138 * structures), we also need a per-zone notion of whether the lock manager is
139 * running, and so whether to allow lock manager requests or not.
141 * Thus, on a per-zone basis we maintain a ``global'' variable
142 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
143 * manager is determined to be changing state (starting or stopping).
145 * Each graph/zone pair also has a copy of this variable, which is protected by
146 * the graph's mutex.
148 * The per-graph copies are used to synchronize lock requests with shutdown
149 * requests. The global copy is used to initialize the per-graph field when a
150 * new graph is created.
152 struct flock_globals {
153 flk_lockmgr_status_t flk_lockmgr_status;
154 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 zone_key_t flock_zone_key;
159 static void create_flock(lock_descriptor_t *, flock64_t *);
160 static lock_descriptor_t *flk_get_lock(void);
161 static void flk_free_lock(lock_descriptor_t *lock);
162 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
163 static int flk_process_request(lock_descriptor_t *);
164 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
165 static edge_t *flk_get_edge(void);
166 static int flk_wait_execute_request(lock_descriptor_t *);
167 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
168 static void flk_insert_active_lock(lock_descriptor_t *);
169 static void flk_delete_active_lock(lock_descriptor_t *, int);
170 static void flk_insert_sleeping_lock(lock_descriptor_t *);
171 static void flk_graph_uncolor(graph_t *);
172 static void flk_wakeup(lock_descriptor_t *, int);
173 static void flk_free_edge(edge_t *);
174 static void flk_recompute_dependencies(lock_descriptor_t *,
175 lock_descriptor_t **, int, int);
176 static int flk_find_barriers(lock_descriptor_t *);
177 static void flk_update_barriers(lock_descriptor_t *);
178 static int flk_color_reachables(lock_descriptor_t *);
179 static int flk_canceled(lock_descriptor_t *);
180 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
181 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
182 static void wait_for_lock(lock_descriptor_t *);
183 static void unlock_lockmgr_granted(struct flock_globals *);
184 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
186 #ifdef DEBUG
187 static int check_lock_transition(int, int);
188 static void check_sleeping_locks(graph_t *);
189 static void check_active_locks(graph_t *);
190 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
191 static void path(lock_descriptor_t *, lock_descriptor_t *);
192 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
193 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
194 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
195 #endif
197 /* proc_graph function definitions */
198 static int flk_check_deadlock(lock_descriptor_t *);
199 static void flk_proc_graph_uncolor(void);
200 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
201 static proc_edge_t *flk_get_proc_edge(void);
202 static void flk_proc_release(proc_vertex_t *);
203 static void flk_free_proc_edge(proc_edge_t *);
204 static void flk_update_proc_graph(edge_t *, int);
206 /* Non-blocking mandatory locking */
207 static int lock_blocks_io(nbl_op_t, uoff_t, ssize_t, int, uoff_t,
208 uoff_t);
210 static struct flock_globals *
211 flk_get_globals(void)
214 * The KLM module had better be loaded if we're attempting to handle
215 * lockmgr requests.
217 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
218 return (zone_getspecific(flock_zone_key, curproc->p_zone));
221 static flk_lockmgr_status_t
222 flk_get_lockmgr_status(void)
224 struct flock_globals *fg;
226 ASSERT(MUTEX_HELD(&flock_lock));
228 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
230 * KLM module not loaded; lock manager definitely not running.
232 return (FLK_LOCKMGR_DOWN);
234 fg = flk_get_globals();
235 return (fg->flk_lockmgr_status);
239 * This implements Open File Description (not descriptor) style record locking.
240 * These locks can also be thought of as pid-less since they are not tied to a
241 * specific process, thus they're preserved across fork.
243 * Called directly from fcntl.
245 * See reclock() for the implementation of the traditional POSIX style record
246 * locking scheme (pid-ful). This function is derived from reclock() but
247 * simplified and modified to work for OFD style locking.
249 * The two primary advantages of OFD style of locking are:
250 * 1) It is per-file description, so closing a file descriptor that refers to a
251 * different file description for the same file will not drop the lock (i.e.
252 * two opens of the same file get different descriptions but a dup or fork
253 * will refer to the same description).
254 * 2) Locks are preserved across fork(2).
256 * Because these locks are per-description a lock ptr lives at the f_filocks
257 * member of the file_t and the lock_descriptor includes a file_t pointer
258 * to enable unique lock identification and management.
260 * Since these locks are pid-less we cannot do deadlock detection with the
261 * current process-oriented implementation. This is consistent with OFD locking
262 * behavior on other operating systems such as Linux. Since we don't do
263 * deadlock detection we never interact with the process graph that is
264 * maintained for deadlock detection on the traditional POSIX-style locks.
266 * Future Work:
268 * The current implementation does not support record locks. That is,
269 * currently the single lock must cover the entire file. This is validated in
270 * fcntl. To support record locks the f_filock pointer in the file_t needs to
271 * be changed to a list of pointers to the locks. That list needs to be
272 * managed independently of the lock list on the vnode itself and it needs to
273 * be maintained as record locks are created, split, coalesced and deleted.
275 * The current implementation does not support remote file systems (e.g.
276 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
277 * interact with the NLM is not clear since the NLM protocol/implementation
278 * appears to be oriented around locks associated with a process. A further
279 * problem is that a design is needed for what nlm_send_siglost() should do and
280 * where it will send SIGLOST. More recent versions of Linux apparently try to
281 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
282 * that work with the NLM. It is not clear that this provides the correct
283 * semantics in all cases.
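 *
 * Purely illustrative userland sketch (not part of this code path): one way
 * a caller would reach ofdlock() below via fcntl(2), taking a whole-file
 * OFD-style write lock and blocking until it is granted.  The zeroed
 * l_whence, l_start and l_len values match the whole-file constraint
 * verified at the top of ofdlock(); "fd" is a hypothetical open file
 * descriptor.
 *
 *	struct flock64 fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = 0;
 *	fl.l_start = 0;
 *	fl.l_len = 0;
 *	if (fcntl(fd, F_OFD_SETLKW, &fl) == -1)
 *		perror("F_OFD_SETLKW");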
286 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, uoff_t offset)
288 int cmd = 0;
289 vnode_t *vp;
290 lock_descriptor_t stack_lock_request;
291 lock_descriptor_t *lock_request;
292 int error = 0;
293 graph_t *gp;
294 int serialize = 0;
296 if (fcmd != F_OFD_GETLK)
297 cmd = SETFLCK;
299 if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
300 cmd |= SLPFLCK;
302	/* only whole-file locks are supported; see the block comment above */
303 VERIFY(lckdat->l_whence == 0);
304 VERIFY(lckdat->l_start == 0);
305 VERIFY(lckdat->l_len == 0);
307 vp = fp->f_vnode;
310 * For reclock fs_frlock() would normally have set these in a few
311 * places but for us it's cleaner to centralize it here. Note that
312 * IGN_PID is -1. We use 0 for our pid-less locks.
314 lckdat->l_pid = 0;
315 lckdat->l_sysid = 0;
318 * Check access permissions
320 if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
321 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
322 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
323 return (EBADF);
326 * for query and unlock we use the stack_lock_request
328 if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
329 lock_request = &stack_lock_request;
330 (void) bzero((caddr_t)lock_request,
331 sizeof (lock_descriptor_t));
334 * following is added to make the assertions in
335 * flk_execute_request() pass
337 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
338 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
339 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
340 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
341 lock_request->l_status = FLK_INITIAL_STATE;
342 } else {
343 lock_request = flk_get_lock();
344 fp->f_filock = (struct filock *)lock_request;
346 lock_request->l_state = 0;
347 lock_request->l_vnode = vp;
348 lock_request->l_zoneid = getzoneid();
349 lock_request->l_ofd = fp;
352 * Convert the request range into the canonical start and end
353 * values then check the validity of the lock range.
355 error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
356 &lock_request->l_end, offset);
357 if (error)
358 goto done;
360 error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
361 MAXEND);
362 if (error)
363 goto done;
365 ASSERT(lock_request->l_end >= lock_request->l_start);
367 lock_request->l_type = lckdat->l_type;
368 if (cmd & SLPFLCK)
369 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
371 if (!(cmd & SETFLCK)) {
372 if (lock_request->l_type == F_RDLCK ||
373 lock_request->l_type == F_WRLCK)
374 lock_request->l_state |= QUERY_LOCK;
376 lock_request->l_flock = (*lckdat);
379 * We are ready for processing the request
382 if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
383 nbl_need_check(vp)) {
384 nbl_start_crit(vp, RW_WRITER);
385 serialize = 1;
388 /* Get the lock graph for a particular vnode */
389 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
391 mutex_enter(&gp->gp_mutex);
393 lock_request->l_state |= REFERENCED_LOCK;
394 lock_request->l_graph = gp;
396 switch (lock_request->l_type) {
397 case F_RDLCK:
398 case F_WRLCK:
399 if (IS_QUERY_LOCK(lock_request)) {
400 flk_get_first_blocking_lock(lock_request);
401 if (lock_request->l_ofd != NULL)
402 lock_request->l_flock.l_pid = -1;
403 (*lckdat) = lock_request->l_flock;
404 } else {
405 /* process the request now */
406 error = flk_process_request(lock_request);
408 break;
410 case F_UNLCK:
411 /* unlock request will not block so execute it immediately */
412 error = flk_execute_request(lock_request);
413 break;
415 default:
416 error = EINVAL;
417 break;
420 if (lock_request == &stack_lock_request) {
421 flk_set_state(lock_request, FLK_DEAD_STATE);
422 } else {
423 lock_request->l_state &= ~REFERENCED_LOCK;
424 if ((error != 0) || IS_DELETED(lock_request)) {
425 flk_set_state(lock_request, FLK_DEAD_STATE);
426 flk_free_lock(lock_request);
430 mutex_exit(&gp->gp_mutex);
431 if (serialize)
432 nbl_end_crit(vp);
434 return (error);
436 done:
437 flk_set_state(lock_request, FLK_DEAD_STATE);
438 if (lock_request != &stack_lock_request)
439 flk_free_lock(lock_request);
440 return (error);
444 * Remove any lock on the vnode belonging to the given file_t.
445 * Called from closef on last close, file_t is locked.
447 * This is modeled on the cleanlocks() function but only removes the single
448 * lock associated with fp.
450 void
451 ofdcleanlock(file_t *fp)
453 lock_descriptor_t *fplock, *lock, *nlock;
454 vnode_t *vp;
455 graph_t *gp;
457 ASSERT(MUTEX_HELD(&fp->f_tlock));
459 if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
460 return;
462 fp->f_filock = NULL;
463 vp = fp->f_vnode;
465 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
467 if (gp == NULL)
468 return;
469 mutex_enter(&gp->gp_mutex);
471 CHECK_SLEEPING_LOCKS(gp);
472 CHECK_ACTIVE_LOCKS(gp);
474 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
476 if (lock) {
477 do {
478 nlock = lock->l_next;
479 if (fplock == lock) {
480 CANCEL_WAKEUP(lock);
481 break;
483 lock = nlock;
484 } while (lock->l_vnode == vp);
487 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
489 if (lock) {
490 do {
491 nlock = lock->l_next;
492 if (fplock == lock) {
493 flk_delete_active_lock(lock, 0);
494 flk_wakeup(lock, 1);
495 flk_free_lock(lock);
496 break;
498 lock = nlock;
499 } while (lock->l_vnode == vp);
502 CHECK_SLEEPING_LOCKS(gp);
503 CHECK_ACTIVE_LOCKS(gp);
504 mutex_exit(&gp->gp_mutex);
508 * Routine called from fs_frlock in fs/fs_subr.c
510 * This implements traditional POSIX style record locking. The two primary
511 * drawbacks to this style of locking are:
512 * 1) It is per-process, so any close of a file descriptor that refers to the
513 * file will drop the lock (e.g. lock /etc/passwd, call a library function
514 * which opens /etc/passwd to read the file; when the library closes its
515 * file descriptor the application loses its lock and does not know).
516 * 2) Locks are not preserved across fork(2).
518 * Because these locks are only associated with a PID, they are per-process.
519 * This is why any close will drop the lock and is also why, once the process
520 * forks, the lock is no longer related to the new process. These locks can
521 * be considered PID-ful.
523 * See ofdlock() for the implementation of a similar but improved locking
524 * scheme.
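 *
 * Purely illustrative userland sketch (not part of this code path): the
 * traditional process-owned lock that ends up here via fcntl(2) and
 * fs_frlock().  Note pitfall 1) above: closing any other file descriptor in
 * this process that refers to the same file silently drops the lock.  "fd"
 * is a hypothetical open file descriptor.
 *
 *	struct flock fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;
 *	if (fcntl(fd, F_SETLKW, &fl) == -1)
 *		perror("F_SETLKW");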
527 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, uoff_t offset,
528 flk_callback_t *flk_cbp)
530 lock_descriptor_t stack_lock_request;
531 lock_descriptor_t *lock_request;
532 int error = 0;
533 graph_t *gp;
534 int nlmid;
537 * Check access permissions
539 if ((cmd & SETFLCK) &&
540 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
541 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
542 return (EBADF);
545 * for query and unlock we use the stack_lock_request
548 if ((lckdat->l_type == F_UNLCK) ||
549 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
550 lock_request = &stack_lock_request;
551 (void) bzero((caddr_t)lock_request,
552 sizeof (lock_descriptor_t));
555 * following is added to make the assertions in
556 * flk_execute_request() pass
559 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
560 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
561 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
562 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
563 lock_request->l_status = FLK_INITIAL_STATE;
564 } else {
565 lock_request = flk_get_lock();
567 lock_request->l_state = 0;
568 lock_request->l_vnode = vp;
569 lock_request->l_zoneid = getzoneid();
572 * Convert the request range into the canonical start and end
573 * values. The NLM protocol supports locking over the entire
574 * 32-bit range, so there's no range checking for remote requests,
575 * but we still need to verify that local requests obey the rules.
577 if ((cmd & RCMDLCK) != 0) {
578 ASSERT(lckdat->l_whence == 0);
579 lock_request->l_start = lckdat->l_start;
580 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
581 lckdat->l_start + (lckdat->l_len - 1);
582 } else {
583 /* check the validity of the lock range */
584 error = flk_convert_lock_data(vp, lckdat,
585 &lock_request->l_start, &lock_request->l_end,
586 offset);
587 if (error) {
588 goto done;
590 error = flk_check_lock_data(lock_request->l_start,
591 lock_request->l_end, MAXEND);
592 if (error) {
593 goto done;
597 ASSERT(lock_request->l_end >= lock_request->l_start);
599 lock_request->l_type = lckdat->l_type;
600 if (cmd & INOFLCK)
601 lock_request->l_state |= IO_LOCK;
602 if (cmd & SLPFLCK)
603 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
604 if (cmd & RCMDLCK)
605 lock_request->l_state |= LOCKMGR_LOCK;
606 if (cmd & NBMLCK)
607 lock_request->l_state |= NBMAND_LOCK;
609 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
610 if (lock_request->l_type == F_RDLCK ||
611 lock_request->l_type == F_WRLCK)
612 lock_request->l_state |= QUERY_LOCK;
614 lock_request->l_flock = (*lckdat);
615 lock_request->l_callbacks = flk_cbp;
618 * We are ready for processing the request
620 if (IS_LOCKMGR(lock_request)) {
622 * If the lock request is an NLM server request ....
624 if (nlm_status_size == 0) { /* not booted as cluster */
625 mutex_enter(&flock_lock);
627 * Bail out if this is a lock manager request and the
628 * lock manager is not supposed to be running.
630 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
631 mutex_exit(&flock_lock);
632 error = ENOLCK;
633 goto done;
635 mutex_exit(&flock_lock);
636 } else { /* booted as a cluster */
637 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
638 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
640 mutex_enter(&nlm_reg_lock);
642 * If the NLM registry does not know about this
643 * NLM server making the request, add its nlmid
644 * to the registry.
646 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
647 nlmid)) {
648 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
649 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
650 nlmid)) {
652 * If the NLM server is already known (has made
653 * previous lock requests) and its state is
654 * not NLM_UP (meaning that the NLM server is
655 * shutting down), then bail out with an
656 * error to deny the lock request.
658 mutex_exit(&nlm_reg_lock);
659 error = ENOLCK;
660 goto done;
662 mutex_exit(&nlm_reg_lock);
666 /* Now get the lock graph for a particular vnode */
667 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
670 * We drop the rwlock here, otherwise this might end up causing a
671 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
674 if (IS_IO_LOCK(lock_request)) {
675 fop_rwunlock(vp,
676 (lock_request->l_type == F_RDLCK) ?
677 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
679 mutex_enter(&gp->gp_mutex);
681 lock_request->l_state |= REFERENCED_LOCK;
682 lock_request->l_graph = gp;
684 switch (lock_request->l_type) {
685 case F_RDLCK:
686 case F_WRLCK:
687 if (IS_QUERY_LOCK(lock_request)) {
688 flk_get_first_blocking_lock(lock_request);
689 if (lock_request->l_ofd != NULL)
690 lock_request->l_flock.l_pid = -1;
691 (*lckdat) = lock_request->l_flock;
692 break;
695 /* process the request now */
697 error = flk_process_request(lock_request);
698 break;
700 case F_UNLCK:
701 /* unlock request will not block so execute it immediately */
703 if (IS_LOCKMGR(lock_request) &&
704 flk_canceled(lock_request)) {
705 error = 0;
706 } else {
707 error = flk_execute_request(lock_request);
709 break;
711 case F_UNLKSYS:
713 * Recovery mechanism to release lock manager locks when
714 * an NFS client crashes and restarts. The NFS server will clear
715 * old locks and grant new locks.
718 if (lock_request->l_flock.l_sysid == 0) {
719 mutex_exit(&gp->gp_mutex);
720 return (EINVAL);
722 if (secpolicy_nfs(CRED()) != 0) {
723 mutex_exit(&gp->gp_mutex);
724 return (EPERM);
726 flk_delete_locks_by_sysid(lock_request);
727 lock_request->l_state &= ~REFERENCED_LOCK;
728 flk_set_state(lock_request, FLK_DEAD_STATE);
729 flk_free_lock(lock_request);
730 mutex_exit(&gp->gp_mutex);
731 return (0);
733 default:
734 error = EINVAL;
735 break;
739 * Now that we have seen the status of locks in the system for
740 * this vnode we acquire the rwlock if it is an IO_LOCK.
743 if (IS_IO_LOCK(lock_request)) {
744 (void) fop_rwlock(vp,
745 (lock_request->l_type == F_RDLCK) ?
746 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
747 if (!error) {
748 lckdat->l_type = F_UNLCK;
751 * This wakeup is needed because otherwise,
752 * if the IO_LOCK has slept, the dependents on this
753 * will not be woken up at all. (bugid # 1185482).
756 flk_wakeup(lock_request, 1);
757 flk_set_state(lock_request, FLK_DEAD_STATE);
758 flk_free_lock(lock_request);
761 * else, if an error occurred, either flk_process_request()
762 * has returned EDEADLK, in which case there will be no
763 * dependents for this lock, or EINTR from flk_wait_execute_
764 * request(), in which case flk_cancel_sleeping_lock()
765 * would have been done. The same is true with EBADF.
769 if (lock_request == &stack_lock_request) {
770 flk_set_state(lock_request, FLK_DEAD_STATE);
771 } else {
772 lock_request->l_state &= ~REFERENCED_LOCK;
773 if ((error != 0) || IS_DELETED(lock_request)) {
774 flk_set_state(lock_request, FLK_DEAD_STATE);
775 flk_free_lock(lock_request);
779 mutex_exit(&gp->gp_mutex);
780 return (error);
782 done:
783 flk_set_state(lock_request, FLK_DEAD_STATE);
784 if (lock_request != &stack_lock_request)
785 flk_free_lock(lock_request);
786 return (error);
790 * Invoke the callbacks in the given list. If before sleeping, invoke in
791 * list order. If after sleeping, invoke in reverse order.
793 * CPR (suspend/resume) support: if one of the callbacks returns a
794 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
795 * while it is sleeping. There should be at most one callb_cpr_t for the
796 * thread.
797 * XXX This is unnecessarily complicated. The CPR information should just
798 * get passed in directly through fop_frlock and reclock, rather than
799 * sneaking it in via a callback.
802 callb_cpr_t *
803 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
805 callb_cpr_t *cpr_callbackp = NULL;
806 callb_cpr_t *one_result;
807 flk_callback_t *cb;
809 if (cblist == NULL)
810 return (NULL);
812 if (when == FLK_BEFORE_SLEEP) {
813 cb = cblist;
814 do {
815 one_result = (*cb->cb_callback)(when, cb->cb_data);
816 if (one_result != NULL) {
817 ASSERT(cpr_callbackp == NULL);
818 cpr_callbackp = one_result;
820 cb = cb->cb_next;
821 } while (cb != cblist);
822 } else {
823 cb = cblist->cb_prev;
824 do {
825 one_result = (*cb->cb_callback)(when, cb->cb_data);
826 if (one_result != NULL) {
827 cpr_callbackp = one_result;
829 cb = cb->cb_prev;
830 } while (cb != cblist->cb_prev);
833 return (cpr_callbackp);
837 * Initialize a flk_callback_t to hold the given callback.
840 void
841 flk_init_callback(flk_callback_t *flk_cb,
842 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
844 flk_cb->cb_next = flk_cb;
845 flk_cb->cb_prev = flk_cb;
846 flk_cb->cb_callback = cb_fcn;
847 flk_cb->cb_data = cbdata;
851 * Initialize an flk_callback_t and then link it into the head of an
852 * existing list (which may be NULL).
855 void
856 flk_add_callback(flk_callback_t *newcb,
857 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
858 void *cbdata, flk_callback_t *cblist)
860 flk_init_callback(newcb, cb_fcn, cbdata);
862 if (cblist == NULL)
863 return;
865 newcb->cb_prev = cblist->cb_prev;
866 newcb->cb_next = cblist;
867 cblist->cb_prev->cb_next = newcb;
868 cblist->cb_prev = newcb;
872 * Remove the callback from a list.
875 void
876 flk_del_callback(flk_callback_t *flk_cb)
878 flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
879 flk_cb->cb_prev->cb_next = flk_cb->cb_next;
881 flk_cb->cb_prev = flk_cb;
882 flk_cb->cb_next = flk_cb;
886 * Initialize the flk_edge_cache data structure and create the
887 * nlm_reg_status array.
890 void
891 flk_init(void)
893 uint_t i;
895 flk_edge_cache = kmem_cache_create("flk_edges",
896 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
897 if (flk_edge_cache == NULL) {
898 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
901 * Create the NLM registry object.
904 nlm_status_size = 0;
908 * Zone constructor/destructor callbacks to be executed when a zone is
909 * created/destroyed.
911 /* ARGSUSED */
912 void *
913 flk_zone_init(zoneid_t zoneid)
915 struct flock_globals *fg;
916 uint_t i;
918 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
919 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
920 for (i = 0; i < HASH_SIZE; i++)
921 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
922 return (fg);
925 /* ARGSUSED */
926 void
927 flk_zone_fini(zoneid_t zoneid, void *data)
929 struct flock_globals *fg = data;
931 kmem_free(fg, sizeof (*fg));
935 * Get a lock_descriptor structure with initialization of edge lists.
938 static lock_descriptor_t *
939 flk_get_lock(void)
941 lock_descriptor_t *l;
943 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
945 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
946 l->l_edge.edge_in_next = &l->l_edge;
947 l->l_edge.edge_in_prev = &l->l_edge;
948 l->l_edge.edge_adj_next = &l->l_edge;
949 l->l_edge.edge_adj_prev = &l->l_edge;
950 l->pvertex = -1;
951 l->l_status = FLK_INITIAL_STATE;
952 flk_lock_allocs++;
953 return (l);
957 * Free a lock_descriptor structure. Just set the DELETED_LOCK flag
958 * when some thread still has a reference to it, as in reclock().
961 void
962 flk_free_lock(lock_descriptor_t *lock)
964 file_t *fp;
966 ASSERT(IS_DEAD(lock));
968 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
969 fp->f_filock = NULL;
971 if (IS_REFERENCED(lock)) {
972 lock->l_state |= DELETED_LOCK;
973 return;
975 flk_lock_frees++;
976 kmem_free((void *)lock, sizeof (lock_descriptor_t));
979 void
980 flk_set_state(lock_descriptor_t *lock, int new_state)
983 * Locks in the sleeping list may be woken up in a number of ways,
984 * and more than once. If a sleeping lock is signaled awake more
985 * than once, then it may or may not change state depending on its
986 * current state.
987 * Also note that NLM locks that are sleeping could be moved to an
988 * interrupted state more than once if the unlock request is
989 * retransmitted by the NLM client - the second time around, this is
990 * just a nop.
991 * The ordering of being signaled awake is:
992 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
993 * The checks below implement this ordering.
995 if (IS_INTERRUPTED(lock)) {
996 if ((new_state == FLK_CANCELLED_STATE) ||
997 (new_state == FLK_GRANTED_STATE) ||
998 (new_state == FLK_INTERRUPTED_STATE)) {
999 return;
1002 if (IS_CANCELLED(lock)) {
1003 if ((new_state == FLK_GRANTED_STATE) ||
1004 (new_state == FLK_CANCELLED_STATE)) {
1005 return;
1008 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1009 lock->l_status = new_state;
1013 * Routine that checks whether there are any blocking locks in the system.
1015 * The policy followed is if a write lock is sleeping we don't allow read
1016 * locks before this write lock even though there may not be any active
1017 * locks corresponding to the read locks' region.
1019 * flk_add_edge() function adds an edge between l1 and l2 iff there
1020 * is no path between l1 and l2. This is done to have a "minimum
1021 * storage representation" of the dependency graph.
1023 * Another property of the graph is that, since only the new request adds
1024 * edges to the existing locks in the graph, the graph is always topologically
1025 * ordered.
1028 static int
1029 flk_process_request(lock_descriptor_t *request)
1031 graph_t *gp = request->l_graph;
1032 lock_descriptor_t *lock;
1033 int request_blocked_by_active = 0;
1034 int request_blocked_by_granted = 0;
1035 int request_blocked_by_sleeping = 0;
1036 vnode_t *vp = request->l_vnode;
1037 int error = 0;
1038 int request_will_wait = 0;
1039 int found_covering_lock = 0;
1040 lock_descriptor_t *covered_by = NULL;
1042 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1043 request_will_wait = IS_WILLING_TO_SLEEP(request);
1046 * check active locks
1049 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1052 if (lock) {
1053 do {
1054 if (BLOCKS(lock, request)) {
1055 if (!request_will_wait)
1056 return (EAGAIN);
1057 request_blocked_by_active = 1;
1058 break;
1061 * Grant lock if it is for the same owner holding active
1062 * lock that covers the request.
1065 if (SAME_OWNER(lock, request) &&
1066 COVERS(lock, request) &&
1067 (request->l_type == F_RDLCK))
1068 return (flk_execute_request(request));
1069 lock = lock->l_next;
1070 } while (lock->l_vnode == vp);
1073 if (!request_blocked_by_active) {
1074 lock_descriptor_t *lk[1];
1075 lock_descriptor_t *first_glock = NULL;
1077 * Shall we grant this?! NO!!
1078 * What about those locks that were just granted and still
1079 * in sleep queue. Those threads are woken up and so locks
1080 * are almost active.
1082 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1083 if (lock) {
1084 do {
1085 if (BLOCKS(lock, request)) {
1086 if (IS_GRANTED(lock)) {
1087 request_blocked_by_granted = 1;
1088 } else {
1089 request_blocked_by_sleeping = 1;
1093 lock = lock->l_next;
1094 } while ((lock->l_vnode == vp));
1095 first_glock = lock->l_prev;
1096 ASSERT(first_glock->l_vnode == vp);
1099 if (request_blocked_by_granted)
1100 goto block;
1102 if (!request_blocked_by_sleeping) {
1104 * If the request isn't going to be blocked by a
1105 * sleeping request, we know that it isn't going to
1106 * be blocked; we can just execute the request --
1107 * without performing costly deadlock detection.
1109 ASSERT(!request_blocked_by_active);
1110 return (flk_execute_request(request));
1111 } else if (request->l_type == F_RDLCK) {
1113 * If we have a sleeping writer in the requested
1114 * lock's range, block.
1116 goto block;
1119 lk[0] = request;
1120 request->l_state |= RECOMPUTE_LOCK;
1121 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1122 if (lock) {
1123 do {
1124 flk_recompute_dependencies(lock, lk, 1, 0);
1125 lock = lock->l_next;
1126 } while (lock->l_vnode == vp);
1128 lock = first_glock;
1129 if (lock) {
1130 do {
1131 if (IS_GRANTED(lock)) {
1132 flk_recompute_dependencies(lock, lk, 1, 0);
1134 lock = lock->l_prev;
1135 } while ((lock->l_vnode == vp));
1137 request->l_state &= ~RECOMPUTE_LOCK;
1138 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1139 return (EDEADLK);
1140 return (flk_execute_request(request));
1143 block:
1144 if (request_will_wait)
1145 flk_graph_uncolor(gp);
1147 /* check sleeping locks */
1149 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1152 * If we find a sleeping write lock that is a superset of the
1153 * region wanted by request we can be assured that by adding an
1154 * edge to this write lock we have paths to all locks in the
1155 * graph that blocks the request except in one case and that is why
1156 * another check for SAME_OWNER in the loop below. The exception
1157 * case is when this process that owns the sleeping write lock 'l1'
1158 * has other locks l2, l3, l4 that are in the system and arrived
1159 * before l1. l1 does not have path to these locks as they are from
1160 * same process. We break when we find a second covering sleeping
1161 * lock l5 owned by a process different from that owning l1, because
1162 * none of l2, l3, l4, etc. can have arrived before l5; if one had,
1163 * l1 would have produced a deadlock already.
1166 if (lock) {
1167 do {
1168 if (BLOCKS(lock, request)) {
1169 if (!request_will_wait)
1170 return (EAGAIN);
1171 if (COVERS(lock, request) &&
1172 lock->l_type == F_WRLCK) {
1173 if (found_covering_lock &&
1174 !SAME_OWNER(lock, covered_by)) {
1175 found_covering_lock++;
1176 break;
1178 found_covering_lock = 1;
1179 covered_by = lock;
1181 if (found_covering_lock &&
1182 !SAME_OWNER(lock, covered_by)) {
1183 lock = lock->l_next;
1184 continue;
1186 if ((error = flk_add_edge(request, lock,
1187 !found_covering_lock, 0)))
1188 return (error);
1190 lock = lock->l_next;
1191 } while (lock->l_vnode == vp);
1195 * found_covering_lock == 2 iff at this point 'request' has paths
1196 * to all locks that block 'request'. found_covering_lock == 1 iff at this
1197 * point 'request' has paths to all locks that block 'request' whose owners
1198 * are not the same as the one that covers 'request' (covered_by above) and
1199 * we can have locks whose owner is the same as covered_by in the active list.
1202 if (request_blocked_by_active && found_covering_lock != 2) {
1203 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1204 ASSERT(lock != NULL);
1205 do {
1206 if (BLOCKS(lock, request)) {
1207 if (found_covering_lock &&
1208 !SAME_OWNER(lock, covered_by)) {
1209 lock = lock->l_next;
1210 continue;
1212 if ((error = flk_add_edge(request, lock,
1213 CHECK_CYCLE, 0)))
1214 return (error);
1216 lock = lock->l_next;
1217 } while (lock->l_vnode == vp);
1220 if (NOT_BLOCKED(request)) {
1222 * request not dependent on any other locks
1223 * so execute this request
1225 return (flk_execute_request(request));
1226 } else {
1228 * check for deadlock
1230 if (flk_check_deadlock(request))
1231 return (EDEADLK);
1233 * this thread has to sleep
1235 return (flk_wait_execute_request(request));
1240 * The actual execution of the request in the simple case is only to
1241 * insert the 'request' in the list of active locks if it is not an
1242 * UNLOCK.
1243 * We have to consider the existing active locks' relation to
1244 * this 'request' if they are owned by the same process. flk_relation() does
1245 * this job and sees to it that the dependency graph information is maintained
1246 * properly.
1250 flk_execute_request(lock_descriptor_t *request)
1252 graph_t *gp = request->l_graph;
1253 vnode_t *vp = request->l_vnode;
1254 lock_descriptor_t *lock, *lock1;
1255 int done_searching = 0;
1257 CHECK_SLEEPING_LOCKS(gp);
1258 CHECK_ACTIVE_LOCKS(gp);
1260 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1262 flk_set_state(request, FLK_START_STATE);
1264 ASSERT(NOT_BLOCKED(request));
1266 /* IO_LOCK requests are only to check status */
1268 if (IS_IO_LOCK(request))
1269 return (0);
1271 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1273 if (lock == NULL && request->l_type == F_UNLCK)
1274 return (0);
1275 if (lock == NULL) {
1276 flk_insert_active_lock(request);
1277 return (0);
1280 do {
1281 lock1 = lock->l_next;
1282 if (SAME_OWNER(request, lock)) {
1283 done_searching = flk_relation(lock, request);
1285 lock = lock1;
1286 } while (lock->l_vnode == vp && !done_searching);
1289 * insert in active queue
1292 if (request->l_type != F_UNLCK)
1293 flk_insert_active_lock(request);
1295 return (0);
1299 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1301 static int
1302 flk_wait_execute_request(lock_descriptor_t *request)
1304 graph_t *gp = request->l_graph;
1305 callb_cpr_t *cprp; /* CPR info from callback */
1306 struct flock_globals *fg;
1307 int index;
1309 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1310 ASSERT(IS_WILLING_TO_SLEEP(request));
1312 flk_insert_sleeping_lock(request);
1314 if (IS_LOCKMGR(request)) {
1315 index = HASH_INDEX(request->l_vnode);
1316 fg = flk_get_globals();
1318 if (nlm_status_size == 0) { /* not booted as a cluster */
1319 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1320 flk_cancel_sleeping_lock(request, 1);
1321 return (ENOLCK);
1323 } else { /* booted as a cluster */
1325 * If the request is an NLM server lock request,
1326 * and the NLM state of the lock request is not
1327 * NLM_UP (because the NLM server is shutting
1328 * down), then cancel the sleeping lock and
1329 * return error ENOLCK that will encourage the
1330 * client to retransmit.
1332 if (!IS_NLM_UP(request)) {
1333 flk_cancel_sleeping_lock(request, 1);
1334 return (ENOLCK);
1339 if (request->l_callbacks != NULL) {
1341 * To make sure the shutdown code works correctly, either
1342 * the callback must happen after putting the lock on the
1343 * sleep list, or we must check the shutdown status after
1344 * returning from the callback (and before sleeping). At
1345 * least for now, we'll use the first option. If a
1346 * shutdown or signal or whatever happened while the graph
1347 * mutex was dropped, that will be detected by
1348 * wait_for_lock().
1350 mutex_exit(&gp->gp_mutex);
1352 cprp = flk_invoke_callbacks(request->l_callbacks,
1353 FLK_BEFORE_SLEEP);
1355 mutex_enter(&gp->gp_mutex);
1357 if (cprp == NULL) {
1358 wait_for_lock(request);
1359 } else {
1360 mutex_enter(cprp->cc_lockp);
1361 CALLB_CPR_SAFE_BEGIN(cprp);
1362 mutex_exit(cprp->cc_lockp);
1363 wait_for_lock(request);
1364 mutex_enter(cprp->cc_lockp);
1365 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1366 mutex_exit(cprp->cc_lockp);
1369 mutex_exit(&gp->gp_mutex);
1370 (void) flk_invoke_callbacks(request->l_callbacks,
1371 FLK_AFTER_SLEEP);
1372 mutex_enter(&gp->gp_mutex);
1373 } else {
1374 wait_for_lock(request);
1377 if (IS_LOCKMGR(request)) {
1379 * If the lock manager is shutting down, return an
1380 * error that will encourage the client to retransmit.
1382 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1383 !IS_GRANTED(request)) {
1384 flk_cancel_sleeping_lock(request, 1);
1385 return (ENOLCK);
1389 if (IS_INTERRUPTED(request)) {
1390 /* we got a signal, or act like we did */
1391 flk_cancel_sleeping_lock(request, 1);
1392 return (EINTR);
1395 /* Cancelled if some other thread has closed the file */
1397 if (IS_CANCELLED(request)) {
1398 flk_cancel_sleeping_lock(request, 1);
1399 return (EBADF);
1402 request->l_state &= ~GRANTED_LOCK;
1403 REMOVE_SLEEP_QUEUE(request);
1404 return (flk_execute_request(request));
1408 * This routine adds an edge between from and to because from depends
1409 * on to. If asked to check for deadlock it checks whether there are any
1410 * locks reachable from "from_lock" that are owned by the same process
1411 * as "from_lock".
1412 * NOTE: It is the caller's responsibility to make sure that the color
1413 * of the graph is consistent between the calls to flk_add_edge as done
1414 * in flk_process_request. This routine does not color and check for
1415 * deadlock explicitly.
1418 static int
1419 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1420 int check_cycle, int update_graph)
1422 edge_t *edge;
1423 edge_t *ep;
1424 lock_descriptor_t *vertex;
1425 lock_descriptor_t *vertex_stack;
1427 STACK_INIT(vertex_stack);
1430 * if the to vertex already has the mark color, just return;
1431 * don't add an edge, as it is already reachable from the from
1432 * vertex.
1435 if (COLORED(to_lock))
1436 return (0);
1438 edge = flk_get_edge();
1441 * set the from and to vertex
1444 edge->from_vertex = from_lock;
1445 edge->to_vertex = to_lock;
1448 * put in adjacency list of from vertex
1451 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1452 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1453 edge->edge_adj_prev = &from_lock->l_edge;
1454 from_lock->l_edge.edge_adj_next = edge;
1457 * put in list of to vertex
1460 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1461 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1462 to_lock->l_edge.edge_in_next = edge;
1463 edge->edge_in_prev = &to_lock->l_edge;
1466 if (update_graph) {
1467 flk_update_proc_graph(edge, 0);
1468 return (0);
1470 if (!check_cycle) {
1471 return (0);
1474 STACK_PUSH(vertex_stack, from_lock, l_stack);
1476 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1478 STACK_POP(vertex_stack, l_stack);
1480 for (ep = FIRST_ADJ(vertex);
1481 ep != HEAD(vertex);
1482 ep = NEXT_ADJ(ep)) {
1483 if (COLORED(ep->to_vertex))
1484 continue;
1485 COLOR(ep->to_vertex);
1486 if (SAME_OWNER(ep->to_vertex, from_lock))
1487 goto dead_lock;
1488 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1491 return (0);
1493 dead_lock:
1496 * remove all edges
1499 ep = FIRST_ADJ(from_lock);
1501 while (ep != HEAD(from_lock)) {
1502 IN_LIST_REMOVE(ep);
1503 from_lock->l_sedge = NEXT_ADJ(ep);
1504 ADJ_LIST_REMOVE(ep);
1505 flk_free_edge(ep);
1506 ep = from_lock->l_sedge;
1508 return (EDEADLK);
1512 * Get an edge structure for representing the dependency between two locks.
1515 static edge_t *
1516 flk_get_edge()
1518 edge_t *ep;
1520 ASSERT(flk_edge_cache != NULL);
1522 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1523 edge_allocs++;
1524 return (ep);
1528 * Free the edge structure.
1531 static void
1532 flk_free_edge(edge_t *ep)
1534 edge_frees++;
1535 kmem_cache_free(flk_edge_cache, (void *)ep);
1539 * Check the relationship of request with lock and perform the
1540 * recomputation of dependencies, break lock if required, and return
1541 * 1 if request cannot have any more relationship with the next
1542 * active locks.
1543 * The 'lock' and 'request' are compared and in case of overlap we
1544 * delete the 'lock' and form new locks to represent the non-overlapped
1545 * portion of original 'lock'. This function has side effects such as
1546 * 'lock' will be freed, new locks will be added to the active list.
1549 static int
1550 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1552 int lock_effect;
1553 lock_descriptor_t *lock1, *lock2;
1554 lock_descriptor_t *topology[3];
1555 int nvertex = 0;
1556 int i;
1557 edge_t *ep;
1558 graph_t *gp = (lock->l_graph);
1561 CHECK_SLEEPING_LOCKS(gp);
1562 CHECK_ACTIVE_LOCKS(gp);
1564 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1566 topology[0] = topology[1] = topology[2] = NULL;
1568 if (request->l_type == F_UNLCK)
1569 lock_effect = FLK_UNLOCK;
1570 else if (request->l_type == F_RDLCK &&
1571 lock->l_type == F_WRLCK)
1572 lock_effect = FLK_DOWNGRADE;
1573 else if (request->l_type == F_WRLCK &&
1574 lock->l_type == F_RDLCK)
1575 lock_effect = FLK_UPGRADE;
1576 else
1577 lock_effect = FLK_STAY_SAME;
1579 if (lock->l_end < request->l_start) {
1580 if (lock->l_end == request->l_start - 1 &&
1581 lock_effect == FLK_STAY_SAME) {
1582 topology[0] = request;
1583 request->l_start = lock->l_start;
1584 nvertex = 1;
1585 goto recompute;
1586 } else {
1587 return (0);
1591 if (lock->l_start > request->l_end) {
1592 if (request->l_end == lock->l_start - 1 &&
1593 lock_effect == FLK_STAY_SAME) {
1594 topology[0] = request;
1595 request->l_end = lock->l_end;
1596 nvertex = 1;
1597 goto recompute;
1598 } else {
1599 return (1);
1603 if (request->l_end < lock->l_end) {
1604 if (request->l_start > lock->l_start) {
1605 if (lock_effect == FLK_STAY_SAME) {
1606 request->l_start = lock->l_start;
1607 request->l_end = lock->l_end;
1608 topology[0] = request;
1609 nvertex = 1;
1610 } else {
1611 lock1 = flk_get_lock();
1612 lock2 = flk_get_lock();
1613 COPY(lock1, lock);
1614 COPY(lock2, lock);
1615 lock1->l_start = lock->l_start;
1616 lock1->l_end = request->l_start - 1;
1617 lock2->l_start = request->l_end + 1;
1618 lock2->l_end = lock->l_end;
1619 topology[0] = lock1;
1620 topology[1] = lock2;
1621 topology[2] = request;
1622 nvertex = 3;
1624 } else if (request->l_start < lock->l_start) {
1625 if (lock_effect == FLK_STAY_SAME) {
1626 request->l_end = lock->l_end;
1627 topology[0] = request;
1628 nvertex = 1;
1629 } else {
1630 lock1 = flk_get_lock();
1631 COPY(lock1, lock);
1632 lock1->l_start = request->l_end + 1;
1633 topology[0] = lock1;
1634 topology[1] = request;
1635 nvertex = 2;
1637 } else {
1638 if (lock_effect == FLK_STAY_SAME) {
1639 request->l_start = lock->l_start;
1640 request->l_end = lock->l_end;
1641 topology[0] = request;
1642 nvertex = 1;
1643 } else {
1644 lock1 = flk_get_lock();
1645 COPY(lock1, lock);
1646 lock1->l_start = request->l_end + 1;
1647 topology[0] = lock1;
1648 topology[1] = request;
1649 nvertex = 2;
1652 } else if (request->l_end > lock->l_end) {
1653 if (request->l_start > lock->l_start) {
1654 if (lock_effect == FLK_STAY_SAME) {
1655 request->l_start = lock->l_start;
1656 topology[0] = request;
1657 nvertex = 1;
1658 } else {
1659 lock1 = flk_get_lock();
1660 COPY(lock1, lock);
1661 lock1->l_end = request->l_start - 1;
1662 topology[0] = lock1;
1663 topology[1] = request;
1664 nvertex = 2;
1666 } else if (request->l_start < lock->l_start) {
1667 topology[0] = request;
1668 nvertex = 1;
1669 } else {
1670 topology[0] = request;
1671 nvertex = 1;
1673 } else {
1674 if (request->l_start > lock->l_start) {
1675 if (lock_effect == FLK_STAY_SAME) {
1676 request->l_start = lock->l_start;
1677 topology[0] = request;
1678 nvertex = 1;
1679 } else {
1680 lock1 = flk_get_lock();
1681 COPY(lock1, lock);
1682 lock1->l_end = request->l_start - 1;
1683 topology[0] = lock1;
1684 topology[1] = request;
1685 nvertex = 2;
1687 } else if (request->l_start < lock->l_start) {
1688 topology[0] = request;
1689 nvertex = 1;
1690 } else {
1691 if (lock_effect != FLK_UNLOCK) {
1692 topology[0] = request;
1693 nvertex = 1;
1694 } else {
1695 flk_delete_active_lock(lock, 0);
1696 flk_wakeup(lock, 1);
1697 flk_free_lock(lock);
1698 CHECK_SLEEPING_LOCKS(gp);
1699 CHECK_ACTIVE_LOCKS(gp);
1700 return (1);
1705 recompute:
1708 * For unlock we don't send the 'request' for recomputing
1709 * dependencies because no lock will add an edge to it.
1712 if (lock_effect == FLK_UNLOCK) {
1713 topology[nvertex-1] = NULL;
1714 nvertex--;
1716 for (i = 0; i < nvertex; i++) {
1717 topology[i]->l_state |= RECOMPUTE_LOCK;
1718 topology[i]->l_color = NO_COLOR;
1721 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1724 * we remove, from each dependent vertex's adjacency list, the edge
1725 * that points to this vertex 'lock'.
1728 ep = FIRST_IN(lock);
1729 while (ep != HEAD(lock)) {
1730 ADJ_LIST_REMOVE(ep);
1731 ep = NEXT_IN(ep);
1734 flk_delete_active_lock(lock, 0);
1736 /* We are ready for recomputing the dependencies now */
1738 flk_recompute_dependencies(lock, topology, nvertex, 1);
1740 for (i = 0; i < nvertex; i++) {
1741 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1742 topology[i]->l_color = NO_COLOR;
1746 if (lock_effect == FLK_UNLOCK) {
1747 nvertex++;
1749 for (i = 0; i < nvertex - 1; i++) {
1750 flk_insert_active_lock(topology[i]);
1754 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1755 flk_wakeup(lock, 0);
1756 } else {
1757 ep = FIRST_IN(lock);
1758 while (ep != HEAD(lock)) {
1759 lock->l_sedge = NEXT_IN(ep);
1760 IN_LIST_REMOVE(ep);
1761 flk_update_proc_graph(ep, 1);
1762 flk_free_edge(ep);
1763 ep = lock->l_sedge;
1766 flk_free_lock(lock);
1768 CHECK_SLEEPING_LOCKS(gp);
1769 CHECK_ACTIVE_LOCKS(gp);
1770 return (0);
1774 * Insert a lock into the active queue.
1777 static void
1778 flk_insert_active_lock(lock_descriptor_t *new_lock)
1780 graph_t *gp = new_lock->l_graph;
1781 vnode_t *vp = new_lock->l_vnode;
1782 lock_descriptor_t *first_lock, *lock;
1784 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1786 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1787 first_lock = lock;
1789 if (first_lock != NULL) {
1790 for (; (lock->l_vnode == vp &&
1791 lock->l_start < new_lock->l_start); lock = lock->l_next)
1793 } else {
1794 lock = ACTIVE_HEAD(gp);
1797 lock->l_prev->l_next = new_lock;
1798 new_lock->l_next = lock;
1799 new_lock->l_prev = lock->l_prev;
1800 lock->l_prev = new_lock;
1802 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1803 vp->v_filocks = (struct filock *)new_lock;
1805 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1806 new_lock->l_state |= ACTIVE_LOCK;
1808 CHECK_ACTIVE_LOCKS(gp);
1809 CHECK_SLEEPING_LOCKS(gp);
1813 * Delete the active lock: performs one of two functions depending on the
1814 * value of the second parameter. One is to remove it from the active list
1815 * only; the other is to both remove and free the lock.
1818 static void
1819 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1821 vnode_t *vp = lock->l_vnode;
1822 graph_t *gp = lock->l_graph;
1824 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1825 if (free_lock)
1826 ASSERT(NO_DEPENDENTS(lock));
1827 ASSERT(NOT_BLOCKED(lock));
1828 ASSERT(IS_ACTIVE(lock));
1830 ASSERT((vp->v_filocks != NULL));
1832 if (vp->v_filocks == (struct filock *)lock) {
1833 vp->v_filocks = (struct filock *)
1834 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1835 NULL);
1837 lock->l_next->l_prev = lock->l_prev;
1838 lock->l_prev->l_next = lock->l_next;
1839 lock->l_next = lock->l_prev = NULL;
1840 flk_set_state(lock, FLK_DEAD_STATE);
1841 lock->l_state &= ~ACTIVE_LOCK;
1843 if (free_lock)
1844 flk_free_lock(lock);
1845 CHECK_ACTIVE_LOCKS(gp);
1846 CHECK_SLEEPING_LOCKS(gp);
1850 * Insert into the sleep queue.
1853 static void
1854 flk_insert_sleeping_lock(lock_descriptor_t *request)
1856 graph_t *gp = request->l_graph;
1857 vnode_t *vp = request->l_vnode;
1858 lock_descriptor_t *lock;
1860 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1861 ASSERT(IS_INITIAL(request));
1863 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1864 lock->l_vnode < vp); lock = lock->l_next)
1867 lock->l_prev->l_next = request;
1868 request->l_prev = lock->l_prev;
1869 lock->l_prev = request;
1870 request->l_next = lock;
1871 flk_set_state(request, FLK_SLEEPING_STATE);
1872 request->l_state |= SLEEPING_LOCK;
1876 * Cancelling a sleeping lock implies removing a vertex from the
1877 * dependency graph and therefore we should recompute the dependencies
1878 * of all vertices that have a path to this vertex, w.r.t. all
1879 * vertices reachable from this vertex.
1882 void
1883 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1885 graph_t *gp = request->l_graph;
1886 vnode_t *vp = request->l_vnode;
1887 lock_descriptor_t **topology = NULL;
1888 edge_t *ep;
1889 lock_descriptor_t *vertex, *lock;
1890 int nvertex = 0;
1891 int i;
1892 lock_descriptor_t *vertex_stack;
1894 STACK_INIT(vertex_stack);
1896 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1898 * count the number of vertex pointers that have to be allocated:
1899 * all vertices that are reachable from request.
1902 STACK_PUSH(vertex_stack, request, l_stack);
1904 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1905 STACK_POP(vertex_stack, l_stack);
1906 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1907 ep = NEXT_ADJ(ep)) {
1908 if (IS_RECOMPUTE(ep->to_vertex))
1909 continue;
1910 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1911 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1912 nvertex++;
1917 * allocate memory for holding the vertex pointers
1920 if (nvertex) {
1921 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1922 KM_SLEEP);
1926 * one more pass to actually store the vertices in the
1927 * allocated array.
1928 * We first check sleeping locks and then active locks
1929 * so that topology array will be in a topological
1930 * order.
1933 nvertex = 0;
1934 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1936 if (lock) {
1937 do {
1938 if (IS_RECOMPUTE(lock)) {
1939 lock->l_index = nvertex;
1940 topology[nvertex++] = lock;
1942 lock->l_color = NO_COLOR;
1943 lock = lock->l_next;
1944 } while (lock->l_vnode == vp);
1947 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1949 if (lock) {
1950 do {
1951 if (IS_RECOMPUTE(lock)) {
1952 lock->l_index = nvertex;
1953 topology[nvertex++] = lock;
1955 lock->l_color = NO_COLOR;
1956 lock = lock->l_next;
1957 } while (lock->l_vnode == vp);
1961 * remove in and out edges of request
1962 * They are freed after updating proc_graph below.
1965 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1966 ADJ_LIST_REMOVE(ep);
1970 if (remove_from_queue)
1971 REMOVE_SLEEP_QUEUE(request);
1973 /* we are ready to recompute */
1975 flk_recompute_dependencies(request, topology, nvertex, 1);
1977 ep = FIRST_ADJ(request);
1978 while (ep != HEAD(request)) {
1979 IN_LIST_REMOVE(ep);
1980 request->l_sedge = NEXT_ADJ(ep);
1981 ADJ_LIST_REMOVE(ep);
1982 flk_update_proc_graph(ep, 1);
1983 flk_free_edge(ep);
1984 ep = request->l_sedge;
1989 * unset the RECOMPUTE flag in those vertices
1992 for (i = 0; i < nvertex; i++) {
1993 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1997 * free the topology
1999 if (nvertex)
2000 kmem_free((void *)topology,
2001 (nvertex * sizeof (lock_descriptor_t *)));
2003 * Possibility of some locks unblocked now
2006 flk_wakeup(request, 0);
2009 * we expect to have a correctly recomputed graph now.
2011 flk_set_state(request, FLK_DEAD_STATE);
2012 flk_free_lock(request);
2013 CHECK_SLEEPING_LOCKS(gp);
2014 CHECK_ACTIVE_LOCKS(gp);
2019 * Uncoloring the graph simply increments the mark value of the graph;
2020 * only when wraparound takes place do we explicitly reset the color of
2021 * all vertices in the graph.
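 *
 * A sketch of the assumed invariant (see the COLOR() and COLORED() macros
 * used elsewhere in this file): a vertex is considered colored when its
 * l_color equals the graph's current mark, so bumping gp->mark below
 * "uncolors" every vertex at once, and only on mark wraparound do we walk
 * the active and sleeping lists to reset l_color explicitly.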
2024 static void
2025 flk_graph_uncolor(graph_t *gp)
2027 lock_descriptor_t *lock;
2029 if (gp->mark == UINT_MAX) {
2030 gp->mark = 1;
2031 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2032 lock = lock->l_next)
2033 lock->l_color = 0;
2035 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
2036 lock = lock->l_next)
2037 lock->l_color = 0;
2038 } else {
2039 gp->mark++;
2044 * Wake up locks that are blocked on the given lock.
2047 static void
2048 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2050 edge_t *ep;
2051 graph_t *gp = lock->l_graph;
2052 lock_descriptor_t *lck;
2054 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2055 if (NO_DEPENDENTS(lock))
2056 return;
2057 ep = FIRST_IN(lock);
2058 do {
2060 * delete the edge from the adjacency list
2061 * of the from vertex. If that vertex has no more
2062 * adjacent edges, wake up its process.
2064 lck = ep->from_vertex;
2065 if (adj_list_remove)
2066 ADJ_LIST_REMOVE(ep);
2067 flk_update_proc_graph(ep, 1);
2068 if (NOT_BLOCKED(lck)) {
2069 GRANT_WAKEUP(lck);
2071 lock->l_sedge = NEXT_IN(ep);
2072 IN_LIST_REMOVE(ep);
2073 flk_free_edge(ep);
2074 ep = lock->l_sedge;
2075 } while (ep != HEAD(lock));
2076 ASSERT(NO_DEPENDENTS(lock));
2080 * The dependents of request are checked for dependencies against the
2081 * locks in topology (called topology because the array is and should be in
2082 * topological order for this algorithm; if it is not in topological order the
2083 * inner loop below might add more edges than necessary. Topological ordering
2084 * of vertices satisfies the property that all edges go from left to
2085 * right, i.e., topology[i] can have an edge to topology[j] iff i < j).
2086 * If lock l1 in the dependent set of request is dependent on (blocked by)
2087 * lock l2 in topology but does not have a path to it, we add an edge
2088 * in the inner loop below.
2090 * We don't want to add an edge between l1 and l2 if there already exists
2091 * a path from l1 to l2, so care has to be taken for those vertices
2092 * that have two paths to 'request'. These vertices are referred to here
2093 * as barrier locks.
2095 * The barriers have to be found (those vertices that originally had two paths
2096 * to request) because otherwise we may end up adding edges unnecessarily
2097 * to vertices in topology, and thus a barrier vertex could end up with both
2098 * an edge to a vertex in topology and a path to it.
2101 static void
2102 flk_recompute_dependencies(lock_descriptor_t *request,
2103 lock_descriptor_t **topology, int nvertex, int update_graph)
2105 lock_descriptor_t *vertex, *lock;
2106 graph_t *gp = request->l_graph;
2107 int i, count;
2108 int barrier_found = 0;
2109 edge_t *ep;
2110 lock_descriptor_t *vertex_stack;
2112 STACK_INIT(vertex_stack);
2114 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2115 if (nvertex == 0)
2116 return;
2117 flk_graph_uncolor(request->l_graph);
2118 barrier_found = flk_find_barriers(request);
2119 request->l_state |= RECOMPUTE_DONE;
2121 STACK_PUSH(vertex_stack, request, l_stack);
2122 request->l_sedge = FIRST_IN(request);
2125 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2126 if (vertex->l_state & RECOMPUTE_DONE) {
2127 count = 0;
2128 goto next_in_edge;
2130 if (IS_BARRIER(vertex)) {
2131 /* decrement the barrier count */
2132 if (vertex->l_index) {
2133 vertex->l_index--;
2134 /* this vertex will be pushed again anyway */
2135 STACK_POP(vertex_stack, l_stack);
2136 if (vertex->l_index == 0) {
2138 * The barrier is over; we can recompute
2139 * dependencies for this lock on the
2140 * next stack pop.
2142 vertex->l_state &= ~BARRIER_LOCK;
2144 continue;
2147 vertex->l_state |= RECOMPUTE_DONE;
2148 flk_graph_uncolor(gp);
2149 count = flk_color_reachables(vertex);
2150 for (i = 0; i < nvertex; i++) {
2151 lock = topology[i];
2152 if (COLORED(lock))
2153 continue;
2154 if (BLOCKS(lock, vertex)) {
2155 (void) flk_add_edge(vertex, lock,
2156 NO_CHECK_CYCLE, update_graph);
2157 COLOR(lock);
2158 count++;
2159 count += flk_color_reachables(lock);
2164 next_in_edge:
2165 if (count == nvertex ||
2166 vertex->l_sedge == HEAD(vertex)) {
2167 /* prune the tree below this */
2168 STACK_POP(vertex_stack, l_stack);
2169 vertex->l_state &= ~RECOMPUTE_DONE;
2170 /* update the barrier locks below this! */
2171 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2172 flk_graph_uncolor(gp);
2173 flk_update_barriers(vertex);
2175 continue;
2178 ep = vertex->l_sedge;
2179 lock = ep->from_vertex;
2180 STACK_PUSH(vertex_stack, lock, l_stack);
2181 lock->l_sedge = FIRST_IN(lock);
2182 vertex->l_sedge = NEXT_IN(ep);
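/*
 * Worked example (illustrative only; the lock names are hypothetical).
 * Suppose the dependent set of 'request' is { l1 }, the recompute set
 * is topology[] = { t0, t1 }, and the graph already has edges
 * l1 -> t0 and t0 -> t1 (an edge x -> y meaning x is blocked by y;
 * t0 -> t1 is consistent with topological order since 0 < 1).  When l1
 * is popped from the stack, flk_color_reachables(l1) colors t0 and,
 * through t0's edge, t1 as well.  The inner loop then skips both
 * colored entries, so no l1 -> t1 edge is added even if BLOCKS(t1, l1)
 * is true: the existing path l1 -> t0 -> t1 already records the
 * dependency.  Only uncolored, blocking entries of topology[] receive
 * a new edge.
 */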
2188 * Color all vertices reachable from 'vertex' that belong to the topology
2189 * (here, those that have RECOMPUTE_LOCK set in their state) and are not yet colored.
2191 * Note: we need to use a different stack link, l_stack1, because this is
2192 * called from flk_recompute_dependencies(), which already uses a stack with
2193 * l_stack as the stack link.
2196 static int
2197 flk_color_reachables(lock_descriptor_t *vertex)
2199 lock_descriptor_t *ver, *lock;
2200 int count;
2201 edge_t *ep;
2202 lock_descriptor_t *vertex_stack;
2204 STACK_INIT(vertex_stack);
2206 STACK_PUSH(vertex_stack, vertex, l_stack1);
2207 count = 0;
2208 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2210 STACK_POP(vertex_stack, l_stack1);
2211 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2212 ep = NEXT_ADJ(ep)) {
2213 lock = ep->to_vertex;
2214 if (COLORED(lock))
2215 continue;
2216 COLOR(lock);
2217 if (IS_RECOMPUTE(lock))
2218 count++;
2219 STACK_PUSH(vertex_stack, lock, l_stack1);
2223 return (count);
2227 * Called from flk_recompute_dependencies(), this routine decrements
2228 * the barrier count of barrier vertices that are reachable from 'lock'.
2231 static void
2232 flk_update_barriers(lock_descriptor_t *lock)
2234 lock_descriptor_t *vertex, *lck;
2235 edge_t *ep;
2236 lock_descriptor_t *vertex_stack;
2238 STACK_INIT(vertex_stack);
2240 STACK_PUSH(vertex_stack, lock, l_stack1);
2242 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2243 STACK_POP(vertex_stack, l_stack1);
2244 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2245 ep = NEXT_IN(ep)) {
2246 lck = ep->from_vertex;
2247 if (COLORED(lck)) {
2248 if (IS_BARRIER(lck)) {
2249 ASSERT(lck->l_index > 0);
2250 lck->l_index--;
2251 if (lck->l_index == 0)
2252 lck->l_state &= ~BARRIER_LOCK;
2254 continue;
2256 COLOR(lck);
2257 if (IS_BARRIER(lck)) {
2258 ASSERT(lck->l_index > 0);
2259 lck->l_index--;
2260 if (lck->l_index == 0)
2261 lck->l_state &= ~BARRIER_LOCK;
2263 STACK_PUSH(vertex_stack, lck, l_stack1);
2269 * Find all vertices that are reachable from 'lock' more than once,
2270 * mark them as barrier vertices, and increment their barrier count.
2271 * The barrier count is one less than the total number of paths from
2272 * 'lock' to that vertex.
2275 static int
2276 flk_find_barriers(lock_descriptor_t *lock)
2278 lock_descriptor_t *vertex, *lck;
2279 int found = 0;
2280 edge_t *ep;
2281 lock_descriptor_t *vertex_stack;
2283 STACK_INIT(vertex_stack);
2285 STACK_PUSH(vertex_stack, lock, l_stack1);
2287 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2288 STACK_POP(vertex_stack, l_stack1);
2289 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2290 ep = NEXT_IN(ep)) {
2291 lck = ep->from_vertex;
2292 if (COLORED(lck)) {
2293 /* this is a barrier */
2294 lck->l_state |= BARRIER_LOCK;
2295 /* index will have barrier count */
2296 lck->l_index++;
2297 if (!found)
2298 found = 1;
2299 continue;
2301 COLOR(lck);
2302 lck->l_index = 0;
2303 STACK_PUSH(vertex_stack, lck, l_stack1);
2306 return (found);
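/*
 * Worked example (illustrative only; the lock names are hypothetical).
 * Take in-edges a -> lock and b -> lock, plus c -> a and c -> b (an
 * edge x -> y meaning x is blocked by y).  The traversal above reaches
 * 'c' twice -- once via 'b' and once via 'a' -- so the second visit
 * finds it already colored, sets BARRIER_LOCK, and leaves l_index at 1:
 * one less than the two paths from 'lock' to 'c'.  Vertices reached
 * exactly once ('a' and 'b') are never marked as barriers.
 */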
2310 * Finds the first lock that is mainly responsible for blocking this
2311 * request. If there is no such lock, request->l_flock.l_type is set to
2312 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2313 * of the blocking lock.
2315 * Note: It is possible a request is blocked by a sleeping lock because
2316 * of the fairness policy used in flk_process_request() to construct the
2317 * dependencies. (see comments before flk_process_request()).
2320 static void
2321 flk_get_first_blocking_lock(lock_descriptor_t *request)
2323 graph_t *gp = request->l_graph;
2324 vnode_t *vp = request->l_vnode;
2325 lock_descriptor_t *lock, *blocker;
2327 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2328 blocker = NULL;
2329 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2331 if (lock) {
2332 do {
2333 if (BLOCKS(lock, request)) {
2334 blocker = lock;
2335 break;
2337 lock = lock->l_next;
2338 } while (lock->l_vnode == vp);
2341 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2343 * No active lock is blocking this request, but if a read
2344 * lock is requested, it may also get blocked by a waiting
2345 * writer. So search all sleeping locks and see if there is
2346 * a writer waiting.
2348 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2349 if (lock) {
2350 do {
2351 if (BLOCKS(lock, request)) {
2352 blocker = lock;
2353 break;
2355 lock = lock->l_next;
2356 } while (lock->l_vnode == vp);
2360 if (blocker) {
2361 report_blocker(blocker, request);
2362 } else
2363 request->l_flock.l_type = F_UNLCK;
2367 * Get the graph_t structure associated with a vnode.
2368 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2369 * not yet been initialized, then a new element is allocated and returned.
2371 graph_t *
2372 flk_get_lock_graph(vnode_t *vp, int initialize)
2374 graph_t *gp;
2375 graph_t *gp_alloc = NULL;
2376 int index = HASH_INDEX(vp);
2378 if (initialize == FLK_USE_GRAPH) {
2379 mutex_enter(&flock_lock);
2380 gp = lock_graph[index];
2381 mutex_exit(&flock_lock);
2382 return (gp);
2385 ASSERT(initialize == FLK_INIT_GRAPH);
2387 if (lock_graph[index] == NULL) {
2389 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2391 /* Initialize the graph */
2393 gp_alloc->active_locks.l_next =
2394 gp_alloc->active_locks.l_prev =
2395 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2396 gp_alloc->sleeping_locks.l_next =
2397 gp_alloc->sleeping_locks.l_prev =
2398 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2399 gp_alloc->index = index;
2400 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2403 mutex_enter(&flock_lock);
2405 gp = lock_graph[index];
2407 /* Recheck the value within flock_lock */
2408 if (gp == NULL) {
2409 struct flock_globals *fg;
2411 /* We must have previously allocated the graph_t structure */
2412 ASSERT(gp_alloc != NULL);
2413 lock_graph[index] = gp = gp_alloc;
2415 * The lockmgr status is only needed if KLM is loaded.
2417 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2418 fg = flk_get_globals();
2419 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2423 mutex_exit(&flock_lock);
2425 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2426 /* There was a race to allocate the graph_t and we lost */
2427 mutex_destroy(&gp_alloc->gp_mutex);
2428 kmem_free(gp_alloc, sizeof (graph_t));
2431 return (gp);
2435 * Determine whether there are any locks for the given vnode with a remote
2436 * sysid. Returns zero if not, non-zero if there are.
2438 * Note that the return value from this function is potentially invalid
2439 * once it has been returned. The caller is responsible for providing its
2440 * own synchronization mechanism to ensure that the return value is useful
2441 * (e.g., see nfs_lockcompletion()).
2444 flk_has_remote_locks(vnode_t *vp)
2446 lock_descriptor_t *lock;
2447 int result = 0;
2448 graph_t *gp;
2450 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2451 if (gp == NULL) {
2452 return (0);
2455 mutex_enter(&gp->gp_mutex);
2457 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2459 if (lock) {
2460 while (lock->l_vnode == vp) {
2461 if (IS_REMOTE(lock)) {
2462 result = 1;
2463 goto done;
2465 lock = lock->l_next;
2469 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2471 if (lock) {
2472 while (lock->l_vnode == vp) {
2473 if (IS_REMOTE(lock)) {
2474 result = 1;
2475 goto done;
2477 lock = lock->l_next;
2481 done:
2482 mutex_exit(&gp->gp_mutex);
2483 return (result);
2487 * Determine whether there are any locks for the given vnode with a remote
2488 * sysid matching the given sysid.
2489 * Used by the new (open source) NFS Lock Manager (NLM).
2492 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2494 lock_descriptor_t *lock;
2495 int result = 0;
2496 graph_t *gp;
2498 if (sysid == 0)
2499 return (0);
2501 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2502 if (gp == NULL) {
2503 return (0);
2506 mutex_enter(&gp->gp_mutex);
2508 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2510 if (lock) {
2511 while (lock->l_vnode == vp) {
2512 if (lock->l_flock.l_sysid == sysid) {
2513 result = 1;
2514 goto done;
2516 lock = lock->l_next;
2520 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2522 if (lock) {
2523 while (lock->l_vnode == vp) {
2524 if (lock->l_flock.l_sysid == sysid) {
2525 result = 1;
2526 goto done;
2528 lock = lock->l_next;
2532 done:
2533 mutex_exit(&gp->gp_mutex);
2534 return (result);
2538 * Determine if there are any locks owned by the given sysid.
2539 * Returns zero if not, non-zero if there are. Note that this return code
2540 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2541 * avoids all the memory allocations of those routines.
2543 * This routine has the same synchronization issues as
2544 * flk_has_remote_locks.
2548 flk_sysid_has_locks(int sysid, int lck_type)
2550 int has_locks = 0;
2551 lock_descriptor_t *lock;
2552 graph_t *gp;
2553 int i;
2555 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2556 mutex_enter(&flock_lock);
2557 gp = lock_graph[i];
2558 mutex_exit(&flock_lock);
2559 if (gp == NULL) {
2560 continue;
2563 mutex_enter(&gp->gp_mutex);
2565 if (lck_type & FLK_QUERY_ACTIVE) {
2566 for (lock = ACTIVE_HEAD(gp)->l_next;
2567 lock != ACTIVE_HEAD(gp) && !has_locks;
2568 lock = lock->l_next) {
2569 if (lock->l_flock.l_sysid == sysid)
2570 has_locks = 1;
2574 if (lck_type & FLK_QUERY_SLEEPING) {
2575 for (lock = SLEEPING_HEAD(gp)->l_next;
2576 lock != SLEEPING_HEAD(gp) && !has_locks;
2577 lock = lock->l_next) {
2578 if (lock->l_flock.l_sysid == sysid)
2579 has_locks = 1;
2582 mutex_exit(&gp->gp_mutex);
2585 return (has_locks);
2590 * Delete all locks in the system that belong to the sysid of the request.
2593 static void
2594 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2596 int sysid = request->l_flock.l_sysid;
2597 lock_descriptor_t *lock, *nlock;
2598 graph_t *gp;
2599 int i;
2601 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2602 ASSERT(sysid != 0);
2604 mutex_exit(&request->l_graph->gp_mutex);
2606 for (i = 0; i < HASH_SIZE; i++) {
2607 mutex_enter(&flock_lock);
2608 gp = lock_graph[i];
2609 mutex_exit(&flock_lock);
2611 if (gp == NULL)
2612 continue;
2614 mutex_enter(&gp->gp_mutex);
2616 /* signal sleeping requests so that they bail out */
2617 lock = SLEEPING_HEAD(gp)->l_next;
2618 while (lock != SLEEPING_HEAD(gp)) {
2619 nlock = lock->l_next;
2620 if (lock->l_flock.l_sysid == sysid) {
2621 INTERRUPT_WAKEUP(lock);
2623 lock = nlock;
2626 /* delete active locks */
2627 lock = ACTIVE_HEAD(gp)->l_next;
2628 while (lock != ACTIVE_HEAD(gp)) {
2629 nlock = lock->l_next;
2630 if (lock->l_flock.l_sysid == sysid) {
2631 flk_delete_active_lock(lock, 0);
2632 flk_wakeup(lock, 1);
2633 flk_free_lock(lock);
2635 lock = nlock;
2637 mutex_exit(&gp->gp_mutex);
2640 mutex_enter(&request->l_graph->gp_mutex);
2644 * Search for a sleeping lock manager lock which matches exactly this lock
2645 * request; if one is found, fake a signal to cancel it.
2647 * Return 1 if a matching lock was found, 0 otherwise.
2650 static int
2651 flk_canceled(lock_descriptor_t *request)
2653 lock_descriptor_t *lock, *nlock;
2654 graph_t *gp = request->l_graph;
2655 vnode_t *vp = request->l_vnode;
2657 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2658 ASSERT(IS_LOCKMGR(request));
2659 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2661 if (lock) {
2662 while (lock->l_vnode == vp) {
2663 nlock = lock->l_next;
2664 if (SAME_OWNER(lock, request) &&
2665 lock->l_start == request->l_start &&
2666 lock->l_end == request->l_end) {
2667 INTERRUPT_WAKEUP(lock);
2668 return (1);
2670 lock = nlock;
2673 return (0);
2677 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2678 * (OFD locks are pid-less, so we will never match them against the incoming
2679 * pid.) OFD locks are removed earlier in the close() path via closef() and
2680 * ofdcleanlock().
2682 void
2683 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2685 graph_t *gp;
2686 lock_descriptor_t *lock, *nlock;
2687 lock_descriptor_t *link_stack;
2689 STACK_INIT(link_stack);
2691 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2693 if (gp == NULL)
2694 return;
2695 mutex_enter(&gp->gp_mutex);
2697 CHECK_SLEEPING_LOCKS(gp);
2698 CHECK_ACTIVE_LOCKS(gp);
2700 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2702 if (lock) {
2703 do {
2704 nlock = lock->l_next;
2705 if ((lock->l_flock.l_pid == pid ||
2706 pid == IGN_PID) &&
2707 lock->l_flock.l_sysid == sysid) {
2708 CANCEL_WAKEUP(lock);
2710 lock = nlock;
2711 } while (lock->l_vnode == vp);
2714 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2716 if (lock) {
2717 do {
2718 nlock = lock->l_next;
2719 if ((lock->l_flock.l_pid == pid ||
2720 pid == IGN_PID) &&
2721 lock->l_flock.l_sysid == sysid) {
2722 flk_delete_active_lock(lock, 0);
2723 STACK_PUSH(link_stack, lock, l_stack);
2725 lock = nlock;
2726 } while (lock->l_vnode == vp);
2729 while ((lock = STACK_TOP(link_stack)) != NULL) {
2730 STACK_POP(link_stack, l_stack);
2731 flk_wakeup(lock, 1);
2732 flk_free_lock(lock);
2735 CHECK_SLEEPING_LOCKS(gp);
2736 CHECK_ACTIVE_LOCKS(gp);
2737 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2738 mutex_exit(&gp->gp_mutex);
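/*
 * Userland illustration (a sketch, not part of this file; it assumes
 * <fcntl.h> and <unistd.h> and shows only the POSIX record-lock case).
 * A lock taken with F_SETLK is removed by cleanlocks() above as soon as
 * the process closes *any* descriptor for the file, whereas an OFD-style
 * lock is tied to the open file description and, as noted above, is
 * cleaned up earlier in the close() path via ofdcleanlock() only when
 * the last descriptor for that description goes away:
 *
 *	int fd1 = open("f", O_RDWR);
 *	int fd2 = dup(fd1);
 *	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
 *	(void) fcntl(fd1, F_SETLK, &fl);	// whole-file POSIX lock
 *	(void) close(fd2);			// the lock taken via fd1
 *						// is dropped here as well
 */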
2743 * Called from 'fs' read and write routines for files that have mandatory
2744 * locking enabled.
2748 chklock(struct vnode *vp, int iomode, uoff_t offset, ssize_t len, int fmode,
2749 caller_context_t *ct)
2751 register int i;
2752 struct flock64 bf;
2753 int error = 0;
2755 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2756 bf.l_whence = 0;
2757 bf.l_start = offset;
2758 bf.l_len = len;
2759 if (ct == NULL) {
2760 bf.l_pid = curproc->p_pid;
2761 bf.l_sysid = 0;
2762 } else {
2763 bf.l_pid = ct->cc_pid;
2764 bf.l_sysid = ct->cc_sysid;
2766 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2767 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2768 bf.l_type != F_UNLCK)
2769 error = i ? i : EAGAIN;
2770 return (error);
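/*
 * Typical call site (a sketch; MANDLOCK() is the usual test in
 * sys/vnode.h, while the surrounding variable names here are
 * illustrative).  A filesystem write routine checks for a conflicting
 * mandatory lock before doing any I/O:
 *
 *	if (MANDLOCK(vp, va.va_mode)) {
 *		error = chklock(vp, FWRITE, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 */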
2774 * convoff - convert the lock description in 'lckdat' (l_start, l_whence)
2775 * so that it is expressed relative to the given whence.
2778 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
2780 int error;
2781 struct vattr vattr;
2783 if ((lckdat->l_whence == 2) || (whence == 2)) {
2784 vattr.va_mask = AT_SIZE;
2785 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
2786 return (error);
2789 switch (lckdat->l_whence) {
2790 case 1:
2791 lckdat->l_start += offset;
2792 break;
2793 case 2:
2794 lckdat->l_start += vattr.va_size;
2795 /* FALLTHRU */
2796 case 0:
2797 break;
2798 default:
2799 return (EINVAL);
2802 if (lckdat->l_start < 0)
2803 return (EINVAL);
2805 switch (whence) {
2806 case 1:
2807 lckdat->l_start -= offset;
2808 break;
2809 case 2:
2810 lckdat->l_start -= vattr.va_size;
2811 /* FALLTHRU */
2812 case 0:
2813 break;
2814 default:
2815 return (EINVAL);
2818 lckdat->l_whence = (short)whence;
2819 return (0);
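/*
 * Worked example (illustrative only).  A lock described relative to the
 * current file offset (l_whence == 1, l_start == 50) with the file
 * offset at 200 can be normalized to an absolute description with
 *
 *	convoff(vp, &lckdat, 0, 200);
 *
 * The first switch adds the offset (l_start becomes 250), the second
 * subtracts nothing for whence 0, and l_whence is set to 0, so the lock
 * now starts at absolute byte 250.
 */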
2823 /* proc_graph function definitions */
2826 * Check for a deadlock caused by the new 'lock'. If a deadlock is found,
2827 * the edges of this lock are freed and 1 is returned.
2830 static int
2831 flk_check_deadlock(lock_descriptor_t *lock)
2833 proc_vertex_t *start_vertex, *pvertex;
2834 proc_vertex_t *dvertex;
2835 proc_edge_t *pep, *ppep;
2836 edge_t *ep, *nep;
2837 proc_vertex_t *process_stack;
2840 * OFD style locks are not associated with any process so there is
2841 * no proc graph for these. Thus we cannot, and do not, do deadlock
2842 * detection.
2844 if (lock->l_ofd != NULL)
2845 return (0);
2847 STACK_INIT(process_stack);
2849 mutex_enter(&flock_lock);
2850 start_vertex = flk_get_proc_vertex(lock);
2851 ASSERT(start_vertex != NULL);
2853 /* construct the edges from this process to other processes */
2855 ep = FIRST_ADJ(lock);
2856 while (ep != HEAD(lock)) {
2857 proc_vertex_t *adj_proc;
2859 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2860 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2861 if (pep->to_proc == adj_proc) {
2862 ASSERT(pep->refcount);
2863 pep->refcount++;
2864 break;
2867 if (pep == NULL) {
2868 pep = flk_get_proc_edge();
2869 pep->to_proc = adj_proc;
2870 pep->refcount = 1;
2871 adj_proc->incount++;
2872 pep->next = start_vertex->edge;
2873 start_vertex->edge = pep;
2875 ep = NEXT_ADJ(ep);
2878 ep = FIRST_IN(lock);
2880 while (ep != HEAD(lock)) {
2881 proc_vertex_t *in_proc;
2883 in_proc = flk_get_proc_vertex(ep->from_vertex);
2885 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2886 if (pep->to_proc == start_vertex) {
2887 ASSERT(pep->refcount);
2888 pep->refcount++;
2889 break;
2892 if (pep == NULL) {
2893 pep = flk_get_proc_edge();
2894 pep->to_proc = start_vertex;
2895 pep->refcount = 1;
2896 start_vertex->incount++;
2897 pep->next = in_proc->edge;
2898 in_proc->edge = pep;
2900 ep = NEXT_IN(ep);
2903 if (start_vertex->incount == 0) {
2904 mutex_exit(&flock_lock);
2905 return (0);
2908 flk_proc_graph_uncolor();
2910 start_vertex->p_sedge = start_vertex->edge;
2912 STACK_PUSH(process_stack, start_vertex, p_stack);
2914 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2915 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2916 dvertex = pep->to_proc;
2917 if (!PROC_ARRIVED(dvertex)) {
2918 STACK_PUSH(process_stack, dvertex, p_stack);
2919 dvertex->p_sedge = dvertex->edge;
2920 PROC_ARRIVE(pvertex);
2921 pvertex->p_sedge = pep->next;
2922 break;
2924 if (!PROC_DEPARTED(dvertex))
2925 goto deadlock;
2927 if (pep == NULL) {
2928 PROC_DEPART(pvertex);
2929 STACK_POP(process_stack, p_stack);
2932 mutex_exit(&flock_lock);
2933 return (0);
2935 deadlock:
2937 /* we remove all lock edges and proc edges */
2939 ep = FIRST_ADJ(lock);
2940 while (ep != HEAD(lock)) {
2941 proc_vertex_t *adj_proc;
2942 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2943 nep = NEXT_ADJ(ep);
2944 IN_LIST_REMOVE(ep);
2945 ADJ_LIST_REMOVE(ep);
2946 flk_free_edge(ep);
2947 ppep = start_vertex->edge;
2948 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2949 pep = ppep->next) {
2950 if (pep->to_proc == adj_proc) {
2951 pep->refcount--;
2952 if (pep->refcount == 0) {
2953 if (pep == ppep) {
2954 start_vertex->edge = pep->next;
2955 } else {
2956 ppep->next = pep->next;
2958 adj_proc->incount--;
2959 flk_proc_release(adj_proc);
2960 flk_free_proc_edge(pep);
2962 break;
2965 ep = nep;
2967 ep = FIRST_IN(lock);
2968 while (ep != HEAD(lock)) {
2969 proc_vertex_t *in_proc;
2970 in_proc = flk_get_proc_vertex(ep->from_vertex);
2971 nep = NEXT_IN(ep);
2972 IN_LIST_REMOVE(ep);
2973 ADJ_LIST_REMOVE(ep);
2974 flk_free_edge(ep);
2975 ppep = in_proc->edge;
2976 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2977 pep = ppep->next) {
2978 if (pep->to_proc == start_vertex) {
2979 pep->refcount--;
2980 if (pep->refcount == 0) {
2981 if (pep == ppep) {
2982 in_proc->edge = pep->next;
2983 } else {
2984 ppep->next = pep->next;
2986 start_vertex->incount--;
2987 flk_proc_release(in_proc);
2988 flk_free_proc_edge(pep);
2990 break;
2993 ep = nep;
2995 flk_proc_release(start_vertex);
2996 mutex_exit(&flock_lock);
2997 return (1);
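/*
 * Worked example (illustrative only).  The new 'lock' belongs to
 * process P1 and is blocked by a lock P2 holds, so the first loop above
 * records the proc edge P1 -> P2.  If the proc graph already contains
 * P2 -> P1 (some earlier request of P2 sleeps on a lock P1 holds),
 * start_vertex->incount is non-zero and the DFS runs: it pushes P1,
 * follows P1 -> P2, and from P2 finds the edge back to P1, which is
 * ARRIVED but not yet DEPARTED -- a back edge, i.e. a cycle.  We then
 * jump to 'deadlock', remove all of this lock's edges and drop the
 * corresponding proc-edge references, and return 1 so the caller can
 * fail the request (with EDEADLK).
 */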
3001 * Get a proc vertex. If the lock's pvertex value already refers to the
3002 * right proc vertex in the list we return it; otherwise we allocate one.
3003 * If necessary, we also grow the list of vertices.
3006 static proc_vertex_t *
3007 flk_get_proc_vertex(lock_descriptor_t *lock)
3009 int i;
3010 proc_vertex_t *pv;
3011 proc_vertex_t **palloc;
3013 ASSERT(MUTEX_HELD(&flock_lock));
3014 if (lock->pvertex != -1) {
3015 ASSERT(lock->pvertex >= 0);
3016 pv = pgraph.proc[lock->pvertex];
3017 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3018 return (pv);
3021 for (i = 0; i < pgraph.gcount; i++) {
3022 pv = pgraph.proc[i];
3023 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3024 lock->pvertex = pv->index = i;
3025 return (pv);
3028 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3029 pv->pid = lock->l_flock.l_pid;
3030 pv->sysid = lock->l_flock.l_sysid;
3031 flk_proc_vertex_allocs++;
3032 if (pgraph.free != 0) {
3033 for (i = 0; i < pgraph.gcount; i++) {
3034 if (pgraph.proc[i] == NULL) {
3035 pgraph.proc[i] = pv;
3036 lock->pvertex = pv->index = i;
3037 pgraph.free--;
3038 return (pv);
3042 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3043 sizeof (proc_vertex_t *), KM_SLEEP);
3045 if (pgraph.proc) {
3046 bcopy(pgraph.proc, palloc,
3047 pgraph.gcount * sizeof (proc_vertex_t *));
3049 kmem_free(pgraph.proc,
3050 pgraph.gcount * sizeof (proc_vertex_t *));
3052 pgraph.proc = palloc;
3053 pgraph.free += (PROC_CHUNK - 1);
3054 pv->index = lock->pvertex = pgraph.gcount;
3055 pgraph.gcount += PROC_CHUNK;
3056 pgraph.proc[pv->index] = pv;
3057 return (pv);
3061 * Allocate a proc edge.
3064 static proc_edge_t *
3065 flk_get_proc_edge()
3067 proc_edge_t *pep;
3069 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3070 flk_proc_edge_allocs++;
3071 return (pep);
3075 * Free the proc edge. Called whenever its reference count goes to zero.
3078 static void
3079 flk_free_proc_edge(proc_edge_t *pep)
3081 ASSERT(pep->refcount == 0);
3082 kmem_free((void *)pep, sizeof (proc_edge_t));
3083 flk_proc_edge_frees++;
3087 * Uncolor the proc graph; the vertices are reset explicitly only when the mark value hits its maximum.
3090 static void
3091 flk_proc_graph_uncolor()
3093 int i;
3095 if (pgraph.mark == UINT_MAX) {
3096 for (i = 0; i < pgraph.gcount; i++)
3097 if (pgraph.proc[i] != NULL) {
3098 pgraph.proc[i]->atime = 0;
3099 pgraph.proc[i]->dtime = 0;
3101 pgraph.mark = 1;
3102 } else {
3103 pgraph.mark++;
3108 * Release the proc vertex iff it has neither in-edges nor out-edges.
3111 static void
3112 flk_proc_release(proc_vertex_t *proc)
3114 ASSERT(MUTEX_HELD(&flock_lock));
3115 if (proc->edge == NULL && proc->incount == 0) {
3116 pgraph.proc[proc->index] = NULL;
3117 pgraph.free++;
3118 kmem_free(proc, sizeof (proc_vertex_t));
3119 flk_proc_vertex_frees++;
3124 * Update the process graph to reflect a change in a lock graph.
3125 * Note: we should call this function only after we have a correctly
3126 * recomputed lock graph; otherwise we might miss a deadlock. E.g., in
3127 * flk_relation() we call this function after flk_recompute_dependencies();
3128 * otherwise, if a process tries to lock a vnode hashed into another
3129 * graph, it might sleep forever.
3132 static void
3133 flk_update_proc_graph(edge_t *ep, int delete)
3135 proc_vertex_t *toproc, *fromproc;
3136 proc_edge_t *pep, *prevpep;
3138 mutex_enter(&flock_lock);
3141 * OFD style locks are not associated with any process so there is
3142 * no proc graph for these.
3144 if (ep->from_vertex->l_ofd != NULL) {
3145 mutex_exit(&flock_lock);
3146 return;
3149 toproc = flk_get_proc_vertex(ep->to_vertex);
3150 fromproc = flk_get_proc_vertex(ep->from_vertex);
3152 if (!delete)
3153 goto add;
3154 pep = prevpep = fromproc->edge;
3156 ASSERT(pep != NULL);
3157 while (pep != NULL) {
3158 if (pep->to_proc == toproc) {
3159 ASSERT(pep->refcount > 0);
3160 pep->refcount--;
3161 if (pep->refcount == 0) {
3162 if (pep == prevpep) {
3163 fromproc->edge = pep->next;
3164 } else {
3165 prevpep->next = pep->next;
3167 toproc->incount--;
3168 flk_proc_release(toproc);
3169 flk_free_proc_edge(pep);
3171 break;
3173 prevpep = pep;
3174 pep = pep->next;
3176 flk_proc_release(fromproc);
3177 mutex_exit(&flock_lock);
3178 return;
3179 add:
3181 pep = fromproc->edge;
3183 while (pep != NULL) {
3184 if (pep->to_proc == toproc) {
3185 ASSERT(pep->refcount > 0);
3186 pep->refcount++;
3187 break;
3189 pep = pep->next;
3191 if (pep == NULL) {
3192 pep = flk_get_proc_edge();
3193 pep->to_proc = toproc;
3194 pep->refcount = 1;
3195 toproc->incount++;
3196 pep->next = fromproc->edge;
3197 fromproc->edge = pep;
3199 mutex_exit(&flock_lock);
3203 * Set the control status for lock manager requests.
3205 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3206 * may be lock requests that have gotten started but not finished. In
3207 * particular, there may be blocking requests that are in the callback code
3208 * before sleeping (so they're not holding the lock for the graph). If
3209 * such a thread reacquires the graph's lock (to go to sleep) after
3210 * flk_lockmgr_status is set to a non-up value, it will notice the status
3211 * and bail out. If the request gets granted before the thread can check
3212 * flk_lockmgr_status, let it continue normally. It will get flushed when
3213 * we are called with FLK_LOCKMGR_DOWN.
3216 void
3217 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3219 int i;
3220 graph_t *gp;
3221 struct flock_globals *fg;
3223 fg = flk_get_globals();
3224 ASSERT(fg != NULL);
3226 mutex_enter(&flock_lock);
3227 fg->flk_lockmgr_status = status;
3228 mutex_exit(&flock_lock);
3231 * If the lock manager is coming back up, all that's needed is to
3232 * propagate this information to the graphs. If the lock manager
3233 * is going down, additional action is required, and each graph's
3234 * copy of the state is updated atomically with this other action.
3236 switch (status) {
3237 case FLK_LOCKMGR_UP:
3238 for (i = 0; i < HASH_SIZE; i++) {
3239 mutex_enter(&flock_lock);
3240 gp = lock_graph[i];
3241 mutex_exit(&flock_lock);
3242 if (gp == NULL)
3243 continue;
3244 mutex_enter(&gp->gp_mutex);
3245 fg->lockmgr_status[i] = status;
3246 mutex_exit(&gp->gp_mutex);
3248 break;
3249 case FLK_WAKEUP_SLEEPERS:
3250 wakeup_sleeping_lockmgr_locks(fg);
3251 break;
3252 case FLK_LOCKMGR_DOWN:
3253 unlock_lockmgr_granted(fg);
3254 break;
3255 default:
3256 panic("flk_set_lockmgr_status: bad status (%d)", status);
3257 break;
3262 * This routine returns all the locks that are active or sleeping and are
3263 * associated with a particular set of identifiers. If lock_state != 0, then
3264 * only locks that match the lock_state are returned. If lock_state == 0, then
3265 * all locks are returned. If pid == NOPID, the pid is ignored. If
3266 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3267 * vnode pointer is ignored.
3269 * A list containing the vnode pointer and an flock structure
3270 * describing the lock is returned. Each element in the list is
3271 * dynamically allocated and must be freed by the caller. The
3272 * last item in the list is denoted by a NULL value in the ll_next
3273 * field.
3275 * The vnode pointers returned are held. The caller is responsible
3276 * for releasing these. Note that the returned list is only a snapshot of
3277 * the current lock information, and that it is a snapshot of a moving
3278 * target (only one graph is locked at a time).
3281 locklist_t *
3282 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3283 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3285 lock_descriptor_t *lock;
3286 lock_descriptor_t *graph_head;
3287 locklist_t listhead;
3288 locklist_t *llheadp;
3289 locklist_t *llp;
3290 locklist_t *lltp;
3291 graph_t *gp;
3292 int i;
3293 int first_index; /* graph index */
3294 int num_indexes; /* graph index */
3296 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3297 (list_type == FLK_SLEEPING_STATE));
3300 * Get a pointer to something to use as a list head while building
3301 * the rest of the list.
3303 llheadp = &listhead;
3304 lltp = llheadp;
3305 llheadp->ll_next = (locklist_t *)NULL;
3307 /* Figure out which graphs we want to look at. */
3308 if (vp == NULL) {
3309 first_index = 0;
3310 num_indexes = HASH_SIZE;
3311 } else {
3312 first_index = HASH_INDEX(vp);
3313 num_indexes = 1;
3316 for (i = first_index; i < first_index + num_indexes; i++) {
3317 mutex_enter(&flock_lock);
3318 gp = lock_graph[i];
3319 mutex_exit(&flock_lock);
3320 if (gp == NULL) {
3321 continue;
3324 mutex_enter(&gp->gp_mutex);
3325 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3326 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3327 for (lock = graph_head->l_next;
3328 lock != graph_head;
3329 lock = lock->l_next) {
3330 if (use_sysid && lock->l_flock.l_sysid != sysid)
3331 continue;
3332 if (pid != NOPID && lock->l_flock.l_pid != pid)
3333 continue;
3334 if (vp != NULL && lock->l_vnode != vp)
3335 continue;
3336 if (lock_state && !(lock_state & lock->l_state))
3337 continue;
3338 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3339 continue;
3341 * A matching lock was found. Allocate
3342 * space for a new locklist entry and fill
3343 * it in.
3345 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3346 lltp->ll_next = llp;
3347 VN_HOLD(lock->l_vnode);
3348 llp->ll_vp = lock->l_vnode;
3349 create_flock(lock, &(llp->ll_flock));
3350 llp->ll_next = (locklist_t *)NULL;
3351 lltp = llp;
3353 mutex_exit(&gp->gp_mutex);
3356 llp = llheadp->ll_next;
3357 return (llp);
3361 * These two functions are simply interfaces to get_lock_list. They return
3362 * a list of sleeping or active locks for the given sysid and pid. See
3363 * get_lock_list for details.
3365 * In either case we don't particularly care to specify the zone of interest;
3366 * the sysid-space is global across zones, so the sysid will map to exactly one
3367 * zone, and we'll return information for that zone.
3370 locklist_t *
3371 flk_get_sleeping_locks(int sysid, pid_t pid)
3373 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3374 ALL_ZONES));
3377 locklist_t *
3378 flk_get_active_locks(int sysid, pid_t pid)
3380 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3381 ALL_ZONES));
3385 * Another interface to get_lock_list. This one returns all the active
3386 * locks for a given vnode. Again, see get_lock_list for details.
3388 * We don't need to specify which zone's locks we're interested in. The matter
3389 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3390 * be used by multiple zones, so the list of locks will all be from the right
3391 * zone.
3394 locklist_t *
3395 flk_active_locks_for_vp(const vnode_t *vp)
3397 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3398 ALL_ZONES));
3402 * Another interface to get_lock_list. This one returns all the active
3403 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3405 * See the comment for flk_active_locks_for_vp() for why we don't care to
3406 * specify the particular zone of interest.
3408 locklist_t *
3409 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3411 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3412 NOPID, vp, ALL_ZONES));
3416 * Another interface to get_lock_list. This one returns all the active
3417 * nbmand locks for a given pid. Again, see get_lock_list for details.
3419 * The zone doesn't need to be specified here; the locks held by a
3420 * particular process will either be local (ie, non-NFS) or from the zone
3421 * the process is executing in. This is because other parts of the system
3422 * ensure that an NFS vnode can't be used in a zone other than that in
3423 * which it was opened.
3425 locklist_t *
3426 flk_active_nbmand_locks(pid_t pid)
3428 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3429 pid, NULL, ALL_ZONES));
3433 * Free up all entries in the locklist.
3435 void
3436 flk_free_locklist(locklist_t *llp)
3438 locklist_t *next_llp;
3440 while (llp) {
3441 next_llp = llp->ll_next;
3442 VN_RELE(llp->ll_vp);
3443 kmem_free(llp, sizeof (*llp));
3444 llp = next_llp;
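/*
 * Usage sketch (illustrative only).  A caller interested in the active
 * locks on a vnode walks the snapshot returned by
 * flk_active_locks_for_vp() and then releases both the list entries and
 * the vnode holds, which flk_free_locklist() does in one pass:
 *
 *	locklist_t *ll, *llp;
 *
 *	ll = flk_active_locks_for_vp(vp);
 *	for (llp = ll; llp != NULL; llp = llp->ll_next)
 *		cmn_err(CE_CONT, "lock: pid %d start %lld\n",
 *		    llp->ll_flock.l_pid,
 *		    (longlong_t)llp->ll_flock.l_start);
 *	flk_free_locklist(ll);
 */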
3449 * Find all sleeping lock manager requests and poke them.
3451 static void
3452 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3454 lock_descriptor_t *lock;
3455 lock_descriptor_t *nlock = NULL; /* next lock */
3456 int i;
3457 graph_t *gp;
3458 zoneid_t zoneid = getzoneid();
3460 for (i = 0; i < HASH_SIZE; i++) {
3461 mutex_enter(&flock_lock);
3462 gp = lock_graph[i];
3463 mutex_exit(&flock_lock);
3464 if (gp == NULL) {
3465 continue;
3468 mutex_enter(&gp->gp_mutex);
3469 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3470 for (lock = SLEEPING_HEAD(gp)->l_next;
3471 lock != SLEEPING_HEAD(gp);
3472 lock = nlock) {
3473 nlock = lock->l_next;
3474 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3475 INTERRUPT_WAKEUP(lock);
3478 mutex_exit(&gp->gp_mutex);
3484 * Find all active (granted) lock manager locks and release them.
3486 static void
3487 unlock_lockmgr_granted(struct flock_globals *fg)
3489 lock_descriptor_t *lock;
3490 lock_descriptor_t *nlock = NULL; /* next lock */
3491 int i;
3492 graph_t *gp;
3493 zoneid_t zoneid = getzoneid();
3495 for (i = 0; i < HASH_SIZE; i++) {
3496 mutex_enter(&flock_lock);
3497 gp = lock_graph[i];
3498 mutex_exit(&flock_lock);
3499 if (gp == NULL) {
3500 continue;
3503 mutex_enter(&gp->gp_mutex);
3504 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3505 for (lock = ACTIVE_HEAD(gp)->l_next;
3506 lock != ACTIVE_HEAD(gp);
3507 lock = nlock) {
3508 nlock = lock->l_next;
3509 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3510 ASSERT(IS_ACTIVE(lock));
3511 flk_delete_active_lock(lock, 0);
3512 flk_wakeup(lock, 1);
3513 flk_free_lock(lock);
3516 mutex_exit(&gp->gp_mutex);
3522 * Wait until a lock is granted, cancelled, or interrupted.
3525 static void
3526 wait_for_lock(lock_descriptor_t *request)
3528 graph_t *gp = request->l_graph;
3530 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3532 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3533 !(IS_INTERRUPTED(request))) {
3534 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3535 flk_set_state(request, FLK_INTERRUPTED_STATE);
3536 request->l_state |= INTERRUPTED_LOCK;
3542 * Create an flock structure from the existing lock information
3544 * This routine is used to create flock structures for the lock manager
3545 * to use in a reclaim request. Since the lock originated on this
3546 * host, it must conform to UNIX semantics, so no checking is
3547 * done to make sure it falls within the lower half of the 32-bit range.
3550 static void
3551 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3553 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3554 ASSERT(lp->l_end >= lp->l_start);
3556 flp->l_type = lp->l_type;
3557 flp->l_whence = 0;
3558 flp->l_start = lp->l_start;
3559 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3560 (lp->l_end - lp->l_start + 1);
3561 flp->l_sysid = lp->l_flock.l_sysid;
3562 flp->l_pid = lp->l_flock.l_pid;
3566 * Convert flock64_t data describing a lock range into unsigned starting
3567 * and ending offsets, which are returned through 'start' and 'end'.
3568 * Returns 0 or an errno value.
3572 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3573 uoff_t *start, uoff_t *end, offset_t offset)
3575 struct vattr vattr;
3576 int error;
3579 * Determine the starting point of the request
3581 switch (flp->l_whence) {
3582 case 0: /* SEEK_SET */
3583 *start = (uoff_t)flp->l_start;
3584 break;
3585 case 1: /* SEEK_CUR */
3586 *start = (uoff_t)(flp->l_start + offset);
3587 break;
3588 case 2: /* SEEK_END */
3589 vattr.va_mask = AT_SIZE;
3590 if (error = fop_getattr(vp, &vattr, 0, CRED(), NULL))
3591 return (error);
3592 *start = (uoff_t)(flp->l_start + vattr.va_size);
3593 break;
3594 default:
3595 return (EINVAL);
3599 * Determine the range covered by the request.
3601 if (flp->l_len == 0)
3602 *end = MAX_U_OFFSET_T;
3603 else if ((offset_t)flp->l_len > 0) {
3604 *end = (uoff_t)(*start + (flp->l_len - 1));
3605 } else {
3607 * Negative length; why do we even allow this?
3608 * Because it allows easy specification of
3609 * the last n bytes of the file.
3611 *end = *start;
3612 *start += (uoff_t)flp->l_len;
3613 (*start)++;
3615 return (0);
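/*
 * Worked example (illustrative only).  With l_whence == 0,
 * l_start == 100 and l_len == -10, the code above computes end = 100
 * and then start = 100 + (-10) + 1 = 91, so the request covers the ten
 * bytes 91..100 ending at the given start offset.  With l_len == 0 the
 * range instead runs from 'start' to MAX_U_OFFSET_T, i.e. to end of
 * file.
 */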
3619 * Check the validity of lock data. This can be used by the NFS
3620 * frlock routines to check data before contacting the server. The
3621 * server must support semantics that aren't as restrictive as
3622 * the UNIX API, so the NFS client is required to check.
3623 * The maximum is now passed in by the caller.
3627 flk_check_lock_data(uoff_t start, uoff_t end, offset_t max)
3630 * The end (length) for local locking should never be greater
3631 * than MAXEND. However, the representation for
3632 * the entire file is MAX_U_OFFSET_T.
3634 if ((start > max) ||
3635 ((end > max) && (end != MAX_U_OFFSET_T))) {
3636 return (EINVAL);
3638 if (start > end) {
3639 return (EINVAL);
3641 return (0);
3645 * Fill in request->l_flock with information about the lock blocking the
3646 * request. The complexity here is that lock manager requests are allowed
3647 * to see into the upper part of the 32-bit address range, whereas local
3648 * requests are only allowed to see signed values.
3650 * What should be done when "blocker" is a lock manager lock that uses the
3651 * upper portion of the 32-bit range, but "request" is local? Since the
3652 * request has already been determined to have been blocked by the blocker,
3653 * at least some portion of "blocker" must be in the range of the request,
3654 * or the request extends to the end of file. For the first case, the
3655 * portion in the lower range is returned with the indication that it goes
3656 * "to EOF." For the second case, the last byte of the lower range is
3657 * returned with the indication that it goes "to EOF."
3660 static void
3661 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3663 flock64_t *flrp; /* l_flock portion of request */
3665 ASSERT(blocker != NULL);
3667 flrp = &request->l_flock;
3668 flrp->l_whence = 0;
3669 flrp->l_type = blocker->l_type;
3670 flrp->l_pid = blocker->l_flock.l_pid;
3671 flrp->l_sysid = blocker->l_flock.l_sysid;
3672 request->l_ofd = blocker->l_ofd;
3674 if (IS_LOCKMGR(request)) {
3675 flrp->l_start = blocker->l_start;
3676 if (blocker->l_end == MAX_U_OFFSET_T)
3677 flrp->l_len = 0;
3678 else
3679 flrp->l_len = blocker->l_end - blocker->l_start + 1;
3680 } else {
3681 if (blocker->l_start > MAXEND) {
3682 flrp->l_start = MAXEND;
3683 flrp->l_len = 0;
3684 } else {
3685 flrp->l_start = blocker->l_start;
3686 if (blocker->l_end == MAX_U_OFFSET_T)
3687 flrp->l_len = 0;
3688 else
3689 flrp->l_len = blocker->l_end -
3690 blocker->l_start + 1;
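/*
 * Worked example (illustrative only).  A local (non-lockmgr) request
 * blocked by a lock manager lock with l_start = MAXEND + 10 and
 * l_end = MAX_U_OFFSET_T is reported with l_start = MAXEND and
 * l_len = 0: the blocker lies entirely above the range a local caller
 * can see, so the last representable byte is returned with the
 * "to EOF" indication, as described in the comment above.
 */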
3696 * Return non-zero if the given I/O request conflicts with an active NBMAND
3697 * lock.
3698 * If svmand is non-zero, it means look at all active locks, not just NBMAND
3699 * locks.
3703 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, uoff_t offset,
3704 ssize_t length, int svmand, caller_context_t *ct)
3706 int conflict = 0;
3707 graph_t *gp;
3708 lock_descriptor_t *lock;
3709 pid_t pid;
3710 int sysid;
3712 if (ct == NULL) {
3713 pid = curproc->p_pid;
3714 sysid = 0;
3715 } else {
3716 pid = ct->cc_pid;
3717 sysid = ct->cc_sysid;
3720 mutex_enter(&flock_lock);
3721 gp = lock_graph[HASH_INDEX(vp)];
3722 mutex_exit(&flock_lock);
3723 if (gp == NULL)
3724 return (0);
3726 mutex_enter(&gp->gp_mutex);
3727 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3729 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3730 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3731 (lock->l_flock.l_sysid != sysid ||
3732 lock->l_flock.l_pid != pid) &&
3733 lock_blocks_io(op, offset, length,
3734 lock->l_type, lock->l_start, lock->l_end)) {
3735 conflict = 1;
3736 break;
3739 mutex_exit(&gp->gp_mutex);
3741 return (conflict);
3745 * Return non-zero if the given I/O request conflicts with the given lock.
3748 static int
3749 lock_blocks_io(nbl_op_t op, uoff_t offset, ssize_t length,
3750 int lock_type, uoff_t lock_start, uoff_t lock_end)
3752 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3753 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3755 if (op == NBL_READ && lock_type == F_RDLCK)
3756 return (0);
3758 if (offset <= lock_start && lock_start < offset + length)
3759 return (1);
3760 if (lock_start <= offset && offset <= lock_end)
3761 return (1);
3763 return (0);
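/*
 * Worked example (illustrative only).  A write of length 100 at offset
 * 500 against a read lock covering bytes 450..549: lock_start (450) is
 * not inside [500, 600), but offset 500 does fall inside [450, 549], so
 * the second test fires and the I/O is blocked.  A read against an
 * F_RDLCK lock is never blocked, whatever the ranges, because of the
 * early return above.
 */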
3766 #ifdef DEBUG
3767 static void
3768 check_active_locks(graph_t *gp)
3770 lock_descriptor_t *lock, *lock1;
3771 edge_t *ep;
3773 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3774 lock = lock->l_next) {
3775 ASSERT(IS_ACTIVE(lock));
3776 ASSERT(NOT_BLOCKED(lock));
3777 ASSERT(!IS_BARRIER(lock));
3779 ep = FIRST_IN(lock);
3781 while (ep != HEAD(lock)) {
3782 ASSERT(IS_SLEEPING(ep->from_vertex));
3783 ASSERT(!NOT_BLOCKED(ep->from_vertex));
3784 ep = NEXT_IN(ep);
3787 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
3788 lock1 = lock1->l_next) {
3789 if (lock1->l_vnode == lock->l_vnode) {
3790 if (BLOCKS(lock1, lock)) {
3791 cmn_err(CE_PANIC,
3792 "active lock %p blocks %p",
3793 (void *)lock1, (void *)lock);
3794 } else if (BLOCKS(lock, lock1)) {
3795 cmn_err(CE_PANIC,
3796 "active lock %p blocks %p",
3797 (void *)lock, (void *)lock1);
3805 * Effect: This function checks whether the transition from 'old_state' to
3806 * 'new_state' is a valid one. It returns 0 if the transition is valid
3807 * and 1 if it is not.
3808 * For a map of valid transitions, see sys/flock_impl.h
3810 static int
3811 check_lock_transition(int old_state, int new_state)
3813 switch (old_state) {
3814 case FLK_INITIAL_STATE:
3815 if ((new_state == FLK_START_STATE) ||
3816 (new_state == FLK_SLEEPING_STATE) ||
3817 (new_state == FLK_ACTIVE_STATE) ||
3818 (new_state == FLK_DEAD_STATE)) {
3819 return (0);
3820 } else {
3821 return (1);
3823 case FLK_START_STATE:
3824 if ((new_state == FLK_ACTIVE_STATE) ||
3825 (new_state == FLK_DEAD_STATE)) {
3826 return (0);
3827 } else {
3828 return (1);
3830 case FLK_ACTIVE_STATE:
3831 if (new_state == FLK_DEAD_STATE) {
3832 return (0);
3833 } else {
3834 return (1);
3836 case FLK_SLEEPING_STATE:
3837 if ((new_state == FLK_GRANTED_STATE) ||
3838 (new_state == FLK_INTERRUPTED_STATE) ||
3839 (new_state == FLK_CANCELLED_STATE)) {
3840 return (0);
3841 } else {
3842 return (1);
3844 case FLK_GRANTED_STATE:
3845 if ((new_state == FLK_START_STATE) ||
3846 (new_state == FLK_INTERRUPTED_STATE) ||
3847 (new_state == FLK_CANCELLED_STATE)) {
3848 return (0);
3849 } else {
3850 return (1);
3852 case FLK_CANCELLED_STATE:
3853 if ((new_state == FLK_INTERRUPTED_STATE) ||
3854 (new_state == FLK_DEAD_STATE)) {
3855 return (0);
3856 } else {
3857 return (1);
3859 case FLK_INTERRUPTED_STATE:
3860 if (new_state == FLK_DEAD_STATE) {
3861 return (0);
3862 } else {
3863 return (1);
3865 case FLK_DEAD_STATE:
3866 /* May be set more than once */
3867 if (new_state == FLK_DEAD_STATE) {
3868 return (0);
3869 } else {
3870 return (1);
3872 default:
3873 return (1);
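/*
 * Summary of the transitions accepted above (rows are the old state,
 * entries are the new states for which 0 is returned):
 *
 *	INITIAL	    -> START, SLEEPING, ACTIVE, DEAD
 *	START	    -> ACTIVE, DEAD
 *	ACTIVE	    -> DEAD
 *	SLEEPING    -> GRANTED, INTERRUPTED, CANCELLED
 *	GRANTED	    -> START, INTERRUPTED, CANCELLED
 *	CANCELLED   -> INTERRUPTED, DEAD
 *	INTERRUPTED -> DEAD
 *	DEAD	    -> DEAD (may be set more than once)
 */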
3877 static void
3878 check_sleeping_locks(graph_t *gp)
3880 lock_descriptor_t *lock1, *lock2;
3881 edge_t *ep;
3882 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
3883 lock1 = lock1->l_next) {
3884 ASSERT(!IS_BARRIER(lock1));
3885 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
3886 lock2 = lock2->l_next) {
3887 if (lock1->l_vnode == lock2->l_vnode) {
3888 if (BLOCKS(lock2, lock1)) {
3889 ASSERT(!IS_GRANTED(lock1));
3890 ASSERT(!NOT_BLOCKED(lock1));
3891 path(lock1, lock2);
3896 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
3897 lock2 = lock2->l_next) {
3898 ASSERT(!IS_BARRIER(lock1));
3899 if (lock1->l_vnode == lock2->l_vnode) {
3900 if (BLOCKS(lock2, lock1)) {
3901 ASSERT(!IS_GRANTED(lock1));
3902 ASSERT(!NOT_BLOCKED(lock1));
3903 path(lock1, lock2);
3907 ep = FIRST_ADJ(lock1);
3908 while (ep != HEAD(lock1)) {
3909 ASSERT(BLOCKS(ep->to_vertex, lock1));
3910 ep = NEXT_ADJ(ep);
3915 static int
3916 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
3918 edge_t *ep;
3919 lock_descriptor_t *vertex;
3920 lock_descriptor_t *vertex_stack;
3922 STACK_INIT(vertex_stack);
3924 flk_graph_uncolor(lock1->l_graph);
3925 ep = FIRST_ADJ(lock1);
3926 ASSERT(ep != HEAD(lock1));
3927 while (ep != HEAD(lock1)) {
3928 if (no_path)
3929 ASSERT(ep->to_vertex != lock2);
3930 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
3931 COLOR(ep->to_vertex);
3932 ep = NEXT_ADJ(ep);
3935 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
3936 STACK_POP(vertex_stack, l_dstack);
3937 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
3938 ep = NEXT_ADJ(ep)) {
3939 if (COLORED(ep->to_vertex))
3940 continue;
3941 COLOR(ep->to_vertex);
3942 if (ep->to_vertex == lock2)
3943 return (1);
3945 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
3948 return (0);
3951 static void
3952 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
3954 lock_descriptor_t *lock;
3956 /* Ignore OFD style locks since they're not process-wide. */
3957 if (pid == 0)
3958 return;
3960 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3962 if (lock) {
3963 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
3964 if (lock->l_flock.l_pid == pid &&
3965 lock->l_flock.l_sysid == sysid)
3966 cmn_err(CE_PANIC,
3967 "owner pid %d's lock %p in active queue",
3968 pid, (void *)lock);
3969 lock = lock->l_next;
3972 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
3974 if (lock) {
3975 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
3976 if (lock->l_flock.l_pid == pid &&
3977 lock->l_flock.l_sysid == sysid)
3978 cmn_err(CE_PANIC,
3979 "owner pid %d's lock %p in sleep queue",
3980 pid, (void *)lock);
3981 lock = lock->l_next;
3986 static int
3987 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
3989 edge_t *ep = FIRST_ADJ(lock1);
3991 while (ep != HEAD(lock1)) {
3992 if (ep->to_vertex == lock2)
3993 return (1);
3994 else
3995 ep = NEXT_ADJ(ep);
3997 return (0);
4000 static int
4001 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4003 return (!level_two_path(lock1, lock2, 1));
4006 static void
4007 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4009 if (level_one_path(lock1, lock2)) {
4010 if (level_two_path(lock1, lock2, 0) != 0) {
4011 cmn_err(CE_WARN,
4012 "one edge one path from lock1 %p lock2 %p",
4013 (void *)lock1, (void *)lock2);
4015 } else if (no_path(lock1, lock2)) {
4016 cmn_err(CE_PANIC,
4017 "No path from lock1 %p to lock2 %p",
4018 (void *)lock1, (void *)lock2);
4021 #endif /* DEBUG */