/*
2 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved.
4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Scooter Morris at Genentech Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
39 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
40 * $DragonFly: src/sys/kern/kern_lockf.c,v 1.22 2005/03/02 11:48:15 joerg Exp $
*/
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
48 #include <sys/unistd.h>
49 #include <sys/vnode.h>
50 #include <sys/malloc.h>
51 #include <sys/fcntl.h>
52 #include <sys/resourcevar.h>
54 #include <sys/lockf.h>
55 #include <machine/limits.h> /* for LLONG_MAX */
56 #include <machine/stdarg.h>
/*
 * Debug / invariant state.  The duplicate empty definitions of
 * lf_print_lock()/lf_printf() only make sense under conditional
 * compilation: the checking variants are used when LOCKF_DEBUG is
 * defined, the no-op variants otherwise.
 */
#ifdef INVARIANTS
int lf_global_counter = 0;	/* outstanding lockf_range allocations */
#endif

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;	/* set non-zero to enable range tracing */

static void	_lf_print_lock(const struct lockf *);
static void	_lf_printf(const char *, ...);

#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif
75 static MALLOC_DEFINE(M_LOCKF
, "lockf", "Byte-range locking structures");
77 static void lf_wakeup(struct lockf
*, off_t
, off_t
);
78 static int lf_overlap(const struct lockf_range
*, off_t
, off_t
);
79 static int lf_overlap_left(const struct lockf_range
*, off_t
, off_t
);
80 static int lf_overlap_right(const struct lockf_range
*, off_t
, off_t
);
81 static int lf_overlap_left2(const struct lockf_range
*, off_t
, off_t
);
82 static int lf_overlap_right2(const struct lockf_range
*, off_t
, off_t
);
83 static int lf_overlap_embedded(const struct lockf_range
*, off_t
, off_t
);
84 static struct lockf_range
*lf_alloc_range(void);
85 static void lf_create_range(struct lockf_range
*, struct proc
*, int, int,
87 static void lf_destroy_range(struct lockf_range
*, int);
89 static int lf_setlock(struct lockf
*, struct proc
*, int, int,
91 static int lf_clearlock(struct lockf
*, struct proc
*, int, int,
93 static int lf_getlock(struct flock
*, struct lockf
*, struct proc
*,
94 int, int, off_t
, off_t
);
96 static int lf_count_change(struct proc
*, int);
99 * Change the POSIX lock accounting for the given process.
102 lf_count_adjust(struct proc
*p
, int increase
)
108 uip
= p
->p_ucred
->cr_uidinfo
;
111 uip
->ui_posixlocks
+= p
->p_numposixlocks
;
113 uip
->ui_posixlocks
-= p
->p_numposixlocks
;
115 KASSERT(uip
->ui_posixlocks
>= 0,
116 ("Negative number of POSIX locks held by %s user: %d.",
117 increase
? "new" : "old", uip
->ui_posixlocks
));
121 lf_count_change(struct proc
*owner
, int diff
)
126 /* we might actually not have a process context */
130 uip
= owner
->p_ucred
->cr_uidinfo
;
132 max
= MIN(owner
->p_rlimit
[RLIMIT_POSIXLOCKS
].rlim_cur
,
133 maxposixlocksperuid
);
134 if (diff
> 0 && owner
->p_ucred
->cr_uid
!= 0 && max
!= -1 &&
135 uip
->ui_posixlocks
>= max
)
138 uip
->ui_posixlocks
+= diff
;
140 KASSERT(uip
->ui_posixlocks
>= 0,
141 ("Negative number of POSIX locks held by user: %d.",
142 uip
->ui_posixlocks
));
148 * Advisory record locking support
151 lf_advlock(struct vop_advlock_args
*ap
, struct lockf
*lock
, u_quad_t size
)
153 struct flock
*fl
= ap
->a_fl
;
156 int type
, flags
, error
;
160 * Convert the flock structure into a start and end.
162 switch (fl
->l_whence
) {
166 * Caller is responsible for adding any necessary offset
167 * when SEEK_CUR is used.
173 start
= size
+ fl
->l_start
;
181 if (fl
->l_len
== 0) {
185 end
= start
+ fl
->l_len
- 1;
193 * This isn't really correct for flock-style locks,
194 * but the current handling is somewhat broken anyway.
196 owner
= (struct proc
*)ap
->a_id
;
199 * Do the requested operation.
201 lwkt_gettoken(&ilock
, lwkt_token_pool_get(lock
));
203 if (lock
->init_done
== 0) {
204 TAILQ_INIT(&lock
->lf_range
);
205 TAILQ_INIT(&lock
->lf_blocked
);
211 error
= lf_setlock(lock
, owner
, type
, flags
, start
, end
);
215 error
= lf_clearlock(lock
, owner
, type
, flags
, start
, end
);
219 error
= lf_getlock(fl
, lock
, owner
, type
, flags
, start
, end
);
226 lwkt_reltoken(&ilock
);
/*
 * lf_setlock() - acquire a POSIX (fcntl/lockf) or flock-style byte-range
 * lock on [start, end] for `owner', blocking via tsleep() when F_WAIT is
 * set and a conflicting lock is held by another owner.
 *
 * NOTE(review): this block was damaged in transit.  Judging by the gaps
 * in the embedded original line numbers, structural lines (braces,
 * else/break/continue/return statements and several assignments) are
 * missing.  Do not hand-repair without diffing against upstream
 * DragonFly sys/kern/kern_lockf.c rev 1.22.
 */
231 lf_setlock(struct lockf
*lock
, struct proc
*owner
, int type
, int flags
,
232 off_t start
, off_t end
)
234 struct lockf_range
*range
, *first_match
, *insert_point
;
235 int wakeup_needed
, lock_needed
;
236 /* pre-allocation to avoid blocking in the middle of the algorithm */
237 struct lockf_range
*new_range1
= NULL
, *new_range2
= NULL
;
240 /* for restoration in case of hitting the POSIX lock limit below */
241 struct lockf_range
*orig_first_match
= NULL
;
/* Pre-allocate two spare ranges so range splits below cannot block. */
246 if (new_range1
== NULL
)
247 new_range1
= lf_alloc_range();
248 if (new_range2
== NULL
)
249 new_range2
= lf_alloc_range();
/*
 * Scan the sorted range list: remember the sorted insertion point and
 * the first overlapping range we own; stop on a foreign conflict.
 */
256 TAILQ_FOREACH(range
, &lock
->lf_range
, lf_link
) {
257 if (insert_point
== NULL
&& range
->lf_start
>= start
)
258 insert_point
= range
;
259 if (lf_overlap(range
, start
, end
) == 0)
261 if (range
->lf_owner
== owner
) {
262 if (first_match
== NULL
)
266 if (type
== F_WRLCK
|| range
->lf_type
== F_WRLCK
)
/* A foreign conflicting lock exists: fail fast or sleep on it. */
271 struct lockf_range
*brange
;
273 if ((flags
& F_WAIT
) == 0) {
279 * We are blocked. For POSIX locks we have to check
280 * for deadlocks and return with EDEADLK. This is done
281 * by checking whether range->lf_owner is already
284 * Since flock-style locks cover the whole file, a
285 * deadlock between those is nearly impossible.
286 * This can only occur if a process tries to lock the
287 * same inode exclusively while holding a shared lock
288 * with another descriptor.
289 * XXX How can we cleanly detect this?
290 * XXX The current mixing of flock & fcntl/lockf is evil.
292 * Handle existing locks of flock-style like POSIX locks.
294 if (flags
& F_POSIX
) {
295 TAILQ_FOREACH(brange
, &lock
->lf_blocked
, lf_link
)
296 if (brange
->lf_owner
== range
->lf_owner
) {
303 * For flock-style locks, we must first remove
304 * any shared locks that we hold before we sleep
305 * waiting for an exclusive lock.
307 if ((flags
& F_FLOCK
) && type
== F_WRLCK
)
308 lf_clearlock(lock
, owner
, type
, flags
, start
, end
);
/* Queue a marker range on lf_blocked and sleep until woken. */
312 lf_create_range(brange
, owner
, type
, 0, start
, end
, 0);
313 TAILQ_INSERT_TAIL(&lock
->lf_blocked
, brange
, lf_link
);
314 error
= tsleep(brange
, PCATCH
, "lockf", 0);
317 * We may have been awaked by a signal and/or by a
318 * debugger continuing us (in which case we must remove
319 * ourselves from the blocked list) and/or by another
320 * process releasing/downgrading a lock (in which case
321 * we have already been removed from the blocked list
322 * and our lf_flags field is 1).
324 if (brange
->lf_flags
== 0)
325 TAILQ_REMOVE(&lock
->lf_blocked
, brange
, lf_link
);
326 tsleep(brange
, 0, "lockfz", 2); /* XXX debug livelock */
327 lf_destroy_range(brange
, 0);
/* No overlapping range of our own: insert a fresh one, sorted. */
334 if (first_match
== NULL
) {
335 if (flags
& F_POSIX
) {
336 if (lf_count_change(owner
, 1)) {
343 lf_create_range(range
, owner
, type
, flags
, start
, end
, 1);
344 if (insert_point
!= NULL
)
345 TAILQ_INSERT_BEFORE(insert_point
, range
, lf_link
);
347 TAILQ_INSERT_TAIL(&lock
->lf_range
, range
, lf_link
);
/*
 * Our own range overlaps (or abuts) [start, end] on the left.  If it
 * also extends past `end' the request is interior and the existing
 * range must be split in two around it.
 */
353 if (lf_overlap_left(first_match
, start
, end
)) {
354 KKASSERT((flags
& F_POSIX
) != 0);
355 if (first_match
->lf_end
> end
) {
356 if (first_match
->lf_type
== type
)
358 if (lf_count_change(owner
, 2)) {
364 lf_create_range(range
, owner
, type
, flags
,
366 if (insert_point
!= NULL
)
367 TAILQ_INSERT_BEFORE(insert_point
, range
,
370 TAILQ_INSERT_TAIL(&lock
->lf_range
, range
,
372 insert_point
= range
;
375 lf_create_range(range
, owner
, first_match
->lf_type
,
376 first_match
->lf_flags
, end
+ 1,
377 first_match
->lf_end
, 1);
378 TAILQ_INSERT_AFTER(&lock
->lf_range
, insert_point
,
380 first_match
->lf_flags
&= ~F_NOEND
;
381 first_match
->lf_end
= start
- 1;
387 * left match, but not right match
389 * handle the lf_type != type case directly,
390 * merge the other case with the !lock_needed path.
392 if (first_match
->lf_type
!= type
) {
394 * This is needed if the lockf acquisition below fails.
396 orig_first_match
= first_match
;
397 orig_end
= first_match
->lf_end
;
398 orig_flags
= first_match
->lf_flags
;
399 first_match
->lf_end
= start
- 1;
400 first_match
->lf_flags
&= ~F_NOEND
;
403 /* Try to find the next matching range */
404 range
= TAILQ_NEXT(first_match
, lf_link
);
405 while (range
!= NULL
) {
406 if (range
->lf_owner
== owner
&&
407 lf_overlap(range
, start
, end
))
409 range
= TAILQ_NEXT(range
, lf_link
);
414 /* fall through to !left_match behaviour */
416 first_match
->lf_end
= end
;
417 first_match
->lf_flags
|= flags
& F_NOEND
;
/* Request fully covers our range: retype it in place. */
422 if (lf_overlap_embedded(first_match
, start
, end
)) {
423 if (first_match
!= insert_point
) {
424 TAILQ_REMOVE(&lock
->lf_range
, first_match
, lf_link
);
425 TAILQ_INSERT_BEFORE(insert_point
, first_match
, lf_link
);
427 first_match
->lf_start
= start
;
428 first_match
->lf_end
= end
;
429 first_match
->lf_flags
|= flags
& F_NOEND
;
430 first_match
->lf_type
= type
;
/*
 * The lock itself is settled; coalesce/trim any of our later ranges
 * that the extended first_match now touches.
 */
434 if (lock_needed
== 0) {
435 struct lockf_range
*nrange
;
437 range
= TAILQ_NEXT(first_match
, lf_link
);
438 while (range
!= NULL
) {
439 if (range
->lf_owner
!= owner
) {
440 range
= TAILQ_NEXT(range
, lf_link
);
443 if (lf_overlap_embedded(range
, start
, end
)) {
444 nrange
= TAILQ_NEXT(range
, lf_link
);
445 TAILQ_REMOVE(&lock
->lf_range
, range
,
447 lf_count_change(owner
, -1);
448 lf_destroy_range(range
, 1);
452 if (lf_overlap_right(range
, start
, end
) == 0) {
453 range
= TAILQ_NEXT(range
, lf_link
);
456 if (range
->lf_type
!= type
) {
457 range
->lf_start
= end
+ 1;
458 nrange
= TAILQ_NEXT(range
, lf_link
);
459 TAILQ_REMOVE(&lock
->lf_range
, range
, lf_link
);
460 while (nrange
!= NULL
) {
461 if (nrange
->lf_start
>= end
+ 1)
463 nrange
= TAILQ_NEXT(nrange
, lf_link
);
466 TAILQ_INSERT_BEFORE(nrange
, range
,
469 TAILQ_INSERT_TAIL(&lock
->lf_range
,
473 first_match
->lf_end
= range
->lf_end
;
474 first_match
->lf_flags
|=
475 range
->lf_flags
& F_NOEND
;
476 TAILQ_REMOVE(&lock
->lf_range
, range
, lf_link
);
477 lf_count_change(owner
, -1);
478 lf_destroy_range(range
, 1);
/*
 * Our own range overlaps [start, end] on the right only: either grow
 * it leftward (same type) or split off a new range of the new type.
 */
484 if (lf_overlap_right(first_match
, start
, end
)) {
485 KKASSERT((flags
& F_POSIX
) != 0);
486 if (first_match
->lf_type
== type
) {
487 first_match
->lf_start
= start
;
488 if (first_match
!= insert_point
) {
489 TAILQ_REMOVE(&lock
->lf_range
, first_match
,
491 TAILQ_INSERT_BEFORE(insert_point
, first_match
,
496 if (lf_count_change(owner
, 1)) {
497 if (orig_first_match
!= NULL
) {
498 orig_first_match
->lf_end
= orig_end
;
/*
 * NOTE(review): the next statement restores lf_flags from orig_end,
 * but the flags were saved into orig_flags (original line 398) --
 * this almost certainly should read `= orig_flags'.  Confirm against
 * upstream before changing.
 */
499 orig_first_match
->lf_flags
= orig_end
;
504 first_match
->lf_start
= end
+ 1;
505 KKASSERT(new_range1
!= NULL
);
508 lf_create_range(range
, owner
, type
, flags
, start
, end
, 1);
509 TAILQ_INSERT_BEFORE(insert_point
, range
, lf_link
);
510 range
= TAILQ_NEXT(first_match
, lf_link
);
511 TAILQ_REMOVE(&lock
->lf_range
, first_match
, lf_link
);
512 while (range
!= NULL
) {
513 if (range
->lf_start
>= first_match
->lf_start
)
515 range
= TAILQ_NEXT(range
, lf_link
);
518 TAILQ_INSERT_BEFORE(range
, first_match
, lf_link
);
520 TAILQ_INSERT_TAIL(&lock
->lf_range
, first_match
, lf_link
);
/* Wake sleepers that may now be able to acquire their range. */
527 lf_wakeup(lock
, start
, end
);
/* Release whichever pre-allocated spare ranges went unused. */
530 if (new_range1
!= NULL
)
531 lf_destroy_range(new_range1
, 0);
532 if (new_range2
!= NULL
)
533 lf_destroy_range(new_range2
, 0);
538 lf_clearlock(struct lockf
*lock
, struct proc
*owner
, int type
, int flags
,
539 off_t start
, off_t end
)
541 struct lockf_range
*range
, *trange
;
542 struct lockf_range
*new_range
;
545 new_range
= lf_alloc_range();
547 TAILQ_FOREACH_MUTABLE(range
, &lock
->lf_range
, lf_link
, trange
) {
548 if (range
->lf_end
< start
)
550 if (range
->lf_start
> end
)
552 if (range
->lf_owner
!= owner
)
554 if (lf_overlap_embedded(range
, start
, end
)) {
555 TAILQ_REMOVE(&lock
->lf_range
, range
, lf_link
);
556 /* flock-locks are equal */
557 if (range
->lf_flags
& F_POSIX
)
558 lf_count_change(owner
, -1);
559 lf_destroy_range(range
, 1);
562 if (lf_overlap_left2(range
, start
, end
)) {
563 KKASSERT(range
->lf_flags
& F_POSIX
);
564 if (lf_overlap_right2(range
, start
, end
)) {
565 struct lockf_range
*nrange
;
567 if (lf_count_change(owner
, 1)) {
573 lf_create_range(nrange
, range
->lf_owner
,
574 range
->lf_type
, range
->lf_flags
,
575 end
+ 1, range
->lf_end
, 1);
576 range
->lf_end
= start
;
577 range
->lf_flags
&= ~F_NOEND
;
578 for (; range
!= NULL
;
579 range
= TAILQ_NEXT(range
, lf_link
))
580 if (range
->lf_start
>= nrange
->lf_start
)
583 TAILQ_INSERT_BEFORE(range
, nrange
,
586 TAILQ_INSERT_TAIL(&lock
->lf_range
,
590 range
->lf_end
= start
- 1;
591 range
->lf_flags
&= ~F_NOEND
;
594 if (lf_overlap_right2(range
, start
, end
)) {
595 struct lockf_range
*nrange
= range
;
597 KKASSERT(range
->lf_flags
& F_POSIX
);
599 range
= TAILQ_NEXT(range
, lf_link
);
600 TAILQ_REMOVE(&lock
->lf_range
, nrange
, lf_link
);
601 for (; range
!= NULL
;
602 range
= TAILQ_NEXT(range
, lf_link
))
603 if (range
->lf_start
>= nrange
->lf_start
)
606 TAILQ_INSERT_BEFORE(range
, nrange
, lf_link
);
608 TAILQ_INSERT_TAIL(&lock
->lf_range
, nrange
,
614 lf_wakeup(lock
, start
, end
);
618 if (new_range
!= NULL
)
619 lf_destroy_range(new_range
, 0);
625 * Check whether there is a blocking lock,
626 * and if so return its process identifier.
629 lf_getlock(struct flock
*fl
, struct lockf
*lock
, struct proc
*owner
,
630 int type
, int flags
, off_t start
, off_t end
)
632 struct lockf_range
*range
;
634 TAILQ_FOREACH(range
, &lock
->lf_range
, lf_link
)
635 if (range
->lf_owner
!= owner
&&
636 lf_overlap(range
, start
, end
) &&
637 (type
== F_WRLCK
|| range
->lf_type
== F_WRLCK
))
640 fl
->l_type
= F_UNLCK
;
643 fl
->l_type
= range
->lf_type
;
644 fl
->l_whence
= SEEK_SET
;
645 fl
->l_start
= range
->lf_start
;
646 if (range
->lf_flags
& F_NOEND
)
649 fl
->l_len
= range
->lf_end
- range
->lf_start
+ 1;
650 if (range
->lf_owner
!= NULL
&& (range
->lf_flags
& F_POSIX
))
651 fl
->l_pid
= range
->lf_owner
->p_pid
;
658 * Check wether range and [start, end] overlap.
661 lf_overlap(const struct lockf_range
*range
, off_t start
, off_t end
)
663 if (range
->lf_start
>= start
&& range
->lf_start
<= end
)
665 else if (start
>= range
->lf_start
&& start
<= range
->lf_end
)
672 * Wakeup pending lock attempts.
675 lf_wakeup(struct lockf
*lock
, off_t start
, off_t end
)
677 struct lockf_range
*range
, *nrange
;
678 TAILQ_FOREACH_MUTABLE(range
, &lock
->lf_blocked
, lf_link
, nrange
) {
679 if (lf_overlap(range
, start
, end
) == 0)
681 TAILQ_REMOVE(&lock
->lf_blocked
, range
, lf_link
);
684 if (lf_overlap_embedded(range
, start
, end
))
690 lf_overlap_left(const struct lockf_range
*range
, off_t start
, off_t end
)
692 if (range
->lf_start
< start
&& range
->lf_end
>= start
- 1 &&
693 range
->lf_end
<= end
)
701 lf_overlap_right(const struct lockf_range
*range
, off_t start
, off_t end
)
703 if (range
->lf_end
> end
&& range
->lf_start
>= start
&&
704 range
->lf_start
- 1 <= end
)
711 lf_overlap_left2(const struct lockf_range
*range
, off_t start
, off_t end
)
713 if (range
->lf_start
< start
&& range
->lf_end
>= start
&&
714 range
->lf_end
<= end
)
722 lf_overlap_right2(const struct lockf_range
*range
, off_t start
, off_t end
)
724 if (range
->lf_end
> end
&& range
->lf_start
>= start
&&
725 range
->lf_start
<= end
)
732 lf_overlap_embedded(const struct lockf_range
*range
, off_t start
, off_t end
)
734 if (range
->lf_start
>= start
&& range
->lf_end
<= end
)
741 * Allocate a range structure and initialize it sufficiently such that
742 * lf_destroy_range() does not barf.
744 static struct lockf_range
*
747 struct lockf_range
*range
;
752 range
= malloc(sizeof(struct lockf_range
), M_LOCKF
, M_WAITOK
);
753 range
->lf_owner
= NULL
;
758 lf_create_range(struct lockf_range
*range
, struct proc
*owner
, int type
,
759 int flags
, off_t start
, off_t end
, int accounting
)
761 KKASSERT(start
<= end
);
762 if (owner
!= NULL
&& (flags
& F_POSIX
) && accounting
)
763 ++owner
->p_numposixlocks
;
764 range
->lf_type
= type
;
765 range
->lf_flags
= flags
;
766 range
->lf_start
= start
;
768 range
->lf_owner
= owner
;
770 lf_printf("lf_create_range: %lld..%lld\n",
771 range
->lf_start
, range
->lf_end
);
775 lf_destroy_range(struct lockf_range
*range
, int accounting
)
777 struct proc
*owner
= range
->lf_owner
;
778 int flags
= range
->lf_flags
;
780 lf_printf("lf_destroy_range: %lld..%lld\n",
781 range
->lf_start
, range
->lf_end
);
783 free(range
, M_LOCKF
);
784 if (owner
!= NULL
&& (flags
& F_POSIX
) && accounting
) {
785 --owner
->p_numposixlocks
;
786 KASSERT(owner
->p_numposixlocks
>= 0,
787 ("Negative number of POSIX locks held by process: %d",
788 owner
->p_numposixlocks
));
793 KKASSERT(lf_global_counter
>=0);
800 _lf_printf(const char *ctl
, ...)
805 if (lf_print_ranges
) {
806 if ((p
= curproc
) != NULL
)
807 printf("pid %d (%s): ", p
->p_pid
, p
->p_comm
);
815 _lf_print_lock(const struct lockf
*lock
)
817 struct lockf_range
*range
;
819 if (lf_print_ranges
== 0)
822 if (TAILQ_EMPTY(&lock
->lf_range
)) {
823 lf_printf("lockf %p: no ranges locked\n", lock
);
825 lf_printf("lockf %p:\n", lock
);
827 TAILQ_FOREACH(range
, &lock
->lf_range
, lf_link
)
828 printf("\t%lld..%lld type %s owned by %d\n",
829 range
->lf_start
, range
->lf_end
,
830 range
->lf_type
== F_RDLCK
? "shared" : "exclusive",
831 range
->lf_flags
& F_POSIX
? range
->lf_owner
->p_pid
: -1);
832 if (TAILQ_EMPTY(&lock
->lf_blocked
))
833 printf("no process waiting for range\n");
835 printf("blocked locks:");
836 TAILQ_FOREACH(range
, &lock
->lf_blocked
, lf_link
)
837 printf("\t%lld..%lld type %s waiting on %p\n",
838 range
->lf_start
, range
->lf_end
,
839 range
->lf_type
== F_RDLCK
? "shared" : "exclusive",
842 #endif /* LOCKF_DEBUG */