/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
 * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */
#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>     /* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>
#ifdef INVARIANTS
int lf_global_counter = 0;
#endif

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void _lf_print_lock(const struct lockf *);
static void _lf_printf(const char *, ...) __printflike(1, 2);

#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif
static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void     lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void     lf_create_range(struct lockf_range *, struct proc *, int, int,
                                off_t, off_t);
static void     lf_insert(struct lockf_range_list *list,
                          struct lockf_range *elm,
                          struct lockf_range *insert_point);
static void     lf_destroy_range(struct lockf_range *);

static int      lf_setlock(struct lockf *, struct proc *, int, int,
                           off_t, off_t);
static int      lf_getlock(struct flock *, struct lockf *, struct proc *,
                           int, int, off_t, off_t);

static int      lf_count_change(struct proc *, int);
/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
        if (range->lf_type != type)
                return(0);
        if ((range->lf_flags ^ flags) & F_POSIX)
                return(0);
        return(1);
}
/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
        if (range->lf_start >= start && range->lf_start <= end)
                return(1);
        else if (start >= range->lf_start && start <= range->lf_end)
                return(1);
        else
                return(0);
}
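
/*
 * Illustration (added for exposition, not in the original source):
 * both the lock range and [start, end] are inclusive.  Assuming a
 * lock covering [10, 20]:
 *
 *      lf_overlap(range, 15, 25) -> 1   (range starts inside [15,25])
 *      lf_overlap(range,  5, 12) -> 1   (range start falls in [5,12])
 *      lf_overlap(range, 12, 18) -> 1   ([12,18] starts inside range)
 *      lf_overlap(range, 21, 30) -> 0   (disjoint, off by one)
 */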
/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
        struct uidinfo *uip;

        KKASSERT(p != NULL);

        uip = p->p_ucred->cr_uidinfo;
        if (increase)
                atomic_add_int(&uip->ui_posixlocks, p->p_numposixlocks);
        else
                atomic_add_int(&uip->ui_posixlocks, -p->p_numposixlocks);

        KASSERT(uip->ui_posixlocks >= 0,
                ("Negative number of POSIX locks held by %s user: %d.",
                 increase ? "new" : "old", uip->ui_posixlocks));
}
static int
lf_count_change(struct proc *owner, int diff)
{
        struct uidinfo *uip;
        int max, ret;

        /* we might actually not have a process context */
        if (owner == NULL)
                return(0);

        uip = owner->p_ucred->cr_uidinfo;

        max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
                  maxposixlocksperuid);

        if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
            uip->ui_posixlocks >= max) {
                ret = 1;
        } else {
                atomic_add_int(&uip->ui_posixlocks, diff);
                atomic_add_int(&owner->p_numposixlocks, diff);
                KASSERT(uip->ui_posixlocks >= 0,
                        ("Negative number of POSIX locks held by user: %d.",
                         uip->ui_posixlocks));
                KASSERT(owner->p_numposixlocks >= 0,
                        ("Negative number of POSIX locks held by proc: %d.",
                         owner->p_numposixlocks));
                ret = 0;
        }
        return ret;
}
/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
        struct flock *fl = ap->a_fl;
        struct proc *owner;
        off_t start, end;
        int type, flags, error;
        lwkt_token_t token;

        /*
         * Convert the flock structure into a start and end.
         */
        switch (fl->l_whence) {
        case SEEK_SET:
        case SEEK_CUR:
                /*
                 * Caller is responsible for adding any necessary offset
                 * when SEEK_CUR is used.
                 */
                start = fl->l_start;
                break;

        case SEEK_END:
                start = size + fl->l_start;
                break;

        default:
                return(EINVAL);
        }

        flags = ap->a_flags;
        if (start < 0)
                return(EINVAL);
        if (fl->l_len == 0) {
                flags |= F_NOEND;
                end = LLONG_MAX;
        } else if (fl->l_len < 0) {
                return(EINVAL);
        } else {
                end = start + fl->l_len - 1;
                if (end < start)
                        return(EINVAL);
        }
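
        /*
         * Worked example (added for exposition): a request with
         * l_whence == SEEK_SET, l_start == 100 and l_len == 50 locks
         * the inclusive byte range [100, 149].  l_len == 0 means
         * "to end of file, forever"; it is represented internally as
         * [start, LLONG_MAX] with F_NOEND set so the lock still covers
         * the file if it later grows.
         */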
        type = fl->l_type;

        /*
         * This isn't really correct for flock-style locks,
         * but the current handling is somewhat broken anyway.
         */
        owner = (struct proc *)ap->a_id;

        /*
         * Do the requested operation.
         */
        token = lwkt_getpooltoken(lock);

        if (lock->init_done == 0) {
                TAILQ_INIT(&lock->lf_range);
                TAILQ_INIT(&lock->lf_blocked);
                lock->init_done = 1;
        }

        switch(ap->a_op) {
        case F_SETLK:
                /*
                 * NOTE: It is possible for both lf_range and lf_blocked to
                 * be empty if we block and get woken up, but another process
                 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
                 * be set after the lf_setlock() operation completes rather
                 * than before.
                 */
                error = lf_setlock(lock, owner, type, flags, start, end);
                vsetflags(ap->a_vp, VMAYHAVELOCKS);
                break;

        case F_UNLCK:
                error = lf_setlock(lock, owner, type, flags, start, end);
                if (TAILQ_EMPTY(&lock->lf_range) &&
                    TAILQ_EMPTY(&lock->lf_blocked)) {
                        vclrflags(ap->a_vp, VMAYHAVELOCKS);
                }
                break;

        case F_GETLK:
                error = lf_getlock(fl, lock, owner, type, flags, start, end);
                break;

        default:
                error = EINVAL;
                break;
        }
        lwkt_reltoken(token);
        return(error);
}
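
/*
 * Illustrative sketch (added; not part of the original source): in the
 * normal path lf_advlock() is reached through the VOP_ADVLOCK() vnode
 * operation when a process uses fcntl(2) byte-range locking.  The
 * hypothetical userland fragment below shows the kind of request that
 * ends up here; 'fd' is assumed to be an open descriptor.
 */
#if 0
        struct flock fl;

        fl.l_whence = SEEK_SET;         /* interpret l_start absolutely */
        fl.l_start = 100;               /* first byte to lock */
        fl.l_len = 50;                  /* bytes [100, 149] inclusive */
        fl.l_type = F_WRLCK;            /* exclusive (write) lock */
        fl.l_pid = 0;
        if (fcntl(fd, F_SETLKW, &fl) == -1)     /* blocks: F_WAIT in-kernel */
                err(1, "fcntl(F_SETLKW)");
#endif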
static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
           off_t start, off_t end)
{
        struct lockf_range *range;
        struct lockf_range *brange;
        struct lockf_range *next;
        struct lockf_range *first_match;
        struct lockf_range *last_match;
        struct lockf_range *insert_point;
        struct lockf_range *new_range1;
        struct lockf_range *new_range2;
        int wakeup_needed;
        int double_clip;
        int unlock_override;
        int error = 0;
        int count;
        struct lockf_range_list deadlist;

        new_range1 = NULL;
        new_range2 = NULL;
        count = 0;

restart:
        /*
         * Preallocate two ranges so we don't have to worry about blocking
         * in the middle of the lock code.
         */
        if (new_range1 == NULL)
                new_range1 = lf_alloc_range();
        if (new_range2 == NULL)
                new_range2 = lf_alloc_range();
        first_match = NULL;
        last_match = NULL;
        insert_point = NULL;
        wakeup_needed = 0;

        lf_print_lock(lock);
        /*
         * Locate the insertion point for the new lock (the first range
         * with an lf_start >= start).
         *
         * Locate the first and last ranges owned by us that overlap
         * the requested range.
         */
        TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
                if (insert_point == NULL && range->lf_start >= start)
                        insert_point = range;

                /*
                 * Skip non-overlapping locks.  Locks are sorted by lf_start,
                 * so we can terminate the search when lf_start exceeds the
                 * requested range (insert_point is still guaranteed to be
                 * set properly).
                 */
                if (range->lf_end < start)
                        continue;
                if (range->lf_start > end) {
                        range = NULL;
                        break;
                }

                /*
                 * Overlapping lock.  Set first_match and last_match if we
                 * are the owner.
                 */
                if (range->lf_owner == owner) {
                        if (first_match == NULL)
                                first_match = range;
                        last_match = range;
                        continue;
                }

                /*
                 * If we aren't the owner, check for a conflicting lock.
                 * Only if not unlocking.
                 */
                if (type != F_UNLCK) {
                        if (type == F_WRLCK || range->lf_type == F_WRLCK)
                                break;
                }
        }
        /*
         * If a conflicting lock was observed, block or fail as appropriate.
         * (this code is skipped when unlocking)
         */
        if (range != NULL) {
                if ((flags & F_WAIT) == 0) {
                        error = EAGAIN;
                        goto do_cleanup;
                }

                /*
                 * We are blocked.  For POSIX locks we have to check
                 * for deadlocks and return with EDEADLK.  This is done
                 * by checking whether range->lf_owner is already
                 * blocked.
                 *
                 * Since flock-style locks cover the whole file, a
                 * deadlock between those is nearly impossible.
                 * This can only occur if a process tries to lock the
                 * same inode exclusively while holding a shared lock
                 * with another descriptor.
                 * XXX How can we cleanly detect this?
                 * XXX The current mixing of flock & fcntl/lockf is evil.
                 *
                 * Handle existing locks of flock-style like POSIX locks.
                 */
                if (flags & F_POSIX) {
                        TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
                                if (brange->lf_owner == range->lf_owner) {
                                        error = EDEADLK;
                                        goto do_cleanup;
                                }
                        }
                }

                /*
                 * For flock-style locks, we must first remove
                 * any shared locks that we hold before we sleep
                 * waiting for an exclusive lock.
                 */
                if ((flags & F_POSIX) == 0 && type == F_WRLCK)
                        lf_setlock(lock, owner, F_UNLCK, 0, start, end);

                brange = new_range1;
                new_range1 = NULL;
                lf_create_range(brange, owner, type, 0, start, end);
                TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
                error = tsleep(brange, PCATCH, "lockf", 0);

                /*
                 * We may have been awakened by a signal and/or by a
                 * debugger continuing us (in which case we must remove
                 * ourselves from the blocked list) and/or by another
                 * process releasing/downgrading a lock (in which case
                 * we have already been removed from the blocked list
                 * and our lf_flags field is 1).
                 *
                 * Sleep if it looks like we might be livelocking.
                 */
                if (brange->lf_flags == 0)
                        TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
                if (count == 2)
                        tsleep(brange, 0, "lockfz", 2);
                else
                        ++count;
                lf_destroy_range(brange);

                if (error)
                        goto do_cleanup;
                goto restart;
        }
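
        /*
         * Illustrative deadlock scenario for the EDEADLK check above
         * (added for exposition): process A holds [0,9] and blocks
         * waiting for [10,19]; process B holds [10,19] and now requests
         * [0,9].  B's conflicting range is owned by A, and A already
         * appears on lf_blocked, so B fails immediately with EDEADLK
         * instead of both processes sleeping forever.
         */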
        /*
         * If there are no overlapping locks owned by us then creating
         * the new lock is easy.  This is the most common case.
         */
        if (first_match == NULL) {
                if (type == F_UNLCK)
                        goto do_wakeup;
                if (flags & F_POSIX) {
                        if (lf_count_change(owner, 1)) {
                                error = ENOLCK;
                                goto do_cleanup;
                        }
                }
                range = new_range1;
                new_range1 = NULL;
                lf_create_range(range, owner, type, flags, start, end);
                lf_insert(&lock->lf_range, range, insert_point);
                goto do_wakeup;
        }
        /*
         * double_clip - Calculate a special case where TWO locks may have
         *               to be added due to the new lock breaking up an
         *               existing incompatible lock in the middle.
         *
         * unlock_override - Calculate a special case where NO locks
         *               need to be created.  This occurs when an unlock
         *               does not clip any locks at the front and rear.
         *
         * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
         *           entire range will always succeed so the unlock_override
         *           case is mandatory.
         */
        double_clip = 0;
        unlock_override = 0;
        if (first_match->lf_start < start) {
                if (first_match == last_match && last_match->lf_end > end)
                        double_clip = 1;
        } else if (type == F_UNLCK && last_match->lf_end <= end) {
                unlock_override = 1;
        }
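
        /*
         * Illustration (added for exposition): double_clip covers the
         * case where the request punches a hole in the middle of a
         * single existing range we own:
         *
         *      existing:   [0 ................. 99]
         *      request:            [40 .. 59]
         *      result:     [0..39] [40 .. 59] [60..99]
         *
         * The existing record is split in two, so up to two new ranges
         * may have to be created in addition to the requested one.
         */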
        /*
         * Figure out the worst case net increase in POSIX locks and account
         * for it now before we start modifying things.  If neither the
         * first nor last locks match we have an issue.  If there is only
         * one overlapping range which needs to be clipped on both ends
         * we wind up having to create up to two new locks, else only one.
         *
         * When unlocking the worst case is always 1 new lock if our
         * unlock request cuts the middle out of an existing lock range.
         *
         * count represents the 'cleanup' adjustment needed.  It starts
         * negative, is incremented whenever we create a new POSIX lock,
         * and decremented whenever we delete an existing one.  At the
         * end of the day it had better be <= 0 or we didn't calculate the
         * worst case properly here.
         */
        count = 0;
        if ((flags & F_POSIX) && !unlock_override) {
                if (!lf_match(first_match, type, flags) &&
                    !lf_match(last_match, type, flags)
                ) {
                        if (double_clip && type != F_UNLCK)
                                count = -2;
                        else
                                count = -1;
                }
                if (count && lf_count_change(owner, -count)) {
                        error = ENOLCK;
                        goto do_cleanup;
                }
        }
        /* else flock style lock which encompasses entire range */
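
        /*
         * Expository example (added): punching an exclusive lock into
         * the middle of one of our own shared ranges hits the
         * double-clip case, so count starts at -2.  Inserting the new
         * lock and the split-off right fragment each add one back;
         * any reservation still unused at the end (count < 0) is
         * returned via lf_count_change() below.
         */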
        /*
         * Create and insert the lock representing the requested range.
         * Adjust the net POSIX lock count.  We have to move our insertion
         * point since brange now represents the first record >= start.
         *
         * When unlocking, no new lock is inserted but we still clip.
         */
        if (type != F_UNLCK) {
                brange = new_range1;
                new_range1 = NULL;
                lf_create_range(brange, owner, type, flags, start, end);
                lf_insert(&lock->lf_range, brange, insert_point);
                insert_point = brange;
                if (flags & F_POSIX)
                        ++count;
        } else {
                brange = NULL;
        }
        /*
         * Handle the double_clip case.  This is the only case where
         * we wind up having to add TWO locks.
         */
        if (double_clip) {
                KKASSERT(first_match == last_match);
                last_match = new_range2;
                new_range2 = NULL;
                lf_create_range(last_match, first_match->lf_owner,
                                first_match->lf_type, first_match->lf_flags,
                                end + 1, first_match->lf_end);
                first_match->lf_end = start - 1;
                first_match->lf_flags &= ~F_NOEND;

                /*
                 * Figure out where to insert the right side clip.
                 */
                lf_insert(&lock->lf_range, last_match, first_match);
                if (last_match->lf_flags & F_POSIX)
                        ++count;
        }
        /*
         * Clip or destroy the locks between first_match and last_match,
         * inclusive.  Ignore the primary lock we created (brange).  Note
         * that if double-clipped, first_match and last_match will be
         * outside our clipping range.  Otherwise first_match and last_match
         * will be deleted.
         *
         * We have already taken care of any double clipping.
         *
         * The insert_point may become invalid as we delete records, do not
         * use that pointer any more.  Also, when removing something other
         * than 'range' we have to check to see if the item we are removing
         * is 'next' and adjust 'next' properly.
         *
         * NOTE: brange will be NULL if F_UNLCKing.
         */
        TAILQ_INIT(&deadlist);
        next = first_match;

        while ((range = next) != NULL) {
                next = TAILQ_NEXT(range, lf_link);

                /*
                 * Ignore elements that we do not own and ignore the
                 * primary request range which we just created.
                 */
                if (range->lf_owner != owner || range == brange)
                        continue;

                /*
                 * We may have to wakeup a waiter when downgrading a lock.
                 */
                if (type == F_UNLCK)
                        wakeup_needed = 1;
                if (type == F_RDLCK && range->lf_type == F_WRLCK)
                        wakeup_needed = 1;

                /*
                 * Clip left.  This can only occur on first_match.
                 *
                 * Merge the left clip with brange if possible.  This must
                 * be done specifically, not in the optimized merge heuristic
                 * below, since we may have counted on it in our 'count'
                 * calculation above.
                 */
                if (range->lf_start < start) {
                        KKASSERT(range == first_match);
                        if (brange &&
                            range->lf_end >= start - 1 &&
                            lf_match(range, type, flags)) {
                                range->lf_end = brange->lf_end;
                                range->lf_flags |= brange->lf_flags & F_NOEND;

                                /*
                                 * Removing something other than 'range',
                                 * adjust 'next' if necessary.
                                 */
                                if (next == brange)
                                        next = TAILQ_NEXT(next, lf_link);
                                TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
                                if (brange->lf_flags & F_POSIX)
                                        --count;
                                TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
                                brange = range;
                        } else if (range->lf_end >= start) {
                                range->lf_end = start - 1;
                                if (type != F_UNLCK)
                                        range->lf_flags &= ~F_NOEND;
                        }
                        if (range == last_match)
                                break;
                        continue;
                }

                /*
                 * Clip right.  This can only occur on last_match.
                 *
                 * Merge the right clip if possible.  This must be done
                 * specifically, not in the optimized merge heuristic
                 * below, since we may have counted on it in our 'count'
                 * calculation.
                 *
                 * Since we are adjusting lf_start, we have to move the
                 * record to maintain the sorted list.  Since lf_start is
                 * only getting larger we can use the next element as the
                 * insert point (we don't have to backtrack).
                 */
                if (range->lf_end > end) {
                        KKASSERT(range == last_match);
                        if (brange &&
                            range->lf_start <= end + 1 &&
                            lf_match(range, type, flags)) {
                                brange->lf_end = range->lf_end;
                                brange->lf_flags |= range->lf_flags & F_NOEND;
                                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                                if (range->lf_flags & F_POSIX)
                                        --count;
                                TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                        } else if (range->lf_start <= end) {
                                range->lf_start = end + 1;
                                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                                lf_insert(&lock->lf_range, range, next);
                        }
                        /* range == last_match, we are done */
                        break;
                }

                /*
                 * The record must be entirely enclosed.  Note that the
                 * record could be first_match or last_match, and will be
                 * deleted.
                 */
                KKASSERT(range->lf_start >= start && range->lf_end <= end);
                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                if (range->lf_flags & F_POSIX)
                        --count;
                TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                if (range == last_match)
                        break;
        }
        /*
         * Attempt to merge locks adjacent to brange.  For example, we may
         * have had to clip first_match and/or last_match, and they might
         * be adjacent.  Or there might simply have been an adjacent lock
         * already there.
         *
         * Don't get fancy, just check adjacent elements in the list if they
         * happen to be owned by us.
         *
         * This case only gets hit if we have a situation where a shared
         * and exclusive lock are adjacent, and the exclusive lock is
         * downgraded to shared or the shared lock is upgraded to exclusive.
         */
        if (brange) {
                range = TAILQ_PREV(brange, lockf_range_list, lf_link);
                if (range &&
                    range->lf_owner == owner &&
                    range->lf_end == brange->lf_start - 1 &&
                    lf_match(range, type, flags)
                ) {
                        /*
                         * Extend range to cover brange and scrap brange.
                         */
                        range->lf_end = brange->lf_end;
                        range->lf_flags |= brange->lf_flags & F_NOEND;
                        TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
                        if (brange->lf_flags & F_POSIX)
                                --count;
                        TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
                        brange = range;
                }
                range = TAILQ_NEXT(brange, lf_link);
                if (range &&
                    range->lf_owner == owner &&
                    range->lf_start == brange->lf_end + 1 &&
                    lf_match(range, type, flags)
                ) {
                        /*
                         * Extend brange to cover range and scrap range.
                         */
                        brange->lf_end = range->lf_end;
                        brange->lf_flags |= range->lf_flags & F_NOEND;
                        TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                        if (range->lf_flags & F_POSIX)
                                --count;
                        TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                }
        }
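
        /*
         * Expository example (added): if we hold a shared lock on
         * [0,49] and an exclusive lock on [50,99], downgrading [50,99]
         * to shared makes the two records adjacent and compatible, and
         * the pass above coalesces them into a single shared [0,99]
         * record.
         */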
        /*
         * Destroy deleted elements.  We didn't want to do it in the loop
         * because the free() might have blocked.
         *
         * Adjust the count for any posix locks we thought we might create
         * but didn't.
         */
        while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
                TAILQ_REMOVE(&deadlist, range, lf_link);
                lf_destroy_range(range);
        }

        KKASSERT(count <= 0);
        if (count < 0)
                lf_count_change(owner, count);
do_wakeup:
        lf_print_lock(lock);
        if (wakeup_needed)
                lf_wakeup(lock, start, end);
        error = 0;
do_cleanup:
        if (new_range1 != NULL)
                lf_destroy_range(new_range1);
        if (new_range2 != NULL)
                lf_destroy_range(new_range2);
        return(error);
}
/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
           int type, int flags, off_t start, off_t end)
{
        struct lockf_range *range;

        TAILQ_FOREACH(range, &lock->lf_range, lf_link)
                if (range->lf_owner != owner &&
                    lf_overlap(range, start, end) &&
                    (type == F_WRLCK || range->lf_type == F_WRLCK))
                        break;
        if (range == NULL) {
                fl->l_type = F_UNLCK;
                return(0);
        }
        fl->l_type = range->lf_type;
        fl->l_whence = SEEK_SET;
        fl->l_start = range->lf_start;
        if (range->lf_flags & F_NOEND)
                fl->l_len = 0;
        else
                fl->l_len = range->lf_end - range->lf_start + 1;
        if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
                fl->l_pid = range->lf_owner->p_pid;
        else
                fl->l_pid = -1;
        return(0);
}
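
/*
 * Expository note (added): only locks that would actually conflict are
 * reported.  Querying with F_RDLCK skips other processes' read locks
 * (two read locks coexist), while querying with F_WRLCK reports any
 * overlapping lock not owned by the caller.
 */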
/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
        struct lockf_range *range, *nrange;

        TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
                if (lf_overlap(range, start, end) == 0)
                        continue;
                TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
                range->lf_flags = 1;
                wakeup(range);
        }
}
/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 */
static struct lockf_range *
lf_alloc_range(void)
{
        struct lockf_range *range;

#ifdef INVARIANTS
        atomic_add_int(&lf_global_counter, 1);
#endif
        range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
        range->lf_owner = NULL;
        return(range);
}
static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
          struct lockf_range *insert_point)
{
        while (insert_point && insert_point->lf_start < elm->lf_start)
                insert_point = TAILQ_NEXT(insert_point, lf_link);
        if (insert_point != NULL)
                TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
        else
                TAILQ_INSERT_TAIL(list, elm, lf_link);
}
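
/*
 * Expository note (added): lf_insert() keeps the range list sorted by
 * lf_start.  insert_point is only a hint -- it may reference an element
 * that sorts before elm, in which case the loop above walks forward to
 * the correct position.  A NULL hint simply appends, which is correct
 * both for an empty list and when elm sorts last.
 */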
static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
                int flags, off_t start, off_t end)
{
        KKASSERT(start <= end);
        range->lf_type = type;
        range->lf_flags = flags;
        range->lf_start = start;
        range->lf_end = end;
        range->lf_owner = owner;

        lf_printf("lf_create_range: %ju..%ju\n",
                  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}
static void
lf_destroy_range(struct lockf_range *range)
{
        lf_printf("lf_destroy_range: %ju..%ju\n",
                  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
        kfree(range, M_LOCKF);
#ifdef INVARIANTS
        atomic_add_int(&lf_global_counter, -1);
        KKASSERT(lf_global_counter >= 0);
#endif
}
#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
        struct proc *p;
        __va_list va;

        if (lf_print_ranges) {
                if ((p = curproc) != NULL)
                        kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
        }
        __va_start(va, ctl);
        kvprintf(ctl, va);
        __va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
        struct lockf_range *range;

        if (lf_print_ranges == 0)
                return;

        if (TAILQ_EMPTY(&lock->lf_range)) {
                lf_printf("lockf %p: no ranges locked\n", lock);
        } else {
                lf_printf("lockf %p:\n", lock);
        }
        TAILQ_FOREACH(range, &lock->lf_range, lf_link)
                kprintf("\t%jd..%jd type %s owned by %d\n",
                        (intmax_t)range->lf_start, (intmax_t)range->lf_end,
                        range->lf_type == F_RDLCK ? "shared" : "exclusive",
                        range->lf_flags & F_POSIX ?
                            range->lf_owner->p_pid : -1);
        if (TAILQ_EMPTY(&lock->lf_blocked))
                kprintf("no process waiting for range\n");
        else
                kprintf("blocked locks:\n");
        TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
                kprintf("\t%jd..%jd type %s waiting on %p\n",
                        (intmax_t)range->lf_start, (intmax_t)range->lf_end,
                        range->lf_type == F_RDLCK ? "shared" : "exclusive",
                        range);
}
#endif /* LOCKF_DEBUG */