/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
 * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */
#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>
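
/*
 * Small per-cpu cache of free lockf_range structures (see lf_alloc_range()
 * and lf_destroy_range() below).  Most ranges are short-lived, so a
 * two-entry cache avoids hitting kmalloc()/kfree() in the common case.
 */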
struct lf_pcpu {
	struct lockf_range *free1;
	struct lockf_range *free2;
} __cachealign;

static struct lf_pcpu	*lf_pcpu_array;

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void	_lf_print_lock(const struct lockf *);
static void	_lf_printf(const char *, ...) __printflike(1, 2);

#define	lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define	lf_printf(ctl, args...)	if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define	lf_print_lock(lock)
#define	lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
				off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
				struct lockf_range *elm,
				struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
				off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
				int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}
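
/*
 * Illustrative example (not from the original source): a held range
 * [10, 20] overlaps a request for [15, 30] via the second test and a
 * request for [5, 12] via the first test, while [21, 30] fails both
 * tests and does not overlap.
 */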

/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;
	struct uidcount *pup;
	int n;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	pup = &uip->ui_pcpu[mycpuid];

	if (increase) {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
	} else {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
	}

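	/*
	 * Fold the accumulated per-cpu delta into the shared ui_posixlocks
	 * counter only once it drifts beyond PUP_LIMIT, presumably to avoid
	 * dirtying the shared counter on every adjustment.
	 */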
	if (pup->pu_posixlocks < -PUP_LIMIT ||
	    pup->pu_posixlocks > PUP_LIMIT) {
		atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
		pup->pu_posixlocks = 0;
	}
}

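/*
 * Account for a change of 'diff' POSIX locks held by 'owner'.  Returns
 * non-zero if adding the locks would exceed the per-uid limit (callers
 * map this to ENOLCK), zero on success.
 */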
static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		struct uidcount *pup;
		int cpu = mycpuid;

		pup = &uip->ui_pcpu[cpu];
		pup->pu_posixlocks += diff;
		if (pup->pu_posixlocks < -PUP_LIMIT ||
		    pup->pu_posixlocks > PUP_LIMIT) {
			atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
			pup->pu_posixlocks = 0;
		}
		owner->p_uidpcpu[cpu].pu_posixlocks += diff;
		ret = 0;
	}
	return ret;
}

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}
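
	/*
	 * Illustrative example (not from the original source): l_start = 100
	 * with l_len = 10 covers the byte range [100, 109]; l_len = 0 locks
	 * from l_start through the end of the file, represented here as
	 * end = LLONG_MAX with F_NOEND set.
	 */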

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		if ((ap->a_vp->v_flag & VMAYHAVELOCKS) == 0)
			vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
#if 0
		/*
		 * XXX REMOVED. don't bother doing this in the critical path.
		 * close() overhead is minimal.
		 */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
#endif
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
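
/*
 * Illustrative sketch (assumptions, not taken from this file): a typical
 * userland request such as
 *
 *	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
 *			    .l_start = 0, .l_len = 0 };
 *	fcntl(fd, F_SETLKW, &fl);
 *
 * is expected to arrive here as a VOP_ADVLOCK() with a_op = F_SETLK and
 * F_POSIX (plus F_WAIT for the blocking variant) in a_flags; with
 * l_len == 0 it covers the whole file via the F_NOEND/LLONG_MAX
 * conversion above.
 */
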
static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner check for a conflicting lock.  Only
		 * if not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked.  For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK.  This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}
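
	/*
	 * Illustrative example (not from the original source): if we hold
	 * an exclusive lock over [0, 99] and unlock [40, 59], the request
	 * cuts the middle out of the existing range.  first_match ==
	 * last_match, its lf_start (0) < start (40) and its lf_end (99) >
	 * end (59), so double_clip is set and the survivor is split into
	 * [0, 39] and [60, 99] below.
	 */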

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches we have an issue.  If there is
	 * only one overlapping range which needs to be clipped on both ends
	 * we wind up having to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, do not
	 * use that pointer any more.  Also, when removing something other
	 * than 'range' we have to check to see if the item we are removing
	 * is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wakeup a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;

				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}
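
	/*
	 * Illustrative example (not from the original source): suppose we
	 * hold a shared lock over [0, 49] and an exclusive lock over
	 * [50, 99], and then downgrade [50, 99] to shared.  After the
	 * clipping pass the two ranges are adjacent, owned by us, and of
	 * the same type, so the merge above collapses them into a single
	 * shared lock over [0, 99].
	 */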

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any posix locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}
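
/*
 * Note (summary, not from the original source): for F_GETLK the returned
 * flock describes the first conflicting range found above; l_len == 0 in
 * the result means the blocking lock runs to the end of the file (F_NOEND),
 * and l_pid is -1 when the blocker is a flock-style (non-POSIX) lock.
 */
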
/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 *
 * Most use cases are temporary, implement a small 2-entry-per-cpu
 * cache.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;
	struct lf_pcpu *lfpc;

	lfpc = &lf_pcpu_array[mycpuid];
	if ((range = lfpc->free1) != NULL) {
		lfpc->free1 = NULL;
		return range;
	}
	if ((range = lfpc->free2) != NULL) {
		lfpc->free2 = NULL;
		return range;
	}
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;

	return(range);
}
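
/*
 * Insert 'elm' into the sorted range list.  'insert_point' is a hint
 * supplied by the caller; the loop only walks forward from it, so the
 * hint must not already lie beyond elm's sorted position.  A NULL hint
 * appends elm at the tail.
 */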
static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}

static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
	struct lf_pcpu *lfpc;

	lf_printf("lf_destroy_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);

	lfpc = &lf_pcpu_array[mycpuid];
	if (lfpc->free1 == NULL) {
		range->lf_owner = NULL;
		lfpc->free1 = range;
		return;
	}
	if (lfpc->free2 == NULL) {
		range->lf_owner = NULL;
		lfpc->free2 = range;
		return;
	}
	kfree(range, M_LOCKF);
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%jd..%jd type %s owned by %d\n",
			(uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
		kprintf("blocked locks:");
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%jd..%jd type %s waiting on %p\n",
			(uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range);
}

#endif /* LOCKF_DEBUG */

static void
lf_init(void *dummy __unused)
{
	lf_pcpu_array = kmalloc(sizeof(*lf_pcpu_array) * ncpus,
				M_LOCKF, M_WAITOK | M_ZERO);
}

SYSINIT(lockf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lf_init, NULL);