kernel - Remove SMP bottlenecks on uidinfo, descriptors, and lockf
[dragonfly.git] / sys/kern/kern_lockf.c
blob 3bb8f4de9fee3b4ca563378a6457af7dbc1634fc
/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
 * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */

#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>     /* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

#ifdef INVARIANTS
int lf_global_counter = 0;
#endif

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void _lf_print_lock(const struct lockf *);
static void _lf_printf(const char *, ...) __printflike(1, 2);

#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void     lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void     lf_create_range(struct lockf_range *, struct proc *, int, int,
                                off_t, off_t);
static void     lf_insert(struct lockf_range_list *list,
                          struct lockf_range *elm,
                          struct lockf_range *insert_point);
static void     lf_destroy_range(struct lockf_range *);

static int      lf_setlock(struct lockf *, struct proc *, int, int,
                           off_t, off_t);
static int      lf_getlock(struct flock *, struct lockf *, struct proc *,
                           int, int, off_t, off_t);

static int      lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
        if (range->lf_type != type)
                return(0);
        if ((range->lf_flags ^ flags) & F_POSIX)
                return(0);
        return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
        if (range->lf_start >= start && range->lf_start <= end)
                return(1);
        else if (start >= range->lf_start && start <= range->lf_end)
                return(1);
        else
                return(0);
}

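/*
 * Note: lf_overlap() treats both endpoints as inclusive byte offsets.
 * For example, an existing range covering 100..199 overlaps a request
 * for 150..250 or 0..100, but not one for 200..300 or 0..99.
 */
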
/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
        struct uidinfo *uip;
        struct uidcount *pup;
        int n;

        KKASSERT(p != NULL);

        uip = p->p_ucred->cr_uidinfo;
        pup = &uip->ui_pcpu[mycpuid];

        if (increase) {
                for (n = 0; n < ncpus; ++n)
                        pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
        } else {
                for (n = 0; n < ncpus; ++n)
                        pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
        }

        if (pup->pu_posixlocks < -PUP_LIMIT ||
            pup->pu_posixlocks > PUP_LIMIT) {
                atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
                pup->pu_posixlocks = 0;
        }
}

static int
lf_count_change(struct proc *owner, int diff)
{
        struct uidinfo *uip;
        int max, ret;

        /* we might actually not have a process context */
        if (owner == NULL)
                return(0);

        uip = owner->p_ucred->cr_uidinfo;

        max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
                  maxposixlocksperuid);

        if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
            uip->ui_posixlocks >= max) {
                ret = 1;
        } else {
                struct uidcount *pup;
                int cpu = mycpuid;

                pup = &uip->ui_pcpu[cpu];
                pup->pu_posixlocks += diff;
                if (pup->pu_posixlocks < -PUP_LIMIT ||
                    pup->pu_posixlocks > PUP_LIMIT) {
                        atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
                        pup->pu_posixlocks = 0;
                }
                owner->p_uidpcpu[cpu].pu_posixlocks += diff;
                ret = 0;
        }
        return ret;
}

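/*
 * Note on the accounting above: per-cpu deltas are folded into the global
 * uip->ui_posixlocks counter only once they exceed PUP_LIMIT in either
 * direction, so the limit test in lf_count_change() runs against a
 * slightly stale global count.  This trades strict accuracy for avoiding
 * an atomic operation on every lock and unlock.
 */
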
/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
        struct flock *fl = ap->a_fl;
        struct proc *owner;
        off_t start, end;
        int type, flags, error;
        lwkt_token_t token;

        /*
         * Convert the flock structure into a start and end.
         */
        switch (fl->l_whence) {
        case SEEK_SET:
        case SEEK_CUR:
                /*
                 * Caller is responsible for adding any necessary offset
                 * when SEEK_CUR is used.
                 */
                start = fl->l_start;
                break;

        case SEEK_END:
                start = size + fl->l_start;
                break;

        default:
                return(EINVAL);
        }

        flags = ap->a_flags;
        if (start < 0)
                return(EINVAL);
        if (fl->l_len == 0) {
                flags |= F_NOEND;
                end = LLONG_MAX;
        } else if (fl->l_len < 0) {
                return(EINVAL);
        } else {
                end = start + fl->l_len - 1;
                if (end < start)
                        return(EINVAL);
        }

        type = fl->l_type;
        /*
         * This isn't really correct for flock-style locks,
         * but the current handling is somewhat broken anyway.
         */
        owner = (struct proc *)ap->a_id;

        /*
         * Do the requested operation.
         */
        token = lwkt_getpooltoken(lock);

        if (lock->init_done == 0) {
                TAILQ_INIT(&lock->lf_range);
                TAILQ_INIT(&lock->lf_blocked);
                lock->init_done = 1;
        }

        switch(ap->a_op) {
        case F_SETLK:
                /*
                 * NOTE: It is possible for both lf_range and lf_blocked to
                 * be empty if we block and get woken up, but another process
                 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
                 * be set after the lf_setlock() operation completes rather
                 * than before.
                 */
                error = lf_setlock(lock, owner, type, flags, start, end);
                vsetflags(ap->a_vp, VMAYHAVELOCKS);
                break;

        case F_UNLCK:
                error = lf_setlock(lock, owner, type, flags, start, end);
                if (TAILQ_EMPTY(&lock->lf_range) &&
                    TAILQ_EMPTY(&lock->lf_blocked)) {
                        vclrflags(ap->a_vp, VMAYHAVELOCKS);
                }
                break;

        case F_GETLK:
                error = lf_getlock(fl, lock, owner, type, flags, start, end);
                break;

        default:
                error = EINVAL;
                break;
        }
        lwkt_reltoken(token);
        return(error);
}

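/*
 * Usage sketch (illustrative only, userland side): a request such as
 *
 *      struct flock fl = {
 *              .l_type = F_WRLCK,
 *              .l_whence = SEEK_SET,
 *              .l_start = 100,
 *              .l_len = 50,
 *      };
 *      fcntl(fd, F_SETLKW, &fl);
 *
 * typically reaches lf_advlock() through VOP_ADVLOCK() and is converted
 * above into the inclusive byte range 100..149.  A request with
 * l_len == 0 is instead represented as end = LLONG_MAX with F_NOEND set.
 */
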
static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
           off_t start, off_t end)
{
        struct lockf_range *range;
        struct lockf_range *brange;
        struct lockf_range *next;
        struct lockf_range *first_match;
        struct lockf_range *last_match;
        struct lockf_range *insert_point;
        struct lockf_range *new_range1;
        struct lockf_range *new_range2;
        int wakeup_needed;
        int double_clip;
        int unlock_override;
        int error = 0;
        int count;
        struct lockf_range_list deadlist;

        new_range1 = NULL;
        new_range2 = NULL;
        count = 0;

restart:
        /*
         * Preallocate two ranges so we don't have to worry about blocking
         * in the middle of the lock code.
         */
        if (new_range1 == NULL)
                new_range1 = lf_alloc_range();
        if (new_range2 == NULL)
                new_range2 = lf_alloc_range();
        first_match = NULL;
        last_match = NULL;
        insert_point = NULL;
        wakeup_needed = 0;

        lf_print_lock(lock);

        /*
         * Locate the insertion point for the new lock (the first range
         * with an lf_start >= start).
         *
         * Locate the first and last ranges owned by us that overlap
         * the requested range.
         */
        TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
                if (insert_point == NULL && range->lf_start >= start)
                        insert_point = range;

                /*
                 * Skip non-overlapping locks.  Locks are sorted by lf_start
                 * so we can terminate the search when lf_start exceeds the
                 * requested range (insert_point is still guaranteed to be
                 * set properly).
                 */
                if (range->lf_end < start)
                        continue;
                if (range->lf_start > end) {
                        range = NULL;
                        break;
                }

                /*
                 * Overlapping lock.  Set first_match and last_match if we
                 * are the owner.
                 */
                if (range->lf_owner == owner) {
                        if (first_match == NULL)
                                first_match = range;
                        last_match = range;
                        continue;
                }

                /*
                 * If we aren't the owner check for a conflicting lock.  Only
                 * if not unlocking.
                 */
                if (type != F_UNLCK) {
                        if (type == F_WRLCK || range->lf_type == F_WRLCK)
                                break;
                }
        }

        /*
         * If a conflicting lock was observed, block or fail as appropriate.
         * (this code is skipped when unlocking)
         */
        if (range != NULL) {
                if ((flags & F_WAIT) == 0) {
                        error = EAGAIN;
                        goto do_cleanup;
                }

                /*
                 * We are blocked. For POSIX locks we have to check
                 * for deadlocks and return with EDEADLK. This is done
                 * by checking whether range->lf_owner is already
                 * blocked.
                 *
                 * Since flock-style locks cover the whole file, a
                 * deadlock between those is nearly impossible.
                 * This can only occur if a process tries to lock the
                 * same inode exclusively while holding a shared lock
                 * with another descriptor.
                 * XXX How can we cleanly detect this?
                 * XXX The current mixing of flock & fcntl/lockf is evil.
                 *
                 * Handle existing locks of flock-style like POSIX locks.
                 */
                if (flags & F_POSIX) {
                        TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
                                if (brange->lf_owner == range->lf_owner) {
                                        error = EDEADLK;
                                        goto do_cleanup;
                                }
                        }
                }

                /*
                 * For flock-style locks, we must first remove
                 * any shared locks that we hold before we sleep
                 * waiting for an exclusive lock.
                 */
                if ((flags & F_POSIX) == 0 && type == F_WRLCK)
                        lf_setlock(lock, owner, F_UNLCK, 0, start, end);

                brange = new_range1;
                new_range1 = NULL;
                lf_create_range(brange, owner, type, 0, start, end);
                TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
                error = tsleep(brange, PCATCH, "lockf", 0);

                /*
                 * We may have been awakened by a signal and/or by a
                 * debugger continuing us (in which case we must remove
                 * ourselves from the blocked list) and/or by another
                 * process releasing/downgrading a lock (in which case
                 * we have already been removed from the blocked list
                 * and our lf_flags field is 1).
                 *
                 * Sleep if it looks like we might be livelocking.
                 */
                if (brange->lf_flags == 0)
                        TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
                if (count == 2)
                        tsleep(brange, 0, "lockfz", 2);
                else
                        ++count;
                lf_destroy_range(brange);

                if (error)
                        goto do_cleanup;
                goto restart;
        }

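        /*
         * Note that the EDEADLK test above only examines this file's own
         * blocked list: we fail with EDEADLK when the owner of the
         * conflicting lock already has a blocked request queued here.
         * Deadlock cycles that span multiple files are not detected.
         */
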
        /*
         * If there are no overlapping locks owned by us then creating
         * the new lock is easy.  This is the most common case.
         */
        if (first_match == NULL) {
                if (type == F_UNLCK)
                        goto do_wakeup;
                if (flags & F_POSIX) {
                        if (lf_count_change(owner, 1)) {
                                error = ENOLCK;
                                goto do_cleanup;
                        }
                }
                range = new_range1;
                new_range1 = NULL;
                lf_create_range(range, owner, type, flags, start, end);
                lf_insert(&lock->lf_range, range, insert_point);
                goto do_wakeup;
        }

        /*
         * double_clip - Calculate a special case where TWO locks may have
         *               to be added due to the new lock breaking up an
         *               existing incompatible lock in the middle.
         *
         * unlock_override - Calculate a special case where NO locks
         *               need to be created.  This occurs when an unlock
         *               does not clip any locks at the front and rear.
         *
         * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
         *           entire range will always succeed so the unlock_override
         *           case is mandatory.
         */
        double_clip = 0;
        unlock_override = 0;
        if (first_match->lf_start < start) {
                if (first_match == last_match && last_match->lf_end > end)
                        double_clip = 1;
        } else if (type == F_UNLCK && last_match->lf_end <= end) {
                unlock_override = 1;
        }

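        /*
         * Example of the double_clip case: if we already hold a single
         * lock covering bytes 0..199 and the new request covers 50..99,
         * then first_match == last_match, it starts before 50 and ends
         * after 99, so the existing range must be split into 0..49 and
         * 100..199 around the new request.  This is the only case that
         * consumes the second preallocated range (new_range2).
         */
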
        /*
         * Figure out the worst case net increase in POSIX locks and account
         * for it now before we start modifying things.  If neither the
         * first or last locks match we have an issue.  If there is only
         * one overlapping range which needs to be clipped on both ends
         * we wind up having to create up to two new locks, else only one.
         *
         * When unlocking the worst case is always 1 new lock if our
         * unlock request cuts the middle out of an existing lock range.
         *
         * count represents the 'cleanup' adjustment needed.  It starts
         * negative, is incremented whenever we create a new POSIX lock,
         * and decremented whenever we delete an existing one.  At the
         * end of the day it had better be <= 0 or we didn't calculate the
         * worst case properly here.
         */
        count = 0;
        if ((flags & F_POSIX) && !unlock_override) {
                if (!lf_match(first_match, type, flags) &&
                    !lf_match(last_match, type, flags)
                ) {
                        if (double_clip && type != F_UNLCK)
                                count = -2;
                        else
                                count = -1;
                }
                if (count && lf_count_change(owner, -count)) {
                        error = ENOLCK;
                        goto do_cleanup;
                }
        }
        /* else flock style lock which encompasses entire range */

        /*
         * Create and insert the lock represented by the requested range.
         * Adjust the net POSIX lock count.  We have to move our insertion
         * point since brange now represents the first record >= start.
         *
         * When unlocking, no new lock is inserted but we still clip.
         */
        if (type != F_UNLCK) {
                brange = new_range1;
                new_range1 = NULL;
                lf_create_range(brange, owner, type, flags, start, end);
                lf_insert(&lock->lf_range, brange, insert_point);
                insert_point = brange;
                if (flags & F_POSIX)
                        ++count;
        } else {
                brange = NULL;
        }

        /*
         * Handle the double_clip case.  This is the only case where
         * we wind up having to add TWO locks.
         */
        if (double_clip) {
                KKASSERT(first_match == last_match);
                last_match = new_range2;
                new_range2 = NULL;
                lf_create_range(last_match, first_match->lf_owner,
                                first_match->lf_type, first_match->lf_flags,
                                end + 1, first_match->lf_end);
                first_match->lf_end = start - 1;
                first_match->lf_flags &= ~F_NOEND;

                /*
                 * Figure out where to insert the right side clip.
                 */
                lf_insert(&lock->lf_range, last_match, first_match);
                if (last_match->lf_flags & F_POSIX)
                        ++count;
        }

        /*
         * Clip or destroy the locks between first_match and last_match,
         * inclusive.  Ignore the primary lock we created (brange).  Note
         * that if double-clipped, first_match and last_match will be
         * outside our clipping range.  Otherwise first_match and last_match
         * will be deleted.
         *
         * We have already taken care of any double clipping.
         *
         * The insert_point may become invalid as we delete records, do not
         * use that pointer any more.  Also, when removing something other
         * than 'range' we have to check to see if the item we are removing
         * is 'next' and adjust 'next' properly.
         *
         * NOTE: brange will be NULL if F_UNLCKing.
         */
        TAILQ_INIT(&deadlist);
        next = first_match;

        while ((range = next) != NULL) {
                next = TAILQ_NEXT(range, lf_link);

                /*
                 * Ignore elements that we do not own and ignore the
                 * primary request range which we just created.
                 */
                if (range->lf_owner != owner || range == brange)
                        continue;

                /*
                 * We may have to wakeup a waiter when downgrading a lock.
                 */
                if (type == F_UNLCK)
                        wakeup_needed = 1;
                if (type == F_RDLCK && range->lf_type == F_WRLCK)
                        wakeup_needed = 1;

                /*
                 * Clip left.  This can only occur on first_match.
                 *
                 * Merge the left clip with brange if possible.  This must
                 * be done specifically, not in the optimized merge heuristic
                 * below, since we may have counted on it in our 'count'
                 * calculation above.
                 */
                if (range->lf_start < start) {
                        KKASSERT(range == first_match);
                        if (brange &&
                            range->lf_end >= start - 1 &&
                            lf_match(range, type, flags)) {
                                range->lf_end = brange->lf_end;
                                range->lf_flags |= brange->lf_flags & F_NOEND;
                                /*
                                 * Removing something other than 'range',
                                 * adjust 'next' if necessary.
                                 */
                                if (next == brange)
                                        next = TAILQ_NEXT(next, lf_link);
                                TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
                                if (brange->lf_flags & F_POSIX)
                                        --count;
                                TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
                                brange = range;
                        } else if (range->lf_end >= start) {
                                range->lf_end = start - 1;
                                if (type != F_UNLCK)
                                        range->lf_flags &= ~F_NOEND;
                        }
                        if (range == last_match)
                                break;
                        continue;
                }

                /*
                 * Clip right.  This can only occur on last_match.
                 *
                 * Merge the right clip if possible.  This must be done
                 * specifically, not in the optimized merge heuristic
                 * below, since we may have counted on it in our 'count'
                 * calculation.
                 *
                 * Since we are adjusting lf_start, we have to move the
                 * record to maintain the sorted list.  Since lf_start is
                 * only getting larger we can use the next element as the
                 * insert point (we don't have to backtrack).
                 */
                if (range->lf_end > end) {
                        KKASSERT(range == last_match);
                        if (brange &&
                            range->lf_start <= end + 1 &&
                            lf_match(range, type, flags)) {
                                brange->lf_end = range->lf_end;
                                brange->lf_flags |= range->lf_flags & F_NOEND;
                                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                                if (range->lf_flags & F_POSIX)
                                        --count;
                                TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                        } else if (range->lf_start <= end) {
                                range->lf_start = end + 1;
                                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                                lf_insert(&lock->lf_range, range, next);
                        }
                        /* range == last_match, we are done */
                        break;
                }

                /*
                 * The record must be entirely enclosed.  Note that the
                 * record could be first_match or last_match, and will be
                 * deleted.
                 */
                KKASSERT(range->lf_start >= start && range->lf_end <= end);
                TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                if (range->lf_flags & F_POSIX)
                        --count;
                TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                if (range == last_match)
                        break;
        }

        /*
         * Attempt to merge locks adjacent to brange.  For example, we may
         * have had to clip first_match and/or last_match, and they might
         * be adjacent.  Or there might simply have been an adjacent lock
         * already there.
         *
         * Don't get fancy, just check adjacent elements in the list if they
         * happen to be owned by us.
         *
         * This case only gets hit if we have a situation where a shared
         * and exclusive lock are adjacent, and the exclusive lock is
         * downgraded to shared or the shared lock is upgraded to exclusive.
         */
        if (brange) {
                range = TAILQ_PREV(brange, lockf_range_list, lf_link);
                if (range &&
                    range->lf_owner == owner &&
                    range->lf_end == brange->lf_start - 1 &&
                    lf_match(range, type, flags)
                ) {
                        /*
                         * Extend range to cover brange and scrap brange.
                         */
                        range->lf_end = brange->lf_end;
                        range->lf_flags |= brange->lf_flags & F_NOEND;
                        TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
                        if (brange->lf_flags & F_POSIX)
                                --count;
                        TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
                        brange = range;
                }
                range = TAILQ_NEXT(brange, lf_link);
                if (range &&
                    range->lf_owner == owner &&
                    range->lf_start == brange->lf_end + 1 &&
                    lf_match(range, type, flags)
                ) {
                        /*
                         * Extend brange to cover range and scrap range.
                         */
                        brange->lf_end = range->lf_end;
                        brange->lf_flags |= range->lf_flags & F_NOEND;
                        TAILQ_REMOVE(&lock->lf_range, range, lf_link);
                        if (range->lf_flags & F_POSIX)
                                --count;
                        TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
                }
        }

        /*
         * Destroy deleted elements.  We didn't want to do it in the loop
         * because the free() might have blocked.
         *
         * Adjust the count for any posix locks we thought we might create
         * but didn't.
         */
        while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
                TAILQ_REMOVE(&deadlist, range, lf_link);
                lf_destroy_range(range);
        }

        KKASSERT(count <= 0);
        if (count < 0)
                lf_count_change(owner, count);
do_wakeup:
        lf_print_lock(lock);
        if (wakeup_needed)
                lf_wakeup(lock, start, end);
        error = 0;
do_cleanup:
        if (new_range1 != NULL)
                lf_destroy_range(new_range1);
        if (new_range2 != NULL)
                lf_destroy_range(new_range2);
        return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
           int type, int flags, off_t start, off_t end)
{
        struct lockf_range *range;

        TAILQ_FOREACH(range, &lock->lf_range, lf_link)
                if (range->lf_owner != owner &&
                    lf_overlap(range, start, end) &&
                    (type == F_WRLCK || range->lf_type == F_WRLCK))
                        break;
        if (range == NULL) {
                fl->l_type = F_UNLCK;
                return(0);
        }
        fl->l_type = range->lf_type;
        fl->l_whence = SEEK_SET;
        fl->l_start = range->lf_start;
        if (range->lf_flags & F_NOEND)
                fl->l_len = 0;
        else
                fl->l_len = range->lf_end - range->lf_start + 1;
        if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
                fl->l_pid = range->lf_owner->p_pid;
        else
                fl->l_pid = -1;
        return(0);
}

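/*
 * Note: lf_getlock() reports l_pid == -1 for flock-style locks above
 * because those locks are tied to an open file rather than to a single
 * process.
 */
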
/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
        struct lockf_range *range, *nrange;

        TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
                if (lf_overlap(range, start, end) == 0)
                        continue;
                TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
                range->lf_flags = 1;
                wakeup(range);
        }
}

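/*
 * Note: waiters removed here are marked with lf_flags = 1, which tells
 * lf_setlock(), after its tsleep(), that the entry has already been
 * taken off the blocked list.
 */
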
/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 */
static struct lockf_range *
lf_alloc_range(void)
{
        struct lockf_range *range;

#ifdef INVARIANTS
        atomic_add_int(&lf_global_counter, 1);
#endif
        range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
        range->lf_owner = NULL;
        return(range);
}

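/*
 * Insert elm into the sorted range list.  insert_point is a hint; we scan
 * forward from it to the first element whose lf_start is not less than
 * elm's and insert before it, keeping the list sorted by lf_start.
 */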
static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
          struct lockf_range *insert_point)
{
        while (insert_point && insert_point->lf_start < elm->lf_start)
                insert_point = TAILQ_NEXT(insert_point, lf_link);
        if (insert_point != NULL)
                TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
        else
                TAILQ_INSERT_TAIL(list, elm, lf_link);
}

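/*
 * Initialize a preallocated range structure with the given owner, type,
 * flags, and inclusive byte range [start, end].
 */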
static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
                int flags, off_t start, off_t end)
{
        KKASSERT(start <= end);
        range->lf_type = type;
        range->lf_flags = flags;
        range->lf_start = start;
        range->lf_end = end;
        range->lf_owner = owner;

        lf_printf("lf_create_range: %ju..%ju\n",
                  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
        lf_printf("lf_destroy_range: %ju..%ju\n",
                  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
        kfree(range, M_LOCKF);
#ifdef INVARIANTS
        atomic_add_int(&lf_global_counter, -1);
        KKASSERT(lf_global_counter >= 0);
#endif
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
        struct proc *p;
        __va_list va;

        if (lf_print_ranges) {
                if ((p = curproc) != NULL)
                        kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
        }
        __va_start(va, ctl);
        kvprintf(ctl, va);
        __va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
        struct lockf_range *range;

        if (lf_print_ranges == 0)
                return;

        if (TAILQ_EMPTY(&lock->lf_range)) {
                lf_printf("lockf %p: no ranges locked\n", lock);
        } else {
                lf_printf("lockf %p:\n", lock);
        }
        TAILQ_FOREACH(range, &lock->lf_range, lf_link)
                kprintf("\t%jd..%jd type %s owned by %d\n",
                        (uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
                        range->lf_type == F_RDLCK ? "shared" : "exclusive",
                        range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
        if (TAILQ_EMPTY(&lock->lf_blocked))
                kprintf("no process waiting for range\n");
        else
                kprintf("blocked locks:");
        TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
                kprintf("\t%jd..%jd type %s waiting on %p\n",
                        (uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
                        range->lf_type == F_RDLCK ? "shared" : "exclusive",
                        range);
}
#endif /* LOCKF_DEBUG */