scalos/libraries/sqlite/src/os_unix.c

   1 /*
   2 ** 2004 May 22
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** This file contains the VFS implementation for unix-like operating systems
  14 ** including Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
  15 **
  16 ** There are actually several different VFS implementations in this file.
  17 ** The differences are in the way that file locking is done.  The default
  18 ** implementation uses Posix Advisory Locks.  Alternative implementations
  19 ** use flock(), dot-files, various proprietary locking schemas, or simply
  20 ** skip locking all together.
  21 **
  22 ** This source file is organized into divisions where the logic for various
  23 ** subfunctions is contained within the appropriate division.  PLEASE
  24 ** KEEP THE STRUCTURE OF THIS FILE INTACT.  New code should be placed
  25 ** in the correct division and should be clearly labeled.
  26 **
  27 ** The layout of divisions is as follows:
  28 **
  29 **   *  General-purpose declarations and utility functions.
  30 **   *  Unique file ID logic used by VxWorks.
  31 **   *  Various locking primitive implementations (all except proxy locking):
  32 **      + for Posix Advisory Locks
  33 **      + for no-op locks
  34 **      + for dot-file locks
  35 **      + for flock() locking
  36 **      + for named semaphore locks (VxWorks only)
  37 **      + for AFP filesystem locks (MacOSX only)
  38 **   *  sqlite3_file methods not associated with locking.
  39 **   *  Definitions of sqlite3_io_methods objects for all locking
  40 **      methods plus "finder" functions for each locking method.
  41 **   *  sqlite3_vfs method implementations.
  42 **   *  Locking primitives for the proxy uber-locking-method. (MacOSX only)
  43 **   *  Definitions of sqlite3_vfs objects for all locking methods
  44 **      plus implementations of sqlite3_os_init() and sqlite3_os_end().
  45 */
  46 #include "sqliteInt.h"
  47 #if SQLITE_OS_UNIX              /* This file is used on unix only */
  48
  49 /*
  50 ** There are various methods for file locking used for concurrency
  51 ** control:
  52 **
  53 **   1. POSIX locking (the default),
  54 **   2. No locking,
  55 **   3. Dot-file locking,
  56 **   4. flock() locking,
  57 **   5. AFP locking (OSX only),
  58 **   6. Named POSIX semaphores (VXWorks only),
  59 **   7. proxy locking. (OSX only)
  60 **
  61 ** Styles 4, 5, and 7 are only available if SQLITE_ENABLE_LOCKING_STYLE
  62 ** is defined to 1.  The SQLITE_ENABLE_LOCKING_STYLE also enables automatic
  63 ** selection of the appropriate locking style based on the filesystem
  64 ** where the database is located.
  65 */
  66 #if !defined(SQLITE_ENABLE_LOCKING_STYLE)
  67 #  if defined(__APPLE__)
  68 #    define SQLITE_ENABLE_LOCKING_STYLE 1
  69 #  else
  70 #    define SQLITE_ENABLE_LOCKING_STYLE 0
  71 #  endif
  72 #endif
  73
  74 /*
  75 ** Define the OS_VXWORKS pre-processor macro to 1 if building on
  76 ** vxworks, or 0 otherwise.
  77 */
  78 #ifndef OS_VXWORKS
  79 #  if defined(__RTP__) || defined(_WRS_KERNEL)
  80 #    define OS_VXWORKS 1
  81 #  else
  82 #    define OS_VXWORKS 0
  83 #  endif
  84 #endif
  85
  86 /*
  87 ** These #defines should enable >2GB file support on Posix if the
  88 ** underlying operating system supports it.  If the OS lacks
  89 ** large file support, these should be no-ops.
  90 **
  91 ** Large file support can be disabled using the -DSQLITE_DISABLE_LFS switch
  92 ** on the compiler command line.  This is necessary if you are compiling
  93 ** on a recent machine (ex: RedHat 7.2) but you want your code to work
  94 ** on an older machine (ex: RedHat 6.0).  If you compile on RedHat 7.2
  95 ** without this option, LFS is enabled.  But LFS does not exist in the kernel
  96 ** in RedHat 6.0, so the code won't work.  Hence, for maximum binary
  97 ** portability you should omit LFS.
  98 **
  99 ** The previous paragraph was written in 2005.  (This paragraph is written
 100 ** on 2008-11-28.) These days, all Linux kernels support large files, so
 101 ** you should probably leave LFS enabled.  But some embedded platforms might
 102 ** lack LFS in which case the SQLITE_DISABLE_LFS macro might still be useful.
 103 */
 104 #ifndef SQLITE_DISABLE_LFS
 105 # define _LARGE_FILE       1
 106 # ifndef _FILE_OFFSET_BITS
 107 #   define _FILE_OFFSET_BITS 64
 108 # endif
 109 # define _LARGEFILE_SOURCE 1
 110 #endif
 111
 112 /*
 113 ** standard include files.
 114 */
 115 #include <sys/types.h>
 116 #include <sys/stat.h>
 117 #include <fcntl.h>
 118 #include <unistd.h>
 119 #include <time.h>
 120 #include <sys/time.h>
 121 #include <errno.h>
 122 #ifndef SQLITE_OMIT_WAL
 123 #include <sys/mman.h>
 124 #endif
 125
 126
 127 #if SQLITE_ENABLE_LOCKING_STYLE
 128 # include <sys/ioctl.h>
 129 # if OS_VXWORKS
 130 #  include <semaphore.h>
 131 #  include <limits.h>
 132 # else
 133 #  include <sys/file.h>
 134 #  include <sys/param.h>
 135 # endif
 136 #endif /* SQLITE_ENABLE_LOCKING_STYLE */
 137
 138 #if defined(__APPLE__) || (SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORKS)
 139 # include <sys/mount.h>
 140 #endif
 141
 142 #ifdef HAVE_UTIME
 143 # include <utime.h>
 144 #endif
 145
 146 /*
 147 ** Allowed values of unixFile.fsFlags
 148 */
 149 #define SQLITE_FSFLAGS_IS_MSDOS     0x1
 150
 151 /*
 152 ** If we are to be thread-safe, include the pthreads header and define
 153 ** the SQLITE_UNIX_THREADS macro.
 154 */
 155 #if SQLITE_THREADSAFE
 156 # include <pthread.h>
 157 # define SQLITE_UNIX_THREADS 1
 158 #endif
 159
 160 /*
 161 ** Default permissions when creating a new file
 162 */
 163 #ifndef SQLITE_DEFAULT_FILE_PERMISSIONS
 164 # define SQLITE_DEFAULT_FILE_PERMISSIONS 0644
 165 #endif
 166
 167 /*
 168  ** Default permissions when creating auto proxy dir
 169  */
 170 #ifndef SQLITE_DEFAULT_PROXYDIR_PERMISSIONS
 171 # define SQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
 172 #endif
 173
 174 /*
 175 ** Maximum supported path-length.
 176 */
 177 #define MAX_PATHNAME 512
 178
 179 /*
 180 ** Only set the lastErrno if the error code is a real error and not
 181 ** a normal expected return code of SQLITE_BUSY or SQLITE_OK
 182 */
 183 #define IS_LOCK_ERROR(x)  ((x != SQLITE_OK) && (x != SQLITE_BUSY))
 184
 185 /* Forward references */
 186 typedef struct unixShm unixShm;               /* Connection shared memory */
 187 typedef struct unixShmNode unixShmNode;       /* Shared memory instance */
 188 typedef struct unixInodeInfo unixInodeInfo;   /* An i-node */
 189 typedef struct UnixUnusedFd UnixUnusedFd;     /* An unused file descriptor */
 190
 191 /*
 192 ** Sometimes, after a file handle is closed by SQLite, the file descriptor
 193 ** cannot be closed immediately. In these cases, instances of the following
 194 ** structure are used to store the file descriptor while waiting for an
 195 ** opportunity to either close or reuse it.
 196 */
 197 struct UnixUnusedFd {
 198   int fd;                   /* File descriptor to close */
 199   int flags;                /* Flags this file descriptor was opened with */
 200   UnixUnusedFd *pNext;      /* Next unused file descriptor on same file */
 201 };
 202
 203 /*
 204 ** The unixFile structure is subclass of sqlite3_file specific to the unix
 205 ** VFS implementations.
 206 */
 207 typedef struct unixFile unixFile;
 208 struct unixFile {
 209   sqlite3_io_methods const *pMethod;  /* Always the first entry */
 210   sqlite3_vfs *pVfs;                  /* The VFS that created this unixFile */
 211   unixInodeInfo *pInode;              /* Info about locks on this inode */
 212   int h;                              /* The file descriptor */
 213   unsigned char eFileLock;            /* The type of lock held on this fd */
 214   unsigned char ctrlFlags;            /* Behavioral bits.  UNIXFILE_* flags */
 215   int lastErrno;                      /* The unix errno from last I/O error */
 216   void *lockingContext;               /* Locking style specific state */
 217   UnixUnusedFd *pUnused;              /* Pre-allocated UnixUnusedFd */
 218   const char *zPath;                  /* Name of the file */
 219   unixShm *pShm;                      /* Shared memory segment information */
 220   int szChunk;                        /* Configured by FCNTL_CHUNK_SIZE */
 221 #if SQLITE_ENABLE_LOCKING_STYLE
 222   int openFlags;                      /* The flags specified at open() */
 223 #endif
 224 #if SQLITE_ENABLE_LOCKING_STYLE || defined(__APPLE__)
 225   unsigned fsFlags;                   /* cached details from statfs() */
 226 #endif
 227 #if OS_VXWORKS
 228   struct vxworksFileId *pId;          /* Unique file ID */
 229 #endif
 230 #ifndef NDEBUG
 231   /* The next group of variables are used to track whether or not the
 232   ** transaction counter in bytes 24-27 of database files are updated
 233   ** whenever any part of the database changes.  An assertion fault will
 234   ** occur if a file is updated without also updating the transaction
 235   ** counter.  This test is made to avoid new problems similar to the
 236   ** one described by ticket #3584.
 237   */
 238   unsigned char transCntrChng;   /* True if the transaction counter changed */
 239   unsigned char dbUpdate;        /* True if any part of database file changed */
 240   unsigned char inNormalWrite;   /* True if in a normal write operation */
 241 #endif
 242 #ifdef SQLITE_TEST
 243   /* In test mode, increase the size of this structure a bit so that
 244   ** it is larger than the struct CrashFile defined in test6.c.
 245   */
 246   char aPadding[32];
 247 #endif
 248 };
 249
 250 /*
 251 ** Allowed values for the unixFile.ctrlFlags bitmask:
 252 */
 253 #define UNIXFILE_EXCL        0x01     /* Connections from one process only */
 254 #define UNIXFILE_RDONLY      0x02     /* Connection is read only */
 255 #define UNIXFILE_PERSIST_WAL 0x04     /* Persistent WAL mode */
 256 #ifndef SQLITE_DISABLE_DIRSYNC
 257 # define UNIXFILE_DIRSYNC    0x08     /* Directory sync needed */
 258 #else
 259 # define UNIXFILE_DIRSYNC    0x00
 260 #endif
 261 #define UNIXFILE_PSOW        0x10     /* SQLITE_IOCAP_POWERSAFE_OVERWRITE */
 262 #define UNIXFILE_DELETE      0x20     /* Delete on close */
 263 #define UNIXFILE_URI         0x40     /* Filename might have query parameters */
 264 #define UNIXFILE_NOLOCK      0x80     /* Do no file locking */
 265
 266 /*
 267 ** Include code that is common to all os_*.c files
 268 */
 269 #include "os_common.h"
 270
 271 /*
 272 ** Define various macros that are missing from some systems.
 273 */
 274 #ifndef O_LARGEFILE
 275 # define O_LARGEFILE 0
 276 #endif
 277 #ifdef SQLITE_DISABLE_LFS
 278 # undef O_LARGEFILE
 279 # define O_LARGEFILE 0
 280 #endif
 281 #ifndef O_NOFOLLOW
 282 # define O_NOFOLLOW 0
 283 #endif
 284 #ifndef O_BINARY
 285 # define O_BINARY 0
 286 #endif
 287
 288 /*
 289 ** The threadid macro resolves to the thread-id or to 0.  Used for
 290 ** testing and debugging only.
 291 */
 292 #if SQLITE_THREADSAFE
 293 #define threadid pthread_self()
 294 #else
 295 #define threadid 0
 296 #endif
 297
 298 /*
 299 ** Different Unix systems declare open() in different ways.  Same use
 300 ** open(const char*,int,mode_t).  Others use open(const char*,int,...).
 301 ** The difference is important when using a pointer to the function.
 302 **
 303 ** The safest way to deal with the problem is to always use this wrapper
 304 ** which always has the same well-defined interface.
 305 */
 306 static int posixOpen(const char *zFile, int flags, int mode){
 307   return open(zFile, flags, mode);
 308 }
 309
 310 /* Forward reference */
 311 static int openDirectory(const char*, int*);
 312
 313 /*
 314 ** Many system calls are accessed through pointer-to-functions so that
 315 ** they may be overridden at runtime to facilitate fault injection during
 316 ** testing and sandboxing.  The following array holds the names and pointers
 317 ** to all overrideable system calls.
 318 */
 319 static struct unix_syscall {
 320   const char *zName;            /* Name of the sytem call */
 321   sqlite3_syscall_ptr pCurrent; /* Current value of the system call */
 322   sqlite3_syscall_ptr pDefault; /* Default value */
 323 } aSyscall[] = {
 324   { "open",         (sqlite3_syscall_ptr)posixOpen,  0  },
 325 #define osOpen      ((int(*)(const char*,int,int))aSyscall[0].pCurrent)
 326
 327   { "close",        (sqlite3_syscall_ptr)close,      0  },
 328 #define osClose     ((int(*)(int))aSyscall[1].pCurrent)
 329
 330   { "access",       (sqlite3_syscall_ptr)access,     0  },
 331 #define osAccess    ((int(*)(const char*,int))aSyscall[2].pCurrent)
 332
 333   { "getcwd",       (sqlite3_syscall_ptr)getcwd,     0  },
 334 #define osGetcwd    ((char*(*)(char*,size_t))aSyscall[3].pCurrent)
 335
 336   { "stat",         (sqlite3_syscall_ptr)stat,       0  },
 337 #define osStat      ((int(*)(const char*,struct stat*))aSyscall[4].pCurrent)
 338
 339 /*
 340 ** The DJGPP compiler environment looks mostly like Unix, but it
 341 ** lacks the fcntl() system call.  So redefine fcntl() to be something
 342 ** that always succeeds.  This means that locking does not occur under
 343 ** DJGPP.  But it is DOS - what did you expect?
 344 */
 345 #ifdef __DJGPP__
 346   { "fstat",        0,                 0  },
 347 #define osFstat(a,b,c)    0
 348 #else
 349   { "fstat",        (sqlite3_syscall_ptr)fstat,      0  },
 350 #define osFstat     ((int(*)(int,struct stat*))aSyscall[5].pCurrent)
 351 #endif
 352
 353   { "ftruncate",    (sqlite3_syscall_ptr)ftruncate,  0  },
 354 #define osFtruncate ((int(*)(int,off_t))aSyscall[6].pCurrent)
 355
 356   { "fcntl",        (sqlite3_syscall_ptr)fcntl,      0  },
 357 #define osFcntl     ((int(*)(int,int,...))aSyscall[7].pCurrent)
 358
 359   { "read",         (sqlite3_syscall_ptr)read,       0  },
 360 #define osRead      ((ssize_t(*)(int,void*,size_t))aSyscall[8].pCurrent)
 361
 362 #if defined(USE_PREAD) || SQLITE_ENABLE_LOCKING_STYLE
 363   { "pread",        (sqlite3_syscall_ptr)pread,      0  },
 364 #else
 365   { "pread",        (sqlite3_syscall_ptr)0,          0  },
 366 #endif
 367 #define osPread     ((ssize_t(*)(int,void*,size_t,off_t))aSyscall[9].pCurrent)
 368
 369 #if defined(USE_PREAD64)
 370   { "pread64",      (sqlite3_syscall_ptr)pread64,    0  },
 371 #else
 372   { "pread64",      (sqlite3_syscall_ptr)0,          0  },
 373 #endif
 374 #define osPread64   ((ssize_t(*)(int,void*,size_t,off_t))aSyscall[10].pCurrent)
 375
 376   { "write",        (sqlite3_syscall_ptr)write,      0  },
 377 #define osWrite     ((ssize_t(*)(int,const void*,size_t))aSyscall[11].pCurrent)
 378
 379 #if defined(USE_PREAD) || SQLITE_ENABLE_LOCKING_STYLE
 380   { "pwrite",       (sqlite3_syscall_ptr)pwrite,     0  },
 381 #else
 382   { "pwrite",       (sqlite3_syscall_ptr)0,          0  },
 383 #endif
 384 #define osPwrite    ((ssize_t(*)(int,const void*,size_t,off_t))\
 385                     aSyscall[12].pCurrent)
 386
 387 #if defined(USE_PREAD64)
 388   { "pwrite64",     (sqlite3_syscall_ptr)pwrite64,   0  },
 389 #else
 390   { "pwrite64",     (sqlite3_syscall_ptr)0,          0  },
 391 #endif
 392 #define osPwrite64  ((ssize_t(*)(int,const void*,size_t,off_t))\
 393                     aSyscall[13].pCurrent)
 394
 395 #if SQLITE_ENABLE_LOCKING_STYLE
 396   { "fchmod",       (sqlite3_syscall_ptr)fchmod,     0  },
 397 #else
 398   { "fchmod",       (sqlite3_syscall_ptr)0,          0  },
 399 #endif
 400 #define osFchmod    ((int(*)(int,mode_t))aSyscall[14].pCurrent)
 401
 402 #if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
 403   { "fallocate",    (sqlite3_syscall_ptr)posix_fallocate,  0 },
 404 #else
 405   { "fallocate",    (sqlite3_syscall_ptr)0,                0 },
 406 #endif
 407 #define osFallocate ((int(*)(int,off_t,off_t))aSyscall[15].pCurrent)
 408
 409   { "unlink",       (sqlite3_syscall_ptr)unlink,           0 },
 410 #define osUnlink    ((int(*)(const char*))aSyscall[16].pCurrent)
 411
 412   { "openDirectory",    (sqlite3_syscall_ptr)openDirectory,      0 },
 413 #define osOpenDirectory ((int(*)(const char*,int*))aSyscall[17].pCurrent)
 414
 415   { "mkdir",        (sqlite3_syscall_ptr)mkdir,           0 },
 416 #define osMkdir     ((int(*)(const char*,mode_t))aSyscall[18].pCurrent)
 417
 418   { "rmdir",        (sqlite3_syscall_ptr)rmdir,           0 },
 419 #define osRmdir     ((int(*)(const char*))aSyscall[19].pCurrent)
 420
 421 }; /* End of the overrideable system calls */
 422
 423 /*
 424 ** This is the xSetSystemCall() method of sqlite3_vfs for all of the
 425 ** "unix" VFSes.  Return SQLITE_OK opon successfully updating the
 426 ** system call pointer, or SQLITE_NOTFOUND if there is no configurable
 427 ** system call named zName.
 428 */
 429 static int unixSetSystemCall(
 430   sqlite3_vfs *pNotUsed,        /* The VFS pointer.  Not used */
 431   const char *zName,            /* Name of system call to override */
 432   sqlite3_syscall_ptr pNewFunc  /* Pointer to new system call value */
 433 ){
 434   unsigned int i;
 435   int rc = SQLITE_NOTFOUND;
 436
 437   UNUSED_PARAMETER(pNotUsed);
 438   if( zName==0 ){
 439     /* If no zName is given, restore all system calls to their default
 440     ** settings and return NULL
 441     */
 442     rc = SQLITE_OK;
 443     for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
 444       if( aSyscall[i].pDefault ){
 445         aSyscall[i].pCurrent = aSyscall[i].pDefault;
 446       }
 447     }
 448   }else{
 449     /* If zName is specified, operate on only the one system call
 450     ** specified.
 451     */
 452     for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
 453       if( strcmp(zName, aSyscall[i].zName)==0 ){
 454         if( aSyscall[i].pDefault==0 ){
 455           aSyscall[i].pDefault = aSyscall[i].pCurrent;
 456         }
 457         rc = SQLITE_OK;
 458         if( pNewFunc==0 ) pNewFunc = aSyscall[i].pDefault;
 459         aSyscall[i].pCurrent = pNewFunc;
 460         break;
 461       }
 462     }
 463   }
 464   return rc;
 465 }
 466
 467 /*
 468 ** Return the value of a system call.  Return NULL if zName is not a
 469 ** recognized system call name.  NULL is also returned if the system call
 470 ** is currently undefined.
 471 */
 472 static sqlite3_syscall_ptr unixGetSystemCall(
 473   sqlite3_vfs *pNotUsed,
 474   const char *zName
 475 ){
 476   unsigned int i;
 477
 478   UNUSED_PARAMETER(pNotUsed);
 479   for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
 480     if( strcmp(zName, aSyscall[i].zName)==0 ) return aSyscall[i].pCurrent;
 481   }
 482   return 0;
 483 }
 484
 485 /*
 486 ** Return the name of the first system call after zName.  If zName==NULL
 487 ** then return the name of the first system call.  Return NULL if zName
 488 ** is the last system call or if zName is not the name of a valid
 489 ** system call.
 490 */
 491 static const char *unixNextSystemCall(sqlite3_vfs *p, const char *zName){
 492   int i = -1;
 493
 494   UNUSED_PARAMETER(p);
 495   if( zName ){
 496     for(i=0; i<ArraySize(aSyscall)-1; i++){
 497       if( strcmp(zName, aSyscall[i].zName)==0 ) break;
 498     }
 499   }
 500   for(i++; i<ArraySize(aSyscall); i++){
 501     if( aSyscall[i].pCurrent!=0 ) return aSyscall[i].zName;
 502   }
 503   return 0;
 504 }
 505
 506 /*
 507 ** Retry open() calls that fail due to EINTR
 508 */
 509 static int robust_open(const char *z, int f, int m){
 510   int rc;
 511   do{ rc = osOpen(z,f,m); }while( rc<0 && errno==EINTR );
 512   return rc;
 513 }
 514
 515 /*
 516 ** Helper functions to obtain and relinquish the global mutex. The
 517 ** global mutex is used to protect the unixInodeInfo and
 518 ** vxworksFileId objects used by this file, all of which may be
 519 ** shared by multiple threads.
 520 **
 521 ** Function unixMutexHeld() is used to assert() that the global mutex
 522 ** is held when required. This function is only used as part of assert()
 523 ** statements. e.g.
 524 **
 525 **   unixEnterMutex()
 526 **     assert( unixMutexHeld() );
 527 **   unixEnterLeave()
 528 */
 529 static void unixEnterMutex(void){
 530   sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
 531 }
 532 static void unixLeaveMutex(void){
 533   sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
 534 }
 535 #ifdef SQLITE_DEBUG
 536 static int unixMutexHeld(void) {
 537   return sqlite3_mutex_held(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
 538 }
 539 #endif
 540
 541
 542 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
 543 /*
 544 ** Helper function for printing out trace information from debugging
 545 ** binaries. This returns the string represetation of the supplied
 546 ** integer lock-type.
 547 */
 548 static const char *azFileLock(int eFileLock){
 549   switch( eFileLock ){
 550     case NO_LOCK: return "NONE";
 551     case SHARED_LOCK: return "SHARED";
 552     case RESERVED_LOCK: return "RESERVED";
 553     case PENDING_LOCK: return "PENDING";
 554     case EXCLUSIVE_LOCK: return "EXCLUSIVE";
 555   }
 556   return "ERROR";
 557 }
 558 #endif
 559
 560 #ifdef SQLITE_LOCK_TRACE
 561 /*
 562 ** Print out information about all locking operations.
 563 **
 564 ** This routine is used for troubleshooting locks on multithreaded
 565 ** platforms.  Enable by compiling with the -DSQLITE_LOCK_TRACE
 566 ** command-line option on the compiler.  This code is normally
 567 ** turned off.
 568 */
 569 static int lockTrace(int fd, int op, struct flock *p){
 570   char *zOpName, *zType;
 571   int s;
 572   int savedErrno;
 573   if( op==F_GETLK ){
 574     zOpName = "GETLK";
 575   }else if( op==F_SETLK ){
 576     zOpName = "SETLK";
 577   }else{
 578     s = osFcntl(fd, op, p);
 579     sqlite3DebugPrintf("fcntl unknown %d %d %d\n", fd, op, s);
 580     return s;
 581   }
 582   if( p->l_type==F_RDLCK ){
 583     zType = "RDLCK";
 584   }else if( p->l_type==F_WRLCK ){
 585     zType = "WRLCK";
 586   }else if( p->l_type==F_UNLCK ){
 587     zType = "UNLCK";
 588   }else{
 589     assert( 0 );
 590   }
 591   assert( p->l_whence==SEEK_SET );
 592   s = osFcntl(fd, op, p);
 593   savedErrno = errno;
 594   sqlite3DebugPrintf("fcntl %d %d %s %s %d %d %d %d\n",
 595      threadid, fd, zOpName, zType, (int)p->l_start, (int)p->l_len,
 596      (int)p->l_pid, s);
 597   if( s==(-1) && op==F_SETLK && (p->l_type==F_RDLCK || p->l_type==F_WRLCK) ){
 598     struct flock l2;
 599     l2 = *p;
 600     osFcntl(fd, F_GETLK, &l2);
 601     if( l2.l_type==F_RDLCK ){
 602       zType = "RDLCK";
 603     }else if( l2.l_type==F_WRLCK ){
 604       zType = "WRLCK";
 605     }else if( l2.l_type==F_UNLCK ){
 606       zType = "UNLCK";
 607     }else{
 608       assert( 0 );
 609     }
 610     sqlite3DebugPrintf("fcntl-failure-reason: %s %d %d %d\n",
 611        zType, (int)l2.l_start, (int)l2.l_len, (int)l2.l_pid);
 612   }
 613   errno = savedErrno;
 614   return s;
 615 }
 616 #undef osFcntl
 617 #define osFcntl lockTrace
 618 #endif /* SQLITE_LOCK_TRACE */
 619
 620 /*
 621 ** Retry ftruncate() calls that fail due to EINTR
 622 */
 623 static int robust_ftruncate(int h, sqlite3_int64 sz){
 624   int rc;
 625   do{ rc = osFtruncate(h,sz); }while( rc<0 && errno==EINTR );
 626   return rc;
 627 }
 628
 629 /*
 630 ** This routine translates a standard POSIX errno code into something
 631 ** useful to the clients of the sqlite3 functions.  Specifically, it is
 632 ** intended to translate a variety of "try again" errors into SQLITE_BUSY
 633 ** and a variety of "please close the file descriptor NOW" errors into
 634 ** SQLITE_IOERR
 635 **
 636 ** Errors during initialization of locks, or file system support for locks,
 637 ** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
 638 */
 639 static int sqliteErrorFromPosixError(int posixError, int sqliteIOErr) {
 640   switch (posixError) {
 641 #if 0
 642   /* At one point this code was not commented out. In theory, this branch
 643   ** should never be hit, as this function should only be called after
 644   ** a locking-related function (i.e. fcntl()) has returned non-zero with
 645   ** the value of errno as the first argument. Since a system call has failed,
 646   ** errno should be non-zero.
 647   **
 648   ** Despite this, if errno really is zero, we still don't want to return
 649   ** SQLITE_OK. The system call failed, and *some* SQLite error should be
 650   ** propagated back to the caller. Commenting this branch out means errno==0
 651   ** will be handled by the "default:" case below.
 652   */
 653   case 0:
 654     return SQLITE_OK;
 655 #endif
 656
 657   case EAGAIN:
 658   case ETIMEDOUT:
 659   case EBUSY:
 660   case EINTR:
 661   case ENOLCK:
 662     /* random NFS retry error, unless during file system support
 663      * introspection, in which it actually means what it says */
 664     return SQLITE_BUSY;
 665
 666   case EACCES:
 667     /* EACCES is like EAGAIN during locking operations, but not any other time*/
 668     if( (sqliteIOErr == SQLITE_IOERR_LOCK) ||
 669         (sqliteIOErr == SQLITE_IOERR_UNLOCK) ||
 670         (sqliteIOErr == SQLITE_IOERR_RDLOCK) ||
 671         (sqliteIOErr == SQLITE_IOERR_CHECKRESERVEDLOCK) ){
 672       return SQLITE_BUSY;
 673     }
 674     /* else fall through */
 675   case EPERM:
 676     return SQLITE_PERM;
 677
 678   /* EDEADLK is only possible if a call to fcntl(F_SETLKW) is made. And
 679   ** this module never makes such a call. And the code in SQLite itself
 680   ** asserts that SQLITE_IOERR_BLOCKED is never returned. For these reasons
 681   ** this case is also commented out. If the system does set errno to EDEADLK,
 682   ** the default SQLITE_IOERR_XXX code will be returned. */
 683 #if 0
 684   case EDEADLK:
 685     return SQLITE_IOERR_BLOCKED;
 686 #endif
 687
 688 #if EOPNOTSUPP!=ENOTSUP
 689   case EOPNOTSUPP:
 690     /* something went terribly awry, unless during file system support
 691      * introspection, in which it actually means what it says */
 692 #endif
 693 #ifdef ENOTSUP
 694   case ENOTSUP:
 695     /* invalid fd, unless during file system support introspection, in which
 696      * it actually means what it says */
 697 #endif
 698   case EIO:
 699   case EBADF:
 700   case EINVAL:
 701   case ENOTCONN:
 702   case ENODEV:
 703   case ENXIO:
 704   case ENOENT:
 705 #ifdef ESTALE                     /* ESTALE is not defined on Interix systems */
 706   case ESTALE:
 707 #endif
 708   case ENOSYS:
 709     /* these should force the client to close the file and reconnect */
 710
 711   default:
 712     return sqliteIOErr;
 713   }
 714 }
 715
 716
 717
 718 /******************************************************************************
 719 ****************** Begin Unique File ID Utility Used By VxWorks ***************
 720 **
 721 ** On most versions of unix, we can get a unique ID for a file by concatenating
 722 ** the device number and the inode number.  But this does not work on VxWorks.
 723 ** On VxWorks, a unique file id must be based on the canonical filename.
 724 **
 725 ** A pointer to an instance of the following structure can be used as a
 726 ** unique file ID in VxWorks.  Each instance of this structure contains
 727 ** a copy of the canonical filename.  There is also a reference count.
 728 ** The structure is reclaimed when the number of pointers to it drops to
 729 ** zero.
 730 **
 731 ** There are never very many files open at one time and lookups are not
 732 ** a performance-critical path, so it is sufficient to put these
 733 ** structures on a linked list.
 734 */
 735 struct vxworksFileId {
 736   struct vxworksFileId *pNext;  /* Next in a list of them all */
 737   int nRef;                     /* Number of references to this one */
 738   int nName;                    /* Length of the zCanonicalName[] string */
 739   char *zCanonicalName;         /* Canonical filename */
 740 };
 741
 742 #if OS_VXWORKS
 743 /*
 744 ** All unique filenames are held on a linked list headed by this
 745 ** variable:
 746 */
 747 static struct vxworksFileId *vxworksFileList = 0;
 748
 749 /*
 750 ** Simplify a filename into its canonical form
 751 ** by making the following changes:
 752 **
 753 **  * removing any trailing and duplicate /
 754 **  * convert /./ into just /
 755 **  * convert /A/../ where A is any simple name into just /
 756 **
 757 ** Changes are made in-place.  Return the new name length.
 758 **
 759 ** The original filename is in z[0..n-1].  Return the number of
 760 ** characters in the simplified name.
 761 */
 762 static int vxworksSimplifyName(char *z, int n){
 763   int i, j;
 764   while( n>1 && z[n-1]=='/' ){ n--; }
 765   for(i=j=0; i<n; i++){
 766     if( z[i]=='/' ){
 767       if( z[i+1]=='/' ) continue;
 768       if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
 769         i += 1;
 770         continue;
 771       }
 772       if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
 773         while( j>0 && z[j-1]!='/' ){ j--; }
 774         if( j>0 ){ j--; }
 775         i += 2;
 776         continue;
 777       }
 778     }
 779     z[j++] = z[i];
 780   }
 781   z[j] = 0;
 782   return j;
 783 }
 784
 785 /*
 786 ** Find a unique file ID for the given absolute pathname.  Return
 787 ** a pointer to the vxworksFileId object.  This pointer is the unique
 788 ** file ID.
 789 **
 790 ** The nRef field of the vxworksFileId object is incremented before
 791 ** the object is returned.  A new vxworksFileId object is created
 792 ** and added to the global list if necessary.
 793 **
 794 ** If a memory allocation error occurs, return NULL.
 795 */
 796 static struct vxworksFileId *vxworksFindFileId(const char *zAbsoluteName){
 797   struct vxworksFileId *pNew;         /* search key and new file ID */
 798   struct vxworksFileId *pCandidate;   /* For looping over existing file IDs */
 799   int n;                              /* Length of zAbsoluteName string */
 800
 801   assert( zAbsoluteName[0]=='/' );
 802   n = (int)strlen(zAbsoluteName);
 803   pNew = sqlite3_malloc( sizeof(*pNew) + (n+1) );
 804   if( pNew==0 ) return 0;
 805   pNew->zCanonicalName = (char*)&pNew[1];
 806   memcpy(pNew->zCanonicalName, zAbsoluteName, n+1);
 807   n = vxworksSimplifyName(pNew->zCanonicalName, n);
 808
 809   /* Search for an existing entry that matching the canonical name.
 810   ** If found, increment the reference count and return a pointer to
 811   ** the existing file ID.
 812   */
 813   unixEnterMutex();
 814   for(pCandidate=vxworksFileList; pCandidate; pCandidate=pCandidate->pNext){
 815     if( pCandidate->nName==n
 816      && memcmp(pCandidate->zCanonicalName, pNew->zCanonicalName, n)==0
 817     ){
 818        sqlite3_free(pNew);
 819        pCandidate->nRef++;
 820        unixLeaveMutex();
 821        return pCandidate;
 822     }
 823   }
 824
 825   /* No match was found.  We will make a new file ID */
 826   pNew->nRef = 1;
 827   pNew->nName = n;
 828   pNew->pNext = vxworksFileList;
 829   vxworksFileList = pNew;
 830   unixLeaveMutex();
 831   return pNew;
 832 }
 833
 834 /*
 835 ** Decrement the reference count on a vxworksFileId object.  Free
 836 ** the object when the reference count reaches zero.
 837 */
 838 static void vxworksReleaseFileId(struct vxworksFileId *pId){
 839   unixEnterMutex();
 840   assert( pId->nRef>0 );
 841   pId->nRef--;
 842   if( pId->nRef==0 ){
 843     struct vxworksFileId **pp;
 844     for(pp=&vxworksFileList; *pp && *pp!=pId; pp = &((*pp)->pNext)){}
 845     assert( *pp==pId );
 846     *pp = pId->pNext;
 847     sqlite3_free(pId);
 848   }
 849   unixLeaveMutex();
 850 }
 851 #endif /* OS_VXWORKS */
 852 /*************** End of Unique File ID Utility Used By VxWorks ****************
 853 ******************************************************************************/
 854
 855
 856 /******************************************************************************
 857 *************************** Posix Advisory Locking ****************************
 858 **
 859 ** POSIX advisory locks are broken by design.  ANSI STD 1003.1 (1996)
 860 ** section 6.5.2.2 lines 483 through 490 specify that when a process
 861 ** sets or clears a lock, that operation overrides any prior locks set
 862 ** by the same process.  It does not explicitly say so, but this implies
 863 ** that it overrides locks set by the same process using a different
 864 ** file descriptor.  Consider this test case:
 865 **
 866 **       int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
 867 **       int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
 868 **
 869 ** Suppose ./file1 and ./file2 are really the same file (because
 870 ** one is a hard or symbolic link to the other) then if you set
 871 ** an exclusive lock on fd1, then try to get an exclusive lock
 872 ** on fd2, it works.  I would have expected the second lock to
 873 ** fail since there was already a lock on the file due to fd1.
 874 ** But not so.  Since both locks came from the same process, the
 875 ** second overrides the first, even though they were on different
 876 ** file descriptors opened on different file names.
 877 **
 878 ** This means that we cannot use POSIX locks to synchronize file access
 879 ** among competing threads of the same process.  POSIX locks will work fine
 880 ** to synchronize access for threads in separate processes, but not
 881 ** threads within the same process.
 882 **
 883 ** To work around the problem, SQLite has to manage file locks internally
 884 ** on its own.  Whenever a new database is opened, we have to find the
 885 ** specific inode of the database file (the inode is determined by the
 886 ** st_dev and st_ino fields of the stat structure that fstat() fills in)
 887 ** and check for locks already existing on that inode.  When locks are
 888 ** created or removed, we have to look at our own internal record of the
 889 ** locks to see if another thread has previously set a lock on that same
 890 ** inode.
 891 **
 892 ** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
 893 ** For VxWorks, we have to use the alternative unique ID system based on
 894 ** canonical filename and implemented in the previous division.)
 895 **
 896 ** The sqlite3_file structure for POSIX is no longer just an integer file
 897 ** descriptor.  It is now a structure that holds the integer file
 898 ** descriptor and a pointer to a structure that describes the internal
 899 ** locks on the corresponding inode.  There is one locking structure
 900 ** per inode, so if the same inode is opened twice, both unixFile structures
 901 ** point to the same locking structure.  The locking structure keeps
 902 ** a reference count (so we will know when to delete it) and a "cnt"
 903 ** field that tells us its internal lock status.  cnt==0 means the
 904 ** file is unlocked.  cnt==-1 means the file has an exclusive lock.
 905 ** cnt>0 means there are cnt shared locks on the file.
 906 **
 907 ** Any attempt to lock or unlock a file first checks the locking
 908 ** structure.  The fcntl() system call is only invoked to set a
 909 ** POSIX lock if the internal lock structure transitions between
 910 ** a locked and an unlocked state.
 911 **
 912 ** But wait:  there are yet more problems with POSIX advisory locks.
 913 **
 914 ** If you close a file descriptor that points to a file that has locks,
 915 ** all locks on that file that are owned by the current process are
 916 ** released.  To work around this problem, each unixInodeInfo object
** maintains a count of the number of pending locks on that inode.
** When an attempt is made to close a unixFile, if there are other
** unixFile objects open on the same inode that are holding locks, the call
 920 ** to close() the file descriptor is deferred until all of the locks clear.
 921 ** The unixInodeInfo structure keeps a list of file descriptors that need to
 922 ** be closed and that list is walked (and cleared) when the last lock
 923 ** clears.
 924 **
 925 ** Yet another problem:  LinuxThreads do not play well with posix locks.
 926 **
 927 ** Many older versions of linux use the LinuxThreads library which is
 928 ** not posix compliant.  Under LinuxThreads, a lock created by thread
 929 ** A cannot be modified or overridden by a different thread B.
 930 ** Only thread A can modify the lock.  Locking behavior is correct
** if the application uses the newer Native Posix Thread Library (NPTL)
 932 ** on linux - with NPTL a lock created by thread A can override locks
 933 ** in thread B.  But there is no way to know at compile-time which
 934 ** threading library is being used.  So there is no way to know at
 935 ** compile-time whether or not thread A can override locks on thread B.
 936 ** One has to do a run-time check to discover the behavior of the
 937 ** current process.
 938 **
 939 ** SQLite used to support LinuxThreads.  But support for LinuxThreads
 940 ** was dropped beginning with version 3.7.0.  SQLite will still work with
 941 ** LinuxThreads provided that (1) there is no more than one connection
 942 ** per database file in the same process and (2) database connections
 943 ** do not move across threads.
 944 */
 945
/*
** An instance of the following structure serves as the key used
** to locate a particular unixInodeInfo object.
**
** findInodeInfo() clears the whole key with memset() before filling in
** the fields and then compares keys with memcmp() over sizeof(key)
** bytes, so every byte of this structure takes part in the comparison.
*/
struct unixFileId {
  dev_t dev;                  /* Device number (st_dev from fstat()) */
#if OS_VXWORKS
  struct vxworksFileId *pId;  /* Unique file ID for vxworks. */
#else
  ino_t ino;                  /* Inode number (st_ino from fstat()) */
#endif
};
 958
/*
** An instance of the following structure is allocated for each open
** inode.  Or, on LinuxThreads, there is one of these structures for
** each inode opened by each thread.
**
** NOTE(review): the lookup key (struct unixFileId) carries no thread
** identifier and the header of this division states that LinuxThreads
** support was dropped in 3.7.0, so the per-thread remark above looks
** stale - confirm before relying on it.
**
** A single inode can have multiple file descriptors, so each unixFile
** structure contains a pointer to an instance of this object and this
** object keeps a count of the number of unixFile pointing to it.
**
** Objects are created by findInodeInfo(), chained together through
** pNext/pPrev starting at inodeList, and destroyed by releaseInodeInfo()
** when nRef drops to zero.
*/
struct unixInodeInfo {
  struct unixFileId fileId;       /* The lookup key */
  int nShared;                    /* Number of SHARED locks held */
  unsigned char eFileLock;        /* One of SHARED_LOCK, RESERVED_LOCK etc. */
  unsigned char bProcessLock;     /* An exclusive process lock is held */
  int nRef;                       /* Number of pointers to this structure */
  unixShmNode *pShmNode;          /* Shared memory associated with this inode */
  int nLock;                      /* Number of outstanding file locks */
  UnixUnusedFd *pUnused;          /* Unused file descriptors to close */
  unixInodeInfo *pNext;           /* List of all unixInodeInfo objects */
  unixInodeInfo *pPrev;           /*    .... doubly linked */
#if SQLITE_ENABLE_LOCKING_STYLE
  unsigned long long sharedByte;  /* for AFP simulated shared lock */
#endif
#if OS_VXWORKS
  sem_t *pSem;                    /* Named POSIX semaphore */
  char aSemName[MAX_PATHNAME+2];  /* Name of that semaphore */
#endif
};
 987
/*
** Head of the doubly-linked list of all unixInodeInfo objects.
** findInodeInfo() pushes new objects onto the front of this list and
** releaseInodeInfo() unlinks them; both assert that the unix mutex
** is held.
*/
static unixInodeInfo *inodeList = 0;
 992
/*
**
** This function - unixLogErrorAtLine(), is normally called via the macro
** unixLogError(), which supplies __LINE__ as the final argument.  (The
** robust_close() routine calls it directly so that it can forward the
** line number given by its own caller.)
**
** It is invoked after an error occurs in an OS function and errno has been
** set. It logs a message using sqlite3_log() containing the current value of
** errno and, if possible, the human-readable equivalent from strerror() or
** strerror_r().
**
** The first argument passed to the macro should be the error code that
** will be returned to SQLite (e.g. SQLITE_IOERR_DELETE, SQLITE_CANTOPEN).
** The two subsequent arguments should be the name of the OS function that
** failed (e.g. "unlink", "open") and the associated file-system path,
** if any.
**
** The return value is always the errcode argument, unchanged, so that a
** caller can log and return in a single statement.
*/
#define unixLogError(a,b,c)     unixLogErrorAtLine(a,b,c,__LINE__)
static int unixLogErrorAtLine(
  int errcode,                    /* SQLite error code */
  const char *zFunc,              /* Name of OS function that failed */
  const char *zPath,              /* File path associated with error */
  int iLine                       /* Source line number where error occurred */
){
  char *zErr;                     /* Message from strerror() or equivalent */
  int iErrno = errno;             /* Saved syscall error number */

  /* If this is not a threadsafe build (SQLITE_THREADSAFE==0), then use
  ** the strerror() function to obtain the human-readable error message
  ** equivalent to errno. Otherwise, use strerror_r().
  */
#if SQLITE_THREADSAFE && defined(HAVE_STRERROR_R)
  char aErr[80];
  memset(aErr, 0, sizeof(aErr));
  zErr = aErr;

  /* If STRERROR_R_CHAR_P (set by autoconf scripts) or __USE_GNU is defined,
  ** assume that the system provides the GNU version of strerror_r() that
  ** returns a pointer to a buffer containing the error message. That pointer
  ** may point to aErr[], or it may point to some static storage somewhere.
  ** Otherwise, assume that the system provides the POSIX version of
  ** strerror_r(), which always writes an error message into aErr[].
  **
  ** If the code incorrectly assumes that it is the POSIX version that is
  ** available, the error message will often be an empty string. Not a
  ** huge problem. Incorrectly concluding that the GNU version is available
  ** could lead to a segfault though.
  **
  ** Note that the "zErr =" below is only compiled in for the GNU variant;
  ** together with the following line it forms a single statement.  In the
  ** POSIX variant the strerror_r() call stands alone and zErr keeps
  ** pointing at aErr[].
  */
#if defined(STRERROR_R_CHAR_P) || defined(__USE_GNU)
  zErr =
# endif
  strerror_r(iErrno, aErr, sizeof(aErr)-1);

#elif SQLITE_THREADSAFE
  /* This is a threadsafe build, but strerror_r() is not available. */
  zErr = "";
#else
  /* Non-threadsafe build, use strerror(). */
  zErr = strerror(iErrno);
#endif

  assert( errcode!=SQLITE_OK );
  /* A missing path is logged as an empty string rather than as NULL */
  if( zPath==0 ) zPath = "";
  sqlite3_log(errcode,
      "os_unix.c:%d: (%d) %s(%s) - %s",
      iLine, iErrno, zFunc, zPath, zErr
  );

  return errcode;
}
1062
1063 /*
1064 ** Close a file descriptor.
1065 **
1066 ** We assume that close() almost always works, since it is only in a
1067 ** very sick application or on a very sick platform that it might fail.
1068 ** If it does fail, simply leak the file descriptor, but do log the
1069 ** error.
1070 **
1071 ** Note that it is not safe to retry close() after EINTR since the
1072 ** file descriptor might have already been reused by another thread.
1073 ** So we don't even try to recover from an EINTR.  Just log the error
1074 ** and move on.
1075 */
1076 static void robust_close(unixFile *pFile, int h, int lineno){
1077   if( osClose(h) ){
1078     unixLogErrorAtLine(SQLITE_IOERR_CLOSE, "close",
1079                        pFile ? pFile->zPath : 0, lineno);
1080   }
1081 }
1082
1083 /*
1084 ** Close all file descriptors accumuated in the unixInodeInfo->pUnused list.
1085 */
1086 static void closePendingFds(unixFile *pFile){
1087   unixInodeInfo *pInode = pFile->pInode;
1088   UnixUnusedFd *p;
1089   UnixUnusedFd *pNext;
1090   for(p=pInode->pUnused; p; p=pNext){
1091     pNext = p->pNext;
1092     robust_close(pFile, p->fd, __LINE__);
1093     sqlite3_free(p);
1094   }
1095   pInode->pUnused = 0;
1096 }
1097
1098 /*
1099 ** Release a unixInodeInfo structure previously allocated by findInodeInfo().
1100 **
1101 ** The mutex entered using the unixEnterMutex() function must be held
1102 ** when this function is called.
1103 */
1104 static void releaseInodeInfo(unixFile *pFile){
1105   unixInodeInfo *pInode = pFile->pInode;
1106   assert( unixMutexHeld() );
1107   if( ALWAYS(pInode) ){
1108     pInode->nRef--;
1109     if( pInode->nRef==0 ){
1110       assert( pInode->pShmNode==0 );
1111       closePendingFds(pFile);
1112       if( pInode->pPrev ){
1113         assert( pInode->pPrev->pNext==pInode );
1114         pInode->pPrev->pNext = pInode->pNext;
1115       }else{
1116         assert( inodeList==pInode );
1117         inodeList = pInode->pNext;
1118       }
1119       if( pInode->pNext ){
1120         assert( pInode->pNext->pPrev==pInode );
1121         pInode->pNext->pPrev = pInode->pPrev;
1122       }
1123       sqlite3_free(pInode);
1124     }
1125   }
1126 }
1127
1128 /*
1129 ** Given a file descriptor, locate the unixInodeInfo object that
1130 ** describes that file descriptor.  Create a new one if necessary.  The
1131 ** return value might be uninitialized if an error occurs.
1132 **
1133 ** The mutex entered using the unixEnterMutex() function must be held
1134 ** when this function is called.
1135 **
1136 ** Return an appropriate error code.
1137 */
1138 static int findInodeInfo(
1139   unixFile *pFile,               /* Unix file with file desc used in the key */
1140   unixInodeInfo **ppInode        /* Return the unixInodeInfo object here */
1141 ){
1142   int rc;                        /* System call return code */
1143   int fd;                        /* The file descriptor for pFile */
1144   struct unixFileId fileId;      /* Lookup key for the unixInodeInfo */
1145   struct stat statbuf;           /* Low-level file information */
1146   unixInodeInfo *pInode = 0;     /* Candidate unixInodeInfo object */
1147
1148   assert( unixMutexHeld() );
1149
1150   /* Get low-level information about the file that we can used to
1151   ** create a unique name for the file.
1152   */
1153   fd = pFile->h;
1154   rc = osFstat(fd, &statbuf);
1155   if( rc!=0 ){
1156     pFile->lastErrno = errno;
1157 #ifdef EOVERFLOW
1158     if( pFile->lastErrno==EOVERFLOW ) return SQLITE_NOLFS;
1159 #endif
1160     return SQLITE_IOERR;
1161   }
1162
1163 #ifdef __APPLE__
1164   /* On OS X on an msdos filesystem, the inode number is reported
1165   ** incorrectly for zero-size files.  See ticket #3260.  To work
1166   ** around this problem (we consider it a bug in OS X, not SQLite)
1167   ** we always increase the file size to 1 by writing a single byte
1168   ** prior to accessing the inode number.  The one byte written is
1169   ** an ASCII 'S' character which also happens to be the first byte
1170   ** in the header of every SQLite database.  In this way, if there
1171   ** is a race condition such that another thread has already populated
1172   ** the first page of the database, no damage is done.
1173   */
1174   if( statbuf.st_size==0 && (pFile->fsFlags & SQLITE_FSFLAGS_IS_MSDOS)!=0 ){
1175     do{ rc = osWrite(fd, "S", 1); }while( rc<0 && errno==EINTR );
1176     if( rc!=1 ){
1177       pFile->lastErrno = errno;
1178       return SQLITE_IOERR;
1179     }
1180     rc = osFstat(fd, &statbuf);
1181     if( rc!=0 ){
1182       pFile->lastErrno = errno;
1183       return SQLITE_IOERR;
1184     }
1185   }
1186 #endif
1187
1188   memset(&fileId, 0, sizeof(fileId));
1189   fileId.dev = statbuf.st_dev;
1190 #if OS_VXWORKS
1191   fileId.pId = pFile->pId;
1192 #else
1193   fileId.ino = statbuf.st_ino;
1194 #endif
1195   pInode = inodeList;
1196   while( pInode && memcmp(&fileId, &pInode->fileId, sizeof(fileId)) ){
1197     pInode = pInode->pNext;
1198   }
1199   if( pInode==0 ){
1200     pInode = sqlite3_malloc( sizeof(*pInode) );
1201     if( pInode==0 ){
1202       return SQLITE_NOMEM;
1203     }
1204     memset(pInode, 0, sizeof(*pInode));
1205     memcpy(&pInode->fileId, &fileId, sizeof(fileId));
1206     pInode->nRef = 1;
1207     pInode->pNext = inodeList;
1208     pInode->pPrev = 0;
1209     if( inodeList ) inodeList->pPrev = pInode;
1210     inodeList = pInode;
1211   }else{
1212     pInode->nRef++;
1213   }
1214   *ppInode = pInode;
1215   return SQLITE_OK;
1216 }
1217
1218
1219 /*
1220 ** This routine checks if there is a RESERVED lock held on the specified
1221 ** file by this or any other process. If such a lock is held, set *pResOut
1222 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
1223 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
1224 */
1225 static int unixCheckReservedLock(sqlite3_file *id, int *pResOut){
1226   int rc = SQLITE_OK;
1227   int reserved = 0;
1228   unixFile *pFile = (unixFile*)id;
1229
1230   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
1231
1232   assert( pFile );
1233   unixEnterMutex(); /* Because pFile->pInode is shared across threads */
1234
1235   /* Check if a thread in this process holds such a lock */
1236   if( pFile->pInode->eFileLock>SHARED_LOCK ){
1237     reserved = 1;
1238   }
1239
1240   /* Otherwise see if some other process holds it.
1241   */
1242 #ifndef __DJGPP__
1243   if( !reserved && !pFile->pInode->bProcessLock ){
1244     struct flock lock;
1245     lock.l_whence = SEEK_SET;
1246     lock.l_start = RESERVED_BYTE;
1247     lock.l_len = 1;
1248     lock.l_type = F_WRLCK;
1249     if( osFcntl(pFile->h, F_GETLK, &lock) ){
1250       rc = SQLITE_IOERR_CHECKRESERVEDLOCK;
1251       pFile->lastErrno = errno;
1252     } else if( lock.l_type!=F_UNLCK ){
1253       reserved = 1;
1254     }
1255   }
1256 #endif
1257
1258   unixLeaveMutex();
1259   OSTRACE(("TEST WR-LOCK %d %d %d (unix)\n", pFile->h, rc, reserved));
1260
1261   *pResOut = reserved;
1262   return rc;
1263 }
1264
1265 /*
1266 ** Attempt to set a system-lock on the file pFile.  The lock is
1267 ** described by pLock.
1268 **
1269 ** If the pFile was opened read/write from unix-excl, then the only lock
1270 ** ever obtained is an exclusive lock, and it is obtained exactly once
1271 ** the first time any lock is attempted.  All subsequent system locking
1272 ** operations become no-ops.  Locking operations still happen internally,
1273 ** in order to coordinate access between separate database connections
1274 ** within this process, but all of that is handled in memory and the
1275 ** operating system does not participate.
1276 **
1277 ** This function is a pass-through to fcntl(F_SETLK) if pFile is using
1278 ** any VFS other than "unix-excl" or if pFile is opened on "unix-excl"
1279 ** and is read-only.
1280 **
1281 ** Zero is returned if the call completes successfully, or -1 if a call
1282 ** to fcntl() fails. In this case, errno is set appropriately (by fcntl()).
1283 */
1284 static int unixFileLock(unixFile *pFile, struct flock *pLock){
1285   int rc;
1286   unixInodeInfo *pInode = pFile->pInode;
1287   assert( unixMutexHeld() );
1288   assert( pInode!=0 );
1289   if( ((pFile->ctrlFlags & UNIXFILE_EXCL)!=0 || pInode->bProcessLock)
1290    && ((pFile->ctrlFlags & UNIXFILE_RDONLY)==0)
1291   ){
1292     if( pInode->bProcessLock==0 ){
1293       struct flock lock;
1294       assert( pInode->nLock==0 );
1295       lock.l_whence = SEEK_SET;
1296       lock.l_start = SHARED_FIRST;
1297       lock.l_len = SHARED_SIZE;
1298       lock.l_type = F_WRLCK;
1299       rc = osFcntl(pFile->h, F_SETLK, &lock);
1300       if( rc<0 ) return rc;
1301       pInode->bProcessLock = 1;
1302       pInode->nLock++;
1303     }else{
1304       rc = 0;
1305     }
1306   }else{
1307     rc = osFcntl(pFile->h, F_SETLK, pLock);
1308   }
1309   return rc;
1310 }
1311
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
**     (1) SHARED_LOCK
**     (2) RESERVED_LOCK
**     (3) PENDING_LOCK
**     (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between.  The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal.  The following chart shows the allowed
** transitions and the inserted intermediate states:
**
**    UNLOCKED -> SHARED
**    SHARED -> RESERVED
**    SHARED -> (PENDING) -> EXCLUSIVE
**    RESERVED -> (PENDING) -> EXCLUSIVE
**    PENDING -> EXCLUSIVE
**
** This routine will only increase a lock.  Use the sqlite3OsUnlock()
** routine to lower a locking level.
**
** Returns SQLITE_OK if the requested lock is now held (or was already
** held), SQLITE_BUSY if a conflicting lock held through another unixFile
** in this process prevents it, and otherwise the code produced by
** sqliteErrorFromPosixError() for the failed fcntl() call, or
** SQLITE_IOERR_UNLOCK if the temporary PENDING lock could not be dropped.
** For every failure code other than SQLITE_BUSY the errno value is saved
** in pFile->lastErrno.
*/
static int unixLock(sqlite3_file *id, int eFileLock){
  /* The following describes the implementation of the various locks and
  ** lock transitions in terms of the POSIX advisory shared and exclusive
  ** lock primitives (called read-locks and write-locks below, to avoid
  ** confusion with SQLite lock names). The algorithms are complicated
  ** slightly in order to be compatible with windows systems simultaneously
  ** accessing the same database file, in case that is ever required.
  **
  ** Symbols defined in os.h identify the 'pending byte' and the 'reserved
  ** byte', each single bytes at well known offsets, and the 'shared byte
  ** range', a range of 510 bytes at a well known offset.
  **
  ** To obtain a SHARED lock, a read-lock is obtained on the 'pending
  ** byte'.  If this is successful, a random byte from the 'shared byte
  ** range' is read-locked and the lock on the 'pending byte' released.
  **
  ** A process may only obtain a RESERVED lock after it has a SHARED lock.
  ** A RESERVED lock is implemented by grabbing a write-lock on the
  ** 'reserved byte'.
  **
  ** A process may only obtain a PENDING lock after it has obtained a
  ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
  ** on the 'pending byte'. This ensures that no new SHARED locks can be
  ** obtained, but existing SHARED locks are allowed to persist. A process
  ** does not have to obtain a RESERVED lock on the way to a PENDING lock.
  ** This property is used by the algorithm for rolling back a journal file
  ** after a crash.
  **
  ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
  ** implemented by obtaining a write-lock on the entire 'shared byte
  ** range'. Since all other locks require a read-lock on one of the bytes
  ** within this range, this ensures that no other locks are held on the
  ** database.
  **
  ** The reason a single byte cannot be used instead of the 'shared byte
  ** range' is that some versions of windows do not support read-locks. By
  ** locking a random byte from a range, concurrent SHARED locks may exist
  ** even if the locking primitive used is always a write-lock.
  */
  int rc = SQLITE_OK;
  unixFile *pFile = (unixFile*)id;
  unixInodeInfo *pInode;
  struct flock lock;
  int tErrno = 0;

  assert( pFile );
  OSTRACE(("LOCK    %d %s was %s(%s,%d) pid=%d (unix)\n", pFile->h,
      azFileLock(eFileLock), azFileLock(pFile->eFileLock),
      azFileLock(pFile->pInode->eFileLock), pFile->pInode->nShared , getpid()));

  /* If there is already a lock of this type or more restrictive on the
  ** unixFile, do nothing. Don't use the end_lock: exit path, as
  ** unixEnterMutex() hasn't been called yet.
  */
  if( pFile->eFileLock>=eFileLock ){
    OSTRACE(("LOCK    %d %s ok (already held) (unix)\n", pFile->h,
            azFileLock(eFileLock)));
    return SQLITE_OK;
  }

  /* Make sure the locking sequence is correct.
  **  (1) We never move from unlocked to anything higher than shared lock.
  **  (2) SQLite never explicitly requests a pending lock.
  **  (3) A shared lock is always held when a reserve lock is requested.
  */
  assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
  assert( eFileLock!=PENDING_LOCK );
  assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );

  /* This mutex is needed because pFile->pInode is shared across threads
  */
  unixEnterMutex();
  pInode = pFile->pInode;

  /* If some thread using this PID has a lock via a different unixFile*
  ** handle that precludes the requested lock, return BUSY.
  */
  if( (pFile->eFileLock!=pInode->eFileLock &&
          (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
  ){
    rc = SQLITE_BUSY;
    goto end_lock;
  }

  /* If a SHARED lock is requested, and some thread using this PID already
  ** has a SHARED or RESERVED lock, then increment reference counts and
  ** return SQLITE_OK.  No system call is needed in this case.
  */
  if( eFileLock==SHARED_LOCK &&
      (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
    assert( eFileLock==SHARED_LOCK );
    assert( pFile->eFileLock==0 );
    assert( pInode->nShared>0 );
    pFile->eFileLock = SHARED_LOCK;
    pInode->nShared++;
    pInode->nLock++;
    goto end_lock;
  }


  /* A PENDING lock is needed before acquiring a SHARED lock and before
  ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
  ** be released.
  */
  lock.l_len = 1L;
  lock.l_whence = SEEK_SET;
  if( eFileLock==SHARED_LOCK
      || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
  ){
    /* Read-lock the pending byte on the way to SHARED, write-lock it on
    ** the way to EXCLUSIVE. */
    lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
    lock.l_start = PENDING_BYTE;
    if( unixFileLock(pFile, &lock) ){
      tErrno = errno;
      rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
      if( rc!=SQLITE_BUSY ){
        pFile->lastErrno = tErrno;
      }
      goto end_lock;
    }
  }


  /* If control gets to this point, then actually go ahead and make
  ** operating system calls for the specified lock.
  */
  if( eFileLock==SHARED_LOCK ){
    assert( pInode->nShared==0 );
    assert( pInode->eFileLock==0 );
    assert( rc==SQLITE_OK );

    /* Now get the read-lock (lock.l_type is still F_RDLCK from above) */
    lock.l_start = SHARED_FIRST;
    lock.l_len = SHARED_SIZE;
    if( unixFileLock(pFile, &lock) ){
      tErrno = errno;
      rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
    }

    /* Drop the temporary PENDING lock.  This is attempted even if the
    ** read-lock above failed; an earlier error code takes precedence
    ** over a failure to unlock. */
    lock.l_start = PENDING_BYTE;
    lock.l_len = 1L;
    lock.l_type = F_UNLCK;
    if( unixFileLock(pFile, &lock) && rc==SQLITE_OK ){
      /* This could happen with a network mount */
      tErrno = errno;
      rc = SQLITE_IOERR_UNLOCK;
    }

    if( rc ){
      if( rc!=SQLITE_BUSY ){
        pFile->lastErrno = tErrno;
      }
      goto end_lock;
    }else{
      pFile->eFileLock = SHARED_LOCK;
      pInode->nLock++;
      pInode->nShared = 1;
    }
  }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
    /* We are trying for an exclusive lock but another thread in this
    ** same process is still holding a shared lock. */
    rc = SQLITE_BUSY;
  }else{
    /* The request was for a RESERVED or EXCLUSIVE lock.  It is
    ** assumed that there is a SHARED or greater lock on the file
    ** already.
    */
    assert( 0!=pFile->eFileLock );
    lock.l_type = F_WRLCK;

    assert( eFileLock==RESERVED_LOCK || eFileLock==EXCLUSIVE_LOCK );
    if( eFileLock==RESERVED_LOCK ){
      lock.l_start = RESERVED_BYTE;
      lock.l_len = 1L;
    }else{
      lock.l_start = SHARED_FIRST;
      lock.l_len = SHARED_SIZE;
    }

    if( unixFileLock(pFile, &lock) ){
      tErrno = errno;
      rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
      if( rc!=SQLITE_BUSY ){
        pFile->lastErrno = tErrno;
      }
    }
  }


#ifndef NDEBUG
  /* Set up the transaction-counter change checking flags when
  ** transitioning from a SHARED to a RESERVED lock.  The change
  ** from SHARED to RESERVED marks the beginning of a normal
  ** write operation (not a hot journal rollback).
  */
  if( rc==SQLITE_OK
   && pFile->eFileLock<=SHARED_LOCK
   && eFileLock==RESERVED_LOCK
  ){
    pFile->transCntrChng = 0;
    pFile->dbUpdate = 0;
    pFile->inNormalWrite = 1;
  }
#endif


  /* Record the resulting lock state on both the handle and the inode.
  ** If an EXCLUSIVE request failed at this stage, the state recorded
  ** is PENDING. */
  if( rc==SQLITE_OK ){
    pFile->eFileLock = eFileLock;
    pInode->eFileLock = eFileLock;
  }else if( eFileLock==EXCLUSIVE_LOCK ){
    pFile->eFileLock = PENDING_LOCK;
    pInode->eFileLock = PENDING_LOCK;
  }

end_lock:
  unixLeaveMutex();
  OSTRACE(("LOCK    %d %s %s (unix)\n", pFile->h, azFileLock(eFileLock),
      rc==SQLITE_OK ? "ok" : "failed"));
  return rc;
}
1556
1557 /*
1558 ** Add the file descriptor used by file handle pFile to the corresponding
1559 ** pUnused list.
1560 */
1561 static void setPendingFd(unixFile *pFile){
1562   unixInodeInfo *pInode = pFile->pInode;
1563   UnixUnusedFd *p = pFile->pUnused;
1564   p->pNext = pInode->pUnused;
1565   pInode->pUnused = p;
1566   pFile->h = -1;
1567   pFile->pUnused = 0;
1568 }
1569
1570 /*
1571 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
1572 ** must be either NO_LOCK or SHARED_LOCK.
1573 **
1574 ** If the locking level of the file descriptor is already at or below
1575 ** the requested locking level, this routine is a no-op.
1576 **
1577 ** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
1578 ** the byte range is divided into 2 parts and the first part is unlocked then
1579 ** set to a read lock, then the other part is simply unlocked.  This works
1580 ** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to
1581 ** remove the write lock on a region when a read lock is set.
1582 */
1583 static int posixUnlock(sqlite3_file *id, int eFileLock, int handleNFSUnlock){
1584   unixFile *pFile = (unixFile*)id;
1585   unixInodeInfo *pInode;
1586   struct flock lock;
1587   int rc = SQLITE_OK;
1588
1589   assert( pFile );
1590   OSTRACE(("UNLOCK  %d %d was %d(%d,%d) pid=%d (unix)\n", pFile->h, eFileLock,
1591       pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
1592       getpid()));
1593
1594   assert( eFileLock<=SHARED_LOCK );
1595   if( pFile->eFileLock<=eFileLock ){
1596     return SQLITE_OK;
1597   }
1598   unixEnterMutex();
1599   pInode = pFile->pInode;
1600   assert( pInode->nShared!=0 );
1601   if( pFile->eFileLock>SHARED_LOCK ){
1602     assert( pInode->eFileLock==pFile->eFileLock );
1603
1604 #ifndef NDEBUG
1605     /* When reducing a lock such that other processes can start
1606     ** reading the database file again, make sure that the
1607     ** transaction counter was updated if any part of the database
1608     ** file changed.  If the transaction counter is not updated,
1609     ** other connections to the same file might not realize that
1610     ** the file has changed and hence might not know to flush their
1611     ** cache.  The use of a stale cache can lead to database corruption.
1612     */
1613     pFile->inNormalWrite = 0;
1614 #endif
1615
1616     /* downgrading to a shared lock on NFS involves clearing the write lock
1617     ** before establishing the readlock - to avoid a race condition we downgrade
1618     ** the lock in 2 blocks, so that part of the range will be covered by a
1619     ** write lock until the rest is covered by a read lock:
1620     **  1:   [WWWWW]
1621     **  2:   [....W]
1622     **  3:   [RRRRW]
1623     **  4:   [RRRR.]
1624     */
1625     if( eFileLock==SHARED_LOCK ){
1626
1627 #if !defined(__APPLE__) || !SQLITE_ENABLE_LOCKING_STYLE
1628       (void)handleNFSUnlock;
1629       assert( handleNFSUnlock==0 );
1630 #endif
1631 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
1632       if( handleNFSUnlock ){
1633         int tErrno;               /* Error code from system call errors */
1634         off_t divSize = SHARED_SIZE - 1;
1635
1636         lock.l_type = F_UNLCK;
1637         lock.l_whence = SEEK_SET;
1638         lock.l_start = SHARED_FIRST;
1639         lock.l_len = divSize;
1640         if( unixFileLock(pFile, &lock)==(-1) ){
1641           tErrno = errno;
1642           rc = SQLITE_IOERR_UNLOCK;
1643           if( IS_LOCK_ERROR(rc) ){
1644             pFile->lastErrno = tErrno;
1645           }
1646           goto end_unlock;
1647         }
1648         lock.l_type = F_RDLCK;
1649         lock.l_whence = SEEK_SET;
1650         lock.l_start = SHARED_FIRST;
1651         lock.l_len = divSize;
1652         if( unixFileLock(pFile, &lock)==(-1) ){
1653           tErrno = errno;
1654           rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_RDLOCK);
1655           if( IS_LOCK_ERROR(rc) ){
1656             pFile->lastErrno = tErrno;
1657           }
1658           goto end_unlock;
1659         }
1660         lock.l_type = F_UNLCK;
1661         lock.l_whence = SEEK_SET;
1662         lock.l_start = SHARED_FIRST+divSize;
1663         lock.l_len = SHARED_SIZE-divSize;
1664         if( unixFileLock(pFile, &lock)==(-1) ){
1665           tErrno = errno;
1666           rc = SQLITE_IOERR_UNLOCK;
1667           if( IS_LOCK_ERROR(rc) ){
1668             pFile->lastErrno = tErrno;
1669           }
1670           goto end_unlock;
1671         }
1672       }else
1673 #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
1674       {
1675         lock.l_type = F_RDLCK;
1676         lock.l_whence = SEEK_SET;
1677         lock.l_start = SHARED_FIRST;
1678         lock.l_len = SHARED_SIZE;
1679         if( unixFileLock(pFile, &lock) ){
1680           /* In theory, the call to unixFileLock() cannot fail because another
1681           ** process is holding an incompatible lock. If it does, this
1682           ** indicates that the other process is not following the locking
1683           ** protocol. If this happens, return SQLITE_IOERR_RDLOCK. Returning
1684           ** SQLITE_BUSY would confuse the upper layer (in practice it causes
1685           ** an assert to fail). */
1686           rc = SQLITE_IOERR_RDLOCK;
1687           pFile->lastErrno = errno;
1688           goto end_unlock;
1689         }
1690       }
1691     }
1692     lock.l_type = F_UNLCK;
1693     lock.l_whence = SEEK_SET;
1694     lock.l_start = PENDING_BYTE;
1695     lock.l_len = 2L;  assert( PENDING_BYTE+1==RESERVED_BYTE );
1696     if( unixFileLock(pFile, &lock)==0 ){
1697       pInode->eFileLock = SHARED_LOCK;
1698     }else{
1699       rc = SQLITE_IOERR_UNLOCK;
1700       pFile->lastErrno = errno;
1701       goto end_unlock;
1702     }
1703   }
1704   if( eFileLock==NO_LOCK ){
1705     /* Decrement the shared lock counter.  Release the lock using an
1706     ** OS call only when all threads in this same process have released
1707     ** the lock.
1708     */
1709     pInode->nShared--;
1710     if( pInode->nShared==0 ){
1711       lock.l_type = F_UNLCK;
1712       lock.l_whence = SEEK_SET;
1713       lock.l_start = lock.l_len = 0L;
1714       if( unixFileLock(pFile, &lock)==0 ){
1715         pInode->eFileLock = NO_LOCK;
1716       }else{
1717         rc = SQLITE_IOERR_UNLOCK;
1718         pFile->lastErrno = errno;
1719         pInode->eFileLock = NO_LOCK;
1720         pFile->eFileLock = NO_LOCK;
1721       }
1722     }
1723
1724     /* Decrement the count of locks against this same file.  When the
1725     ** count reaches zero, close any other file descriptors whose close
1726     ** was deferred because of outstanding locks.
1727     */
1728     pInode->nLock--;
1729     assert( pInode->nLock>=0 );
1730     if( pInode->nLock==0 ){
1731       closePendingFds(pFile);
1732     }
1733   }
1734
1735 end_unlock:
1736   unixLeaveMutex();
1737   if( rc==SQLITE_OK ) pFile->eFileLock = eFileLock;
1738   return rc;
1739 }
1740
1741 /*
1742 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
1743 ** must be either NO_LOCK or SHARED_LOCK.
1744 **
1745 ** If the locking level of the file descriptor is already at or below
1746 ** the requested locking level, this routine is a no-op.
1747 */
1748 static int unixUnlock(sqlite3_file *id, int eFileLock){
1749   return posixUnlock(id, eFileLock, 0);
1750 }
1751
1752 /*
1753 ** This function performs the parts of the "close file" operation
1754 ** common to all locking schemes. It closes the directory and file
1755 ** handles, if they are valid, and sets all fields of the unixFile
1756 ** structure to 0.
1757 **
1758 ** It is *not* necessary to hold the mutex when this routine is called,
1759 ** even on VxWorks.  A mutex will be acquired on VxWorks by the
1760 ** vxworksReleaseFileId() routine.
1761 */
1762 static int closeUnixFile(sqlite3_file *id){
1763   unixFile *pFile = (unixFile*)id;
1764   if( pFile->h>=0 ){
1765     robust_close(pFile, pFile->h, __LINE__);
1766     pFile->h = -1;
1767   }
1768 #if OS_VXWORKS
1769   if( pFile->pId ){
1770     if( pFile->ctrlFlags & UNIXFILE_DELETE ){
1771       osUnlink(pFile->pId->zCanonicalName);
1772     }
1773     vxworksReleaseFileId(pFile->pId);
1774     pFile->pId = 0;
1775   }
1776 #endif
1777   OSTRACE(("CLOSE   %-3d\n", pFile->h));
1778   OpenCounter(-1);
1779   sqlite3_free(pFile->pUnused);
1780   memset(pFile, 0, sizeof(unixFile));
1781   return SQLITE_OK;
1782 }
1783
1784 /*
1785 ** Close a file.
1786 */
1787 static int unixClose(sqlite3_file *id){
1788   int rc = SQLITE_OK;
1789   unixFile *pFile = (unixFile *)id;
1790   unixUnlock(id, NO_LOCK);
1791   unixEnterMutex();
1792
1793   /* unixFile.pInode is always valid here. Otherwise, a different close
1794   ** routine (e.g. nolockClose()) would be called instead.
1795   */
1796   assert( pFile->pInode->nLock>0 || pFile->pInode->bProcessLock==0 );
1797   if( ALWAYS(pFile->pInode) && pFile->pInode->nLock ){
1798     /* If there are outstanding locks, do not actually close the file just
1799     ** yet because that would clear those locks.  Instead, add the file
1800     ** descriptor to pInode->pUnused list.  It will be automatically closed
1801     ** when the last lock is cleared.
1802     */
1803     setPendingFd(pFile);
1804   }
1805   releaseInodeInfo(pFile);
1806   rc = closeUnixFile(id);
1807   unixLeaveMutex();
1808   return rc;
1809 }
1810
1811 /************** End of the posix advisory lock implementation *****************
1812 ******************************************************************************/
1813
1814 /******************************************************************************
1815 ****************************** No-op Locking **********************************
1816 **
1817 ** Of the various locking implementations available, this is by far the
1818 ** simplest:  locking is ignored.  No attempt is made to lock the database
1819 ** file for reading or writing.
1820 **
1821 ** This locking mode is appropriate for use on read-only databases
1822 ** (for example, databases that are burned onto a CD-ROM).  It can
1823 ** also be used if the application employs some external mechanism to
1824 ** prevent simultaneous access of the same database by two or more
1825 ** database connections.  But there is a serious risk of database
1826 ** corruption if this locking mode is used in situations where multiple
1827 ** database connections are accessing the same database file at the same
1828 ** time and one or more of those connections are writing.
1829 */
1830
1831 static int nolockCheckReservedLock(sqlite3_file *NotUsed, int *pResOut){
1832   UNUSED_PARAMETER(NotUsed);
1833   *pResOut = 0;
1834   return SQLITE_OK;
1835 }
1836 static int nolockLock(sqlite3_file *NotUsed, int NotUsed2){
1837   UNUSED_PARAMETER2(NotUsed, NotUsed2);
1838   return SQLITE_OK;
1839 }
1840 static int nolockUnlock(sqlite3_file *NotUsed, int NotUsed2){
1841   UNUSED_PARAMETER2(NotUsed, NotUsed2);
1842   return SQLITE_OK;
1843 }
1844
1845 /*
1846 ** Close the file.
1847 */
1848 static int nolockClose(sqlite3_file *id) {
1849   return closeUnixFile(id);
1850 }
1851
1852 /******************* End of the no-op lock implementation *********************
1853 ******************************************************************************/
1854
1855 /******************************************************************************
1856 ************************* Begin dot-file Locking ******************************
1857 **
1858 ** The dotfile locking implementation uses the existence of separate lock
1859 ** files (really a directory) to control access to the database.  This works
1860 ** on just about every filesystem imaginable.  But there are serious downsides:
1861 **
1862 **    (1)  There is zero concurrency.  A single reader blocks all other
1863 **         connections from reading or writing the database.
1864 **
1865 **    (2)  An application crash or power loss can leave stale lock files
1866 **         sitting around that need to be cleared manually.
1867 **
1868 ** Nevertheless, a dotlock is an appropriate locking mode for use if no
1869 ** other locking strategy is available.
1870 **
1871 ** Dotfile locking works by creating a subdirectory in the same directory as
1872 ** the database and with the same name but with a ".lock" extension added.
1873 ** The existence of a lock directory implies an EXCLUSIVE lock.  All other
1874 ** lock types (SHARED, RESERVED, PENDING) are mapped into EXCLUSIVE.
1875 */
1876
1877 /*
1878 ** The file suffix added to the data base filename in order to create the
1879 ** lock directory.
1880 */
1881 #define DOTLOCK_SUFFIX ".lock"
1882
1883 /*
1884 ** This routine checks if there is a RESERVED lock held on the specified
1885 ** file by this or any other process. If such a lock is held, set *pResOut
1886 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
1887 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
1888 **
1889 ** In dotfile locking, either a lock exists or it does not.  So in this
1890 ** variation of CheckReservedLock(), *pResOut is set to true if any lock
1891 ** is held on the file and false if the file is unlocked.
1892 */
1893 static int dotlockCheckReservedLock(sqlite3_file *id, int *pResOut) {
1894   int rc = SQLITE_OK;
1895   int reserved = 0;
1896   unixFile *pFile = (unixFile*)id;
1897
1898   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
1899
1900   assert( pFile );
1901
1902   /* Check if a thread in this process holds such a lock */
1903   if( pFile->eFileLock>SHARED_LOCK ){
1904     /* Either this connection or some other connection in the same process
1905     ** holds a lock on the file.  No need to check further. */
1906     reserved = 1;
1907   }else{
1908     /* The lock is held if and only if the lockfile exists */
1909     const char *zLockFile = (const char*)pFile->lockingContext;
1910     reserved = osAccess(zLockFile, 0)==0;
1911   }
1912   OSTRACE(("TEST WR-LOCK %d %d %d (dotlock)\n", pFile->h, rc, reserved));
1913   *pResOut = reserved;
1914   return rc;
1915 }
1916
1917 /*
1918 ** Lock the file with the lock specified by parameter eFileLock - one
1919 ** of the following:
1920 **
1921 **     (1) SHARED_LOCK
1922 **     (2) RESERVED_LOCK
1923 **     (3) PENDING_LOCK
1924 **     (4) EXCLUSIVE_LOCK
1925 **
1926 ** Sometimes when requesting one lock state, additional lock states
1927 ** are inserted in between.  The locking might fail on one of the later
1928 ** transitions leaving the lock state different from what it started but
1929 ** still short of its goal.  The following chart shows the allowed
1930 ** transitions and the inserted intermediate states:
1931 **
1932 **    UNLOCKED -> SHARED
1933 **    SHARED -> RESERVED
1934 **    SHARED -> (PENDING) -> EXCLUSIVE
1935 **    RESERVED -> (PENDING) -> EXCLUSIVE
1936 **    PENDING -> EXCLUSIVE
1937 **
1938 ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
1939 ** routine to lower a locking level.
1940 **
1941 ** With dotfile locking, we really only support state (4): EXCLUSIVE.
1942 ** But we track the other locking levels internally.
1943 */
1944 static int dotlockLock(sqlite3_file *id, int eFileLock) {
1945   unixFile *pFile = (unixFile*)id;
1946   char *zLockFile = (char *)pFile->lockingContext;
1947   int rc = SQLITE_OK;
1948
1949
1950   /* If we have any lock, then the lock file already exists.  All we have
1951   ** to do is adjust our internal record of the lock level.
1952   */
1953   if( pFile->eFileLock > NO_LOCK ){
1954     pFile->eFileLock = eFileLock;
1955     /* Always update the timestamp on the old file */
1956 #ifdef HAVE_UTIME
1957     utime(zLockFile, NULL);
1958 #else
1959     utimes(zLockFile, NULL);
1960 #endif
1961     return SQLITE_OK;
1962   }
1963
1964   /* grab an exclusive lock */
1965   rc = osMkdir(zLockFile, 0777);
1966   if( rc<0 ){
1967     /* failed to open/create the lock directory */
1968     int tErrno = errno;
1969     if( EEXIST == tErrno ){
1970       rc = SQLITE_BUSY;
1971     } else {
1972       rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
1973       if( IS_LOCK_ERROR(rc) ){
1974         pFile->lastErrno = tErrno;
1975       }
1976     }
1977     return rc;
1978   }
1979
1980   /* got it, set the type and return ok */
1981   pFile->eFileLock = eFileLock;
1982   return rc;
1983 }
1984
1985 /*
1986 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
1987 ** must be either NO_LOCK or SHARED_LOCK.
1988 **
1989 ** If the locking level of the file descriptor is already at or below
1990 ** the requested locking level, this routine is a no-op.
1991 **
1992 ** When the locking level reaches NO_LOCK, delete the lock file.
1993 */
1994 static int dotlockUnlock(sqlite3_file *id, int eFileLock) {
1995   unixFile *pFile = (unixFile*)id;
1996   char *zLockFile = (char *)pFile->lockingContext;
1997   int rc;
1998
1999   assert( pFile );
2000   OSTRACE(("UNLOCK  %d %d was %d pid=%d (dotlock)\n", pFile->h, eFileLock,
2001            pFile->eFileLock, getpid()));
2002   assert( eFileLock<=SHARED_LOCK );
2003
2004   /* no-op if possible */
2005   if( pFile->eFileLock==eFileLock ){
2006     return SQLITE_OK;
2007   }
2008
2009   /* To downgrade to shared, simply update our internal notion of the
2010   ** lock state.  No need to mess with the file on disk.
2011   */
2012   if( eFileLock==SHARED_LOCK ){
2013     pFile->eFileLock = SHARED_LOCK;
2014     return SQLITE_OK;
2015   }
2016
2017   /* To fully unlock the database, delete the lock file */
2018   assert( eFileLock==NO_LOCK );
2019   rc = osRmdir(zLockFile);
2020   if( rc<0 && errno==ENOTDIR ) rc = osUnlink(zLockFile);
2021   if( rc<0 ){
2022     int tErrno = errno;
2023     rc = 0;
2024     if( ENOENT != tErrno ){
2025       rc = SQLITE_IOERR_UNLOCK;
2026     }
2027     if( IS_LOCK_ERROR(rc) ){
2028       pFile->lastErrno = tErrno;
2029     }
2030     return rc;
2031   }
2032   pFile->eFileLock = NO_LOCK;
2033   return SQLITE_OK;
2034 }
2035
2036 /*
2037 ** Close a file.  Make sure the lock has been released before closing.
2038 */
2039 static int dotlockClose(sqlite3_file *id) {
2040   int rc;
2041   if( id ){
2042     unixFile *pFile = (unixFile*)id;
2043     dotlockUnlock(id, NO_LOCK);
2044     sqlite3_free(pFile->lockingContext);
2045   }
2046   rc = closeUnixFile(id);
2047   return rc;
2048 }
2049 /****************** End of the dot-file lock implementation *******************
2050 ******************************************************************************/
2051
2052 /******************************************************************************
2053 ************************** Begin flock Locking ********************************
2054 **
2055 ** Use the flock() system call to do file locking.
2056 **
2057 ** flock() locking is like dot-file locking in that the various
2058 ** fine-grain locking levels supported by SQLite are collapsed into
2059 ** a single exclusive lock.  In other words, SHARED, RESERVED, and
2060 ** PENDING locks are the same thing as an EXCLUSIVE lock.  SQLite
2061 ** still works when you do this, but concurrency is reduced since
2062 ** only a single process can be reading the database at a time.
2063 **
2064 ** Omit this section if SQLITE_ENABLE_LOCKING_STYLE is turned off or if
2065 ** compiling for VXWORKS.
2066 */
2067 #if SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORKS
2068
/*
** Wrapper around flock() that repeats the call for as long as it fails
** with errno set to EINTR.  The value of the final flock() call is
** returned.  On platforms without EINTR the wrapper is just flock().
*/
#ifdef EINTR
static int robust_flock(int fd, int op){
  int rc;
  for(;;){
    rc = flock(fd, op);
    if( rc>=0 || errno!=EINTR ) break;
  }
  return rc;
}
#else
# define robust_flock(a,b) flock(a,b)
#endif
2081
2082
2083 /*
2084 ** This routine checks if there is a RESERVED lock held on the specified
2085 ** file by this or any other process. If such a lock is held, set *pResOut
2086 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
2087 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
2088 */
2089 static int flockCheckReservedLock(sqlite3_file *id, int *pResOut){
2090   int rc = SQLITE_OK;
2091   int reserved = 0;
2092   unixFile *pFile = (unixFile*)id;
2093
2094   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2095
2096   assert( pFile );
2097
2098   /* Check if a thread in this process holds such a lock */
2099   if( pFile->eFileLock>SHARED_LOCK ){
2100     reserved = 1;
2101   }
2102
2103   /* Otherwise see if some other process holds it. */
2104   if( !reserved ){
2105     /* attempt to get the lock */
2106     int lrc = robust_flock(pFile->h, LOCK_EX | LOCK_NB);
2107     if( !lrc ){
2108       /* got the lock, unlock it */
2109       lrc = robust_flock(pFile->h, LOCK_UN);
2110       if ( lrc ) {
2111         int tErrno = errno;
2112         /* unlock failed with an error */
2113         lrc = SQLITE_IOERR_UNLOCK;
2114         if( IS_LOCK_ERROR(lrc) ){
2115           pFile->lastErrno = tErrno;
2116           rc = lrc;
2117         }
2118       }
2119     } else {
2120       int tErrno = errno;
2121       reserved = 1;
2122       /* someone else might have it reserved */
2123       lrc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
2124       if( IS_LOCK_ERROR(lrc) ){
2125         pFile->lastErrno = tErrno;
2126         rc = lrc;
2127       }
2128     }
2129   }
2130   OSTRACE(("TEST WR-LOCK %d %d %d (flock)\n", pFile->h, rc, reserved));
2131
2132 #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
2133   if( (rc & SQLITE_IOERR) == SQLITE_IOERR ){
2134     rc = SQLITE_OK;
2135     reserved=1;
2136   }
2137 #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2138   *pResOut = reserved;
2139   return rc;
2140 }
2141
2142 /*
2143 ** Lock the file with the lock specified by parameter eFileLock - one
2144 ** of the following:
2145 **
2146 **     (1) SHARED_LOCK
2147 **     (2) RESERVED_LOCK
2148 **     (3) PENDING_LOCK
2149 **     (4) EXCLUSIVE_LOCK
2150 **
2151 ** Sometimes when requesting one lock state, additional lock states
2152 ** are inserted in between.  The locking might fail on one of the later
2153 ** transitions leaving the lock state different from what it started but
2154 ** still short of its goal.  The following chart shows the allowed
2155 ** transitions and the inserted intermediate states:
2156 **
2157 **    UNLOCKED -> SHARED
2158 **    SHARED -> RESERVED
2159 **    SHARED -> (PENDING) -> EXCLUSIVE
2160 **    RESERVED -> (PENDING) -> EXCLUSIVE
2161 **    PENDING -> EXCLUSIVE
2162 **
2163 ** flock() only really supports EXCLUSIVE locks.  We track intermediate
2164 ** lock states in the sqlite3_file structure, but all locks SHARED or
2165 ** above are really EXCLUSIVE locks and exclude all other processes from
2166 ** accessing the file.
2167 **
2168 ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
2169 ** routine to lower a locking level.
2170 */
2171 static int flockLock(sqlite3_file *id, int eFileLock) {
2172   int rc = SQLITE_OK;
2173   unixFile *pFile = (unixFile*)id;
2174
2175   assert( pFile );
2176
2177   /* if we already have a lock, it is exclusive.
2178   ** Just adjust level and punt on outta here. */
2179   if (pFile->eFileLock > NO_LOCK) {
2180     pFile->eFileLock = eFileLock;
2181     return SQLITE_OK;
2182   }
2183
2184   /* grab an exclusive lock */
2185
2186   if (robust_flock(pFile->h, LOCK_EX | LOCK_NB)) {
2187     int tErrno = errno;
2188     /* didn't get, must be busy */
2189     rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
2190     if( IS_LOCK_ERROR(rc) ){
2191       pFile->lastErrno = tErrno;
2192     }
2193   } else {
2194     /* got it, set the type and return ok */
2195     pFile->eFileLock = eFileLock;
2196   }
2197   OSTRACE(("LOCK    %d %s %s (flock)\n", pFile->h, azFileLock(eFileLock),
2198            rc==SQLITE_OK ? "ok" : "failed"));
2199 #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
2200   if( (rc & SQLITE_IOERR) == SQLITE_IOERR ){
2201     rc = SQLITE_BUSY;
2202   }
2203 #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2204   return rc;
2205 }
2206
2207
2208 /*
2209 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
2210 ** must be either NO_LOCK or SHARED_LOCK.
2211 **
2212 ** If the locking level of the file descriptor is already at or below
2213 ** the requested locking level, this routine is a no-op.
2214 */
2215 static int flockUnlock(sqlite3_file *id, int eFileLock) {
2216   unixFile *pFile = (unixFile*)id;
2217
2218   assert( pFile );
2219   OSTRACE(("UNLOCK  %d %d was %d pid=%d (flock)\n", pFile->h, eFileLock,
2220            pFile->eFileLock, getpid()));
2221   assert( eFileLock<=SHARED_LOCK );
2222
2223   /* no-op if possible */
2224   if( pFile->eFileLock==eFileLock ){
2225     return SQLITE_OK;
2226   }
2227
2228   /* shared can just be set because we always have an exclusive */
2229   if (eFileLock==SHARED_LOCK) {
2230     pFile->eFileLock = eFileLock;
2231     return SQLITE_OK;
2232   }
2233
2234   /* no, really, unlock. */
2235   if( robust_flock(pFile->h, LOCK_UN) ){
2236 #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
2237     return SQLITE_OK;
2238 #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2239     return SQLITE_IOERR_UNLOCK;
2240   }else{
2241     pFile->eFileLock = NO_LOCK;
2242     return SQLITE_OK;
2243   }
2244 }
2245
2246 /*
2247 ** Close a file.
2248 */
2249 static int flockClose(sqlite3_file *id) {
2250   if( id ){
2251     flockUnlock(id, NO_LOCK);
2252   }
2253   return closeUnixFile(id);
2254 }
2255
2256 #endif /* SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORK */
2257
2258 /******************* End of the flock lock implementation *********************
2259 ******************************************************************************/
2260
2261 /******************************************************************************
2262 ************************ Begin Named Semaphore Locking ************************
2263 **
2264 ** Named semaphore locking is only supported on VxWorks.
2265 **
2266 ** Semaphore locking is like dot-lock and flock in that it really only
2267 ** supports EXCLUSIVE locking.  Only a single process can read or write
2268 ** the database file at a time.  This reduces potential concurrency, but
2269 ** makes the lock implementation much easier.
2270 */
2271 #if OS_VXWORKS
2272
2273 /*
2274 ** This routine checks if there is a RESERVED lock held on the specified
2275 ** file by this or any other process. If such a lock is held, set *pResOut
2276 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
2277 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
2278 */
2279 static int semCheckReservedLock(sqlite3_file *id, int *pResOut) {
2280   int rc = SQLITE_OK;
2281   int reserved = 0;
2282   unixFile *pFile = (unixFile*)id;
2283
2284   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2285
2286   assert( pFile );
2287
2288   /* Check if a thread in this process holds such a lock */
2289   if( pFile->eFileLock>SHARED_LOCK ){
2290     reserved = 1;
2291   }
2292
2293   /* Otherwise see if some other process holds it. */
2294   if( !reserved ){
2295     sem_t *pSem = pFile->pInode->pSem;
2296     struct stat statBuf;
2297
2298     if( sem_trywait(pSem)==-1 ){
2299       int tErrno = errno;
2300       if( EAGAIN != tErrno ){
2301         rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_CHECKRESERVEDLOCK);
2302         pFile->lastErrno = tErrno;
2303       } else {
2304         /* someone else has the lock when we are in NO_LOCK */
2305         reserved = (pFile->eFileLock < SHARED_LOCK);
2306       }
2307     }else{
2308       /* we could have it if we want it */
2309       sem_post(pSem);
2310     }
2311   }
2312   OSTRACE(("TEST WR-LOCK %d %d %d (sem)\n", pFile->h, rc, reserved));
2313
2314   *pResOut = reserved;
2315   return rc;
2316 }
2317
2318 /*
2319 ** Lock the file with the lock specified by parameter eFileLock - one
2320 ** of the following:
2321 **
2322 **     (1) SHARED_LOCK
2323 **     (2) RESERVED_LOCK
2324 **     (3) PENDING_LOCK
2325 **     (4) EXCLUSIVE_LOCK
2326 **
2327 ** Sometimes when requesting one lock state, additional lock states
2328 ** are inserted in between.  The locking might fail on one of the later
2329 ** transitions leaving the lock state different from what it started but
2330 ** still short of its goal.  The following chart shows the allowed
2331 ** transitions and the inserted intermediate states:
2332 **
2333 **    UNLOCKED -> SHARED
2334 **    SHARED -> RESERVED
2335 **    SHARED -> (PENDING) -> EXCLUSIVE
2336 **    RESERVED -> (PENDING) -> EXCLUSIVE
2337 **    PENDING -> EXCLUSIVE
2338 **
2339 ** Semaphore locks only really support EXCLUSIVE locks.  We track intermediate
2340 ** lock states in the sqlite3_file structure, but all locks SHARED or
2341 ** above are really EXCLUSIVE locks and exclude all other processes from
2342 ** accessing the file.
2343 **
2344 ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
2345 ** routine to lower a locking level.
2346 */
2347 static int semLock(sqlite3_file *id, int eFileLock) {
2348   unixFile *pFile = (unixFile*)id;
2349   int fd;
2350   sem_t *pSem = pFile->pInode->pSem;
2351   int rc = SQLITE_OK;
2352
2353   /* if we already have a lock, it is exclusive.
2354   ** Just adjust level and punt on outta here. */
2355   if (pFile->eFileLock > NO_LOCK) {
2356     pFile->eFileLock = eFileLock;
2357     rc = SQLITE_OK;
2358     goto sem_end_lock;
2359   }
2360
2361   /* lock semaphore now but bail out when already locked. */
2362   if( sem_trywait(pSem)==-1 ){
2363     rc = SQLITE_BUSY;
2364     goto sem_end_lock;
2365   }
2366
2367   /* got it, set the type and return ok */
2368   pFile->eFileLock = eFileLock;
2369
2370  sem_end_lock:
2371   return rc;
2372 }
2373
2374 /*
2375 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
2376 ** must be either NO_LOCK or SHARED_LOCK.
2377 **
2378 ** If the locking level of the file descriptor is already at or below
2379 ** the requested locking level, this routine is a no-op.
2380 */
2381 static int semUnlock(sqlite3_file *id, int eFileLock) {
2382   unixFile *pFile = (unixFile*)id;
2383   sem_t *pSem = pFile->pInode->pSem;
2384
2385   assert( pFile );
2386   assert( pSem );
2387   OSTRACE(("UNLOCK  %d %d was %d pid=%d (sem)\n", pFile->h, eFileLock,
2388            pFile->eFileLock, getpid()));
2389   assert( eFileLock<=SHARED_LOCK );
2390
2391   /* no-op if possible */
2392   if( pFile->eFileLock==eFileLock ){
2393     return SQLITE_OK;
2394   }
2395
2396   /* shared can just be set because we always have an exclusive */
2397   if (eFileLock==SHARED_LOCK) {
2398     pFile->eFileLock = eFileLock;
2399     return SQLITE_OK;
2400   }
2401
2402   /* no, really unlock. */
2403   if ( sem_post(pSem)==-1 ) {
2404     int rc, tErrno = errno;
2405     rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_UNLOCK);
2406     if( IS_LOCK_ERROR(rc) ){
2407       pFile->lastErrno = tErrno;
2408     }
2409     return rc;
2410   }
2411   pFile->eFileLock = NO_LOCK;
2412   return SQLITE_OK;
2413 }
2414
2415 /*
2416  ** Close a file.
2417  */
2418 static int semClose(sqlite3_file *id) {
2419   if( id ){
2420     unixFile *pFile = (unixFile*)id;
2421     semUnlock(id, NO_LOCK);
2422     assert( pFile );
2423     unixEnterMutex();
2424     releaseInodeInfo(pFile);
2425     unixLeaveMutex();
2426     closeUnixFile(id);
2427   }
2428   return SQLITE_OK;
2429 }
2430
2431 #endif /* OS_VXWORKS */
2432 /*
2433 ** Named semaphore locking is only available on VxWorks.
2434 **
2435 *************** End of the named semaphore lock implementation ****************
2436 ******************************************************************************/
2437
2438
2439 /******************************************************************************
2440 *************************** Begin AFP Locking *********************************
2441 **
2442 ** AFP is the Apple Filing Protocol.  AFP is a network filesystem found
2443 ** on Apple Macintosh computers - both OS9 and OSX.
2444 **
2445 ** Third-party implementations of AFP are available.  But this code here
2446 ** only works on OSX.
2447 */
2448
2449 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** All of the state that is specific to AFP locking lives in an
** afpLockingContext object.
*/
typedef struct afpLockingContext {
  int reserved;                   /* If nonzero, afpCheckReservedLock()
                                  ** reports a RESERVED lock at once */
  const char *dbPath;             /* Name of the open file */
} afpLockingContext;
2458
/* Parameter block passed to fsctl() by afpSetLock() to lock or unlock a
** byte range.  The member order and types must not be changed. */
struct ByteRangeLockPB2 {
  unsigned long long offset;        /* Offset of the first byte of the range */
  unsigned long long length;        /* Number of bytes in the range */
  unsigned long long retRangeStart; /* Number of the first byte locked, if
                                    ** the call is successful */
  unsigned char unLockFlag;         /* 1 to unlock, 0 to lock */
  unsigned char startEndFlag;       /* 1 = relative to the end of the fork,
                                    ** 0 = relative to the start */
  int fd;                           /* File descriptor to associate with
                                    ** this lock */
};

/* fsctl() request code used together with struct ByteRangeLockPB2 */
#define afpfsByteRangeLock2FSCTL        _IOWR('z', 23, struct ByteRangeLockPB2)
2470
2471 /*
2472 ** This is a utility for setting or clearing a bit-range lock on an
2473 ** AFP filesystem.
2474 **
2475 ** Return SQLITE_OK on success, SQLITE_BUSY on failure.
2476 */
2477 static int afpSetLock(
2478   const char *path,              /* Name of the file to be locked or unlocked */
2479   unixFile *pFile,               /* Open file descriptor on path */
2480   unsigned long long offset,     /* First byte to be locked */
2481   unsigned long long length,     /* Number of bytes to lock */
2482   int setLockFlag                /* True to set lock.  False to clear lock */
2483 ){
2484   struct ByteRangeLockPB2 pb;
2485   int err;
2486
2487   pb.unLockFlag = setLockFlag ? 0 : 1;
2488   pb.startEndFlag = 0;
2489   pb.offset = offset;
2490   pb.length = length;
2491   pb.fd = pFile->h;
2492
2493   OSTRACE(("AFPSETLOCK [%s] for %d%s in range %llx:%llx\n",
2494     (setLockFlag?"ON":"OFF"), pFile->h, (pb.fd==-1?"[testval-1]":""),
2495     offset, length));
2496   err = fsctl(path, afpfsByteRangeLock2FSCTL, &pb, 0);
2497   if ( err==-1 ) {
2498     int rc;
2499     int tErrno = errno;
2500     OSTRACE(("AFPSETLOCK failed to fsctl() '%s' %d %s\n",
2501              path, tErrno, strerror(tErrno)));
2502 #ifdef SQLITE_IGNORE_AFP_LOCK_ERRORS
2503     rc = SQLITE_BUSY;
2504 #else
2505     rc = sqliteErrorFromPosixError(tErrno,
2506                     setLockFlag ? SQLITE_IOERR_LOCK : SQLITE_IOERR_UNLOCK);
2507 #endif /* SQLITE_IGNORE_AFP_LOCK_ERRORS */
2508     if( IS_LOCK_ERROR(rc) ){
2509       pFile->lastErrno = tErrno;
2510     }
2511     return rc;
2512   } else {
2513     return SQLITE_OK;
2514   }
2515 }
2516
2517 /*
2518 ** This routine checks if there is a RESERVED lock held on the specified
2519 ** file by this or any other process. If such a lock is held, set *pResOut
2520 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
2521 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
2522 */
2523 static int afpCheckReservedLock(sqlite3_file *id, int *pResOut){
2524   int rc = SQLITE_OK;
2525   int reserved = 0;
2526   unixFile *pFile = (unixFile*)id;
2527   afpLockingContext *context;
2528
2529   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2530
2531   assert( pFile );
2532   context = (afpLockingContext *) pFile->lockingContext;
2533   if( context->reserved ){
2534     *pResOut = 1;
2535     return SQLITE_OK;
2536   }
2537   unixEnterMutex(); /* Because pFile->pInode is shared across threads */
2538
2539   /* Check if a thread in this process holds such a lock */
2540   if( pFile->pInode->eFileLock>SHARED_LOCK ){
2541     reserved = 1;
2542   }
2543
2544   /* Otherwise see if some other process holds it.
2545    */
2546   if( !reserved ){
2547     /* lock the RESERVED byte */
2548     int lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
2549     if( SQLITE_OK==lrc ){
2550       /* if we succeeded in taking the reserved lock, unlock it to restore
2551       ** the original state */
2552       lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
2553     } else {
2554       /* if we failed to get the lock then someone else must have it */
2555       reserved = 1;
2556     }
2557     if( IS_LOCK_ERROR(lrc) ){
2558       rc=lrc;
2559     }
2560   }
2561
2562   unixLeaveMutex();
2563   OSTRACE(("TEST WR-LOCK %d %d %d (afp)\n", pFile->h, rc, reserved));
2564
2565   *pResOut = reserved;
2566   return rc;
2567 }
2568
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
**     (1) SHARED_LOCK
**     (2) RESERVED_LOCK
**     (3) PENDING_LOCK
**     (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between.  The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal.  The following chart shows the allowed
** transitions and the inserted intermediate states:
**
**    UNLOCKED -> SHARED
**    SHARED -> RESERVED
**    SHARED -> (PENDING) -> EXCLUSIVE
**    RESERVED -> (PENDING) -> EXCLUSIVE
**    PENDING -> EXCLUSIVE
**
** This routine will only increase a lock.  Use the sqlite3OsUnlock()
** routine to lower a locking level.
**
** Parameters:
**    id         The file to lock; it is cast to a unixFile.
**    eFileLock  The requested lock level.
**
** Returns SQLITE_OK on success (including the case where the handle
** already holds a lock at or above eFileLock).  Returns SQLITE_BUSY when
** another handle in this process holds a conflicting lock.  Otherwise the
** result of the failing afpSetLock() call is returned, or SQLITE_IOERR_LOCK
** in one of the cases where the shared lock cannot be re-established after
** a failed attempt on the exclusive range.
*/
static int afpLock(sqlite3_file *id, int eFileLock){
  int rc = SQLITE_OK;
  unixFile *pFile = (unixFile*)id;
  unixInodeInfo *pInode = pFile->pInode;
  afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;

  assert( pFile );
  OSTRACE(("LOCK    %d %s was %s(%s,%d) pid=%d (afp)\n", pFile->h,
           azFileLock(eFileLock), azFileLock(pFile->eFileLock),
           azFileLock(pInode->eFileLock), pInode->nShared , getpid()));

  /* If there is already a lock of this type or more restrictive on the
  ** unixFile, do nothing. Don't use the afp_end_lock: exit path, as
  ** unixEnterMutex() hasn't been called yet.
  */
  if( pFile->eFileLock>=eFileLock ){
    OSTRACE(("LOCK    %d %s ok (already held) (afp)\n", pFile->h,
           azFileLock(eFileLock)));
    return SQLITE_OK;
  }

  /* Make sure the locking sequence is correct
  **  (1) We never move from unlocked to anything higher than shared lock.
  **  (2) SQLite never explicitly requests a pending lock.
  **  (3) A shared lock is always held when a reserve lock is requested.
  */
  assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
  assert( eFileLock!=PENDING_LOCK );
  assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );

  /* This mutex is needed because pFile->pInode is shared across threads
  */
  unixEnterMutex();
  pInode = pFile->pInode;

  /* If some thread using this PID has a lock via a different unixFile*
  ** handle that precludes the requested lock, return BUSY.
  */
  if( (pFile->eFileLock!=pInode->eFileLock &&
       (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
     ){
    rc = SQLITE_BUSY;
    goto afp_end_lock;
  }

  /* If a SHARED lock is requested, and some thread using this PID already
  ** has a SHARED or RESERVED lock, then increment reference counts and
  ** return SQLITE_OK.  No operating system call is made on this path.
  */
  if( eFileLock==SHARED_LOCK &&
     (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
    assert( eFileLock==SHARED_LOCK );
    assert( pFile->eFileLock==0 );
    assert( pInode->nShared>0 );
    pFile->eFileLock = SHARED_LOCK;
    pInode->nShared++;
    pInode->nLock++;
    goto afp_end_lock;
  }

  /* A PENDING lock is needed before acquiring a SHARED lock and before
  ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
  ** be released.
  */
  if( eFileLock==SHARED_LOCK
      || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
  ){
    int failed;
    failed = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 1);
    if (failed) {
      rc = failed;
      goto afp_end_lock;
    }
  }

  /* If control gets to this point, then actually go ahead and make
  ** operating system calls for the specified lock.
  */
  if( eFileLock==SHARED_LOCK ){
    int lrc1, lrc2, lrc1Errno = 0;
    long lk, mask;

    assert( pInode->nShared==0 );
    assert( pInode->eFileLock==0 );

    mask = (sizeof(long)==8) ? LARGEST_INT64 : 0x7fffffff;
    /* Now get the read-lock SHARED_LOCK */
    /* note that the quality of the randomness doesn't matter that much */
    lk = random();
    /* The shared lock is a single byte at a random position inside the
    ** shared range; the position is remembered in pInode->sharedByte so
    ** that the same byte can be unlocked later. */
    pInode->sharedByte = (lk & mask)%(SHARED_SIZE - 1);
    lrc1 = afpSetLock(context->dbPath, pFile,
          SHARED_FIRST+pInode->sharedByte, 1, 1);
    if( IS_LOCK_ERROR(lrc1) ){
      /* Save the errno now: the unlock below may overwrite lastErrno */
      lrc1Errno = pFile->lastErrno;
    }
    /* Drop the temporary PENDING lock */
    lrc2 = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);

    if( IS_LOCK_ERROR(lrc1) ) {
      pFile->lastErrno = lrc1Errno;
      rc = lrc1;
      goto afp_end_lock;
    } else if( IS_LOCK_ERROR(lrc2) ){
      rc = lrc2;
      goto afp_end_lock;
    } else if( lrc1 != SQLITE_OK ) {
      rc = lrc1;
    } else {
      pFile->eFileLock = SHARED_LOCK;
      pInode->nLock++;
      pInode->nShared = 1;
    }
  }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
    /* We are trying for an exclusive lock but another thread in this
     ** same process is still holding a shared lock. */
    rc = SQLITE_BUSY;
  }else{
    /* The request was for a RESERVED or EXCLUSIVE lock.  It is
    ** assumed that there is a SHARED or greater lock on the file
    ** already.
    */
    int failed = 0;
    assert( 0!=pFile->eFileLock );
    if (eFileLock >= RESERVED_LOCK && pFile->eFileLock < RESERVED_LOCK) {
        /* Acquire a RESERVED lock */
        failed = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
      if( !failed ){
        /* Remember that this handle owns the RESERVED byte */
        context->reserved = 1;
      }
    }
    if (!failed && eFileLock == EXCLUSIVE_LOCK) {
      /* Acquire an EXCLUSIVE lock */

      /* Remove the shared lock before trying the range.  We'll need to
      ** reestablish the shared lock if we can't get the exclusive range.
      */
      if( !(failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST +
                         pInode->sharedByte, 1, 0)) ){
        int failed2 = SQLITE_OK;
        /* now attempt to get the exclusive lock range */
        failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST,
                               SHARED_SIZE, 1);
        if( failed && (failed2 = afpSetLock(context->dbPath, pFile,
                       SHARED_FIRST + pInode->sharedByte, 1, 1)) ){
          /* Can't reestablish the shared lock.  Sqlite can't deal, this is
          ** a critical I/O error
          */
          rc = ((failed & SQLITE_IOERR) == SQLITE_IOERR) ? failed2 :
               SQLITE_IOERR_LOCK;
          goto afp_end_lock;
        }
      }else{
        rc = failed;
      }
    }
    if( failed ){
      rc = failed;
    }
  }

  /* Record the new state.  A failed EXCLUSIVE request that reaches this
  ** point leaves both the handle and the inode marked as PENDING_LOCK.
  ** The "goto afp_end_lock" exits above bypass this bookkeeping.
  */
  if( rc==SQLITE_OK ){
    pFile->eFileLock = eFileLock;
    pInode->eFileLock = eFileLock;
  }else if( eFileLock==EXCLUSIVE_LOCK ){
    pFile->eFileLock = PENDING_LOCK;
    pInode->eFileLock = PENDING_LOCK;
  }

afp_end_lock:
  unixLeaveMutex();
  OSTRACE(("LOCK    %d %s %s (afp)\n", pFile->h, azFileLock(eFileLock),
         rc==SQLITE_OK ? "ok" : "failed"));
  return rc;
}
2767
/*
** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
**
** Returns SQLITE_OK on success.  If any afpSetLock() call fails, its
** result is returned and pFile->eFileLock is left unchanged by the final
** assignment at the bottom of this routine.
*/
static int afpUnlock(sqlite3_file *id, int eFileLock) {
  int rc = SQLITE_OK;
  unixFile *pFile = (unixFile*)id;
  unixInodeInfo *pInode;
  afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
  int skipShared = 0;   /* True if the shared byte need not be unlocked */
#ifdef SQLITE_TEST
  int h = pFile->h;
#endif

  assert( pFile );
  OSTRACE(("UNLOCK  %d %d was %d(%d,%d) pid=%d (afp)\n", pFile->h, eFileLock,
           pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
           getpid()));

  assert( eFileLock<=SHARED_LOCK );
  if( pFile->eFileLock<=eFileLock ){
    return SQLITE_OK;
  }
  unixEnterMutex();
  pInode = pFile->pInode;
  assert( pInode->nShared!=0 );
  if( pFile->eFileLock>SHARED_LOCK ){
    /* This handle holds RESERVED, PENDING or EXCLUSIVE: release the
    ** write-side locks first. */
    assert( pInode->eFileLock==pFile->eFileLock );
    SimulateIOErrorBenign(1);
    SimulateIOError( h=(-1) )
    SimulateIOErrorBenign(0);

#ifndef NDEBUG
    /* When reducing a lock such that other processes can start
    ** reading the database file again, make sure that the
    ** transaction counter was updated if any part of the database
    ** file changed.  If the transaction counter is not updated,
    ** other connections to the same file might not realize that
    ** the file has changed and hence might not know to flush their
    ** cache.  The use of a stale cache can lead to database corruption.
    */
    assert( pFile->inNormalWrite==0
           || pFile->dbUpdate==0
           || pFile->transCntrChng==1 );
    pFile->inNormalWrite = 0;
#endif

    if( pFile->eFileLock==EXCLUSIVE_LOCK ){
      /* Drop the lock on the whole shared range */
      rc = afpSetLock(context->dbPath, pFile, SHARED_FIRST, SHARED_SIZE, 0);
      if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1) ){
        /* only re-establish the shared lock if necessary */
        int sharedLockByte = SHARED_FIRST+pInode->sharedByte;
        rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 1);
      } else {
        /* The single shared byte was not re-locked, so the NO_LOCK code
        ** below must not attempt to unlock it. */
        skipShared = 1;
      }
    }
    if( rc==SQLITE_OK && pFile->eFileLock>=PENDING_LOCK ){
      rc = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);
    }
    if( rc==SQLITE_OK && pFile->eFileLock>=RESERVED_LOCK && context->reserved ){
      rc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
      if( !rc ){
        context->reserved = 0;
      }
    }
    if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1)){
      pInode->eFileLock = SHARED_LOCK;
    }
  }
  if( rc==SQLITE_OK && eFileLock==NO_LOCK ){

    /* Decrement the shared lock counter.  Release the lock using an
    ** OS call only when all threads in this same process have released
    ** the lock.
    */
    unsigned long long sharedLockByte = SHARED_FIRST+pInode->sharedByte;
    pInode->nShared--;
    if( pInode->nShared==0 ){
      SimulateIOErrorBenign(1);
      SimulateIOError( h=(-1) )
      SimulateIOErrorBenign(0);
      if( !skipShared ){
        rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 0);
      }
      if( !rc ){
        pInode->eFileLock = NO_LOCK;
        pFile->eFileLock = NO_LOCK;
      }
    }
    if( rc==SQLITE_OK ){
      /* One fewer lock on the inode; once none remain, the file
      ** descriptors whose close was deferred are handed to
      ** closePendingFds(). */
      pInode->nLock--;
      assert( pInode->nLock>=0 );
      if( pInode->nLock==0 ){
        closePendingFds(pFile);
      }
    }
  }

  unixLeaveMutex();
  if( rc==SQLITE_OK ) pFile->eFileLock = eFileLock;
  return rc;
}
2874
2875 /*
2876 ** Close a file & cleanup AFP specific locking context
2877 */
2878 static int afpClose(sqlite3_file *id) {
2879   int rc = SQLITE_OK;
2880   if( id ){
2881     unixFile *pFile = (unixFile*)id;
2882     afpUnlock(id, NO_LOCK);
2883     unixEnterMutex();
2884     if( pFile->pInode && pFile->pInode->nLock ){
2885       /* If there are outstanding locks, do not actually close the file just
2886       ** yet because that would clear those locks.  Instead, add the file
2887       ** descriptor to pInode->aPending.  It will be automatically closed when
2888       ** the last lock is cleared.
2889       */
2890       setPendingFd(pFile);
2891     }
2892     releaseInodeInfo(pFile);
2893     sqlite3_free(pFile->lockingContext);
2894     rc = closeUnixFile(id);
2895     unixLeaveMutex();
2896   }
2897   return rc;
2898 }
2899
2900 #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
2901 /*
2902 ** The code above is the AFP lock implementation.  The code is specific
2903 ** to MacOSX and does not work on other unix platforms.  No alternative
2904 ** is available.  If you don't compile for a mac, then the "unix-afp"
2905 ** VFS is not available.
2906 **
2907 ********************* End of the AFP lock implementation **********************
2908 ******************************************************************************/
2909
2910 /******************************************************************************
2911 *************************** Begin NFS Locking ********************************/
2912
2913 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
 ** must be either NO_LOCK or SHARED_LOCK.
 **
 ** If the locking level of the file descriptor is already at or below
 ** the requested locking level, this routine is a no-op.
 **
 ** This is a thin wrapper: all of the work is done by posixUnlock(), which
 ** is called with its third argument set to 1, and whose return value is
 ** passed back unchanged.  NOTE(review): the third argument presumably
 ** selects NFS-specific unlock handling -- confirm against posixUnlock().
 */
static int nfsUnlock(sqlite3_file *id, int eFileLock){
  return posixUnlock(id, eFileLock, 1);
}
2924
2925 #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
2926 /*
2927 ** The code above is the NFS lock implementation.  The code is specific
2928 ** to MacOSX and does not work on other unix platforms.  No alternative
2929 ** is available.
2930 **
2931 ********************* End of the NFS lock implementation **********************
2932 ******************************************************************************/
2933
2934 /******************************************************************************
2935 **************** Non-locking sqlite3_file methods *****************************
2936 **
2937 ** The next division contains implementations for all methods of the
2938 ** sqlite3_file object other than the locking methods.  The locking
2939 ** methods were defined in divisions above (one locking method per
2940 ** division).  Those methods that are common to all locking modes
** are gathered together into this division.
2942 */
2943
2944 /*
2945 ** Seek to the offset passed as the second argument, then read cnt
2946 ** bytes into pBuf. Return the number of bytes actually read.
2947 **
2948 ** NB:  If you define USE_PREAD or USE_PREAD64, then it might also
2949 ** be necessary to define _XOPEN_SOURCE to be 500.  This varies from
2950 ** one system to another.  Since SQLite does not define USE_PREAD
2951 ** any any form by default, we will not attempt to define _XOPEN_SOURCE.
2952 ** See tickets #2741 and #2681.
2953 **
2954 ** To avoid stomping the errno value on a failed read the lastErrno value
2955 ** is set before returning.
2956 */
2957 static int seekAndRead(unixFile *id, sqlite3_int64 offset, void *pBuf, int cnt){
2958   int got;
2959   int prior = 0;
2960 #if (!defined(USE_PREAD) && !defined(USE_PREAD64))
2961   i64 newOffset;
2962 #endif
2963   TIMER_START;
2964   do{
2965 #if defined(USE_PREAD)
2966     got = osPread(id->h, pBuf, cnt, offset);
2967     SimulateIOError( got = -1 );
2968 #elif defined(USE_PREAD64)
2969     got = osPread64(id->h, pBuf, cnt, offset);
2970     SimulateIOError( got = -1 );
2971 #else
2972     newOffset = lseek(id->h, offset, SEEK_SET);
2973     SimulateIOError( newOffset-- );
2974     if( newOffset!=offset ){
2975       if( newOffset == -1 ){
2976         ((unixFile*)id)->lastErrno = errno;
2977       }else{
2978         ((unixFile*)id)->lastErrno = 0;
2979       }
2980       return -1;
2981     }
2982     got = osRead(id->h, pBuf, cnt);
2983 #endif
2984     if( got==cnt ) break;
2985     if( got<0 ){
2986       if( errno==EINTR ){ got = 1; continue; }
2987       prior = 0;
2988       ((unixFile*)id)->lastErrno = errno;
2989       break;
2990     }else if( got>0 ){
2991       cnt -= got;
2992       offset += got;
2993       prior += got;
2994       pBuf = (void*)(got + (char*)pBuf);
2995     }
2996   }while( got>0 );
2997   TIMER_END;
2998   OSTRACE(("READ    %-3d %5d %7lld %llu\n",
2999             id->h, got+prior, offset-prior, TIMER_ELAPSED));
3000   return got+prior;
3001 }
3002
3003 /*
3004 ** Read data from a file into a buffer.  Return SQLITE_OK if all
3005 ** bytes were read successfully and SQLITE_IOERR if anything goes
3006 ** wrong.
3007 */
3008 static int unixRead(
3009   sqlite3_file *id,
3010   void *pBuf,
3011   int amt,
3012   sqlite3_int64 offset
3013 ){
3014   unixFile *pFile = (unixFile *)id;
3015   int got;
3016   assert( id );
3017
3018   /* If this is a database file (not a journal, master-journal or temp
3019   ** file), the bytes in the locking range should never be read or written. */
3020 #if 0
3021   assert( pFile->pUnused==0
3022        || offset>=PENDING_BYTE+512
3023        || offset+amt<=PENDING_BYTE
3024   );
3025 #endif
3026
3027   got = seekAndRead(pFile, offset, pBuf, amt);
3028   if( got==amt ){
3029     return SQLITE_OK;
3030   }else if( got<0 ){
3031     /* lastErrno set by seekAndRead */
3032     return SQLITE_IOERR_READ;
3033   }else{
3034     pFile->lastErrno = 0; /* not a system error */
3035     /* Unread parts of the buffer must be zero-filled */
3036     memset(&((char*)pBuf)[got], 0, amt-got);
3037     return SQLITE_IOERR_SHORT_READ;
3038   }
3039 }
3040
3041 /*
3042 ** Seek to the offset in id->offset then read cnt bytes into pBuf.
3043 ** Return the number of bytes actually read.  Update the offset.
3044 **
3045 ** To avoid stomping the errno value on a failed write the lastErrno value
3046 ** is set before returning.
3047 */
3048 static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){
3049   int got;
3050 #if (!defined(USE_PREAD) && !defined(USE_PREAD64))
3051   i64 newOffset;
3052 #endif
3053   TIMER_START;
3054 #if defined(USE_PREAD)
3055   do{ got = osPwrite(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR );
3056 #elif defined(USE_PREAD64)
3057   do{ got = osPwrite64(id->h, pBuf, cnt, offset);}while( got<0 && errno==EINTR);
3058 #else
3059   do{
3060     newOffset = lseek(id->h, offset, SEEK_SET);
3061     SimulateIOError( newOffset-- );
3062     if( newOffset!=offset ){
3063       if( newOffset == -1 ){
3064         ((unixFile*)id)->lastErrno = errno;
3065       }else{
3066         ((unixFile*)id)->lastErrno = 0;
3067       }
3068       return -1;
3069     }
3070     got = osWrite(id->h, pBuf, cnt);
3071   }while( got<0 && errno==EINTR );
3072 #endif
3073   TIMER_END;
3074   if( got<0 ){
3075     ((unixFile*)id)->lastErrno = errno;
3076   }
3077
3078   OSTRACE(("WRITE   %-3d %5d %7lld %llu\n", id->h, got, offset, TIMER_ELAPSED));
3079   return got;
3080 }
3081
3082
3083 /*
3084 ** Write data from a buffer into a file.  Return SQLITE_OK on success
3085 ** or some other error code on failure.
3086 */
3087 static int unixWrite(
3088   sqlite3_file *id,
3089   const void *pBuf,
3090   int amt,
3091   sqlite3_int64 offset
3092 ){
3093   unixFile *pFile = (unixFile*)id;
3094   int wrote = 0;
3095   assert( id );
3096   assert( amt>0 );
3097
3098   /* If this is a database file (not a journal, master-journal or temp
3099   ** file), the bytes in the locking range should never be read or written. */
3100 #if 0
3101   assert( pFile->pUnused==0
3102        || offset>=PENDING_BYTE+512
3103        || offset+amt<=PENDING_BYTE
3104   );
3105 #endif
3106
3107 #ifndef NDEBUG
3108   /* If we are doing a normal write to a database file (as opposed to
3109   ** doing a hot-journal rollback or a write to some file other than a
3110   ** normal database file) then record the fact that the database
3111   ** has changed.  If the transaction counter is modified, record that
3112   ** fact too.
3113   */
3114   if( pFile->inNormalWrite ){
3115     pFile->dbUpdate = 1;  /* The database has been modified */
3116     if( offset<=24 && offset+amt>=27 ){
3117       int rc;
3118       char oldCntr[4];
3119       SimulateIOErrorBenign(1);
3120       rc = seekAndRead(pFile, 24, oldCntr, 4);
3121       SimulateIOErrorBenign(0);
3122       if( rc!=4 || memcmp(oldCntr, &((char*)pBuf)[24-offset], 4)!=0 ){
3123         pFile->transCntrChng = 1;  /* The transaction counter has changed */
3124       }
3125     }
3126   }
3127 #endif
3128
3129   while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){
3130     amt -= wrote;
3131     offset += wrote;
3132     pBuf = &((char*)pBuf)[wrote];
3133   }
3134   SimulateIOError(( wrote=(-1), amt=1 ));
3135   SimulateDiskfullError(( wrote=0, amt=1 ));
3136
3137   if( amt>0 ){
3138     if( wrote<0 && pFile->lastErrno!=ENOSPC ){
3139       /* lastErrno set by seekAndWrite */
3140       return SQLITE_IOERR_WRITE;
3141     }else{
3142       pFile->lastErrno = 0; /* not a system error */
3143       return SQLITE_FULL;
3144     }
3145   }
3146
3147   return SQLITE_OK;
3148 }
3149
#ifdef SQLITE_TEST
/*
** Count the number of fullsyncs and normal syncs.  This is used to test
** that syncs and fullsyncs are occurring at the right times.
**
** full_fsync() increments sqlite3_sync_count on every call and increments
** sqlite3_fullsync_count only when its fullSync argument is non-zero.
*/
int sqlite3_sync_count = 0;
int sqlite3_fullsync_count = 0;
#endif
3158
/*
** We do not trust systems to provide a working fdatasync().  Some do.
** Others do not.  To be safe, we will stick with the (slightly slower)
** fsync(). If you know that your system does support fdatasync() correctly,
** then simply compile with -Dfdatasync=fdatasync
*/
#if !defined(fdatasync)
# define fdatasync fsync
#endif
3168
/*
** Define HAVE_FULLFSYNC to 0 or 1 depending on whether or not
** the F_FULLFSYNC macro is defined.  F_FULLFSYNC is currently
** only available on Mac OS X.  But that could change.
**
** full_fsync() tests HAVE_FULLFSYNC to decide whether the
** fcntl(F_FULLFSYNC) code path is compiled in.
*/
#ifdef F_FULLFSYNC
# define HAVE_FULLFSYNC 1
#else
# define HAVE_FULLFSYNC 0
#endif
3179
3180
3181 /*
3182 ** The fsync() system call does not work as advertised on many
3183 ** unix systems.  The following procedure is an attempt to make
3184 ** it work better.
3185 **
3186 ** The SQLITE_NO_SYNC macro disables all fsync()s.  This is useful
3187 ** for testing when we want to run through the test suite quickly.
3188 ** You are strongly advised *not* to deploy with SQLITE_NO_SYNC
3189 ** enabled, however, since with SQLITE_NO_SYNC enabled, an OS crash
3190 ** or power failure will likely corrupt the database file.
3191 **
3192 ** SQLite sets the dataOnly flag if the size of the file is unchanged.
3193 ** The idea behind dataOnly is that it should only write the file content
3194 ** to disk, not the inode.  We only set dataOnly if the file size is
3195 ** unchanged since the file size is part of the inode.  However,
3196 ** Ted Ts'o tells us that fdatasync() will also write the inode if the
3197 ** file size has changed.  The only real difference between fdatasync()
3198 ** and fsync(), Ted tells us, is that fdatasync() will not flush the
3199 ** inode if the mtime or owner or other inode attributes have changed.
3200 ** We only care about the file size, not the other file attributes, so
3201 ** as far as SQLite is concerned, an fdatasync() is always adequate.
3202 ** So, we always use fdatasync() if it is available, regardless of
3203 ** the value of the dataOnly flag.
3204 */
3205 static int full_fsync(int fd, int fullSync, int dataOnly){
3206   int rc;
3207
3208   /* The following "ifdef/elif/else/" block has the same structure as
3209   ** the one below. It is replicated here solely to avoid cluttering
3210   ** up the real code with the UNUSED_PARAMETER() macros.
3211   */
3212 #ifdef SQLITE_NO_SYNC
3213   UNUSED_PARAMETER(fd);
3214   UNUSED_PARAMETER(fullSync);
3215   UNUSED_PARAMETER(dataOnly);
3216 #elif HAVE_FULLFSYNC
3217   UNUSED_PARAMETER(dataOnly);
3218 #else
3219   UNUSED_PARAMETER(fullSync);
3220   UNUSED_PARAMETER(dataOnly);
3221 #endif
3222
3223   /* Record the number of times that we do a normal fsync() and
3224   ** FULLSYNC.  This is used during testing to verify that this procedure
3225   ** gets called with the correct arguments.
3226   */
3227 #ifdef SQLITE_TEST
3228   if( fullSync ) sqlite3_fullsync_count++;
3229   sqlite3_sync_count++;
3230 #endif
3231
3232   /* If we compiled with the SQLITE_NO_SYNC flag, then syncing is a
3233   ** no-op
3234   */
3235 #ifdef SQLITE_NO_SYNC
3236   rc = SQLITE_OK;
3237 #elif HAVE_FULLFSYNC
3238   if( fullSync ){
3239     rc = osFcntl(fd, F_FULLFSYNC, 0);
3240   }else{
3241     rc = 1;
3242   }
3243   /* If the FULLFSYNC failed, fall back to attempting an fsync().
3244   ** It shouldn't be possible for fullfsync to fail on the local
3245   ** file system (on OSX), so failure indicates that FULLFSYNC
3246   ** isn't supported for this file system. So, attempt an fsync
3247   ** and (for now) ignore the overhead of a superfluous fcntl call.
3248   ** It'd be better to detect fullfsync support once and avoid
3249   ** the fcntl call every time sync is called.
3250   */
3251   if( rc ) rc = fsync(fd);
3252
3253 #elif defined(__APPLE__)
3254   /* fdatasync() on HFS+ doesn't yet flush the file size if it changed correctly
3255   ** so currently we default to the macro that redefines fdatasync to fsync
3256   */
3257   rc = fsync(fd);
3258 #else
3259   rc = fdatasync(fd);
3260 #if OS_VXWORKS
3261   if( rc==-1 && errno==ENOTSUP ){
3262     rc = fsync(fd);
3263   }
3264 #endif /* OS_VXWORKS */
3265 #endif /* ifdef SQLITE_NO_SYNC elif HAVE_FULLFSYNC */
3266
3267   if( OS_VXWORKS && rc!= -1 ){
3268     rc = 0;
3269   }
3270   return rc;
3271 }
3272
3273 /*
3274 ** Open a file descriptor to the directory containing file zFilename.
3275 ** If successful, *pFd is set to the opened file descriptor and
3276 ** SQLITE_OK is returned. If an error occurs, either SQLITE_NOMEM
3277 ** or SQLITE_CANTOPEN is returned and *pFd is set to an undefined
3278 ** value.
3279 **
3280 ** The directory file descriptor is used for only one thing - to
3281 ** fsync() a directory to make sure file creation and deletion events
3282 ** are flushed to disk.  Such fsyncs are not needed on newer
3283 ** journaling filesystems, but are required on older filesystems.
3284 **
3285 ** This routine can be overridden using the xSetSysCall interface.
3286 ** The ability to override this routine was added in support of the
3287 ** chromium sandbox.  Opening a directory is a security risk (we are
3288 ** told) so making it overrideable allows the chromium sandbox to
3289 ** replace this routine with a harmless no-op.  To make this routine
3290 ** a no-op, replace it with a stub that returns SQLITE_OK but leaves
3291 ** *pFd set to a negative number.
3292 **
3293 ** If SQLITE_OK is returned, the caller is responsible for closing
3294 ** the file descriptor *pFd using close().
3295 */
3296 static int openDirectory(const char *zFilename, int *pFd){
3297   int ii;
3298   int fd = -1;
3299   char zDirname[MAX_PATHNAME+1];
3300
3301   sqlite3_snprintf(MAX_PATHNAME, zDirname, "%s", zFilename);
3302   for(ii=(int)strlen(zDirname); ii>1 && zDirname[ii]!='/'; ii--);
3303   if( ii>0 ){
3304     zDirname[ii] = '\0';
3305     fd = robust_open(zDirname, O_RDONLY|O_BINARY, 0);
3306     if( fd>=0 ){
3307 #ifdef FD_CLOEXEC
3308       osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
3309 #endif
3310       OSTRACE(("OPENDIR %-3d %s\n", fd, zDirname));
3311     }
3312   }
3313   *pFd = fd;
3314   return (fd>=0?SQLITE_OK:unixLogError(SQLITE_CANTOPEN_BKPT, "open", zDirname));
3315 }
3316
3317 /*
3318 ** Make sure all writes to a particular file are committed to disk.
3319 **
3320 ** If dataOnly==0 then both the file itself and its metadata (file
3321 ** size, access time, etc) are synced.  If dataOnly!=0 then only the
3322 ** file data is synced.
3323 **
3324 ** Under Unix, also make sure that the directory entry for the file
3325 ** has been created by fsync-ing the directory that contains the file.
3326 ** If we do not do this and we encounter a power failure, the directory
3327 ** entry for the journal might not exist after we reboot.  The next
3328 ** SQLite to access the file will not know that the journal exists (because
3329 ** the directory entry for the journal was never created) and the transaction
3330 ** will not roll back - possibly leading to database corruption.
3331 */
3332 static int unixSync(sqlite3_file *id, int flags){
3333   int rc;
3334   unixFile *pFile = (unixFile*)id;
3335
3336   int isDataOnly = (flags&SQLITE_SYNC_DATAONLY);
3337   int isFullsync = (flags&0x0F)==SQLITE_SYNC_FULL;
3338
3339   /* Check that one of SQLITE_SYNC_NORMAL or FULL was passed */
3340   assert((flags&0x0F)==SQLITE_SYNC_NORMAL
3341       || (flags&0x0F)==SQLITE_SYNC_FULL
3342   );
3343
3344   /* Unix cannot, but some systems may return SQLITE_FULL from here. This
3345   ** line is to test that doing so does not cause any problems.
3346   */
3347   SimulateDiskfullError( return SQLITE_FULL );
3348
3349   assert( pFile );
3350   OSTRACE(("SYNC    %-3d\n", pFile->h));
3351   rc = full_fsync(pFile->h, isFullsync, isDataOnly);
3352   SimulateIOError( rc=1 );
3353   if( rc ){
3354     pFile->lastErrno = errno;
3355     return unixLogError(SQLITE_IOERR_FSYNC, "full_fsync", pFile->zPath);
3356   }
3357
3358   /* Also fsync the directory containing the file if the DIRSYNC flag
3359   ** is set.  This is a one-time occurrance.  Many systems (examples: AIX)
3360   ** are unable to fsync a directory, so ignore errors on the fsync.
3361   */
3362   if( pFile->ctrlFlags & UNIXFILE_DIRSYNC ){
3363     int dirfd;
3364     OSTRACE(("DIRSYNC %s (have_fullfsync=%d fullsync=%d)\n", pFile->zPath,
3365             HAVE_FULLFSYNC, isFullsync));
3366     rc = osOpenDirectory(pFile->zPath, &dirfd);
3367     if( rc==SQLITE_OK && dirfd>=0 ){
3368       full_fsync(dirfd, 0, 0);
3369       robust_close(pFile, dirfd, __LINE__);
3370     }else if( rc==SQLITE_CANTOPEN ){
3371       rc = SQLITE_OK;
3372     }
3373     pFile->ctrlFlags &= ~UNIXFILE_DIRSYNC;
3374   }
3375   return rc;
3376 }
3377
3378 /*
3379 ** Truncate an open file to a specified size
3380 */
3381 static int unixTruncate(sqlite3_file *id, i64 nByte){
3382   unixFile *pFile = (unixFile *)id;
3383   int rc;
3384   assert( pFile );
3385   SimulateIOError( return SQLITE_IOERR_TRUNCATE );
3386
3387   /* If the user has configured a chunk-size for this file, truncate the
3388   ** file so that it consists of an integer number of chunks (i.e. the
3389   ** actual file size after the operation may be larger than the requested
3390   ** size).
3391   */
3392   if( pFile->szChunk ){
3393     nByte = ((nByte + pFile->szChunk - 1)/pFile->szChunk) * pFile->szChunk;
3394   }
3395
3396   rc = robust_ftruncate(pFile->h, (off_t)nByte);
3397   if( rc ){
3398     pFile->lastErrno = errno;
3399     return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath);
3400   }else{
3401 #ifndef NDEBUG
3402     /* If we are doing a normal write to a database file (as opposed to
3403     ** doing a hot-journal rollback or a write to some file other than a
3404     ** normal database file) and we truncate the file to zero length,
3405     ** that effectively updates the change counter.  This might happen
3406     ** when restoring a database using the backup API from a zero-length
3407     ** source.
3408     */
3409     if( pFile->inNormalWrite && nByte==0 ){
3410       pFile->transCntrChng = 1;
3411     }
3412 #endif
3413
3414     return SQLITE_OK;
3415   }
3416 }
3417
3418 /*
3419 ** Determine the current size of a file in bytes
3420 */
3421 static int unixFileSize(sqlite3_file *id, i64 *pSize){
3422   int rc;
3423   struct stat buf;
3424   assert( id );
3425   rc = osFstat(((unixFile*)id)->h, &buf);
3426   SimulateIOError( rc=1 );
3427   if( rc!=0 ){
3428     ((unixFile*)id)->lastErrno = errno;
3429     return SQLITE_IOERR_FSTAT;
3430   }
3431   *pSize = buf.st_size;
3432
3433   /* When opening a zero-size database, the findInodeInfo() procedure
3434   ** writes a single byte into that file in order to work around a bug
3435   ** in the OS-X msdos filesystem.  In order to avoid problems with upper
3436   ** layers, we need to report this file size as zero even though it is
3437   ** really 1.   Ticket #3260.
3438   */
3439   if( *pSize==1 ) *pSize = 0;
3440
3441
3442   return SQLITE_OK;
3443 }
3444
3445 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
3446 /*
3447 ** Handler for proxy-locking file-control verbs.  Defined below in the
3448 ** proxy locking division.
3449 */
3450 static int proxyFileControl(sqlite3_file*,int,void*);
3451 #endif
3452
3453 /*
3454 ** This function is called to handle the SQLITE_FCNTL_SIZE_HINT
3455 ** file-control operation.  Enlarge the database to nBytes in size
3456 ** (rounded up to the next chunk-size).  If the database is already
3457 ** nBytes or larger, this routine is a no-op.
3458 */
3459 static int fcntlSizeHint(unixFile *pFile, i64 nByte){
3460   if( pFile->szChunk>0 ){
3461     i64 nSize;                    /* Required file size */
3462     struct stat buf;              /* Used to hold return values of fstat() */
3463
3464     if( osFstat(pFile->h, &buf) ) return SQLITE_IOERR_FSTAT;
3465
3466     nSize = ((nByte+pFile->szChunk-1) / pFile->szChunk) * pFile->szChunk;
3467     if( nSize>(i64)buf.st_size ){
3468
3469 #if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
3470       /* The code below is handling the return value of osFallocate()
3471       ** correctly. posix_fallocate() is defined to "returns zero on success,
3472       ** or an error number on  failure". See the manpage for details. */
3473       int err;
3474       do{
3475         err = osFallocate(pFile->h, buf.st_size, nSize-buf.st_size);
3476       }while( err==EINTR );
3477       if( err ) return SQLITE_IOERR_WRITE;
3478 #else
3479       /* If the OS does not have posix_fallocate(), fake it. First use
3480       ** ftruncate() to set the file size, then write a single byte to
3481       ** the last byte in each block within the extended region. This
3482       ** is the same technique used by glibc to implement posix_fallocate()
3483       ** on systems that do not have a real fallocate() system call.
3484       */
3485       int nBlk = buf.st_blksize;  /* File-system block size */
3486       i64 iWrite;                 /* Next offset to write to */
3487
3488       if( robust_ftruncate(pFile->h, nSize) ){
3489         pFile->lastErrno = errno;
3490         return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath);
3491       }
3492       iWrite = ((buf.st_size + 2*nBlk - 1)/nBlk)*nBlk-1;
3493       while( iWrite<nSize ){
3494         int nWrite = seekAndWrite(pFile, iWrite, "", 1);
3495         if( nWrite!=1 ) return SQLITE_IOERR_WRITE;
3496         iWrite += nBlk;
3497       }
3498 #endif
3499     }
3500   }
3501
3502   return SQLITE_OK;
3503 }
3504
3505 /*
3506 ** If *pArg is inititially negative then this is a query.  Set *pArg to
3507 ** 1 or 0 depending on whether or not bit mask of pFile->ctrlFlags is set.
3508 **
3509 ** If *pArg is 0 or 1, then clear or set the mask bit of pFile->ctrlFlags.
3510 */
3511 static void unixModeBit(unixFile *pFile, unsigned char mask, int *pArg){
3512   if( *pArg<0 ){
3513     *pArg = (pFile->ctrlFlags & mask)!=0;
3514   }else if( (*pArg)==0 ){
3515     pFile->ctrlFlags &= ~mask;
3516   }else{
3517     pFile->ctrlFlags |= mask;
3518   }
3519 }
3520
3521 /*
3522 ** Information and control of an open file handle.
3523 */
3524 static int unixFileControl(sqlite3_file *id, int op, void *pArg){
3525   unixFile *pFile = (unixFile*)id;
3526   switch( op ){
3527     case SQLITE_FCNTL_LOCKSTATE: {
3528       *(int*)pArg = pFile->eFileLock;
3529       return SQLITE_OK;
3530     }
3531     case SQLITE_LAST_ERRNO: {
3532       *(int*)pArg = pFile->lastErrno;
3533       return SQLITE_OK;
3534     }
3535     case SQLITE_FCNTL_CHUNK_SIZE: {
3536       pFile->szChunk = *(int *)pArg;
3537       return SQLITE_OK;
3538     }
3539     case SQLITE_FCNTL_SIZE_HINT: {
3540       int rc;
3541       SimulateIOErrorBenign(1);
3542       rc = fcntlSizeHint(pFile, *(i64 *)pArg);
3543       SimulateIOErrorBenign(0);
3544       return rc;
3545     }
3546     case SQLITE_FCNTL_PERSIST_WAL: {
3547       unixModeBit(pFile, UNIXFILE_PERSIST_WAL, (int*)pArg);
3548       return SQLITE_OK;
3549     }
3550     case SQLITE_FCNTL_POWERSAFE_OVERWRITE: {
3551       unixModeBit(pFile, UNIXFILE_PSOW, (int*)pArg);
3552       return SQLITE_OK;
3553     }
3554     case SQLITE_FCNTL_VFSNAME: {
3555       *(char**)pArg = sqlite3_mprintf("%s", pFile->pVfs->zName);
3556       return SQLITE_OK;
3557     }
3558 #ifndef NDEBUG
3559     /* The pager calls this method to signal that it has done
3560     ** a rollback and that the database is therefore unchanged and
3561     ** it hence it is OK for the transaction change counter to be
3562     ** unchanged.
3563     */
3564     case SQLITE_FCNTL_DB_UNCHANGED: {
3565       ((unixFile*)id)->dbUpdate = 0;
3566       return SQLITE_OK;
3567     }
3568 #endif
3569 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
3570     case SQLITE_SET_LOCKPROXYFILE:
3571     case SQLITE_GET_LOCKPROXYFILE: {
3572       return proxyFileControl(id,op,pArg);
3573     }
3574 #endif /* SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__) */
3575   }
3576   return SQLITE_NOTFOUND;
3577 }
3578
3579 /*
3580 ** Return the sector size in bytes of the underlying block device for
3581 ** the specified file. This is almost always 512 bytes, but may be
3582 ** larger for some devices.
3583 **
3584 ** SQLite code assumes this function cannot fail. It also assumes that
3585 ** if two files are created in the same file-system directory (i.e.
3586 ** a database and its journal file) that the sector size will be the
3587 ** same for both.
3588 */
3589 static int unixSectorSize(sqlite3_file *pFile){
3590   (void)pFile;
3591   return SQLITE_DEFAULT_SECTOR_SIZE;
3592 }
3593
3594 /*
3595 ** Return the device characteristics for the file.
3596 **
3597 ** This VFS is set up to return SQLITE_IOCAP_POWERSAFE_OVERWRITE by default.
3598 ** However, that choice is contraversial since technically the underlying
3599 ** file system does not always provide powersafe overwrites.  (In other
3600 ** words, after a power-loss event, parts of the file that were never
3601 ** written might end up being altered.)  However, non-PSOW behavior is very,
3602 ** very rare.  And asserting PSOW makes a large reduction in the amount
3603 ** of required I/O for journaling, since a lot of padding is eliminated.
3604 **  Hence, while POWERSAFE_OVERWRITE is on by default, there is a file-control
3605 ** available to turn it off and URI query parameter available to turn it off.
3606 */
3607 static int unixDeviceCharacteristics(sqlite3_file *id){
3608   unixFile *p = (unixFile*)id;
3609   if( p->ctrlFlags & UNIXFILE_PSOW ){
3610     return SQLITE_IOCAP_POWERSAFE_OVERWRITE;
3611   }else{
3612     return 0;
3613   }
3614 }
3615
3616 #ifndef SQLITE_OMIT_WAL
3617
3618
3619 /*
3620 ** Object used to represent a shared-memory buffer.
3621 **
3622 ** When multiple threads all reference the same wal-index, each thread
3623 ** has its own unixShm object, but they all point to a single instance
3624 ** of this unixShmNode object.  In other words, each wal-index is opened
3625 ** only once per process.
3626 **
3627 ** Each unixShmNode object is connected to a single unixInodeInfo object.
3628 ** We could coalesce this object into unixInodeInfo, but that would mean
3629 ** every open file that does not use shared memory (in other words, most
3630 ** open files) would have to carry around this extra information.  So
3631 ** the unixInodeInfo object contains a pointer to this unixShmNode object
3632 ** and the unixShmNode object is created only when needed.
3633 **
3634 ** unixMutexHeld() must be true when creating or destroying
3635 ** this object or while reading or writing the following fields:
3636 **
3637 **      nRef
3638 **
3639 ** The following fields are read-only after the object is created:
3640 **
3641 **      h          (earlier text called this "fid"; no such member exists)
3642 **      zFilename
3643 **
3644 ** Either unixShmNode.mutex must be held or unixShmNode.nRef==0 and
3645 ** unixMutexHeld() is true when reading or writing any other field
3646 ** in this structure.
     **
     ** The regions in apRegion[] are obtained with mmap() when h>=0 and
     ** with sqlite3_malloc() when h<0; unixShmPurge() releases them the
     ** same way.  zFilename points into the same allocation as the
     ** structure itself, so it is never freed separately.
3647 */
3648 struct unixShmNode {
3649   unixInodeInfo *pInode;     /* unixInodeInfo that owns this SHM node */
3650   sqlite3_mutex *mutex;      /* Mutex to access this object */
3651   char *zFilename;           /* Name of the mmapped file */
3652   int h;                     /* Open file descriptor, or -1 if none */
3653   int szRegion;              /* Size of shared-memory regions */
3654   u16 nRegion;               /* Size of array apRegion */
3655   u8 isReadonly;             /* True if read-only */
3656   char **apRegion;           /* Array of mapped shared-memory regions */
3657   int nRef;                  /* Number of unixShm objects pointing to this */
3658   unixShm *pFirst;           /* All unixShm objects pointing to this */
3659 #ifdef SQLITE_DEBUG
3660   u8 exclMask;               /* Mask of exclusive locks held */
3661   u8 sharedMask;             /* Mask of shared locks held */
3662   u8 nextShmId;              /* Next available unixShm.id value */
3663 #endif
3664 };
3665
3666 /*
3667 ** Structure used internally by this VFS to record the state of an
3668 ** open shared memory connection.
3669 **
3670 ** The following fields are initialized when this object is created and
3671 ** are read-only thereafter:
3672 **
3673 **    unixShm.pShmNode
3674 **    unixShm.id
3675 **
3676 ** All other fields are read/write.  The unixShm.pShmNode->mutex must be
3677 ** held while accessing any read/write fields.
     **
     ** (Earlier text named "unixShm.pFile" here, but this structure has no
     ** such member; unixShmLock() takes pShmNode->mutex before it touches
     ** sharedMask or exclMask.)
     **
     ** The id field is assigned from unixShmNode.nextShmId only in
     ** SQLITE_DEBUG builds; otherwise it keeps the zero it receives when
     ** unixOpenSharedMemory() clears the new object.
3678 */
3679 struct unixShm {
3680   unixShmNode *pShmNode;     /* The underlying unixShmNode object */
3681   unixShm *pNext;            /* Next unixShm with the same unixShmNode */
3682   u8 hasMutex;               /* True if holding the unixShmNode mutex */
3683   u8 id;                     /* Id of this connection within its unixShmNode */
3684   u16 sharedMask;            /* Mask of shared locks held */
3685   u16 exclMask;              /* Mask of exclusive locks held */
3686 };
3687
3688 /*
3689 ** Constants used for locking
     **
     ** Both are byte offsets at which posix advisory locks are taken on
     ** the shared-memory file.  UNIX_SHM_BASE is the offset of the first
     ** of the SQLITE_SHM_NLOCK lock bytes: unixShmLock() locks the range
     ** starting at (UNIX_SHM_BASE + ofst) for lock slot ofst.
     ** UNIX_SHM_DMS is the byte that immediately follows those lock bytes.
     ** It is the "dead-man switch" that unixOpenSharedMemory() probes with
     ** a write lock and then holds with a read lock.
3690 */
3691 #define UNIX_SHM_BASE   ((22+SQLITE_SHM_NLOCK)*4)         /* first lock byte */
3692 #define UNIX_SHM_DMS    (UNIX_SHM_BASE+SQLITE_SHM_NLOCK)  /* deadman switch */
3693
3694 /*
3695 ** Apply posix advisory locks for all bytes from ofst through ofst+n-1.
3696 **
3697 ** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking
3698 ** otherwise.
3699 */
3700 static int unixShmSystemLock(
3701   unixShmNode *pShmNode, /* Apply locks to this open shared-memory segment */
3702   int lockType,          /* F_UNLCK, F_RDLCK, or F_WRLCK */
3703   int ofst,              /* First byte of the locking range */
3704   int n                  /* Number of bytes to lock */
3705 ){
3706   struct flock f;       /* The posix advisory locking structure */
3707   int rc = SQLITE_OK;   /* Result code form fcntl() */
3708
3709   /* Access to the unixShmNode object is serialized by the caller */
3710   assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 );
3711
3712   /* Shared locks never span more than one byte */
3713   assert( n==1 || lockType!=F_RDLCK );
3714
3715   /* Locks are within range */
3716   assert( n>=1 && n<SQLITE_SHM_NLOCK );
3717
3718   if( pShmNode->h>=0 ){
3719     /* Initialize the locking parameters */
3720     memset(&f, 0, sizeof(f));
3721     f.l_type = lockType;
3722     f.l_whence = SEEK_SET;
3723     f.l_start = ofst;
3724     f.l_len = n;
3725
3726     rc = osFcntl(pShmNode->h, F_SETLK, &f);
3727     rc = (rc!=(-1)) ? SQLITE_OK : SQLITE_BUSY;
3728   }
3729
3730   /* Update the global lock state and do debug tracing */
3731 #ifdef SQLITE_DEBUG
3732   { u16 mask;
3733   OSTRACE(("SHM-LOCK "));
3734   mask = (1<<(ofst+n)) - (1<<ofst);
3735   if( rc==SQLITE_OK ){
3736     if( lockType==F_UNLCK ){
3737       OSTRACE(("unlock %d ok", ofst));
3738       pShmNode->exclMask &= ~mask;
3739       pShmNode->sharedMask &= ~mask;
3740     }else if( lockType==F_RDLCK ){
3741       OSTRACE(("read-lock %d ok", ofst));
3742       pShmNode->exclMask &= ~mask;
3743       pShmNode->sharedMask |= mask;
3744     }else{
3745       assert( lockType==F_WRLCK );
3746       OSTRACE(("write-lock %d ok", ofst));
3747       pShmNode->exclMask |= mask;
3748       pShmNode->sharedMask &= ~mask;
3749     }
3750   }else{
3751     if( lockType==F_UNLCK ){
3752       OSTRACE(("unlock %d failed", ofst));
3753     }else if( lockType==F_RDLCK ){
3754       OSTRACE(("read-lock failed"));
3755     }else{
3756       assert( lockType==F_WRLCK );
3757       OSTRACE(("write-lock %d failed", ofst));
3758     }
3759   }
3760   OSTRACE((" - afterwards %03x,%03x\n",
3761            pShmNode->sharedMask, pShmNode->exclMask));
3762   }
3763 #endif
3764
3765   return rc;
3766 }
3767
3768
3769 /*
3770 ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
3771 **
3772 ** This is not a VFS shared-memory method; it is a utility function called
3773 ** by VFS shared-memory methods.
3774 */
3775 static void unixShmPurge(unixFile *pFd){
3776   unixShmNode *p = pFd->pInode->pShmNode;
3777   assert( unixMutexHeld() );
3778   if( p && p->nRef==0 ){
3779     int i;
3780     assert( p->pInode==pFd->pInode );
3781     sqlite3_mutex_free(p->mutex);
3782     for(i=0; i<p->nRegion; i++){
3783       if( p->h>=0 ){
3784         munmap(p->apRegion[i], p->szRegion);
3785       }else{
3786         sqlite3_free(p->apRegion[i]);
3787       }
3788     }
3789     sqlite3_free(p->apRegion);
3790     if( p->h>=0 ){
3791       robust_close(pFd, p->h, __LINE__);
3792       p->h = -1;
3793     }
3794     p->pInode->pShmNode = 0;
3795     sqlite3_free(p);
3796   }
3797 }
3798
3799 /*
3800 ** Open a shared-memory area associated with open database file pDbFd.
3801 ** This particular implementation uses mmapped files.
3802 **
3803 ** The file used to implement shared-memory is in the same directory
3804 ** as the open database file and has the same name as the open database
3805 ** file with the "-shm" suffix added.  For example, if the database file
3806 ** is "/home/user1/config.db" then the file that is created and mmapped
3807 ** for shared memory will be called "/home/user1/config.db-shm".
3808 **
3809 ** Another approach to is to use files in /dev/shm or /dev/tmp or an
3810 ** some other tmpfs mount. But if a file in a different directory
3811 ** from the database file is used, then differing access permissions
3812 ** or a chroot() might cause two different processes on the same
3813 ** database to end up using different files for shared memory -
3814 ** meaning that their memory would not really be shared - resulting
3815 ** in database corruption.  Nevertheless, this tmpfs file usage
3816 ** can be enabled at compile-time using -DSQLITE_SHM_DIRECTORY="/dev/shm"
3817 ** or the equivalent.  The use of the SQLITE_SHM_DIRECTORY compile-time
3818 ** option results in an incompatible build of SQLite;  builds of SQLite
3819 ** that with differing SQLITE_SHM_DIRECTORY settings attempt to use the
3820 ** same database file at the same time, database corruption will likely
3821 ** result. The SQLITE_SHM_DIRECTORY compile-time option is considered
3822 ** "unsupported" and may go away in a future SQLite release.
3823 **
3824 ** When opening a new shared-memory file, if no other instances of that
3825 ** file are currently open, in this process or in other processes, then
3826 ** the file must be truncated to zero length or have its header cleared.
3827 **
3828 ** If the original database file (pDbFd) is using the "unix-excl" VFS
3829 ** that means that an exclusive lock is held on the database file and
3830 ** that no other processes are able to read or write the database.  In
3831 ** that case, we do not really need shared memory.  No shared memory
3832 ** file is created.  The shared memory will be simulated with heap memory.
3833 */
3834 static int unixOpenSharedMemory(unixFile *pDbFd){
3835   struct unixShm *p = 0;          /* The connection to be opened */
3836   struct unixShmNode *pShmNode;   /* The underlying mmapped file */
3837   int rc;                         /* Result code */
3838   unixInodeInfo *pInode;          /* The inode of fd */
3839   char *zShmFilename;             /* Name of the file used for SHM */
3840   int nShmFilename;               /* Size of the SHM filename in bytes */
3841
3842   /* Allocate space for the new unixShm object. */
3843   p = sqlite3_malloc( sizeof(*p) );
3844   if( p==0 ) return SQLITE_NOMEM;
3845   memset(p, 0, sizeof(*p));
3846   assert( pDbFd->pShm==0 );
3847
3848   /* Check to see if a unixShmNode object already exists. Reuse an existing
3849   ** one if present. Create a new one if necessary.
3850   */
3851   unixEnterMutex();
3852   pInode = pDbFd->pInode;
3853   pShmNode = pInode->pShmNode;
3854   if( pShmNode==0 ){
3855     struct stat sStat;                 /* fstat() info for database file */
3856
3857     /* Call fstat() to figure out the permissions on the database file. If
3858     ** a new *-shm file is created, an attempt will be made to create it
3859     ** with the same permissions. The actual permissions the file is created
3860     ** with are subject to the current umask setting.
3861     */
3862     if( osFstat(pDbFd->h, &sStat) && pInode->bProcessLock==0 ){
3863       rc = SQLITE_IOERR_FSTAT;
3864       goto shm_open_err;
3865     }
3866
3867 #ifdef SQLITE_SHM_DIRECTORY
3868     nShmFilename = sizeof(SQLITE_SHM_DIRECTORY) + 31;
3869 #else
3870     nShmFilename = 6 + (int)strlen(pDbFd->zPath);
3871 #endif
3872     pShmNode = sqlite3_malloc( sizeof(*pShmNode) + nShmFilename );
3873     if( pShmNode==0 ){
3874       rc = SQLITE_NOMEM;
3875       goto shm_open_err;
3876     }
3877     memset(pShmNode, 0, sizeof(*pShmNode)+nShmFilename);
3878     zShmFilename = pShmNode->zFilename = (char*)&pShmNode[1];
3879 #ifdef SQLITE_SHM_DIRECTORY
3880     sqlite3_snprintf(nShmFilename, zShmFilename,
3881                      SQLITE_SHM_DIRECTORY "/sqlite-shm-%x-%x",
3882                      (u32)sStat.st_ino, (u32)sStat.st_dev);
3883 #else
3884     sqlite3_snprintf(nShmFilename, zShmFilename, "%s-shm", pDbFd->zPath);
3885     sqlite3FileSuffix3(pDbFd->zPath, zShmFilename);
3886 #endif
3887     pShmNode->h = -1;
3888     pDbFd->pInode->pShmNode = pShmNode;
3889     pShmNode->pInode = pDbFd->pInode;
3890     pShmNode->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
3891     if( pShmNode->mutex==0 ){
3892       rc = SQLITE_NOMEM;
3893       goto shm_open_err;
3894     }
3895
3896     if( pInode->bProcessLock==0 ){
3897       int openFlags = O_RDWR | O_CREAT;
3898       if( sqlite3_uri_boolean(pDbFd->zPath, "readonly_shm", 0) ){
3899         openFlags = O_RDONLY;
3900         pShmNode->isReadonly = 1;
3901       }
3902       pShmNode->h = robust_open(zShmFilename, openFlags, (sStat.st_mode&0777));
3903       if( pShmNode->h<0 ){
3904         if( pShmNode->h<0 ){
3905           rc = unixLogError(SQLITE_CANTOPEN_BKPT, "open", zShmFilename);
3906           goto shm_open_err;
3907         }
3908       }
3909
3910       /* Check to see if another process is holding the dead-man switch.
3911       ** If not, truncate the file to zero length.
3912       */
3913       rc = SQLITE_OK;
3914       if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){
3915         if( robust_ftruncate(pShmNode->h, 0) ){
3916           rc = unixLogError(SQLITE_IOERR_SHMOPEN, "ftruncate", zShmFilename);
3917         }
3918       }
3919       if( rc==SQLITE_OK ){
3920         rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS, 1);
3921       }
3922       if( rc ) goto shm_open_err;
3923     }
3924   }
3925
3926   /* Make the new connection a child of the unixShmNode */
3927   p->pShmNode = pShmNode;
3928 #ifdef SQLITE_DEBUG
3929   p->id = pShmNode->nextShmId++;
3930 #endif
3931   pShmNode->nRef++;
3932   pDbFd->pShm = p;
3933   unixLeaveMutex();
3934
3935   /* The reference count on pShmNode has already been incremented under
3936   ** the cover of the unixEnterMutex() mutex and the pointer from the
3937   ** new (struct unixShm) object to the pShmNode has been set. All that is
3938   ** left to do is to link the new object into the linked list starting
3939   ** at pShmNode->pFirst. This must be done while holding the pShmNode->mutex
3940   ** mutex.
3941   */
3942   sqlite3_mutex_enter(pShmNode->mutex);
3943   p->pNext = pShmNode->pFirst;
3944   pShmNode->pFirst = p;
3945   sqlite3_mutex_leave(pShmNode->mutex);
3946   return SQLITE_OK;
3947
3948   /* Jump here on any error */
3949 shm_open_err:
3950   unixShmPurge(pDbFd);       /* This call frees pShmNode if required */
3951   sqlite3_free(p);
3952   unixLeaveMutex();
3953   return rc;
3954 }
3955
3956 /*
3957 ** This function is called to obtain a pointer to region iRegion of the
3958 ** shared-memory associated with the database file fd. Shared-memory regions
3959 ** are numbered starting from zero. Each shared-memory region is szRegion
3960 ** bytes in size.
3961 **
3962 ** If an error occurs, an error code is returned and *pp is set to NULL.
3963 **
3964 ** Otherwise, if the bExtend parameter is 0 and the requested shared-memory
3965 ** region has not been allocated (by any client, including one running in a
3966 ** separate process), then *pp is set to NULL and SQLITE_OK returned. If
3967 ** bExtend is non-zero and the requested shared-memory region has not yet
3968 ** been allocated, it is allocated by this function.
3969 **
3970 ** If the shared-memory region has already been allocated or is allocated by
3971 ** this call as described above, then it is mapped into this process's
3972 ** address space (if it is not already), *pp is set to point to the mapped
3973 ** memory and SQLITE_OK returned.
     **
     ** When the underlying unixShmNode is marked read-only, every call that
     ** would otherwise return SQLITE_OK returns SQLITE_READONLY instead; *pp
     ** is still set as described above.
     **
     ** If the connection has no shared-memory object yet, one is opened
     ** first with unixOpenSharedMemory(); a failure there is returned
     ** directly and *pp is not written.
3974 */
3975 static int unixShmMap(
3976   sqlite3_file *fd,               /* Handle open on database file */
3977   int iRegion,                    /* Region to retrieve */
3978   int szRegion,                   /* Size of regions */
3979   int bExtend,                    /* True to extend file if necessary */
3980   void volatile **pp              /* OUT: Mapped memory */
3981 ){
3982   unixFile *pDbFd = (unixFile*)fd;
3983   unixShm *p;
3984   unixShmNode *pShmNode;
3985   int rc = SQLITE_OK;
3986
3987   /* If the shared-memory file has not yet been opened, open it now. */
3988   if( pDbFd->pShm==0 ){
3989     rc = unixOpenSharedMemory(pDbFd);
3990     if( rc!=SQLITE_OK ) return rc;
3991   }
3992
3993   p = pDbFd->pShm;
3994   pShmNode = p->pShmNode;
       /* Everything below runs with the node mutex held; the only exit is
       ** through shmpage_out, which releases it. */
3995   sqlite3_mutex_enter(pShmNode->mutex);
3996   assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
3997   assert( pShmNode->pInode==pDbFd->pInode );
       /* A node has a file descriptor exactly when the inode is not using
       ** process-level locking. */
3998   assert( pShmNode->h>=0 || pDbFd->pInode->bProcessLock==1 );
3999   assert( pShmNode->h<0 || pDbFd->pInode->bProcessLock==0 );
4000
       /* Nothing to do unless the requested region lies beyond what this
       ** process has already mapped. */
4001   if( pShmNode->nRegion<=iRegion ){
4002     char **apNew;                      /* New apRegion[] array */
         /* NOTE(review): computed in int arithmetic; presumably
         ** (iRegion+1)*szRegion always fits in an int - confirm. */
4003     int nByte = (iRegion+1)*szRegion;  /* Minimum required file size */
4004     struct stat sStat;                 /* Used by fstat() */
4005
4006     pShmNode->szRegion = szRegion;
4007
4008     if( pShmNode->h>=0 ){
4009       /* The requested region is not mapped into this processes address space.
4010       ** Check to see if it has been allocated (i.e. if the wal-index file is
4011       ** large enough to contain the requested region).
4012       */
4013       if( osFstat(pShmNode->h, &sStat) ){
4014         rc = SQLITE_IOERR_SHMSIZE;
4015         goto shmpage_out;
4016       }
4017
4018       if( sStat.st_size<nByte ){
4019         /* The requested memory region does not exist. If bExtend is set to
4020         ** false, exit early. *pp will be set to NULL and SQLITE_OK returned.
4021         **
4022         ** Alternatively, if bExtend is true, use ftruncate() to allocate
4023         ** the requested memory region.
4024         */
4025         if( !bExtend ) goto shmpage_out;
4026         if( robust_ftruncate(pShmNode->h, nByte) ){
4027           rc = unixLogError(SQLITE_IOERR_SHMSIZE, "ftruncate",
4028                             pShmNode->zFilename);
4029           goto shmpage_out;
4030         }
4031       }
4032     }
4033
4034     /* Map the requested memory region into this processes address space. */
         /* The array is grown through a temporary so that the old pointer
         ** is kept if sqlite3_realloc() fails. */
4035     apNew = (char **)sqlite3_realloc(
4036         pShmNode->apRegion, (iRegion+1)*sizeof(char *)
4037     );
4038     if( !apNew ){
4039       rc = SQLITE_IOERR_NOMEM;
4040       goto shmpage_out;
4041     }
4042     pShmNode->apRegion = apNew;
         /* Map every region from the first unmapped one up to and including
         ** iRegion.  nRegion is advanced only after a region has been stored,
         ** so a failure part-way through leaves the earlier regions recorded. */
4043     while(pShmNode->nRegion<=iRegion){
4044       void *pMem;
4045       if( pShmNode->h>=0 ){
             /* File-backed node: map the region at its offset in the file,
             ** read-only if the node is read-only. */
4046         pMem = mmap(0, szRegion,
4047             pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE,
4048             MAP_SHARED, pShmNode->h, pShmNode->nRegion*szRegion
4049         );
4050         if( pMem==MAP_FAILED ){
4051           rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename);
4052           goto shmpage_out;
4053         }
4054       }else{
             /* No file descriptor: use zeroed heap memory for the region. */
4055         pMem = sqlite3_malloc(szRegion);
4056         if( pMem==0 ){
4057           rc = SQLITE_NOMEM;
4058           goto shmpage_out;
4059         }
4060         memset(pMem, 0, szRegion);
4061       }
4062       pShmNode->apRegion[pShmNode->nRegion] = pMem;
4063       pShmNode->nRegion++;
4064     }
4065   }
4066
4067 shmpage_out:
       /* Report the region if it is mapped, NULL otherwise. */
4068   if( pShmNode->nRegion>iRegion ){
4069     *pp = pShmNode->apRegion[iRegion];
4070   }else{
4071     *pp = 0;
4072   }
       /* A read-only node turns success into SQLITE_READONLY; error codes
       ** set above are passed through unchanged. */
4073   if( pShmNode->isReadonly && rc==SQLITE_OK ) rc = SQLITE_READONLY;
4074   sqlite3_mutex_leave(pShmNode->mutex);
4075   return rc;
4076 }
4077
4078 /*
4079 ** Change the lock state for a shared-memory segment.
4080 **
4081 ** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
4082 ** different here than in posix.  In xShmLock(), one can go from unlocked
4083 ** to shared and back or from unlocked to exclusive and back.  But one may
4084 ** not go from shared to exclusive or from exclusive to shared.
     **
     ** The request covers lock slots ofst through ofst+n-1.  flags must be
     ** exactly one of SQLITE_SHM_LOCK or SQLITE_SHM_UNLOCK combined with
     ** exactly one of SQLITE_SHM_SHARED or SQLITE_SHM_EXCLUSIVE, and only an
     ** exclusive request may cover more than one slot.
     **
     ** Locks held by other connections in this process are tracked in the
     ** sharedMask and exclMask of each unixShm on the node's list, and are
     ** checked before any system-level lock is requested.  Returns SQLITE_OK
     ** on success, or SQLITE_BUSY if a sibling connection or the system-level
     ** lock prevents the change.
4085 */
4086 static int unixShmLock(
4087   sqlite3_file *fd,          /* Database file holding the shared memory */
4088   int ofst,                  /* First lock to acquire or release */
4089   int n,                     /* Number of locks to acquire or release */
4090   int flags                  /* What to do with the lock */
4091 ){
4092   unixFile *pDbFd = (unixFile*)fd;      /* Connection holding shared memory */
4093   unixShm *p = pDbFd->pShm;             /* The shared memory being locked */
4094   unixShm *pX;                          /* For looping over all siblings */
4095   unixShmNode *pShmNode = p->pShmNode;  /* The underlying file iNode */
4096   int rc = SQLITE_OK;                   /* Result code */
4097   u16 mask;                             /* Mask of locks to take or release */
4098
4099   assert( pShmNode==pDbFd->pInode->pShmNode );
4100   assert( pShmNode->pInode==pDbFd->pInode );
4101   assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK );
4102   assert( n>=1 );
4103   assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
4104        || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
4105        || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
4106        || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
4107   assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
4108   assert( pShmNode->h>=0 || pDbFd->pInode->bProcessLock==1 );
4109   assert( pShmNode->h<0 || pDbFd->pInode->bProcessLock==0 );
4110
       /* One bit per requested slot: bits ofst through ofst+n-1 are set. */
4111   mask = (1<<(ofst+n)) - (1<<ofst);
4112   assert( n>1 || mask==(1<<ofst) );
4113   sqlite3_mutex_enter(pShmNode->mutex);
4114   if( flags & SQLITE_SHM_UNLOCK ){
4115     u16 allMask = 0; /* Mask of shared locks held by siblings */
4116
4117     /* See if any siblings hold this same lock */
4118     for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
4119       if( pX==p ) continue;
4120       assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
4121       allMask |= pX->sharedMask;
4122     }
4123
4124     /* Unlock the system-level locks */
         /* The system lock is dropped only if no sibling still holds a
         ** shared lock on any of the requested slots. */
4125     if( (mask & allMask)==0 ){
4126       rc = unixShmSystemLock(pShmNode, F_UNLCK, ofst+UNIX_SHM_BASE, n);
4127     }else{
4128       rc = SQLITE_OK;
4129     }
4130
4131     /* Undo the local locks */
4132     if( rc==SQLITE_OK ){
4133       p->exclMask &= ~mask;
4134       p->sharedMask &= ~mask;
4135     }
4136   }else if( flags & SQLITE_SHM_SHARED ){
4137     u16 allShared = 0;  /* Union of shared locks seen on the list (the loop does not skip "p") */
4138
4139     /* Find out which shared locks are already held by sibling connections.
4140     ** If any sibling already holds an exclusive lock, go ahead and return
4141     ** SQLITE_BUSY.
4142     */
4143     for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
4144       if( (pX->exclMask & mask)!=0 ){
4145         rc = SQLITE_BUSY;
4146         break;
4147       }
4148       allShared |= pX->sharedMask;
4149     }
4150
4151     /* Get shared locks at the system level, if necessary */
         /* No system call is needed when some connection on the list already
         ** holds a shared lock on the requested slot. */
4152     if( rc==SQLITE_OK ){
4153       if( (allShared & mask)==0 ){
4154         rc = unixShmSystemLock(pShmNode, F_RDLCK, ofst+UNIX_SHM_BASE, n);
4155       }else{
4156         rc = SQLITE_OK;
4157       }
4158     }
4159
4160     /* Get the local shared locks */
4161     if( rc==SQLITE_OK ){
4162       p->sharedMask |= mask;
4163     }
4164   }else{
4165     /* Make sure no sibling connections hold locks that will block this
4166     ** lock.  If any do, return SQLITE_BUSY right away.
4167     */
4168     for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
4169       if( (pX->exclMask & mask)!=0 || (pX->sharedMask & mask)!=0 ){
4170         rc = SQLITE_BUSY;
4171         break;
4172       }
4173     }
4174
4175     /* Get the exclusive locks at the system level.  Then if successful
4176     ** also mark the local connection as being locked.
4177     */
4178     if( rc==SQLITE_OK ){
4179       rc = unixShmSystemLock(pShmNode, F_WRLCK, ofst+UNIX_SHM_BASE, n);
4180       if( rc==SQLITE_OK ){
4181         assert( (p->sharedMask & mask)==0 );
4182         p->exclMask |= mask;
4183       }
4184     }
4185   }
4186   sqlite3_mutex_leave(pShmNode->mutex);
       /* Trace output only; it reads the masks after the mutex is released. */
4187   OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %03x,%03x\n",
4188            p->id, getpid(), p->sharedMask, p->exclMask));
4189   return rc;
4190 }
4191
4192 /*
4193 ** Implement a memory barrier or memory fence on shared memory.
4194 **
4195 ** All loads and stores begun before the barrier must complete before
4196 ** any load or store begun after the barrier.
4197 */
4198 static void unixShmBarrier(
4199   sqlite3_file *fd                /* Database file holding the shared memory */
4200 ){
4201   UNUSED_PARAMETER(fd);
4202   unixEnterMutex();
4203   unixLeaveMutex();
4204 }
4205
4206 /*
4207 ** Close a connection to shared-memory.  Delete the underlying
4208 ** storage if deleteFlag is true.
4209 **
4210 ** If there is no shared memory associated with the connection then this
4211 ** routine is a harmless no-op.
4212 */
4213 static int unixShmUnmap(
4214   sqlite3_file *fd,               /* The underlying database file */
4215   int deleteFlag                  /* Delete shared-memory if true */
4216 ){
4217   unixShm *p;                     /* The connection to be closed */
4218   unixShmNode *pShmNode;          /* The underlying shared-memory file */
4219   unixShm **pp;                   /* For looping over sibling connections */
4220   unixFile *pDbFd;                /* The underlying database file */
4221
4222   pDbFd = (unixFile*)fd;
4223   p = pDbFd->pShm;
4224   if( p==0 ) return SQLITE_OK;
4225   pShmNode = p->pShmNode;
4226
4227   assert( pShmNode==pDbFd->pInode->pShmNode );
4228   assert( pShmNode->pInode==pDbFd->pInode );
4229
4230   /* Remove connection p from the set of connections associated
4231   ** with pShmNode */
4232   sqlite3_mutex_enter(pShmNode->mutex);
4233   for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){}
4234   *pp = p->pNext;
4235
4236   /* Free the connection p */
4237   sqlite3_free(p);
4238   pDbFd->pShm = 0;
4239   sqlite3_mutex_leave(pShmNode->mutex);
4240
4241   /* If pShmNode->nRef has reached 0, then close the underlying
4242   ** shared-memory file, too */
4243   unixEnterMutex();
4244   assert( pShmNode->nRef>0 );
4245   pShmNode->nRef--;
4246   if( pShmNode->nRef==0 ){
4247     if( deleteFlag && pShmNode->h>=0 ) osUnlink(pShmNode->zFilename);
4248     unixShmPurge(pDbFd);
4249   }
4250   unixLeaveMutex();
4251
4252   return SQLITE_OK;
4253 }
4254
4255
4256 #else
/* Fallback used when the WAL shared-memory code above is compiled out
** (see the matching "#ifndef SQLITE_OMIT_WAL"): each shared-memory method
** name expands to 0, so any table that lists these names stores a null
** pointer in the corresponding slot. */
# define unixShmMap     0
# define unixShmLock    0
# define unixShmBarrier 0
# define unixShmUnmap   0
4261 #endif /* #ifndef SQLITE_OMIT_WAL */
4262
4263 /*
4264 ** Here ends the implementation of all sqlite3_file methods.
4265 **
4266 ********************** End sqlite3_file Methods *******************************
4267 ******************************************************************************/
4268
/*
** This division contains definitions of sqlite3_io_methods objects that
** implement various file locking strategies.  It also contains definitions
** of "finder" functions.  A finder-function is used to locate the appropriate
** sqlite3_io_methods object for a particular database file.  The pAppData
** field of the sqlite3_vfs VFS objects are initialized to be pointers to
** the correct finder-function for that VFS.
**
** Most finder functions return a pointer to a fixed sqlite3_io_methods
** object.  The only interesting finder-function is autolockIoFinder, which
** looks at the filesystem type and tries to guess the best locking
** strategy from that.
**
** For finder-function F, two objects are created:
**
**    (1) The real finder-function named "FImpl()".
**
**    (2) A constant pointer to this function named just "F".
**
**
** A pointer to the F pointer is used as the pAppData value for VFS
** objects.  We have to do this instead of letting pAppData point
** directly at the finder-function since C90 rules prevent a void*
** from being cast into a function pointer.
**
**
** Each instance of this macro generates the following objects:
**
**   *  A constant sqlite3_io_methods object called METHOD whose iVersion
**      is VERSION and whose locking methods are CLOSE, LOCK, UNLOCK and
**      CKLOCK.  All other methods are the shared unixXxx() routines.
**
**   *  An I/O method finder function called FINDER##Impl that ignores
**      both of its arguments and returns a pointer to the METHOD object
**      in the previous bullet.
**
**   *  A constant function pointer called FINDER that points at
**      FINDER##Impl.
*/
#define IOMETHODS(FINDER, METHOD, VERSION, CLOSE, LOCK, UNLOCK, CKLOCK)      \
static const sqlite3_io_methods METHOD = {                                   \
   VERSION,                    /* iVersion */                                \
   CLOSE,                      /* xClose */                                  \
   unixRead,                   /* xRead */                                   \
   unixWrite,                  /* xWrite */                                  \
   unixTruncate,               /* xTruncate */                               \
   unixSync,                   /* xSync */                                   \
   unixFileSize,               /* xFileSize */                               \
   LOCK,                       /* xLock */                                   \
   UNLOCK,                     /* xUnlock */                                 \
   CKLOCK,                     /* xCheckReservedLock */                      \
   unixFileControl,            /* xFileControl */                            \
   unixSectorSize,             /* xSectorSize */                             \
   unixDeviceCharacteristics,  /* xDeviceCapabilities */                     \
   unixShmMap,                 /* xShmMap */                                 \
   unixShmLock,                /* xShmLock */                                \
   unixShmBarrier,             /* xShmBarrier */                             \
   unixShmUnmap                /* xShmUnmap */                               \
};                                                                           \
static const sqlite3_io_methods *FINDER##Impl(const char *z, unixFile *p){   \
  UNUSED_PARAMETER(z); UNUSED_PARAMETER(p);                                  \
  return &METHOD;                                                            \
}                                                                            \
static const sqlite3_io_methods *(*const FINDER)(const char*,unixFile *p)    \
    = FINDER##Impl;
4329
/*
** Here are all of the sqlite3_io_methods objects for each of the
** locking strategies.  Functions that return pointers to these methods
** are also created.
**
** The three objects below are defined unconditionally.  Only the posix
** object is given version 2 (shared memory enabled); the no-op and
** dot-file objects are given version 1.
*/
/* Posix advisory locking: unixClose/unixLock/unixUnlock. */
IOMETHODS(
  posixIoFinder,            /* Finder function name */
  posixIoMethods,           /* sqlite3_io_methods object name */
  2,                        /* shared memory is enabled */
  unixClose,                /* xClose method */
  unixLock,                 /* xLock method */
  unixUnlock,               /* xUnlock method */
  unixCheckReservedLock     /* xCheckReservedLock method */
)
/* No-op locking: nolockClose/nolockLock/nolockUnlock. */
IOMETHODS(
  nolockIoFinder,           /* Finder function name */
  nolockIoMethods,          /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  nolockClose,              /* xClose method */
  nolockLock,               /* xLock method */
  nolockUnlock,             /* xUnlock method */
  nolockCheckReservedLock   /* xCheckReservedLock method */
)
/* Dot-file locking: dotlockClose/dotlockLock/dotlockUnlock. */
IOMETHODS(
  dotlockIoFinder,          /* Finder function name */
  dotlockIoMethods,         /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  dotlockClose,             /* xClose method */
  dotlockLock,              /* xLock method */
  dotlockUnlock,            /* xUnlock method */
  dotlockCheckReservedLock  /* xCheckReservedLock method */
)
4362
/* flock() based locking.  Only built when SQLITE_ENABLE_LOCKING_STYLE is
** set and the target is not VxWorks. */
#if SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORKS
IOMETHODS(
  flockIoFinder,            /* Finder function name */
  flockIoMethods,           /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  flockClose,               /* xClose method */
  flockLock,                /* xLock method */
  flockUnlock,              /* xUnlock method */
  flockCheckReservedLock    /* xCheckReservedLock method */
)
#endif
4374
/* Named-semaphore locking.  Only built for VxWorks. */
#if OS_VXWORKS
IOMETHODS(
  semIoFinder,              /* Finder function name */
  semIoMethods,             /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  semClose,                 /* xClose method */
  semLock,                  /* xLock method */
  semUnlock,                /* xUnlock method */
  semCheckReservedLock      /* xCheckReservedLock method */
)
#endif
4386
/* AFP filesystem locking.  Only built on Apple platforms when
** SQLITE_ENABLE_LOCKING_STYLE is set. */
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
IOMETHODS(
  afpIoFinder,              /* Finder function name */
  afpIoMethods,             /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  afpClose,                 /* xClose method */
  afpLock,                  /* xLock method */
  afpUnlock,                /* xUnlock method */
  afpCheckReservedLock      /* xCheckReservedLock method */
)
#endif
4398
/*
** The proxy locking method is a "super-method" in the sense that it
** opens secondary file descriptors for the conch and lock files and
** it uses proxy, dot-file, AFP, and flock() locking methods on those
** secondary files.  For this reason, the division that implements
** proxy locking is located much further down in the file.  But we need
** to go ahead and define the sqlite3_io_methods and finder function
** for proxy locking here.  So we forward declare the I/O methods.
*/
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/* Forward declarations of the four locking methods named in the
** IOMETHODS() instantiation that follows. */
static int proxyClose(sqlite3_file*);
static int proxyLock(sqlite3_file*, int);
static int proxyUnlock(sqlite3_file*, int);
static int proxyCheckReservedLock(sqlite3_file*, int*);
IOMETHODS(
  proxyIoFinder,            /* Finder function name */
  proxyIoMethods,           /* sqlite3_io_methods object name */
  1,                        /* shared memory is disabled */
  proxyClose,               /* xClose method */
  proxyLock,                /* xLock method */
  proxyUnlock,              /* xUnlock method */
  proxyCheckReservedLock    /* xCheckReservedLock method */
)
#endif
4423
/* nfs lockd on OSX 10.3+ doesn't clear write locks when a read lock is set.
**
** This object is the same as the posix one except for two arguments: the
** xUnlock method is nfsUnlock instead of unixUnlock, and the version is 1
** (shared memory disabled) instead of 2.
*/
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
IOMETHODS(
  nfsIoFinder,               /* Finder function name */
  nfsIoMethods,              /* sqlite3_io_methods object name */
  1,                         /* shared memory is disabled */
  unixClose,                 /* xClose method */
  unixLock,                  /* xLock method */
  nfsUnlock,                 /* xUnlock method */
  unixCheckReservedLock      /* xCheckReservedLock method */
)
#endif
4436
4437 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
4438 /*
4439 ** This "finder" function attempts to determine the best locking strategy
4440 ** for the database file "filePath".  It then returns the sqlite3_io_methods
4441 ** object that implements that strategy.
4442 **
4443 ** This is for MacOSX only.
4444 */
4445 static const sqlite3_io_methods *autolockIoFinderImpl(
4446   const char *filePath,    /* name of the database file */
4447   unixFile *pNew           /* open file object for the database file */
4448 ){
4449   static const struct Mapping {
4450     const char *zFilesystem;              /* Filesystem type name */
4451     const sqlite3_io_methods *pMethods;   /* Appropriate locking method */
4452   } aMap[] = {
4453     { "hfs",    &posixIoMethods },
4454     { "ufs",    &posixIoMethods },
4455     { "afpfs",  &afpIoMethods },
4456     { "smbfs",  &afpIoMethods },
4457     { "webdav", &nolockIoMethods },
4458     { 0, 0 }
4459   };
4460   int i;
4461   struct statfs fsInfo;
4462   struct flock lockInfo;
4463
4464   if( !filePath ){
4465     /* If filePath==NULL that means we are dealing with a transient file
4466     ** that does not need to be locked. */
4467     return &nolockIoMethods;
4468   }
4469   if( statfs(filePath, &fsInfo) != -1 ){
4470     if( fsInfo.f_flags & MNT_RDONLY ){
4471       return &nolockIoMethods;
4472     }
4473     for(i=0; aMap[i].zFilesystem; i++){
4474       if( strcmp(fsInfo.f_fstypename, aMap[i].zFilesystem)==0 ){
4475         return aMap[i].pMethods;
4476       }
4477     }
4478   }
4479
4480   /* Default case. Handles, amongst others, "nfs".
4481   ** Test byte-range lock using fcntl(). If the call succeeds,
4482   ** assume that the file-system supports POSIX style locks.
4483   */
4484   lockInfo.l_len = 1;
4485   lockInfo.l_start = 0;
4486   lockInfo.l_whence = SEEK_SET;
4487   lockInfo.l_type = F_RDLCK;
4488   if( osFcntl(pNew->h, F_GETLK, &lockInfo)!=-1 ) {
4489     if( strcmp(fsInfo.f_fstypename, "nfs")==0 ){
4490       return &nfsIoMethods;
4491     } else {
4492       return &posixIoMethods;
4493     }
4494   }else{
4495     return &dotlockIoMethods;
4496   }
4497 }
4498 static const sqlite3_io_methods
4499   *(*const autolockIoFinder)(const char*,unixFile*) = autolockIoFinderImpl;
4500
4501 #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
4502
4503 #if OS_VXWORKS && SQLITE_ENABLE_LOCKING_STYLE
4504 /*
4505 ** This "finder" function attempts to determine the best locking strategy
4506 ** for the database file "filePath".  It then returns the sqlite3_io_methods
4507 ** object that implements that strategy.
4508 **
4509 ** This is for VXWorks only.
4510 */
4511 static const sqlite3_io_methods *autolockIoFinderImpl(
4512   const char *filePath,    /* name of the database file */
4513   unixFile *pNew           /* the open file object */
4514 ){
4515   struct flock lockInfo;
4516
4517   if( !filePath ){
4518     /* If filePath==NULL that means we are dealing with a transient file
4519     ** that does not need to be locked. */
4520     return &nolockIoMethods;
4521   }
4522
4523   /* Test if fcntl() is supported and use POSIX style locks.
4524   ** Otherwise fall back to the named semaphore method.
4525   */
4526   lockInfo.l_len = 1;
4527   lockInfo.l_start = 0;
4528   lockInfo.l_whence = SEEK_SET;
4529   lockInfo.l_type = F_RDLCK;
4530   if( osFcntl(pNew->h, F_GETLK, &lockInfo)!=-1 ) {
4531     return &posixIoMethods;
4532   }else{
4533     return &semIoMethods;
4534   }
4535 }
4536 static const sqlite3_io_methods
4537   *(*const autolockIoFinder)(const char*,unixFile*) = autolockIoFinderImpl;
4538
4539 #endif /* OS_VXWORKS && SQLITE_ENABLE_LOCKING_STYLE */
4540
/*
** An abstract type for a pointer to an I/O method finder function.
**
** A finder takes the database file name and the open unixFile object and
** returns the sqlite3_io_methods object to use for that file.  This is
** the signature of every FINDER pointer produced by IOMETHODS() and of
** autolockIoFinder.
*/
typedef const sqlite3_io_methods *(*finder_type)(const char*,unixFile*);
4545
4546
4547 /****************************************************************************
4548 **************************** sqlite3_vfs methods ****************************
4549 **
4550 ** This division contains the implementation of methods on the
4551 ** sqlite3_vfs object.
4552 */
4553
4554 /*
4555 ** Initialize the contents of the unixFile structure pointed to by pId.
4556 */
4557 static int fillInUnixFile(
4558   sqlite3_vfs *pVfs,      /* Pointer to vfs object */
4559   int h,                  /* Open file descriptor of file being opened */
4560   sqlite3_file *pId,      /* Write to the unixFile structure here */
4561   const char *zFilename,  /* Name of the file being opened */
4562   int ctrlFlags           /* Zero or more UNIXFILE_* values */
4563 ){
4564   const sqlite3_io_methods *pLockingStyle;
4565   unixFile *pNew = (unixFile *)pId;
4566   int rc = SQLITE_OK;
4567
4568   assert( pNew->pInode==NULL );
4569
4570   /* Usually the path zFilename should not be a relative pathname. The
4571   ** exception is when opening the proxy "conch" file in builds that
4572   ** include the special Apple locking styles.
4573   */
4574 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
4575   assert( zFilename==0 || zFilename[0]=='/'
4576     || pVfs->pAppData==(void*)&autolockIoFinder );
4577 #else
4578   assert( zFilename==0 || zFilename[0]=='/' );
4579 #endif
4580
4581   /* No locking occurs in temporary files */
4582   assert( zFilename!=0 || (ctrlFlags & UNIXFILE_NOLOCK)!=0 );
4583
4584   OSTRACE(("OPEN    %-3d %s\n", h, zFilename));
4585   pNew->h = h;
4586   pNew->pVfs = pVfs;
4587   pNew->zPath = zFilename;
4588   pNew->ctrlFlags = (u8)ctrlFlags;
4589   if( sqlite3_uri_boolean(((ctrlFlags & UNIXFILE_URI) ? zFilename : 0),
4590                            "psow", SQLITE_POWERSAFE_OVERWRITE) ){
4591     pNew->ctrlFlags |= UNIXFILE_PSOW;
4592   }
4593   if( memcmp(pVfs->zName,"unix-excl",10)==0 ){
4594     pNew->ctrlFlags |= UNIXFILE_EXCL;
4595   }
4596
4597 #if OS_VXWORKS
4598   pNew->pId = vxworksFindFileId(zFilename);
4599   if( pNew->pId==0 ){
4600     ctrlFlags |= UNIXFILE_NOLOCK;
4601     rc = SQLITE_NOMEM;
4602   }
4603 #endif
4604
4605   if( ctrlFlags & UNIXFILE_NOLOCK ){
4606     pLockingStyle = &nolockIoMethods;
4607   }else{
4608     pLockingStyle = (**(finder_type*)pVfs->pAppData)(zFilename, pNew);
4609 #if SQLITE_ENABLE_LOCKING_STYLE
4610     /* Cache zFilename in the locking context (AFP and dotlock override) for
4611     ** proxyLock activation is possible (remote proxy is based on db name)
4612     ** zFilename remains valid until file is closed, to support */
4613     pNew->lockingContext = (void*)zFilename;
4614 #endif
4615   }
4616
4617   if( pLockingStyle == &posixIoMethods
4618 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
4619     || pLockingStyle == &nfsIoMethods
4620 #endif
4621   ){
4622     unixEnterMutex();
4623     rc = findInodeInfo(pNew, &pNew->pInode);
4624     if( rc!=SQLITE_OK ){
4625       /* If an error occured in findInodeInfo(), close the file descriptor
4626       ** immediately, before releasing the mutex. findInodeInfo() may fail
4627       ** in two scenarios:
4628       **
4629       **   (a) A call to fstat() failed.
4630       **   (b) A malloc failed.
4631       **
4632       ** Scenario (b) may only occur if the process is holding no other
4633       ** file descriptors open on the same file. If there were other file
4634       ** descriptors on this file, then no malloc would be required by
4635       ** findInodeInfo(). If this is the case, it is quite safe to close
4636       ** handle h - as it is guaranteed that no posix locks will be released
4637       ** by doing so.
4638       **
4639       ** If scenario (a) caused the error then things are not so safe. The
4640       ** implicit assumption here is that if fstat() fails, things are in
4641       ** such bad shape that dropping a lock or two doesn't matter much.
4642       */
4643       robust_close(pNew, h, __LINE__);
4644       h = -1;
4645     }
4646     unixLeaveMutex();
4647   }
4648
4649 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
4650   else if( pLockingStyle == &afpIoMethods ){
4651     /* AFP locking uses the file path so it needs to be included in
4652     ** the afpLockingContext.
4653     */
4654     afpLockingContext *pCtx;
4655     pNew->lockingContext = pCtx = sqlite3_malloc( sizeof(*pCtx) );
4656     if( pCtx==0 ){
4657       rc = SQLITE_NOMEM;
4658     }else{
4659       /* NB: zFilename exists and remains valid until the file is closed
4660       ** according to requirement F11141.  So we do not need to make a
4661       ** copy of the filename. */
4662       pCtx->dbPath = zFilename;
4663       pCtx->reserved = 0;
4664       srandomdev();
4665       unixEnterMutex();
4666       rc = findInodeInfo(pNew, &pNew->pInode);
4667       if( rc!=SQLITE_OK ){
4668         sqlite3_free(pNew->lockingContext);
4669         robust_close(pNew, h, __LINE__);
4670         h = -1;
4671       }
4672       unixLeaveMutex();
4673     }
4674   }
4675 #endif
4676
4677   else if( pLockingStyle == &dotlockIoMethods ){
4678     /* Dotfile locking uses the file path so it needs to be included in
4679     ** the dotlockLockingContext
4680     */
4681     char *zLockFile;
4682     int nFilename;
4683     assert( zFilename!=0 );
4684     nFilename = (int)strlen(zFilename) + 6;
4685     zLockFile = (char *)sqlite3_malloc(nFilename);
4686     if( zLockFile==0 ){
4687       rc = SQLITE_NOMEM;
4688     }else{
4689       sqlite3_snprintf(nFilename, zLockFile, "%s" DOTLOCK_SUFFIX, zFilename);
4690     }
4691     pNew->lockingContext = zLockFile;
4692   }
4693
4694 #if OS_VXWORKS
4695   else if( pLockingStyle == &semIoMethods ){
4696     /* Named semaphore locking uses the file path so it needs to be
4697     ** included in the semLockingContext
4698     */
4699     unixEnterMutex();
4700     rc = findInodeInfo(pNew, &pNew->pInode);
4701     if( (rc==SQLITE_OK) && (pNew->pInode->pSem==NULL) ){
4702       char *zSemName = pNew->pInode->aSemName;
4703       int n;
4704       sqlite3_snprintf(MAX_PATHNAME, zSemName, "/%s.sem",
4705                        pNew->pId->zCanonicalName);
4706       for( n=1; zSemName[n]; n++ )
4707         if( zSemName[n]=='/' ) zSemName[n] = '_';
4708       pNew->pInode->pSem = sem_open(zSemName, O_CREAT, 0666, 1);
4709       if( pNew->pInode->pSem == SEM_FAILED ){
4710         rc = SQLITE_NOMEM;
4711         pNew->pInode->aSemName[0] = '\0';
4712       }
4713     }
4714     unixLeaveMutex();
4715   }
4716 #endif
4717
4718   pNew->lastErrno = 0;
4719 #if OS_VXWORKS
4720   if( rc!=SQLITE_OK ){
4721     if( h>=0 ) robust_close(pNew, h, __LINE__);
4722     h = -1;
4723     osUnlink(zFilename);
4724     isDelete = 0;
4725   }
4726   if( isDelete ) pNew->ctrlFlags |= UNIXFILE_DELETE;
4727 #endif
4728   if( rc!=SQLITE_OK ){
4729     if( h>=0 ) robust_close(pNew, h, __LINE__);
4730   }else{
4731     pNew->pMethod = pLockingStyle;
4732     OpenCounter(+1);
4733   }
4734   return rc;
4735 }
4736
4737 /*
4738 ** Return the name of a directory in which to put temporary files.
4739 ** If no suitable temporary file directory can be found, return NULL.
4740 */
4741 static const char *unixTempFileDir(void){
4742   static const char *azDirs[] = {
4743      0,
4744      0,
4745      "/var/tmp",
4746      "/usr/tmp",
4747      "/tmp",
4748      0        /* List terminator */
4749   };
4750   unsigned int i;
4751   struct stat buf;
4752   const char *zDir = 0;
4753
4754   azDirs[0] = sqlite3_temp_directory;
4755   if( !azDirs[1] ) azDirs[1] = getenv("TMPDIR");
4756   for(i=0; i<sizeof(azDirs)/sizeof(azDirs[0]); zDir=azDirs[i++]){
4757     if( zDir==0 ) continue;
4758     if( osStat(zDir, &buf) ) continue;
4759     if( !S_ISDIR(buf.st_mode) ) continue;
4760     if( osAccess(zDir, 07) ) continue;
4761     break;
4762   }
4763   return zDir;
4764 }
4765
4766 /*
4767 ** Create a temporary file name in zBuf.  zBuf must be allocated
4768 ** by the calling process and must be big enough to hold at least
4769 ** pVfs->mxPathname bytes.
4770 */
4771 static int unixGetTempname(int nBuf, char *zBuf){
4772   static const unsigned char zChars[] =
4773     "abcdefghijklmnopqrstuvwxyz"
4774     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
4775     "0123456789";
4776   unsigned int i, j;
4777   const char *zDir;
4778
4779   /* It's odd to simulate an io-error here, but really this is just
4780   ** using the io-error infrastructure to test that SQLite handles this
4781   ** function failing.
4782   */
4783   SimulateIOError( return SQLITE_IOERR );
4784
4785   zDir = unixTempFileDir();
4786   if( zDir==0 ) zDir = ".";
4787
4788   /* Check that the output buffer is large enough for the temporary file
4789   ** name. If it is not, return SQLITE_ERROR.
4790   */
4791   if( (strlen(zDir) + strlen(SQLITE_TEMP_FILE_PREFIX) + 18) >= (size_t)nBuf ){
4792     return SQLITE_ERROR;
4793   }
4794
4795   do{
4796     sqlite3_snprintf(nBuf-18, zBuf, "%s/"SQLITE_TEMP_FILE_PREFIX, zDir);
4797     j = (int)strlen(zBuf);
4798     sqlite3_randomness(15, &zBuf[j]);
4799     for(i=0; i<15; i++, j++){
4800       zBuf[j] = (char)zChars[ ((unsigned char)zBuf[j])%(sizeof(zChars)-1) ];
4801     }
4802     zBuf[j] = 0;
4803     zBuf[j+1] = 0;
4804   }while( osAccess(zBuf,0)==0 );
4805   return SQLITE_OK;
4806 }
4807
4808 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
4809 /*
4810 ** Routine to transform a unixFile into a proxy-locking unixFile.
4811 ** Implementation in the proxy-lock division, but used by unixOpen()
4812 ** if SQLITE_PREFER_PROXY_LOCKING is defined.
4813 */
4814 static int proxyTransformUnixFile(unixFile*, const char*);
4815 #endif
4816
4817 /*
4818 ** Search for an unused file descriptor that was opened on the database
4819 ** file (not a journal or master-journal file) identified by pathname
4820 ** zPath with SQLITE_OPEN_XXX flags matching those passed as the second
4821 ** argument to this function.
4822 **
4823 ** Such a file descriptor may exist if a database connection was closed
4824 ** but the associated file descriptor could not be closed because some
4825 ** other file descriptor open on the same file is holding a file-lock.
4826 ** Refer to comments in the unixClose() function and the lengthy comment
4827 ** describing "Posix Advisory Locking" at the start of this file for
4828 ** further details. Also, ticket #4018.
4829 **
4830 ** If a suitable file descriptor is found, then it is returned. If no
4831 ** such file descriptor is located, -1 is returned.
4832 */
4833 static UnixUnusedFd *findReusableFd(const char *zPath, int flags){
4834   UnixUnusedFd *pUnused = 0;
4835
4836   /* Do not search for an unused file descriptor on vxworks. Not because
4837   ** vxworks would not benefit from the change (it might, we're not sure),
4838   ** but because no way to test it is currently available. It is better
4839   ** not to risk breaking vxworks support for the sake of such an obscure
4840   ** feature.  */
4841 #if !OS_VXWORKS
4842   struct stat sStat;                   /* Results of stat() call */
4843
4844   /* A stat() call may fail for various reasons. If this happens, it is
4845   ** almost certain that an open() call on the same path will also fail.
4846   ** For this reason, if an error occurs in the stat() call here, it is
4847   ** ignored and -1 is returned. The caller will try to open a new file
4848   ** descriptor on the same path, fail, and return an error to SQLite.
4849   **
4850   ** Even if a subsequent open() call does succeed, the consequences of
4851   ** not searching for a resusable file descriptor are not dire.  */
4852   if( 0==osStat(zPath, &sStat) ){
4853     unixInodeInfo *pInode;
4854
4855     unixEnterMutex();
4856     pInode = inodeList;
4857     while( pInode && (pInode->fileId.dev!=sStat.st_dev
4858                      || pInode->fileId.ino!=sStat.st_ino) ){
4859        pInode = pInode->pNext;
4860     }
4861     if( pInode ){
4862       UnixUnusedFd **pp;
4863       for(pp=&pInode->pUnused; *pp && (*pp)->flags!=flags; pp=&((*pp)->pNext));
4864       pUnused = *pp;
4865       if( pUnused ){
4866         *pp = pUnused->pNext;
4867       }
4868     }
4869     unixLeaveMutex();
4870   }
4871 #endif    /* if !OS_VXWORKS */
4872   return pUnused;
4873 }
4874
4875 /*
4876 ** This function is called by unixOpen() to determine the unix permissions
4877 ** to create new files with. If no error occurs, then SQLITE_OK is returned
4878 ** and a value suitable for passing as the third argument to open(2) is
4879 ** written to *pMode. If an IO error occurs, an SQLite error code is
4880 ** returned and the value of *pMode is not modified.
4881 **
4882 ** If the file being opened is a temporary file, it is always created with
4883 ** the octal permissions 0600 (read/writable by owner only). If the file
4884 ** is a database or master journal file, it is created with the permissions
4885 ** mask SQLITE_DEFAULT_FILE_PERMISSIONS.
4886 **
4887 ** Finally, if the file being opened is a WAL or regular journal file, then
4888 ** this function queries the file-system for the permissions on the
4889 ** corresponding database file and sets *pMode to this value. Whenever
4890 ** possible, WAL and journal files are created using the same permissions
4891 ** as the associated database file.
4892 **
4893 ** If the SQLITE_ENABLE_8_3_NAMES option is enabled, then the
4894 ** original filename is unavailable.  But 8_3_NAMES is only used for
4895 ** FAT filesystems and permissions do not matter there, so just use
4896 ** the default permissions.
4897 */
4898 static int findCreateFileMode(
4899   const char *zPath,              /* Path of file (possibly) being created */
4900   int flags,                      /* Flags passed as 4th argument to xOpen() */
4901   mode_t *pMode                   /* OUT: Permissions to open file with */
4902 ){
4903   int rc = SQLITE_OK;             /* Return Code */
4904   *pMode = SQLITE_DEFAULT_FILE_PERMISSIONS;
4905   if( flags & (SQLITE_OPEN_WAL|SQLITE_OPEN_MAIN_JOURNAL) ){
4906     char zDb[MAX_PATHNAME+1];     /* Database file path */
4907     int nDb;                      /* Number of valid bytes in zDb */
4908     struct stat sStat;            /* Output of stat() on database file */
4909
4910     /* zPath is a path to a WAL or journal file. The following block derives
4911     ** the path to the associated database file from zPath. This block handles
4912     ** the following naming conventions:
4913     **
4914     **   "<path to db>-journal"
4915     **   "<path to db>-wal"
4916     **   "<path to db>-journalNN"
4917     **   "<path to db>-walNN"
4918     **
4919     ** where NN is a decimal number. The NN naming schemes are
4920     ** used by the test_multiplex.c module.
4921     */
4922     nDb = sqlite3Strlen30(zPath) - 1;
4923 #ifdef SQLITE_ENABLE_8_3_NAMES
4924     while( nDb>0 && sqlite3Isalnum(zPath[nDb]) ) nDb--;
4925     if( nDb==0 || zPath[nDb]!='-' ) return SQLITE_OK;
4926 #else
4927     while( zPath[nDb]!='-' ){
4928       assert( nDb>0 );
4929       assert( zPath[nDb]!='\n' );
4930       nDb--;
4931     }
4932 #endif
4933     memcpy(zDb, zPath, nDb);
4934     zDb[nDb] = '\0';
4935
4936     if( 0==osStat(zDb, &sStat) ){
4937       *pMode = sStat.st_mode & 0777;
4938     }else{
4939       rc = SQLITE_IOERR_FSTAT;
4940     }
4941   }else if( flags & SQLITE_OPEN_DELETEONCLOSE ){
4942     *pMode = 0600;
4943   }
4944   return rc;
4945 }
4946
4947 /*
4948 ** Open the file zPath.
4949 **
4950 ** Previously, the SQLite OS layer used three functions in place of this
4951 ** one:
4952 **
4953 **     sqlite3OsOpenReadWrite();
4954 **     sqlite3OsOpenReadOnly();
4955 **     sqlite3OsOpenExclusive();
4956 **
4957 ** These calls correspond to the following combinations of flags:
4958 **
4959 **     ReadWrite() ->     (READWRITE | CREATE)
4960 **     ReadOnly()  ->     (READONLY)
4961 **     OpenExclusive() -> (READWRITE | CREATE | EXCLUSIVE)
4962 **
4963 ** The old OpenExclusive() accepted a boolean argument - "delFlag". If
4964 ** true, the file was configured to be automatically deleted when the
4965 ** file handle closed. To achieve the same effect using this new
4966 ** interface, add the DELETEONCLOSE flag to those specified above for
4967 ** OpenExclusive().
4968 */
4969 static int unixOpen(
4970   sqlite3_vfs *pVfs,           /* The VFS for which this is the xOpen method */
4971   const char *zPath,           /* Pathname of file to be opened */
4972   sqlite3_file *pFile,         /* The file descriptor to be filled in */
4973   int flags,                   /* Input flags to control the opening */
4974   int *pOutFlags               /* Output flags returned to SQLite core */
4975 ){
4976   unixFile *p = (unixFile *)pFile;
4977   int fd = -1;                   /* File descriptor returned by open() */
4978   int openFlags = 0;             /* Flags to pass to open() */
4979   int eType = flags&0xFFFFFF00;  /* Type of file to open */
4980   int noLock;                    /* True to omit locking primitives */
4981   int rc = SQLITE_OK;            /* Function Return Code */
4982   int ctrlFlags = 0;             /* UNIXFILE_* flags */
4983
4984   int isExclusive  = (flags & SQLITE_OPEN_EXCLUSIVE);
4985   int isDelete     = (flags & SQLITE_OPEN_DELETEONCLOSE);
4986   int isCreate     = (flags & SQLITE_OPEN_CREATE);
4987   int isReadonly   = (flags & SQLITE_OPEN_READONLY);
4988   int isReadWrite  = (flags & SQLITE_OPEN_READWRITE);
4989 #if SQLITE_ENABLE_LOCKING_STYLE
4990   int isAutoProxy  = (flags & SQLITE_OPEN_AUTOPROXY);
4991 #endif
4992 #if defined(__APPLE__) || SQLITE_ENABLE_LOCKING_STYLE
4993   struct statfs fsInfo;
4994 #endif
4995
4996   /* If creating a master or main-file journal, this function will open
4997   ** a file-descriptor on the directory too. The first time unixSync()
4998   ** is called the directory file descriptor will be fsync()ed and close()d.
4999   */
5000   int syncDir = (isCreate && (
5001         eType==SQLITE_OPEN_MASTER_JOURNAL
5002      || eType==SQLITE_OPEN_MAIN_JOURNAL
5003      || eType==SQLITE_OPEN_WAL
5004   ));
5005
5006   /* If argument zPath is a NULL pointer, this function is required to open
5007   ** a temporary file. Use this buffer to store the file name in.
5008   */
5009   char zTmpname[MAX_PATHNAME+2];
5010   const char *zName = zPath;
5011
5012   /* Check the following statements are true:
5013   **
5014   **   (a) Exactly one of the READWRITE and READONLY flags must be set, and
5015   **   (b) if CREATE is set, then READWRITE must also be set, and
5016   **   (c) if EXCLUSIVE is set, then CREATE must also be set.
5017   **   (d) if DELETEONCLOSE is set, then CREATE must also be set.
5018   */
5019   assert((isReadonly==0 || isReadWrite==0) && (isReadWrite || isReadonly));
5020   assert(isCreate==0 || isReadWrite);
5021   assert(isExclusive==0 || isCreate);
5022   assert(isDelete==0 || isCreate);
5023
5024   /* The main DB, main journal, WAL file and master journal are never
5025   ** automatically deleted. Nor are they ever temporary files.  */
5026   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_MAIN_DB );
5027   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_MAIN_JOURNAL );
5028   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_MASTER_JOURNAL );
5029   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_WAL );
5030
5031   /* Assert that the upper layer has set one of the "file-type" flags. */
5032   assert( eType==SQLITE_OPEN_MAIN_DB      || eType==SQLITE_OPEN_TEMP_DB
5033        || eType==SQLITE_OPEN_MAIN_JOURNAL || eType==SQLITE_OPEN_TEMP_JOURNAL
5034        || eType==SQLITE_OPEN_SUBJOURNAL   || eType==SQLITE_OPEN_MASTER_JOURNAL
5035        || eType==SQLITE_OPEN_TRANSIENT_DB || eType==SQLITE_OPEN_WAL
5036   );
5037
5038   memset(p, 0, sizeof(unixFile));
5039
5040   if( eType==SQLITE_OPEN_MAIN_DB ){
5041     UnixUnusedFd *pUnused;
5042     pUnused = findReusableFd(zName, flags);
5043     if( pUnused ){
5044       fd = pUnused->fd;
5045     }else{
5046       pUnused = sqlite3_malloc(sizeof(*pUnused));
5047       if( !pUnused ){
5048         return SQLITE_NOMEM;
5049       }
5050     }
5051     p->pUnused = pUnused;
5052
5053     /* Database filenames are double-zero terminated if they are not
5054     ** URIs with parameters.  Hence, they can always be passed into
5055     ** sqlite3_uri_parameter(). */
5056     assert( (flags & SQLITE_OPEN_URI) || zName[strlen(zName)+1]==0 );
5057
5058   }else if( !zName ){
5059     /* If zName is NULL, the upper layer is requesting a temp file. */
5060     assert(isDelete && !syncDir);
5061     rc = unixGetTempname(MAX_PATHNAME+2, zTmpname);
5062     if( rc!=SQLITE_OK ){
5063       return rc;
5064     }
5065     zName = zTmpname;
5066
5067     /* Generated temporary filenames are always double-zero terminated
5068     ** for use by sqlite3_uri_parameter(). */
5069     assert( zName[strlen(zName)+1]==0 );
5070   }
5071
5072   /* Determine the value of the flags parameter passed to POSIX function
5073   ** open(). These must be calculated even if open() is not called, as
5074   ** they may be stored as part of the file handle and used by the
5075   ** 'conch file' locking functions later on.  */
5076   if( isReadonly )  openFlags |= O_RDONLY;
5077   if( isReadWrite ) openFlags |= O_RDWR;
5078   if( isCreate )    openFlags |= O_CREAT;
5079   if( isExclusive ) openFlags |= (O_EXCL|O_NOFOLLOW);
5080   openFlags |= (O_LARGEFILE|O_BINARY);
5081
5082   if( fd<0 ){
5083     mode_t openMode;              /* Permissions to create file with */
5084     rc = findCreateFileMode(zName, flags, &openMode);
5085     if( rc!=SQLITE_OK ){
5086       assert( !p->pUnused );
5087       assert( eType==SQLITE_OPEN_WAL || eType==SQLITE_OPEN_MAIN_JOURNAL );
5088       return rc;
5089     }
5090     fd = robust_open(zName, openFlags, openMode);
5091     OSTRACE(("OPENX   %-3d %s 0%o\n", fd, zName, openFlags));
5092     if( fd<0 && errno!=EISDIR && isReadWrite && !isExclusive ){
5093       /* Failed to open the file for read/write access. Try read-only. */
5094       flags &= ~(SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE);
5095       openFlags &= ~(O_RDWR|O_CREAT);
5096       flags |= SQLITE_OPEN_READONLY;
5097       openFlags |= O_RDONLY;
5098       isReadonly = 1;
5099       fd = robust_open(zName, openFlags, openMode);
5100     }
5101     if( fd<0 ){
5102       rc = unixLogError(SQLITE_CANTOPEN_BKPT, "open", zName);
5103       goto open_finished;
5104     }
5105   }
5106   assert( fd>=0 );
5107   if( pOutFlags ){
5108     *pOutFlags = flags;
5109   }
5110
5111   if( p->pUnused ){
5112     p->pUnused->fd = fd;
5113     p->pUnused->flags = flags;
5114   }
5115
5116   if( isDelete ){
5117 #if OS_VXWORKS
5118     zPath = zName;
5119 #else
5120     osUnlink(zName);
5121 #endif
5122   }
5123 #if SQLITE_ENABLE_LOCKING_STYLE
5124   else{
5125     p->openFlags = openFlags;
5126   }
5127 #endif
5128
5129 #ifdef FD_CLOEXEC
5130   osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
5131 #endif
5132
5133   noLock = eType!=SQLITE_OPEN_MAIN_DB;
5134
5135
5136 #if defined(__APPLE__) || SQLITE_ENABLE_LOCKING_STYLE
5137   if( fstatfs(fd, &fsInfo) == -1 ){
5138     ((unixFile*)pFile)->lastErrno = errno;
5139     robust_close(p, fd, __LINE__);
5140     return SQLITE_IOERR_ACCESS;
5141   }
5142   if (0 == strncmp("msdos", fsInfo.f_fstypename, 5)) {
5143     ((unixFile*)pFile)->fsFlags |= SQLITE_FSFLAGS_IS_MSDOS;
5144   }
5145 #endif
5146
5147   /* Set up appropriate ctrlFlags */
5148   if( isDelete )                ctrlFlags |= UNIXFILE_DELETE;
5149   if( isReadonly )              ctrlFlags |= UNIXFILE_RDONLY;
5150   if( noLock )                  ctrlFlags |= UNIXFILE_NOLOCK;
5151   if( syncDir )                 ctrlFlags |= UNIXFILE_DIRSYNC;
5152   if( flags & SQLITE_OPEN_URI ) ctrlFlags |= UNIXFILE_URI;
5153
5154 #if SQLITE_ENABLE_LOCKING_STYLE
5155 #if SQLITE_PREFER_PROXY_LOCKING
5156   isAutoProxy = 1;
5157 #endif
5158   if( isAutoProxy && (zPath!=NULL) && (!noLock) && pVfs->xOpen ){
5159     char *envforce = getenv("SQLITE_FORCE_PROXY_LOCKING");
5160     int useProxy = 0;
5161
5162     /* SQLITE_FORCE_PROXY_LOCKING==1 means force always use proxy, 0 means
5163     ** never use proxy, NULL means use proxy for non-local files only.  */
5164     if( envforce!=NULL ){
5165       useProxy = atoi(envforce)>0;
5166     }else{
5167       if( statfs(zPath, &fsInfo) == -1 ){
5168         /* In theory, the close(fd) call is sub-optimal. If the file opened
5169         ** with fd is a database file, and there are other connections open
5170         ** on that file that are currently holding advisory locks on it,
5171         ** then the call to close() will cancel those locks. In practice,
5172         ** we're assuming that statfs() doesn't fail very often. At least
5173         ** not while other file descriptors opened by the same process on
5174         ** the same file are working.  */
5175         p->lastErrno = errno;
5176         robust_close(p, fd, __LINE__);
5177         rc = SQLITE_IOERR_ACCESS;
5178         goto open_finished;
5179       }
5180       useProxy = !(fsInfo.f_flags&MNT_LOCAL);
5181     }
5182     if( useProxy ){
5183       rc = fillInUnixFile(pVfs, fd, pFile, zPath, ctrlFlags);
5184       if( rc==SQLITE_OK ){
5185         rc = proxyTransformUnixFile((unixFile*)pFile, ":auto:");
5186         if( rc!=SQLITE_OK ){
5187           /* Use unixClose to clean up the resources added in fillInUnixFile
5188           ** and clear all the structure's references.  Specifically,
5189           ** pFile->pMethods will be NULL so sqlite3OsClose will be a no-op
5190           */
5191           unixClose(pFile);
5192           return rc;
5193         }
5194       }
5195       goto open_finished;
5196     }
5197   }
5198 #endif
5199
5200   rc = fillInUnixFile(pVfs, fd, pFile, zPath, ctrlFlags);
5201
5202 open_finished:
5203   if( rc!=SQLITE_OK ){
5204     sqlite3_free(p->pUnused);
5205   }
5206   return rc;
5207 }
5208
5209
5210 /*
5211 ** Delete the file at zPath. If the dirSync argument is true, fsync()
5212 ** the directory after deleting the file.
5213 */
5214 static int unixDelete(
5215   sqlite3_vfs *NotUsed,     /* VFS containing this as the xDelete method */
5216   const char *zPath,        /* Name of file to be deleted */
5217   int dirSync               /* If true, fsync() directory after deleting file */
5218 ){
5219   int rc = SQLITE_OK;
5220   UNUSED_PARAMETER(NotUsed);
5221   SimulateIOError(return SQLITE_IOERR_DELETE);
5222   if( osUnlink(zPath)==(-1) && errno!=ENOENT ){
5223     return unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath);
5224   }
5225 #ifndef SQLITE_DISABLE_DIRSYNC
5226   if( (dirSync & 1)!=0 ){
5227     int fd;
5228     rc = osOpenDirectory(zPath, &fd);
5229     if( rc==SQLITE_OK ){
5230 #if OS_VXWORKS
5231       if( fsync(fd)==-1 )
5232 #else
5233       if( fsync(fd) )
5234 #endif
5235       {
5236         rc = unixLogError(SQLITE_IOERR_DIR_FSYNC, "fsync", zPath);
5237       }
5238       robust_close(0, fd, __LINE__);
5239     }else if( rc==SQLITE_CANTOPEN ){
5240       rc = SQLITE_OK;
5241     }
5242   }
5243 #endif
5244   return rc;
5245 }
5246
5247 /*
5248 ** Test the existance of or access permissions of file zPath. The
5249 ** test performed depends on the value of flags:
5250 **
5251 **     SQLITE_ACCESS_EXISTS: Return 1 if the file exists
5252 **     SQLITE_ACCESS_READWRITE: Return 1 if the file is read and writable.
5253 **     SQLITE_ACCESS_READONLY: Return 1 if the file is readable.
5254 **
5255 ** Otherwise return 0.
5256 */
5257 static int unixAccess(
5258   sqlite3_vfs *NotUsed,   /* The VFS containing this xAccess method */
5259   const char *zPath,      /* Path of the file to examine */
5260   int flags,              /* What do we want to learn about the zPath file? */
5261   int *pResOut            /* Write result boolean here */
5262 ){
5263   int amode = 0;
5264   UNUSED_PARAMETER(NotUsed);
5265   SimulateIOError( return SQLITE_IOERR_ACCESS; );
5266   switch( flags ){
5267     case SQLITE_ACCESS_EXISTS:
5268       amode = F_OK;
5269       break;
5270     case SQLITE_ACCESS_READWRITE:
5271       amode = W_OK|R_OK;
5272       break;
5273     case SQLITE_ACCESS_READ:
5274       amode = R_OK;
5275       break;
5276
5277     default:
5278       assert(!"Invalid flags argument");
5279   }
5280   *pResOut = (osAccess(zPath, amode)==0);
5281   if( flags==SQLITE_ACCESS_EXISTS && *pResOut ){
5282     struct stat buf;
5283     if( 0==osStat(zPath, &buf) && buf.st_size==0 ){
5284       *pResOut = 0;
5285     }
5286   }
5287   return SQLITE_OK;
5288 }
5289
5290
5291 /*
5292 ** Turn a relative pathname into a full pathname. The relative path
5293 ** is stored as a nul-terminated string in the buffer pointed to by
5294 ** zPath.
5295 **
5296 ** zOut points to a buffer of at least sqlite3_vfs.mxPathname bytes
5297 ** (in this case, MAX_PATHNAME bytes). The full-path is written to
5298 ** this buffer before returning.
5299 */
5300 static int unixFullPathname(
5301   sqlite3_vfs *pVfs,            /* Pointer to vfs object */
5302   const char *zPath,            /* Possibly relative input path */
5303   int nOut,                     /* Size of output buffer in bytes */
5304   char *zOut                    /* Output buffer */
5305 ){
5306
5307   /* It's odd to simulate an io-error here, but really this is just
5308   ** using the io-error infrastructure to test that SQLite handles this
5309   ** function failing. This function could fail if, for example, the
5310   ** current working directory has been unlinked.
5311   */
5312   SimulateIOError( return SQLITE_ERROR );
5313
5314   assert( pVfs->mxPathname==MAX_PATHNAME );
5315   UNUSED_PARAMETER(pVfs);
5316
5317   zOut[nOut-1] = '\0';
5318   if( zPath[0]=='/' ){
5319     sqlite3_snprintf(nOut, zOut, "%s", zPath);
5320   }else{
5321     int nCwd;
5322     if( osGetcwd(zOut, nOut-1)==0 ){
5323       return unixLogError(SQLITE_CANTOPEN_BKPT, "getcwd", zPath);
5324     }
5325     nCwd = (int)strlen(zOut);
5326     sqlite3_snprintf(nOut-nCwd, &zOut[nCwd], "/%s", zPath);
5327   }
5328   return SQLITE_OK;
5329 }
5330
5331
5332 #ifndef SQLITE_OMIT_LOAD_EXTENSION
5333 /*
5334 ** Interfaces for opening a shared library, finding entry points
5335 ** within the shared library, and closing the shared library.
5336 */
5337 #include <dlfcn.h>
5338 static void *unixDlOpen(sqlite3_vfs *NotUsed, const char *zFilename){
5339   UNUSED_PARAMETER(NotUsed);
5340   return dlopen(zFilename, RTLD_NOW | RTLD_GLOBAL);
5341 }
5342
5343 /*
5344 ** SQLite calls this function immediately after a call to unixDlSym() or
5345 ** unixDlOpen() fails (returns a null pointer). If a more detailed error
5346 ** message is available, it is written to zBufOut. If no error message
5347 ** is available, zBufOut is left unmodified and SQLite uses a default
5348 ** error message.
5349 */
5350 static void unixDlError(sqlite3_vfs *NotUsed, int nBuf, char *zBufOut){
5351   const char *zErr;
5352   UNUSED_PARAMETER(NotUsed);
5353   unixEnterMutex();
5354   zErr = dlerror();
5355   if( zErr ){
5356     sqlite3_snprintf(nBuf, zBufOut, "%s", zErr);
5357   }
5358   unixLeaveMutex();
5359 }
5360 static void (*unixDlSym(sqlite3_vfs *NotUsed, void *p, const char*zSym))(void){
5361   /*
5362   ** GCC with -pedantic-errors says that C90 does not allow a void* to be
5363   ** cast into a pointer to a function.  And yet the library dlsym() routine
5364   ** returns a void* which is really a pointer to a function.  So how do we
5365   ** use dlsym() with -pedantic-errors?
5366   **
5367   ** Variable x below is defined to be a pointer to a function taking
5368   ** parameters void* and const char* and returning a pointer to a function.
5369   ** We initialize x by assigning it a pointer to the dlsym() function.
5370   ** (That assignment requires a cast.)  Then we call the function that
5371   ** x points to.
5372   **
5373   ** This work-around is unlikely to work correctly on any system where
5374   ** you really cannot cast a function pointer into void*.  But then, on the
5375   ** other hand, dlsym() will not work on such a system either, so we have
5376   ** not really lost anything.
5377   */
5378   void (*(*x)(void*,const char*))(void);
5379   UNUSED_PARAMETER(NotUsed);
5380   x = (void(*(*)(void*,const char*))(void))dlsym;
5381   return (*x)(p, zSym);
5382 }
5383 static void unixDlClose(sqlite3_vfs *NotUsed, void *pHandle){
5384   UNUSED_PARAMETER(NotUsed);
5385   dlclose(pHandle);
5386 }
5387 #else /* if SQLITE_OMIT_LOAD_EXTENSION is defined: */
5388   #define unixDlOpen  0
5389   #define unixDlError 0
5390   #define unixDlSym   0
5391   #define unixDlClose 0
5392 #endif
5393
5394 /*
5395 ** Write nBuf bytes of random data to the supplied buffer zBuf.
5396 */
5397 static int unixRandomness(sqlite3_vfs *NotUsed, int nBuf, char *zBuf){
5398   UNUSED_PARAMETER(NotUsed);
5399   assert((size_t)nBuf>=(sizeof(time_t)+sizeof(int)));
5400
5401   /* We have to initialize zBuf to prevent valgrind from reporting
5402   ** errors.  The reports issued by valgrind are incorrect - we would
5403   ** prefer that the randomness be increased by making use of the
5404   ** uninitialized space in zBuf - but valgrind errors tend to worry
5405   ** some users.  Rather than argue, it seems easier just to initialize
5406   ** the whole array and silence valgrind, even if that means less randomness
5407   ** in the random seed.
5408   **
5409   ** When testing, initializing zBuf[] to zero is all we do.  That means
5410   ** that we always use the same random number sequence.  This makes the
5411   ** tests repeatable.
5412   */
5413   memset(zBuf, 0, nBuf);
5414 #if !defined(SQLITE_TEST)
5415   {
5416     int pid, fd;
5417     fd = robust_open("/dev/urandom", O_RDONLY, 0);
5418     if( fd<0 ){
5419       time_t t;
5420       time(&t);
5421       memcpy(zBuf, &t, sizeof(t));
5422       pid = getpid();
5423       memcpy(&zBuf[sizeof(t)], &pid, sizeof(pid));
5424       assert( sizeof(t)+sizeof(pid)<=(size_t)nBuf );
5425       nBuf = sizeof(t) + sizeof(pid);
5426     }else{
5427       do{ nBuf = osRead(fd, zBuf, nBuf); }while( nBuf<0 && errno==EINTR );
5428       robust_close(0, fd, __LINE__);
5429     }
5430   }
5431 #endif
5432   return nBuf;
5433 }
5434
5435
5436 /*
5437 ** Sleep for a little while.  Return the amount of time slept.
5438 ** The argument is the number of microseconds we want to sleep.
5439 ** The return value is the number of microseconds of sleep actually
5440 ** requested from the underlying operating system, a number which
5441 ** might be greater than or equal to the argument, but not less
5442 ** than the argument.
5443 */
5444 static int unixSleep(sqlite3_vfs *NotUsed, int microseconds){
5445 #if OS_VXWORKS
5446   struct timespec sp;
5447
5448   sp.tv_sec = microseconds / 1000000;
5449   sp.tv_nsec = (microseconds % 1000000) * 1000;
5450   nanosleep(&sp, NULL);
5451   UNUSED_PARAMETER(NotUsed);
5452   return microseconds;
5453 #elif defined(HAVE_USLEEP) && HAVE_USLEEP
5454   usleep(microseconds);
5455   UNUSED_PARAMETER(NotUsed);
5456   return microseconds;
5457 #else
5458   int seconds = (microseconds+999999)/1000000;
5459   sleep(seconds);
5460   UNUSED_PARAMETER(NotUsed);
5461   return seconds*1000000;
5462 #endif
5463 }
5464
5465 /*
5466 ** The following variable, if set to a non-zero value, is interpreted as
5467 ** the number of seconds since 1970 and is used to set the result of
5468 ** sqlite3OsCurrentTime() during testing.
5469 */
5470 #ifdef SQLITE_TEST
5471 int sqlite3_current_time = 0;  /* Fake system time in seconds since 1970. */
5472 #endif
5473
5474 /*
5475 ** Find the current time (in Universal Coordinated Time).  Write into *piNow
5476 ** the current time and date as a Julian Day number times 86_400_000.  In
5477 ** other words, write into *piNow the number of milliseconds since the Julian
5478 ** epoch of noon in Greenwich on November 24, 4714 B.C according to the
5479 ** proleptic Gregorian calendar.
5480 **
5481 ** On success, return SQLITE_OK.  Return SQLITE_ERROR if the time and date
5482 ** cannot be found.
5483 */
5484 static int unixCurrentTimeInt64(sqlite3_vfs *NotUsed, sqlite3_int64 *piNow){
5485   static const sqlite3_int64 unixEpoch = 24405875*(sqlite3_int64)8640000;
5486   int rc = SQLITE_OK;
5487 #if defined(NO_GETTOD)
5488   time_t t;
5489   time(&t);
5490   *piNow = ((sqlite3_int64)t)*1000 + unixEpoch;
5491 #elif OS_VXWORKS
5492   struct timespec sNow;
5493   clock_gettime(CLOCK_REALTIME, &sNow);
5494   *piNow = unixEpoch + 1000*(sqlite3_int64)sNow.tv_sec + sNow.tv_nsec/1000000;
5495 #else
5496   struct timeval sNow;
5497   if( gettimeofday(&sNow, 0)==0 ){
5498     *piNow = unixEpoch + 1000*(sqlite3_int64)sNow.tv_sec + sNow.tv_usec/1000;
5499   }else{
5500     rc = SQLITE_ERROR;
5501   }
5502 #endif
5503
5504 #ifdef SQLITE_TEST
5505   if( sqlite3_current_time ){
5506     *piNow = 1000*(sqlite3_int64)sqlite3_current_time + unixEpoch;
5507   }
5508 #endif
5509   UNUSED_PARAMETER(NotUsed);
5510   return rc;
5511 }
5512
5513 /*
5514 ** Find the current time (in Universal Coordinated Time).  Write the
5515 ** current time and date as a Julian Day number into *prNow and
5516 ** return 0.  Return 1 if the time and date cannot be found.
5517 */
5518 static int unixCurrentTime(sqlite3_vfs *NotUsed, double *prNow){
5519   sqlite3_int64 i = 0;
5520   int rc;
5521   UNUSED_PARAMETER(NotUsed);
5522   rc = unixCurrentTimeInt64(0, &i);
5523   *prNow = i/86400000.0;
5524   return rc;
5525 }
5526
5527 /*
5528 ** We added the xGetLastError() method with the intention of providing
5529 ** better low-level error messages when operating-system problems come up
5530 ** during SQLite operation.  But so far, none of that has been implemented
5531 ** in the core.  So this routine is never called.  For now, it is merely
5532 ** a place-holder.
5533 */
5534 static int unixGetLastError(sqlite3_vfs *NotUsed, int NotUsed2, char *NotUsed3){
5535   UNUSED_PARAMETER(NotUsed);
5536   UNUSED_PARAMETER(NotUsed2);
5537   UNUSED_PARAMETER(NotUsed3);
5538   return 0;
5539 }
5540
5541
5542 /*
5543 ************************ End of sqlite3_vfs methods ***************************
5544 ******************************************************************************/
5545
5546 /******************************************************************************
5547 ************************** Begin Proxy Locking ********************************
5548 **
5549 ** Proxy locking is a "uber-locking-method" in this sense:  It uses the
5550 ** other locking methods on secondary lock files.  Proxy locking is a
5551 ** meta-layer over top of the primitive locking implemented above.  For
** this reason, the division that implements proxy locking is deferred
5553 ** until late in the file (here) after all of the other I/O methods have
5554 ** been defined - so that the primitive locking methods are available
5555 ** as services to help with the implementation of proxy locking.
5556 **
5557 ****
5558 **
5559 ** The default locking schemes in SQLite use byte-range locks on the
5560 ** database file to coordinate safe, concurrent access by multiple readers
5561 ** and writers [http://sqlite.org/lockingv3.html].  The five file locking
5562 ** states (UNLOCKED, PENDING, SHARED, RESERVED, EXCLUSIVE) are implemented
5563 ** as POSIX read & write locks over fixed set of locations (via fsctl),
5564 ** on AFP and SMB only exclusive byte-range locks are available via fsctl
5565 ** with _IOWR('z', 23, struct ByteRangeLockPB2) to track the same 5 states.
5566 ** To simulate a F_RDLCK on the shared range, on AFP a randomly selected
5567 ** address in the shared range is taken for a SHARED lock, the entire
5568 ** shared range is taken for an EXCLUSIVE lock):
5569 **
5570 **      PENDING_BYTE        0x40000000
5571 **      RESERVED_BYTE       0x40000001
5572 **      SHARED_RANGE        0x40000002 -> 0x40000200
5573 **
5574 ** This works well on the local file system, but shows a nearly 100x
5575 ** slowdown in read performance on AFP because the AFP client disables
5576 ** the read cache when byte-range locks are present.  Enabling the read
5577 ** cache exposes a cache coherency problem that is present on all OS X
5578 ** supported network file systems.  NFS and AFP both observe the
5579 ** close-to-open semantics for ensuring cache coherency
5580 ** [http://nfs.sourceforge.net/#faq_a8], which does not effectively
5581 ** address the requirements for concurrent database access by multiple
5582 ** readers and writers
5583 ** [http://www.nabble.com/SQLite-on-NFS-cache-coherency-td15655701.html].
5584 **
5585 ** To address the performance and cache coherency issues, proxy file locking
5586 ** changes the way database access is controlled by limiting access to a
5587 ** single host at a time and moving file locks off of the database file
5588 ** and onto a proxy file on the local file system.
5589 **
5590 **
5591 ** Using proxy locks
5592 ** -----------------
5593 **
5594 ** C APIs
5595 **
5596 **  sqlite3_file_control(db, dbname, SQLITE_SET_LOCKPROXYFILE,
5597 **                       <proxy_path> | ":auto:");
5598 **  sqlite3_file_control(db, dbname, SQLITE_GET_LOCKPROXYFILE, &<proxy_path>);
5599 **
5600 **
5601 ** SQL pragmas
5602 **
5603 **  PRAGMA [database.]lock_proxy_file=<proxy_path> | :auto:
5604 **  PRAGMA [database.]lock_proxy_file
5605 **
5606 ** Specifying ":auto:" means that if there is a conch file with a matching
5607 ** host ID in it, the proxy path in the conch file will be used, otherwise
5608 ** a proxy path based on the user's temp dir
5609 ** (via confstr(_CS_DARWIN_USER_TEMP_DIR,...)) will be used and the
5610 ** actual proxy file name is generated from the name and path of the
5611 ** database file.  For example:
5612 **
5613 **       For database path "/Users/me/foo.db"
**       The lock path will be "<tmpdir>/sqliteplocks/_Users_me_foo.db:auto:"
5615 **
5616 ** Once a lock proxy is configured for a database connection, it can not
5617 ** be removed, however it may be switched to a different proxy path via
5618 ** the above APIs (assuming the conch file is not being held by another
5619 ** connection or process).
5620 **
5621 **
5622 ** How proxy locking works
5623 ** -----------------------
5624 **
5625 ** Proxy file locking relies primarily on two new supporting files:
5626 **
5627 **   *  conch file to limit access to the database file to a single host
5628 **      at a time
5629 **
5630 **   *  proxy file to act as a proxy for the advisory locks normally
5631 **      taken on the database
5632 **
5633 ** The conch file - to use a proxy file, sqlite must first "hold the conch"
5634 ** by taking an sqlite-style shared lock on the conch file, reading the
5635 ** contents and comparing the host's unique host ID (see below) and lock
5636 ** proxy path against the values stored in the conch.  The conch file is
5637 ** stored in the same directory as the database file and the file name
5638 ** is patterned after the database file name as ".<databasename>-conch".
** If the conch file does not exist, or its contents do not match the
** host ID and/or proxy path, then the lock is escalated to an exclusive
** lock, the conch file contents are updated with the host ID and proxy
** path, and the lock is downgraded to a shared lock again.  If the conch
5643 ** is held by another process (with a shared lock), the exclusive lock
5644 ** will fail and SQLITE_BUSY is returned.
5645 **
5646 ** The proxy file - a single-byte file used for all advisory file locks
5647 ** normally taken on the database file.   This allows for safe sharing
5648 ** of the database file for multiple readers and writers on the same
5649 ** host (the conch ensures that they all use the same local lock file).
5650 **
5651 ** Requesting the lock proxy does not immediately take the conch, it is
5652 ** only taken when the first request to lock database file is made.
5653 ** This matches the semantics of the traditional locking behavior, where
5654 ** opening a connection to a database file does not take a lock on it.
5655 ** The shared lock and an open file descriptor are maintained until
5656 ** the connection to the database is closed.
5657 **
5658 ** The proxy file and the lock file are never deleted so they only need
5659 ** to be created the first time they are used.
5660 **
5661 ** Configuration options
5662 ** ---------------------
5663 **
5664 **  SQLITE_PREFER_PROXY_LOCKING
5665 **
5666 **       Database files accessed on non-local file systems are
5667 **       automatically configured for proxy locking, lock files are
5668 **       named automatically using the same logic as
5669 **       PRAGMA lock_proxy_file=":auto:"
5670 **
5671 **  SQLITE_PROXY_DEBUG
5672 **
5673 **       Enables the logging of error messages during host id file
5674 **       retrieval and creation
5675 **
5676 **  LOCKPROXYDIR
5677 **
5678 **       Overrides the default directory used for lock proxy files that
5679 **       are named automatically via the ":auto:" setting
5680 **
5681 **  SQLITE_DEFAULT_PROXYDIR_PERMISSIONS
5682 **
5683 **       Permissions to use when creating a directory for storing the
5684 **       lock proxy files, only used when LOCKPROXYDIR is not set.
5685 **
5686 **
5687 ** As mentioned above, when compiled with SQLITE_PREFER_PROXY_LOCKING,
5688 ** setting the environment variable SQLITE_FORCE_PROXY_LOCKING to 1 will
5689 ** force proxy locking to be used for every database file opened, and 0
5690 ** will force automatic proxy locking to be disabled for all database
** files (explicitly calling the SQLITE_SET_LOCKPROXYFILE pragma or
** sqlite3_file_control API is not affected by SQLITE_FORCE_PROXY_LOCKING).
5693 */
5694
5695 /*
5696 ** Proxy locking is only available on MacOSX
5697 */
5698 #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
5699
5700 /*
5701 ** The proxyLockingContext has the path and file structures for the remote
5702 ** and local proxy files in it
5703 */
5704 typedef struct proxyLockingContext proxyLockingContext;
5705 struct proxyLockingContext {
5706   unixFile *conchFile;         /* Open conch file */
5707   char *conchFilePath;         /* Name of the conch file */
5708   unixFile *lockProxy;         /* Open proxy lock file */
5709   char *lockProxyPath;         /* Name of the proxy lock file */
5710   char *dbPath;                /* Name of the open file */
5711   int conchHeld;               /* 1 if the conch is held, -1 if lockless */
5712   void *oldLockingContext;     /* Original lockingcontext to restore on close */
5713   sqlite3_io_methods const *pOldMethod;     /* Original I/O methods for close */
5714 };
5715
5716 /*
5717 ** The proxy lock file path for the database at dbPath is written into lPath,
5718 ** which must point to valid, writable memory large enough for a maxLen length
5719 ** file path.
5720 */
5721 static int proxyGetLockPath(const char *dbPath, char *lPath, size_t maxLen){
5722   int len;
5723   int dbLen;
5724   int i;
5725
5726 #ifdef LOCKPROXYDIR
5727   len = strlcpy(lPath, LOCKPROXYDIR, maxLen);
5728 #else
5729 # ifdef _CS_DARWIN_USER_TEMP_DIR
5730   {
5731     if( !confstr(_CS_DARWIN_USER_TEMP_DIR, lPath, maxLen) ){
5732       OSTRACE(("GETLOCKPATH  failed %s errno=%d pid=%d\n",
5733                lPath, errno, getpid()));
5734       return SQLITE_IOERR_LOCK;
5735     }
5736     len = strlcat(lPath, "sqliteplocks", maxLen);
5737   }
5738 # else
5739   len = strlcpy(lPath, "/tmp/", maxLen);
5740 # endif
5741 #endif
5742
5743   if( lPath[len-1]!='/' ){
5744     len = strlcat(lPath, "/", maxLen);
5745   }
5746
5747   /* transform the db path to a unique cache name */
5748   dbLen = (int)strlen(dbPath);
5749   for( i=0; i<dbLen && (i+len+7)<(int)maxLen; i++){
5750     char c = dbPath[i];
5751     lPath[i+len] = (c=='/')?'_':c;
5752   }
5753   lPath[i+len]='\0';
5754   strlcat(lPath, ":auto:", maxLen);
5755   OSTRACE(("GETLOCKPATH  proxy lock path=%s pid=%d\n", lPath, getpid()));
5756   return SQLITE_OK;
5757 }
5758
5759 /*
5760  ** Creates the lock file and any missing directories in lockPath
5761  */
5762 static int proxyCreateLockPath(const char *lockPath){
5763   int i, len;
5764   char buf[MAXPATHLEN];
5765   int start = 0;
5766
5767   assert(lockPath!=NULL);
5768   /* try to create all the intermediate directories */
5769   len = (int)strlen(lockPath);
5770   buf[0] = lockPath[0];
5771   for( i=1; i<len; i++ ){
5772     if( lockPath[i] == '/' && (i - start > 0) ){
5773       /* only mkdir if leaf dir != "." or "/" or ".." */
5774       if( i-start>2 || (i-start==1 && buf[start] != '.' && buf[start] != '/')
5775          || (i-start==2 && buf[start] != '.' && buf[start+1] != '.') ){
5776         buf[i]='\0';
5777         if( osMkdir(buf, SQLITE_DEFAULT_PROXYDIR_PERMISSIONS) ){
5778           int err=errno;
5779           if( err!=EEXIST ) {
5780             OSTRACE(("CREATELOCKPATH  FAILED creating %s, "
5781                      "'%s' proxy lock path=%s pid=%d\n",
5782                      buf, strerror(err), lockPath, getpid()));
5783             return err;
5784           }
5785         }
5786       }
5787       start=i+1;
5788     }
5789     buf[i] = lockPath[i];
5790   }
5791   OSTRACE(("CREATELOCKPATH  proxy lock path=%s pid=%d\n", lockPath, getpid()));
5792   return 0;
5793 }
5794
5795 /*
5796 ** Create a new VFS file descriptor (stored in memory obtained from
5797 ** sqlite3_malloc) and open the file named "path" in the file descriptor.
5798 **
5799 ** The caller is responsible not only for closing the file descriptor
5800 ** but also for freeing the memory associated with the file descriptor.
5801 */
5802 static int proxyCreateUnixFile(
5803     const char *path,        /* path for the new unixFile */
5804     unixFile **ppFile,       /* unixFile created and returned by ref */
5805     int islockfile           /* if non zero missing dirs will be created */
5806 ) {
5807   int fd = -1;
5808   unixFile *pNew;
5809   int rc = SQLITE_OK;
5810   int openFlags = O_RDWR | O_CREAT;
5811   sqlite3_vfs dummyVfs;
5812   int terrno = 0;
5813   UnixUnusedFd *pUnused = NULL;
5814
5815   /* 1. first try to open/create the file
5816   ** 2. if that fails, and this is a lock file (not-conch), try creating
5817   ** the parent directories and then try again.
5818   ** 3. if that fails, try to open the file read-only
5819   ** otherwise return BUSY (if lock file) or CANTOPEN for the conch file
5820   */
5821   pUnused = findReusableFd(path, openFlags);
5822   if( pUnused ){
5823     fd = pUnused->fd;
5824   }else{
5825     pUnused = sqlite3_malloc(sizeof(*pUnused));
5826     if( !pUnused ){
5827       return SQLITE_NOMEM;
5828     }
5829   }
5830   if( fd<0 ){
5831     fd = robust_open(path, openFlags, SQLITE_DEFAULT_FILE_PERMISSIONS);
5832     terrno = errno;
5833     if( fd<0 && errno==ENOENT && islockfile ){
5834       if( proxyCreateLockPath(path) == SQLITE_OK ){
5835         fd = robust_open(path, openFlags, SQLITE_DEFAULT_FILE_PERMISSIONS);
5836       }
5837     }
5838   }
5839   if( fd<0 ){
5840     openFlags = O_RDONLY;
5841     fd = robust_open(path, openFlags, SQLITE_DEFAULT_FILE_PERMISSIONS);
5842     terrno = errno;
5843   }
5844   if( fd<0 ){
5845     if( islockfile ){
5846       return SQLITE_BUSY;
5847     }
5848     switch (terrno) {
5849       case EACCES:
5850         return SQLITE_PERM;
5851       case EIO:
5852         return SQLITE_IOERR_LOCK; /* even though it is the conch */
5853       default:
5854         return SQLITE_CANTOPEN_BKPT;
5855     }
5856   }
5857
5858   pNew = (unixFile *)sqlite3_malloc(sizeof(*pNew));
5859   if( pNew==NULL ){
5860     rc = SQLITE_NOMEM;
5861     goto end_create_proxy;
5862   }
5863   memset(pNew, 0, sizeof(unixFile));
5864   pNew->openFlags = openFlags;
5865   memset(&dummyVfs, 0, sizeof(dummyVfs));
5866   dummyVfs.pAppData = (void*)&autolockIoFinder;
5867   dummyVfs.zName = "dummy";
5868   pUnused->fd = fd;
5869   pUnused->flags = openFlags;
5870   pNew->pUnused = pUnused;
5871
5872   rc = fillInUnixFile(&dummyVfs, fd, (sqlite3_file*)pNew, path, 0);
5873   if( rc==SQLITE_OK ){
5874     *ppFile = pNew;
5875     return SQLITE_OK;
5876   }
5877 end_create_proxy:
5878   robust_close(pNew, fd, __LINE__);
5879   sqlite3_free(pNew);
5880   sqlite3_free(pUnused);
5881   return rc;
5882 }
5883
#ifdef SQLITE_TEST
/* Test-only knob used to simulate multiple hosts: when non-zero,
** proxyGetHostID() adds its low 8 bits to the first byte of the host ID,
** so that different values yield different host IDs. */
int sqlite3_hostid_num = 0;
#endif

#define PROXY_HOSTIDLEN    16  /* conch file host id length, in bytes */

/* Not always defined in the headers as it ought to be */
extern int gethostuuid(uuid_t id, const struct timespec *wait);
5893
5894 /* get the host ID via gethostuuid(), pHostID must point to PROXY_HOSTIDLEN
5895 ** bytes of writable memory.
5896 */
5897 static int proxyGetHostID(unsigned char *pHostID, int *pError){
5898   assert(PROXY_HOSTIDLEN == sizeof(uuid_t));
5899   memset(pHostID, 0, PROXY_HOSTIDLEN);
5900 #if defined(__MAX_OS_X_VERSION_MIN_REQUIRED)\
5901                && __MAC_OS_X_VERSION_MIN_REQUIRED<1050
5902   {
5903     static const struct timespec timeout = {1, 0}; /* 1 sec timeout */
5904     if( gethostuuid(pHostID, &timeout) ){
5905       int err = errno;
5906       if( pError ){
5907         *pError = err;
5908       }
5909       return SQLITE_IOERR;
5910     }
5911   }
5912 #else
5913   UNUSED_PARAMETER(pError);
5914 #endif
5915 #ifdef SQLITE_TEST
5916   /* simulate multiple hosts by creating unique hostid file paths */
5917   if( sqlite3_hostid_num != 0){
5918     pHostID[0] = (char)(pHostID[0] + (char)(sqlite3_hostid_num & 0xFF));
5919   }
5920 #endif
5921
5922   return SQLITE_OK;
5923 }
5924
/* The conch file contains, in order: a PROXY_HEADERLEN-byte header holding
** the format version, a PROXY_HOSTIDLEN-byte host id, and the lock file
** path.  The path is written without a terminating NUL; its length is
** whatever remains of the file after PROXY_PATHINDEX.
 */
#define PROXY_CONCHVERSION 2   /* 1-byte header, 16-byte host id, path */
#define PROXY_HEADERLEN    1   /* conch file header length */
#define PROXY_PATHINDEX    (PROXY_HEADERLEN+PROXY_HOSTIDLEN) /* offset of path */
#define PROXY_MAXCONCHLEN  (PROXY_HEADERLEN+PROXY_HOSTIDLEN+MAXPATHLEN) /* buffer size used for conch reads/writes */
5931
5932 /*
5933 ** Takes an open conch file, copies the contents to a new path and then moves
5934 ** it back.  The newly created file's file descriptor is assigned to the
5935 ** conch file structure and finally the original conch file descriptor is
5936 ** closed.  Returns zero if successful.
5937 */
5938 static int proxyBreakConchLock(unixFile *pFile, uuid_t myHostID){
5939   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
5940   unixFile *conchFile = pCtx->conchFile;
5941   char tPath[MAXPATHLEN];
5942   char buf[PROXY_MAXCONCHLEN];
5943   char *cPath = pCtx->conchFilePath;
5944   size_t readLen = 0;
5945   size_t pathLen = 0;
5946   char errmsg[64] = "";
5947   int fd = -1;
5948   int rc = -1;
5949   UNUSED_PARAMETER(myHostID);
5950
5951   /* create a new path by replace the trailing '-conch' with '-break' */
5952   pathLen = strlcpy(tPath, cPath, MAXPATHLEN);
5953   if( pathLen>MAXPATHLEN || pathLen<6 ||
5954      (strlcpy(&tPath[pathLen-5], "break", 6) != 5) ){
5955     sqlite3_snprintf(sizeof(errmsg),errmsg,"path error (len %d)",(int)pathLen);
5956     goto end_breaklock;
5957   }
5958   /* read the conch content */
5959   readLen = osPread(conchFile->h, buf, PROXY_MAXCONCHLEN, 0);
5960   if( readLen<PROXY_PATHINDEX ){
5961     sqlite3_snprintf(sizeof(errmsg),errmsg,"read error (len %d)",(int)readLen);
5962     goto end_breaklock;
5963   }
5964   /* write it out to the temporary break file */
5965   fd = robust_open(tPath, (O_RDWR|O_CREAT|O_EXCL),
5966                    SQLITE_DEFAULT_FILE_PERMISSIONS);
5967   if( fd<0 ){
5968     sqlite3_snprintf(sizeof(errmsg), errmsg, "create failed (%d)", errno);
5969     goto end_breaklock;
5970   }
5971   if( osPwrite(fd, buf, readLen, 0) != (ssize_t)readLen ){
5972     sqlite3_snprintf(sizeof(errmsg), errmsg, "write failed (%d)", errno);
5973     goto end_breaklock;
5974   }
5975   if( rename(tPath, cPath) ){
5976     sqlite3_snprintf(sizeof(errmsg), errmsg, "rename failed (%d)", errno);
5977     goto end_breaklock;
5978   }
5979   rc = 0;
5980   fprintf(stderr, "broke stale lock on %s\n", cPath);
5981   robust_close(pFile, conchFile->h, __LINE__);
5982   conchFile->h = fd;
5983   conchFile->openFlags = O_RDWR | O_CREAT;
5984
5985 end_breaklock:
5986   if( rc ){
5987     if( fd>=0 ){
5988       osUnlink(tPath);
5989       robust_close(pFile, fd, __LINE__);
5990     }
5991     fprintf(stderr, "failed to break stale lock on %s, %s\n", cPath, errmsg);
5992   }
5993   return rc;
5994 }
5995
5996 /* Take the requested lock on the conch file and break a stale lock if the
5997 ** host id matches.
5998 */
5999 static int proxyConchLock(unixFile *pFile, uuid_t myHostID, int lockType){
6000   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6001   unixFile *conchFile = pCtx->conchFile;
6002   int rc = SQLITE_OK;
6003   int nTries = 0;
6004   struct timespec conchModTime;
6005
6006   memset(&conchModTime, 0, sizeof(conchModTime));
6007   do {
6008     rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, lockType);
6009     nTries ++;
6010     if( rc==SQLITE_BUSY ){
6011       /* If the lock failed (busy):
6012        * 1st try: get the mod time of the conch, wait 0.5s and try again.
6013        * 2nd try: fail if the mod time changed or host id is different, wait
6014        *           10 sec and try again
6015        * 3rd try: break the lock unless the mod time has changed.
6016        */
6017       struct stat buf;
6018       if( osFstat(conchFile->h, &buf) ){
6019         pFile->lastErrno = errno;
6020         return SQLITE_IOERR_LOCK;
6021       }
6022
6023       if( nTries==1 ){
6024         conchModTime = buf.st_mtimespec;
6025         usleep(500000); /* wait 0.5 sec and try the lock again*/
6026         continue;
6027       }
6028
6029       assert( nTries>1 );
6030       if( conchModTime.tv_sec != buf.st_mtimespec.tv_sec ||
6031          conchModTime.tv_nsec != buf.st_mtimespec.tv_nsec ){
6032         return SQLITE_BUSY;
6033       }
6034
6035       if( nTries==2 ){
6036         char tBuf[PROXY_MAXCONCHLEN];
6037         int len = osPread(conchFile->h, tBuf, PROXY_MAXCONCHLEN, 0);
6038         if( len<0 ){
6039           pFile->lastErrno = errno;
6040           return SQLITE_IOERR_LOCK;
6041         }
6042         if( len>PROXY_PATHINDEX && tBuf[0]==(char)PROXY_CONCHVERSION){
6043           /* don't break the lock if the host id doesn't match */
6044           if( 0!=memcmp(&tBuf[PROXY_HEADERLEN], myHostID, PROXY_HOSTIDLEN) ){
6045             return SQLITE_BUSY;
6046           }
6047         }else{
6048           /* don't break the lock on short read or a version mismatch */
6049           return SQLITE_BUSY;
6050         }
6051         usleep(10000000); /* wait 10 sec and try the lock again */
6052         continue;
6053       }
6054
6055       assert( nTries==3 );
6056       if( 0==proxyBreakConchLock(pFile, myHostID) ){
6057         rc = SQLITE_OK;
6058         if( lockType==EXCLUSIVE_LOCK ){
6059           rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, SHARED_LOCK);
6060         }
6061         if( !rc ){
6062           rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, lockType);
6063         }
6064       }
6065     }
6066   } while( rc==SQLITE_BUSY && nTries<3 );
6067
6068   return rc;
6069 }
6070
6071 /* Takes the conch by taking a shared lock and read the contents conch, if
6072 ** lockPath is non-NULL, the host ID and lock file path must match.  A NULL
6073 ** lockPath means that the lockPath in the conch file will be used if the
6074 ** host IDs match, or a new lock path will be generated automatically
6075 ** and written to the conch file.
6076 */
6077 static int proxyTakeConch(unixFile *pFile){
6078   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6079
6080   if( pCtx->conchHeld!=0 ){
6081     return SQLITE_OK;
6082   }else{
6083     unixFile *conchFile = pCtx->conchFile;
6084     uuid_t myHostID;
6085     int pError = 0;
6086     char readBuf[PROXY_MAXCONCHLEN];
6087     char lockPath[MAXPATHLEN];
6088     char *tempLockPath = NULL;
6089     int rc = SQLITE_OK;
6090     int createConch = 0;
6091     int hostIdMatch = 0;
6092     int readLen = 0;
6093     int tryOldLockPath = 0;
6094     int forceNewLockPath = 0;
6095
6096     OSTRACE(("TAKECONCH  %d for %s pid=%d\n", conchFile->h,
6097              (pCtx->lockProxyPath ? pCtx->lockProxyPath : ":auto:"), getpid()));
6098
6099     rc = proxyGetHostID(myHostID, &pError);
6100     if( (rc&0xff)==SQLITE_IOERR ){
6101       pFile->lastErrno = pError;
6102       goto end_takeconch;
6103     }
6104     rc = proxyConchLock(pFile, myHostID, SHARED_LOCK);
6105     if( rc!=SQLITE_OK ){
6106       goto end_takeconch;
6107     }
6108     /* read the existing conch file */
6109     readLen = seekAndRead((unixFile*)conchFile, 0, readBuf, PROXY_MAXCONCHLEN);
6110     if( readLen<0 ){
6111       /* I/O error: lastErrno set by seekAndRead */
6112       pFile->lastErrno = conchFile->lastErrno;
6113       rc = SQLITE_IOERR_READ;
6114       goto end_takeconch;
6115     }else if( readLen<=(PROXY_HEADERLEN+PROXY_HOSTIDLEN) ||
6116              readBuf[0]!=(char)PROXY_CONCHVERSION ){
6117       /* a short read or version format mismatch means we need to create a new
6118       ** conch file.
6119       */
6120       createConch = 1;
6121     }
6122     /* if the host id matches and the lock path already exists in the conch
6123     ** we'll try to use the path there, if we can't open that path, we'll
6124     ** retry with a new auto-generated path
6125     */
6126     do { /* in case we need to try again for an :auto: named lock file */
6127
6128       if( !createConch && !forceNewLockPath ){
6129         hostIdMatch = !memcmp(&readBuf[PROXY_HEADERLEN], myHostID,
6130                                   PROXY_HOSTIDLEN);
6131         /* if the conch has data compare the contents */
6132         if( !pCtx->lockProxyPath ){
6133           /* for auto-named local lock file, just check the host ID and we'll
6134            ** use the local lock file path that's already in there
6135            */
6136           if( hostIdMatch ){
6137             size_t pathLen = (readLen - PROXY_PATHINDEX);
6138
6139             if( pathLen>=MAXPATHLEN ){
6140               pathLen=MAXPATHLEN-1;
6141             }
6142             memcpy(lockPath, &readBuf[PROXY_PATHINDEX], pathLen);
6143             lockPath[pathLen] = 0;
6144             tempLockPath = lockPath;
6145             tryOldLockPath = 1;
6146             /* create a copy of the lock path if the conch is taken */
6147             goto end_takeconch;
6148           }
6149         }else if( hostIdMatch
6150                && !strncmp(pCtx->lockProxyPath, &readBuf[PROXY_PATHINDEX],
6151                            readLen-PROXY_PATHINDEX)
6152         ){
6153           /* conch host and lock path match */
6154           goto end_takeconch;
6155         }
6156       }
6157
6158       /* if the conch isn't writable and doesn't match, we can't take it */
6159       if( (conchFile->openFlags&O_RDWR) == 0 ){
6160         rc = SQLITE_BUSY;
6161         goto end_takeconch;
6162       }
6163
6164       /* either the conch didn't match or we need to create a new one */
6165       if( !pCtx->lockProxyPath ){
6166         proxyGetLockPath(pCtx->dbPath, lockPath, MAXPATHLEN);
6167         tempLockPath = lockPath;
6168         /* create a copy of the lock path _only_ if the conch is taken */
6169       }
6170
6171       /* update conch with host and path (this will fail if other process
6172       ** has a shared lock already), if the host id matches, use the big
6173       ** stick.
6174       */
6175       futimes(conchFile->h, NULL);
6176       if( hostIdMatch && !createConch ){
6177         if( conchFile->pInode && conchFile->pInode->nShared>1 ){
6178           /* We are trying for an exclusive lock but another thread in this
6179            ** same process is still holding a shared lock. */
6180           rc = SQLITE_BUSY;
6181         } else {
6182           rc = proxyConchLock(pFile, myHostID, EXCLUSIVE_LOCK);
6183         }
6184       }else{
6185         rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, EXCLUSIVE_LOCK);
6186       }
6187       if( rc==SQLITE_OK ){
6188         char writeBuffer[PROXY_MAXCONCHLEN];
6189         int writeSize = 0;
6190
6191         writeBuffer[0] = (char)PROXY_CONCHVERSION;
6192         memcpy(&writeBuffer[PROXY_HEADERLEN], myHostID, PROXY_HOSTIDLEN);
6193         if( pCtx->lockProxyPath!=NULL ){
6194           strlcpy(&writeBuffer[PROXY_PATHINDEX], pCtx->lockProxyPath, MAXPATHLEN);
6195         }else{
6196           strlcpy(&writeBuffer[PROXY_PATHINDEX], tempLockPath, MAXPATHLEN);
6197         }
6198         writeSize = PROXY_PATHINDEX + strlen(&writeBuffer[PROXY_PATHINDEX]);
6199         robust_ftruncate(conchFile->h, writeSize);
6200         rc = unixWrite((sqlite3_file *)conchFile, writeBuffer, writeSize, 0);
6201         fsync(conchFile->h);
6202         /* If we created a new conch file (not just updated the contents of a
6203          ** valid conch file), try to match the permissions of the database
6204          */
6205         if( rc==SQLITE_OK && createConch ){
6206           struct stat buf;
6207           int err = osFstat(pFile->h, &buf);
6208           if( err==0 ){
6209             mode_t cmode = buf.st_mode&(S_IRUSR|S_IWUSR | S_IRGRP|S_IWGRP |
6210                                         S_IROTH|S_IWOTH);
6211             /* try to match the database file R/W permissions, ignore failure */
6212 #ifndef SQLITE_PROXY_DEBUG
6213             osFchmod(conchFile->h, cmode);
6214 #else
6215             do{
6216               rc = osFchmod(conchFile->h, cmode);
6217             }while( rc==(-1) && errno==EINTR );
6218             if( rc!=0 ){
6219               int code = errno;
6220               fprintf(stderr, "fchmod %o FAILED with %d %s\n",
6221                       cmode, code, strerror(code));
6222             } else {
6223               fprintf(stderr, "fchmod %o SUCCEDED\n",cmode);
6224             }
6225           }else{
6226             int code = errno;
6227             fprintf(stderr, "STAT FAILED[%d] with %d %s\n",
6228                     err, code, strerror(code));
6229 #endif
6230           }
6231         }
6232       }
6233       conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, SHARED_LOCK);
6234
6235     end_takeconch:
6236       OSTRACE(("TRANSPROXY: CLOSE  %d\n", pFile->h));
6237       if( rc==SQLITE_OK && pFile->openFlags ){
6238         int fd;
6239         if( pFile->h>=0 ){
6240           robust_close(pFile, pFile->h, __LINE__);
6241         }
6242         pFile->h = -1;
6243         fd = robust_open(pCtx->dbPath, pFile->openFlags,
6244                       SQLITE_DEFAULT_FILE_PERMISSIONS);
6245         OSTRACE(("TRANSPROXY: OPEN  %d\n", fd));
6246         if( fd>=0 ){
6247           pFile->h = fd;
6248         }else{
6249           rc=SQLITE_CANTOPEN_BKPT; /* SQLITE_BUSY? proxyTakeConch called
6250            during locking */
6251         }
6252       }
6253       if( rc==SQLITE_OK && !pCtx->lockProxy ){
6254         char *path = tempLockPath ? tempLockPath : pCtx->lockProxyPath;
6255         rc = proxyCreateUnixFile(path, &pCtx->lockProxy, 1);
6256         if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && tryOldLockPath ){
6257           /* we couldn't create the proxy lock file with the old lock file path
6258            ** so try again via auto-naming
6259            */
6260           forceNewLockPath = 1;
6261           tryOldLockPath = 0;
6262           continue; /* go back to the do {} while start point, try again */
6263         }
6264       }
6265       if( rc==SQLITE_OK ){
6266         /* Need to make a copy of path if we extracted the value
6267          ** from the conch file or the path was allocated on the stack
6268          */
6269         if( tempLockPath ){
6270           pCtx->lockProxyPath = sqlite3DbStrDup(0, tempLockPath);
6271           if( !pCtx->lockProxyPath ){
6272             rc = SQLITE_NOMEM;
6273           }
6274         }
6275       }
6276       if( rc==SQLITE_OK ){
6277         pCtx->conchHeld = 1;
6278
6279         if( pCtx->lockProxy->pMethod == &afpIoMethods ){
6280           afpLockingContext *afpCtx;
6281           afpCtx = (afpLockingContext *)pCtx->lockProxy->lockingContext;
6282           afpCtx->dbPath = pCtx->lockProxyPath;
6283         }
6284       } else {
6285         conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, NO_LOCK);
6286       }
6287       OSTRACE(("TAKECONCH  %d %s\n", conchFile->h,
6288                rc==SQLITE_OK?"ok":"failed"));
6289       return rc;
6290     } while (1); /* in case we need to retry the :auto: lock file -
6291                  ** we should never get here except via the 'continue' call. */
6292   }
6293 }
6294
6295 /*
6296 ** If pFile holds a lock on a conch file, then release that lock.
6297 */
6298 static int proxyReleaseConch(unixFile *pFile){
6299   int rc = SQLITE_OK;         /* Subroutine return code */
6300   proxyLockingContext *pCtx;  /* The locking context for the proxy lock */
6301   unixFile *conchFile;        /* Name of the conch file */
6302
6303   pCtx = (proxyLockingContext *)pFile->lockingContext;
6304   conchFile = pCtx->conchFile;
6305   OSTRACE(("RELEASECONCH  %d for %s pid=%d\n", conchFile->h,
6306            (pCtx->lockProxyPath ? pCtx->lockProxyPath : ":auto:"),
6307            getpid()));
6308   if( pCtx->conchHeld>0 ){
6309     rc = conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, NO_LOCK);
6310   }
6311   pCtx->conchHeld = 0;
6312   OSTRACE(("RELEASECONCH  %d %s\n", conchFile->h,
6313            (rc==SQLITE_OK ? "ok" : "failed")));
6314   return rc;
6315 }
6316
6317 /*
6318 ** Given the name of a database file, compute the name of its conch file.
6319 ** Store the conch filename in memory obtained from sqlite3_malloc().
6320 ** Make *pConchPath point to the new name.  Return SQLITE_OK on success
6321 ** or SQLITE_NOMEM if unable to obtain memory.
6322 **
6323 ** The caller is responsible for ensuring that the allocated memory
6324 ** space is eventually freed.
6325 **
6326 ** *pConchPath is set to NULL if a memory allocation error occurs.
6327 */
6328 static int proxyCreateConchPathname(char *dbPath, char **pConchPath){
6329   int i;                        /* Loop counter */
6330   int len = (int)strlen(dbPath); /* Length of database filename - dbPath */
6331   char *conchPath;              /* buffer in which to construct conch name */
6332
6333   /* Allocate space for the conch filename and initialize the name to
6334   ** the name of the original database file. */
6335   *pConchPath = conchPath = (char *)sqlite3_malloc(len + 8);
6336   if( conchPath==0 ){
6337     return SQLITE_NOMEM;
6338   }
6339   memcpy(conchPath, dbPath, len+1);
6340
6341   /* now insert a "." before the last / character */
6342   for( i=(len-1); i>=0; i-- ){
6343     if( conchPath[i]=='/' ){
6344       i++;
6345       break;
6346     }
6347   }
6348   conchPath[i]='.';
6349   while ( i<len ){
6350     conchPath[i+1]=dbPath[i];
6351     i++;
6352   }
6353
6354   /* append the "-conch" suffix to the file */
6355   memcpy(&conchPath[i+1], "-conch", 7);
6356   assert( (int)strlen(conchPath) == len+7 );
6357
6358   return SQLITE_OK;
6359 }
6360
6361
6362 /* Takes a fully configured proxy locking-style unix file and switches
6363 ** the local lock file path
6364 */
6365 static int switchLockProxyPath(unixFile *pFile, const char *path) {
6366   proxyLockingContext *pCtx = (proxyLockingContext*)pFile->lockingContext;
6367   char *oldPath = pCtx->lockProxyPath;
6368   int rc = SQLITE_OK;
6369
6370   if( pFile->eFileLock!=NO_LOCK ){
6371     return SQLITE_BUSY;
6372   }
6373
6374   /* nothing to do if the path is NULL, :auto: or matches the existing path */
6375   if( !path || path[0]=='\0' || !strcmp(path, ":auto:") ||
6376     (oldPath && !strncmp(oldPath, path, MAXPATHLEN)) ){
6377     return SQLITE_OK;
6378   }else{
6379     unixFile *lockProxy = pCtx->lockProxy;
6380     pCtx->lockProxy=NULL;
6381     pCtx->conchHeld = 0;
6382     if( lockProxy!=NULL ){
6383       rc=lockProxy->pMethod->xClose((sqlite3_file *)lockProxy);
6384       if( rc ) return rc;
6385       sqlite3_free(lockProxy);
6386     }
6387     sqlite3_free(oldPath);
6388     pCtx->lockProxyPath = sqlite3DbStrDup(0, path);
6389   }
6390
6391   return rc;
6392 }
6393
6394 /*
6395 ** pFile is a file that has been opened by a prior xOpen call.  dbPath
6396 ** is a string buffer at least MAXPATHLEN+1 characters in size.
6397 **
6398 ** This routine find the filename associated with pFile and writes it
6399 ** int dbPath.
6400 */
6401 static int proxyGetDbPathForUnixFile(unixFile *pFile, char *dbPath){
6402 #if defined(__APPLE__)
6403   if( pFile->pMethod == &afpIoMethods ){
6404     /* afp style keeps a reference to the db path in the filePath field
6405     ** of the struct */
6406     assert( (int)strlen((char*)pFile->lockingContext)<=MAXPATHLEN );
6407     strlcpy(dbPath, ((afpLockingContext *)pFile->lockingContext)->dbPath, MAXPATHLEN);
6408   } else
6409 #endif
6410   if( pFile->pMethod == &dotlockIoMethods ){
6411     /* dot lock style uses the locking context to store the dot lock
6412     ** file path */
6413     int len = strlen((char *)pFile->lockingContext) - strlen(DOTLOCK_SUFFIX);
6414     memcpy(dbPath, (char *)pFile->lockingContext, len + 1);
6415   }else{
6416     /* all other styles use the locking context to store the db file path */
6417     assert( strlen((char*)pFile->lockingContext)<=MAXPATHLEN );
6418     strlcpy(dbPath, (char *)pFile->lockingContext, MAXPATHLEN);
6419   }
6420   return SQLITE_OK;
6421 }
6422
6423 /*
6424 ** Takes an already filled in unix file and alters it so all file locking
6425 ** will be performed on the local proxy lock file.  The following fields
6426 ** are preserved in the locking context so that they can be restored and
6427 ** the unix structure properly cleaned up at close time:
6428 **  ->lockingContext
6429 **  ->pMethod
6430 */
6431 static int proxyTransformUnixFile(unixFile *pFile, const char *path) {
6432   proxyLockingContext *pCtx;
6433   char dbPath[MAXPATHLEN+1];       /* Name of the database file */
6434   char *lockPath=NULL;
6435   int rc = SQLITE_OK;
6436
6437   if( pFile->eFileLock!=NO_LOCK ){
6438     return SQLITE_BUSY;
6439   }
6440   proxyGetDbPathForUnixFile(pFile, dbPath);
6441   if( !path || path[0]=='\0' || !strcmp(path, ":auto:") ){
6442     lockPath=NULL;
6443   }else{
6444     lockPath=(char *)path;
6445   }
6446
6447   OSTRACE(("TRANSPROXY  %d for %s pid=%d\n", pFile->h,
6448            (lockPath ? lockPath : ":auto:"), getpid()));
6449
6450   pCtx = sqlite3_malloc( sizeof(*pCtx) );
6451   if( pCtx==0 ){
6452     return SQLITE_NOMEM;
6453   }
6454   memset(pCtx, 0, sizeof(*pCtx));
6455
6456   rc = proxyCreateConchPathname(dbPath, &pCtx->conchFilePath);
6457   if( rc==SQLITE_OK ){
6458     rc = proxyCreateUnixFile(pCtx->conchFilePath, &pCtx->conchFile, 0);
6459     if( rc==SQLITE_CANTOPEN && ((pFile->openFlags&O_RDWR) == 0) ){
6460       /* if (a) the open flags are not O_RDWR, (b) the conch isn't there, and
6461       ** (c) the file system is read-only, then enable no-locking access.
6462       ** Ugh, since O_RDONLY==0x0000 we test for !O_RDWR since unixOpen asserts
6463       ** that openFlags will have only one of O_RDONLY or O_RDWR.
6464       */
6465       struct statfs fsInfo;
6466       struct stat conchInfo;
6467       int goLockless = 0;
6468
6469       if( osStat(pCtx->conchFilePath, &conchInfo) == -1 ) {
6470         int err = errno;
6471         if( (err==ENOENT) && (statfs(dbPath, &fsInfo) != -1) ){
6472           goLockless = (fsInfo.f_flags&MNT_RDONLY) == MNT_RDONLY;
6473         }
6474       }
6475       if( goLockless ){
6476         pCtx->conchHeld = -1; /* read only FS/ lockless */
6477         rc = SQLITE_OK;
6478       }
6479     }
6480   }
6481   if( rc==SQLITE_OK && lockPath ){
6482     pCtx->lockProxyPath = sqlite3DbStrDup(0, lockPath);
6483   }
6484
6485   if( rc==SQLITE_OK ){
6486     pCtx->dbPath = sqlite3DbStrDup(0, dbPath);
6487     if( pCtx->dbPath==NULL ){
6488       rc = SQLITE_NOMEM;
6489     }
6490   }
6491   if( rc==SQLITE_OK ){
6492     /* all memory is allocated, proxys are created and assigned,
6493     ** switch the locking context and pMethod then return.
6494     */
6495     pCtx->oldLockingContext = pFile->lockingContext;
6496     pFile->lockingContext = pCtx;
6497     pCtx->pOldMethod = pFile->pMethod;
6498     pFile->pMethod = &proxyIoMethods;
6499   }else{
6500     if( pCtx->conchFile ){
6501       pCtx->conchFile->pMethod->xClose((sqlite3_file *)pCtx->conchFile);
6502       sqlite3_free(pCtx->conchFile);
6503     }
6504     sqlite3DbFree(0, pCtx->lockProxyPath);
6505     sqlite3_free(pCtx->conchFilePath);
6506     sqlite3_free(pCtx);
6507   }
6508   OSTRACE(("TRANSPROXY  %d %s\n", pFile->h,
6509            (rc==SQLITE_OK ? "ok" : "failed")));
6510   return rc;
6511 }
6512
6513
6514 /*
6515 ** This routine handles sqlite3_file_control() calls that are specific
6516 ** to proxy locking.
6517 */
6518 static int proxyFileControl(sqlite3_file *id, int op, void *pArg){
6519   switch( op ){
6520     case SQLITE_GET_LOCKPROXYFILE: {
6521       unixFile *pFile = (unixFile*)id;
6522       if( pFile->pMethod == &proxyIoMethods ){
6523         proxyLockingContext *pCtx = (proxyLockingContext*)pFile->lockingContext;
6524         proxyTakeConch(pFile);
6525         if( pCtx->lockProxyPath ){
6526           *(const char **)pArg = pCtx->lockProxyPath;
6527         }else{
6528           *(const char **)pArg = ":auto: (not held)";
6529         }
6530       } else {
6531         *(const char **)pArg = NULL;
6532       }
6533       return SQLITE_OK;
6534     }
6535     case SQLITE_SET_LOCKPROXYFILE: {
6536       unixFile *pFile = (unixFile*)id;
6537       int rc = SQLITE_OK;
6538       int isProxyStyle = (pFile->pMethod == &proxyIoMethods);
6539       if( pArg==NULL || (const char *)pArg==0 ){
6540         if( isProxyStyle ){
6541           /* turn off proxy locking - not supported */
6542           rc = SQLITE_ERROR /*SQLITE_PROTOCOL? SQLITE_MISUSE?*/;
6543         }else{
6544           /* turn off proxy locking - already off - NOOP */
6545           rc = SQLITE_OK;
6546         }
6547       }else{
6548         const char *proxyPath = (const char *)pArg;
6549         if( isProxyStyle ){
6550           proxyLockingContext *pCtx =
6551             (proxyLockingContext*)pFile->lockingContext;
6552           if( !strcmp(pArg, ":auto:")
6553            || (pCtx->lockProxyPath &&
6554                !strncmp(pCtx->lockProxyPath, proxyPath, MAXPATHLEN))
6555           ){
6556             rc = SQLITE_OK;
6557           }else{
6558             rc = switchLockProxyPath(pFile, proxyPath);
6559           }
6560         }else{
6561           /* turn on proxy file locking */
6562           rc = proxyTransformUnixFile(pFile, proxyPath);
6563         }
6564       }
6565       return rc;
6566     }
6567     default: {
6568       assert( 0 );  /* The call assures that only valid opcodes are sent */
6569     }
6570   }
6571   /*NOTREACHED*/
6572   return SQLITE_ERROR;
6573 }
6574
/*
** Within this division (the proxy locking implementation) the procedures
** above this point are all utilities.  The lock-related methods of the
** proxy-locking sqlite3_io_methods object follow.
*/
6580
6581
6582 /*
6583 ** This routine checks if there is a RESERVED lock held on the specified
6584 ** file by this or any other process. If such a lock is held, set *pResOut
6585 ** to a non-zero value otherwise *pResOut is set to zero.  The return value
6586 ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
6587 */
6588 static int proxyCheckReservedLock(sqlite3_file *id, int *pResOut) {
6589   unixFile *pFile = (unixFile*)id;
6590   int rc = proxyTakeConch(pFile);
6591   if( rc==SQLITE_OK ){
6592     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6593     if( pCtx->conchHeld>0 ){
6594       unixFile *proxy = pCtx->lockProxy;
6595       return proxy->pMethod->xCheckReservedLock((sqlite3_file*)proxy, pResOut);
6596     }else{ /* conchHeld < 0 is lockless */
6597       pResOut=0;
6598     }
6599   }
6600   return rc;
6601 }
6602
6603 /*
6604 ** Lock the file with the lock specified by parameter eFileLock - one
6605 ** of the following:
6606 **
6607 **     (1) SHARED_LOCK
6608 **     (2) RESERVED_LOCK
6609 **     (3) PENDING_LOCK
6610 **     (4) EXCLUSIVE_LOCK
6611 **
6612 ** Sometimes when requesting one lock state, additional lock states
6613 ** are inserted in between.  The locking might fail on one of the later
6614 ** transitions leaving the lock state different from what it started but
6615 ** still short of its goal.  The following chart shows the allowed
6616 ** transitions and the inserted intermediate states:
6617 **
6618 **    UNLOCKED -> SHARED
6619 **    SHARED -> RESERVED
6620 **    SHARED -> (PENDING) -> EXCLUSIVE
6621 **    RESERVED -> (PENDING) -> EXCLUSIVE
6622 **    PENDING -> EXCLUSIVE
6623 **
6624 ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
6625 ** routine to lower a locking level.
6626 */
6627 static int proxyLock(sqlite3_file *id, int eFileLock) {
6628   unixFile *pFile = (unixFile*)id;
6629   int rc = proxyTakeConch(pFile);
6630   if( rc==SQLITE_OK ){
6631     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6632     if( pCtx->conchHeld>0 ){
6633       unixFile *proxy = pCtx->lockProxy;
6634       rc = proxy->pMethod->xLock((sqlite3_file*)proxy, eFileLock);
6635       pFile->eFileLock = proxy->eFileLock;
6636     }else{
6637       /* conchHeld < 0 is lockless */
6638     }
6639   }
6640   return rc;
6641 }
6642
6643
6644 /*
6645 ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
6646 ** must be either NO_LOCK or SHARED_LOCK.
6647 **
6648 ** If the locking level of the file descriptor is already at or below
6649 ** the requested locking level, this routine is a no-op.
6650 */
6651 static int proxyUnlock(sqlite3_file *id, int eFileLock) {
6652   unixFile *pFile = (unixFile*)id;
6653   int rc = proxyTakeConch(pFile);
6654   if( rc==SQLITE_OK ){
6655     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6656     if( pCtx->conchHeld>0 ){
6657       unixFile *proxy = pCtx->lockProxy;
6658       rc = proxy->pMethod->xUnlock((sqlite3_file*)proxy, eFileLock);
6659       pFile->eFileLock = proxy->eFileLock;
6660     }else{
6661       /* conchHeld < 0 is lockless */
6662     }
6663   }
6664   return rc;
6665 }
6666
6667 /*
6668 ** Close a file that uses proxy locks.
6669 */
6670 static int proxyClose(sqlite3_file *id) {
6671   if( id ){
6672     unixFile *pFile = (unixFile*)id;
6673     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
6674     unixFile *lockProxy = pCtx->lockProxy;
6675     unixFile *conchFile = pCtx->conchFile;
6676     int rc = SQLITE_OK;
6677
6678     if( lockProxy ){
6679       rc = lockProxy->pMethod->xUnlock((sqlite3_file*)lockProxy, NO_LOCK);
6680       if( rc ) return rc;
6681       rc = lockProxy->pMethod->xClose((sqlite3_file*)lockProxy);
6682       if( rc ) return rc;
6683       sqlite3_free(lockProxy);
6684       pCtx->lockProxy = 0;
6685     }
6686     if( conchFile ){
6687       if( pCtx->conchHeld ){
6688         rc = proxyReleaseConch(pFile);
6689         if( rc ) return rc;
6690       }
6691       rc = conchFile->pMethod->xClose((sqlite3_file*)conchFile);
6692       if( rc ) return rc;
6693       sqlite3_free(conchFile);
6694     }
6695     sqlite3DbFree(0, pCtx->lockProxyPath);
6696     sqlite3_free(pCtx->conchFilePath);
6697     sqlite3DbFree(0, pCtx->dbPath);
6698     /* restore the original locking context and pMethod then close it */
6699     pFile->lockingContext = pCtx->oldLockingContext;
6700     pFile->pMethod = pCtx->pOldMethod;
6701     sqlite3_free(pCtx);
6702     return pFile->pMethod->xClose(id);
6703   }
6704   return SQLITE_OK;
6705 }
6706
6707
6708
6709 #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
6710 /*
6711 ** The proxy locking style is intended for use with AFP filesystems.
6712 ** And since AFP is only supported on MacOSX, the proxy locking is also
6713 ** restricted to MacOSX.
6714 **
6715 **
6716 ******************* End of the proxy lock implementation **********************
6717 ******************************************************************************/
6718
6719 /*
6720 ** Initialize the operating system interface.
6721 **
6722 ** This routine registers all VFS implementations for unix-like operating
6723 ** systems.  This routine, and the sqlite3_os_end() routine that follows,
6724 ** should be the only routines in this file that are visible from other
6725 ** files.
6726 **
6727 ** This routine is called once during SQLite initialization and by a
6728 ** single thread.  The memory allocation and mutex subsystems have not
6729 ** necessarily been initialized when this routine is called, and so they
6730 ** should not be used.
6731 */
6732 int sqlite3_os_init(void){
6733   /*
6734   ** The following macro defines an initializer for an sqlite3_vfs object.
6735   ** The name of the VFS is NAME.  The pAppData is a pointer to a pointer
6736   ** to the "finder" function.  (pAppData is a pointer to a pointer because
6737   ** silly C90 rules prohibit a void* from being cast to a function pointer
6738   ** and so we have to go through the intermediate pointer to avoid problems
6739   ** when compiling with -pedantic-errors on GCC.)
6740   **
6741   ** The FINDER parameter to this macro is the name of the pointer to the
6742   ** finder-function.  The finder-function returns a pointer to the
6743   ** sqlite_io_methods object that implements the desired locking
6744   ** behaviors.  See the division above that contains the IOMETHODS
6745   ** macro for addition information on finder-functions.
6746   **
6747   ** Most finders simply return a pointer to a fixed sqlite3_io_methods
6748   ** object.  But the "autolockIoFinder" available on MacOSX does a little
6749   ** more than that; it looks at the filesystem type that hosts the
6750   ** database file and tries to choose an locking method appropriate for
6751   ** that filesystem time.
6752   */
6753   #define UNIXVFS(VFSNAME, FINDER) {                        \
6754     3,                    /* iVersion */                    \
6755     sizeof(unixFile),     /* szOsFile */                    \
6756     MAX_PATHNAME,         /* mxPathname */                  \
6757     0,                    /* pNext */                       \
6758     VFSNAME,              /* zName */                       \
6759     (void*)&FINDER,       /* pAppData */                    \
6760     unixOpen,             /* xOpen */                       \
6761     unixDelete,           /* xDelete */                     \
6762     unixAccess,           /* xAccess */                     \
6763     unixFullPathname,     /* xFullPathname */               \
6764     unixDlOpen,           /* xDlOpen */                     \
6765     unixDlError,          /* xDlError */                    \
6766     unixDlSym,            /* xDlSym */                      \
6767     unixDlClose,          /* xDlClose */                    \
6768     unixRandomness,       /* xRandomness */                 \
6769     unixSleep,            /* xSleep */                      \
6770     unixCurrentTime,      /* xCurrentTime */                \
6771     unixGetLastError,     /* xGetLastError */               \
6772     unixCurrentTimeInt64, /* xCurrentTimeInt64 */           \
6773     unixSetSystemCall,    /* xSetSystemCall */              \
6774     unixGetSystemCall,    /* xGetSystemCall */              \
6775     unixNextSystemCall,   /* xNextSystemCall */             \
6776   }
6777
6778   /*
6779   ** All default VFSes for unix are contained in the following array.
6780   **
6781   ** Note that the sqlite3_vfs.pNext field of the VFS object is modified
6782   ** by the SQLite core when the VFS is registered.  So the following
6783   ** array cannot be const.
6784   */
6785   static sqlite3_vfs aVfs[] = {
6786 #if SQLITE_ENABLE_LOCKING_STYLE && (OS_VXWORKS || defined(__APPLE__))
6787     UNIXVFS("unix",          autolockIoFinder ),
6788 #else
6789     UNIXVFS("unix",          posixIoFinder ),
6790 #endif
6791     UNIXVFS("unix-none",     nolockIoFinder ),
6792     UNIXVFS("unix-dotfile",  dotlockIoFinder ),
6793     UNIXVFS("unix-excl",     posixIoFinder ),
6794 #if OS_VXWORKS
6795     UNIXVFS("unix-namedsem", semIoFinder ),
6796 #endif
6797 #if SQLITE_ENABLE_LOCKING_STYLE
6798     UNIXVFS("unix-posix",    posixIoFinder ),
6799 #if !OS_VXWORKS
6800     UNIXVFS("unix-flock",    flockIoFinder ),
6801 #endif
6802 #endif
6803 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
6804     UNIXVFS("unix-afp",      afpIoFinder ),
6805     UNIXVFS("unix-nfs",      nfsIoFinder ),
6806     UNIXVFS("unix-proxy",    proxyIoFinder ),
6807 #endif
6808   };
6809   unsigned int i;          /* Loop counter */
6810
6811   /* Double-check that the aSyscall[] array has been constructed
6812   ** correctly.  See ticket [bb3a86e890c8e96ab] */
6813   assert( ArraySize(aSyscall)==20 );
6814
6815   /* Register all VFSes defined in the aVfs[] array */
6816   for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){
6817     sqlite3_vfs_register(&aVfs[i], i==0);
6818   }
6819   return SQLITE_OK;
6820 }
6821
6822 /*
6823 ** Shutdown the operating system interface.
6824 **
6825 ** Some operating systems might need to do some cleanup in this routine,
6826 ** to release dynamically allocated objects.  But not on unix.
6827 ** This routine is a no-op for unix.
6828 */
6829 int sqlite3_os_end(void){
6830   return SQLITE_OK;
6831 }
6832
6833 #endif /* SQLITE_OS_UNIX */