3 * libjio - A library for Journaled I/O
4 * Alberto Bertogli (albertogli@telpin.com.ar)
6 * Core transaction API and recovery functions
/* NOTE(review): this chunk is a damaged extraction -- each original C line is
 * split across several physical lines, and the leading integers ("30", "31",
 * ...) are leftover source line numbers, not code. Gaps in that numbering
 * (38 -> 41 -> 46 -> 50 below) show whole statements were lost, so this pass
 * only adds comments; no original token is altered. */
/* get_tid: allocates the next transaction id for the journal of `fs`.
 * Serializes against other processes by taking a whole-file write lock on
 * the journal lock file descriptor (fs->jfd) for the duration. */
30 /* gets a new transaction id */
31 static unsigned int get_tid(struct jfs
*fs
)
/* NOTE(review): opening brace / prologue (original line 32) missing. */
33 unsigned int curid
, rv
;
35 /* lock the whole file */
36 plockf(fs
->jfd
, F_LOCKW
, 0, 0);
38 /* read the current max. curid */
/* NOTE(review): original lines 39-40 (the actual read of the current max
 * tid into curid) are missing from this extraction -- TODO recover. */
41 /* increment it and handle overflows */
/* NOTE(review): original lines 42-45 (increment + overflow handling that
 * presumably computes rv) are missing. */
46 /* write to the file descriptor */
/* NOTE(review): original lines 47-49 (write-back of the new max tid) are
 * missing. */
50 plockf(fs
->jfd
, F_UNLOCK
, 0, 0);
/* NOTE(review): the `return rv;` and closing brace (original lines 51-52)
 * are missing. */
/* free_tid: releases transaction id `tid`. From the visible code: it takes a
 * whole-file lock on fs->jfd, and when the freed tid was the current maximum
 * it scans downwards from curid-1 probing per-tid transaction files with
 * access(R_OK|W_OK) to find the new maximum.
 * NOTE(review): damaged extraction -- leading integers are leftover source
 * line numbers; numbering gaps mark lost statements (see notes below). */
54 /* frees a transaction id */
55 static void free_tid(struct jfs
*fs
, unsigned int tid
)
/* NOTE(review): opening brace / prologue (original line 56) missing; the
 * `name` buffer used below is declared in the missing part. */
57 unsigned int curid
, i
;
60 /* lock the whole file */
61 plockf(fs
->jfd
, F_LOCKW
, 0, 0);
63 /* read the current max. curid */
/* NOTE(review): original lines 64-66 (read of curid) are missing. */
67 /* we're not freeing the max. curid, so we just return */
/* NOTE(review): original lines 68-69 (the early-return branch for
 * tid != curid) are missing. */
70 /* look up the new max. */
71 for (i
= curid
- 1; i
> 0; i
--) {
72 /* this can fail if we're low on mem, but we don't
73 * care checking here because the problem will come
74 * out later and we can fail more properly */
75 get_jtfile(fs
->name
, i
, name
);
76 if (access(name
, R_OK
| W_OK
) == 0) {
/* NOTE(review): original lines 77-86 (body of this if -- presumably
 * recording i as the new max and breaking out -- plus the write-back of the
 * new max and closing braces) are missing. */
87 plockf(fs
->jfd
, F_UNLOCK
, 0, 0);
/* NOTE(review): closing brace (original line 88+) missing. */
93 * transaction functions
/* jtrans_init: prepares a jtrans structure for use with `fs`. Visible code
 * copies the journal's default flags into the transaction and initializes
 * the per-transaction mutex with default attributes.
 * NOTE(review): damaged extraction -- original lines 98-101 and 103-104
 * (presumably ts->fs, ts->name/id/op and other field initialization) are
 * missing, as are the braces; leading integers are leftover line numbers. */
96 /* initialize a transaction structure */
97 void jtrans_init(struct jfs
*fs
, struct jtrans
*ts
)
102 ts
->flags
= fs
->flags
;
105 pthread_mutex_init( &(ts
->lock
), NULL
);
/* NOTE(review): remaining initialization and closing brace missing. */
/* jtrans_free: releases everything owned by a jtrans. Visible code walks the
 * operation list head-first (saving ->next into tmpop before presumably
 * freeing the node) and destroys the transaction mutex at the end.
 * NOTE(review): damaged extraction -- original lines 111-118 (declarations,
 * frees of ts->name etc.) and 121-129 (the loop body that frees each op's
 * buffers and advances ts->op) are missing; braces are missing too. */
109 /* free the contents of a transaction structure */
110 void jtrans_free(struct jtrans
*ts
)
119 while (ts
->op
!= NULL
) {
120 tmpop
= ts
->op
->next
;
/* NOTE(review): frees of ts->op->buf / ->pdata / the node itself and the
 * `ts->op = tmpop;` advance (original lines 121-129) are missing. */
130 pthread_mutex_destroy(&(ts
->lock
));
/* NOTE(review): closing brace missing. */
/* jtrans_add: appends one write operation (buf, count bytes at offset) to
 * transaction `ts`. Visible behavior: under ts->lock it allocates a new
 * joper at the tail of the list (special-casing an empty list), links the
 * prev pointer, then outside the lock copies the caller's buffer into
 * jop->buf so the caller may reuse it immediately.
 * NOTE(review): damaged extraction -- leading integers are leftover source
 * line numbers; numbering gaps mark lost statements (see notes below). */
134 int jtrans_add(struct jtrans
*ts
, const void *buf
, size_t count
, off_t offset
)
136 struct joper
*jop
, *tmpop
;
138 /* find the last operation in the transaction and create a new one at
/* NOTE(review): tail of this comment (original line 139) is missing. */
140 pthread_mutex_lock(&(ts
->lock
));
141 if (ts
->op
== NULL
) {
142 ts
->op
= malloc(sizeof(struct joper
));
/* NOTE(review): original lines 143-147 (malloc failure check and the
 * jop/prev setup for the empty-list case) are missing. */
148 for (tmpop
= ts
->op
; tmpop
->next
!= NULL
; tmpop
= tmpop
->next
)
150 tmpop
->next
= malloc(sizeof(struct joper
));
151 if (tmpop
->next
== NULL
)
/* NOTE(review): the error path for this malloc check (original line 152)
 * is missing. */
153 tmpop
->next
->prev
= tmpop
;
/* NOTE(review): original lines 154-155 (presumably jop = tmpop->next and
 * next/field initialization) are missing. */
156 pthread_mutex_unlock(&(ts
->lock
));
158 jop
->buf
= malloc(count
);
159 if (jop
->buf
== NULL
) {
/* NOTE(review): original lines 160-163 (cleanup on allocation failure)
 * are missing. */
164 /* we copy the buffer because then the caller can reuse it */
165 memcpy(jop
->buf
, buf
, count
);
167 jop
->offset
= offset
;
/* NOTE(review): original lines 166 and 168+ (jop->len = count, numops
 * update, return value, closing brace) are missing. */
/* jtrans_commit: writes the transaction to its journal file, fsyncs it, then
 * applies each operation to the real file; on failure it attempts a
 * rollback. Visible phases: allocate tid -> create/lock journal file ->
 * write disk header (id, flags, numops) -> range-lock target regions (unless
 * J_NOLOCK) -> per-op: snapshot previous data for rollback (unless
 * J_NOROLLBACK), write op header + data -> checksum -> fsync (with
 * sync() fallback for EINVAL/EBADF on the directory fd) -> apply ops to
 * fs->fd and unlock ranges -> either linger (J_LINGER) or free the tid ->
 * set J_COMMITED; failure path rollbacks and sets J_ROLLBACKED.
 * NOTE(review): heavily damaged extraction -- leading integers are leftover
 * source line numbers; every numbering gap below marks lost statements
 * (declarations of name/fd/id/rv/curpos/op/csum, all error-exit branches,
 * unlink of the journal file, the final return). Comments only; no token
 * altered. */
178 /* commit a transaction */
179 int jtrans_commit(struct jtrans
*ts
)
184 unsigned char *buf_init
, *bufp
;
186 struct jlinger
*linger
;
/* NOTE(review): declarations of name, fd, id, rv, curpos, op, csum
 * (original lines 180-189) are largely missing. */
190 pthread_mutex_lock(&(ts
->lock
));
192 name
= (char *) malloc(PATH_MAX
);
/* NOTE(review): malloc failure check (original lines 193-195) missing. */
196 id
= get_tid(ts
->fs
);
/* NOTE(review): tid-exhaustion check (original lines 197-199) missing. */
200 /* open the transaction file */
201 if (!get_jtfile(ts
->fs
->name
, id
, name
))
/* NOTE(review): the error exit for this branch (original line 202)
 * is missing. */
203 fd
= open(name
, O_RDWR
| O_CREAT
| O_TRUNC
| O_LARGEFILE
, 0600);
/* NOTE(review): open() failure check (original lines 204-207) missing. */
208 plockf(fd
, F_LOCKW
, 0, 0);
/* NOTE(review): original lines 209-212 missing. */
213 /* save the header */
214 buf_init
= malloc(J_DISKHEADSIZE
);
215 if (buf_init
== NULL
)
/* NOTE(review): error exit (original lines 216-219, incl. bufp = buf_init)
 * missing -- bufp is used below without its visible assignment. */
220 memcpy(bufp
, (void *) &(ts
->id
), 4);
/* NOTE(review): `bufp += 4;` style advances between these memcpys
 * (original lines 221-222, 224-225, 227-228) are missing. */
223 memcpy(bufp
, (void *) &(ts
->flags
), 4);
226 memcpy(bufp
, (void *) &(ts
->numops
), 4);
229 rv
= spwrite(fd
, buf_init
, J_DISKHEADSIZE
, 0);
230 if (rv
!= J_DISKHEADSIZE
) {
/* NOTE(review): short-write error path (original lines 231-236, freeing
 * buf_init and bailing out) is missing. */
237 curpos
= J_DISKHEADSIZE
;
239 /* first of all lock all the regions we're going to work with;
240 * otherwise there could be another transaction trying to write the
241 * same spots and we could end up with interleaved writes, that could
242 * break atomicity warantees if we need to rollback */
243 if (!(ts
->flags
& J_NOLOCK
)) {
244 for (op
= ts
->op
; op
!= NULL
; op
= op
->next
) {
245 rv
= plockf(ts
->fs
->fd
, F_LOCKW
, op
->offset
, op
->len
);
247 /* note it can fail with EDEADLK */
/* NOTE(review): the EDEADLK handling and closing braces (original lines
 * 246, 248-252) are missing. */
253 /* save each transacion in the file */
254 for (op
= ts
->op
; op
!= NULL
; op
= op
->next
) {
255 /* read the current content only if the transaction is not
256 * marked as NOROLLBACK, and if the data is not there yet,
257 * which is the normal case, but for rollbacking we fill it
/* NOTE(review): tail of this comment (original line 258) missing. */
259 if (!(ts
->flags
& J_NOROLLBACK
) && (op
->pdata
== NULL
)) {
260 op
->pdata
= malloc(op
->len
);
261 if (op
->pdata
== NULL
)
/* NOTE(review): error exit (original lines 262-265) missing. */
266 rv
= spread(ts
->fs
->fd
, op
->pdata
, op
->len
,
/* NOTE(review): the spread() offset argument and the short-read /
 * file-extension handling (original lines 267-270) are missing; op->plen
 * is presumably set there. */
271 /* we are extending the file! */
272 /* ftruncate(ts->fs->fd, op->offset + op->len); */
/* NOTE(review): original lines 273-276 (closing braces of the extension
 * branch) are missing. */
277 /* save the operation's header */
278 buf_init
= malloc(J_DISKOPHEADSIZE
);
279 if (buf_init
== NULL
)
/* NOTE(review): error exit and bufp reset (original lines 280-283)
 * missing. */
284 memcpy(bufp
, (void *) &(op
->len
), 4);
/* NOTE(review): bufp advances between memcpys (original lines 285-286,
 * 288-289, 291-292) are missing. Note offset is copied as 8 bytes
 * (off_t), len/plen as 4. */
287 memcpy(bufp
, (void *) &(op
->plen
), 4);
290 memcpy(bufp
, (void *) &(op
->offset
), 8);
293 rv
= spwrite(fd
, buf_init
, J_DISKOPHEADSIZE
, curpos
);
294 if (rv
!= J_DISKOPHEADSIZE
) {
/* NOTE(review): short-write error path (original lines 295-300)
 * missing. */
301 curpos
+= J_DISKOPHEADSIZE
;
303 /* and save it to the disk */
304 rv
= spwrite(fd
, op
->buf
, op
->len
, curpos
);
/* NOTE(review): short-write check, curpos += op->len, and the loop's
 * closing brace (original lines 305-310) are missing. */
311 /* compute and save the checksum */
312 if (!checksum(fd
, curpos
, &csum
))
/* NOTE(review): error exit (original lines 313-314) missing. */
315 rv
= spwrite(fd
, &csum
, sizeof(uint32_t), curpos
);
316 if (rv
!= sizeof(uint32_t))
/* NOTE(review): error exit (original line 317) missing. */
318 curpos
+= sizeof(uint32_t);
320 /* this is a simple but efficient optimization: instead of doing
321 * everything O_SYNC, we sync at this point only, this way we avoid
322 * doing a lot of very small writes; in case of a crash the
323 * transaction file is only useful if it's complete (ie. after this
324 * point) so we only flush here (both data and metadata) */
/* NOTE(review): the fsync(fd) of the transaction file itself (original
 * lines 325-326) is missing; only the directory fsync is visible. */
327 if (fsync(ts
->fs
->jdirfd
) != 0) {
328 /* it seems to be legal that fsync() on directories is not
329 * implemented, so if this fails with EINVAL or EBADF, just
330 * call a global sync(); which is awful (and might still
331 * return before metadata is done) but it seems to be the
332 * saner choice; otherwise we just fail */
333 if (errno
== EINVAL
|| errno
== EBADF
) {
/* NOTE(review): the sync() call, the else/error branch and closing
 * braces (original lines 334-339, 341) are missing. */
340 /* now that we have a safe transaction file, let's apply it */
342 for (op
= ts
->op
; op
!= NULL
; op
= op
->next
) {
343 rv
= spwrite(ts
->fs
->fd
, op
->buf
, op
->len
, op
->offset
);
/* NOTE(review): the short-write check (original line 344) is missing;
 * the per-op range unlock below runs even though J_NOLOCK transactions
 * never took the locks -- can't confirm intent from this extraction. */
345 plockf(ts
->fs
->fd
, F_UNLOCK
, op
->offset
, op
->len
);
/* NOTE(review): loop closing brace and post-apply code (original lines
 * 346-353) are missing. */
354 if (ts
->flags
& J_LINGER
) {
355 linger
= malloc(sizeof(struct jlinger
));
/* NOTE(review): malloc check and linger->id assignment (original lines
 * 356-359) are missing. */
360 linger
->name
= strdup(name
);
361 linger
->next
= ts
->fs
->ltrans
;
/* NOTE(review): original line 362 missing; the list push below is not
 * visibly protected by any fs-level lock in this extraction -- TODO
 * confirm synchronization against jsync(). */
363 ts
->fs
->ltrans
= linger
;
/* NOTE(review): the else branch opener (original lines 364, 367) is
 * missing -- the cleanup below is presumably the non-linger path. */
365 /* the transaction has been applied, so we cleanup and remove
366 * it from the disk */
/* NOTE(review): the unlink() of the transaction file (original line 367
 * area) is missing. */
368 free_tid(ts
->fs
, ts
->id
);
/* NOTE(review): original lines 369-370 (closing brace, free(name) etc.)
 * missing. */
371 /* mark the transaction as commited, _after_ it was removed */
372 ts
->flags
= ts
->flags
| J_COMMITED
;
/* NOTE(review): the exit/cleanup label this falls into (original lines
 * 373-375) is missing. */
376 /* If the transaction failed we try to recover by rollbacking it
377 * NOTE: on extreme conditions (ENOSPC/disk failure) this can fail
378 * too! There's nothing much we can do in that case, the caller should
379 * take care of it by itself.
380 * The transaction file might be OK at this point, so the data could
381 * be recovered by a posterior jfsck(); however, that's not what the
382 * user expects (after all, if we return failure, new data should
383 * never appear), so we remove the transaction file.
384 * Transactions that were successfuly recovered by rollbacking them
385 * will have J_ROLLBACKED in their flags, so the caller can verify if
386 * the failure was recovered or not. */
387 if (!(ts
->flags
& J_COMMITED
) && !(ts
->flags
& J_ROLLBACKING
)) {
/* NOTE(review): original line 388 missing. J_ROLLBACKING guards against
 * recursive rollback (jtrans_rollback() calls jtrans_commit()). */
389 ts
->flags
= ts
->flags
| J_NOLOCK
| J_ROLLBACKING
;
390 if (jtrans_rollback(ts
) >= 0) {
/* NOTE(review): `rv` below was last visibly assigned from an spwrite();
 * OR-ing it into flags looks suspicious (possibly should be ts->flags) --
 * can't confirm against the lost surrounding lines, flagging only. */
391 ts
->flags
= rv
| J_ROLLBACKED
;
/* NOTE(review): closing braces and rv adjustment (original lines
 * 392-397) are missing. */
398 if (!(ts
->flags
& J_COMMITED
)) {
/* NOTE(review): unlink of the transaction file (original line 399) is
 * presumably in the missing part. */
400 free_tid(ts
->fs
, ts
->id
);
/* NOTE(review): closing brace and cleanup (original lines 401-403)
 * missing. */
404 for (op
= ts
->op
; op
!= NULL
; op
= op
->next
) {
/* NOTE(review): original line 405 missing -- this loop presumably also
 * frees op->pdata; only the range unlock survives. */
406 plockf(ts
->fs
->fd
, F_UNLOCK
, op
->offset
, op
->len
);
/* NOTE(review): loop close, free(name), close(fd) (original lines
 * 407-409) are missing. */
410 pthread_mutex_unlock(&(ts
->lock
));
412 /* return the length only if it was properly commited */
413 if (ts
->flags
& J_COMMITED
)
/* NOTE(review): the two return statements and closing brace (original
 * lines 414+) are missing. */
/* jtrans_rollback: undoes a transaction by building a new transaction whose
 * operations write back the previously-saved data (op->pdata/op->plen) in
 * reverse order, then committing it. Visible steps: init newts from ts ->
 * bail out if there are no ops or J_NOROLLBACK is set -> walk to the last
 * op -> iterate backwards via ->prev, ftruncate()-ing back any extension
 * (op->plen < op->len) and appending a mirror op to newts -> commit newts ->
 * free newts's op list. Note curop->buf and curop->pdata both alias
 * op->pdata (no copy), so ownership is shared with the original transaction.
 * NOTE(review): damaged extraction -- leading integers are leftover source
 * line numbers; numbering gaps mark lost statements (see notes below). */
420 /* rollback a transaction */
421 int jtrans_rollback(struct jtrans
*ts
)
/* NOTE(review): declarations of newts and rv (original lines 422-424)
 * are missing. */
425 struct joper
*op
, *curop
, *lop
;
427 jtrans_init(ts
->fs
, &newts
);
428 newts
.flags
= ts
->flags
;
/* NOTE(review): original line 429 missing. */
430 if (ts
->op
== NULL
|| ts
->flags
& J_NOROLLBACK
) {
/* NOTE(review): the early-return body (original lines 431-434) is
 * missing. */
435 /* find the last operation */
436 for (op
= ts
->op
; op
->next
!= NULL
; op
= op
->next
)
/* NOTE(review): the empty loop body marker (original lines 437-438)
 * is missing. */
439 /* and traverse the list backwards */
440 for ( ; op
!= NULL
; op
= op
->prev
) {
441 /* if we extended the data in the previous transaction, we
442 * should truncate it back */
443 /* DANGEROUS: this is one of the main reasons why rollbacking
444 * is dangerous and should only be done with extreme caution:
445 * if for some reason, after the previous transacton, we have
446 * extended the file further, this will cut it back to what it
447 * was; read the docs for more detail */
448 if (op
->plen
< op
->len
)
449 ftruncate(ts
->fs
->fd
, op
->offset
+ op
->plen
);
/* NOTE(review): ftruncate()'s return value is unchecked here; original
 * line 450 is missing. */
451 /* manually add the operation to the new transaction */
452 curop
= malloc(sizeof(struct joper
));
/* NOTE(review): malloc failure check (original lines 453-457) is
 * missing. */
458 curop
->offset
= op
->offset
;
459 curop
->len
= op
->plen
;
460 curop
->buf
= op
->pdata
;
461 curop
->plen
= op
->plen
;
462 curop
->pdata
= op
->pdata
;
/* NOTE(review): original lines 463-464 (presumably curop->next/prev
 * setup) are missing. */
465 /* add the new transaction to the list */
466 if (newts
.op
== NULL
) {
/* NOTE(review): the empty-list append body and else branch (original
 * lines 467-470) are missing. */
471 for (lop
= newts
.op
; lop
->next
!= NULL
; lop
= lop
->next
)
/* NOTE(review): the tail-append statements and closing braces (original
 * lines 472-478) are missing. */
479 rv
= jtrans_commit(&newts
);
/* NOTE(review): original lines 480-481 missing. */
482 /* free the transaction */
483 for (curop
= newts
.op
; curop
!= NULL
; curop
= curop
->next
) {
/* NOTE(review): the loop body (original lines 484+ -- nulling shared
 * buf/pdata pointers before freeing, the free calls, and the final
 * `return rv;`) is missing. Iterating with curop = curop->next while
 * presumably freeing curop would be use-after-free unless the missing
 * body saves the next pointer first -- TODO confirm against upstream. */
/* jopen: opens `name` for journaled I/O and fills in `fs`. Visible steps:
 * force O_RDWR (read access is needed to snapshot pre-images, write access
 * for fcntl locking) -> open the file -> dup the name -> init fs->lock ->
 * create/verify the journal directory -> open it (fs->jdirfd) for the
 * directory fsync in jtrans_commit() -> create the "lock" file inside it ->
 * under a whole-file lock, initialize it with the first tid only if it is
 * empty (avoids a two-process init race) -> mmap it as fs->jmap.
 * NOTE(review): damaged extraction -- leading integers are leftover source
 * line numbers; numbering gaps mark lost statements (see notes below). */
497 int jopen(struct jfs
*fs
, const char *name
, int flags
, int mode
, int jflags
)
/* NOTE(review): declarations of fd, jfd, rv, t, sinfo (original lines
 * 498-500, 502-503) are missing. */
501 char jdir
[PATH_MAX
], jlockfile
[PATH_MAX
];
504 /* we always need read and write access, because when we commit a
505 * transaction we read the current contents before applying, and write
506 * access is needed for locking with fcntl */
507 flags
= flags
& ~O_WRONLY
;
508 flags
= flags
& ~O_RDONLY
;
509 flags
= flags
| O_RDWR
;
511 fd
= open(name
, flags
, mode
);
/* NOTE(review): open() failure check and fs->fd / fs->jflags assignments
 * (original lines 512-515) are missing. */
516 fs
->name
= strdup(name
);
/* NOTE(review): original lines 517-519 missing. */
520 /* Note on fs->lock usage: this lock is used only inside the wrappers,
521 * and exclusively to protect the file pointer. This means that it
522 * must only be held while performing operations that depend or alter
523 * the file pointer (jread, jreadv, jwrite, jwritev), but the others
524 * (jpread, jpwrite) are left unprotected because they can be
525 * performed in paralell as long as they don't affect the same portion
526 * of the file (this is protected by lockf). The lock doesn't slow
527 * things down tho: any threaded app MUST implement this kind of
528 * locking anyways if it wants to prevent data corruption, we only
529 * make it easier for them by taking care of it here. If performance
530 * is essential, the jpread/jpwrite functions should be used, just as
/* NOTE(review): tail of this comment (original line 531) is missing. */
532 pthread_mutex_init( &(fs
->lock
), NULL
);
534 if (!get_jdir(name
, jdir
))
/* NOTE(review): the error exit (original line 535) is missing. mkdir()'s
 * rv is immediately overwritten by lstat() below -- EEXIST is tolerated
 * by design (the lstat+S_ISDIR check validates the directory). */
536 rv
= mkdir(jdir
, 0750);
537 rv
= lstat(jdir
, &sinfo
);
538 if (rv
< 0 || !S_ISDIR(sinfo
.st_mode
))
/* NOTE(review): error exit (original lines 539-540) missing. */
541 /* open the directory, we will use it to flush transaction files'
542 * metadata in jtrans_commit() */
543 fs
->jdirfd
= open(jdir
, O_RDONLY
);
/* NOTE(review): open() failure check (original lines 544-546) missing. */
547 snprintf(jlockfile
, PATH_MAX
, "%s/%s", jdir
, "lock");
548 jfd
= open(jlockfile
, O_RDWR
| O_CREAT
, 0600);
/* NOTE(review): open() failure check and fs->jfd assignment (original
 * lines 549-551) are missing. */
552 /* initialize the lock file by writing the first tid to it, but only
553 * if its empty, otherwise there is a race if two processes call
554 * jopen() simultaneously and both initialize the file */
555 plockf(jfd
, F_LOCKW
, 0, 0);
556 lstat(jlockfile
, &sinfo
);
557 if (sinfo
.st_size
!= sizeof(unsigned int)) {
/* NOTE(review): the assignment of t (the initial tid value, original
 * line 558) is missing. */
559 rv
= spwrite(jfd
, &t
, sizeof(t
), 0);
560 if (rv
!= sizeof(t
)) {
561 plockf(jfd
, F_UNLOCK
, 0, 0);
/* NOTE(review): the failure return and closing braces (original lines
 * 562-564) are missing. */
565 plockf(jfd
, F_UNLOCK
, 0, 0);
/* NOTE(review): original lines 566-568 missing. */
569 fs
->jmap
= (unsigned int *) mmap(NULL
, sizeof(unsigned int),
570 PROT_READ
| PROT_WRITE
, MAP_SHARED
, jfd
, 0);
571 if (fs
->jmap
== MAP_FAILED
)
/* NOTE(review): the error exit, the `return fd;` and closing brace
 * (original lines 572+) are missing. */
/* jsync: flushes lingering transactions (J_LINGER): under fs->lock it walks
 * the fs->ltrans list, freeing each lingering tid and unlinking its journal
 * file. Meaningful only when lingering transactions are in use.
 * NOTE(review): damaged extraction -- original lines 579-589 (declaration
 * of rv, the fsync of fs->fd, and the `linger = fs->ltrans;` initialization
 * that must precede the loop) and 593-601 (list advance, node free,
 * fs->ltrans reset) are missing; leading integers are leftover line
 * numbers. */
577 /* sync a file (makes sense only if using lingering transactions) */
578 int jsync(struct jfs
*fs
)
581 struct jlinger
*linger
, *ltmp
;
583 pthread_mutex_lock(&(fs
->lock
));
/* NOTE(review): `linger` is read below without a visible initialization
 * -- it is presumably set to fs->ltrans in the missing lines. */
590 while (linger
!= NULL
) {
591 free_tid(fs
, linger
->id
);
592 unlink(linger
->name
);
/* NOTE(review): the free of linger->name / the node (via ltmp) and the
 * list advance (original lines 593-601) are missing. */
602 pthread_mutex_unlock(&(fs
->lock
));
/* NOTE(review): return statement and closing brace missing. */
/* jclose: tears down a jfs: closes the journal directory fd, frees the
 * strdup()ed name, unmaps the tid map and destroys the file-pointer mutex.
 * NOTE(review): damaged extraction -- original lines 608-614 (presumably
 * jsync() for lingering transactions and close(fs->fd)), 616-617 (the error
 * handling for the close below), 619 (the free(fs->name) the surviving
 * comment refers to) and 622+ (return, closing brace) are missing; leading
 * integers are leftover source line numbers. */
607 int jclose(struct jfs
*fs
)
615 if (close(fs
->jdirfd
))
/* NOTE(review): the body of this if (original lines 616-617) is
 * missing. */
618 /* allocated by strdup() in jopen() */
/* NOTE(review): the free(fs->name) this comment documents (original line
 * 619) is missing. */
620 munmap(fs
->jmap
, sizeof(unsigned int));
621 pthread_mutex_destroy(&(fs
->lock
));
/* NOTE(review): return value and closing brace missing. */