3 * libjio - A library for Journaled I/O
4 * Alberto Bertogli (albertogli@telpin.com.ar)
23 * small util functions
26 /* like lockf, but lock always from the beginning of the file */
27 static off_t
plockf(int fd
, int cmd
, off_t offset
, off_t len
)
35 } else if (cmd
== F_ULOCK
) {
38 } else if (cmd
== F_TLOCK
) {
44 fl
.l_whence
= SEEK_SET
;
48 return fcntl(fd
, op
, &fl
);
51 /* like pread() but either fails or returns a complete read; if we return
52 * less than count, it is because EOF was reached */
53 static ssize_t
spread(int fd
, void *buf
, size_t count
, off_t offset
)
60 rv
= pread(fd
, buf
+ c
, count
- c
, offset
+ c
);
72 /* incomplete read, keep on reading */
79 /* like spread() but for pwrite() */
80 static ssize_t
spwrite(int fd
, void *buf
, size_t count
, off_t offset
)
87 rv
= pwrite(fd
, buf
+ c
, count
- c
, offset
+ c
);
93 /* error/nothing was written */
96 /* incomplete write, keep on writing */
103 /* build the journal directory name out of the filename */
104 static int get_jdir(char *filename
, char *jdir
)
109 baset
= strdup(filename
);
112 base
= basename(baset
);
114 dirt
= strdup(filename
);
119 snprintf(jdir
, PATH_MAX
, "%s/.%s.jio", dir
, base
);
127 /* build the filename of a given transaction */
128 static int get_jtfile(char *filename
, int tid
, char *jtfile
)
133 baset
= strdup(filename
);
136 base
= basename(baset
);
138 dirt
= strdup(filename
);
143 snprintf(jtfile
, PATH_MAX
, "%s/.%s.jio/%d", dir
, base
, tid
);
151 /* gets a new transaction id */
152 static unsigned int get_tid(struct jfs
*fs
)
157 /* lock the whole file */
158 plockf(fs
->jfd
, F_LOCK
, 0, 0);
160 /* read the current max. curid */
161 r
= spread(fs
->jfd
, &curid
, sizeof(curid
), 0);
162 if (r
!= sizeof(curid
)) {
167 /* increment it and handle overflows */
172 /* write to the file descriptor */
173 r
= spwrite(fs
->jfd
, &rv
, sizeof(rv
), 0);
174 if (r
!= sizeof(curid
)) {
180 plockf(fs
->jfd
, F_ULOCK
, 0, 0);
184 /* frees a transaction id */
185 static void free_tid(struct jfs
*fs
, unsigned int tid
)
187 unsigned int curid
, i
;
191 /* lock the whole file */
192 plockf(fs
->jfd
, F_LOCK
, 0, 0);
194 /* read the current max. curid */
195 r
= spread(fs
->jfd
, &curid
, sizeof(curid
), 0);
196 if (r
!= sizeof(curid
)) {
201 /* we're not freeing the max. curid, so we just return */
204 /* look up the new max. */
205 for (i
= curid
- 1; i
> 0; i
--) {
206 /* this can fail if we're low on memory, but we don't
207 * bother checking here because the problem will show up
208 * later, where we can fail more properly */
209 get_jtfile(fs
->name
, i
, name
);
210 if (access(name
, R_OK
| W_OK
) == 0) {
217 r
= spwrite(fs
->jfd
, &i
, sizeof(i
), 0);
218 if (r
!= sizeof(curid
)) {
224 plockf(fs
->jfd
, F_ULOCK
, 0, 0);
230 * transaction functions
233 /* initialize a transaction structure */
234 void jtrans_init(struct jfs
*fs
, struct jtrans
*ts
)
249 /* free a transaction structure */
250 void jtrans_free(struct jtrans
*ts
)
252 /* NOTE: we only really free the name and previous data, which are the
253 * things _we_ allocate; the user data is caller stuff */
262 /* commit a transaction */
263 int jtrans_commit(struct jtrans
*ts
)
267 void *buf_init
, *bufp
;
269 name
= (char *) malloc(PATH_MAX
);
273 id
= get_tid(ts
->fs
);
277 /* open the transaction file */
278 if (!get_jtfile(ts
->fs
->name
, id
, name
))
280 fd
= open(name
, O_RDWR
| O_CREAT
| O_TRUNC
| O_LARGEFILE
, 0600);
285 plockf(fd
, F_LOCK
, 0, 0);
290 /* lock the file region to work on */
291 if (!(ts
->fs
->flags
& J_NOLOCK
))
292 plockf(ts
->fs
->fd
, F_LOCK
, ts
->offset
, ts
->len
);
294 /* first the static data */
296 buf_init
= malloc(J_DISKTFIXSIZE
);
297 if (buf_init
== NULL
)
302 memcpy(bufp
, (void *) &(ts
->id
), sizeof(ts
->id
));
305 memcpy(bufp
, (void *) &(ts
->flags
), sizeof(ts
->flags
));
308 memcpy(bufp
, (void *) &(ts
->len
), sizeof(ts
->len
));
311 memcpy(bufp
, (void *) &(ts
->ulen
), sizeof(ts
->ulen
));
314 memcpy(bufp
, (void *) &(ts
->offset
), sizeof(ts
->offset
));
317 rv
= spwrite(fd
, buf_init
, J_DISKTFIXSIZE
, 0);
318 if (rv
!= J_DISKTFIXSIZE
)
324 /* and now the variable part */
327 rv
= spwrite(fd
, ts
->udata
, ts
->ulen
, J_DISKTFIXSIZE
);
332 ts
->pdata
= malloc(ts
->len
);
333 if (ts
->pdata
== NULL
)
338 /* copy the current content into the transaction file */
339 rv
= spread(ts
->fs
->fd
, ts
->pdata
, ts
->len
, ts
->offset
);
343 /* we are extending the file! use ftruncate() to do it */
344 ftruncate(ts
->fs
->fd
, ts
->offset
+ ts
->len
);
350 t
= J_DISKTFIXSIZE
+ ts
->ulen
;
351 rv
= spwrite(fd
, ts
->pdata
, ts
->len
, t
);
355 /* save the new data in the transaction file */
356 t
= J_DISKTFIXSIZE
+ ts
->ulen
+ ts
->plen
;
357 rv
= spwrite(fd
, ts
->buf
, ts
->len
, t
);
361 /* this is a simple but efficient optimization: instead of doing
362 * everything O_SYNC, we sync at this point only, this way we avoid
363 * doing a lot of very small writes; in case of a crash the
364 * transaction file is only useful if it's complete (ie. after this
365 * point) so we only flush here */
368 /* now that we have a safe transaction file, let's apply it */
369 rv
= spwrite(ts
->fs
->fd
, ts
->buf
, ts
->len
, ts
->offset
);
373 /* mark the transaction as committed */
374 ts
->flags
= ts
->flags
| J_COMMITED
;
376 /* the transaction has been applied, so we cleanup and remove it from
378 free_tid(ts
->fs
, ts
->id
);
384 if (!(ts
->fs
->flags
& J_NOLOCK
))
385 plockf(ts
->fs
->fd
, F_ULOCK
, ts
->offset
, ts
->len
);
387 /* return the length only if it was properly committed */
388 if (ts
->flags
& J_COMMITED
)
395 /* rollback a transaction */
396 int jtrans_rollback(struct jtrans
*ts
)
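401 /* copy the old transaction to the new one
 * NOTE(review): the name buffer below is allocated with strlen(ts->name)
 * bytes and then filled with strcpy(), which leaves no room for the
 * terminating NUL -- confirm and allocate strlen(ts->name) + 1 */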
402 jtrans_init(ts
->fs
, &newts
);
404 newts
.name
= malloc(strlen(ts
->name
));
405 if (newts
.name
== NULL
)
408 strcpy(newts
.name
, ts
->name
);
409 newts
.flags
= ts
->flags
;
410 newts
.offset
= ts
->offset
;
412 newts
.buf
= ts
->pdata
;
413 newts
.len
= ts
->plen
;
415 if (ts
->plen
< ts
->len
) {
416 /* we extended the data in the previous transaction, so we
417 * should truncate it back */
418 /* DANGEROUS: this is one of the main reasons why rolling back
419 * is dangerous and should only be done with extreme caution:
420 * if for some reason, after the previous transaction, we have
421 * extended the file further, this will cut it back to what it
422 * was; read the docs for more detail */
423 ftruncate(ts
->fs
->fd
, ts
->offset
+ ts
->plen
);
427 newts
.pdata
= ts
->buf
;
428 newts
.plen
= ts
->len
;
430 newts
.udata
= ts
->udata
;
431 newts
.ulen
= ts
->ulen
;
433 rv
= jtrans_commit(&newts
);
443 int jopen(struct jfs
*fs
, char *name
, int flags
, int mode
, int jflags
)
447 char jdir
[PATH_MAX
], jlockfile
[PATH_MAX
];
450 fd
= open(name
, flags
, mode
);
458 pthread_mutex_init( &(fs
->lock
), NULL
);
460 if (!get_jdir(name
, jdir
))
462 rv
= mkdir(jdir
, 0750);
463 rv
= lstat(jdir
, &sinfo
);
464 if (rv
< 0 || !S_ISDIR(sinfo
.st_mode
))
467 snprintf(jlockfile
, PATH_MAX
, "%s/%s", jdir
, "lock");
468 if (access(jlockfile
, F_OK
) != 0) {
469 /* file doesn't exist, create it */
470 jfd
= open(jlockfile
, O_RDWR
| O_CREAT
| O_SYNC
, 0600);
472 jfd
= open(jlockfile
, O_RDWR
| O_SYNC
, 0600);
477 /* initialize the lock file by writing the first tid to it, but only
478 * if it's empty; otherwise there is a race if two processes call
479 * jopen() simultaneously and both initialize the file */
480 plockf(jfd
, F_LOCK
, 0, 0);
481 lstat(jlockfile
, &sinfo
);
482 if (sinfo
.st_size
== 0) {
484 rv
= write(jfd
, &t
, sizeof(t
));
485 if (rv
!= sizeof(t
)) {
486 plockf(jfd
, F_ULOCK
, 0, 0);
490 plockf(jfd
, F_ULOCK
, 0, 0);
498 ssize_t
jread(struct jfs
*fs
, void *buf
, size_t count
)
501 pthread_mutex_lock(&(fs
->lock
));
502 lockf(fs
->fd
, F_LOCK
, count
);
503 rv
= read(fs
->fd
, buf
, count
);
504 lockf(fs
->fd
, F_ULOCK
, -count
);
505 pthread_mutex_unlock(&(fs
->lock
));
511 ssize_t
jpread(struct jfs
*fs
, void *buf
, size_t count
, off_t offset
)
514 plockf(fs
->fd
, F_LOCK
, offset
, count
);
515 rv
= pread(fs
->fd
, buf
, count
, offset
);
516 plockf(fs
->fd
, F_ULOCK
, offset
, count
);
522 ssize_t
jreadv(struct jfs
*fs
, struct iovec
*vector
, int count
)
528 for (i
= 0; i
< count
; i
++)
529 sum
+= vector
[i
].iov_len
;
531 pthread_mutex_lock(&(fs
->lock
));
532 lockf(fs
->fd
, F_LOCK
, sum
);
533 rv
= readv(fs
->fd
, vector
, count
);
534 lockf(fs
->fd
, F_ULOCK
, -sum
);
535 pthread_mutex_unlock(&(fs
->lock
));
541 ssize_t
jwrite(struct jfs
*fs
, void *buf
, size_t count
)
547 pthread_mutex_lock(&(fs
->lock
));
549 jtrans_init(fs
, &ts
);
550 pos
= lseek(fs
->fd
, 0, SEEK_CUR
);
556 rv
= jtrans_commit(&ts
);
558 pthread_mutex_unlock(&(fs
->lock
));
563 ssize_t
jpwrite(struct jfs
*fs
, void *buf
, size_t count
, off_t offset
)
568 pthread_mutex_lock(&(fs
->lock
));
570 jtrans_init(fs
, &ts
);
576 rv
= jtrans_commit(&ts
);
578 pthread_mutex_unlock(&(fs
->lock
));
583 ssize_t
jwritev(struct jfs
*fs
, struct iovec
*vector
, int count
)
592 for (i
= 0; i
< count
; i
++)
593 sum
+= vector
[i
].iov_len
;
595 /* unify the buffers into one big chunk to commit */
596 /* FIXME: can't we do this more efficiently? It ruins the whole purpose
597 * of using writev() :\
598 * maybe we should do one transaction per vector */
604 for (i
= 0; i
< count
; i
++) {
605 memcpy(buf
+ bufp
, vector
[i
].iov_base
, vector
[i
].iov_len
);
606 bufp
+= vector
[i
].iov_len
;
609 pthread_mutex_lock(&(fs
->lock
));
611 jtrans_init(fs
, &ts
);
612 pos
= lseek(fs
->fd
, 0, SEEK_CUR
);
618 rv
= jtrans_commit(&ts
);
620 pthread_mutex_unlock(&(fs
->lock
));
625 /* truncate a file - be careful with this */
626 int jtruncate(struct jfs
*fs
, off_t lenght
)
630 /* lock from lenght to the end of file */
631 plockf(fs
->fd
, F_LOCK
, lenght
, 0);
632 rv
= ftruncate(fs
->fd
, lenght
);
633 plockf(fs
->fd
, F_ULOCK
, lenght
, 0);
639 int jclose(struct jfs
*fs
)
653 /* check the journal and replay the incomplete transactions */
654 int jfsck(char *name
, struct jfsck_result
*res
)
656 int fd
, jfd
, tfd
, rv
, i
, maxtid
;
657 char jdir
[PATH_MAX
], jlockfile
[PATH_MAX
], tname
[PATH_MAX
];
661 struct jtrans
*curts
;
665 fd
= open(name
, O_RDWR
| O_SYNC
| O_LARGEFILE
);
672 if (!get_jdir(name
, jdir
))
674 rv
= lstat(jdir
, &sinfo
);
675 if (rv
< 0 || !S_ISDIR(sinfo
.st_mode
))
678 snprintf(jlockfile
, PATH_MAX
, "%s/%s", jdir
, "lock");
679 jfd
= open(jlockfile
, O_RDWR
| O_SYNC
, 0600);
683 lstat(jlockfile
, &sinfo
);
684 if (sinfo
.st_size
== 0)
687 plockf(jfd
, F_LOCK
, 0, 0);
688 rv
= spread(jfd
, &maxtid
, sizeof(maxtid
), 0);
689 if (rv
!= sizeof(maxtid
)) {
692 plockf(jfd
, F_ULOCK
, 0, 0);
700 /* we loop all the way up to the max transaction id */
701 for (i
= 1; i
<= maxtid
; i
++) {
702 curts
= malloc(sizeof(struct jtrans
));
706 jtrans_init(&fs
, curts
);
709 /* open the transaction file, using i as its name, so we are
710 * really looping in order (recovering transaction in a
711 * different order as they were applied means instant
713 if (!get_jtfile(name
, i
, tname
))
715 tfd
= open(tname
, O_RDWR
| O_SYNC
| O_LARGEFILE
, 0600);
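721 /* try to lock the transaction file, if it's locked then it is
722 * currently being used so we skip it
 * NOTE(review): the call below passes fd (opened from name), not tfd
 * (opened from tname) -- confirm which descriptor is intended */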
723 rv
= plockf(fd
, F_TLOCK
, 0, 0);
731 /* load from disk, header first */
732 buf
= (char *) malloc(J_DISKTFIXSIZE
);
738 rv
= read(tfd
, buf
, J_DISKTFIXSIZE
);
739 if (rv
!= J_DISKTFIXSIZE
) {
744 curts
->flags
= (int) *(buf
+ 4);
745 curts
->len
= (size_t) *(buf
+ 8);
746 curts
->ulen
= (size_t) *(buf
+ 16);
747 curts
->offset
= (off_t
) *(buf
+ 20);
749 /* if we got here, the transaction was not applied, so we
750 * check if the transaction file is complete (we only need to
751 * apply it) or not (so we can't do anything but ignore it) */
753 lstat(tname
, &sinfo
);
754 rv
= J_DISKTFIXSIZE
+ curts
->len
+ curts
->ulen
+ curts
->plen
;
755 if (sinfo
.st_size
!= rv
) {
756 /* the transaction file is incomplete, some of the
762 /* we have a complete transaction file whose commit was not
763 * successful, so we read it to complete the transaction
764 * structure and apply it again */
765 curts
->buf
= malloc(curts
->len
);
766 if (curts
->buf
== NULL
) {
771 curts
->pdata
= malloc(curts
->plen
);
772 if (curts
->pdata
== NULL
) {
777 curts
->udata
= malloc(curts
->ulen
);
778 if (curts
->udata
== NULL
) {
784 offset
= J_DISKTFIXSIZE
;
785 rv
= spread(tfd
, curts
->udata
, curts
->ulen
, offset
);
786 if (rv
!= curts
->ulen
) {
793 offset
= J_DISKTFIXSIZE
+ curts
->ulen
;
794 rv
= spread(tfd
, curts
->pdata
, curts
->plen
, offset
);
795 if (rv
!= curts
->plen
) {
802 offset
= J_DISKTFIXSIZE
+ curts
->ulen
+ curts
->plen
;
803 rv
= spread(tfd
, curts
->buf
, curts
->len
, offset
);
804 if (rv
!= curts
->len
) {
810 rv
= jtrans_commit(curts
);
817 /* free the data we just allocated */