There were several memory leaks inside jfsck(), they've probably been there for a...
[libjio.git] / trans.c
blob73ced6a67e3fb3dcec9fc4bd0ba787cff9117f4d
2 /*
3 * libjio - A library for Journaled I/O
4 * Alberto Bertogli (albertogli@telpin.com.ar)
6 * Core transaction API and recovery functions
7 */
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <fcntl.h>
12 #include <unistd.h>
13 #include <stdlib.h>
14 #include <limits.h>
15 #include <string.h>
16 #include <libgen.h>
17 #include <stdio.h>
18 #include <dirent.h>
19 #include <errno.h>
20 #include <sys/mman.h>
22 #include "libjio.h"
23 #include "common.h"
27 * helper functions
30 /* gets a new transaction id */
31 static unsigned int get_tid(struct jfs *fs)
33 unsigned int curid, rv;
35 /* lock the whole file */
36 plockf(fs->jfd, F_LOCKW, 0, 0);
38 /* read the current max. curid */
39 curid = *(fs->jmap);
41 /* increment it and handle overflows */
42 rv = curid + 1;
43 if (rv == 0)
44 goto exit;
46 /* write to the file descriptor */
47 *(fs->jmap) = rv;
49 exit:
50 plockf(fs->jfd, F_UNLOCK, 0, 0);
51 return rv;
54 /* frees a transaction id */
55 static void free_tid(struct jfs *fs, unsigned int tid)
57 unsigned int curid, i;
58 char name[PATH_MAX];
60 /* lock the whole file */
61 plockf(fs->jfd, F_LOCKW, 0, 0);
63 /* read the current max. curid */
64 curid = *(fs->jmap);
66 if (tid < curid) {
67 /* we're not freeing the max. curid, so we just return */
68 goto exit;
69 } else {
70 /* look up the new max. */
71 for (i = curid - 1; i > 0; i--) {
72 /* this can fail if we're low on mem, but we don't
73 * care checking here because the problem will come
74 * out later and we can fail more properly */
75 get_jtfile(fs->name, i, name);
76 if (access(name, R_OK | W_OK) == 0) {
77 curid = i;
78 break;
82 /* and save it */
83 *(fs->jmap) = i;
86 exit:
87 plockf(fs->jfd, F_UNLOCK, 0, 0);
88 return;
93 * transaction functions
96 /* initialize a transaction structure */
97 void jtrans_init(struct jfs *fs, struct jtrans *ts)
99 ts->fs = fs;
100 ts->name = NULL;
101 ts->id = 0;
102 ts->flags = fs->flags;
103 ts->op = NULL;
104 ts->numops = 0;
105 pthread_mutex_init( &(ts->lock), NULL);
109 /* free the contents of a transaction structure */
110 void jtrans_free(struct jtrans *ts)
112 struct joper *tmpop;
114 ts->fs = NULL;
116 if (ts->name)
117 free(ts->name);
119 while (ts->op != NULL) {
120 tmpop = ts->op->next;
122 if (ts->op->buf)
123 free(ts->op->buf);
124 if (ts->op->pdata)
125 free(ts->op->pdata);
126 free(ts->op);
128 ts->op = tmpop;
130 pthread_mutex_destroy(&(ts->lock));
134 int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset)
136 struct joper *jop, *tmpop;
138 /* find the last operation in the transaction and create a new one at
139 * the end */
140 pthread_mutex_lock(&(ts->lock));
141 if (ts->op == NULL) {
142 ts->op = malloc(sizeof(struct joper));
143 if (ts->op == NULL)
144 return 0;
145 jop = ts->op;
146 jop->prev = NULL;
147 } else {
148 for (tmpop = ts->op; tmpop->next != NULL; tmpop = tmpop->next)
150 tmpop->next = malloc(sizeof(struct joper));
151 if (tmpop->next == NULL)
152 return 0;
153 tmpop->next->prev = tmpop;
154 jop = tmpop->next;
156 pthread_mutex_unlock(&(ts->lock));
158 jop->buf = malloc(count);
159 if (jop->buf == NULL) {
160 free(jop);
161 return 0;
164 /* we copy the buffer because then the caller can reuse it */
165 memcpy(jop->buf, buf, count);
166 jop->len = count;
167 jop->offset = offset;
168 jop->next = NULL;
169 jop->plen = 0;
170 jop->pdata = NULL;
171 jop->locked = 0;
173 ts->numops++;
175 return 1;
178 /* commit a transaction */
179 int jtrans_commit(struct jtrans *ts)
181 int id, rv, fd = -1;
182 uint32_t csum;
183 char *name;
184 unsigned char *buf_init, *bufp;
185 struct joper *op;
186 struct jlinger *linger;
187 off_t curpos = 0;
188 size_t written = 0;
190 pthread_mutex_lock(&(ts->lock));
192 name = (char *) malloc(PATH_MAX);
193 if (name == NULL)
194 goto exit;
196 id = get_tid(ts->fs);
197 if (id == 0)
198 goto exit;
200 /* open the transaction file */
201 if (!get_jtfile(ts->fs->name, id, name))
202 goto exit;
203 fd = open(name, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, 0600);
204 if (fd < 0)
205 goto exit;
207 /* and lock it */
208 plockf(fd, F_LOCKW, 0, 0);
210 ts->id = id;
211 ts->name = name;
213 /* save the header */
214 buf_init = malloc(J_DISKHEADSIZE);
215 if (buf_init == NULL)
216 goto unlink_exit;
218 bufp = buf_init;
220 memcpy(bufp, (void *) &(ts->id), 4);
221 bufp += 4;
223 memcpy(bufp, (void *) &(ts->flags), 4);
224 bufp += 4;
226 memcpy(bufp, (void *) &(ts->numops), 4);
227 bufp += 4;
229 rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0);
230 if (rv != J_DISKHEADSIZE) {
231 free(buf_init);
232 goto unlink_exit;
235 free(buf_init);
237 curpos = J_DISKHEADSIZE;
239 /* first of all lock all the regions we're going to work with;
240 * otherwise there could be another transaction trying to write the
241 * same spots and we could end up with interleaved writes, that could
242 * break atomicity warantees if we need to rollback */
243 if (!(ts->flags & J_NOLOCK)) {
244 for (op = ts->op; op != NULL; op = op->next) {
245 rv = plockf(ts->fs->fd, F_LOCKW, op->offset, op->len);
246 if (rv == -1)
247 /* note it can fail with EDEADLK */
248 goto unlink_exit;
249 op->locked = 1;
253 /* save each transacion in the file */
254 for (op = ts->op; op != NULL; op = op->next) {
255 /* read the current content only if the transaction is not
256 * marked as NOROLLBACK, and if the data is not there yet,
257 * which is the normal case, but for rollbacking we fill it
258 * ourselves */
259 if (!(ts->flags & J_NOROLLBACK) && (op->pdata == NULL)) {
260 op->pdata = malloc(op->len);
261 if (op->pdata == NULL)
262 goto unlink_exit;
264 op->plen = op->len;
266 rv = spread(ts->fs->fd, op->pdata, op->len,
267 op->offset);
268 if (rv < 0)
269 goto unlink_exit;
270 if (rv < op->len) {
271 /* we are extending the file! */
272 /* ftruncate(ts->fs->fd, op->offset + op->len); */
273 op->plen = rv;
277 /* save the operation's header */
278 buf_init = malloc(J_DISKOPHEADSIZE);
279 if (buf_init == NULL)
280 goto unlink_exit;
282 bufp = buf_init;
284 memcpy(bufp, (void *) &(op->len), 4);
285 bufp += 4;
287 memcpy(bufp, (void *) &(op->plen), 4);
288 bufp += 4;
290 memcpy(bufp, (void *) &(op->offset), 8);
291 bufp += 8;
293 rv = spwrite(fd, buf_init, J_DISKOPHEADSIZE, curpos);
294 if (rv != J_DISKOPHEADSIZE) {
295 free(buf_init);
296 goto unlink_exit;
299 free(buf_init);
301 curpos += J_DISKOPHEADSIZE;
303 /* and save it to the disk */
304 rv = spwrite(fd, op->buf, op->len, curpos);
305 if (rv != op->len)
306 goto unlink_exit;
308 curpos += op->len;
311 /* compute and save the checksum */
312 if (!checksum(fd, curpos, &csum))
313 goto unlink_exit;
315 rv = spwrite(fd, &csum, sizeof(uint32_t), curpos);
316 if (rv != sizeof(uint32_t))
317 goto unlink_exit;
318 curpos += sizeof(uint32_t);
320 /* this is a simple but efficient optimization: instead of doing
321 * everything O_SYNC, we sync at this point only, this way we avoid
322 * doing a lot of very small writes; in case of a crash the
323 * transaction file is only useful if it's complete (ie. after this
324 * point) so we only flush here (both data and metadata) */
325 if (fsync(fd) != 0)
326 goto unlink_exit;
327 if (fsync(ts->fs->jdirfd) != 0) {
328 /* it seems to be legal that fsync() on directories is not
329 * implemented, so if this fails with EINVAL or EBADF, just
330 * call a global sync(); which is awful (and might still
331 * return before metadata is done) but it seems to be the
332 * saner choice; otherwise we just fail */
333 if (errno == EINVAL || errno == EBADF) {
334 sync();
335 } else {
336 goto unlink_exit;
340 /* now that we have a safe transaction file, let's apply it */
341 written = 0;
342 for (op = ts->op; op != NULL; op = op->next) {
343 rv = spwrite(ts->fs->fd, op->buf, op->len, op->offset);
345 plockf(ts->fs->fd, F_UNLOCK, op->offset, op->len);
346 op->locked = 0;
348 if (rv != op->len)
349 goto rollback_exit;
351 written += rv;
354 if (ts->flags & J_LINGER) {
355 linger = malloc(sizeof(struct jlinger));
356 if (linger == NULL)
357 goto rollback_exit;
359 linger->id = id;
360 linger->name = strdup(name);
361 linger->next = ts->fs->ltrans;
363 ts->fs->ltrans = linger;
364 } else {
365 /* the transaction has been applied, so we cleanup and remove
366 * it from the disk */
367 unlink(name);
368 free_tid(ts->fs, ts->id);
371 /* mark the transaction as commited, _after_ it was removed */
372 ts->flags = ts->flags | J_COMMITED;
375 rollback_exit:
376 /* If the transaction failed we try to recover by rollbacking it
377 * NOTE: on extreme conditions (ENOSPC/disk failure) this can fail
378 * too! There's nothing much we can do in that case, the caller should
379 * take care of it by itself.
380 * The transaction file might be OK at this point, so the data could
381 * be recovered by a posterior jfsck(); however, that's not what the
382 * user expects (after all, if we return failure, new data should
383 * never appear), so we remove the transaction file.
384 * Transactions that were successfuly recovered by rollbacking them
385 * will have J_ROLLBACKED in their flags, so the caller can verify if
386 * the failure was recovered or not. */
387 if (!(ts->flags & J_COMMITED) && !(ts->flags & J_ROLLBACKING)) {
388 rv = ts->flags;
389 ts->flags = ts->flags | J_NOLOCK | J_ROLLBACKING;
390 if (jtrans_rollback(ts) >= 0) {
391 ts->flags = rv | J_ROLLBACKED;
392 } else {
393 ts->flags = rv;
397 unlink_exit:
398 if (!(ts->flags & J_COMMITED)) {
399 unlink(name);
400 free_tid(ts->fs, ts->id);
403 close(fd);
404 for (op = ts->op; op != NULL; op = op->next) {
405 if (op->locked)
406 plockf(ts->fs->fd, F_UNLOCK, op->offset, op->len);
409 exit:
410 pthread_mutex_unlock(&(ts->lock));
412 /* return the length only if it was properly commited */
413 if (ts->flags & J_COMMITED)
414 return written;
415 else
416 return -1;
420 /* rollback a transaction */
421 int jtrans_rollback(struct jtrans *ts)
423 int rv;
424 struct jtrans newts;
425 struct joper *op, *curop, *lop;
427 jtrans_init(ts->fs, &newts);
428 newts.flags = ts->flags;
430 if (ts->op == NULL || ts->flags & J_NOROLLBACK) {
431 rv = -1;
432 goto exit;
435 /* find the last operation */
436 for (op = ts->op; op->next != NULL; op = op->next)
439 /* and traverse the list backwards */
440 for ( ; op != NULL; op = op->prev) {
441 /* if we extended the data in the previous transaction, we
442 * should truncate it back */
443 /* DANGEROUS: this is one of the main reasons why rollbacking
444 * is dangerous and should only be done with extreme caution:
445 * if for some reason, after the previous transacton, we have
446 * extended the file further, this will cut it back to what it
447 * was; read the docs for more detail */
448 if (op->plen < op->len)
449 ftruncate(ts->fs->fd, op->offset + op->plen);
451 /* manually add the operation to the new transaction */
452 curop = malloc(sizeof(struct joper));
453 if (curop == NULL) {
454 rv = -1;
455 goto exit;
458 curop->offset = op->offset;
459 curop->len = op->plen;
460 curop->buf = op->pdata;
461 curop->plen = op->plen;
462 curop->pdata = op->pdata;
463 curop->locked = 0;
465 /* add the new transaction to the list */
466 if (newts.op == NULL) {
467 newts.op = curop;
468 curop->prev = NULL;
469 curop->next = NULL;
470 } else {
471 for (lop = newts.op; lop->next != NULL; lop = lop->next)
473 lop->next = curop;
474 curop->prev = lop;
475 curop->next = NULL;
479 rv = jtrans_commit(&newts);
481 exit:
482 /* free the transaction */
483 for (curop = newts.op; curop != NULL; curop = curop->next) {
484 curop->buf = NULL;
485 curop->pdata = NULL;
487 jtrans_free(&newts);
489 return rv;
493 * basic operations
496 /* open a file */
497 int jopen(struct jfs *fs, const char *name, int flags, int mode, int jflags)
499 int fd, jfd, rv;
500 unsigned int t;
501 char jdir[PATH_MAX], jlockfile[PATH_MAX];
502 struct stat sinfo;
504 /* we always need read and write access, because when we commit a
505 * transaction we read the current contents before applying, and write
506 * access is needed for locking with fcntl */
507 flags = flags & ~O_WRONLY;
508 flags = flags & ~O_RDONLY;
509 flags = flags | O_RDWR;
511 fd = open(name, flags, mode);
512 if (fd < 0)
513 return -1;
515 fs->fd = fd;
516 fs->name = strdup(name);
517 fs->flags = jflags;
518 fs->ltrans = NULL;
520 /* Note on fs->lock usage: this lock is used only inside the wrappers,
521 * and exclusively to protect the file pointer. This means that it
522 * must only be held while performing operations that depend or alter
523 * the file pointer (jread, jreadv, jwrite, jwritev), but the others
524 * (jpread, jpwrite) are left unprotected because they can be
525 * performed in paralell as long as they don't affect the same portion
526 * of the file (this is protected by lockf). The lock doesn't slow
527 * things down tho: any threaded app MUST implement this kind of
528 * locking anyways if it wants to prevent data corruption, we only
529 * make it easier for them by taking care of it here. If performance
530 * is essential, the jpread/jpwrite functions should be used, just as
531 * real life. */
532 pthread_mutex_init( &(fs->lock), NULL);
534 if (!get_jdir(name, jdir))
535 return -1;
536 rv = mkdir(jdir, 0750);
537 rv = lstat(jdir, &sinfo);
538 if (rv < 0 || !S_ISDIR(sinfo.st_mode))
539 return -1;
541 /* open the directory, we will use it to flush transaction files'
542 * metadata in jtrans_commit() */
543 fs->jdirfd = open(jdir, O_RDONLY);
544 if (fs->jdirfd < 0)
545 return -1;
547 snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock");
548 jfd = open(jlockfile, O_RDWR | O_CREAT, 0600);
549 if (jfd < 0)
550 return -1;
552 /* initialize the lock file by writing the first tid to it, but only
553 * if its empty, otherwise there is a race if two processes call
554 * jopen() simultaneously and both initialize the file */
555 plockf(jfd, F_LOCKW, 0, 0);
556 lstat(jlockfile, &sinfo);
557 if (sinfo.st_size != sizeof(unsigned int)) {
558 t = 0;
559 rv = spwrite(jfd, &t, sizeof(t), 0);
560 if (rv != sizeof(t)) {
561 plockf(jfd, F_UNLOCK, 0, 0);
562 return -1;
565 plockf(jfd, F_UNLOCK, 0, 0);
567 fs->jfd = jfd;
569 fs->jmap = (unsigned int *) mmap(NULL, sizeof(unsigned int),
570 PROT_READ | PROT_WRITE, MAP_SHARED, jfd, 0);
571 if (fs->jmap == MAP_FAILED)
572 return -1;
574 return fd;
577 /* sync a file (makes sense only if using lingering transactions) */
578 int jsync(struct jfs *fs)
580 int rv;
581 struct jlinger *linger, *ltmp;
583 pthread_mutex_lock(&(fs->lock));
585 rv = fsync(fs->fd);
586 if (rv != 0)
587 goto exit;
589 linger = fs->ltrans;
590 while (linger != NULL) {
591 free_tid(fs, linger->id);
592 unlink(linger->name);
593 free(linger->name);
595 ltmp = linger->next;
596 free(linger);
598 linger = ltmp;
601 exit:
602 pthread_mutex_unlock(&(fs->lock));
603 return rv;
606 /* close a file */
607 int jclose(struct jfs *fs)
609 if (jsync(fs))
610 return -1;
611 if (close(fs->fd))
612 return -1;
613 if (close(fs->jfd))
614 return -1;
615 if (close(fs->jdirfd))
616 return -1;
617 if (fs->name)
618 /* allocated by strdup() in jopen() */
619 free(fs->name);
620 munmap(fs->jmap, sizeof(unsigned int));
621 pthread_mutex_destroy(&(fs->lock));
623 return 0;