2 * Copyright (c) 2004,2005 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sbin/jscan/jfile.c,v 1.10 2005/09/07 07:20:23 dillon Exp $
/*
 * Forward declarations for the file-local helpers that are defined
 * later in this file.
 */
static void jalign(struct jfile *jf, enum jdirection direction);
static int jreadbuf(struct jfile *jf, enum jdirection direction,
		    void *buf, int bytes);
45 * Open a file descriptor for journal record access.
47 * NOTE: only seekable descriptors are supported for backwards scans.
54 jf
= malloc(sizeof(struct jfile
));
55 bzero(jf
, sizeof(struct jfile
));
58 jf
->jf_open_flags
= O_RDONLY
;
64 * Open a prefix set. <prefix>.nnnnnnnnn files or a <prefix>.transid file
65 * must exist to succeed. No file descriptor is actually opened but
66 * the sequence number is initialized to the beginning or end of the set.
69 jopen_prefix(const char *prefix
, int rw
)
73 unsigned int seq_beg
= -1;
74 unsigned int seq_end
= -1;
87 dirname
= data
= strdup(prefix
);
88 if ((basename
= strrchr(dirname
, '/')) != NULL
) {
94 baselen
= strlen(basename
);
95 if ((dir
= opendir(dirname
)) != NULL
) {
96 while ((den
= readdir(dir
)) != NULL
) {
97 if (strncmp(den
->d_name
, basename
, baselen
) == 0 &&
98 den
->d_name
[baselen
] == '.'
100 seq
= strtoul(den
->d_name
+ baselen
+ 1, &ptr
, 16);
101 if (*ptr
== 0 && seq
!= ULONG_MAX
) {
102 if (seq_beg
== (unsigned int)-1 || seq_beg
> seq
)
104 if (seq_end
== (unsigned int)-1 || seq_end
< seq
)
114 asprintf(&data
, "%s.transid", prefix
);
115 if (stat(data
, &st
) == 0)
119 if (seq_beg
!= (unsigned int)-1 || hastransid
) {
120 if (seq_beg
== (unsigned int)-1) {
124 asprintf(&data
, "%s.%08x", prefix
, 0);
125 if ((fd
= open(data
, O_RDWR
|O_CREAT
, 0666)) >= 0)
130 jf
= malloc(sizeof(struct jfile
));
131 bzero(jf
, sizeof(struct jfile
));
133 jf
->jf_write_fd
= -1;
134 jf
->jf_prefix
= strdup(prefix
);
135 jf
->jf_seq
= seq_beg
;
136 jf
->jf_seq_beg
= seq_beg
;
137 jf
->jf_seq_end
= seq_end
;
138 jf
->jf_open_flags
= rw
? (O_RDWR
|O_CREAT
) : O_RDONLY
;
140 fprintf(stderr
, "Open prefix set %08x-%08x\n", seq_beg
, seq_end
);
141 if ((jd
= jread(jf
, NULL
, JD_BACKWARDS
)) != NULL
) {
142 jf
->jf_last_transid
= jd
->jd_transid
;
152 * Get a prefix set ready for append.
155 jrecord_init(const char *prefix
)
164 * Determine whether we already have a prefix set or whether we need
167 jf
= jopen_prefix(prefix
, 0);
170 if (jf
->jf_seq_beg
!= (unsigned int)-1)
174 asprintf(&data
, "%s.transid", prefix
);
177 * If the sequence exists the transid file must ALREADY exist for us
178 * to be able to safely 'append' to the space. Locked-down sequence
179 * spaces do not have a transid file.
182 fd
= open(data
, O_RDWR
, 0666);
184 fd
= open(data
, O_RDWR
|O_CREAT
, 0666);
189 if (fstat(fd
, &st
) == 0 && st
.st_size
== 0)
190 write(fd
, "0000000000000000\n", 17); /* starting transid in hex */
196 * Close a previously opened journal, clean up any side allocations.
199 jclose(struct jfile
*jf
)
201 if (jf
->jf_fd
>= 0) {
205 if (jf
->jf_write_fd
>= 0) {
206 close(jf
->jf_write_fd
);
207 jf
->jf_write_fd
= -1;
213 * Locate the next (or previous) raw record given a jfile, current record,
214 * and direction. If the current record is NULL then the first or last
215 * record for the current sequence number is returned.
217 * PAD RECORD SPECIAL CASE. Pad records can be 16 bytes long, which means
218 * that the rawrecend overlaps the transid field of the rawrecbeg. Because
219 * the transid is garbage, we must skip and cannot return pad records.
222 jread(struct jfile
*jf
, struct jdata
*jd
, enum jdirection direction
)
224 struct journal_rawrecbeg head
;
225 struct journal_rawrecbeg
*headp
;
226 struct journal_rawrecend tail
;
227 struct journal_rawrecend
*tailp
;
238 * Handle the next/previous record case. If running in the forwards
239 * direction we position the file just after jd. If running in the
240 * backwards direction we position the file at the base of jd so
241 * the backwards read gets the previous record.
243 * In prefix mode we have to get the right descriptor open and
244 * position the file, since the fall through code resets to the
245 * beginning or end if it has to open a descriptor.
247 assert(direction
!= JD_SEQFIRST
&& direction
!= JD_SEQLAST
);
249 if (jf
->jf_fd
>= 0 && jf
->jf_seq
!= jd
->jd_seq
) {
253 jf
->jf_seq
= jd
->jd_seq
;
255 asprintf(&filename
, "%s.%08x", jf
->jf_prefix
, jf
->jf_seq
);
256 jf
->jf_fd
= open(filename
, O_RDONLY
);
258 fprintf(stderr
, "Open %s fd %d\n", filename
, jf
->jf_fd
);
262 if ((jmodes
& JMODEF_INPUT_PIPE
) == 0) {
263 if (direction
== JD_FORWARDS
) {
264 jf
->jf_pos
= jd
->jd_pos
+ jd
->jd_size
;
265 lseek(jf
->jf_fd
, jf
->jf_pos
, 0);
267 jf
->jf_pos
= jd
->jd_pos
;
268 /* lseek(jf->jf_fd, jf->jf_pos, 0); not needed */
271 assert(direction
== JD_FORWARDS
&& jf
->jf_prefix
== NULL
);
272 assert(jf
->jf_pos
== jd
->jd_pos
+ jd
->jd_size
);
277 * Handle the first/last record case. In the prefix case we only
278 * need to set jf_seq and close the file handle and fall through.
279 * The SEQ modes maintain the current jf_seq (kinda a hack).
282 if (jf
->jf_fd
>= 0) {
288 jf
->jf_seq
= jf
->jf_seq_beg
;
291 jf
->jf_seq
= jf
->jf_seq_end
;
294 direction
= JD_FORWARDS
;
297 direction
= JD_BACKWARDS
;
300 } else if ((jmodes
& JMODEF_INPUT_PIPE
) == 0) {
303 direction
= JD_FORWARDS
;
306 jf
->jf_pos
= lseek(jf
->jf_fd
, 0L, SEEK_SET
);
309 direction
= JD_BACKWARDS
;
312 jf
->jf_pos
= lseek(jf
->jf_fd
, 0L, SEEK_END
);
316 if (direction
== JD_SEQFIRST
)
317 direction
= JD_FORWARDS
;
318 assert(jf
->jf_pos
== 0 && direction
== JD_FORWARDS
);
324 * If we are doing a prefix scan and the descriptor is not open,
325 * open the file based on jf_seq and position it to the beginning
326 * or end based on the direction. This is how we iterate through
330 asprintf(&filename
, "%s.%08x", jf
->jf_prefix
, jf
->jf_seq
);
331 jf
->jf_fd
= open(filename
, O_RDONLY
);
333 fprintf(stderr
, "Open %s fd %d\n", filename
, jf
->jf_fd
);
335 if (direction
== JD_FORWARDS
)
336 jf
->jf_pos
= lseek(jf
->jf_fd
, 0L, SEEK_SET
);
338 jf
->jf_pos
= lseek(jf
->jf_fd
, 0L, SEEK_END
);
342 * Get the current offset and make sure it is 16-byte aligned. If it
343 * isn't, align it and enter search mode.
345 if (jf
->jf_pos
& 15) {
346 jf_warn(jf
, "realigning bad offset and entering search mode");
347 jalign(jf
, direction
);
354 if (direction
== JD_FORWARDS
) {
356 * Scan the journal forwards. Note that the file pointer might not
359 while ((error
= jreadbuf(jf
, direction
, &head
, sizeof(head
))) == sizeof(head
)) {
360 if (head
.begmagic
!= JREC_BEGMAGIC
) {
362 jf_warn(jf
, "bad beginmagic, searching for new record");
364 jalign(jf
, direction
);
369 * The actual record is 16-byte aligned. head.recsize contains
370 * the unaligned record size.
372 recsize
= (head
.recsize
+ 15) & ~15;
373 if (recsize
< JREC_MINRECSIZE
|| recsize
> JREC_MAXRECSIZE
) {
375 jf_warn(jf
, "bad recordsize: %d\n", recsize
);
377 jalign(jf
, direction
);
380 allocsize
= offsetof(struct jdata
, jd_data
[recsize
]);
381 allocsize
= (allocsize
+ 255) & ~255;
382 jd
= malloc(allocsize
);
383 bzero(jd
, offsetof(struct jdata
, jd_data
[0]));
384 bcopy(&head
, jd
->jd_data
, sizeof(head
));
385 n
= jreadbuf(jf
, direction
, jd
->jd_data
+ sizeof(head
),
386 recsize
- sizeof(head
));
387 if (n
!= (int)(recsize
- sizeof(head
))) {
389 jf_warn(jf
, "Incomplete stream record\n");
391 jalign(jf
, direction
);
396 tailp
= (void *)(jd
->jd_data
+ recsize
- sizeof(*tailp
));
397 if (tailp
->endmagic
!= JREC_ENDMAGIC
) {
399 jf_warn(jf
, "bad endmagic, searching for new record");
401 jalign(jf
, direction
);
409 if (head
.streamid
== JREC_STREAMID_PAD
) {
415 * note: recsize is aligned (the actual record size),
416 * head.recsize is unaligned (the actual payload size).
418 jd
->jd_transid
= head
.transid
;
419 jd
->jd_alloc
= allocsize
;
420 jd
->jd_size
= recsize
;
421 jd
->jd_seq
= jf
->jf_seq
;
422 jd
->jd_pos
= jf
->jf_pos
- recsize
;
428 * Scan the journal backwards. Note that jread() reverse-seeks and
429 * reads. The data read will be forward ordered, however.
431 while ((error
= jreadbuf(jf
, direction
, &tail
, sizeof(tail
))) == sizeof(tail
)) {
432 if (tail
.endmagic
!= JREC_ENDMAGIC
) {
434 jf_warn(jf
, "bad endmagic, searching for new record");
436 jalign(jf
, direction
);
441 * The actual record is 16-byte aligned. tail.recsize contains
442 * the unaligned record size.
444 recsize
= (tail
.recsize
+ 15) & ~15;
445 if (recsize
< JREC_MINRECSIZE
|| recsize
> JREC_MAXRECSIZE
) {
447 jf_warn(jf
, "bad recordsize: %d\n", recsize
);
449 jalign(jf
, direction
);
452 allocsize
= offsetof(struct jdata
, jd_data
[recsize
]);
453 allocsize
= (allocsize
+ 255) & ~255;
454 jd
= malloc(allocsize
);
455 bzero(jd
, offsetof(struct jdata
, jd_data
[0]));
456 bcopy(&tail
, jd
->jd_data
+ recsize
- sizeof(tail
), sizeof(tail
));
457 n
= jreadbuf(jf
, direction
, jd
->jd_data
, recsize
- sizeof(tail
));
458 if (n
!= (int)(recsize
- sizeof(tail
))) {
460 jf_warn(jf
, "Incomplete stream record\n");
462 jalign(jf
, direction
);
467 headp
= (void *)jd
->jd_data
;
468 if (headp
->begmagic
!= JREC_BEGMAGIC
) {
470 jf_warn(jf
, "bad begmagic, searching for new record");
472 jalign(jf
, direction
);
480 if (head
.streamid
== JREC_STREAMID_PAD
) {
486 * note: recsize is aligned (the actual record size),
487 * head.recsize is unaligned (the actual payload size).
489 jd
->jd_transid
= headp
->transid
;
490 jd
->jd_alloc
= allocsize
;
491 jd
->jd_size
= recsize
;
492 jd
->jd_seq
= jf
->jf_seq
;
493 jd
->jd_pos
= jf
->jf_pos
;
500 * If reading in prefix mode and there is no more data, close the
501 * current descriptor, adjust the sequence number, and loop.
503 * If we hit the end of the sequence space and were asked to loop,
504 * check for the next sequence number and adjust jf_seq_end. Leave
505 * the current descriptor open so we do not lose track of its seek
506 * position, and also to catch a race where another jscan may have
507 * written more data to the current sequence number before rolling
508 * the next sequence number.
510 if (error
== 0 && jf
->jf_prefix
) {
511 if (direction
== JD_FORWARDS
) {
512 if (jf
->jf_seq
< jf
->jf_seq_end
) {
515 fprintf(stderr
, "jread: roll to seq %08x\n", jf
->jf_seq
);
516 if (jf
->jf_fd
>= 0) {
522 if (jmodes
& JMODEF_LOOP_FOREVER
) {
523 asprintf(&filename
, "%s.%08x", jf
->jf_prefix
, jf
->jf_seq
+ 1);
524 if (stat(filename
, &st
) == 0) {
527 fprintf(stderr
, "jread: roll seq_end to %08x\n",
535 if (jf
->jf_seq
> jf
->jf_seq_beg
) {
538 fprintf(stderr
, "jread: roll to seq %08x\n", jf
->jf_seq
);
539 if (jf
->jf_fd
>= 0) {
549 * If we hit EOF and were asked to loop forever on the input, leave
550 * the current descriptor open, sleep, and loop.
552 * We have already handled the prefix case. This feature only works
553 * when doing forward scans and the input is not a pipe.
555 if (error
== 0 && jf
->jf_prefix
== NULL
&&
556 (jmodes
& JMODEF_LOOP_FOREVER
) &&
557 !(jmodes
& JMODEF_INPUT_PIPE
) &&
558 direction
== JD_FORWARDS
565 * Otherwise there are no more records and we are done.
571 * Write a record out. If this is a prefix set and the file would
572 * exceed record_size, we rotate into a new sequence number.
575 jwrite(struct jfile
*jf
, struct jdata
*jd
)
581 assert(jf
->jf_prefix
);
585 * Open/create a new file in the prefix set
587 if (jf
->jf_write_fd
< 0) {
588 asprintf(&path
, "%s.%08x", jf
->jf_prefix
, jf
->jf_seq_end
);
589 jf
->jf_write_fd
= open(path
, O_RDWR
|O_CREAT
, 0666);
590 if (jf
->jf_write_fd
< 0 || fstat(jf
->jf_write_fd
, &st
) != 0) {
591 fprintf(stderr
, "Unable to open/create %s\n", path
);
594 jf
->jf_write_pos
= st
.st_size
;
595 lseek(jf
->jf_write_fd
, jf
->jf_write_pos
, 0);
600 * Each file must contain at least one raw record, even if it exceeds
601 * the user-requested record-size. Apart from that, we cycle to the next
602 * file when its size would exceed the user-specified
604 if (jf
->jf_write_pos
> 0 &&
605 jf
->jf_write_pos
+ jd
->jd_size
> prefix_file_size
607 close(jf
->jf_write_fd
);
608 jf
->jf_write_fd
= -1;
614 * Terminate if a failure occurs (for now).
616 n
= write(jf
->jf_write_fd
, jd
->jd_data
, jd
->jd_size
);
617 if (n
!= jd
->jd_size
) {
618 ftruncate(jf
->jf_write_fd
, jf
->jf_write_pos
);
619 fprintf(stderr
, "jwrite: failed %s\n", strerror(errno
));
622 jf
->jf_write_pos
+= n
;
623 jf
->jf_last_transid
= jd
->jd_transid
;
627 * Attempt to locate and return the record specified by the transid. The
628 * returned record may be inexact.
630 * If scanning forwards this function guarantees that no record prior
631 * to the returned record is >= transid.
633 * If scanning backwards this function guarantees that no record after
634 * the returned record is <= transid.
637 jseek(struct jfile
*jf
, int64_t transid
, enum jdirection direction
)
640 struct jdata
*jd
= NULL
;
643 * If the input is a pipe we can't seek.
645 if (jmodes
& JMODEF_INPUT_PIPE
) {
646 assert(direction
== JD_FORWARDS
);
647 return (jread(jf
, NULL
, direction
));
652 * If we have a prefix set search the sequence space backwards until
653 * we find the file most likely to contain the transaction id.
655 if (verbose_opt
> 2) {
656 fprintf(stderr
, "jseek prefix set %s %08x-%08x\n", jf
->jf_prefix
,
657 jf
->jf_seq_beg
, jf
->jf_seq_end
);
660 for (seq
= jf
->jf_seq_end
; seq
!= jf
->jf_seq_beg
- 1; --seq
) {
662 fprintf(stderr
, "try seq %08x\n", seq
);
664 if ((jd
= jread(jf
, NULL
, JD_SEQFIRST
)) != NULL
) {
665 if (jd
->jd_transid
== transid
)
667 if (jd
->jd_transid
< transid
) {
676 * If transid is less than the first file in the sequence space we
677 * return NULL if scanning backwards, indicating no records are
678 * available, or the first record in the sequence space if we
679 * are scanning forwards.
681 if (seq
== jf
->jf_seq_beg
- 1) {
682 if (direction
== JD_BACKWARDS
)
685 return(jread(jf
, NULL
, JD_FORWARDS
));
688 fprintf(stderr
, "jseek input prefix set to seq %08x\n", seq
);
692 * Position us to the end of the current record, then scan backwards
693 * looking for the requested transid.
695 jd
= jread(jf
, NULL
, JD_SEQLAST
);
697 if (jd
->jd_transid
<= transid
) {
698 if (jd
->jd_transid
< transid
) {
699 if (direction
== JD_FORWARDS
)
700 jd
=jread(jf
, jd
, JD_FORWARDS
);
702 if (verbose_opt
> 1) {
703 fprintf(stderr
, "jseek returning seq %08x offset 0x%08llx\n",
704 jd
->jd_seq
, jd
->jd_pos
);
708 jd
= jread(jf
, jd
, JD_BACKWARDS
);
712 * We scanned the whole file with no luck, all the transids are
713 * greater than the requested transid. If the intended read
714 * direction is backwards there are no records and we return NULL.
715 * If it is forwards we return the first record.
717 if (direction
== JD_BACKWARDS
)
720 return(jread(jf
, NULL
, JD_FORWARDS
));
724 * Data returned by jread() is persistent until released.
727 jref(struct jdata
*jd
)
734 jfree(struct jfile
*jf __unused
, struct jdata
*jd
)
736 if (--jd
->jd_refs
== 0)
741 * Align us to the next 16 byte boundary. If scanning forwards we align
742 * forwards if not already aligned. If scanning backwards we align
743 * backwards if not already aligned. We only have to synchronize the
744 * seek position with the file seek position for forward scans.
747 jalign(struct jfile
*jf
, enum jdirection direction
)
752 if ((int)jf
->jf_pos
& 15) {
753 if (direction
== JD_FORWARDS
) {
754 bytes
= 16 - ((int)jf
->jf_pos
& 15);
755 jreadbuf(jf
, direction
, dummy
, bytes
);
757 jf
->jf_pos
= jf
->jf_pos
& ~(off_t
)15;
763 * Read raw journal data forwards or backwards into buf; callers expect the
764 * requested byte count back on success. Note that the file pointer's seek
765 * position does not match jf_pos in the reverse direction case.
768 jreadbuf(struct jfile
*jf
, enum jdirection direction
, void *buf
, int bytes
)
776 if (direction
== JD_FORWARDS
) {
777 while (ttl
!= bytes
) {
778 n
= read(jf
->jf_fd
, (char *)buf
+ ttl
, bytes
- ttl
);
780 if (n
< 0 && ttl
== 0)
788 if (jf
->jf_pos
>= bytes
) {
790 lseek(jf
->jf_fd
, jf
->jf_pos
, 0);
791 while (ttl
!= bytes
) {
792 n
= read(jf
->jf_fd
, (char *)buf
+ ttl
, bytes
- ttl
);
794 if (n
< 0 && ttl
== 0)