/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * HAMMER redo - REDO record support for the UNDO/REDO FIFO.
 *
 * See also hammer_undo.c
 */

#include "hammer.h"

RB_GENERATE2(hammer_redo_rb_tree, hammer_inode, rb_redonode,
	     hammer_redo_rb_compare, hammer_off_t, redo_fifo_start);
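
/*
 * Illustrative sketch (not compiled): hammer_redo_rb_compare is defined
 * elsewhere in the HAMMER sources.  Given the key field named in the
 * RB_GENERATE2 above it presumably orders inodes by redo_fifo_start,
 * roughly along these lines.
 */
#if 0
static int
hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
	if (ip1->redo_fifo_start < ip2->redo_fifo_start)
		return(-1);
	if (ip1->redo_fifo_start > ip2->redo_fifo_start)
		return(1);
	return(0);
}
#endif
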
/*
 * HAMMER version 4+ REDO support.
 *
 * REDO records are used to improve fsync() performance.  Instead of having
 * to go through a complete double-flush cycle involving at least two disk
 * synchronizations the fsync need only flush UNDO/REDO FIFO buffers through
 * the related REDO records, which is a single synchronization requiring
 * no track seeking.  If a recovery becomes necessary the recovery code
 * will generate logical data writes based on the REDO records encountered.
 * That is, the recovery code will UNDO any partial meta-data/data writes
 * at the raw disk block level and then REDO the data writes at the logical
 * level.
 */
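
/*
 * Rough on-media layout of one REDO record as laid down below, between
 * two HAMMER_UNDO_ALIGN boundaries (illustrative only, field widths not
 * to scale):
 *
 *	[hammer_fifo_redo head][payload, HAMMER_HEAD_DOALIGN'd]
 *	[hammer_fifo_tail][dummy PAD out to the next 512-byte boundary]
 */
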
int
hammer_generate_redo(hammer_transaction_t trans, hammer_inode_t ip,
		     hammer_off_t file_off, uint32_t flags,
		     void *base, int len)
{
	hammer_mount_t hmp;
	hammer_volume_t root_volume;
	hammer_blockmap_t undomap;
	hammer_buffer_t buffer = NULL;
	hammer_fifo_redo_t redo;
	hammer_fifo_tail_t tail;
	hammer_off_t next_offset;
	int error;
	int bytes;
	int n;

	/*
	 * Setup
	 */
	hmp = trans->hmp;
	root_volume = trans->rootvol;
	undomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	/*
	 * No undo recursion when modifying the root volume
	 */
	hammer_modify_volume_noundo(NULL, root_volume);
	hammer_lock_ex(&hmp->undo_lock);

	/* undo had better not roll over (loose test) */
	if (hammer_undo_space(trans) < len + HAMMER_BUFSIZE*3)
		hpanic("insufficient UNDO/REDO FIFO space for redo!");

	/*
	 * Loop until the undo for the entire range has been laid down.
	 * Loop at least once (len might be 0 as a degenerate case).
	 */
	for (;;) {
		/*
		 * Fetch the layout offset in the UNDO FIFO, wrap it as
		 * necessary.
		 */
		if (undomap->next_offset == undomap->alloc_offset)
			undomap->next_offset = HAMMER_ENCODE_UNDO(0);
		next_offset = undomap->next_offset;

		/*
		 * This is a tail-chasing FIFO, when we hit the start of a new
		 * buffer we don't have to read it in.
		 */
		if ((next_offset & HAMMER_BUFMASK) == 0) {
			redo = hammer_bnew(hmp, next_offset, &error, &buffer);
			hammer_format_undo(hmp,
				redo, hmp->undo_seqno ^ 0x40000000);
		} else {
			redo = hammer_bread(hmp, next_offset, &error, &buffer);
		}
		if (error)
			break;
		hammer_modify_buffer_noundo(NULL, buffer);

		/*
		 * Calculate how big a media structure fits up to the next
		 * alignment point and how large a data payload we can
		 * accommodate.
		 *
		 * If n calculates to 0 or negative there is no room for
		 * anything but a PAD.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)next_offset & HAMMER_UNDO_MASK);
		n = bytes -
		    (int)sizeof(struct hammer_fifo_redo) -
		    (int)sizeof(struct hammer_fifo_tail);
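
		/*
		 * Worked example (illustrative): with HAMMER_UNDO_ALIGN
		 * at 512, a next_offset sitting 24 bytes short of the
		 * next boundary yields bytes = 24, and n comes out zero
		 * or negative once the redo head and tail overhead is
		 * subtracted, forcing the PAD case below.
		 */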

		/*
		 * If available space is insufficient for any payload
		 * we have to lay down a PAD.
		 *
		 * The minimum PAD is 8 bytes and the head and tail will
		 * overlap each other in that case.  PADs do not have
		 * sequence numbers or CRCs.
		 *
		 * A PAD may not start on a boundary.  That is, every
		 * 512-byte block in the UNDO/REDO FIFO must begin with
		 * a record containing a sequence number.
		 */
		if (n <= 0) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			KKASSERT(((int)next_offset & HAMMER_UNDO_MASK) != 0);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
			undomap->next_offset += bytes;
			hammer_modify_buffer_done(buffer);
			hammer_stats_redo += bytes;
			continue;
		}

		/*
		 * When generating an inode-related REDO record we track
		 * the point in the UNDO/REDO FIFO containing the inode's
		 * earliest REDO record.  See hammer_generate_redo_sync().
		 *
		 * redo_fifo_next is cleared when an inode is staged to
		 * the backend and then used to determine how to reassign
		 * redo_fifo_start after the inode flush completes.
		 */
		if (ip) {
			redo->redo_objid = ip->obj_id;
			redo->redo_localization = ip->obj_localization;
			if ((ip->flags & HAMMER_INODE_RDIRTY) == 0) {
				ip->redo_fifo_start = next_offset;
				if (RB_INSERT(hammer_redo_rb_tree,
					      &hmp->rb_redo_root, ip)) {
					hpanic("cannot insert inode %p on "
					       "redo FIFO", ip);
				}
				ip->flags |= HAMMER_INODE_RDIRTY;
			}
			if (ip->redo_fifo_next == 0)
				ip->redo_fifo_next = next_offset;
		} else {
			redo->redo_objid = 0;
			redo->redo_localization = 0;
		}

		/*
		 * Calculate the actual payload and recalculate the size
		 * of the media structure as necessary.  If no data buffer
		 * is supplied there is no payload.
		 */
		if (base == NULL) {
			n = 0;
		} else if (n > len) {
			n = len;
		}
		bytes = HAMMER_HEAD_DOALIGN(n) +
			(int)sizeof(struct hammer_fifo_redo) +
			(int)sizeof(struct hammer_fifo_tail);
		if (hammer_debug_general & 0x0080) {
			hdkprintf("redo %016jx %d %d\n",
				(intmax_t)next_offset, bytes, n);
		}

		redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
		redo->head.hdr_type = HAMMER_HEAD_TYPE_REDO;
		redo->head.hdr_size = bytes;
		redo->head.hdr_seq = hmp->undo_seqno++;
		redo->head.hdr_crc = 0;
		redo->redo_offset = file_off;
		redo->redo_flags = flags;

		/*
		 * Incremental payload.  If no payload we throw the entire
		 * len into redo_data_bytes and will not loop.
		 */
		if (base) {
			redo->redo_data_bytes = n;
			bcopy(base, redo + 1, n);
			len -= n;
			base = (char *)base + n;
			file_off += n;
		} else {
			redo->redo_data_bytes = len;
			file_off += len;
			len = 0;
		}

		tail = (void *)((char *)redo + bytes - sizeof(*tail));
		tail->tail_signature = HAMMER_TAIL_SIGNATURE;
		tail->tail_type = HAMMER_HEAD_TYPE_REDO;
		tail->tail_size = bytes;

		KKASSERT(bytes >= sizeof(redo->head));
		hammer_crc_set_fifo_head(hmp->version, &redo->head, bytes);
		undomap->next_offset += bytes;
		hammer_stats_redo += bytes;

		/*
		 * Before we finish off the buffer we have to deal with any
		 * junk between the end of the media structure we just laid
		 * down and the UNDO alignment boundary.  We do this by laying
		 * down a dummy PAD.  Even though we will probably overwrite
		 * it almost immediately we have to do this so recovery runs
		 * can iterate the UNDO space without having to depend on
		 * the indices in the volume header.
		 *
		 * This dummy PAD will be overwritten on the next undo so
		 * we do not adjust undomap->next_offset.
		 */
		bytes = HAMMER_UNDO_ALIGN -
			((int)undomap->next_offset & HAMMER_UNDO_MASK);
		if (bytes != HAMMER_UNDO_ALIGN) {
			KKASSERT(bytes >= sizeof(struct hammer_fifo_tail));
			redo = (void *)(tail + 1);
			tail = (void *)((char *)redo + bytes - sizeof(*tail));
			if ((void *)redo != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = bytes;
			}
			redo->head.hdr_signature = HAMMER_HEAD_SIGNATURE;
			redo->head.hdr_type = HAMMER_HEAD_TYPE_PAD;
			redo->head.hdr_size = bytes;
			/* NO CRC OR SEQ NO */
		}
		hammer_modify_buffer_done(buffer);
		if (len == 0)
			break;
	}
	hammer_modify_volume_done(root_volume);
	hammer_unlock(&hmp->undo_lock);

	if (buffer)
		hammer_rel_buffer(buffer, 0);

	/*
	 * Make sure the nominal undo span contains at least one REDO_SYNC,
	 * otherwise the REDO recovery will not be triggered.
	 */
	if ((hmp->flags & HAMMER_MOUNT_REDO_SYNC) == 0 &&
	    flags != HAMMER_REDO_SYNC) {
		hammer_generate_redo_sync(trans);
	}

	return(error);
}
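
/*
 * Usage sketch (not compiled): a frontend write path doing a quick
 * fsync() might lay down a REDO record covering the dirty logical
 * range and then flush only the UNDO/REDO FIFO, rather than running
 * a full double-flush cycle.  hammer_generate_redo() and
 * HAMMER_REDO_WRITE are the real names; example_fsync_quick() and its
 * parameters are hypothetical.
 */
#if 0
static int
example_fsync_quick(hammer_transaction_t trans, hammer_inode_t ip,
		    void *data, hammer_off_t file_off, int bytes)
{
	/* Record the logical write so recovery can replay it */
	return (hammer_generate_redo(trans, ip, file_off,
				     HAMMER_REDO_WRITE, data, bytes));
}
#endif
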
/*
 * Generate a REDO SYNC record.  At least one such record must be generated
 * in the nominal recovery span for the recovery code to be able to run
 * REDOs outside of the span.
 *
 * The SYNC record contains the aggregate earliest UNDO/REDO FIFO offset
 * for all inodes with active REDOs.  This changes dynamically as inodes
 * get flushed.
 *
 * During recovery stage2 any new flush cycles must specify the original
 * redo sync offset.  That way a crash will re-run the REDOs, at least
 * up to the point where the UNDO FIFO does not overwrite the area.
 */
void
hammer_generate_redo_sync(hammer_transaction_t trans)
{
	hammer_mount_t hmp = trans->hmp;
	hammer_inode_t ip;
	hammer_off_t redo_fifo_start;

	if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) {
		ip = NULL;
		redo_fifo_start = hmp->recover_stage2_offset;
	} else {
		ip = RB_FIRST(hammer_redo_rb_tree, &hmp->rb_redo_root);
		if (ip)
			redo_fifo_start = ip->redo_fifo_start;
		else
			redo_fifo_start = 0;
	}
	if (redo_fifo_start) {
		if (hammer_debug_io & 0x0004) {
			hdkprintf("SYNC IP %p %016jx\n",
				ip, (intmax_t)redo_fifo_start);
		}
		hammer_generate_redo(trans, NULL, redo_fifo_start,
				     HAMMER_REDO_SYNC, NULL, 0);
		trans->hmp->flags |= HAMMER_MOUNT_REDO_SYNC;
	}
}

/*
 * This is called when an inode is queued to the backend.
 */
void
hammer_redo_fifo_start_flush(hammer_inode_t ip)
{
	ip->redo_fifo_next = 0;
}

/*
 * This is called when an inode backend flush is finished.  We have to make
 * sure that RDIRTY is not set unless dirty bufs are present.  Dirty bufs
 * can get destroyed through operations such as truncations and leave
 * us with a stale redo_fifo_next.
 */
void
hammer_redo_fifo_end_flush(hammer_inode_t ip)
{
	hammer_mount_t hmp = ip->hmp;

	hammer_lock_ex(&hmp->undo_lock);
	if (ip->flags & HAMMER_INODE_RDIRTY) {
		RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
		ip->flags &= ~HAMMER_INODE_RDIRTY;
	}
	if ((ip->flags & HAMMER_INODE_BUFS) == 0)
		ip->redo_fifo_next = 0;
	if (ip->redo_fifo_next) {
		ip->redo_fifo_start = ip->redo_fifo_next;
		if (RB_INSERT(hammer_redo_rb_tree, &hmp->rb_redo_root, ip)) {
			hpanic("cannot reinsert inode %p on redo FIFO", ip);
		}
		ip->flags |= HAMMER_INODE_RDIRTY;
	}
	hammer_unlock(&hmp->undo_lock);
}
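
/*
 * Lifecycle sketch (not compiled): how the backend presumably brackets
 * an inode flush with the two hooks above.  The flush body here is
 * hypothetical; only the ordering is the point.
 */
#if 0
static void
example_backend_flush(hammer_inode_t ip)
{
	hammer_redo_fifo_start_flush(ip);	/* clears redo_fifo_next */
	/* ... flush the inode; new REDOs set redo_fifo_next ... */
	hammer_redo_fifo_end_flush(ip);		/* reassigns redo_fifo_start */
}
#endif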