From 94ddd0900a8838f62bba15e270649a42f4ef9f81 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Thu, 7 Dec 2017 11:26:32 -0800 Subject: [PATCH] 8909 8585 can cause a use-after-free kernel panic Reviewed by: John Kennedy Reviewed by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Brad Lewis Reviewed by: Igor Kozhukhov Approved by: Robert Mustacchi --- usr/src/pkg/manifests/system-test-zfstest.mf | 1 + .../tests/functional/slog/slog_015_neg.ksh | 73 ++++++++++ usr/src/uts/common/fs/zfs/sys/zil.h | 1 + usr/src/uts/common/fs/zfs/sys/zil_impl.h | 39 +++++- usr/src/uts/common/fs/zfs/sys/zio.h | 1 - usr/src/uts/common/fs/zfs/zil.c | 149 ++++++++++++++------- usr/src/uts/common/fs/zfs/zio.c | 14 -- 7 files changed, 204 insertions(+), 74 deletions(-) create mode 100644 usr/src/test/zfs-tests/tests/functional/slog/slog_015_neg.ksh diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index b955be4edb..91405accea 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -2346,6 +2346,7 @@ file path=opt/zfs-tests/tests/functional/slog/slog_011_neg mode=0555 file path=opt/zfs-tests/tests/functional/slog/slog_012_neg mode=0555 file path=opt/zfs-tests/tests/functional/slog/slog_013_pos mode=0555 file path=opt/zfs-tests/tests/functional/slog/slog_014_pos mode=0555 +file path=opt/zfs-tests/tests/functional/slog/slog_015_neg mode=0555 file path=opt/zfs-tests/tests/functional/snapshot/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/snapshot/clone_001_pos mode=0555 file path=opt/zfs-tests/tests/functional/snapshot/deadlist_lock mode=0555 diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_015_neg.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_015_neg.ksh new file mode 100644 index 0000000000..952c135a9a --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_015_neg.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/slog/slog.kshlib + +# +# DESCRIPTION: +# Concurrent sync writes with log offline/online works. +# +# STRATEGY: +# 1. Configure "zfs_commit_timeout_pct" +# 2. Create pool with a log device. +# 3. Concurrently do the following: +# 3.1. Perform 8K sync writes +# 3.2. Perform log offline/online commands +# 4. Loop to test with growing "zfs_commit_timout_pct" values. +# + +verify_runnable "global" + +function cleanup +{ + # + # Wait for any of the writes and/or zpool commands that were + # kicked off in the background to complete. On failure, we may + # enter this function without previously waiting for them. + # + wait + + mdb -kwe "zfs_commit_timeout_pct/Z $ORIG_TIMEOUT" + + poolexists $TESTPOOL && zpool destroy -f $TESTPOOL +} + +ORIG_TIMEOUT=$(mdb -ke "zfs_commit_timeout_pct/J" | tail -1 | awk '{print $NF}') +log_onexit cleanup + +for PCT in 0 1 2 4 8 16 32 64 128 256 512 1024; do + log_must mdb -kwe "zfs_commit_timeout_pct/Z $PCT" + + log_must zpool create $TESTPOOL $VDEV log $SDEV + + for i in {1..10}; do + log_must fio --rw write --sync 1 --directory "/$TESTPOOL" \ + --bs 8K --size 8K --name slog-test + done & + + for i in {1..10}; do + log_must zpool offline $TESTPOOL $SDEV + log_must verify_slog_device $TESTPOOL $SDEV 'OFFLINE' + log_must zpool online $TESTPOOL $SDEV + log_must verify_slog_device $TESTPOOL $SDEV 'ONLINE' + done & + + wait + + log_must zpool destroy -f $TESTPOOL +done + +log_pass "Concurrent writes with slog offline/online works." diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h index 1f1298e8c6..31d5e8276c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil.h +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -417,6 +417,7 @@ extern void zil_itx_destroy(itx_t *itx); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t oid); +extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern int zil_vdev_offline(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h index 543ba1ee1e..af71cab7f1 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h @@ -37,13 +37,38 @@ extern "C" { #endif /* - * Possbile states for a given lwb structure. An lwb will start out in - * the "closed" state, and then transition to the "opened" state via a - * call to zil_lwb_write_open(). After the lwb is "open", it can - * transition into the "issued" state via zil_lwb_write_issue(). After - * the lwb's zio completes, and the vdev's are flushed, the lwb will - * transition into the "done" state via zil_lwb_write_done(), and the - * structure eventually freed. + * Possbile states for a given lwb structure. + * + * An lwb will start out in the "closed" state, and then transition to + * the "opened" state via a call to zil_lwb_write_open(). When + * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" + * must be held. + * + * After the lwb is "opened", it can transition into the "issued" state + * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must + * be held when making this transition. + * + * After the lwb's zio completes, and the vdev's are flushed, the lwb + * will transition into the "done" state via zil_lwb_write_done(). When + * transitioning from "issued" to "done", the zilog's "zl_lock" must be + * held, *not* the "zl_issuer_lock". + * + * The zilog's "zl_issuer_lock" can become heavily contended in certain + * workloads, so we specifically avoid acquiring that lock when + * transitioning an lwb from "issued" to "done". This allows us to avoid + * having to acquire the "zl_issuer_lock" for each lwb ZIO completion, + * which would have added more lock contention on an already heavily + * contended lock. + * + * Additionally, correctness when reading an lwb's state is often + * acheived by exploiting the fact that these state transitions occur in + * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * + * Thus, if an lwb is in the "closed" or "opened" state, holding the + * "zl_issuer_lock" will prevent a concurrent thread from transitioning + * that lwb to the "issued" state. Likewise, if an lwb is already in the + * "issued" state, holding the "zl_lock" will prevent a concurrent + * thread from transitioning that lwb to the "done" state. */ typedef enum { LWB_STATE_CLOSED, diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index c9503297ab..a9fb367f4f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -556,7 +556,6 @@ extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); -extern void zio_cancel(zio_t *zio); extern void zio_suspend(spa_t *spa, zio_t *zio); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index dc80272912..97af749164 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -502,7 +502,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); return (lwb); } @@ -512,31 +512,13 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(list_is_empty(&lwb->lwb_waiters)); - - if (lwb->lwb_state == LWB_STATE_OPENED) { - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); - - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - - zio_cancel(lwb->lwb_root_zio); - zio_cancel(lwb->lwb_write_zio); - - lwb->lwb_root_zio = NULL; - lwb->lwb_write_zio = NULL; - } else { - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - } - + VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); + ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_DONE); /* * Clear the zilog's field to indicate this lwb is no longer @@ -883,6 +865,12 @@ zil_commit_waiter_skip(zil_commit_waiter_t *zcw) static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { + /* + * The lwb_waiters field of the lwb is protected by the zilog's + * zl_lock, thus it must be held when calling this function. + */ + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); @@ -1348,8 +1336,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * For more details, see the comment above zil_commit(). */ if (lrc->lrc_txtype == TX_COMMIT) { + mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; + mutex_exit(&zilog->zl_lock); return (lwb); } @@ -1957,16 +1947,54 @@ zil_process_commit_list(zilog_t *zilog) zilog_t *, zilog, itx_t *, itx); } - /* - * This is inherently racy and may result in us writing - * out a log block for a txg that was just synced. This - * is ok since we'll end cleaning up that log block the - * next time we call zil_sync(). - */ boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); - if (!synced || frozen) { + /* + * If the txg of this itx has already been synced out, then + * we don't need to commit this itx to an lwb. This is + * because the data of this itx will have already been + * written to the main pool. This is inherently racy, and + * it's still ok to commit an itx whose txg has already + * been synced; this will result in a write that's + * unnecessary, but will do no harm. + * + * With that said, we always want to commit TX_COMMIT itxs + * to an lwb, regardless of whether or not that itx's txg + * has been synced out. We do this to ensure any OPENED lwb + * will always have at least one zil_commit_waiter_t linked + * to the lwb. + * + * As a counter-example, if we skipped TX_COMMIT itx's + * whose txg had already been synced, the following + * situation could occur if we happened to be racing with + * spa_sync: + * + * 1. we commit a non-TX_COMMIT itx to an lwb, where the + * itx's txg is 10 and the last synced txg is 9. + * 2. spa_sync finishes syncing out txg 10. + * 3. we move to the next itx in the list, it's a TX_COMMIT + * whose txg is 10, so we skip it rather than committing + * it to the lwb used in (1). + * + * If the itx that is skipped in (3) is the last TX_COMMIT + * itx in the commit list, than it's possible for the lwb + * used in (1) to remain in the OPENED state indefinitely. + * + * To prevent the above scenario from occuring, ensuring + * that once an lwb is OPENED it will transition to ISSUED + * and eventually DONE, we always commit TX_COMMIT itx's to + * an lwb here, even if that itx's txg has already been + * synced. + * + * Finally, if the pool is frozen, we _always_ commit the + * itx. The point of freezing the pool is to prevent data + * from being written to the main pool via spa_sync, and + * instead rely solely on the ZIL to persistently store the + * data; i.e. when the pool is frozen, the last synced txg + * value can't be trusted. + */ + if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_commit(zilog, itx, lwb); } else if (lrc->lrc_txtype == TX_COMMIT) { @@ -1974,22 +2002,6 @@ zil_process_commit_list(zilog_t *zilog) zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - } else if (lrc->lrc_txtype == TX_COMMIT) { - ASSERT3B(synced, ==, B_TRUE); - ASSERT3B(frozen, ==, B_FALSE); - - /* - * If this is a commit itx, then there will be a - * thread that is either: already waiting for - * it, or soon will be waiting. - * - * This itx has already been committed to disk - * via spa_sync() so we don't bother committing - * it to an lwb. As a result, we cannot use the - * lwb zio callback to signal the waiter and - * mark it as done, so we must do that here. - */ - zil_commit_waiter_skip(itx->itx_private); } list_remove(&zilog->zl_itx_commit_list, itx); @@ -2091,7 +2103,6 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); - ASSERT0(zilog->zl_suspend); mutex_enter(&zilog->zl_issuer_lock); @@ -2170,10 +2181,24 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) ASSERT3P(lwb, ==, zcw->zcw_lwb); /* - * We've already checked this above, but since we hadn't - * acquired the zilog's zl_issuer_lock, we have to perform this - * check a second time while holding the lock. We can't call + * We've already checked this above, but since we hadn't acquired + * the zilog's zl_issuer_lock, we have to perform this check a + * second time while holding the lock. + * + * We don't need to hold the zl_lock since the lwb cannot transition + * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb + * _can_ transition from ISSUED to DONE, but it's OK to race with + * that transition since we treat the lwb the same, whether it's in + * the ISSUED or DONE states. + * + * The important thing, is we treat the lwb differently depending on + * if it's ISSUED or OPENED, and block any other threads that might + * attempt to issue this lwb. For that reason we hold the + * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_issue() if the lwb had already been issued. + * + * See the comment above the lwb_state_t structure definition for + * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_DONE) @@ -2263,7 +2288,6 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); - ASSERT0(zilog->zl_suspend); mutex_enter(&zcw->zcw_lock); @@ -2568,6 +2592,12 @@ zil_commit(zilog_t *zilog, uint64_t foid) return; } + zil_commit_impl(zilog, foid); +} + +void +zil_commit_impl(zilog_t *zilog, uint64_t foid) +{ /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) @@ -2980,7 +3010,22 @@ zil_suspend(const char *osname, void **cookiep) zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); - zil_commit(zilog, 0); + /* + * We need to use zil_commit_impl to ensure we wait for all + * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed + * to disk before proceeding. If we used zil_commit instead, it + * would just call txg_wait_synced(), because zl_suspend is set. + * txg_wait_synced() doesn't wait for these lwb's to be + * LWB_STATE_DONE before returning. + */ + zil_commit_impl(zilog, 0); + + /* + * Now that we've ensured all lwb's are LWB_STATE_DONE, we use + * txg_wait_synced() to ensure the data from the zilog has + * migrated to the main pool before calling zil_destroy(). + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index b7aba119ce..56d6360843 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1716,20 +1716,6 @@ zio_reexecute(zio_t *pio) } void -zio_cancel(zio_t *zio) -{ - /* - * Disallow cancellation of a zio that's already been issued. - */ - VERIFY3P(zio->io_executor, ==, NULL); - - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio->io_done = NULL; - - zio_nowait(zio); -} - -void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) -- 2.11.4.GIT