From 4772f1cb60082751b4da727cb80cc02fa0ddb6e1 Mon Sep 17 00:00:00 2001 From: Mo Chen Date: Wed, 23 Aug 2023 13:29:10 -0500 Subject: [PATCH 1/4] Abort a read when the disk is known to be bad Reads on a known bad disk can read corrupt cache metadata, which causes a crash. Abort reads early on a bad disk so that ATS is less likely to crash after a disk is marked bad. --- iocore/cache/Cache.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc index 660714715f9..62264c01a2a 100644 --- a/iocore/cache/Cache.cc +++ b/iocore/cache/Cache.cc @@ -2193,6 +2193,11 @@ CacheVC::handleReadDone(int event, Event *e) } goto Ldone; } + if (DISK_BAD(vol->disk)) { + io.aio_result = -1; + Warning("Canceling cache read: disk is bad."); + goto Ldone; + } doc = reinterpret_cast<Doc *>(buf->data()); ink_assert(vol->mutex->nthread_holding < 1000); From 9cfca7c7166aab9a4179fba1a5ab2b3b2a94540c Mon Sep 17 00:00:00 2001 From: Mo Chen Date: Tue, 29 Aug 2023 13:53:41 -0500 Subject: [PATCH 2/4] Add volume info to bad disk warning --- iocore/cache/Cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc index 62264c01a2a..a348a1b3ad4 100644 --- a/iocore/cache/Cache.cc +++ b/iocore/cache/Cache.cc @@ -2195,7 +2195,7 @@ CacheVC::handleReadDone(int event, Event *e) } if (DISK_BAD(vol->disk)) { io.aio_result = -1; - Warning("Canceling cache read: disk is bad."); + Warning("Canceling cache read: disk %s is bad.", vol->hash_text.get()); goto Ldone; } From 6808224dc8e31a649256c422e09121e8d7f15e77 Mon Sep 17 00:00:00 2001 From: Mo Chen Date: Tue, 29 Aug 2023 13:59:16 -0500 Subject: [PATCH 3/4] Move the disk bad check before taking the vol lock --- iocore/cache/Cache.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc index a348a1b3ad4..aac80a1c19b 100644 --- a/iocore/cache/Cache.cc +++ b/iocore/cache/Cache.cc @@ -2180,6 +2180,11 
@@ CacheVC::handleReadDone(int event, Event *e) return EVENT_CONT; } { + if (DISK_BAD(vol->disk)) { + io.aio_result = -1; + Warning("Canceling cache read: disk %s is bad.", vol->hash_text.get()); + goto Ldone; + } MUTEX_TRY_LOCK(lock, vol->mutex, mutex->thread_holding); if (!lock.is_locked()) { VC_SCHED_LOCK_RETRY(); @@ -2193,11 +2198,6 @@ CacheVC::handleReadDone(int event, Event *e) } goto Ldone; } - if (DISK_BAD(vol->disk)) { - io.aio_result = -1; - Warning("Canceling cache read: disk %s is bad.", vol->hash_text.get()); - goto Ldone; - } doc = reinterpret_cast<Doc *>(buf->data()); ink_assert(vol->mutex->nthread_holding < 1000); From c8e792bb0bc5cc12edde5e68e5666da4dc8ad0e7 Mon Sep 17 00:00:00 2001 From: Mo Chen Date: Mon, 18 Sep 2023 10:46:35 -0500 Subject: [PATCH 4/4] Move disk bad code outside the scope for locking vol->mutex --- iocore/cache/Cache.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc index aac80a1c19b..4232a25cae5 100644 --- a/iocore/cache/Cache.cc +++ b/iocore/cache/Cache.cc @@ -2179,12 +2179,12 @@ CacheVC::handleReadDone(int event, Event *e) } else if (is_io_in_progress()) { return EVENT_CONT; } + if (DISK_BAD(vol->disk)) { + io.aio_result = -1; + Warning("Canceling cache read: disk %s is bad.", vol->hash_text.get()); + goto Ldone; + } { - if (DISK_BAD(vol->disk)) { - io.aio_result = -1; - Warning("Canceling cache read: disk %s is bad.", vol->hash_text.get()); - goto Ldone; - } MUTEX_TRY_LOCK(lock, vol->mutex, mutex->thread_holding); if (!lock.is_locked()) { VC_SCHED_LOCK_RETRY();