From c36258b5925e6cf6bf72904635100593573bfcff Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Thu, 27 Sep 2007 15:53:38 -0500
Subject: [DLM] block dlm_recv in recovery transition

Introduce a per-lockspace rwsem that's held in read mode by dlm_recv threads while working in the dlm. This allows dlm_recv activity to be suspended when the lockspace transitions to, from and between recovery cycles.

The specific bug prompting this change is one where an in-progress recovery cycle is aborted by a new recovery cycle. While dlm_recv was processing a recovery message, the recovery cycle was aborted and dlm_recoverd began cleaning up. dlm_recv decremented recover_locks_count on an rsb after dlm_recoverd had reset it to zero. This is fixed by suspending dlm_recv (taking the write lock on the rwsem) before aborting the current recovery.

The transitions to/from normal and recovery modes are simplified by using this new ability to block dlm_recv. The switch from normal to recovery mode means dlm_recv goes from processing locking messages to saving them for later, and vice versa. Races are avoided by blocking dlm_recv when setting the flag that switches between modes.

Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/member.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'fs/dlm/member.c') diff --git a/fs/dlm/member.c b/fs/dlm/member.c index d09977528f69..e9cdcab306e2 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -18,10 +18,6 @@ #include "rcom.h" #include "config.h" -/* - * Following called by dlm_recoverd thread - */ - static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) { struct dlm_member *memb = NULL; @@ -250,18 +246,30 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) return error; } -/* - * Following called from lockspace.c - */ +/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before + dlm_ls_start() is called on any of them to start the new recovery. */ int dlm_ls_stop(struct dlm_ls *ls) { int new; /* - * A stop cancels any recovery that's in progress (see RECOVERY_STOP, - * dlm_recovery_stopped()) and prevents any new locks from being - * processed (see RUNNING, dlm_locking_stopped()). + * Prevent dlm_recv from being in the middle of something when we do + * the stop. This includes ensuring dlm_recv isn't processing a + * recovery message (rcom), while dlm_recoverd is aborting and + * resetting things from an in-progress recovery. i.e. we want + * dlm_recoverd to abort its recovery without worrying about dlm_recv + * processing an rcom at the same time. Stopping dlm_recv also makes + * it easy for dlm_receive_message() to check locking stopped and add a + * message to the requestqueue without races. + */ + + down_write(&ls->ls_recv_active); + + /* + * Abort any recovery that's in progress (see RECOVERY_STOP, + * dlm_recovery_stopped()) and tell any other threads running in the + * dlm to quit any processing (see RUNNING, dlm_locking_stopped()). 
*/ spin_lock(&ls->ls_recover_lock); @@ -270,9 +278,15 @@ int dlm_ls_stop(struct dlm_ls *ls) ls->ls_recover_seq++; spin_unlock(&ls->ls_recover_lock); + /* + * Let dlm_recv run again, now any normal messages will be saved on the + * requestqueue for later. + */ + + up_write(&ls->ls_recv_active); + /* * This in_recovery lock does two things: - * * 1) Keeps this function from returning until all threads are out * of locking routines and locking is truely stopped. * 2) Keeps any new requests from being processed until it's unlocked @@ -284,9 +298,8 @@ int dlm_ls_stop(struct dlm_ls *ls) /* * The recoverd suspend/resume makes sure that dlm_recoverd (if - * running) has noticed the clearing of RUNNING above and quit - * processing the previous recovery. This will be true for all nodes - * before any nodes start the new recovery. + * running) has noticed RECOVERY_STOP above and quit processing the + * previous recovery. */ dlm_recoverd_suspend(ls); -- cgit v1.2.3