From deeeaf13b291420fe4a4a52606b9fc9128387340 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 1 May 2011 18:16:26 -0400 Subject: jbd2: fix fsync() tid wraparound bug If an application program does not make any changes to the indirect blocks or extent tree, i_datasync_tid will not get updated. If there are enough commits (i.e., 2**31) such that tid_geq()'s calculations wrap, and there isn't a currently active transaction at the time of the fdatasync() call, this can end up triggering a BUG_ON in fs/jbd2/commit.c: J_ASSERT(journal->j_running_transaction != NULL); It's pretty rare that this can happen, since it requires the use of fdatasync() plus *very* frequent and excessive use of fsync(). But with the right workload, it can. We fix this by replacing the use of tid_geq() with an equality test, since there's only one valid transaction id that we is valid for us to wait until it is commited: namely, the currently running transaction (if it exists). Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e0ec3db1c39..8c10004f787 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal) int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -493,7 +496,14 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } -- cgit v1.2.3 From 229309caebe4508d650bb6d8f7d51f2b116f5bbd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 8 May 2011 19:09:53 -0400 Subject: jbd2: Fix forever sleeping process in do_get_write_access() In do_get_write_access() we wait on BH_Unshadow bit for buffer to get from shadow state. The waking code in journal_commit_transaction() has a bug because it does not issue a memory barrier after the buffer is moved from the shadow state and before wake_up_bit() is called. Thus a waitqueue check can happen before the buffer is actually moved from the shadow state and waiting process may never be woken. Fix the problem by issuing proper barrier. Reported-by: Tao Ma Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b2..78c29921868 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -760,8 +760,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this IO to + * complete. The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); -- cgit v1.2.3 From 1be2add685181ba31554ffefa428b0f80a408bff Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 May 2011 19:37:54 -0400 Subject: jbd2: only print the debugging information for tid wraparound once If we somehow wrap, we don't want to keep printing the warning message over and over again. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8c10004f787..cd2d341f602 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -500,10 +500,11 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ - WARN(1, "jbd: bad log_start_commit: %u %u %u %u\n", - journal->j_commit_request, journal->j_commit_sequence, - target, journal->j_running_transaction ? - journal->j_running_transaction->t_tid : 0); + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } -- cgit v1.2.3 From 28e35e42fb255cbaeee8b9f89643f26fe376374d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 22 May 2011 21:45:26 -0400 Subject: jbd2: Fix the wrong calculation of t_max_wait in update_t_max_wait t_max_wait is added in commit 8e85fb3f to indicate how long we were waiting for new transaction to start. In commit 6d0bf005, it is moved to another function named update_t_max_wait to avoid a build warning. But the wrong thing is that the original 'ts' is initialized in the start of function start_this_handle and we can calculate t_max_wait in the right way. while with this change, ts is initialized within the function and t_max_wait can never be calculated right. This patch moves the initialization of ts to the original beginning of start_this_handle and pass it to function update_t_max_wait so that it can be calculated right and the build warning is avoided also. Cc: Jan Kara Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Reviewed-by: Eric Sandeen --- fs/jbd2/transaction.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 05fa77a2371..85a055ef93f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) */ /* - * Update transiaction's maximum wait time, if debugging is enabled. + * Update transaction's maximum wait time, if debugging is enabled. * * In order for t_max_wait to be reliable, it must be protected by a * lock. But doing so will mean that start_this_handle() can not be @@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * means that maximum wait time reported by the jbd2_run_stats * tracepoint will always be zero. */ -static inline void update_t_max_wait(transaction_t *transaction) +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) { #ifdef CONFIG_JBD2_DEBUG - unsigned long ts = jiffies; - if (jbd2_journal_enable_debug && time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); @@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, tid_t tid; int needed, need_to_start; int nblocks = handle->h_buffer_credits; + unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -271,7 +271,7 @@ repeat: /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ - update_t_max_wait(transaction); + update_t_max_wait(transaction, ts); handle->h_transaction = transaction; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); -- cgit v1.2.3 From 81be12c8179c1c397d3f179cdd9b3f7146cf47f1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 May 2011 11:52:40 -0400 Subject: jbd2: fix sending of data flush on journal commit In data=ordered mode, it's theoretically possible (however rare) that an inode is filed to transaction's t_inode_list and a flusher thread writes all the data and inode is reclaimed before the transaction starts to commit. In such a case, we could erroneously omit sending a flush to file system device when it is different from the journal device (because data can still be in disk cache only). Fix the problem by setting a flag in a transaction when some inode is added to it and then send disk flush in the commit code when the flag is set. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 +-- fs/jbd2/transaction.c | 7 +++++++ include/linux/jbd2.h | 4 +++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 78c29921868..2d5095ecc25 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -683,7 +682,7 @@ start_journal_io: * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_flushed_data_blocks && + if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 85a055ef93f..20065c9f247 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2147,6 +2147,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a32dcaec04e..4d57955061f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -658,7 +658,9 @@ struct transaction_s * waiting for it to finish. */ unsigned int t_synchronous_commit:1; - unsigned int t_flushed_data_blocks:1; + + /* Disk flush needs to be sent to fs partition [no locking] */ + int t_need_data_flush; /* * For use by the filesystem to store fs-specific data -- cgit v1.2.3 From bbd2be36910728f485ac78ea36e0f4f5a38e691e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 May 2011 11:59:18 -0400 Subject: jbd2: Add function jbd2_trans_will_send_data_barrier() Provide a function which returns whether a transaction with given tid will send a flush to the filesystem device. The function will be used by ext4 to detect whether fsync needs to send a separate flush or not. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 10 +++++++++- fs/jbd2/journal.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 4 +++- 3 files changed, 53 insertions(+), 2 deletions(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2d5095ecc25..5b506e53c70 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -677,6 +677,10 @@ start_journal_io: err = 0; } + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -804,6 +808,10 @@ wait_for_iobuf: jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -959,7 +967,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index cd2d341f602..9a782699030 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -587,6 +587,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) return ret; } +/* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + /* * Wait for a specified commit to complete. * The caller may not hold the journal lock. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 4d57955061f..4ecb7b16b27 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -529,9 +529,10 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, - T_RUNDOWN, T_FLUSH, T_COMMIT, + T_COMMIT_DFLUSH, + T_COMMIT_JFLUSH, T_FINISHED } t_state; @@ -1230,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); -- cgit v1.2.3 From c867516de5256e9cfba2ec5847fa27e0f0ddd2c5 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 24 May 2011 17:09:58 -0400 Subject: jbd2: Fix comment to match the code in jbd2__journal_start() jbd2__journal_start() returns an ERR_PTR() value rather than NULL on failure. Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 20065c9f247..9dfdf010f1d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { -- cgit v1.2.3 From 3991b4008cb12f3abfe8dbb049b03d1cc39a8440 Mon Sep 17 00:00:00 2001 From: Ding Dinghua Date: Wed, 25 May 2011 17:43:48 -0400 Subject: jbd2: fix a potential leak of a journal_head on an error path drop jh->b_jcount in error path Signed-off-by: Ding Dinghua Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/jbd2') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9dfdf010f1d..3eec82d32fd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -922,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); - jbd2_journal_put_journal_head(jh); out: + jbd2_journal_put_journal_head(jh); return err; } -- cgit v1.2.3