xfs: serialise inode reclaim within an AG

Memory reclaim via shrinkers has a terrible habit of having N+M concurrent shrinker executions (N = num CPUs, M = num kswapds) all trying to shrink the same cache. When the cache they are all working on is protected by a single spinlock, massive contention and slowdowns occur. Wrap the per-ag inode caches with a reclaim mutex to serialise reclaim access to the AG. This will block concurrent reclaim in each AG but still allow reclaim to scan multiple AGs concurrently. Allow shrinkers to move on to the next AG if they can't get the lock, and if we can't get any AG, then start blocking on locks. To prevent reclaimers from continually scanning the same inodes in each AG, add a cursor that tracks where the last reclaim got up to and start from that point on the next reclaim. This should avoid only ever scanning a small number of inodes at the start of each AG and not making progress. If we have a non-shrinker-based reclaim pass, ignore the cursor and reset it to zero once we are done. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Alex Elder <aelder@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2010-09-27 11:09:51 +1000
committer: Alex Elder <aelder@sgi.com> 2010-10-18 15:07:55 -0500
commit: 69b491c214d7fd4d4df972ae5377be99ca3753db (patch)
tree: b0d022080d8da893e525ee6502878424cffbd8c2 /fs/xfs
parent: e3a20c0b02e1704ab115dfa9d012caf0fbc45ed0 (diff)
3 files changed, 33 insertions, 0 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 754bc591a24..37d33254981 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -837,8 +837,12 @@ xfs_reclaim_inodes_ag(
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
 
+restart:
 	ag = 0;
+	skipped = 0;
 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
 		unsigned long	first_index = 0;
 		int		done = 0;
@@ -846,6 +850,15 @@ xfs_reclaim_inodes_ag(
 
 		ag = pag->pag_agno + 1;
 
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
 		do {
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int	i;
@@ -898,8 +911,25 @@ xfs_reclaim_inodes_ag(
 
 		} while (nr_found && !done && *nr_to_scan > 0);
 
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
 		xfs_perag_put(pag);
 	}
+
+	/*
+	 * If we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e.
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensures that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (trylock && skipped && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
 	return XFS_ERROR(last_error);
 }
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 51c42c202bf..baeec83d01f 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -230,6 +230,8 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
+	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
+	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d66e87c7c3a..59859c343e0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -477,6 +477,7 @@ xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		rwlock_init(&pag->pag_ici_lock);
+		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 
 		if (radix_tree_preload(GFP_NOFS))
author	Dave Chinner <dchinner@redhat.com>	2010-09-27 11:09:51 +1000
committer	Alex Elder <aelder@sgi.com>	2010-10-18 15:07:55 -0500
commit	69b491c214d7fd4d4df972ae5377be99ca3753db (patch)
tree	b0d022080d8da893e525ee6502878424cffbd8c2 /fs/xfs
parent	e3a20c0b02e1704ab115dfa9d012caf0fbc45ed0 (diff)