From: Shantanu Goel <sgoel01@yahoo.com>

The attached patch implements write throttling for the NFS client.  In the
stock client, a streaming write will hog all of the request slots, blocking
out other access to the same mount point.  For example, running a single dd
in my NFS home directory completely blocks my X session until the dd process
exits.  With the patch, the number of write requests in flight per server is
capped: once the cap is reached, the backing device is marked
write-congested and new writers wait until enough outstanding requests have
completed to clear the congestion.  I have tested running 4 concurrent dd's
without a complete lockout.
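
For reference, the scheme boils down to a per-server counter with a high and
a low watermark.  The following standalone sketch (ordinary userspace C, not
kernel code; REQ_LIMIT = 16 is an arbitrary stand-in for RPC_MAXREQS, and the
congested flag stands in for BDI_write_congested) illustrates the intended
hysteresis:

/* throttle-sketch.c: illustrative only, not part of the patch. */
#include <stdio.h>
#include <stdbool.h>

#define REQ_LIMIT	16			/* stand-in for RPC_MAXREQS */
#define REQ_RESUME	(REQ_LIMIT * 3 / 4)	/* resume threshold */

static unsigned int wactive;		/* write requests in flight */
static bool congested;			/* stand-in for BDI_write_congested */

/* A write request is being initiated. */
static void write_start(void)
{
	if (++wactive >= REQ_LIMIT)
		congested = true;	/* new writers should now wait */
}

/* A write request has completed. */
static void write_done(void)
{
	wactive--;
	if (congested && wactive <= REQ_RESUME) {
		congested = false;	/* would wake_up_all() the waiters */
		printf("congestion cleared with %u requests in flight\n",
		       wactive);
	}
}

int main(void)
{
	int i;

	/* Issue more writes than the limit allows, then retire them all. */
	for (i = 0; i < REQ_LIMIT + 4; i++) {
		write_start();
		if (congested)
			printf("request %2d issued, mount is congested "
			       "(%u in flight)\n", i, wactive);
	}
	for (i = 0; i < REQ_LIMIT + 4; i++)
		write_done();
	return 0;
}

The gap between the two watermarks avoids waking waiting writers on every
completed request once the limit has been hit.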

---

 fs/nfs/inode.c            |    3 +
 fs/nfs/pagelist.c         |   30 +++++++++-
 fs/nfs/write.c            |  126 ++++++++++++++++++++++++++++++++++++++++------
 include/linux/nfs_fs_sb.h |    2 
 include/linux/nfs_page.h  |    3 -
 5 files changed, 146 insertions(+), 18 deletions(-)

diff -puN fs/nfs/inode.c~nfs-write-throttling fs/nfs/inode.c
--- 25/fs/nfs/inode.c~nfs-write-throttling	2004-02-20 19:31:28.000000000 -0800
+++ 25-akpm/fs/nfs/inode.c	2004-02-20 19:31:28.000000000 -0800
@@ -372,6 +372,9 @@ nfs_sb_init(struct super_block *sb, rpc_
 	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
 		sb->s_maxbytes = MAX_LFS_FILESIZE; 
 
+	server->wactive = 0;
+	init_waitqueue_head(&server->writerq);
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 	return 0;
diff -puN fs/nfs/pagelist.c~nfs-write-throttling fs/nfs/pagelist.c
--- 25/fs/nfs/pagelist.c~nfs-write-throttling	2004-02-20 19:31:28.000000000 -0800
+++ 25-akpm/fs/nfs/pagelist.c	2004-02-20 19:31:28.000000000 -0800
@@ -249,6 +249,8 @@ nfs_coalesce_requests(struct list_head *
  * @file: if set, ensure we match requests from this file
  * @idx_start: lower bound of page->index to scan
  * @npages: idx_start + npages sets the upper bound to scan.
+ * @max_req: if set, stop after this many coalesced requests.
+ * @wpages: if max_req is set, max # pages per coalesced request.
  *
  * Moves elements from one of the inode request lists.
  * If the number of requests is set to 0, the entire address_space
@@ -259,18 +261,22 @@ nfs_coalesce_requests(struct list_head *
 int
 nfs_scan_list(struct list_head *head, struct list_head *dst,
 	      struct file *file,
-	      unsigned long idx_start, unsigned int npages)
+	      unsigned long idx_start, unsigned int npages,
+	      unsigned int max_req, unsigned int wpages)
 {
 	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
+	struct nfs_page		*req, *prev;
 	unsigned long		idx_end;
-	int			res;
+	int			res, is_contig;
+	unsigned int		nreq, pages;
 
 	res = 0;
 	if (npages == 0)
 		idx_end = ~0;
 	else
 		idx_end = idx_start + npages - 1;
+	nreq = pages = 0;
+	prev = NULL;
 
 	list_for_each_safe(pos, tmp, head) {
 
@@ -284,11 +290,29 @@ nfs_scan_list(struct list_head *head, st
 		if (req->wb_index > idx_end)
 			break;
 
+		is_contig = (max_req &&
+			     prev &&
+			     pages < wpages &&
+			     req->wb_pgbase == 0 &&
+			     prev->wb_pgbase + prev->wb_bytes == PAGE_CACHE_SIZE &&
+			     req->wb_index == prev->wb_index + 1 &&
+			     req->wb_cred == prev->wb_cred);
+
+		if (max_req && !is_contig && nreq == max_req)
+			break;
+
 		if (!nfs_lock_request(req))
 			continue;
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, dst);
 		res++;
+
+		if (!is_contig) {
+			nreq++;
+			pages = 1;
+		} else
+			pages++;
+		prev = req;
 	}
 	return res;
 }
diff -puN fs/nfs/write.c~nfs-write-throttling fs/nfs/write.c
--- 25/fs/nfs/write.c~nfs-write-throttling	2004-02-20 19:31:28.000000000 -0800
+++ 25-akpm/fs/nfs/write.c	2004-02-20 19:31:28.000000000 -0800
@@ -125,6 +125,66 @@ void nfs_commit_release(struct rpc_task 
 }
 
 /*
+ * The following definitions are for throttling write requests.
+ * Once # outstanding write requests reaches ASYNC_REQ_LIMIT,
+ * writers are forced to wait until # requests drops to ASYNC_REQ_RESUME.
+ */
+#define ASYNC_REQ_LIMIT		RPC_MAXREQS
+#define ASYNC_REQ_RESUME	(ASYNC_REQ_LIMIT * 3 / 4)
+#define NFS_BDI(inode)		(&NFS_SERVER(inode)->backing_dev_info)
+#define IS_CONGESTED(inode)	bdi_write_congested(NFS_BDI(inode))
+
+/*
+ * A write request is being initiated.  Increment active
+ * request count and check for congestion.
+ */
+static __inline__ void WRITE_START(struct inode *inode)
+{
+	spin_lock(&nfs_wreq_lock);
+	if (++NFS_SERVER(inode)->wactive >= ASYNC_REQ_LIMIT)
+		set_bit(BDI_write_congested, &NFS_BDI(inode)->state);
+	spin_unlock(&nfs_wreq_lock);
+}
+
+/*
+ * A write request has just completed on an inode.
+ * Check if congestion has now cleared.
+ */
+static __inline__ void WRITE_DONE(struct inode *inode)
+{
+	spin_lock(&nfs_wreq_lock);
+	NFS_SERVER(inode)->wactive--;
+	if (IS_CONGESTED(inode) && NFS_SERVER(inode)->wactive <= ASYNC_REQ_RESUME) {
+		clear_bit(BDI_write_congested, &NFS_BDI(inode)->state);
+		wake_up_all(&NFS_SERVER(inode)->writerq);
+	}
+	spin_unlock(&nfs_wreq_lock);
+}
+
+/*
+ * Wait for congestion to expire.
+ */
+static __inline__ int CONGESTION_WAIT(struct inode *inode)
+{
+	int err = 0;
+	int intr = NFS_SERVER(inode)->flags & NFS_MOUNT_INTR;
+	DECLARE_WAITQUEUE(wait, current);
+
+	do {
+		set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&NFS_SERVER(inode)->writerq, &wait);
+		if (IS_CONGESTED(inode)) {
+			io_schedule();
+			if (intr && signalled())
+				err = -ERESTARTSYS;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&NFS_SERVER(inode)->writerq, &wait);
+	} while (!err && IS_CONGESTED(inode));
+	return err;
+}
+
+/*
  * Write a page synchronously.
  * Offset is the data offset within the page.
  */
@@ -163,7 +223,9 @@ nfs_writepage_sync(struct file *file, st
 			wdata.args.count = count;
 		wdata.args.offset = page_offset(page) + wdata.args.pgbase;
 
+		WRITE_START(inode);
 		result = NFS_PROTO(inode)->write(&wdata, file);
+		WRITE_DONE(inode);
 
 		if (result < 0) {
 			/* Must mark the page invalid after I/O error */
@@ -286,20 +348,39 @@ nfs_writepages(struct address_space *map
 	struct inode *inode = mapping->host;
 	int is_sync = !wbc->nonblocking;
 	int err;
+	long npages = wbc->nr_to_write;
 
 	err = generic_writepages(mapping, wbc);
 	if (err)
 		goto out;
-	err = nfs_flush_file(inode, NULL, 0, 0, 0);
-	if (err < 0)
-		goto out;
-	if (wbc->sync_mode == WB_SYNC_HOLD)
-		goto out;
 	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
+		npages -= NFS_I(inode)->ndirty + NFS_I(inode)->ncommit;
 		err = nfs_wb_all(inode);
-	} else
+		goto out;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
+		npages -= NFS_I(inode)->ncommit;
+	while (npages > 0) {
+		if (IS_CONGESTED(inode)) {
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			err = CONGESTION_WAIT(inode);
+			if (err)
+				goto out;
+		}
+		err = nfs_flush_file(inode, NULL, 0, 0, 0);
+		if (err < 0)
+			goto out;
+		if (err == 0)
+			break;
+		npages -= err;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
 		nfs_commit_file(inode, NULL, 0, 0, 0);
 out:
+	wbc->nr_to_write = npages;
 	return err;
 }
 
@@ -479,11 +560,11 @@ nfs_wait_on_requests(struct inode *inode
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int max_req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages, max_req, NFS_SERVER(inode)->wpages);
 	nfsi->ndirty -= res;
 	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -508,7 +589,7 @@ nfs_scan_commit(struct inode *inode, str
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages, 0, 0);
 	nfsi->ncommit -= res;
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -633,6 +714,9 @@ nfs_strategy(struct inode *inode)
 {
 	unsigned int	dirty, wpages;
 
+	if (IS_CONGESTED(inode))
+		return;
+
 	dirty  = NFS_I(inode)->ndirty;
 	wpages = NFS_SERVER(inode)->wpages;
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -776,6 +860,8 @@ nfs_write_rpcsetup(struct list_head *hea
 
 	NFS_PROTO(inode)->write_setup(data, count, how);
 
+	WRITE_START(inode);
+
 	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 		task->tk_pid,
 		inode->i_sb->s_id,
@@ -865,6 +951,8 @@ nfs_writeback_done(struct rpc_task *task
 	dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
 		task->tk_pid, task->tk_status);
 
+	WRITE_DONE(data->inode);
+
 	/* We can't handle that yet but we check for it nevertheless */
 	if (resp->count < argp->count && task->tk_status >= 0) {
 		static unsigned long    complain;
@@ -1072,10 +1160,14 @@ int nfs_flush_file(struct inode *inode, 
 {
 	LIST_HEAD(head);
 	int			res,
+				nreq,
 				error = 0;
 
+	nreq = ASYNC_REQ_LIMIT - NFS_SERVER(inode)->wactive;
+	if (nreq < 1)
+		nreq = 1;
 	spin_lock(&nfs_wreq_lock);
-	res = nfs_scan_dirty(inode, &head, file, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, file, idx_start, npages, nreq);
 	spin_unlock(&nfs_wreq_lock);
 	if (res)
 		error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how);
@@ -1120,13 +1212,19 @@ int nfs_sync_file(struct inode *inode, s
 
 	do {
 		error = 0;
-		if (wait)
-			error = nfs_wait_on_requests(inode, file, idx_start, npages);
-		if (error == 0)
+		if (IS_CONGESTED(inode))
+			error = CONGESTION_WAIT(inode);
+		if (error == 0) {
 			error = nfs_flush_file(inode, file, idx_start, npages, how);
+			if (error == 0 && wait)
+				error = nfs_wait_on_requests(inode, file, idx_start, npages);
+		}
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-		if (error == 0)
+		if (error == 0 && NFS_PROTO(inode)->version > 2) {
 			error = nfs_commit_file(inode, file, idx_start, npages, how);
+			if (error == 0 && wait)
+				error = nfs_wait_on_requests(inode, file, idx_start, npages);
+		}
 #endif
 	} while (error > 0);
 	return error;
diff -puN include/linux/nfs_fs_sb.h~nfs-write-throttling include/linux/nfs_fs_sb.h
--- 25/include/linux/nfs_fs_sb.h~nfs-write-throttling	2004-02-20 19:31:28.000000000 -0800
+++ 25-akpm/include/linux/nfs_fs_sb.h	2004-02-20 19:31:28.000000000 -0800
@@ -28,6 +28,8 @@ struct nfs_server {
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	unsigned int		wactive;	/* # write requests in progress */
+	wait_queue_head_t	writerq;	/* writers waiting to write */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
 	 * This is used to generate the clientid, and the callback address.
diff -puN include/linux/nfs_page.h~nfs-write-throttling include/linux/nfs_page.h
--- 25/include/linux/nfs_page.h~nfs-write-throttling	2004-02-20 19:31:28.000000000 -0800
+++ 25-akpm/include/linux/nfs_page.h	2004-02-20 19:31:28.000000000 -0800
@@ -53,7 +53,8 @@ extern	void nfs_release_request(struct n
 extern	void nfs_list_add_request(struct nfs_page *, struct list_head *);
 
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  struct file *, unsigned long, unsigned int);
+			  struct file *, unsigned long, unsigned int,
+			  unsigned int, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);

_