From: Trond Myklebust <trond.myklebust@fys.uio.no>

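NFSv4: Share open_owner structs between several different processes: state
owners are now looked up by credential, and open state by credential and open
mode rather than by pid.  This reduces the load on the server.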


---

 fs/nfs/nfs4proc.c       |   80 ++++++++++++++++++++++++++++----
 fs/nfs/nfs4state.c      |  118 ++++++++++++++++++++++++++++++++++++++++++------
 fs/nfs/nfs4xdr.c        |   82 +++++++++++++++++++++++++++++++++
 include/linux/nfs4.h    |    1 
 include/linux/nfs_fs.h  |    7 ++
 include/linux/nfs_xdr.h |    1 
 6 files changed, 263 insertions(+), 26 deletions(-)

diff -puN fs/nfs/nfs4proc.c~nfs-28-open_owner fs/nfs/nfs4proc.c
--- 25/fs/nfs/nfs4proc.c~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/fs/nfs/nfs4proc.c	2004-01-14 02:10:02.000000000 -0800
@@ -616,8 +616,13 @@ retry:
 		memcpy(&state->stateid, &oc_res.stateid, sizeof(state->stateid));
 	} else
 		memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
+	spin_lock(&inode->i_lock);
+	if (flags & FMODE_READ)
+		state->nreaders++;
+	if (flags & FMODE_WRITE)
+		state->nwriters++;
 	state->state |= flags & (FMODE_READ|FMODE_WRITE);
-	state->pid = current->pid;
+	spin_unlock(&inode->i_lock);
 
 	up(&sp->so_sema);
 	nfs4_put_state_owner(sp);
@@ -634,6 +639,21 @@ out_up:
 		iput(inode);
 		inode = NULL;
 	}
+	/* NOTE: BAD_SEQID means the server and client disagree about the
+	 * book-keeping w.r.t. state-changing operations
+	 * (OPEN/CLOSE/LOCK/LOCKU...)
+	 * It is actually a sign of a bug on the client or on the server.
+	 *
+	 * If we receive a BAD_SEQID error in the particular case of
+	 * doing an OPEN, we assume that nfs4_increment_seqid() will
+	 * have unhashed the old state_owner for us, and that we can
+	 * therefore safely retry using a new one. We should still warn
+	 * the user though...
+	 */
+	if (status == -NFS4ERR_BAD_SEQID) {
+		printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n");
+		goto retry;
+	}
 	status = nfs4_handle_error(server, status);
 	if (!status)
 		goto retry;
@@ -722,6 +742,36 @@ nfs4_do_close(struct inode *inode, struc
 	 * the state_owner. we keep this around to process errors
 	 */
 	nfs4_increment_seqid(status, sp);
+	if (!status)
+		memcpy(&state->stateid, &res.stateid, sizeof(state->stateid));
+
+	return status;
+}
+
+int
+nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) 
+{	/* Issue a synchronous OPEN_DOWNGRADE call for @state, requesting share access @mode */
+	struct nfs4_state_owner *sp = state->owner;
+	int status = 0;
+	struct nfs_closeargs arg = {
+		.fh		= NFS_FH(inode),
+		.seqid		= sp->so_seqid,	/* the owner's current sequence id */
+		.share_access	= mode,	/* encoded as-is by encode_open_downgrade() */
+	};
+	struct nfs_closeres res = {
+		.status		= 0,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+
+	memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));	/* send the stateid we currently hold */
+	status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
+	nfs4_increment_seqid(status, sp);	/* seqid bookkeeping depends on the RPC result */
+	if (!status)	/* success: adopt the stateid returned in the reply */
+		memcpy(&state->stateid, &res.stateid, sizeof(state->stateid));
 
 	return status;
 }
@@ -771,7 +821,7 @@ nfs4_open_revalidate(struct inode *dir, 
 		return 1;
 	}
 	d_drop(dentry);
-	nfs4_put_open_state(state);
+	nfs4_close_state(state, openflags);
 	iput(inode);
 	return 0;
 }
@@ -872,15 +922,14 @@ nfs4_proc_setattr(struct dentry *dentry,
 	fattr->valid = 0;
 	
 	if (size_change) {
-		state = nfs4_find_state_bypid(inode, current->pid);
-
+		struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+		state = nfs4_find_state(inode, cred, FMODE_WRITE);
 		if (!state) {
-			struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
 			state = nfs4_do_open(dentry->d_parent->d_inode, 
 				&dentry->d_name, FMODE_WRITE, NULL, cred);
-			put_rpccred(cred);
 			need_iput = 1;
 		}
+		put_rpccred(cred);
 		if (IS_ERR(state))
 			return PTR_ERR(state);
 
@@ -895,7 +944,7 @@ nfs4_proc_setattr(struct dentry *dentry,
 out:
 	if (state) {
 		inode = state->inode;
-		nfs4_put_open_state(state);
+		nfs4_close_state(state, FMODE_WRITE);
 		if (need_iput)
 			iput(inode);
 	}
@@ -1161,7 +1210,7 @@ nfs4_proc_create(struct inode *dir, stru
 			status = nfs4_do_setattr(NFS_SERVER(dir), &fattr,
 			                     NFS_FH(inode), sattr, state);
 			if (status != 0) {
-				nfs4_put_open_state(state);
+				nfs4_close_state(state, flags);
 				iput(inode);
 				inode = ERR_PTR(status);
 			}
@@ -1742,6 +1791,7 @@ nfs4_proc_file_open(struct inode *inode,
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct nfs4_state *state;
+	struct rpc_cred *cred;
 
 	dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n",
 	                       (int)dentry->d_parent->d_name.len,
@@ -1750,12 +1800,14 @@ nfs4_proc_file_open(struct inode *inode,
 
 
 	/* Find our open stateid */
-	state = nfs4_find_state_bypid(inode, current->pid);
+	cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+	state = nfs4_find_state(inode, cred, filp->f_mode);
+	put_rpccred(cred);
 	if (state == NULL) {
 		printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
 		return -EIO; /* ERACE actually */
 	}
-	nfs4_put_open_state(state);
+	nfs4_close_state(state, filp->f_mode);
 	if (filp->f_mode & FMODE_WRITE) {
 		lock_kernel();
 		nfs_set_mmcred(inode, state->owner->so_cred);
@@ -1774,7 +1826,7 @@ nfs4_proc_file_release(struct inode *ino
 	struct nfs4_state *state = (struct nfs4_state *)filp->private_data;
 
 	if (state)
-		nfs4_put_open_state(state);
+		nfs4_close_state(state, filp->f_mode);
 	return 0;
 }
 
@@ -1816,6 +1868,9 @@ nfs4_async_handle_error(struct rpc_task 
 			rpc_delay(task, NFS4_POLL_RETRY_TIME);
 			task->tk_status = 0;
 			return -EAGAIN;
+		case -NFS4ERR_OLD_STATEID:
+			task->tk_status = 0;
+			return -EAGAIN;
 	}
 	return 0;
 }
@@ -1892,6 +1947,9 @@ nfs4_handle_error(struct nfs_server *ser
 		case -NFS4ERR_DELAY:
 			ret = nfs4_delay(server->client);
 			break;
+		case -NFS4ERR_OLD_STATEID:
+			ret = 0;
+			break;
 		default:
 			if (errorcode <= -1000) {
 				printk(KERN_WARNING "%s could not handle NFSv4 error %d\n",
diff -puN fs/nfs/nfs4state.c~nfs-28-open_owner fs/nfs/nfs4state.c
--- 25/fs/nfs/nfs4state.c~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/fs/nfs/nfs4state.c	2004-01-14 02:10:02.000000000 -0800
@@ -188,6 +188,23 @@ nfs4_client_grab_unused(struct nfs4_clie
 	return sp;
 }
 
+static struct nfs4_state_owner *
+nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred)
+{	/* Search clp->cl_state_owners for an owner whose so_cred is @cred; NULL if none */
+	struct nfs4_state_owner *sp, *res = NULL;
+
+	list_for_each_entry(sp, &clp->cl_state_owners, so_list) {	/* nfs4_get_state_owner() calls us with clp->cl_lock held */
+		if (sp->so_cred != cred)	/* match is by pointer identity of the cred */
+			continue;
+		atomic_inc(&sp->so_count);	/* reference for the pointer we return */
+		/* Move to the head of the list, so later lookups visit it first */
+		list_move(&sp->so_list, &clp->cl_state_owners);
+		res = sp;
+		break;
+	}
+	return res;
+}
+
 /*
  * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
  * create a new state_owner.
@@ -208,6 +225,15 @@ nfs4_alloc_state_owner(void)
 	return sp;
 }
 
+static void
+nfs4_unhash_state_owner(struct nfs4_state_owner *sp)
+{	/* Take @sp off whichever list so_list is linked on, under its client's cl_lock */
+	struct nfs4_client *clp = sp->so_client;
+	spin_lock(&clp->cl_lock);
+	list_del_init(&sp->so_list);	/* leaves so_list empty; nfs4_put_state_owner() tests list_empty() on it */
+	spin_unlock(&clp->cl_lock);
+}
+
 struct nfs4_state_owner *
 nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
@@ -217,7 +243,9 @@ nfs4_get_state_owner(struct nfs_server *
 	get_rpccred(cred);
 	new = nfs4_alloc_state_owner();
 	spin_lock(&clp->cl_lock);
-	sp = nfs4_client_grab_unused(clp, cred);
+	sp = nfs4_find_state_owner(clp, cred);
+	if (sp == NULL)
+		sp = nfs4_client_grab_unused(clp, cred);
 	if (sp == NULL && new != NULL) {
 		list_add(&new->so_list, &clp->cl_state_owners);
 		new->so_client = clp;
@@ -248,6 +276,8 @@ nfs4_put_state_owner(struct nfs4_state_o
 		return;
 	if (clp->cl_nunused >= OPENOWNER_POOL_SIZE)
 		goto out_free;
+	if (list_empty(&sp->so_list))
+		goto out_free;
 	list_move(&sp->so_list, &clp->cl_unused);
 	clp->cl_nunused++;
 	spin_unlock(&clp->cl_lock);
@@ -269,24 +299,38 @@ nfs4_alloc_open_state(void)
 	state = kmalloc(sizeof(*state), GFP_KERNEL);
 	if (!state)
 		return NULL;
-	state->pid = current->pid;
 	state->state = 0;
+	state->nreaders = 0;
+	state->nwriters = 0;
 	memset(state->stateid.data, 0, sizeof(state->stateid.data));
 	atomic_set(&state->count, 1);
 	return state;
 }
 
 static struct nfs4_state *
-__nfs4_find_state_bypid(struct inode *inode, pid_t pid)
+__nfs4_find_state(struct inode *inode, struct rpc_cred *cred, mode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs4_state *state;
 
+	mode &= (FMODE_READ|FMODE_WRITE);	/* only the read/write bits take part in matching */
 	list_for_each_entry(state, &nfsi->open_states, inode_states) {
-		if (state->pid == pid) {
-			atomic_inc(&state->count);
-			return state;
-		}
+		if (state->owner->so_cred != cred)	/* owner must carry the same cred pointer */
+			continue;
+		if ((mode & FMODE_READ) != 0 && state->nreaders == 0)	/* read wanted, but no reader count on this state */
+			continue;
+		if ((mode & FMODE_WRITE) != 0 && state->nwriters == 0)	/* write wanted, but no writer count on this state */
+			continue;
+		if ((state->state & mode) != mode)	/* state->state lacks a requested access bit */
+			continue;
+		/* Add the state to the head of the inode's list */
+		list_move(&state->inode_states, &nfsi->open_states);
+		atomic_inc(&state->count);	/* reference for the pointer we return */
+		if (mode & FMODE_READ)
+			state->nreaders++;	/* nfs4_close_state() decrements this again */
+		if (mode & FMODE_WRITE)
+			state->nwriters++;	/* nfs4_close_state() decrements this again */
+		return state;
 	}
 	return NULL;
 }
@@ -298,7 +342,12 @@ __nfs4_find_state_byowner(struct inode *
 	struct nfs4_state *state;
 
 	list_for_each_entry(state, &nfsi->open_states, inode_states) {
+		/* Is this in the process of being freed? */
+		if (state->nreaders == 0 && state->nwriters == 0)
+			continue;
 		if (state->owner == owner) {
+			/* Add the state to the head of the inode's list */
+			list_move(&state->inode_states, &nfsi->open_states);
 			atomic_inc(&state->count);
 			return state;
 		}
@@ -307,16 +356,12 @@ __nfs4_find_state_byowner(struct inode *
 }
 
 struct nfs4_state *
-nfs4_find_state_bypid(struct inode *inode, pid_t pid)
+nfs4_find_state(struct inode *inode, struct rpc_cred *cred, mode_t mode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs4_state *state;
 
 	spin_lock(&inode->i_lock);
-	state = __nfs4_find_state_bypid(inode, pid);
-	/* Add the state to the tail of the inode's list */
-	if (state)
-		list_move_tail(&state->inode_states, &nfsi->open_states);
+	state = __nfs4_find_state(inode, cred, mode);	/* NULL if no match; list move and count updates happen under i_lock */
 	spin_unlock(&inode->i_lock);
 	return state;
 }
@@ -387,6 +432,50 @@ nfs4_put_open_state(struct nfs4_state *s
 	nfs4_put_state_owner(owner);
 }
 
+void
+nfs4_close_state(struct nfs4_state *state, mode_t mode)
+{	/* Drop one @mode user of @state, then downgrade or close the open on the server if needed */
+	struct inode *inode = state->inode;
+	struct nfs4_state_owner *owner = state->owner;
+	int newstate;
+	int status = 0;
+
+	down(&owner->so_sema);	/* nfs4_increment_seqid(), used by the RPC helpers below, expects so_sema held */
+	/* Protect against nfs4_find_state(), which reads and bumps the counts under i_lock */
+	spin_lock(&inode->i_lock);
+	if (mode & FMODE_READ)
+		state->nreaders--;
+	if (mode & FMODE_WRITE)
+		state->nwriters--;
+	if (state->nwriters == 0 && state->nreaders == 0)	/* no users left: take it off the inode's list */
+		list_del_init(&state->inode_states);
+	spin_unlock(&inode->i_lock);
+	do {
+	 	newstate = 0;	/* rebuilt each pass from the counts that are still non-zero */
+		if (state->state == 0)	/* no access bits recorded for the server side */
+			break;
+		if (state->nreaders)
+			newstate |= FMODE_READ;
+		if (state->nwriters)
+			newstate |= FMODE_WRITE;
+		if (state->state == newstate)	/* recorded server state already matches the remaining users */
+			break;
+		if (newstate != 0)	/* some access still in use: downgrade rather than close */
+			status = nfs4_do_downgrade(inode, state, newstate);
+		else
+			status = nfs4_do_close(inode, state);
+		if (!status) {
+			state->state = newstate;	/* record what the server now holds */
+			break;
+		}
+		up(&owner->so_sema);	/* so_sema is not held across nfs4_handle_error() */
+		status = nfs4_handle_error(NFS_SERVER(inode), status);
+		down(&owner->so_sema);
+	} while (!status);	/* a 0 return from nfs4_handle_error() means try again */
+	up(&owner->so_sema);
+	nfs4_put_open_state(state);	/* drop a reference on @state */
+}
+
 /*
 * Called with sp->so_sema held.
 *
@@ -399,6 +488,9 @@ nfs4_increment_seqid(int status, struct 
 {
 	if (status == NFS_OK || seqid_mutating_err(-status))
 		sp->so_seqid++;
+	/* If the server returns BAD_SEQID, unhash state_owner here */
+	if (status == -NFS4ERR_BAD_SEQID)
+		nfs4_unhash_state_owner(sp);
 }
 
 static int reclaimer(void *);
diff -puN fs/nfs/nfs4xdr.c~nfs-28-open_owner fs/nfs/nfs4xdr.c
--- 25/fs/nfs/nfs4xdr.c~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/fs/nfs/nfs4xdr.c	2004-01-14 02:10:02.000000000 -0800
@@ -176,6 +176,14 @@ static int nfs_stat_to_errno(int);
 					op_decode_hdr_maxsz + \
 					4 + 5 + 2 + 3 + \
 					decode_getattr_maxsz
+#define NFS4_enc_open_downgrade_sz \
+				compound_encode_hdr_maxsz + \
+                                encode_putfh_maxsz + \
+                                op_encode_hdr_maxsz + 7	/* presumably stateid (4 words) + seqid + share_access + share_deny -- TODO confirm stateid size */
+#define NFS4_dec_open_downgrade_sz \
+				compound_decode_hdr_maxsz + \
+                                decode_putfh_maxsz + \
+                                op_decode_hdr_maxsz + 4	/* presumably the returned stateid (4 words) -- TODO confirm */
 #define NFS4_enc_close_sz       compound_encode_hdr_maxsz + \
                                 encode_putfh_maxsz + \
                                 op_encode_hdr_maxsz + 5
@@ -712,6 +720,22 @@ encode_open_reclaim(struct xdr_stream *x
 }
 
 static int
+encode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeargs *arg)
+{	/* Append one OPEN_DOWNGRADE operation built from @arg to @xdr; always returns 0 here */
+	uint32_t *p;	/* presumably the cursor used by the RESERVE_SPACE/WRITE* macros */
+
+	RESERVE_SPACE(16+sizeof(arg->stateid.data));	/* 16 = the four WRITE32 words below */
+	WRITE32(OP_OPEN_DOWNGRADE);
+	WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+	WRITE32(arg->seqid);
+	WRITE32(arg->share_access);
+	/* No deny modes: the word after share_access is always written as 0 */
+	WRITE32(0);
+
+	return 0;
+}
+
+static int
 encode_putfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
 	int len = fh->size;
@@ -1129,6 +1153,27 @@ out:
 	return status;
 }
 
+/*
+ * Encode an OPEN_DOWNGRADE request: compound header, PUTFH(args->fh), OPEN_DOWNGRADE
+ */
+static int
+nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops	= 2,	/* PUTFH + OPEN_DOWNGRADE */
+	};
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);	/* encode into the request's send buffer */
+	encode_compound_hdr(&xdr, &hdr);
+	status = encode_putfh(&xdr, args->fh);
+	if (status)	/* stop at the first encoder that reports an error */
+		goto out;
+	status = encode_open_downgrade(&xdr, args);
+out:
+	return status;
+}
 
 /*
  * Encode a READ request
@@ -2001,6 +2046,19 @@ decode_open_confirm(struct xdr_stream *x
         return 0;
 }
 
+static int
+decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
+{	/* Decode one OPEN_DOWNGRADE result from @xdr, storing the returned stateid in @res */
+	uint32_t *p;	/* presumably the cursor used by the READ_BUF/COPYMEM macros */
+	int status;
+
+	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
+	if (status)	/* pass a non-zero result from decode_op_hdr() straight back */
+		return status;
+	READ_BUF(sizeof(res->stateid.data));
+	COPYMEM(res->stateid.data, sizeof(res->stateid.data));	/* the only field read from the result body */
+	return 0;
+}
 
 static int
 decode_putfh(struct xdr_stream *xdr)
@@ -2377,6 +2435,29 @@ decode_compound(struct xdr_stream *xdr, 
 
 	DECODE_TAIL;
 }
+
+/*
+ * Decode OPEN_DOWNGRADE response: compound header, PUTFH result, OPEN_DOWNGRADE result
+ */
+static int
+nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);	/* decode from the request's receive buffer */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status)	/* stop at the first decoder that reports an error */
+                goto out;
+        status = decode_putfh(&xdr);
+        if (status)
+                goto out;
+        status = decode_open_downgrade(&xdr, res);	/* fills res->stateid on success */
+out:
+        return status;
+}
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -2827,6 +2908,7 @@ struct rpc_procinfo	nfs4_procedures[] = 
   PROC(OPEN,		enc_open,	dec_open),
   PROC(OPEN_CONFIRM,	enc_open_confirm,	dec_open_confirm),
   PROC(OPEN_RECLAIM,	enc_open_reclaim,	dec_open_reclaim),
+  PROC(OPEN_DOWNGRADE,	enc_open_downgrade,	dec_open_downgrade),
   PROC(CLOSE,		enc_close,	dec_close),
   PROC(SETATTR,		enc_setattr,	dec_setattr),
   PROC(FSINFO,		enc_fsinfo,	dec_fsinfo),
diff -puN include/linux/nfs4.h~nfs-28-open_owner include/linux/nfs4.h
--- 25/include/linux/nfs4.h~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/include/linux/nfs4.h	2004-01-14 02:10:02.000000000 -0800
@@ -290,6 +290,7 @@ enum {
 	NFSPROC4_CLNT_OPEN,
 	NFSPROC4_CLNT_OPEN_CONFIRM,
 	NFSPROC4_CLNT_OPEN_RECLAIM,
+	NFSPROC4_CLNT_OPEN_DOWNGRADE,
 	NFSPROC4_CLNT_CLOSE,
 	NFSPROC4_CLNT_SETATTR,
 	NFSPROC4_CLNT_FSINFO,
diff -puN include/linux/nfs_fs.h~nfs-28-open_owner include/linux/nfs_fs.h
--- 25/include/linux/nfs_fs.h~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/include/linux/nfs_fs.h	2004-01-14 02:10:02.000000000 -0800
@@ -549,10 +549,11 @@ struct nfs4_state {
 
 	struct nfs4_state_owner *owner;	/* Pointer to the open owner */
 	struct inode *inode;		/* Pointer to the inode */
-	pid_t pid;			/* Thread that called OPEN */
 
 	nfs4_stateid stateid;
 
+	unsigned int nreaders;
+	unsigned int nwriters;
 	int state;			/* State on the server (R,W, or RW) */
 	atomic_t count;
 };
@@ -568,6 +569,7 @@ extern int nfs4_open_reclaim(struct nfs4
 extern int nfs4_proc_async_renew(struct nfs4_client *);
 extern int nfs4_proc_renew(struct nfs4_client *);
 extern int nfs4_do_close(struct inode *, struct nfs4_state *);
+int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
 extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *);
 extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
@@ -586,7 +588,8 @@ extern struct nfs4_state_owner * nfs4_ge
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
-extern struct nfs4_state *nfs4_find_state_bypid(struct inode *, pid_t);
+extern void nfs4_close_state(struct nfs4_state *, mode_t);
+extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
 extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
 extern int nfs4_handle_error(struct nfs_server *, int);
 extern void nfs4_schedule_state_recovery(struct nfs4_client *);
diff -puN include/linux/nfs_xdr.h~nfs-28-open_owner include/linux/nfs_xdr.h
--- 25/include/linux/nfs_xdr.h~nfs-28-open_owner	2004-01-14 02:10:02.000000000 -0800
+++ 25-akpm/include/linux/nfs_xdr.h	2004-01-14 02:10:02.000000000 -0800
@@ -153,6 +153,7 @@ struct nfs_closeargs {
 	struct nfs_fh *         fh;
 	nfs4_stateid            stateid;
 	__u32                   seqid;
+	__u32			share_access;
 };
 
 struct nfs_closeres {

_