From: Trond Myklebust NFSv4: Share open_owner structs between several different processes. Reduces the load on the server. --- fs/nfs/nfs4proc.c | 80 ++++++++++++++++++++++++++++---- fs/nfs/nfs4state.c | 118 ++++++++++++++++++++++++++++++++++++++++++------ fs/nfs/nfs4xdr.c | 82 +++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 include/linux/nfs_fs.h | 7 ++ include/linux/nfs_xdr.h | 1 6 files changed, 263 insertions(+), 26 deletions(-) diff -puN fs/nfs/nfs4proc.c~nfs-28-open_owner fs/nfs/nfs4proc.c --- 25/fs/nfs/nfs4proc.c~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/fs/nfs/nfs4proc.c 2004-01-09 22:16:24.000000000 -0800 @@ -616,8 +616,13 @@ retry: memcpy(&state->stateid, &oc_res.stateid, sizeof(state->stateid)); } else memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); + spin_lock(&inode->i_lock); + if (flags & FMODE_READ) + state->nreaders++; + if (flags & FMODE_WRITE) + state->nwriters++; state->state |= flags & (FMODE_READ|FMODE_WRITE); - state->pid = current->pid; + spin_unlock(&inode->i_lock); up(&sp->so_sema); nfs4_put_state_owner(sp); @@ -634,6 +639,21 @@ out_up: iput(inode); inode = NULL; } + /* NOTE: BAD_SEQID means the server and client disagree about the + * book-keeping w.r.t. state-changing operations + * (OPEN/CLOSE/LOCK/LOCKU...) + * It is actually a sign of a bug on the client or on the server. + * + * If we receive a BAD_SEQID error in the particular case of + * doing an OPEN, we assume that nfs4_increment_seqid() will + * have unhashed the old state_owner for us, and that we can + * therefore safely retry using a new one. We should still warn + * the user though... + */ + if (status == -NFS4ERR_BAD_SEQID) { + printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n"); + goto retry; + } status = nfs4_handle_error(server, status); if (!status) goto retry; @@ -722,6 +742,36 @@ nfs4_do_close(struct inode *inode, struc * the state_owner. we keep this around to process errors */ nfs4_increment_seqid(status, sp); + if (!status) + memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); + + return status; +} + +int +nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) +{ + struct nfs4_state_owner *sp = state->owner; + int status = 0; + struct nfs_closeargs arg = { + .fh = NFS_FH(inode), + .seqid = sp->so_seqid, + .share_access = mode, + }; + struct nfs_closeres res = { + .status = 0, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + + memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); + status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); + nfs4_increment_seqid(status, sp); + if (!status) + memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); return status; } @@ -771,7 +821,7 @@ nfs4_open_revalidate(struct inode *dir, return 1; } d_drop(dentry); - nfs4_put_open_state(state); + nfs4_close_state(state, openflags); iput(inode); return 0; } @@ -872,15 +922,14 @@ nfs4_proc_setattr(struct dentry *dentry, fattr->valid = 0; if (size_change) { - state = nfs4_find_state_bypid(inode, current->pid); - + struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); + state = nfs4_find_state(inode, cred, FMODE_WRITE); if (!state) { - struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); state = nfs4_do_open(dentry->d_parent->d_inode, &dentry->d_name, FMODE_WRITE, NULL, cred); - put_rpccred(cred); need_iput = 1; } + put_rpccred(cred); if (IS_ERR(state)) return PTR_ERR(state); @@ -895,7 +944,7 @@ nfs4_proc_setattr(struct dentry *dentry, out: if (state) { inode = state->inode; - nfs4_put_open_state(state); + nfs4_close_state(state, FMODE_WRITE); if (need_iput) iput(inode); } @@ -1161,7 +1210,7 @@ nfs4_proc_create(struct inode *dir, stru status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, NFS_FH(inode), sattr, state); if (status != 0) { - nfs4_put_open_state(state); + nfs4_close_state(state, flags); iput(inode); inode = ERR_PTR(status); } @@ -1742,6 +1791,7 @@ nfs4_proc_file_open(struct inode *inode, { struct dentry *dentry = filp->f_dentry; struct nfs4_state *state; + struct rpc_cred *cred; dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n", (int)dentry->d_parent->d_name.len, @@ -1750,12 +1800,14 @@ nfs4_proc_file_open(struct inode *inode, /* Find our open stateid */ - state = nfs4_find_state_bypid(inode, current->pid); + cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); + state = nfs4_find_state(inode, cred, filp->f_mode); + put_rpccred(cred); if (state == NULL) { printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__); return -EIO; /* ERACE actually */ } - nfs4_put_open_state(state); + nfs4_close_state(state, filp->f_mode); if (filp->f_mode & FMODE_WRITE) { lock_kernel(); nfs_set_mmcred(inode, state->owner->so_cred); @@ -1774,7 +1826,7 @@ nfs4_proc_file_release(struct inode *ino struct nfs4_state *state = (struct nfs4_state *)filp->private_data; if (state) - nfs4_put_open_state(state); + nfs4_close_state(state, filp->f_mode); return 0; } @@ -1816,6 +1868,9 @@ nfs4_async_handle_error(struct rpc_task rpc_delay(task, NFS4_POLL_RETRY_TIME); task->tk_status = 0; return -EAGAIN; + case -NFS4ERR_OLD_STATEID: + task->tk_status = 0; + return -EAGAIN; } return 0; } @@ -1892,6 +1947,9 @@ nfs4_handle_error(struct nfs_server *ser case -NFS4ERR_DELAY: ret = nfs4_delay(server->client); break; + case -NFS4ERR_OLD_STATEID: + ret = 0; + break; default: if (errorcode <= -1000) { printk(KERN_WARNING "%s could not handle NFSv4 error %d\n", diff -puN fs/nfs/nfs4state.c~nfs-28-open_owner fs/nfs/nfs4state.c --- 25/fs/nfs/nfs4state.c~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/fs/nfs/nfs4state.c 2004-01-09 22:16:24.000000000 -0800 @@ -188,6 +188,23 @@ nfs4_client_grab_unused(struct nfs4_clie return sp; } +static struct nfs4_state_owner * +nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) +{ + struct nfs4_state_owner *sp, *res = NULL; + + list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + if (sp->so_cred != cred) + continue; + atomic_inc(&sp->so_count); + /* Move to the head of the list */ + list_move(&sp->so_list, &clp->cl_state_owners); + res = sp; + break; + } + return res; +} + /* * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to * create a new state_owner. @@ -208,6 +225,15 @@ nfs4_alloc_state_owner(void) return sp; } +static void +nfs4_unhash_state_owner(struct nfs4_state_owner *sp) +{ + struct nfs4_client *clp = sp->so_client; + spin_lock(&clp->cl_lock); + list_del_init(&sp->so_list); + spin_unlock(&clp->cl_lock); +} + struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) { @@ -217,7 +243,9 @@ nfs4_get_state_owner(struct nfs_server * get_rpccred(cred); new = nfs4_alloc_state_owner(); spin_lock(&clp->cl_lock); - sp = nfs4_client_grab_unused(clp, cred); + sp = nfs4_find_state_owner(clp, cred); + if (sp == NULL) + sp = nfs4_client_grab_unused(clp, cred); if (sp == NULL && new != NULL) { list_add(&new->so_list, &clp->cl_state_owners); new->so_client = clp; @@ -248,6 +276,8 @@ nfs4_put_state_owner(struct nfs4_state_o return; if (clp->cl_nunused >= OPENOWNER_POOL_SIZE) goto out_free; + if (list_empty(&sp->so_list)) + goto out_free; list_move(&sp->so_list, &clp->cl_unused); clp->cl_nunused++; spin_unlock(&clp->cl_lock); @@ -269,24 +299,38 @@ nfs4_alloc_open_state(void) state = kmalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - state->pid = current->pid; state->state = 0; + state->nreaders = 0; + state->nwriters = 0; memset(state->stateid.data, 0, sizeof(state->stateid.data)); atomic_set(&state->count, 1); return state; } static struct nfs4_state * -__nfs4_find_state_bypid(struct inode *inode, pid_t pid) +__nfs4_find_state(struct inode *inode, struct rpc_cred *cred, mode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; + mode &= (FMODE_READ|FMODE_WRITE); list_for_each_entry(state, &nfsi->open_states, inode_states) { - if (state->pid == pid) { - atomic_inc(&state->count); - return state; - } + if (state->owner->so_cred != cred) + continue; + if ((mode & FMODE_READ) != 0 && state->nreaders == 0) + continue; + if ((mode & FMODE_WRITE) != 0 && state->nwriters == 0) + continue; + if ((state->state & mode) != mode) + continue; + /* Add the state to the head of the inode's list */ + list_move(&state->inode_states, &nfsi->open_states); + atomic_inc(&state->count); + if (mode & FMODE_READ) + state->nreaders++; + if (mode & FMODE_WRITE) + state->nwriters++; + return state; } return NULL; } @@ -298,7 +342,12 @@ __nfs4_find_state_byowner(struct inode * struct nfs4_state *state; list_for_each_entry(state, &nfsi->open_states, inode_states) { + /* Is this in the process of being freed? */ + if (state->nreaders == 0 && state->nwriters == 0) + continue; if (state->owner == owner) { + /* Add the state to the head of the inode's list */ + list_move(&state->inode_states, &nfsi->open_states); atomic_inc(&state->count); return state; } @@ -307,16 +356,12 @@ __nfs4_find_state_byowner(struct inode * } struct nfs4_state * -nfs4_find_state_bypid(struct inode *inode, pid_t pid) +nfs4_find_state(struct inode *inode, struct rpc_cred *cred, mode_t mode) { - struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; spin_lock(&inode->i_lock); - state = __nfs4_find_state_bypid(inode, pid); - /* Add the state to the tail of the inode's list */ - if (state) - list_move_tail(&state->inode_states, &nfsi->open_states); + state = __nfs4_find_state(inode, cred, mode); spin_unlock(&inode->i_lock); return state; } @@ -387,6 +432,50 @@ nfs4_put_open_state(struct nfs4_state *s nfs4_put_state_owner(owner); } +void +nfs4_close_state(struct nfs4_state *state, mode_t mode) +{ + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; + int newstate; + int status = 0; + + down(&owner->so_sema); + /* Protect against nfs4_find_state() */ + spin_lock(&inode->i_lock); + if (mode & FMODE_READ) + state->nreaders--; + if (mode & FMODE_WRITE) + state->nwriters--; + if (state->nwriters == 0 && state->nreaders == 0) + list_del_init(&state->inode_states); + spin_unlock(&inode->i_lock); + do { + newstate = 0; + if (state->state == 0) + break; + if (state->nreaders) + newstate |= FMODE_READ; + if (state->nwriters) + newstate |= FMODE_WRITE; + if (state->state == newstate) + break; + if (newstate != 0) + status = nfs4_do_downgrade(inode, state, newstate); + else + status = nfs4_do_close(inode, state); + if (!status) { + state->state = newstate; + break; + } + up(&owner->so_sema); + status = nfs4_handle_error(NFS_SERVER(inode), status); + down(&owner->so_sema); + } while (!status); + up(&owner->so_sema); + nfs4_put_open_state(state); +} + /* * Called with sp->so_sema held. * @@ -399,6 +488,9 @@ nfs4_increment_seqid(int status, struct { if (status == NFS_OK || seqid_mutating_err(-status)) sp->so_seqid++; + /* If the server returns BAD_SEQID, unhash state_owner here */ + if (status == -NFS4ERR_BAD_SEQID) + nfs4_unhash_state_owner(sp); } static int reclaimer(void *); diff -puN fs/nfs/nfs4xdr.c~nfs-28-open_owner fs/nfs/nfs4xdr.c --- 25/fs/nfs/nfs4xdr.c~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/fs/nfs/nfs4xdr.c 2004-01-09 22:16:24.000000000 -0800 @@ -176,6 +176,14 @@ static int nfs_stat_to_errno(int); op_decode_hdr_maxsz + \ 4 + 5 + 2 + 3 + \ decode_getattr_maxsz +#define NFS4_enc_open_downgrade_sz \ + compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + op_encode_hdr_maxsz + 7 +#define NFS4_dec_open_downgrade_sz \ + compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + 4 #define NFS4_enc_close_sz compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + 5 @@ -712,6 +720,22 @@ encode_open_reclaim(struct xdr_stream *x } static int +encode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeargs *arg) +{ + uint32_t *p; + + RESERVE_SPACE(16+sizeof(arg->stateid.data)); + WRITE32(OP_OPEN_DOWNGRADE); + WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); + WRITE32(arg->seqid); + WRITE32(arg->share_access); + /* No deny modes */ + WRITE32(0); + + return 0; +} + +static int encode_putfh(struct xdr_stream *xdr, struct nfs_fh *fh) { int len = fh->size; @@ -1129,6 +1153,27 @@ out: return status; } +/* + * Encode an OPEN_DOWNGRADE request + */ +static int +nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 2, + }; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + status = encode_putfh(&xdr, args->fh); + if (status) + goto out; + status = encode_open_downgrade(&xdr, args); +out: + return status; +} /* * Encode a READ request @@ -2001,6 +2046,19 @@ decode_open_confirm(struct xdr_stream *x return 0; } +static int +decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) +{ + uint32_t *p; + int status; + + status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status) + return status; + READ_BUF(sizeof(res->stateid.data)); + COPYMEM(res->stateid.data, sizeof(res->stateid.data)); + return 0; +} static int decode_putfh(struct xdr_stream *xdr) @@ -2377,6 +2435,29 @@ decode_compound(struct xdr_stream *xdr, DECODE_TAIL; } + +/* + * Decode OPEN_DOWNGRADE response + */ +static int +nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_open_downgrade(&xdr, res); +out: + return status; +} + /* * END OF "GENERIC" DECODE ROUTINES. */ @@ -2827,6 +2908,7 @@ struct rpc_procinfo nfs4_procedures[] = PROC(OPEN, enc_open, dec_open), PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), PROC(OPEN_RECLAIM, enc_open_reclaim, dec_open_reclaim), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), PROC(CLOSE, enc_close, dec_close), PROC(SETATTR, enc_setattr, dec_setattr), PROC(FSINFO, enc_fsinfo, dec_fsinfo), diff -puN include/linux/nfs4.h~nfs-28-open_owner include/linux/nfs4.h --- 25/include/linux/nfs4.h~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/include/linux/nfs4.h 2004-01-09 22:16:24.000000000 -0800 @@ -290,6 +290,7 @@ enum { NFSPROC4_CLNT_OPEN, NFSPROC4_CLNT_OPEN_CONFIRM, NFSPROC4_CLNT_OPEN_RECLAIM, + NFSPROC4_CLNT_OPEN_DOWNGRADE, NFSPROC4_CLNT_CLOSE, NFSPROC4_CLNT_SETATTR, NFSPROC4_CLNT_FSINFO, diff -puN include/linux/nfs_fs.h~nfs-28-open_owner include/linux/nfs_fs.h --- 25/include/linux/nfs_fs.h~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/include/linux/nfs_fs.h 2004-01-09 22:16:24.000000000 -0800 @@ -550,10 +550,11 @@ struct nfs4_state { struct nfs4_state_owner *owner; /* Pointer to the open owner */ struct inode *inode; /* Pointer to the inode */ - pid_t pid; /* Thread that called OPEN */ nfs4_stateid stateid; + unsigned int nreaders; + unsigned int nwriters; int state; /* State on the server (R,W, or RW) */ atomic_t count; }; @@ -569,6 +570,7 @@ extern int nfs4_open_reclaim(struct nfs4 extern int nfs4_proc_async_renew(struct nfs4_client *); extern int nfs4_proc_renew(struct nfs4_client *); extern int nfs4_do_close(struct inode *, struct nfs4_state *); +int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); @@ -587,7 +589,8 @@ extern struct nfs4_state_owner * nfs4_ge extern void nfs4_put_state_owner(struct nfs4_state_owner *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); -extern struct nfs4_state *nfs4_find_state_bypid(struct inode *, pid_t); +extern void nfs4_close_state(struct nfs4_state *, mode_t); +extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp); extern int nfs4_handle_error(struct nfs_server *, int); extern void nfs4_schedule_state_recovery(struct nfs4_client *); diff -puN include/linux/nfs_xdr.h~nfs-28-open_owner include/linux/nfs_xdr.h --- 25/include/linux/nfs_xdr.h~nfs-28-open_owner 2004-01-09 22:16:24.000000000 -0800 +++ 25-akpm/include/linux/nfs_xdr.h 2004-01-09 22:16:24.000000000 -0800 @@ -153,6 +153,7 @@ struct nfs_closeargs { struct nfs_fh * fh; nfs4_stateid stateid; __u32 seqid; + __u32 share_access; }; struct nfs_closeres { _