From 5e6f1ca5638786e461bf6ef12bbce88f52fdabb1 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 17 Oct 2010 21:48:47 -0700 Subject: [PATCH] kernel - Fix NFS client & server bugs * A very long standing bug in the server cache was finally whacked. The write-gather code was improperly returning the wrong mbuf for the server to reply with, causing client stalls. This behavior depends on the client doing burst asynchronous writes. Newer releases of DragonFly do burst asynchronous writes but older ones tended not to. * The server cache was not MPSAFE. Add a MP token to fix that. * Remove critical sectons from the server cache which are no longer needed. * Fix a potential client-side rpc request race where a request's NEEDSXMIT flag is not set until after the request possibly blocks, which can lead to issues if another thread picks up the request and then believes that it has already been transmitted when it has not. * Document a big problem with NFSv2 and HAMMER-served directories. NFSv2 only has 32-bit directory cookies. It is possible to work around the problem by using rdirplus (which is the default now). However, some servers may not be able to handle rdirplus with a NFSv2 mount. Users who need to serve out NFSv2 cannot serve HAMMER directories with NFSv2 unless the clients support rdirplus. Our defaults are NFSv3 and rdirplus and NFSv3 does NOT have this problem. Reported-by: Thomas Nikolajsen --- sys/vfs/nfs/nfs_bio.c | 3 +-- sys/vfs/nfs/nfs_serv.c | 29 ++++++++++++----------------- sys/vfs/nfs/nfs_socket.c | 16 +++++++--------- sys/vfs/nfs/nfs_srvcache.c | 18 +++++++++++++++--- sys/vfs/nfs/nfs_vfsops.c | 21 ++++++++++++++++++--- sys/vfs/nfs/nfs_vnops.c | 9 ++++++++- 6 files changed, 61 insertions(+), 35 deletions(-) diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index 5e20bb4a76..eba7a17755 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -941,7 +941,7 @@ nfs_asyncio(struct vnode *vp, struct bio *bio) } /* - * nfs_dio() - Execute a BIO operation synchronously. The BIO will be + * nfs_doio() - Execute a BIO operation synchronously. The BIO will be * completed and its error returned. The caller is responsible * for brelse()ing it. ONLY USE FOR BIO_SYNC IOs! Otherwise * our error probe will be against an invalid pointer. @@ -1056,7 +1056,6 @@ nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td) * XXX This is having problems, give up for now. */ if (vn_cache_strategy(vp, bio)) { - kprintf("X"); error = biowait(&bio->bio_buf->b_bio1, "nfsrsw"); return (error); } diff --git a/sys/vfs/nfs/nfs_serv.c b/sys/vfs/nfs/nfs_serv.c index b1b8fa0639..35646eda73 100644 --- a/sys/vfs/nfs/nfs_serv.c +++ b/sys/vfs/nfs/nfs_serv.c @@ -1229,7 +1229,6 @@ nfsrv_writegather(struct nfsrv_descript **ndp, struct nfssvc_sock *slp, i = 0; len = 0; #endif - *mrq = NULL; if (*ndp) { nfsd = *ndp; *ndp = NULL; @@ -1310,7 +1309,6 @@ nfsmout: /* * Add this entry to the hash and time queues. */ - crit_enter(); owp = NULL; wp = slp->ns_tq.lh_first; while (wp && wp->nd_time < nfsd->nd_time) { @@ -1353,7 +1351,6 @@ nfsmout: LIST_INSERT_HEAD(wpp, nfsd, nd_hash); } } - crit_exit(); } /* @@ -1362,7 +1359,6 @@ nfsmout: */ loop1: cur_usec = nfs_curusec(); - crit_enter(); for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = owp) { owp = nfsd->nd_tq.le_next; if (nfsd->nd_time > cur_usec) @@ -1372,14 +1368,15 @@ loop1: NFS_DPF(WG, ("P%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); LIST_REMOVE(nfsd, nd_hash); - crit_exit(); info.mrep = nfsd->nd_mrep; + info.mreq = NULL; info.v3 = (nfsd->nd_flag & ND_NFSV3); nfsd->nd_mrep = NULL; cred = &nfsd->nd_cr; forat_ret = aftat_ret = 1; error = nfsrv_fhtovp(&nfsd->nd_fh, 1, &mp, &vp, cred, slp, - nfsd->nd_nam, &rdonly, (nfsd->nd_flag & ND_KERBAUTH), TRUE); + nfsd->nd_nam, &rdonly, + (nfsd->nd_flag & ND_KERBAUTH), TRUE); if (!error) { if (info.v3) forat_ret = VOP_GETATTR(vp, &forat); @@ -1490,7 +1487,6 @@ loop1: * Done. Put it at the head of the timer queue so that * the final phase can return the reply. */ - crit_enter(); if (nfsd != swp) { nfsd->nd_time = 0; LIST_INSERT_HEAD(&slp->ns_tq, nfsd, nd_tq); @@ -1499,30 +1495,29 @@ loop1: if (nfsd) { LIST_REMOVE(nfsd, nd_tq); } - crit_exit(); } while (nfsd); - crit_enter(); swp->nd_time = 0; LIST_INSERT_HEAD(&slp->ns_tq, swp, nd_tq); - crit_exit(); goto loop1; } - crit_exit(); /* * Search for a reply to return. */ - crit_enter(); - for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = nfsd->nd_tq.le_next) + for (nfsd = slp->ns_tq.lh_first; nfsd; nfsd = nfsd->nd_tq.le_next) { if (nfsd->nd_mreq) { NFS_DPF(WG, ("X%03x", nfsd->nd_retxid & 0xfff)); LIST_REMOVE(nfsd, nd_tq); - *mrq = nfsd->nd_mreq; - *ndp = nfsd; break; } - crit_exit(); - *mrq = info.mreq; + } + if (nfsd) { + *ndp = nfsd; + *mrq = nfsd->nd_mreq; + } else { + *ndp = NULL; + *mrq = NULL; + } return (0); } diff --git a/sys/vfs/nfs/nfs_socket.c b/sys/vfs/nfs/nfs_socket.c index caee990a25..f1d7dea80e 100644 --- a/sys/vfs/nfs/nfs_socket.c +++ b/sys/vfs/nfs/nfs_socket.c @@ -468,6 +468,7 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top, */ error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags, curthread /*XXX*/); + /* * ENOBUFS for dgram sockets is transient and non fatal. * No need to log, and no need to break a soft mount. @@ -1272,9 +1273,6 @@ nfs_request_try(struct nfsreq *rep) rep->r_flags |= R_LOCKED; rep->r_mrep = NULL; - /* - * Do the client side RPC. - */ nfsstats.rpcrequests++; if (nmp->nm_flag & NFSMNT_FORCE) { @@ -1282,8 +1280,11 @@ nfs_request_try(struct nfsreq *rep) rep->r_flags &= ~R_LOCKED; return (0); } + rep->r_flags |= R_NEEDSXMIT; /* in case send lock races us */ /* + * Do the client side RPC. + * * Chain request into list of outstanding requests. Be sure * to put it LAST so timer finds oldest requests first. Note * that our control of R_LOCKED prevents the request from @@ -1314,20 +1315,17 @@ nfs_request_try(struct nfsreq *rep) if (nmp->nm_so) { if (nmp->nm_soflags & PR_CONNREQUIRED) error = nfs_sndlock(nmp, rep); - if (error == 0) { + if (error == 0 && (rep->r_flags & R_NEEDSXMIT)) { m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT); error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); - if (nmp->nm_soflags & PR_CONNREQUIRED) - nfs_sndunlock(nmp); rep->r_flags &= ~R_NEEDSXMIT; if ((rep->r_flags & R_SENT) == 0) { rep->r_flags |= R_SENT; } - } else { - rep->r_flags |= R_NEEDSXMIT; + if (nmp->nm_soflags & PR_CONNREQUIRED) + nfs_sndunlock(nmp); } } else { - rep->r_flags |= R_NEEDSXMIT; rep->r_rtt = -1; } if (error == EPIPE) diff --git a/sys/vfs/nfs/nfs_srvcache.c b/sys/vfs/nfs/nfs_srvcache.c index 355eedbe2d..24ba8ac207 100644 --- a/sys/vfs/nfs/nfs_srvcache.c +++ b/sys/vfs/nfs/nfs_srvcache.c @@ -73,6 +73,8 @@ static u_long nfsrvhash; #define NETFAMILY(rp) \ (((rp)->rc_flag & RC_INETADDR) ? AF_INET : AF_ISO) +struct lwkt_token srvcache_token = LWKT_TOKEN_MP_INITIALIZER(srvcache_token); + /* * Static array that defines which nfs rpc's are nonidempotent */ @@ -133,7 +135,6 @@ static int nfsv2_repstat[NFS_NPROCS] = { void nfsrv_initcache(void) { - nfsrvhashtbl = hashinit(desirednfsrvcache, M_NFSD, &nfsrvhash); TAILQ_INIT(&nfsrvlruhead); } @@ -168,6 +169,8 @@ nfsrv_getcache(struct nfsrv_descript *nd, struct nfssvc_sock *slp, */ if (!nd->nd_nam2) return (RC_DOIT); + + lwkt_gettoken(&srvcache_token); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { @@ -210,9 +213,11 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } + lwkt_reltoken(&srvcache_token); return (ret); } } + nfsstats.srvcache_misses++; NFS_DPF(RC, ("M%03x", nd->nd_retxid & 0xfff)); if (numnfsrvcache < desirednfsrvcache) { @@ -239,9 +244,9 @@ loop: rp->rc_nam = NULL; rp->rc_flag &= ~RC_NAM; } - rp->rc_flag &= (RC_LOCKED | RC_WANTED); } TAILQ_INSERT_TAIL(&nfsrvlruhead, rp, rc_lru); + rp->rc_state = RC_INPROG; rp->rc_xid = nd->nd_retxid; saddr = (struct sockaddr_in *)nd->nd_nam; @@ -263,6 +268,8 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } + lwkt_reltoken(&srvcache_token); + return (RC_DOIT); } @@ -276,6 +283,8 @@ nfsrv_updatecache(struct nfsrv_descript *nd, int repvalid, struct mbuf *repmbuf) if (!nd->nd_nam2) return; + + lwkt_gettoken(&srvcache_token); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { @@ -328,9 +337,10 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup((caddr_t)rp); } - return; + break; } } + lwkt_reltoken(&srvcache_token); NFS_DPF(RC, ("L%03x", nd->nd_retxid & 0xfff)); } @@ -342,6 +352,7 @@ nfsrv_cleancache(void) { struct nfsrvcache *rp; + lwkt_gettoken(&srvcache_token); while ((rp = TAILQ_FIRST(&nfsrvlruhead)) != NULL) { if (rp->rc_flag & RC_LOCKED) { rp->rc_flag |= RC_WANTED; @@ -363,6 +374,7 @@ nfsrv_cleancache(void) kfree(rp, M_NFSD); } numnfsrvcache = 0; + lwkt_reltoken(&srvcache_token); } #endif /* NFS_NOSERVER */ diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 11da476799..c7bb09b5c2 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -769,12 +769,27 @@ nfs_decode_args(struct nfsmount *nmp, struct nfs_args *argp) * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. */ - if (nmp->nm_sotype == SOCK_STREAM) + if (nmp->nm_sotype == SOCK_STREAM) { nmp->nm_flag &= ~NFSMNT_NOCONN; + argp->flags &= ~NFSMNT_NOCONN; + } - /* Also clear RDIRPLUS if not NFSv3, it crashes some servers */ - if ((argp->flags & NFSMNT_NFSV3) == 0) + /* + * Ok, this is a problem. bootp is NFSv2 and we will wind up + * mounting a diskless root NFSv2 until we fix it. However, + * NFSv2 just doesn't work with HAMMER directories (the cookies + * are only 32 bits). The only way around this is to allow + * the 'readdirplus' for NFSv2. + * + * If this is a problem the client needs to specify the 'nordirplus' + * option for any NFSv2 NFS mount. + */ +#if 0 + if ((argp->flags & NFSMNT_NFSV3) == 0) { nmp->nm_flag &= ~NFSMNT_RDIRPLUS; + argp->flags &= ~NFSMNT_RDIRPLUS; + } +#endif /* * Re-bind if rsrvd port flag has changed diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index 5040a077ed..c54c1bcedd 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -1434,7 +1434,8 @@ nfs_writerpc_uio(struct vnode *vp, struct uio *uiop, nfsstats.rpccnt[NFSPROC_WRITE]++; len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz; nfsm_reqhead(&info, vp, NFSPROC_WRITE, - NFSX_FH(info.v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); + NFSX_FH(info.v3) + 5 * NFSX_UNSIGNED + + nfsm_rndup(len)); ERROROUT(nfsm_fhtom(&info, vp)); if (info.v3) { tl = nfsm_build(&info, 5 * NFSX_UNSIGNED); @@ -2438,6 +2439,12 @@ nfs_readdirrpc_uio(struct vnode *vp, struct uio *uiop) *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; } else { + /* + * WARNING! HAMMER DIRECTORIES WILL NOT WORK WELL + * WITH NFSv2!!! There's nothing I can really do + * about it other than to hope the server supports + * rdirplus w/NFSv2. + */ tl = nfsm_build(&info, 2 * NFSX_UNSIGNED); *tl++ = cookie.nfsuquad[0]; } -- 2.11.4.GIT