/*
 * Provenance (repository viewer page header):
 *   Merge commit '49169a56b4da7a6f2d206ecc2166fbe2457343b9'
 *   [unleashed.git] / usr / src / uts / common / nfs / rnode.h
 *   blob 0d3e37e9df8116c4e7b713a5cd1b4b4c0e181107
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/
29 #ifndef _NFS_RNODE_H
30 #define _NFS_RNODE_H
32 #include <sys/avl.h>
33 #include <sys/list.h>
34 #include <nfs/nfs.h>
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
/*
 * Result of an access cache lookup (see nfs_access_check()).
 */
typedef enum nfs_access_type {
	NFS_ACCESS_UNKNOWN,
	NFS_ACCESS_ALLOWED,
	NFS_ACCESS_DENIED
} nfs_access_type_t;

46 typedef struct acache_hash {
47 struct acache *next; /* next and prev must be first */
48 struct acache *prev;
49 krwlock_t lock;
50 } acache_hash_t;
52 typedef struct acache {
53 struct acache *next; /* next and prev must be first */
54 struct acache *prev;
55 uint32_t known;
56 uint32_t allowed;
57 struct rnode *rnode;
58 cred_t *cred;
59 struct acache *list;
60 struct acache_hash *hashq;
61 } acache_t;
/* Capacity, in bytes, of the file handle buffer in nfs_fhandle. */
#define	NFS_FHANDLE_LEN	72

/*
 * File handle container: a length followed by a fixed-size buffer.
 */
typedef struct nfs_fhandle {
	int fh_len;
	char fh_buf[NFS_FHANDLE_LEN];
} nfs_fhandle;

70 typedef struct rddir_cache {
71 lloff_t _cookie; /* cookie used to find this cache entry */
72 lloff_t _ncookie; /* cookie used to find the next cache entry */
73 char *entries; /* buffer containing dirent entries */
74 int eof; /* EOF reached after this request */
75 int entlen; /* size of dirent entries in buf */
76 int buflen; /* size of the buffer used to store entries */
77 int flags; /* control flags, see below */
78 kcondvar_t cv; /* cv for blocking */
79 int error; /* error from RPC operation */
80 kmutex_t lock;
81 uint_t count; /* reference count */
82 avl_node_t tree; /* AVL tree links */
83 } rddir_cache;
/*
 * Shorthand for the cookie members of rddir_cache: the nfs_* names
 * select the _p._l member of the lloff_t, the nfs3_* names select _f.
 */
#define	nfs_cookie	_cookie._p._l
#define	nfs_ncookie	_ncookie._p._l
#define	nfs3_cookie	_cookie._f
#define	nfs3_ncookie	_ncookie._f

/*
 * Values for the rddir_cache flags field.
 */
#define	RDDIR		0x1	/* readdir operation in progress */
#define	RDDIRWAIT	0x2	/* waiting on readdir in progress */
#define	RDDIRREQ	0x4	/* a new readdir is required */
#define	RDDIRCACHED	0x8	/* entry is in the cache */

/* True when the rnode's readdir cache (r_dir) holds at least one entry. */
#define	HAVE_RDDIR_CACHE(rp)	(avl_numnodes(&(rp)->r_dir) > 0)

/*
 * Cached readlink response (embedded in the rnode as r_symlink).
 */
typedef struct symlink_cache {
	char *contents;		/* contents of the symbolic link */
	int len;		/* length of the contents */
	int size;		/* size of the allocated buffer */
} symlink_cache;

103 typedef struct commit {
104 page_t *c_pages; /* list of pages to commit */
105 offset3 c_commbase; /* base offset to do commit from */
106 count3 c_commlen; /* len to commit */
107 kcondvar_t c_cv; /* condvar for waiting for commit */
108 } commit_t;
/*
 * The various values for the commit states.  These are stored in
 * the p_fsdata byte in the page struct.
 * NFSv3,4 can use asynchronous writes - the NFS server can send a response
 * before storing the data to the stable store (disk). The response contains
 * information if the data are on a disk or not. NFS client marks pages
 * which are already on the stable store as C_NOCOMMIT. The pages which were
 * sent but are not yet on the stable store are only partially 'safe' and are
 * marked as C_DELAYCOMMIT, which can be later changed to C_COMMIT if the
 * commit operation is in progress. If the NFS server is e.g. rebooted, the
 * client needs to resend all the uncommitted data. The client walks all the
 * vp->v_object's list and if C_DELAYCOMMIT or C_COMMIT is set, the page is
 * marked as dirty and thus will be written to the server again.
 */
#define	C_NOCOMMIT	0	/* no commit is required */
#define	C_COMMIT	1	/* a commit is required so do it now */
#define	C_DELAYCOMMIT	2	/* a commit is required, but can be delayed */

/*
 * The lock manager holds state making it possible for the client
 * and server to be out of sync.  For example, if the response from
 * the server granting a lock request is lost, the server will think
 * the lock is granted and the client will think the lock is lost.
 * To deal with this, a list of processes for which the client is
 * not sure if the server holds a lock is attached to the rnode.
 * When such a process closes the rnode, an unlock request is sent
 * to the server to unlock the entire file.
 *
 * The list is kept as a singly linked, NULL-terminated list.
 * Because it is only added to under extreme error conditions, the
 * list shouldn't get very big.  DEBUG kernels print a console warning
 * when the number of entries on a list goes beyond nfs_lmpl_high_water,
 * an arbitrary number defined in nfs_add_locking_id().
 */
#define	RLMPL_PID	1
#define	RLMPL_OWNER	2
typedef struct lock_manager_pid_list {
	int lmpl_type;		/* presumably RLMPL_PID or RLMPL_OWNER */
	pid_t lmpl_pid;
	union {
		pid_t _pid;
		struct {
			int len;
			char *owner;
		} _own;
	} un;
	struct lock_manager_pid_list *lmpl_next;
} lmpl_t;

/* Shorthand for the members of the "un" union in lmpl_t. */
#define	lmpl_opid un._pid
#define	lmpl_own_len un._own.len
#define	lmpl_owner un._own.owner

164 * A homegrown reader/writer lock implementation. It addresses
165 * two requirements not addressed by the system primitives. They
166 * are that the `enter" operation is optionally interruptible and
167 * that they can be re`enter'ed by writers without deadlock.
169 typedef struct nfs_rwlock {
170 int count;
171 int waiters;
172 kthread_t *owner;
173 kmutex_t lock;
174 kcondvar_t cv;
175 kcondvar_t cv_rd;
176 } nfs_rwlock_t;
179 * The format of the hash bucket used to lookup rnodes from a file handle.
181 typedef struct rhashq {
182 struct rnode *r_hashf;
183 struct rnode *r_hashb;
184 krwlock_t r_lock;
185 } rhashq_t;
/*
 * Remote file information structure.
 *
 * The rnode is the "inode" for remote files.  It contains all the
 * information necessary to handle a remote file on the client side.
 *
 * Note on file sizes:  we keep two file sizes in the rnode: the size
 * according to the client (r_size) and the size according to the server
 * (r_attr.va_size).  They can differ because we modify r_size during a
 * write system call (nfs_rdwr), before the write request goes over the
 * wire (before the file is actually modified on the server).  If an OTW
 * request occurs before the cached data is written to the server the file
 * size returned from the server (r_attr.va_size) may not match r_size.
 * r_size is the one we use, in general.  r_attr.va_size is only used to
 * determine whether or not our cached data is valid.
 *
 * Each rnode has 3 locks associated with it (not including the rnode
 * hash table and free list locks):
 *
 *	r_rwlock:	Serializes nfs_write and nfs_setattr requests
 *			and allows nfs_read requests to proceed in parallel.
 *			Serializes reads/updates to directories.
 *
 *	r_lkserlock:	Serializes lock requests with map, write, and
 *			readahead operations.
 *
 *	r_statelock:	Protects all fields in the rnode except for
 *			those listed below.  This lock is intended
 *			to be held for relatively short periods of
 *			time (not across entire putpage operations,
 *			for example).
 *
 * The following members are protected by the mutex rpfreelist_lock:
 *	r_freef
 *	r_freeb
 *
 * The following members are protected by the hash bucket rwlock:
 *	r_hashf
 *	r_hashb
 *
 * Note:  r_modaddr is only accessed when the r_statelock mutex is held.
 *	Its value is also controlled via r_rwlock.  It is assumed that
 *	there will be only 1 writer active at a time, so it is safe to
 *	set r_modaddr and release r_statelock as long as the r_rwlock
 *	writer lock is held.
 *
 * r_inmap informs nfsX_read()/write() that there is a call to nfsX_map()
 * in progress. nfsX_read()/write() check r_inmap to decide whether
 * to perform directio on the file or not. r_inmap is atomically
 * incremented in nfsX_map() before the address space routines are
 * called and atomically decremented just before nfsX_map() exits.
 * r_inmap is not protected by any lock.
 *
 * r_mapcnt tells that the rnode has mapped pages. r_inmap can be 0
 * while the rnode has mapped pages.
 *
 * 64-bit offsets:  the code formerly assumed that atomic reads of
 * r_size were safe and reliable; on 32-bit architectures, this is
 * not true since an intervening bus cycle from another processor
 * could update half of the size field.  The r_statelock must now
 * be held whenever any kind of access of r_size is made.
 *
 * Lock ordering:
 *	r_rwlock > r_lkserlock > r_statelock
 */
/* Forward declarations of structures that this header only names. */
struct exportinfo;	/* defined in nfs/export.h */
struct servinfo;	/* defined in nfs/nfs_clnt.h */
struct failinfo;	/* defined in nfs/nfs_clnt.h */
struct mntinfo;		/* defined in nfs/nfs_clnt.h */

#ifdef _KERNEL

/*
 * The rnode proper; only visible to kernel code.  Member order matters
 * at the top: the two hash links must come first so that the structure
 * starts like an rhashq_t.
 */
typedef struct rnode {
	/* the hash fields must be first to match the rhashq_t */
	struct rnode	*r_hashf;	/* hash queue forward pointer */
	struct rnode	*r_hashb;	/* hash queue back pointer */
	struct rnode	*r_freef;	/* free list forward pointer */
	struct rnode	*r_freeb;	/* free list back pointer */
	rhashq_t	*r_hashq;	/* pointer to the hash bucket */
	vnode_t		*r_vnode;	/* vnode for remote file */
	nfs_rwlock_t	r_rwlock;	/* serializes write/setattr requests */
	nfs_rwlock_t	r_lkserlock;	/* serialize lock with other ops */
	kmutex_t	r_statelock;	/* protects (most of) rnode contents */
	nfs_fhandle	r_fh;		/* file handle */
	struct servinfo	*r_server;	/* current server */
	char		*r_path;	/* path to this rnode */
	uoff_t		r_nextr;	/* next byte read offset (read-ahead) */
	cred_t		*r_cred;	/* current credentials */
	cred_t		*r_unlcred;	/* unlinked credentials */
	char		*r_unlname;	/* unlinked file name */
	vnode_t		*r_unldvp;	/* parent dir of unlinked file */
	len_t		r_size;		/* client's view of file size */
	struct vattr	r_attr;		/* cached vnode attributes */
	hrtime_t	r_attrtime;	/* time attributes become invalid */
	hrtime_t	r_mtime;	/* client time file last modified */
	long		r_mapcnt;	/* count of mmapped pages */
	uint_t		r_count;	/* # of refs not reflected in v_count */
	uint_t		r_awcount;	/* # of outstanding async write */
	uint_t		r_gcount;	/* getattrs waiting to flush pages */
	ushort_t	r_flags;	/* flags, see below */
	short		r_error;	/* async write error */
	kcondvar_t	r_cv;		/* condvar for blocked threads */
	int		(*r_putapage)	/* address of putapage routine */
		(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *);
	avl_tree_t	r_dir;		/* cache of readdir responses */
	rddir_cache	*r_direof;	/* pointer to the EOF entry */
	symlink_cache	r_symlink;	/* cached readlink response */
	writeverf3	r_verf;		/* version 3 write verifier */
	uoff_t		r_modaddr;	/* address for page in writerp */
	commit_t	r_commit;	/* commit information */
	uoff_t		r_truncaddr;	/* base for truncate operation */
	vsecattr_t	*r_secattr;	/* cached security attributes (acls) */
	cookieverf3	r_cookieverf;	/* version 3 readdir cookie verifier */
	lmpl_t		*r_lmpl;	/* pids that may be holding locks */
	nfs3_pathconf_info *r_pathconf;	/* cached pathconf information */
	acache_t	*r_acache;	/* list of access cache entries */
	kthread_t	*r_serial;	/* id of purging thread */
	list_t		r_indelmap;	/* list of delmap callers */
	uint_t		r_inmap;	/* to serialize read/write and mmap */
	list_node_t	r_mi_link;	/* linkage into list of rnodes for */
					/* this mntinfo */
} rnode_t;
#endif	/* _KERNEL */

/*
 * Flags (values for the rnode r_flags field, a ushort_t; all sixteen
 * bits are in use)
 */
#define	RREADDIRPLUS	0x1	/* issue a READDIRPLUS instead of READDIR */
#define	RDIRTY		0x2	/* dirty pages from write operation */
#define	RSTALE		0x4	/* file handle is stale */
#define	RMODINPROGRESS	0x8	/* page modification happening */
#define	RTRUNCATE	0x10	/* truncating, don't commit */
#define	RHAVEVERF	0x20	/* have a write verifier to compare against */
#define	RCOMMIT		0x40	/* commit in progress */
#define	RCOMMITWAIT	0x80	/* someone is waiting to do a commit */
#define	RHASHED		0x100	/* rnode is in hash queues */
#define	ROUTOFSPACE	0x200	/* an out of space error has happened */
#define	RDIRECTIO	0x400	/* bypass the buffer cache */
#define	RLOOKUP		0x800	/* a lookup has been performed */
#define	RWRITEATTR	0x1000	/* attributes came from WRITE */
#define	RINDNLCPURGE	0x2000	/* in the process of purging DNLC references */
#define	RDELMAPLIST	0x4000	/* delmap callers tracking for as callback */
#define	RINCACHEPURGE	0x8000	/* purging caches due to file size change */

/*
 * Convert between vnode and rnode
 */
#define	RTOV(rp)	((rp)->r_vnode)
#define	VTOR(vp)	((rnode_t *)((vp)->v_data))

/*
 * File handle views of an rnode or vnode: the fhandle_t view points at
 * r_fh.fh_buf, the nfs_fh3 view points at r_fh itself.
 */
#define	VTOFH(vp)	(RTOFH(VTOR(vp)))
#define	RTOFH(rp)	((fhandle_t *)(&(rp)->r_fh.fh_buf))
#define	VTOFH3(vp)	(RTOFH3(VTOR(vp)))
#define	RTOFH3(rp)	((nfs_fh3 *)(&(rp)->r_fh))

#ifdef _KERNEL
/*
 * Kernel-only prototypes for routines that operate on rnodes, the
 * readdir cache, the access cache and the nfs_rwlock_t.
 */
extern int	nfs_async_readahead(vnode_t *, uoff_t, caddr_t,
				struct seg *, cred_t *,
				void (*)(vnode_t *, uoff_t,
				caddr_t, struct seg *, cred_t *));
extern int	nfs_async_putapage(vnode_t *, page_t *, uoff_t, size_t,
				int, cred_t *, int (*)(vnode_t *, page_t *,
				uoff_t, size_t, int, cred_t *));
extern int	nfs_async_pageio(vnode_t *, page_t *, uoff_t, size_t,
				int, cred_t *, int (*)(vnode_t *, page_t *,
				uoff_t, size_t, int, cred_t *));
extern void	nfs_async_readdir(vnode_t *, rddir_cache *,
				cred_t *, int (*)(vnode_t *,
				rddir_cache *, cred_t *));
extern void	nfs_async_commit(vnode_t *, page_t *, offset3, count3,
				cred_t *, void (*)(vnode_t *, page_t *,
				offset3, count3, cred_t *));
extern void	nfs_async_inactive(vnode_t *, cred_t *, void (*)(vnode_t *,
				cred_t *, caller_context_t *));
extern int	writerp(rnode_t *, caddr_t, int, struct uio *, int);
extern int	nfs_putpages(vnode_t *, uoff_t, size_t, int, cred_t *);
extern void	nfs_invalidate_pages(vnode_t *, uoff_t, cred_t *);
extern int	rfs2call(struct mntinfo *, rpcproc_t, xdrproc_t, caddr_t,
			xdrproc_t, caddr_t, cred_t *, int *, enum nfsstat *,
			int, struct failinfo *);
extern int	rfs3call(struct mntinfo *, rpcproc_t, xdrproc_t, caddr_t,
			xdrproc_t, caddr_t, cred_t *, int *, nfsstat3 *,
			int, struct failinfo *);
extern void	nfs_setswaplike(vnode_t *, vattr_t *);
extern vnode_t	*makenfsnode(fhandle_t *, struct nfsfattr *, struct vfs *,
			hrtime_t, cred_t *, char *, char *);
extern vnode_t	*makenfs3node_va(nfs_fh3 *, vattr_t *, struct vfs *, hrtime_t,
			cred_t *, char *, char *);
extern vnode_t	*makenfs3node(nfs_fh3 *, fattr3 *, struct vfs *, hrtime_t,
			cred_t *, char *, char *);
extern void	rp_addfree(rnode_t *, cred_t *);
extern void	rp_rmhash(rnode_t *);
extern int	check_rtable(struct vfs *);
extern void	destroy_rtable(struct vfs *, cred_t *);
extern void	rflush(struct vfs *, cred_t *);
extern nfs_access_type_t nfs_access_check(rnode_t *, uint32_t, cred_t *);
extern void	nfs_access_cache(rnode_t *rp, uint32_t, uint32_t, cred_t *);
extern int	nfs_access_purge_rp(rnode_t *);
extern int	nfs_putapage(vnode_t *, page_t *, uoff_t *, size_t *,
			int, cred_t *);
extern int	nfs3_putapage(vnode_t *, page_t *, uoff_t *, size_t *,
			int, cred_t *);
extern void	nfs_printfhandle(nfs_fhandle *);
extern void	nfs_write_error(vnode_t *, int, cred_t *);
extern rddir_cache	*rddir_cache_alloc(int);
extern void	rddir_cache_hold(rddir_cache *);
extern void	rddir_cache_rele(rddir_cache *);
#ifdef DEBUG
/* These two are only declared for DEBUG builds. */
extern char	*rddir_cache_buf_alloc(size_t, int);
extern void	rddir_cache_buf_free(void *, size_t);
#endif
extern int	nfs_rw_enter_sig(nfs_rwlock_t *, krw_t, int);
extern int	nfs_rw_tryenter(nfs_rwlock_t *, krw_t);
extern void	nfs_rw_exit(nfs_rwlock_t *);
extern int	nfs_rw_lock_held(nfs_rwlock_t *, krw_t);
extern void	nfs_rw_init(nfs_rwlock_t *, char *, krw_type_t, void *);
extern void	nfs_rw_destroy(nfs_rwlock_t *);
extern int	nfs_directio(vnode_t *, int, cred_t *);
extern int	nfs3_rddir_compar(const void *, const void *);
extern int	nfs_rddir_compar(const void *, const void *);
extern struct zone *nfs_zone(void);
extern zoneid_t	nfs_zoneid(void);

#endif	/* _KERNEL */

412 #ifdef __cplusplus
414 #endif
416 #endif /* _NFS_RNODE_H */